npm - @softspark/ai-toolkit - Versions diffs - 1.0.0 - Mend

@softspark/ai-toolkit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (327) hide show

package/AGENTS.md +412 -0
package/CHANGELOG.md +68 -0
package/LICENSE +21 -0
package/README.md +632 -0
package/action.yml +53 -0
package/app/.claude-plugin/plugin.json +44 -0
package/app/ARCHITECTURE.md +306 -0
package/app/CLAUDE.md.template +23 -0
package/app/agents/ai-engineer.md +128 -0
package/app/agents/backend-specialist.md +193 -0
package/app/agents/business-intelligence.md +54 -0
package/app/agents/chaos-monkey.md +67 -0
package/app/agents/chief-of-staff.md +51 -0
package/app/agents/code-archaeologist.md +127 -0
package/app/agents/code-reviewer.md +184 -0
package/app/agents/command-expert.md +131 -0
package/app/agents/data-analyst.md +205 -0
package/app/agents/data-scientist.md +151 -0
package/app/agents/database-architect.md +317 -0
package/app/agents/debugger.md +238 -0
package/app/agents/devops-implementer.md +194 -0
package/app/agents/documenter.md +364 -0
package/app/agents/explorer-agent.md +145 -0
package/app/agents/fact-checker.md +172 -0
package/app/agents/frontend-specialist.md +209 -0
package/app/agents/game-developer.md +216 -0
package/app/agents/incident-responder.md +226 -0
package/app/agents/infrastructure-architect.md +127 -0
package/app/agents/infrastructure-validator.md +247 -0
package/app/agents/llm-ops-engineer.md +237 -0
package/app/agents/mcp-expert.md +228 -0
package/app/agents/mcp-server-architect.md +195 -0
package/app/agents/mcp-testing-engineer.md +292 -0
package/app/agents/meta-architect.md +58 -0
package/app/agents/ml-engineer.md +136 -0
package/app/agents/mobile-developer.md +190 -0
package/app/agents/night-watchman.md +55 -0
package/app/agents/nlp-engineer.md +154 -0
package/app/agents/orchestrator.md +437 -0
package/app/agents/performance-optimizer.md +254 -0
package/app/agents/predictive-analyst.md +57 -0
package/app/agents/product-manager.md +194 -0
package/app/agents/project-planner.md +287 -0
package/app/agents/prompt-engineer.md +103 -0
package/app/agents/qa-automation-engineer.md +182 -0
package/app/agents/rag-engineer.md +201 -0
package/app/agents/research-synthesizer.md +138 -0
package/app/agents/search-specialist.md +101 -0
package/app/agents/security-architect.md +62 -0
package/app/agents/security-auditor.md +293 -0
package/app/agents/seo-specialist.md +111 -0
package/app/agents/system-governor.md +57 -0
package/app/agents/tech-lead.md +62 -0
package/app/agents/technical-researcher.md +103 -0
package/app/agents/test-engineer.md +264 -0
package/app/constitution.md +38 -0
package/app/hooks/_profile-check.sh +11 -0
package/app/hooks/guard-destructive.sh +74 -0
package/app/hooks/guard-path.sh +73 -0
package/app/hooks/post-tool-use.sh +35 -0
package/app/hooks/pre-compact.sh +31 -0
package/app/hooks/quality-check.sh +22 -0
package/app/hooks/quality-gate.sh +49 -0
package/app/hooks/save-session.sh +24 -0
package/app/hooks/session-end.sh +37 -0
package/app/hooks/session-start.sh +29 -0
package/app/hooks/subagent-start.sh +16 -0
package/app/hooks/subagent-stop.sh +16 -0
package/app/hooks/track-usage.sh +50 -0
package/app/hooks/user-prompt-submit.sh +25 -0
package/app/hooks.json +178 -0
package/app/mcp-defaults.json +23 -0
package/app/output-styles/golden-rules.md +43 -0
package/app/plugins/README.md +19 -0
package/app/plugins/csharp-pack/README.md +11 -0
package/app/plugins/csharp-pack/plugin.json +18 -0
package/app/plugins/enterprise-pack/README.md +16 -0
package/app/plugins/enterprise-pack/hooks/output-style.sh +6 -0
package/app/plugins/enterprise-pack/hooks/status-line.sh +8 -0
package/app/plugins/enterprise-pack/plugin.json +24 -0
package/app/plugins/frontend-pack/README.md +14 -0
package/app/plugins/frontend-pack/plugin.json +22 -0
package/app/plugins/java-pack/README.md +11 -0
package/app/plugins/java-pack/plugin.json +18 -0
package/app/plugins/kotlin-pack/README.md +11 -0
package/app/plugins/kotlin-pack/plugin.json +18 -0
package/app/plugins/memory-pack/README.md +24 -0
package/app/plugins/memory-pack/hooks/observation-capture.sh +67 -0
package/app/plugins/memory-pack/hooks/session-summary.sh +71 -0
package/app/plugins/memory-pack/plugin.json +22 -0
package/app/plugins/memory-pack/scripts/init_db.py +81 -0
package/app/plugins/memory-pack/scripts/strip_private.py +22 -0
package/app/plugins/memory-pack/skills/mem-search/SKILL.md +70 -0
package/app/plugins/research-pack/README.md +14 -0
package/app/plugins/research-pack/plugin.json +22 -0
package/app/plugins/ruby-pack/README.md +11 -0
package/app/plugins/ruby-pack/plugin.json +18 -0
package/app/plugins/rust-pack/README.md +11 -0
package/app/plugins/rust-pack/plugin.json +18 -0
package/app/plugins/security-pack/README.md +15 -0
package/app/plugins/security-pack/plugin.json +23 -0
package/app/plugins/swift-pack/README.md +11 -0
package/app/plugins/swift-pack/plugin.json +18 -0
package/app/rules/claude-toolkit-rules.md +21 -0
package/app/rules/git-conventions.md +5 -0
package/app/rules/quality-gates.md +10 -0
package/app/skills/_lib/__init__.py +1 -0
package/app/skills/_lib/detect_utils.py +150 -0
package/app/skills/agent-creator/SKILL.md +82 -0
package/app/skills/analyze/SKILL.md +92 -0
package/app/skills/analyze/scripts/complexity.py +165 -0
package/app/skills/api-patterns/SKILL.md +305 -0
package/app/skills/app-builder/SKILL.md +187 -0
package/app/skills/architecture-audit/SKILL.md +141 -0
package/app/skills/architecture-decision/SKILL.md +55 -0
package/app/skills/architecture-decision/templates/adr-template.md +36 -0
package/app/skills/biz-scan/SKILL.md +30 -0
package/app/skills/briefing/SKILL.md +27 -0
package/app/skills/build/SKILL.md +97 -0
package/app/skills/build/scripts/detect-build.py +151 -0
package/app/skills/chaos/SKILL.md +32 -0
package/app/skills/ci/SKILL.md +77 -0
package/app/skills/ci/scripts/ci-detect.py +135 -0
package/app/skills/ci/templates/github-actions-node.yml +38 -0
package/app/skills/ci/templates/github-actions-python.yml +42 -0
package/app/skills/ci-cd-patterns/SKILL.md +299 -0
package/app/skills/clean-code/SKILL.md +110 -0
package/app/skills/clean-code/reference/dart.md +18 -0
package/app/skills/clean-code/reference/go.md +23 -0
package/app/skills/clean-code/reference/php.md +32 -0
package/app/skills/clean-code/reference/python.md +180 -0
package/app/skills/clean-code/reference/typescript.md +26 -0
package/app/skills/command-creator/SKILL.md +83 -0
package/app/skills/commit/SKILL.md +98 -0
package/app/skills/commit/scripts/pre-commit-check.py +87 -0
package/app/skills/commit/templates/conventional-commit.md +52 -0
package/app/skills/csharp-patterns/SKILL.md +450 -0
package/app/skills/database-patterns/SKILL.md +297 -0
package/app/skills/debug/SKILL.md +154 -0
package/app/skills/debug/scripts/error-parser.py +187 -0
package/app/skills/debugging-tactics/SKILL.md +136 -0
package/app/skills/deploy/SKILL.md +130 -0
package/app/skills/deploy/scripts/pre_deploy_check.py +171 -0
package/app/skills/deploy/templates/deployment-checklist.md +31 -0
package/app/skills/design-an-interface/SKILL.md +105 -0
package/app/skills/design-engineering/SKILL.md +260 -0
package/app/skills/docker-devops/SKILL.md +303 -0
package/app/skills/docs/SKILL.md +145 -0
package/app/skills/docs/scripts/doc-inventory.py +176 -0
package/app/skills/docs/templates/adr-template.md +36 -0
package/app/skills/docs/templates/readme-template.md +67 -0
package/app/skills/documentation-standards/SKILL.md +191 -0
package/app/skills/ecommerce-patterns/SKILL.md +209 -0
package/app/skills/evaluate/SKILL.md +132 -0
package/app/skills/evolve/SKILL.md +27 -0
package/app/skills/explain/SKILL.md +54 -0
package/app/skills/explain/scripts/dependency-graph.py +215 -0
package/app/skills/explore/SKILL.md +112 -0
package/app/skills/explore/scripts/visualize.py +117 -0
package/app/skills/fix/SKILL.md +78 -0
package/app/skills/fix/scripts/error-classifier.py +191 -0
package/app/skills/flutter-patterns/SKILL.md +254 -0
package/app/skills/git-mastery/SKILL.md +70 -0
package/app/skills/grill-me/SKILL.md +38 -0
package/app/skills/health/SKILL.md +91 -0
package/app/skills/health/scripts/health_check.py +162 -0
package/app/skills/hive-mind/SKILL.md +56 -0
package/app/skills/hook-creator/SKILL.md +107 -0
package/app/skills/index/SKILL.md +74 -0
package/app/skills/instinct-review/SKILL.md +77 -0
package/app/skills/java-patterns/SKILL.md +442 -0
package/app/skills/kotlin-patterns/SKILL.md +446 -0
package/app/skills/lint/SKILL.md +103 -0
package/app/skills/lint/scripts/detect-linters.py +112 -0
package/app/skills/mcp-patterns/SKILL.md +270 -0
package/app/skills/mem-search/SKILL.md +70 -0
package/app/skills/migrate/SKILL.md +90 -0
package/app/skills/migrate/scripts/migration-status.py +195 -0
package/app/skills/migration-patterns/SKILL.md +260 -0
package/app/skills/night-watch/SKILL.md +28 -0
package/app/skills/observability-patterns/SKILL.md +203 -0
package/app/skills/onboard/SKILL.md +76 -0
package/app/skills/orchestrate/SKILL.md +86 -0
package/app/skills/panic/SKILL.md +30 -0
package/app/skills/performance-profiling/SKILL.md +59 -0
package/app/skills/plan/SKILL.md +110 -0
package/app/skills/plan/templates/plan-template.md +40 -0
package/app/skills/plan-writing/SKILL.md +201 -0
package/app/skills/plugin-creator/SKILL.md +78 -0
package/app/skills/pr/SKILL.md +129 -0
package/app/skills/pr/scripts/pr-summary.py +175 -0
package/app/skills/prd-to-issues/SKILL.md +108 -0
package/app/skills/prd-to-plan/SKILL.md +120 -0
package/app/skills/predict/SKILL.md +30 -0
package/app/skills/qa-session/SKILL.md +110 -0
package/app/skills/rag-patterns/SKILL.md +203 -0
package/app/skills/refactor/SKILL.md +124 -0
package/app/skills/refactor/scripts/refactor-scan.py +210 -0
package/app/skills/refactor-plan/SKILL.md +112 -0
package/app/skills/repeat/SKILL.md +149 -0
package/app/skills/research-mastery/SKILL.md +56 -0
package/app/skills/review/SKILL.md +141 -0
package/app/skills/review/scripts/diff-analyzer.py +170 -0
package/app/skills/rollback/SKILL.md +87 -0
package/app/skills/rollback/scripts/rollback_info.py +149 -0
package/app/skills/ruby-patterns/SKILL.md +454 -0
package/app/skills/rust-patterns/SKILL.md +446 -0
package/app/skills/search/SKILL.md +64 -0
package/app/skills/security-patterns/SKILL.md +91 -0
package/app/skills/security-patterns/reference/authentication.md +37 -0
package/app/skills/security-patterns/reference/authorization.md +22 -0
package/app/skills/security-patterns/reference/input-validation.md +30 -0
package/app/skills/security-patterns/reference/oauth-csrf-audit.md +131 -0
package/app/skills/skill-creator/SKILL.md +154 -0
package/app/skills/skill-creator/templates/dashboard/index.html +130 -0
package/app/skills/skill-creator/templates/reasoning-engine/assets/example.json +12 -0
package/app/skills/skill-creator/templates/reasoning-engine/search.py +110 -0
package/app/skills/subagent-development/SKILL.md +225 -0
package/app/skills/subagent-development/reference/code-quality-reviewer-prompt.md +145 -0
package/app/skills/subagent-development/reference/implementer-prompt.md +118 -0
package/app/skills/subagent-development/reference/spec-reviewer-prompt.md +100 -0
package/app/skills/swarm/SKILL.md +81 -0
package/app/skills/swift-patterns/SKILL.md +500 -0
package/app/skills/tdd/SKILL.md +174 -0
package/app/skills/tdd/reference/deep-modules.md +32 -0
package/app/skills/tdd/reference/interface-design.md +32 -0
package/app/skills/tdd/reference/mocking.md +52 -0
package/app/skills/tdd/reference/refactoring.md +10 -0
package/app/skills/tdd/reference/tests.md +59 -0
package/app/skills/teams/SKILL.md +101 -0
package/app/skills/test/SKILL.md +107 -0
package/app/skills/test/scripts/detect-runner.py +113 -0
package/app/skills/testing-patterns/SKILL.md +73 -0
package/app/skills/testing-patterns/reference/flutter-testing.md +33 -0
package/app/skills/testing-patterns/reference/go-testing.md +52 -0
package/app/skills/testing-patterns/reference/php-phpunit.md +39 -0
package/app/skills/testing-patterns/reference/python-pytest.md +228 -0
package/app/skills/testing-patterns/reference/typescript-vitest.md +50 -0
package/app/skills/triage-issue/SKILL.md +120 -0
package/app/skills/typescript-patterns/SKILL.md +256 -0
package/app/skills/ubiquitous-language/SKILL.md +74 -0
package/app/skills/verification-before-completion/SKILL.md +108 -0
package/app/skills/workflow/SKILL.md +250 -0
package/app/skills/write-a-prd/SKILL.md +129 -0
package/app/skills/write-a-prd/reference/visual-companion.md +78 -0
package/app/skills/write-a-prd/scripts/frame-template.html +111 -0
package/app/skills/write-a-prd/scripts/visual-server.cjs +79 -0
package/app/templates/skill/generator/SKILL.md.template +40 -0
package/app/templates/skill/knowledge/SKILL.md.template +52 -0
package/app/templates/skill/linter/SKILL.md.template +34 -0
package/app/templates/skill/reviewer/SKILL.md.template +51 -0
package/app/templates/skill/workflow/SKILL.md.template +49 -0
package/benchmarks/README.md +111 -0
package/benchmarks/ecosystem-dashboard.json +148 -0
package/benchmarks/ecosystem-harvest.json +148 -0
package/benchmarks/results.json +38 -0
package/benchmarks/run.py +351 -0
package/bin/ai-toolkit.js +345 -0
package/kb/best-practices/README.md +11 -0
package/kb/howto/README.md +11 -0
package/kb/procedures/maintenance-sop.md +306 -0
package/kb/reference/agents-catalog.md +124 -0
package/kb/reference/anti-pattern-registry-format.md +221 -0
package/kb/reference/architecture-overview.md +232 -0
package/kb/reference/benchmark-config.md +62 -0
package/kb/reference/ci-integration.md +66 -0
package/kb/reference/claude-ecosystem-benchmark-snapshot.md +80 -0
package/kb/reference/claude-ecosystem-expansion-foundations.md +102 -0
package/kb/reference/commands-catalog.md +21 -0
package/kb/reference/distribution-model.md +63 -0
package/kb/reference/global-install-model.md +56 -0
package/kb/reference/hierarchical-override-pattern.md +200 -0
package/kb/reference/hooks-catalog.md +306 -0
package/kb/reference/integrations.md +88 -0
package/kb/reference/language-packs.md +52 -0
package/kb/reference/merge-friendly-install-model.md +58 -0
package/kb/reference/plugin-pack-conventions.md +151 -0
package/kb/reference/quick-wins-implementation-summary.md +70 -0
package/kb/reference/skill-templates.md +50 -0
package/kb/reference/skills-catalog.md +215 -0
package/kb/reference/skills-unification.md +57 -0
package/kb/reference/stats.md +69 -0
package/kb/reference/sync.md +76 -0
package/kb/troubleshooting/README.md +11 -0
package/llms-full.txt +3068 -0
package/llms.txt +39 -0
package/package.json +75 -0
package/scripts/_common.py +160 -0
package/scripts/add_rule.py +50 -0
package/scripts/benchmark_config.py +127 -0
package/scripts/benchmark_ecosystem.py +288 -0
package/scripts/check_deps.py +260 -0
package/scripts/create_skill.py +118 -0
package/scripts/doctor.py +504 -0
package/scripts/eject.py +113 -0
package/scripts/emission.py +256 -0
package/scripts/evaluate_skills.py +260 -0
package/scripts/frontmatter.py +58 -0
package/scripts/generate_agents_md.py +91 -0
package/scripts/generate_aider_conf.py +51 -0
package/scripts/generate_cline.py +35 -0
package/scripts/generate_copilot.py +30 -0
package/scripts/generate_cursor_rules.py +35 -0
package/scripts/generate_gemini.py +28 -0
package/scripts/generate_llms_txt.py +164 -0
package/scripts/generate_roo_modes.py +80 -0
package/scripts/generate_windsurf.py +35 -0
package/scripts/generator_base.py +140 -0
package/scripts/harvest_ecosystem.py +50 -0
package/scripts/inject_rule_cli.py +101 -0
package/scripts/inject_section_cli.py +47 -0
package/scripts/injection.py +180 -0
package/scripts/install.py +236 -0
package/scripts/install_git_hooks.py +71 -0
package/scripts/install_steps/__init__.py +5 -0
package/scripts/install_steps/ai_tools.py +261 -0
package/scripts/install_steps/hooks.py +90 -0
package/scripts/install_steps/markers.py +79 -0
package/scripts/install_steps/symlinks.py +87 -0
package/scripts/merge-hooks.py +192 -0
package/scripts/plugin.py +642 -0
package/scripts/plugin_schema.py +138 -0
package/scripts/remove_rule.py +58 -0
package/scripts/stats.py +81 -0
package/scripts/sync.py +215 -0
package/scripts/uninstall.py +292 -0
package/scripts/validate.py +700 -0

package/benchmarks/ecosystem-harvest.json ADDED Viewed

@@ -0,0 +1,148 @@
+{
+  "generated_at": "2026-03-28T09:37:52Z",
+  "mode": "offline",
+  "snapshot_date": "2026-03-28",
+  "freshness": {
+    "stale_threshold_days": 30,
+    "age_days": 0,
+    "status": "fresh"
+  },
+  "summary": {
+    "repo_count": 6,
+    "stars_total": 206035,
+    "categories": [
+      "cross-tool",
+      "ecosystem-scale",
+      "hooks-reference",
+      "meta-tooling",
+      "official",
+      "practical-showcase"
+    ],
+    "official_repo": "anthropics/claude-code"
+  },
+  "comparison_matrix": [
+    {
+      "pattern": "plugin-manifest-support",
+      "current_state": "implemented",
+      "benchmark_signal": "official Claude Code plugin layout",
+      "priority": "high",
+      "evidence": [
+        "anthropics/claude-code"
+      ]
+    },
+    {
+      "pattern": "creator-workflows",
+      "current_state": "implemented",
+      "benchmark_signal": "meta-tooling for commands, hooks, agents, plugins",
+      "priority": "high",
+      "evidence": [
+        "anthropics/claude-code",
+        "alirezarezvani/claude-code-skill-factory"
+      ]
+    },
+    {
+      "pattern": "lifecycle-breadth",
+      "current_state": "implemented",
+      "benchmark_signal": "prompt governance, post-tool feedback, subagent hooks, session end",
+      "priority": "high",
+      "evidence": [
+        "disler/claude-code-hooks-mastery",
+        "ChrisWiles/claude-code-showcase"
+      ]
+    },
+    {
+      "pattern": "plugin-packs",
+      "current_state": "implemented-experimental",
+      "benchmark_signal": "modular domain packaging",
+      "priority": "medium",
+      "evidence": [
+        "anthropics/claude-code",
+        "affaan-m/everything-claude-code"
+      ]
+    },
+    {
+      "pattern": "benchmark-harvesting",
+      "current_state": "implemented",
+      "benchmark_signal": "repeatable evidence for docs and roadmap decisions",
+      "priority": "medium",
+      "evidence": [
+        "anthropics/claude-code",
+        "codeaholicguy/ai-devkit"
+      ]
+    }
+  ],
+  "repos": [
+    {
+      "repo": "anthropics/claude-code",
+      "category": "official",
+      "stars": 83535,
+      "updated_at": "2026-03-27T16:50:16Z",
+      "commands_md": 18,
+      "agents_md": 15,
+      "skills": 10,
+      "hook_settings_files": 5,
+      "notes": "Official Claude Code repo with plugin layout, development kits, and modular commands/agents/hooks.",
+      "source": "snapshot"
+    },
+    {
+      "repo": "affaan-m/everything-claude-code",
+      "category": "ecosystem-scale",
+      "stars": 111863,
+      "updated_at": "2026-03-27T16:55:18Z",
+      "commands_md": 271,
+      "agents_md": 152,
+      "skills": 397,
+      "hook_settings_files": 2,
+      "notes": "Large ecosystem catalog. High inspiration value, high discoverability-debt risk.",
+      "source": "snapshot"
+    },
+    {
+      "repo": "ChrisWiles/claude-code-showcase",
+      "category": "practical-showcase",
+      "stars": 5593,
+      "updated_at": "2026-03-27T13:13:35Z",
+      "commands_md": 6,
+      "agents_md": 2,
+      "skills": 6,
+      "hook_settings_files": 1,
+      "notes": "Practical edit-time hooks, branch safety, formatting, and testing patterns.",
+      "source": "snapshot"
+    },
+    {
+      "repo": "disler/claude-code-hooks-mastery",
+      "category": "hooks-reference",
+      "stars": 3421,
+      "updated_at": "2026-03-27T15:49:11Z",
+      "commands_md": 21,
+      "agents_md": 19,
+      "skills": 0,
+      "hook_settings_files": 1,
+      "notes": "Strong reference for lifecycle breadth, status lines, and operational hook patterns.",
+      "source": "snapshot"
+    },
+    {
+      "repo": "codeaholicguy/ai-devkit",
+      "category": "cross-tool",
+      "stars": 985,
+      "updated_at": "2026-03-27T00:00:00Z",
+      "commands_md": 0,
+      "agents_md": 0,
+      "skills": 0,
+      "hook_settings_files": 0,
+      "notes": "Cross-tool toolkit positioning benchmark.",
+      "source": "snapshot"
+    },
+    {
+      "repo": "alirezarezvani/claude-code-skill-factory",
+      "category": "meta-tooling",
+      "stars": 638,
+      "updated_at": "2026-03-27T00:00:00Z",
+      "commands_md": 0,
+      "agents_md": 0,
+      "skills": 0,
+      "hook_settings_files": 0,
+      "notes": "Skill/agent/prompt factory inspiration for creator workflows.",
+      "source": "snapshot"
+    }
+  ]
+}

package/benchmarks/results.json ADDED Viewed

@@ -0,0 +1,38 @@
+{
+  "run_date": "2026-03-26",
+  "model": "claude-sonnet-4-6",
+  "note": "B4 vanilla run found existing test file from with-toolkit run — tool_call count inflated, score still valid",
+  "with_toolkit": {
+    "agent_routing": "specialized (debugger, code-reviewer, test-engineer, documenter)",
+    "results": {
+      "B1": { "task": "Debug FastAPI JWT bug",           "score": 4, "max": 4, "tool_calls": 1 },
+      "B2": { "task": "Code review — 3 bugs",           "score": 4, "max": 4, "tool_calls": 1 },
+      "B3": { "task": "Refactor god function",          "score": 4, "max": 4, "tool_calls": 1 },
+      "B4": { "task": "Generate tests (payments)",      "score": 4, "max": 4, "tool_calls": 1 },
+      "B5": { "task": "Docs for Flask microservice",    "score": 4, "max": 4, "tool_calls": 3 }
+    },
+    "total_score": 20,
+    "max_score": 20,
+    "completion_rate": 1.0,
+    "avg_tool_calls": 1.4
+  },
+  "without_toolkit": {
+    "agent_routing": "general-purpose only",
+    "results": {
+      "B1": { "task": "Debug FastAPI JWT bug",           "score": 4, "max": 4, "tool_calls": 1 },
+      "B2": { "task": "Code review — 3 bugs",           "score": 4, "max": 4, "tool_calls": 1 },
+      "B3": { "task": "Refactor god function",          "score": 4, "max": 4, "tool_calls": 2 },
+      "B4": { "task": "Generate tests (payments)",      "score": 4, "max": 4, "tool_calls": 8 },
+      "B5": { "task": "Docs for Flask microservice",    "score": 4, "max": 4, "tool_calls": 10 }
+    },
+    "total_score": 20,
+    "max_score": 20,
+    "completion_rate": 1.0,
+    "avg_tool_calls": 4.4
+  },
+  "comparison": {
+    "score_delta": 0,
+    "tool_call_reduction_pct": 68,
+    "finding": "Same accuracy on isolated single-file tasks. Toolkit advantage: 3.1x fewer tool calls overall (avg 4.4 vs 1.4), driven by the complex tasks (B4/B5). Real gains expected on multi-file, multi-step scenarios where agent specialization and skill context matter."
+  }
+}

package/benchmarks/run.py ADDED Viewed

@@ -0,0 +1,351 @@
+#!/usr/bin/env python3
+"""benchmarks/run.py — structured benchmark runner for ai-toolkit.
+Usage:
+    ./benchmarks/run.py          Show available benchmarks
+    ./benchmarks/run.py B1       Scaffold B1 benchmark environment
+    ./benchmarks/run.py all      Scaffold all 5 benchmarks
+    ./benchmarks/run.py --report Print last results
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+# Two directory levels above this file, i.e. the directory that contains the
+# folder this script lives in.
+TOOLKIT_DIR = Path(__file__).resolve().parent.parent
+# The "benchmarks" folder directly under TOOLKIT_DIR.
+BENCHMARKS_DIR = TOOLKIT_DIR / "benchmarks"
+# JSON file whose raw contents are printed by the `--report` command.
+RESULTS_FILE = BENCHMARKS_DIR / "results.json"
+# Help text printed when the script is run with no argument or with an
+# unrecognised one. The leading backslash suppresses the initial newline.
+USAGE_TEXT = """\
+ai-toolkit Benchmark Runner
+Usage:
+  ./benchmarks/run.py              Show available benchmarks
+  ./benchmarks/run.py B1           Scaffold B1 (debug) benchmark environment
+  ./benchmarks/run.py all          Scaffold all 5 benchmarks
+  ./benchmarks/run.py --report     Print results from last run
+Benchmarks:
+  B1  Debug multi-file bug (FastAPI JWT authentication)
+  B2  Code review (SQL injection, error handling, N+1)
+  B3  Refactor to clean code (god function -> SRP)
+  B4  Generate tests (payment processing module)
+  B5  Generate documentation (Flask microservice)
+Methodology:
+  Each benchmark run measures:
+    - time_to_first_output (seconds)
+    - tool_calls (count)
+    - completion_rate (0.0-1.0)
+    - corrections_needed (count)
+  Run each benchmark WITH and WITHOUT the toolkit, compare.
+  See benchmarks/README.md for full methodology.
+"""
+# ---------------------------------------------------------------------------
+# Benchmark scaffolding files
+# ---------------------------------------------------------------------------
+# Fixture source for benchmark B1, written to "auth.py" by scaffold_b1().
+# The bare `except: pass` is the seeded defect the benchmark asks the agent to
+# find; it must stay in place, so do not "fix" this string.
+B1_AUTH = '''\
+from fastapi import FastAPI, Header, HTTPException
+import jwt
+app = FastAPI()
+SECRET = "mysecret"
+@app.get("/protected")
+def protected(authorization: str = Header(None)):
+    # BUG: token validation fails silently — no exception raised on invalid token
+    try:
+        payload = jwt.decode(authorization, SECRET, algorithms=["HS256"])
+    except:
+        pass  # silent failure — any token passes
+    return {"user": payload.get("sub", "unknown")}
+'''
+# Fixture source for benchmark B2, written to "users.py" by scaffold_b2().
+# It carries three seeded review findings, labelled BUG 1-3 in the text:
+# SQL injection, an N+1 query loop, and a connection that is not closed when
+# an exception occurs. They are intentional and must be preserved.
+B2_USERS = '''\
+import sqlite3
+def get_user(db_path, username):
+    # BUG 1: SQL injection
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(f"SELECT * FROM users WHERE username = '{username}'")
+    return cursor.fetchone()
+def get_user_posts(db_path, user_ids):
+    # BUG 2: N+1 query
+    conn = sqlite3.connect(db_path)
+    posts = []
+    for uid in user_ids:
+        cursor = conn.cursor()
+        cursor.execute("SELECT * FROM posts WHERE user_id = ?", (uid,))
+        posts.extend(cursor.fetchall())
+    return posts
+def delete_user(db_path, user_id):
+    # BUG 3: No error handling — conn.close() never called on exception
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("DELETE FROM users WHERE id = ?", (user_id,))
+    conn.commit()
+    conn.close()
+'''
+# Fixture source for benchmark B3, written to "order_processor.py" by
+# scaffold_b3(). process_order() deliberately mixes validation, pricing,
+# persistence and notification so the agent has something to refactor.
+# This literal is NOT a raw string: "\\n" below is emitted as the two
+# characters backslash-n, which is what the generated f-string needs. Single
+# quotes need no escaping inside a triple-single-quoted literal; escaping
+# them would put a backslash inside the generated f-string's replacement
+# field and make the scaffolded file a SyntaxError.
+B3_ORDER = '''\
+import sqlite3
+import smtplib
+import logging
+from datetime import datetime
+# God function — 4 responsibilities in one: validate, calculate, persist, notify
+def process_order(db_path, customer_id, items, discount_code, email):
+    # 1. Validate
+    if not items:
+        raise ValueError("No items")
+    if not customer_id:
+        raise ValueError("No customer")
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("SELECT * FROM customers WHERE id = ?", (customer_id,))
+    customer = cursor.fetchone()
+    if not customer:
+        raise ValueError("Customer not found")
+    # 2. Calculate total
+    total = 0
+    for item in items:
+        cursor.execute("SELECT price FROM products WHERE id = ?", (item["product_id"],))
+        row = cursor.fetchone()
+        if not row:
+            raise ValueError(f"Product {item['product_id']} not found")
+        total += row[0] * item["quantity"]
+    if discount_code == "SAVE10":
+        total *= 0.9
+    elif discount_code == "SAVE20":
+        total *= 0.8
+    tax = total * 0.23
+    total_with_tax = total + tax
+    # 3. Persist
+    cursor.execute(
+        "INSERT INTO orders (customer_id, total, tax, created_at) VALUES (?, ?, ?, ?)",
+        (customer_id, total_with_tax, tax, datetime.now().isoformat())
+    )
+    order_id = cursor.lastrowid
+    for item in items:
+        cursor.execute(
+            "INSERT INTO order_items (order_id, product_id, quantity) VALUES (?, ?, ?)",
+            (order_id, item["product_id"], item["quantity"])
+        )
+    conn.commit()
+    conn.close()
+    # 4. Notify
+    try:
+        server = smtplib.SMTP("smtp.example.com", 587)
+        server.starttls()
+        server.login("noreply@example.com", "password123")
+        server.sendmail(
+            "noreply@example.com", email,
+            f"Subject: Order #{order_id} confirmed\\n\\nTotal: {total_with_tax:.2f}"
+        )
+        server.quit()
+    except Exception as e:
+        logging.error(f"Email failed: {e}")
+    return order_id
+'''
+# Fixture source for benchmark B4, written to "payments.py" by scaffold_b4().
+# It defines a Payment dataclass and a PaymentProcessor with five methods
+# besides __init__ (charge, refund, get_status, batch_charge, calculate_fee);
+# the benchmark task asks for unit tests covering them.
+B4_PAYMENTS = '''\
+import uuid
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class Payment:
+    id: str
+    amount: float
+    currency: str
+    status: str  # pending | completed | failed | refunded
+class PaymentProcessor:
+    def __init__(self, gateway_client):
+        self.gateway = gateway_client
+    def charge(self, amount: float, currency: str, card_token: str) -> Payment:
+        if amount <= 0:
+            raise ValueError("Amount must be positive")
+        if currency not in ("USD", "EUR", "GBP"):
+            raise ValueError(f"Unsupported currency: {currency}")
+        result = self.gateway.charge(card_token, amount, currency)
+        return Payment(id=result["id"], amount=amount, currency=currency, status="completed")
+    def refund(self, payment: Payment, amount: Optional[float] = None) -> Payment:
+        if payment.status != "completed":
+            raise ValueError("Can only refund completed payments")
+        refund_amount = amount or payment.amount
+        if refund_amount > payment.amount:
+            raise ValueError("Refund exceeds original amount")
+        self.gateway.refund(payment.id, refund_amount)
+        return Payment(id=payment.id, amount=refund_amount, currency=payment.currency, status="refunded")
+    def get_status(self, payment_id: str) -> str:
+        result = self.gateway.get_payment(payment_id)
+        return result.get("status", "unknown")
+    def batch_charge(self, charges: list) -> list:
+        results = []
+        for charge in charges:
+            try:
+                p = self.charge(charge["amount"], charge["currency"], charge["card_token"])
+                results.append({"success": True, "payment": p})
+            except Exception as e:
+                results.append({"success": False, "error": str(e)})
+        return results
+    def calculate_fee(self, amount: float, currency: str) -> float:
+        base_fee = 0.029 * amount + 0.30
+        if currency != "USD":
+            base_fee *= 1.015  # FX surcharge
+        return round(base_fee, 2)
+'''
+# Fixture source for benchmark B5, written to "app.py" by scaffold_b5().
+# A Flask app exposing list/create/update/delete routes under /tasks, each
+# wrapped in require_api_key. It imports from "models" and "auth", which
+# scaffold_b5() writes alongside it from B5_MODELS and B5_AUTH.
+B5_APP = '''\
+from flask import Flask, request, jsonify
+from models import db, Task
+from auth import require_api_key
+app = Flask(__name__)
+app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///tasks.db"
+db.init_app(app)
+@app.route("/tasks", methods=["GET"])
+@require_api_key
+def list_tasks():
+    tasks = Task.query.all()
+    return jsonify([t.to_dict() for t in tasks])
+@app.route("/tasks", methods=["POST"])
+@require_api_key
+def create_task():
+    data = request.get_json()
+    task = Task(title=data["title"], done=False)
+    db.session.add(task)
+    db.session.commit()
+    return jsonify(task.to_dict()), 201
+@app.route("/tasks/<int:task_id>", methods=["PATCH"])
+@require_api_key
+def update_task(task_id):
+    task = Task.query.get_or_404(task_id)
+    data = request.get_json()
+    if "done" in data:
+        task.done = data["done"]
+    if "title" in data:
+        task.title = data["title"]
+    db.session.commit()
+    return jsonify(task.to_dict())
+@app.route("/tasks/<int:task_id>", methods=["DELETE"])
+@require_api_key
+def delete_task(task_id):
+    task = Task.query.get_or_404(task_id)
+    db.session.delete(task)
+    db.session.commit()
+    return "", 204
+'''
+# Fixture source for benchmark B5, written to "models.py" by scaffold_b5().
+# NOTE(review): Task is a plain annotated class with no base class, while
+# B5_APP calls Task.query and Task(title=..., done=False). Presumably this is
+# acceptable because B5 is a documentation task and the app is not meant to
+# be run — confirm before relying on the fixture being executable.
+B5_MODELS = '''\
+from flask_sqlalchemy import SQLAlchemy
+db = SQLAlchemy()
+class Task:
+    id: int
+    title: str
+    done: bool
+    def to_dict(self):
+        return {"id": self.id, "title": self.title, "done": self.done}
+'''
+# Fixture source for benchmark B5, written to "auth.py" by scaffold_b5().
+# Defines the require_api_key decorator used by B5_APP: it compares the
+# X-API-Key request header with the API_KEY environment variable (falling
+# back to "dev-key") and answers 401 on mismatch.
+B5_AUTH = '''\
+import os
+from functools import wraps
+from flask import request, jsonify
+API_KEY = os.environ.get("API_KEY", "dev-key")
+def require_api_key(f):
+    @wraps(f)
+    def decorated(*args, **kwargs):
+        key = request.headers.get("X-API-Key")
+        if key != API_KEY:
+            return jsonify({"error": "Unauthorized"}), 401
+        return f(*args, **kwargs)
+    return decorated
+'''
+def scaffold(name: str, directory: str, files: dict[str, str], task_desc: str) -> None:
+    """Materialise one benchmark's fixture files and announce the task.
+    Args:
+        name: Benchmark label used in the confirmation message (e.g. "B1").
+        directory: Destination folder; created with any missing parents.
+        files: Mapping of file name (relative to *directory*) to file content.
+        task_desc: One-line task description printed after scaffolding.
+    """
+    target_dir = Path(directory)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    for relative_name in files:
+        destination = target_dir / relative_name
+        # Existing files are overwritten; content is always written as UTF-8.
+        destination.write_text(files[relative_name], encoding="utf-8")
+    print(f"{name} scaffolded at: {directory}", f"Task: {task_desc}", sep="\n")
+def scaffold_b1(d: str = "/tmp/benchmark-b1") -> None:
+    """Scaffold benchmark B1 (a single auth.py built from B1_AUTH) into *d*."""
+    fixture_files = {"auth.py": B1_AUTH}
+    task = "Find why any JWT token (including invalid ones) is accepted."
+    scaffold("B1", d, fixture_files, task)
+def scaffold_b2(d: str = "/tmp/benchmark-b2") -> None:
+    """Scaffold benchmark B2 (a single users.py built from B2_USERS) into *d*."""
+    fixture_files = {"users.py": B2_USERS}
+    task = "Find 3 issues: SQL injection, N+1 query, missing error handling."
+    scaffold("B2", d, fixture_files, task)
+def scaffold_b3(d: str = "/tmp/benchmark-b3") -> None:
+    """Scaffold benchmark B3 (order_processor.py built from B3_ORDER) into *d*."""
+    fixture_files = {"order_processor.py": B3_ORDER}
+    task = "Refactor process_order() — split into 4 single-responsibility functions."
+    scaffold("B3", d, fixture_files, task)
+def scaffold_b4(d: str = "/tmp/benchmark-b4") -> None:
+    """Scaffold benchmark B4 (a single payments.py built from B4_PAYMENTS) into *d*."""
+    fixture_files = {"payments.py": B4_PAYMENTS}
+    task = "Generate unit tests for all 5 PaymentProcessor methods with edge cases."
+    scaffold("B4", d, fixture_files, task)
+def scaffold_b5(d: str = "/tmp/benchmark-b5") -> None:
+    """Scaffold benchmark B5 (app.py, models.py and auth.py) into *d*."""
+    fixture_files = {
+        "app.py": B5_APP,
+        "models.py": B5_MODELS,
+        "auth.py": B5_AUTH,
+    }
+    task = "Generate README, API docs, and docstrings for this Flask task API."
+    scaffold("B5", d, fixture_files, task)
+# Command-line benchmark id -> scaffolding function. Insertion order is the
+# order in which the "all" command scaffolds the benchmarks.
+SCAFFOLDERS = {
+    "B1": scaffold_b1,
+    "B2": scaffold_b2,
+    "B3": scaffold_b3,
+    "B4": scaffold_b4,
+    "B5": scaffold_b5,
+}
+def main() -> None:
+    """Dispatch the command line to a report, a scaffold run, or the usage text.
+    ``sys.argv[1]`` selects the command: ``--report`` prints RESULTS_FILE,
+    ``all`` scaffolds every entry of SCAFFOLDERS under /tmp, and a key of
+    SCAFFOLDERS scaffolds that one benchmark. ``sys.argv[2]`` is an optional
+    target directory, honoured only when a single benchmark is scaffolded.
+    With no argument the usage text is printed.
+    Raises:
+        SystemExit: with status 1 when the command is not recognised.
+    """
+    arg = sys.argv[1] if len(sys.argv) > 1 else ""
+    target = sys.argv[2] if len(sys.argv) > 2 else ""
+    if arg == "--report":
+        if RESULTS_FILE.is_file():
+            # Decode explicitly as UTF-8: the results file may contain
+            # non-ASCII text (e.g. em dashes) and the platform default
+            # encoding is not guaranteed to be UTF-8.
+            print(RESULTS_FILE.read_text(encoding="utf-8"))
+        else:
+            print("No results yet. Run benchmarks first.")
+            print(f"Results will be saved to: {RESULTS_FILE}")
+    elif arg == "all":
+        # The optional target directory is intentionally ignored here; each
+        # benchmark gets its own /tmp/benchmark-<id> folder.
+        for name, fn in SCAFFOLDERS.items():
+            fn(f"/tmp/benchmark-{name.lower()}")
+    elif arg in SCAFFOLDERS:
+        d = target or f"/tmp/benchmark-{arg.lower()}"
+        SCAFFOLDERS[arg](d)
+    elif arg:
+        print(f"Unknown: {arg}")
+        print(USAGE_TEXT)
+        sys.exit(1)
+    else:
+        print(USAGE_TEXT)
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()