@shakudo/kaji-setup-external 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +155 -0
- package/assets/skills/ci-cd/.claude-plugin/plugin.json +8 -0
- package/assets/skills/ci-cd/SKILL.md +573 -0
- package/assets/skills/ci-cd/assets/templates/github-actions/docker-build.yml +164 -0
- package/assets/skills/ci-cd/assets/templates/github-actions/go-ci.yml +420 -0
- package/assets/skills/ci-cd/assets/templates/github-actions/node-ci.yml +313 -0
- package/assets/skills/ci-cd/assets/templates/github-actions/python-ci.yml +388 -0
- package/assets/skills/ci-cd/assets/templates/github-actions/security-scan.yml +416 -0
- package/assets/skills/ci-cd/assets/templates/gitlab-ci/docker-build.yml +298 -0
- package/assets/skills/ci-cd/assets/templates/gitlab-ci/go-ci.yml +548 -0
- package/assets/skills/ci-cd/assets/templates/gitlab-ci/node-ci.yml +334 -0
- package/assets/skills/ci-cd/assets/templates/gitlab-ci/python-ci.yml +472 -0
- package/assets/skills/ci-cd/assets/templates/gitlab-ci/security-scan.yml +479 -0
- package/assets/skills/ci-cd/references/best_practices.md +675 -0
- package/assets/skills/ci-cd/references/devsecops.md +862 -0
- package/assets/skills/ci-cd/references/optimization.md +651 -0
- package/assets/skills/ci-cd/references/security.md +611 -0
- package/assets/skills/ci-cd/references/troubleshooting.md +656 -0
- package/assets/skills/ci-cd/scripts/ci_health.py +301 -0
- package/assets/skills/ci-cd/scripts/pipeline_analyzer.py +440 -0
- package/assets/skills/context-optimization/CONTRIBUTING.md +78 -0
- package/assets/skills/context-optimization/LICENSE +22 -0
- package/assets/skills/context-optimization/README.md +228 -0
- package/assets/skills/context-optimization/SKILL.md +104 -0
- package/assets/skills/context-optimization/docs/agentskills.md +1264 -0
- package/assets/skills/context-optimization/docs/blogs.md +1230 -0
- package/assets/skills/context-optimization/docs/claude_research.md +85 -0
- package/assets/skills/context-optimization/docs/compression.md +298 -0
- package/assets/skills/context-optimization/docs/gemini_research.md +22 -0
- package/assets/skills/context-optimization/docs/hncapsule.md +92 -0
- package/assets/skills/context-optimization/docs/netflix_context.md +10 -0
- package/assets/skills/context-optimization/docs/vercel_tool.md +140 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/README.md +78 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/SKILL.md +380 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/README.md +168 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/dataset_sample.jsonl +5 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.04 AM.png +0 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.36 AM.png +0 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.07.18 AM.png +0 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/sample_outputs.md +63 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/training_config.json +80 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/references/segmentation-strategies.md +324 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker-format.md +211 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker.txt +3176 -0
- package/assets/skills/context-optimization/examples/book-sft-pipeline/scripts/pipeline_example.py +187 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/AGENT.md +35 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/HOW-SKILLS-BUILT-THIS.md +407 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/README.md +209 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/SKILL.md +203 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/SKILLS-MAPPING.md +219 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/agents/AGENTS.md +82 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/content_ideas.py +132 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/idea_to_draft.py +181 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/stale_contacts.py +139 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/weekly_review.py +121 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/CONTENT.md +88 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/calendar.md +108 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/engagement.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/ideas.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/posts.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/linkedin-post.md +102 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/newsletter.md +92 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/thread.md +73 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/examples/content-workflow.md +204 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/examples/meeting-prep.md +243 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/IDENTITY.md +46 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/bio-variants.md +101 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/brand.md +165 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/content-generation.xml +46 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/reply-generator.xml +40 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/values.yaml +60 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/identity/voice.md +165 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/KNOWLEDGE.md +85 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/bookmarks.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/competitors.md +117 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/learning.yaml +74 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/research/_template.md +79 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/network/NETWORK.md +110 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/network/circles.yaml +80 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/network/contacts.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/network/interactions.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/network/intros.md +92 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/OPERATIONS.md +75 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/goals.yaml +83 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/meetings.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/metrics.jsonl +2 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/reviews/_weekly_template.md +114 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/operations/todos.md +76 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/package.json +41 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/references/file-formats.md +386 -0
- package/assets/skills/context-optimization/examples/digital-brain-skill/scripts/install.sh +79 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/README.md +620 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/SKILL.md +221 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/docs/agentthinking.md +63 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/docs/interleavedthinking.md +610 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/docs/m2-1.md +224 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/examples/01_basic_capture.py +76 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/examples/02_tool_usage.py +187 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/examples/03_full_optimization.py +1222 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/SKILL.md +90 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimization_summary.json +9 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimized_prompt.txt +1 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/patterns_found.json +205 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/final_prompt.txt +67 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/analysis.txt +48 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimization.txt +15 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimized_prompt.txt +1 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/trace.txt +178 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/analysis.txt +47 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/trace.txt +162 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/analysis.txt +48 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimization.txt +130 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimized_prompt.txt +72 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/trace.txt +156 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/analysis.txt +46 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimization.txt +147 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimized_prompt.txt +84 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/trace.txt +159 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/analysis.txt +46 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimization.txt +134 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimized_prompt.txt +67 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/trace.txt +165 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/analysis.txt +50 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimization.txt +135 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimized_prompt.txt +71 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/trace.txt +146 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/analysis.txt +15 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimization.txt +15 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimized_prompt.txt +1 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/trace.txt +147 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/analysis.txt +46 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimization.txt +103 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimized_prompt.txt +45 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/trace.txt +134 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/analysis.txt +47 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimization.txt +114 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimized_prompt.txt +60 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/trace.txt +135 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/analysis.txt +44 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimization.txt +106 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimized_prompt.txt +51 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/trace.txt +170 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/summary.json +11 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/pyproject.toml +70 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/__init__.py +53 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/analyzer.py +465 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/capture.py +417 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/cli.py +271 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/loop.py +468 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/models.py +193 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/optimizer.py +449 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/skill_generator.py +502 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/tests/__init__.py +1 -0
- package/assets/skills/context-optimization/examples/interleaved_thinking/tests/test_models.py +144 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/.prettierrc +8 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/CONTRIBUTING.md +78 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/LICENSE +21 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/README.md +659 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/evaluator-agent/evaluator-agent.md +177 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/index.md +114 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/orchestrator-agent/orchestrator-agent.md +205 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/research-agent/research-agent.md +183 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/env.example +6 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/eslint.config.js +18 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/basic-evaluation.ts +89 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/full-evaluation-workflow.ts +136 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/generate-rubric.ts +67 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/pairwise-comparison.ts +97 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/package.json +79 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/agent-system/orchestrator-prompt.md +197 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/direct-scoring-prompt.md +153 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/pairwise-comparison-prompt.md +200 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/index.md +138 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/research/research-synthesis-prompt.md +171 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/context-fundamentals/context-fundamentals.md +114 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/index.md +79 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/llm-evaluator/llm-evaluator.md +77 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/tool-design/tool-design.md +198 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/evaluator.ts +112 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/index.ts +3 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/config/index.ts +18 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/index.ts +19 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts +164 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/generate-rubric.ts +161 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/index.ts +9 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts +255 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/evaluation.test.ts +233 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/setup.ts +27 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/skills.test.ts +213 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/direct-score.md +159 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/generate-rubric.md +189 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/pairwise-compare.md +182 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/index.md +141 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/orchestration/delegate-to-agent.md +171 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/read-url.md +162 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/web-search.md +128 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/tsconfig.json +26 -0
- package/assets/skills/context-optimization/examples/llm-as-judge-skills/vitest.config.ts +20 -0
- package/assets/skills/context-optimization/examples/x-to-book-system/PRD.md +644 -0
- package/assets/skills/context-optimization/examples/x-to-book-system/README.md +181 -0
- package/assets/skills/context-optimization/examples/x-to-book-system/SKILLS-MAPPING.md +187 -0
- package/assets/skills/context-optimization/researcher/example_output.md +75 -0
- package/assets/skills/context-optimization/researcher/llm-as-a-judge.md +362 -0
- package/assets/skills/context-optimization/skills/advanced-evaluation/SKILL.md +454 -0
- package/assets/skills/context-optimization/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
- package/assets/skills/context-optimization/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
- package/assets/skills/context-optimization/skills/advanced-evaluation/references/metrics-guide.md +331 -0
- package/assets/skills/context-optimization/skills/advanced-evaluation/scripts/evaluation_example.py +337 -0
- package/assets/skills/context-optimization/skills/bdi-mental-states/SKILL.md +295 -0
- package/assets/skills/context-optimization/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
- package/assets/skills/context-optimization/skills/bdi-mental-states/references/framework-integration.md +582 -0
- package/assets/skills/context-optimization/skills/bdi-mental-states/references/rdf-examples.md +315 -0
- package/assets/skills/context-optimization/skills/bdi-mental-states/references/sparql-competency.md +420 -0
- package/assets/skills/context-optimization/skills/context-compression/SKILL.md +265 -0
- package/assets/skills/context-optimization/skills/context-compression/references/evaluation-framework.md +213 -0
- package/assets/skills/context-optimization/skills/context-compression/scripts/compression_evaluator.py +658 -0
- package/assets/skills/context-optimization/skills/context-degradation/SKILL.md +231 -0
- package/assets/skills/context-optimization/skills/context-degradation/references/patterns.md +314 -0
- package/assets/skills/context-optimization/skills/context-degradation/scripts/degradation_detector.py +419 -0
- package/assets/skills/context-optimization/skills/context-fundamentals/SKILL.md +185 -0
- package/assets/skills/context-optimization/skills/context-fundamentals/references/context-components.md +283 -0
- package/assets/skills/context-optimization/skills/context-fundamentals/scripts/context_manager.py +370 -0
- package/assets/skills/context-optimization/skills/context-optimization/SKILL.md +179 -0
- package/assets/skills/context-optimization/skills/context-optimization/references/optimization_techniques.md +272 -0
- package/assets/skills/context-optimization/skills/context-optimization/scripts/compaction.py +379 -0
- package/assets/skills/context-optimization/skills/evaluation/SKILL.md +231 -0
- package/assets/skills/context-optimization/skills/evaluation/references/metrics.md +339 -0
- package/assets/skills/context-optimization/skills/evaluation/scripts/evaluator.py +474 -0
- package/assets/skills/context-optimization/skills/filesystem-context/SKILL.md +321 -0
- package/assets/skills/context-optimization/skills/filesystem-context/references/implementation-patterns.md +549 -0
- package/assets/skills/context-optimization/skills/filesystem-context/scripts/filesystem_context.py +353 -0
- package/assets/skills/context-optimization/skills/hosted-agents/SKILL.md +279 -0
- package/assets/skills/context-optimization/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
- package/assets/skills/context-optimization/skills/hosted-agents/scripts/sandbox_manager.py +495 -0
- package/assets/skills/context-optimization/skills/memory-systems/SKILL.md +221 -0
- package/assets/skills/context-optimization/skills/memory-systems/references/implementation.md +458 -0
- package/assets/skills/context-optimization/skills/memory-systems/scripts/memory_store.py +396 -0
- package/assets/skills/context-optimization/skills/multi-agent-patterns/SKILL.md +255 -0
- package/assets/skills/context-optimization/skills/multi-agent-patterns/references/frameworks.md +433 -0
- package/assets/skills/context-optimization/skills/multi-agent-patterns/scripts/coordination.py +439 -0
- package/assets/skills/context-optimization/skills/project-development/SKILL.md +342 -0
- package/assets/skills/context-optimization/skills/project-development/references/case-studies.md +388 -0
- package/assets/skills/context-optimization/skills/project-development/references/pipeline-patterns.md +610 -0
- package/assets/skills/context-optimization/skills/project-development/scripts/pipeline_template.py +677 -0
- package/assets/skills/context-optimization/skills/tool-design/SKILL.md +311 -0
- package/assets/skills/context-optimization/skills/tool-design/references/architectural_reduction.md +210 -0
- package/assets/skills/context-optimization/skills/tool-design/references/best_practices.md +176 -0
- package/assets/skills/context-optimization/skills/tool-design/scripts/description_generator.py +237 -0
- package/assets/skills/context-optimization/template/SKILL.md +98 -0
- package/assets/skills/dremio-analytics/SKILL.md +287 -0
- package/assets/skills/elevenlabs-voice/SKILL.md +269 -0
- package/assets/skills/git-workflow/SKILL.md +266 -0
- package/assets/skills/gitops-workflows/.claude-plugin/plugin.json +8 -0
- package/assets/skills/gitops-workflows/SKILL.md +568 -0
- package/assets/skills/gitops-workflows/assets/applicationsets/cluster-generator.yaml +32 -0
- package/assets/skills/gitops-workflows/assets/argocd/install-argocd-3.x.yaml +92 -0
- package/assets/skills/gitops-workflows/assets/flux/flux-bootstrap-github.sh +49 -0
- package/assets/skills/gitops-workflows/assets/flux/oci-helmrelease.yaml +38 -0
- package/assets/skills/gitops-workflows/assets/progressive-delivery/argo-rollouts-canary.yaml +62 -0
- package/assets/skills/gitops-workflows/assets/secrets/sops-age-config.yaml +33 -0
- package/assets/skills/gitops-workflows/references/argocd_vs_flux.md +243 -0
- package/assets/skills/gitops-workflows/references/best_practices.md +160 -0
- package/assets/skills/gitops-workflows/references/multi_cluster.md +80 -0
- package/assets/skills/gitops-workflows/references/oci_artifacts.md +290 -0
- package/assets/skills/gitops-workflows/references/progressive_delivery.md +94 -0
- package/assets/skills/gitops-workflows/references/repo_patterns.md +184 -0
- package/assets/skills/gitops-workflows/references/secret_management.md +213 -0
- package/assets/skills/gitops-workflows/references/troubleshooting.md +134 -0
- package/assets/skills/gitops-workflows/scripts/applicationset_generator.py +156 -0
- package/assets/skills/gitops-workflows/scripts/check_argocd_health.py +275 -0
- package/assets/skills/gitops-workflows/scripts/check_flux_health.py +418 -0
- package/assets/skills/gitops-workflows/scripts/oci_artifact_checker.py +150 -0
- package/assets/skills/gitops-workflows/scripts/promotion_validator.py +88 -0
- package/assets/skills/gitops-workflows/scripts/secret_audit.py +178 -0
- package/assets/skills/gitops-workflows/scripts/sync_drift_detector.py +144 -0
- package/assets/skills/gitops-workflows/scripts/validate_gitops_repo.py +299 -0
- package/assets/skills/iac-terraform/.claude-plugin/plugin.json +8 -0
- package/assets/skills/iac-terraform/SKILL.md +653 -0
- package/assets/skills/iac-terraform/assets/templates/MODULE_TEMPLATE.md +386 -0
- package/assets/skills/iac-terraform/assets/workflows/github-actions-terraform.yml +224 -0
- package/assets/skills/iac-terraform/assets/workflows/github-actions-terragrunt.yml +236 -0
- package/assets/skills/iac-terraform/assets/workflows/gitlab-ci-terraform.yml +184 -0
- package/assets/skills/iac-terraform/references/best_practices.md +709 -0
- package/assets/skills/iac-terraform/references/cost_optimization.md +665 -0
- package/assets/skills/iac-terraform/references/troubleshooting.md +635 -0
- package/assets/skills/iac-terraform/scripts/init_module.py +319 -0
- package/assets/skills/iac-terraform/scripts/inspect_state.py +232 -0
- package/assets/skills/iac-terraform/scripts/validate_module.py +227 -0
- package/assets/skills/k8s-troubleshooter/.claude-plugin/plugin.json +8 -0
- package/assets/skills/k8s-troubleshooter/SKILL.md +336 -0
- package/assets/skills/k8s-troubleshooter/references/common_issues.md +582 -0
- package/assets/skills/k8s-troubleshooter/references/helm_troubleshooting.md +708 -0
- package/assets/skills/k8s-troubleshooter/references/incident_response.md +466 -0
- package/assets/skills/k8s-troubleshooter/references/performance_troubleshooting.md +687 -0
- package/assets/skills/k8s-troubleshooter/scripts/check_namespace.py +500 -0
- package/assets/skills/k8s-troubleshooter/scripts/cluster_health.py +223 -0
- package/assets/skills/k8s-troubleshooter/scripts/diagnose_pod.py +157 -0
- package/assets/skills/mattermost-notify/SKILL.md +248 -0
- package/assets/skills/monitoring-observability/SKILL.md +869 -0
- package/assets/skills/monitoring-observability/assets/templates/otel-config/collector-config.yaml +227 -0
- package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/kubernetes-alerts.yml +293 -0
- package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/webapp-alerts.yml +243 -0
- package/assets/skills/monitoring-observability/assets/templates/runbooks/incident-runbook-template.md +409 -0
- package/assets/skills/monitoring-observability/monitoring-observability.skill +0 -0
- package/assets/skills/monitoring-observability/references/alerting_best_practices.md +609 -0
- package/assets/skills/monitoring-observability/references/datadog_migration.md +649 -0
- package/assets/skills/monitoring-observability/references/dql_promql_translation.md +756 -0
- package/assets/skills/monitoring-observability/references/logging_guide.md +775 -0
- package/assets/skills/monitoring-observability/references/metrics_design.md +406 -0
- package/assets/skills/monitoring-observability/references/slo_sla_guide.md +652 -0
- package/assets/skills/monitoring-observability/references/tool_comparison.md +697 -0
- package/assets/skills/monitoring-observability/references/tracing_guide.md +663 -0
- package/assets/skills/monitoring-observability/scripts/alert_quality_checker.py +315 -0
- package/assets/skills/monitoring-observability/scripts/analyze_metrics.py +279 -0
- package/assets/skills/monitoring-observability/scripts/dashboard_generator.py +395 -0
- package/assets/skills/monitoring-observability/scripts/datadog_cost_analyzer.py +477 -0
- package/assets/skills/monitoring-observability/scripts/health_check_validator.py +297 -0
- package/assets/skills/monitoring-observability/scripts/log_analyzer.py +321 -0
- package/assets/skills/monitoring-observability/scripts/slo_calculator.py +365 -0
- package/assets/skills/neo4j-graph-rag/SKILL.md +258 -0
- package/assets/skills/pagerduty-ops/SKILL.md +380 -0
- package/assets/skills/playwright/API_REFERENCE.md +653 -0
- package/assets/skills/playwright/SKILL.md +453 -0
- package/assets/skills/playwright/lib/helpers.js +441 -0
- package/assets/skills/playwright/package.json +26 -0
- package/assets/skills/playwright/run.js +228 -0
- package/assets/skills/project-memory/README.md +687 -0
- package/assets/skills/project-memory/SKILL.md +298 -0
- package/assets/skills/project-memory/references/bugs_template.md +41 -0
- package/assets/skills/project-memory/references/decisions_template.md +92 -0
- package/assets/skills/project-memory/references/issues_template.md +76 -0
- package/assets/skills/project-memory/references/key_facts_template.md +158 -0
- package/assets/skills/recruit-workflow/SKILL.md +276 -0
- package/assets/skills/recruit-workflow/references/email-templates.md +347 -0
- package/assets/skills/recruit-workflow/references/workflow-stages.md +395 -0
- package/assets/skills/recruit-workflow/scripts/clay_client.py +188 -0
- package/assets/skills/recruit-workflow/scripts/lever_client.py +197 -0
- package/assets/skills/recruit-workflow/scripts/mailgun_client.py +245 -0
- package/assets/skills/recruit-workflow/scripts/minio_client.py +426 -0
- package/assets/skills/shakudo-microservice/SKILL.md +215 -0
- package/assets/skills/tmux/SKILL.md +631 -0
- package/assets/skills/tmux/references/direct-socket-control.md +108 -0
- package/assets/skills/tmux/references/session-lifecycle.md +503 -0
- package/assets/skills/tmux/references/session-registry.md +1484 -0
- package/assets/skills/tmux/tools/cleanup-sessions.sh +263 -0
- package/assets/skills/tmux/tools/create-session.sh +224 -0
- package/assets/skills/tmux/tools/find-sessions.sh +262 -0
- package/assets/skills/tmux/tools/kill-session.sh +308 -0
- package/assets/skills/tmux/tools/lib/registry.sh +437 -0
- package/assets/skills/tmux/tools/lib/time_utils.sh +54 -0
- package/assets/skills/tmux/tools/list-sessions.sh +255 -0
- package/assets/skills/tmux/tools/pane-health.sh +424 -0
- package/assets/skills/tmux/tools/safe-send.sh +503 -0
- package/assets/skills/tmux/tools/wait-for-text.sh +260 -0
- package/assets/skills/twilio-sms/SKILL.md +508 -0
- package/assets/skills/zellij/SKILL.md +274 -0
- package/assets/skills/zellij/references/actions.md +558 -0
- package/assets/skills/zellij/references/layouts.md +424 -0
- package/bin/cli.ts +46 -0
- package/package.json +43 -0
- package/src/alias.ts +108 -0
- package/src/backup.ts +51 -0
- package/src/config.ts +115 -0
- package/src/dependencies.ts +163 -0
- package/src/errors.ts +77 -0
- package/src/index.ts +207 -0
- package/src/prompts.ts +142 -0
- package/src/schemas.ts +21 -0
- package/src/skills.ts +45 -0
- package/src/speckit.ts +116 -0
- package/src/types.ts +106 -0
- package/src/utils.ts +110 -0
- package/src/vibe-git.ts +50 -0
- package/templates/.specify/memory/constitution.md +109 -0
- package/templates/.specify/scripts/bash/check-prerequisites.sh +262 -0
- package/templates/.specify/scripts/bash/common.sh +670 -0
- package/templates/.specify/scripts/bash/create-new-feature.sh +594 -0
- package/templates/.specify/scripts/bash/create-worktree-feature.sh +401 -0
- package/templates/.specify/scripts/bash/init-workspace.sh +433 -0
- package/templates/.specify/scripts/bash/list-spec-worktrees.sh +198 -0
- package/templates/.specify/scripts/bash/setup-plan.sh +105 -0
- package/templates/.specify/scripts/bash/test-workspace-rollup.sh +175 -0
- package/templates/.specify/scripts/bash/update-agent-context.sh +799 -0
- package/templates/.specify/templates/agent-file-template.md +28 -0
- package/templates/.specify/templates/checklist-template.md +40 -0
- package/templates/.specify/templates/commands/analyze.md +197 -0
- package/templates/.specify/templates/commands/checklist.md +306 -0
- package/templates/.specify/templates/commands/clarify.md +194 -0
- package/templates/.specify/templates/commands/constitution.md +97 -0
- package/templates/.specify/templates/commands/implement.md +149 -0
- package/templates/.specify/templates/commands/plan.md +123 -0
- package/templates/.specify/templates/commands/projects.md +48 -0
- package/templates/.specify/templates/commands/rollup.md +66 -0
- package/templates/.specify/templates/commands/specify.md +275 -0
- package/templates/.specify/templates/commands/specs.md +71 -0
- package/templates/.specify/templates/commands/tasks.md +151 -0
- package/templates/.specify/templates/commands/taskstoissues.md +35 -0
- package/templates/.specify/templates/commands/workspace.md +128 -0
- package/templates/.specify/templates/plan-template.md +104 -0
- package/templates/.specify/templates/spec-template.md +115 -0
- package/templates/.specify/templates/tasks-template.md +251 -0
- package/templates/.specify/templates/workspace.yaml +110 -0
- package/templates/.specify/workspace.yaml +95 -0
- package/templates/AGENTS.md +460 -0
- package/templates/oh-my-opencode.json +27 -0
- package/templates/opencode.json +383 -0
- package/templates/package.json +10 -0
- package/templates/project-memory/bugs.md +16 -0
- package/templates/project-memory/decisions.md +22 -0
- package/templates/project-memory/issues.md +15 -0
- package/templates/project-memory/key_facts.md +26 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
# Evaluation Reference: Metrics and Implementation
|
|
2
|
+
|
|
3
|
+
This document provides implementation details for evaluation metrics and evaluation systems.
|
|
4
|
+
|
|
5
|
+
## Core Metric Definitions
|
|
6
|
+
|
|
7
|
+
### Factual Accuracy
|
|
8
|
+
|
|
9
|
+
Factual accuracy measures whether claims in agent output match ground truth.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Excellent (1.0): All claims verified against ground truth, no errors
|
|
13
|
+
Good (0.8): Minor errors that do not affect main conclusions
|
|
14
|
+
Acceptable (0.6): Major claims correct, minor inaccuracies present
|
|
15
|
+
Poor (0.3): Significant factual errors in key claims
|
|
16
|
+
Failed (0.0): Fundamental factual errors that invalidate output
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Calculation approach:
|
|
20
|
+
- Extract claims from output
|
|
21
|
+
- Verify each claim against ground truth
|
|
22
|
+
- Weight claims by importance (major claims more weight)
|
|
23
|
+
- Calculate weighted average of claim accuracy
|
|
24
|
+
|
|
25
|
+
### Completeness
|
|
26
|
+
|
|
27
|
+
Completeness measures whether output covers all requested aspects.
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
Excellent (1.0): All requested aspects thoroughly covered
|
|
31
|
+
Good (0.8): Most aspects covered with minor gaps
|
|
32
|
+
Acceptable (0.6): Key aspects covered, some gaps
|
|
33
|
+
Poor (0.3): Major aspects missing from output
|
|
34
|
+
Failed (0.0): Fundamental aspects not addressed
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Citation Accuracy
|
|
38
|
+
|
|
39
|
+
Citation accuracy measures whether each cited source exists and actually supports the claim attributed to it.
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
Excellent (1.0): All citations accurate and complete
|
|
43
|
+
Good (0.8): Minor citation formatting issues
|
|
44
|
+
Acceptable (0.6): Major citations accurate
|
|
45
|
+
Poor (0.3): Significant citation problems
|
|
46
|
+
Failed (0.0): Citations missing or completely incorrect
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Source Quality
|
|
50
|
+
|
|
51
|
+
Source quality measures whether appropriate primary sources were used.
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Excellent (1.0): Primary authoritative sources
|
|
55
|
+
Good (0.8): Mostly primary sources with some secondary
|
|
56
|
+
Acceptable (0.6): Mix of primary and secondary sources
|
|
57
|
+
Poor (0.3): Mostly secondary or unreliable sources
|
|
58
|
+
Failed (0.0): No credible sources cited
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Tool Efficiency
|
|
62
|
+
|
|
63
|
+
Tool efficiency measures whether the agent used appropriate tools a reasonable number of times.
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
Excellent (1.0): Optimal tool selection and call count
|
|
67
|
+
Good (0.8): Good tool selection with minor inefficiencies
|
|
68
|
+
Acceptable (0.6): Appropriate tools with some redundancy
|
|
69
|
+
Poor (0.3): Wrong tools or excessive call counts
|
|
70
|
+
Failed (0.0): Severe tool misuse or extremely excessive calls
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Rubric Implementation
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
# Rubric used to score agent output. Each dimension carries a relative
# weight (the five weights sum to 1.0), a short description, and the same
# five-level score scale. A fresh "levels" dict is built per dimension so
# that mutating one dimension's scale never affects another.
EVALUATION_DIMENSIONS = {
    name: {
        "weight": weight,
        "description": description,
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0,
        },
    }
    for name, weight, description in (
        ("factual_accuracy", 0.30, "Claims match ground truth"),
        ("completeness", 0.25, "All requested aspects covered"),
        ("citation_accuracy", 0.15, "Citations match sources"),
        ("source_quality", 0.10, "Appropriate primary sources used"),
        ("tool_efficiency", 0.20, "Right tools used reasonably"),
    )
}
|
|
133
|
+
|
|
134
|
+
def calculate_overall_score(dimension_scores, rubric):
    """Return the weight-normalized average of the scored dimensions.

    Args:
        dimension_scores: Mapping of dimension name to numeric score.
        rubric: Mapping of dimension name to a config dict that has a
            "weight" entry.

    Returns:
        The sum of ``score * weight`` divided by the sum of the weights,
        taken over the dimensions present in both mappings. Dimensions
        missing from ``rubric`` are ignored. Returns 0 when the
        accumulated weight is not positive (e.g. nothing matched).
    """
    numerator = 0
    denominator = 0

    for name, value in dimension_scores.items():
        if name not in rubric:
            # Scores for dimensions the rubric does not know are skipped.
            continue
        share = rubric[name]["weight"]
        numerator += value * share
        denominator += share

    if denominator > 0:
        return numerator / denominator
    return 0
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Test Set Management
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
class TestSet:
    """A named, ordered collection of test-case dicts with a tag index."""

    def __init__(self, name):
        self.name = name
        # Test cases in insertion order.
        self.tests = []
        # tag -> list of positions in self.tests carrying that tag.
        self.tags = {}

    def add_test(self, test_case):
        """Append ``test_case`` and record its position under each of its tags."""
        position = len(self.tests)
        self.tests.append(test_case)

        for label in test_case.get("tags", []):
            self.tags.setdefault(label, []).append(position)

    def filter(self, **criteria):
        """Return the tests whose fields equal every given keyword criterion.

        A test is kept unless ``test.get(key) != value`` holds for some
        criterion; with no criteria, every test is returned.
        """
        return [
            case
            for case in self.tests
            if not any(
                case.get(field) != wanted
                for field, wanted in criteria.items()
            )
        ]

    def get_complexity_distribution(self):
        """Count tests per "complexity" value; tests without one count as "medium"."""
        tally = {}
        for case in self.tests:
            level = case.get("complexity", "medium")
            if level in tally:
                tally[level] += 1
            else:
                tally[level] = 1
        return tally
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Evaluation Runner
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
class EvaluationRunner:
    """Run an agent over a test set and score each output against a rubric.

    Scoring of an individual dimension is delegated to
    ``evaluate_dimension``, which subclasses must override.

    Args:
        test_set: Object exposing a ``tests`` list of test-case dicts; each
            test needs an "input" entry and may carry an "expected" entry.
        rubric: Mapping of dimension name to a config dict with a "weight".
        agent: Object exposing ``run(input)`` that returns the agent output.
        pass_threshold: Minimum overall score for a test to count as
            passed. Defaults to 0.7.
    """

    def __init__(self, test_set, rubric, agent, pass_threshold=0.7):
        self.test_set = test_set
        self.rubric = rubric
        self.agent = agent
        self.pass_threshold = pass_threshold
        self.results = []

    def run_all(self, verbose=False):
        """Run every test in the test set and return the summary dict.

        Results from any previous run are discarded first.
        """
        self.results = []
        total = len(self.test_set.tests)

        for index, test in enumerate(self.test_set.tests, start=1):
            if verbose:
                print(f"Running test {index}/{total}")
            self.results.append(self.run_test(test))

        return self.summarize()

    def run_test(self, test):
        """Run the agent on one test and return test, output and evaluation."""
        output = self.agent.run(test["input"])
        evaluation = self.evaluate_output(output, test)

        return {
            "test": test,
            "output": output,
            "evaluation": evaluation,
        }

    def evaluate_output(self, output, test):
        """Score ``output`` on every rubric dimension and combine the scores.

        Returns:
            Dict with "overall_score", "dimension_scores" and "passed"
            (``overall_score >= pass_threshold``).
        """
        ground_truth = test.get("expected", {})

        dimension_scores = {}
        for dimension, config in self.rubric.items():
            dimension_scores[dimension] = self.evaluate_dimension(
                output, ground_truth, dimension, config
            )

        overall = calculate_overall_score(dimension_scores, self.rubric)

        return {
            "overall_score": overall,
            "dimension_scores": dimension_scores,
            "passed": overall >= self.pass_threshold,
        }

    def evaluate_dimension(self, output, ground_truth, dimension, config):
        """Score one rubric dimension of ``output``; override in a subclass.

        Args:
            output: The agent output being judged.
            ground_truth: The test's "expected" entry ({} when absent).
            dimension: Name of the rubric dimension being scored.
            config: The rubric entry for that dimension.

        Returns:
            A numeric score for the dimension.

        Raises:
            NotImplementedError: Always, in this base class. The scoring
                strategy is supplied by the subclass.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement evaluate_dimension() "
            f"to score dimension {dimension!r}"
        )

    def summarize(self):
        """Aggregate stored results into pass counts and per-dimension averages.

        Returns:
            ``{"error": "No results"}`` when nothing has been run; otherwise
            a dict with "total_tests", "passed", "failed", "pass_rate",
            "dimension_averages" (only rubric dimensions that received at
            least one score) and "failures" (the full failing result dicts).
        """
        if not self.results:
            return {"error": "No results"}

        total = len(self.results)
        failures = [r for r in self.results if not r["evaluation"]["passed"]]
        passed = total - len(failures)

        # Accumulate per-dimension totals only for dimensions in the rubric.
        dimension_totals = {
            dimension: {"total": 0, "count": 0} for dimension in self.rubric
        }
        for result in self.results:
            for dimension, score in result["evaluation"]["dimension_scores"].items():
                if dimension in dimension_totals:
                    dimension_totals[dimension]["total"] += score
                    dimension_totals[dimension]["count"] += 1

        dimension_averages = {
            dimension: data["total"] / data["count"]
            for dimension, data in dimension_totals.items()
            if data["count"] > 0
        }

        return {
            "total_tests": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total,
            "dimension_averages": dimension_averages,
            "failures": failures,
        }
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Production Monitoring
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
import random
from datetime import datetime, timezone


class ProductionMonitor:
    """Score a random sample of production interactions and report health.

    Args:
        sample_rate: Probability in [0, 1] that a given interaction is
            evaluated. 0 never samples; 1 always samples.
        evaluator: Callable ``evaluator(output, ground_truth, rubric)``
            returning a dict with "overall_score" and "passed". Production
            traffic has no ground truth, so ``{}`` is passed for it.
        rubric: Rubric object forwarded unchanged to ``evaluator``.
    """

    def __init__(self, sample_rate=0.01, evaluator=None, rubric=None):
        self.sample_rate = sample_rate
        self.evaluator = evaluator
        self.rubric = rubric
        self.samples = []
        self.alert_thresholds = {
            "pass_rate_warning": 0.85,
            "pass_rate_critical": 0.70,
        }

    def sample_and_evaluate(self, query, output):
        """Evaluate this interaction with probability ``sample_rate``.

        Returns:
            The stored sample dict (truncated query/output previews, score,
            pass flag, UTC ISO-8601 timestamp), or None when the
            interaction was not selected.

        Raises:
            RuntimeError: If no evaluator was configured. Raised before the
                sampling decision so the misconfiguration shows up on the
                first call rather than on a random fraction of calls.
        """
        if self.evaluator is None:
            raise RuntimeError(
                "ProductionMonitor needs an evaluator callable; pass "
                "evaluator=... when constructing it"
            )

        # random.random() is in [0.0, 1.0): ">=" makes a rate of 0 never
        # sample and a rate of 1 always sample.
        if random.random() >= self.sample_rate:
            return None

        evaluation = self.evaluator(output, {}, self.rubric)

        sample = {
            "query": query[:200],
            "output_preview": output[:200],
            "score": evaluation["overall_score"],
            "passed": evaluation["passed"],
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

        self.samples.append(sample)
        return sample

    def get_metrics(self):
        """Return sample count, pass rate, average score and status.

        Returns ``{"status": "insufficient_data"}`` when nothing has been
        sampled yet.
        """
        if not self.samples:
            return {"status": "insufficient_data"}

        count = len(self.samples)
        passed = sum(1 for s in self.samples if s["passed"])
        pass_rate = passed / count
        avg_score = sum(s["score"] for s in self.samples) / count

        return {
            "sample_count": count,
            "pass_rate": pass_rate,
            "average_score": avg_score,
            "status": self._get_status(pass_rate),
        }

    def _get_status(self, pass_rate):
        """Map a pass rate to "critical", "warning" or "healthy"."""
        if pass_rate < self.alert_thresholds["pass_rate_critical"]:
            return "critical"
        if pass_rate < self.alert_thresholds["pass_rate_warning"]:
            return "warning"
        return "healthy"
|
|
338
|
+
```
|
|
339
|
+
|