@shakudo/kaji-setup-external 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (411)
  1. package/README.md +155 -0
  2. package/assets/skills/ci-cd/.claude-plugin/plugin.json +8 -0
  3. package/assets/skills/ci-cd/SKILL.md +573 -0
  4. package/assets/skills/ci-cd/assets/templates/github-actions/docker-build.yml +164 -0
  5. package/assets/skills/ci-cd/assets/templates/github-actions/go-ci.yml +420 -0
  6. package/assets/skills/ci-cd/assets/templates/github-actions/node-ci.yml +313 -0
  7. package/assets/skills/ci-cd/assets/templates/github-actions/python-ci.yml +388 -0
  8. package/assets/skills/ci-cd/assets/templates/github-actions/security-scan.yml +416 -0
  9. package/assets/skills/ci-cd/assets/templates/gitlab-ci/docker-build.yml +298 -0
  10. package/assets/skills/ci-cd/assets/templates/gitlab-ci/go-ci.yml +548 -0
  11. package/assets/skills/ci-cd/assets/templates/gitlab-ci/node-ci.yml +334 -0
  12. package/assets/skills/ci-cd/assets/templates/gitlab-ci/python-ci.yml +472 -0
  13. package/assets/skills/ci-cd/assets/templates/gitlab-ci/security-scan.yml +479 -0
  14. package/assets/skills/ci-cd/references/best_practices.md +675 -0
  15. package/assets/skills/ci-cd/references/devsecops.md +862 -0
  16. package/assets/skills/ci-cd/references/optimization.md +651 -0
  17. package/assets/skills/ci-cd/references/security.md +611 -0
  18. package/assets/skills/ci-cd/references/troubleshooting.md +656 -0
  19. package/assets/skills/ci-cd/scripts/ci_health.py +301 -0
  20. package/assets/skills/ci-cd/scripts/pipeline_analyzer.py +440 -0
  21. package/assets/skills/context-optimization/CONTRIBUTING.md +78 -0
  22. package/assets/skills/context-optimization/LICENSE +22 -0
  23. package/assets/skills/context-optimization/README.md +228 -0
  24. package/assets/skills/context-optimization/SKILL.md +104 -0
  25. package/assets/skills/context-optimization/docs/agentskills.md +1264 -0
  26. package/assets/skills/context-optimization/docs/blogs.md +1230 -0
  27. package/assets/skills/context-optimization/docs/claude_research.md +85 -0
  28. package/assets/skills/context-optimization/docs/compression.md +298 -0
  29. package/assets/skills/context-optimization/docs/gemini_research.md +22 -0
  30. package/assets/skills/context-optimization/docs/hncapsule.md +92 -0
  31. package/assets/skills/context-optimization/docs/netflix_context.md +10 -0
  32. package/assets/skills/context-optimization/docs/vercel_tool.md +140 -0
  33. package/assets/skills/context-optimization/examples/book-sft-pipeline/README.md +78 -0
  34. package/assets/skills/context-optimization/examples/book-sft-pipeline/SKILL.md +380 -0
  35. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/README.md +168 -0
  36. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/dataset_sample.jsonl +5 -0
  37. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.04 AM.png +0 -0
  38. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.36 AM.png +0 -0
  39. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.07.18 AM.png +0 -0
  40. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/sample_outputs.md +63 -0
  41. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/training_config.json +80 -0
  42. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/segmentation-strategies.md +324 -0
  43. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker-format.md +211 -0
  44. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker.txt +3176 -0
  45. package/assets/skills/context-optimization/examples/book-sft-pipeline/scripts/pipeline_example.py +187 -0
  46. package/assets/skills/context-optimization/examples/digital-brain-skill/AGENT.md +35 -0
  47. package/assets/skills/context-optimization/examples/digital-brain-skill/HOW-SKILLS-BUILT-THIS.md +407 -0
  48. package/assets/skills/context-optimization/examples/digital-brain-skill/README.md +209 -0
  49. package/assets/skills/context-optimization/examples/digital-brain-skill/SKILL.md +203 -0
  50. package/assets/skills/context-optimization/examples/digital-brain-skill/SKILLS-MAPPING.md +219 -0
  51. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/AGENTS.md +82 -0
  52. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/content_ideas.py +132 -0
  53. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/idea_to_draft.py +181 -0
  54. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/stale_contacts.py +139 -0
  55. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/weekly_review.py +121 -0
  56. package/assets/skills/context-optimization/examples/digital-brain-skill/content/CONTENT.md +88 -0
  57. package/assets/skills/context-optimization/examples/digital-brain-skill/content/calendar.md +108 -0
  58. package/assets/skills/context-optimization/examples/digital-brain-skill/content/engagement.jsonl +2 -0
  59. package/assets/skills/context-optimization/examples/digital-brain-skill/content/ideas.jsonl +2 -0
  60. package/assets/skills/context-optimization/examples/digital-brain-skill/content/posts.jsonl +2 -0
  61. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/linkedin-post.md +102 -0
  62. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/newsletter.md +92 -0
  63. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/thread.md +73 -0
  64. package/assets/skills/context-optimization/examples/digital-brain-skill/examples/content-workflow.md +204 -0
  65. package/assets/skills/context-optimization/examples/digital-brain-skill/examples/meeting-prep.md +243 -0
  66. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/IDENTITY.md +46 -0
  67. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/bio-variants.md +101 -0
  68. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/brand.md +165 -0
  69. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/content-generation.xml +46 -0
  70. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/reply-generator.xml +40 -0
  71. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/values.yaml +60 -0
  72. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/voice.md +165 -0
  73. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/KNOWLEDGE.md +85 -0
  74. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/bookmarks.jsonl +2 -0
  75. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/competitors.md +117 -0
  76. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/learning.yaml +74 -0
  77. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/research/_template.md +79 -0
  78. package/assets/skills/context-optimization/examples/digital-brain-skill/network/NETWORK.md +110 -0
  79. package/assets/skills/context-optimization/examples/digital-brain-skill/network/circles.yaml +80 -0
  80. package/assets/skills/context-optimization/examples/digital-brain-skill/network/contacts.jsonl +2 -0
  81. package/assets/skills/context-optimization/examples/digital-brain-skill/network/interactions.jsonl +2 -0
  82. package/assets/skills/context-optimization/examples/digital-brain-skill/network/intros.md +92 -0
  83. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/OPERATIONS.md +75 -0
  84. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/goals.yaml +83 -0
  85. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/meetings.jsonl +2 -0
  86. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/metrics.jsonl +2 -0
  87. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/reviews/_weekly_template.md +114 -0
  88. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/todos.md +76 -0
  89. package/assets/skills/context-optimization/examples/digital-brain-skill/package.json +41 -0
  90. package/assets/skills/context-optimization/examples/digital-brain-skill/references/file-formats.md +386 -0
  91. package/assets/skills/context-optimization/examples/digital-brain-skill/scripts/install.sh +79 -0
  92. package/assets/skills/context-optimization/examples/interleaved_thinking/README.md +620 -0
  93. package/assets/skills/context-optimization/examples/interleaved_thinking/SKILL.md +221 -0
  94. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/agentthinking.md +63 -0
  95. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/interleavedthinking.md +610 -0
  96. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/m2-1.md +224 -0
  97. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/01_basic_capture.py +76 -0
  98. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/02_tool_usage.py +187 -0
  99. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/03_full_optimization.py +1222 -0
  100. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/SKILL.md +90 -0
  101. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimization_summary.json +9 -0
  102. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimized_prompt.txt +1 -0
  103. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/patterns_found.json +205 -0
  104. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/final_prompt.txt +67 -0
  105. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/analysis.txt +48 -0
  106. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimization.txt +15 -0
  107. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimized_prompt.txt +1 -0
  108. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/trace.txt +178 -0
  109. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/analysis.txt +47 -0
  110. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/trace.txt +162 -0
  111. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/analysis.txt +48 -0
  112. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimization.txt +130 -0
  113. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimized_prompt.txt +72 -0
  114. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/trace.txt +156 -0
  115. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/analysis.txt +46 -0
  116. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimization.txt +147 -0
  117. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimized_prompt.txt +84 -0
  118. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/trace.txt +159 -0
  119. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/analysis.txt +46 -0
  120. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimization.txt +134 -0
  121. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimized_prompt.txt +67 -0
  122. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/trace.txt +165 -0
  123. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/analysis.txt +50 -0
  124. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimization.txt +135 -0
  125. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimized_prompt.txt +71 -0
  126. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/trace.txt +146 -0
  127. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/analysis.txt +15 -0
  128. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimization.txt +15 -0
  129. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimized_prompt.txt +1 -0
  130. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/trace.txt +147 -0
  131. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/analysis.txt +46 -0
  132. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimization.txt +103 -0
  133. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimized_prompt.txt +45 -0
  134. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/trace.txt +134 -0
  135. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/analysis.txt +47 -0
  136. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimization.txt +114 -0
  137. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimized_prompt.txt +60 -0
  138. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/trace.txt +135 -0
  139. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/analysis.txt +44 -0
  140. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimization.txt +106 -0
  141. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimized_prompt.txt +51 -0
  142. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/trace.txt +170 -0
  143. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/summary.json +11 -0
  144. package/assets/skills/context-optimization/examples/interleaved_thinking/pyproject.toml +70 -0
  145. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/__init__.py +53 -0
  146. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/analyzer.py +465 -0
  147. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/capture.py +417 -0
  148. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/cli.py +271 -0
  149. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/loop.py +468 -0
  150. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/models.py +193 -0
  151. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/optimizer.py +449 -0
  152. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/skill_generator.py +502 -0
  153. package/assets/skills/context-optimization/examples/interleaved_thinking/tests/__init__.py +1 -0
  154. package/assets/skills/context-optimization/examples/interleaved_thinking/tests/test_models.py +144 -0
  155. package/assets/skills/context-optimization/examples/llm-as-judge-skills/.prettierrc +8 -0
  156. package/assets/skills/context-optimization/examples/llm-as-judge-skills/CONTRIBUTING.md +78 -0
  157. package/assets/skills/context-optimization/examples/llm-as-judge-skills/LICENSE +21 -0
  158. package/assets/skills/context-optimization/examples/llm-as-judge-skills/README.md +659 -0
  159. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/evaluator-agent/evaluator-agent.md +177 -0
  160. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/index.md +114 -0
  161. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/orchestrator-agent/orchestrator-agent.md +205 -0
  162. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/research-agent/research-agent.md +183 -0
  163. package/assets/skills/context-optimization/examples/llm-as-judge-skills/env.example +6 -0
  164. package/assets/skills/context-optimization/examples/llm-as-judge-skills/eslint.config.js +18 -0
  165. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/basic-evaluation.ts +89 -0
  166. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/full-evaluation-workflow.ts +136 -0
  167. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/generate-rubric.ts +67 -0
  168. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/pairwise-comparison.ts +97 -0
  169. package/assets/skills/context-optimization/examples/llm-as-judge-skills/package.json +79 -0
  170. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/agent-system/orchestrator-prompt.md +197 -0
  171. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/direct-scoring-prompt.md +153 -0
  172. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/pairwise-comparison-prompt.md +200 -0
  173. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/index.md +138 -0
  174. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/research/research-synthesis-prompt.md +171 -0
  175. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/context-fundamentals/context-fundamentals.md +114 -0
  176. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/index.md +79 -0
  177. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/llm-evaluator/llm-evaluator.md +77 -0
  178. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/tool-design/tool-design.md +198 -0
  179. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/evaluator.ts +112 -0
  180. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/index.ts +3 -0
  181. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/config/index.ts +18 -0
  182. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/index.ts +19 -0
  183. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts +164 -0
  184. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/generate-rubric.ts +161 -0
  185. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/index.ts +9 -0
  186. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts +255 -0
  187. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/evaluation.test.ts +233 -0
  188. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/setup.ts +27 -0
  189. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/skills.test.ts +213 -0
  190. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/direct-score.md +159 -0
  191. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/generate-rubric.md +189 -0
  192. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/pairwise-compare.md +182 -0
  193. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/index.md +141 -0
  194. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/orchestration/delegate-to-agent.md +171 -0
  195. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/read-url.md +162 -0
  196. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/web-search.md +128 -0
  197. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tsconfig.json +26 -0
  198. package/assets/skills/context-optimization/examples/llm-as-judge-skills/vitest.config.ts +20 -0
  199. package/assets/skills/context-optimization/examples/x-to-book-system/PRD.md +644 -0
  200. package/assets/skills/context-optimization/examples/x-to-book-system/README.md +181 -0
  201. package/assets/skills/context-optimization/examples/x-to-book-system/SKILLS-MAPPING.md +187 -0
  202. package/assets/skills/context-optimization/researcher/example_output.md +75 -0
  203. package/assets/skills/context-optimization/researcher/llm-as-a-judge.md +362 -0
  204. package/assets/skills/context-optimization/skills/advanced-evaluation/SKILL.md +454 -0
  205. package/assets/skills/context-optimization/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
  206. package/assets/skills/context-optimization/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
  207. package/assets/skills/context-optimization/skills/advanced-evaluation/references/metrics-guide.md +331 -0
  208. package/assets/skills/context-optimization/skills/advanced-evaluation/scripts/evaluation_example.py +337 -0
  209. package/assets/skills/context-optimization/skills/bdi-mental-states/SKILL.md +295 -0
  210. package/assets/skills/context-optimization/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
  211. package/assets/skills/context-optimization/skills/bdi-mental-states/references/framework-integration.md +582 -0
  212. package/assets/skills/context-optimization/skills/bdi-mental-states/references/rdf-examples.md +315 -0
  213. package/assets/skills/context-optimization/skills/bdi-mental-states/references/sparql-competency.md +420 -0
  214. package/assets/skills/context-optimization/skills/context-compression/SKILL.md +265 -0
  215. package/assets/skills/context-optimization/skills/context-compression/references/evaluation-framework.md +213 -0
  216. package/assets/skills/context-optimization/skills/context-compression/scripts/compression_evaluator.py +658 -0
  217. package/assets/skills/context-optimization/skills/context-degradation/SKILL.md +231 -0
  218. package/assets/skills/context-optimization/skills/context-degradation/references/patterns.md +314 -0
  219. package/assets/skills/context-optimization/skills/context-degradation/scripts/degradation_detector.py +419 -0
  220. package/assets/skills/context-optimization/skills/context-fundamentals/SKILL.md +185 -0
  221. package/assets/skills/context-optimization/skills/context-fundamentals/references/context-components.md +283 -0
  222. package/assets/skills/context-optimization/skills/context-fundamentals/scripts/context_manager.py +370 -0
  223. package/assets/skills/context-optimization/skills/context-optimization/SKILL.md +179 -0
  224. package/assets/skills/context-optimization/skills/context-optimization/references/optimization_techniques.md +272 -0
  225. package/assets/skills/context-optimization/skills/context-optimization/scripts/compaction.py +379 -0
  226. package/assets/skills/context-optimization/skills/evaluation/SKILL.md +231 -0
  227. package/assets/skills/context-optimization/skills/evaluation/references/metrics.md +339 -0
  228. package/assets/skills/context-optimization/skills/evaluation/scripts/evaluator.py +474 -0
  229. package/assets/skills/context-optimization/skills/filesystem-context/SKILL.md +321 -0
  230. package/assets/skills/context-optimization/skills/filesystem-context/references/implementation-patterns.md +549 -0
  231. package/assets/skills/context-optimization/skills/filesystem-context/scripts/filesystem_context.py +353 -0
  232. package/assets/skills/context-optimization/skills/hosted-agents/SKILL.md +279 -0
  233. package/assets/skills/context-optimization/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
  234. package/assets/skills/context-optimization/skills/hosted-agents/scripts/sandbox_manager.py +495 -0
  235. package/assets/skills/context-optimization/skills/memory-systems/SKILL.md +221 -0
  236. package/assets/skills/context-optimization/skills/memory-systems/references/implementation.md +458 -0
  237. package/assets/skills/context-optimization/skills/memory-systems/scripts/memory_store.py +396 -0
  238. package/assets/skills/context-optimization/skills/multi-agent-patterns/SKILL.md +255 -0
  239. package/assets/skills/context-optimization/skills/multi-agent-patterns/references/frameworks.md +433 -0
  240. package/assets/skills/context-optimization/skills/multi-agent-patterns/scripts/coordination.py +439 -0
  241. package/assets/skills/context-optimization/skills/project-development/SKILL.md +342 -0
  242. package/assets/skills/context-optimization/skills/project-development/references/case-studies.md +388 -0
  243. package/assets/skills/context-optimization/skills/project-development/references/pipeline-patterns.md +610 -0
  244. package/assets/skills/context-optimization/skills/project-development/scripts/pipeline_template.py +677 -0
  245. package/assets/skills/context-optimization/skills/tool-design/SKILL.md +311 -0
  246. package/assets/skills/context-optimization/skills/tool-design/references/architectural_reduction.md +210 -0
  247. package/assets/skills/context-optimization/skills/tool-design/references/best_practices.md +176 -0
  248. package/assets/skills/context-optimization/skills/tool-design/scripts/description_generator.py +237 -0
  249. package/assets/skills/context-optimization/template/SKILL.md +98 -0
  250. package/assets/skills/dremio-analytics/SKILL.md +287 -0
  251. package/assets/skills/elevenlabs-voice/SKILL.md +269 -0
  252. package/assets/skills/git-workflow/SKILL.md +266 -0
  253. package/assets/skills/gitops-workflows/.claude-plugin/plugin.json +8 -0
  254. package/assets/skills/gitops-workflows/SKILL.md +568 -0
  255. package/assets/skills/gitops-workflows/assets/applicationsets/cluster-generator.yaml +32 -0
  256. package/assets/skills/gitops-workflows/assets/argocd/install-argocd-3.x.yaml +92 -0
  257. package/assets/skills/gitops-workflows/assets/flux/flux-bootstrap-github.sh +49 -0
  258. package/assets/skills/gitops-workflows/assets/flux/oci-helmrelease.yaml +38 -0
  259. package/assets/skills/gitops-workflows/assets/progressive-delivery/argo-rollouts-canary.yaml +62 -0
  260. package/assets/skills/gitops-workflows/assets/secrets/sops-age-config.yaml +33 -0
  261. package/assets/skills/gitops-workflows/references/argocd_vs_flux.md +243 -0
  262. package/assets/skills/gitops-workflows/references/best_practices.md +160 -0
  263. package/assets/skills/gitops-workflows/references/multi_cluster.md +80 -0
  264. package/assets/skills/gitops-workflows/references/oci_artifacts.md +290 -0
  265. package/assets/skills/gitops-workflows/references/progressive_delivery.md +94 -0
  266. package/assets/skills/gitops-workflows/references/repo_patterns.md +184 -0
  267. package/assets/skills/gitops-workflows/references/secret_management.md +213 -0
  268. package/assets/skills/gitops-workflows/references/troubleshooting.md +134 -0
  269. package/assets/skills/gitops-workflows/scripts/applicationset_generator.py +156 -0
  270. package/assets/skills/gitops-workflows/scripts/check_argocd_health.py +275 -0
  271. package/assets/skills/gitops-workflows/scripts/check_flux_health.py +418 -0
  272. package/assets/skills/gitops-workflows/scripts/oci_artifact_checker.py +150 -0
  273. package/assets/skills/gitops-workflows/scripts/promotion_validator.py +88 -0
  274. package/assets/skills/gitops-workflows/scripts/secret_audit.py +178 -0
  275. package/assets/skills/gitops-workflows/scripts/sync_drift_detector.py +144 -0
  276. package/assets/skills/gitops-workflows/scripts/validate_gitops_repo.py +299 -0
  277. package/assets/skills/iac-terraform/.claude-plugin/plugin.json +8 -0
  278. package/assets/skills/iac-terraform/SKILL.md +653 -0
  279. package/assets/skills/iac-terraform/assets/templates/MODULE_TEMPLATE.md +386 -0
  280. package/assets/skills/iac-terraform/assets/workflows/github-actions-terraform.yml +224 -0
  281. package/assets/skills/iac-terraform/assets/workflows/github-actions-terragrunt.yml +236 -0
  282. package/assets/skills/iac-terraform/assets/workflows/gitlab-ci-terraform.yml +184 -0
  283. package/assets/skills/iac-terraform/references/best_practices.md +709 -0
  284. package/assets/skills/iac-terraform/references/cost_optimization.md +665 -0
  285. package/assets/skills/iac-terraform/references/troubleshooting.md +635 -0
  286. package/assets/skills/iac-terraform/scripts/init_module.py +319 -0
  287. package/assets/skills/iac-terraform/scripts/inspect_state.py +232 -0
  288. package/assets/skills/iac-terraform/scripts/validate_module.py +227 -0
  289. package/assets/skills/k8s-troubleshooter/.claude-plugin/plugin.json +8 -0
  290. package/assets/skills/k8s-troubleshooter/SKILL.md +336 -0
  291. package/assets/skills/k8s-troubleshooter/references/common_issues.md +582 -0
  292. package/assets/skills/k8s-troubleshooter/references/helm_troubleshooting.md +708 -0
  293. package/assets/skills/k8s-troubleshooter/references/incident_response.md +466 -0
  294. package/assets/skills/k8s-troubleshooter/references/performance_troubleshooting.md +687 -0
  295. package/assets/skills/k8s-troubleshooter/scripts/check_namespace.py +500 -0
  296. package/assets/skills/k8s-troubleshooter/scripts/cluster_health.py +223 -0
  297. package/assets/skills/k8s-troubleshooter/scripts/diagnose_pod.py +157 -0
  298. package/assets/skills/mattermost-notify/SKILL.md +248 -0
  299. package/assets/skills/monitoring-observability/SKILL.md +869 -0
  300. package/assets/skills/monitoring-observability/assets/templates/otel-config/collector-config.yaml +227 -0
  301. package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/kubernetes-alerts.yml +293 -0
  302. package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/webapp-alerts.yml +243 -0
  303. package/assets/skills/monitoring-observability/assets/templates/runbooks/incident-runbook-template.md +409 -0
  304. package/assets/skills/monitoring-observability/monitoring-observability.skill +0 -0
  305. package/assets/skills/monitoring-observability/references/alerting_best_practices.md +609 -0
  306. package/assets/skills/monitoring-observability/references/datadog_migration.md +649 -0
  307. package/assets/skills/monitoring-observability/references/dql_promql_translation.md +756 -0
  308. package/assets/skills/monitoring-observability/references/logging_guide.md +775 -0
  309. package/assets/skills/monitoring-observability/references/metrics_design.md +406 -0
  310. package/assets/skills/monitoring-observability/references/slo_sla_guide.md +652 -0
  311. package/assets/skills/monitoring-observability/references/tool_comparison.md +697 -0
  312. package/assets/skills/monitoring-observability/references/tracing_guide.md +663 -0
  313. package/assets/skills/monitoring-observability/scripts/alert_quality_checker.py +315 -0
  314. package/assets/skills/monitoring-observability/scripts/analyze_metrics.py +279 -0
  315. package/assets/skills/monitoring-observability/scripts/dashboard_generator.py +395 -0
  316. package/assets/skills/monitoring-observability/scripts/datadog_cost_analyzer.py +477 -0
  317. package/assets/skills/monitoring-observability/scripts/health_check_validator.py +297 -0
  318. package/assets/skills/monitoring-observability/scripts/log_analyzer.py +321 -0
  319. package/assets/skills/monitoring-observability/scripts/slo_calculator.py +365 -0
  320. package/assets/skills/neo4j-graph-rag/SKILL.md +258 -0
  321. package/assets/skills/pagerduty-ops/SKILL.md +380 -0
  322. package/assets/skills/playwright/API_REFERENCE.md +653 -0
  323. package/assets/skills/playwright/SKILL.md +453 -0
  324. package/assets/skills/playwright/lib/helpers.js +441 -0
  325. package/assets/skills/playwright/package.json +26 -0
  326. package/assets/skills/playwright/run.js +228 -0
  327. package/assets/skills/project-memory/README.md +687 -0
  328. package/assets/skills/project-memory/SKILL.md +298 -0
  329. package/assets/skills/project-memory/references/bugs_template.md +41 -0
  330. package/assets/skills/project-memory/references/decisions_template.md +92 -0
  331. package/assets/skills/project-memory/references/issues_template.md +76 -0
  332. package/assets/skills/project-memory/references/key_facts_template.md +158 -0
  333. package/assets/skills/recruit-workflow/SKILL.md +276 -0
  334. package/assets/skills/recruit-workflow/references/email-templates.md +347 -0
  335. package/assets/skills/recruit-workflow/references/workflow-stages.md +395 -0
  336. package/assets/skills/recruit-workflow/scripts/clay_client.py +188 -0
  337. package/assets/skills/recruit-workflow/scripts/lever_client.py +197 -0
  338. package/assets/skills/recruit-workflow/scripts/mailgun_client.py +245 -0
  339. package/assets/skills/recruit-workflow/scripts/minio_client.py +426 -0
  340. package/assets/skills/shakudo-microservice/SKILL.md +215 -0
  341. package/assets/skills/tmux/SKILL.md +631 -0
  342. package/assets/skills/tmux/references/direct-socket-control.md +108 -0
  343. package/assets/skills/tmux/references/session-lifecycle.md +503 -0
  344. package/assets/skills/tmux/references/session-registry.md +1484 -0
  345. package/assets/skills/tmux/tools/cleanup-sessions.sh +263 -0
  346. package/assets/skills/tmux/tools/create-session.sh +224 -0
  347. package/assets/skills/tmux/tools/find-sessions.sh +262 -0
  348. package/assets/skills/tmux/tools/kill-session.sh +308 -0
  349. package/assets/skills/tmux/tools/lib/registry.sh +437 -0
  350. package/assets/skills/tmux/tools/lib/time_utils.sh +54 -0
  351. package/assets/skills/tmux/tools/list-sessions.sh +255 -0
  352. package/assets/skills/tmux/tools/pane-health.sh +424 -0
  353. package/assets/skills/tmux/tools/safe-send.sh +503 -0
  354. package/assets/skills/tmux/tools/wait-for-text.sh +260 -0
  355. package/assets/skills/twilio-sms/SKILL.md +508 -0
  356. package/assets/skills/zellij/SKILL.md +274 -0
  357. package/assets/skills/zellij/references/actions.md +558 -0
  358. package/assets/skills/zellij/references/layouts.md +424 -0
  359. package/bin/cli.ts +46 -0
  360. package/package.json +43 -0
  361. package/src/alias.ts +108 -0
  362. package/src/backup.ts +51 -0
  363. package/src/config.ts +115 -0
  364. package/src/dependencies.ts +163 -0
  365. package/src/errors.ts +77 -0
  366. package/src/index.ts +207 -0
  367. package/src/prompts.ts +142 -0
  368. package/src/schemas.ts +21 -0
  369. package/src/skills.ts +45 -0
  370. package/src/speckit.ts +116 -0
  371. package/src/types.ts +106 -0
  372. package/src/utils.ts +110 -0
  373. package/src/vibe-git.ts +50 -0
  374. package/templates/.specify/memory/constitution.md +109 -0
  375. package/templates/.specify/scripts/bash/check-prerequisites.sh +262 -0
  376. package/templates/.specify/scripts/bash/common.sh +670 -0
  377. package/templates/.specify/scripts/bash/create-new-feature.sh +594 -0
  378. package/templates/.specify/scripts/bash/create-worktree-feature.sh +401 -0
  379. package/templates/.specify/scripts/bash/init-workspace.sh +433 -0
  380. package/templates/.specify/scripts/bash/list-spec-worktrees.sh +198 -0
  381. package/templates/.specify/scripts/bash/setup-plan.sh +105 -0
  382. package/templates/.specify/scripts/bash/test-workspace-rollup.sh +175 -0
  383. package/templates/.specify/scripts/bash/update-agent-context.sh +799 -0
  384. package/templates/.specify/templates/agent-file-template.md +28 -0
  385. package/templates/.specify/templates/checklist-template.md +40 -0
  386. package/templates/.specify/templates/commands/analyze.md +197 -0
  387. package/templates/.specify/templates/commands/checklist.md +306 -0
  388. package/templates/.specify/templates/commands/clarify.md +194 -0
  389. package/templates/.specify/templates/commands/constitution.md +97 -0
  390. package/templates/.specify/templates/commands/implement.md +149 -0
  391. package/templates/.specify/templates/commands/plan.md +123 -0
  392. package/templates/.specify/templates/commands/projects.md +48 -0
  393. package/templates/.specify/templates/commands/rollup.md +66 -0
  394. package/templates/.specify/templates/commands/specify.md +275 -0
  395. package/templates/.specify/templates/commands/specs.md +71 -0
  396. package/templates/.specify/templates/commands/tasks.md +151 -0
  397. package/templates/.specify/templates/commands/taskstoissues.md +35 -0
  398. package/templates/.specify/templates/commands/workspace.md +128 -0
  399. package/templates/.specify/templates/plan-template.md +104 -0
  400. package/templates/.specify/templates/spec-template.md +115 -0
  401. package/templates/.specify/templates/tasks-template.md +251 -0
  402. package/templates/.specify/templates/workspace.yaml +110 -0
  403. package/templates/.specify/workspace.yaml +95 -0
  404. package/templates/AGENTS.md +460 -0
  405. package/templates/oh-my-opencode.json +27 -0
  406. package/templates/opencode.json +383 -0
  407. package/templates/package.json +10 -0
  408. package/templates/project-memory/bugs.md +16 -0
  409. package/templates/project-memory/decisions.md +22 -0
  410. package/templates/project-memory/issues.md +15 -0
  411. package/templates/project-memory/key_facts.md +26 -0
@@ -0,0 +1,331 @@
1
+ # Metric Selection Guide for LLM Evaluation
2
+
3
+ This reference provides guidance on selecting appropriate metrics for different evaluation scenarios.
4
+
5
+ ## Metric Categories
6
+
7
+ ### Classification Metrics
8
+
9
+ Use for binary or multi-class evaluation tasks (pass/fail, correct/incorrect).
10
+
11
+ #### Precision
12
+
13
+ ```
14
+ Precision = True Positives / (True Positives + False Positives)
15
+ ```
16
+
17
+ **Interpretation**: Of all responses the judge said were good, what fraction were actually good?
18
+
19
+ **Use when**: False positives are costly (e.g., approving unsafe content)
20
+
21
+ ```python
22
def precision(predictions, ground_truth):
    """Share of positive predictions (label 1) that the ground truth also marks 1.

    Items are paired with ``zip``, so anything beyond the shorter sequence is
    ignored when counting true positives. Returns 0 when the predictions sum
    to nothing positive.
    """
    hits = 0
    for predicted, actual in zip(predictions, ground_truth):
        if predicted == 1 and actual == 1:
            hits += 1

    flagged = sum(predictions)
    if flagged > 0:
        return hits / flagged
    return 0
26
+ ```
27
+
28
+ #### Recall
29
+
30
+ ```
31
+ Recall = True Positives / (True Positives + False Negatives)
32
+ ```
33
+
34
+ **Interpretation**: Of all actually good responses, what fraction did the judge identify?
35
+
36
+ **Use when**: False negatives are costly (e.g., missing good content in filtering)
37
+
38
+ ```python
39
def recall(predictions, ground_truth):
    """Share of ground-truth positives (label 1) that were also predicted as 1.

    Items are paired with ``zip``, so anything beyond the shorter sequence is
    ignored when counting true positives. Returns 0 when the ground truth
    sums to nothing positive.
    """
    hits = 0
    for predicted, actual in zip(predictions, ground_truth):
        if predicted == 1 and actual == 1:
            hits += 1

    relevant = sum(ground_truth)
    if relevant > 0:
        return hits / relevant
    return 0
43
+ ```
44
+
45
+ #### F1 Score
46
+
47
+ ```
48
+ F1 = 2 * (Precision * Recall) / (Precision + Recall)
49
+ ```
50
+
51
+ **Interpretation**: Harmonic mean of precision and recall
52
+
53
+ **Use when**: You need a single number balancing both concerns
54
+
55
+ ```python
56
def f1_score(predictions, ground_truth):
    """Harmonic mean of ``precision`` and ``recall`` for the same inputs.

    Returns 0 when precision and recall sum to zero, avoiding a division by zero.
    """
    prec = precision(predictions, ground_truth)
    rec = recall(predictions, ground_truth)
    combined = prec + rec
    if combined > 0:
        return 2 * prec * rec / combined
    return 0
60
+ ```
61
+
62
+ ### Agreement Metrics
63
+
64
+ Use for comparing automated evaluation with human judgment.
65
+
66
+ #### Cohen's Kappa (κ)
67
+
68
+ ```
69
+ κ = (Observed Agreement - Expected Agreement) / (1 - Expected Agreement)
70
+ ```
71
+
72
+ **Interpretation**: Agreement adjusted for chance
73
+ - κ > 0.8: Almost perfect agreement
74
+ - κ 0.6-0.8: Substantial agreement
75
+ - κ 0.4-0.6: Moderate agreement
76
+ - κ < 0.4: Fair to poor agreement
77
+
78
+ **Use for**: Binary or categorical judgments
79
+
80
+ ```python
81
def cohens_kappa(judge1, judge2):
    """Return Cohen's kappa between two sets of labels.

    Thin wrapper around scikit-learn's ``cohen_kappa_score``; the import is
    local so scikit-learn is only needed when this function is called.
    """
    from sklearn.metrics import cohen_kappa_score as _kappa

    agreement = _kappa(judge1, judge2)
    return agreement
84
+ ```
85
+
86
+ #### Weighted Kappa
87
+
88
+ For ordinal scales where disagreement severity matters:
89
+
90
+ ```python
91
def weighted_kappa(judge1, judge2, weights='quadratic'):
    """Return weighted Cohen's kappa between two sets of ordinal labels.

    Args:
        judge1: Labels from the first rater.
        judge2: Labels from the second rater.
        weights: Weighting scheme forwarded to scikit-learn's
            ``cohen_kappa_score``. Defaults to ``'quadratic'``, the scheme
            this helper has always used; ``'linear'`` is the usual alternative.

    Returns:
        The kappa value computed by ``cohen_kappa_score``.
    """
    # Local import: scikit-learn is only required when this helper is called.
    from sklearn.metrics import cohen_kappa_score
    return cohen_kappa_score(judge1, judge2, weights=weights)
94
+ ```
95
+
96
+ **Interpretation**: Penalizes large disagreements more than small ones
97
+
98
+ ### Correlation Metrics
99
+
100
+ Use for ordinal/continuous scores.
101
+
102
+ #### Spearman's Rank Correlation (ρ)
103
+
104
+ **Interpretation**: Correlation between rankings, not absolute values
105
+ - ρ > 0.9: Very strong correlation
106
+ - ρ 0.7-0.9: Strong correlation
107
+ - ρ 0.5-0.7: Moderate correlation
108
+ - ρ < 0.5: Weak correlation
109
+
110
+ **Use when**: Order matters more than exact values
111
+
112
+ ```python
113
def spearmans_rho(scores1, scores2):
    """Spearman rank correlation between two score sequences.

    Returns a dict with the coefficient under ``'rho'`` and its p-value under
    ``'p_value'``, both taken from ``scipy.stats.spearmanr``.
    """
    from scipy import stats

    coefficient, significance = stats.spearmanr(scores1, scores2)
    summary = {'rho': coefficient}
    summary['p_value'] = significance
    return summary
117
+ ```
118
+
119
+ #### Kendall's Tau (τ)
120
+
121
+ **Interpretation**: Similar to Spearman but based on pairwise concordance
122
+
123
+ **Use when**: You have many tied values
124
+
125
+ ```python
126
def kendalls_tau(scores1, scores2):
    """Kendall's tau between two score sequences.

    Returns a dict with the coefficient under ``'tau'`` and its p-value under
    ``'p_value'``, both taken from ``scipy.stats.kendalltau``.
    """
    from scipy import stats

    coefficient, significance = stats.kendalltau(scores1, scores2)
    summary = {'tau': coefficient}
    summary['p_value'] = significance
    return summary
130
+ ```
131
+
132
+ #### Pearson Correlation (r)
133
+
134
+ **Interpretation**: Linear correlation between scores
135
+
136
+ **Use when**: Exact score values matter, not just order
137
+
138
+ ```python
139
def pearsons_r(scores1, scores2):
    """Pearson linear correlation between two score sequences.

    Returns a dict with the coefficient under ``'r'`` and its p-value under
    ``'p_value'``, both taken from ``scipy.stats.pearsonr``.
    """
    from scipy import stats

    coefficient, significance = stats.pearsonr(scores1, scores2)
    summary = {'r': coefficient}
    summary['p_value'] = significance
    return summary
143
+ ```
144
+
145
+ ### Pairwise Comparison Metrics
146
+
147
+ #### Agreement Rate
148
+
149
+ ```
150
+ Agreement = (Matching Decisions) / (Total Comparisons)
151
+ ```
152
+
153
+ **Interpretation**: Simple percentage of agreement
154
+
155
+ ```python
156
def pairwise_agreement(decisions1, decisions2):
    """Fraction of comparisons on which two judges made the same decision.

    Args:
        decisions1: Sequence of decisions from the first judge; its length is
            the denominator.
        decisions2: Decisions from the second judge, paired positionally with
            ``decisions1`` via ``zip``.

    Returns:
        ``matches / len(decisions1)``, or ``0.0`` when ``decisions1`` is empty
        (mirroring the zero-denominator convention of ``precision``/``recall``)
        instead of raising ``ZeroDivisionError``.
    """
    total = len(decisions1)
    if total == 0:
        # No comparisons were made, so there is no agreement to report.
        return 0.0
    matches = sum(1 for d1, d2 in zip(decisions1, decisions2) if d1 == d2)
    return matches / total
159
+ ```
160
+
161
+ #### Position Consistency
162
+
163
+ ```
164
+ Consistency = (Consistent across position swaps) / (Total comparisons)
165
+ ```
166
+
167
+ **Interpretation**: Fraction of comparisons whose decision stays the same when the two responses swap positions; higher values mean less position bias.
168
+
169
+ ```python
170
def position_consistency(results):
    """Fraction of comparison results flagged as position-consistent.

    Args:
        results: Sequence of dicts, each with a truthy or falsy
            ``'position_consistent'`` entry.

    Returns:
        The share of entries whose ``'position_consistent'`` value is truthy,
        or ``0.0`` for an empty sequence instead of raising
        ``ZeroDivisionError``.
    """
    total = len(results)
    if total == 0:
        # Nothing was compared, so no consistency can be claimed.
        return 0.0
    consistent = sum(1 for r in results if r['position_consistent'])
    return consistent / total
173
+ ```
174
+
175
+ ## Selection Decision Tree
176
+
177
+ ```
178
+ What type of evaluation task?
179
+
180
+ ├── Binary classification (pass/fail)
181
+ │ └── Use: Precision, Recall, F1, Cohen's κ
182
+
183
+ ├── Ordinal scale (1-5 rating)
184
+ │ ├── Comparing to human judgments?
185
+ │ │ └── Use: Spearman's ρ, Weighted κ
186
+ │ └── Comparing two automated judges?
187
+ │ └── Use: Kendall's τ, Spearman's ρ
188
+
189
+ ├── Pairwise preference
190
+ │ └── Use: Agreement rate, Position consistency
191
+
192
+ └── Multi-label classification
193
+ └── Use: Macro-F1, Micro-F1, Per-label metrics
194
+ ```
195
+
196
+ ## Metric Selection by Use Case
197
+
198
+ ### Use Case 1: Validating Automated Evaluation
199
+
200
+ **Goal**: Ensure automated evaluation correlates with human judgment
201
+
202
+ **Recommended Metrics**:
203
+ 1. Primary: Spearman's ρ (for ordinal scales) or Cohen's κ (for categorical)
204
+ 2. Secondary: Per-criterion agreement
205
+ 3. Diagnostic: Confusion matrix for systematic errors
206
+
207
+ ```python
208
def validate_automated_eval(automated_scores, human_scores, criteria):
    """Correlate automated scores with human scores, overall and per criterion.

    Args:
        automated_scores: One entry per evaluated item. When ``criteria`` is
            non-empty each entry is indexed by criterion name.
        human_scores: Human scores in the same layout and order.
        criteria: Criterion names to report on.

    Returns:
        Dict with ``'overall_spearman'`` plus one ``'<criterion>_spearman'``
        entry per criterion, each being the result of ``spearmans_rho``.
    """
    criteria = list(criteria)
    results = {}

    def collapse(item):
        # Unweighted mean of an item's per-criterion scores.
        return sum(item[name] for name in criteria) / len(criteria)

    # Overall correlation. The per-item entries are mappings keyed by
    # criterion, which cannot be rank-correlated directly, so each item is
    # first collapsed to a single number. With no criteria the entries are
    # passed through unchanged (they are then expected to be plain numbers).
    if criteria:
        auto_overall = [collapse(s) for s in automated_scores]
        human_overall = [collapse(s) for s in human_scores]
    else:
        auto_overall = automated_scores
        human_overall = human_scores
    results['overall_spearman'] = spearmans_rho(auto_overall, human_overall)

    # Per-criterion agreement
    for criterion in criteria:
        auto_crit = [s[criterion] for s in automated_scores]
        human_crit = [s[criterion] for s in human_scores]
        results[f'{criterion}_spearman'] = spearmans_rho(auto_crit, human_crit)

    return results
221
+ ```
222
+
223
+ ### Use Case 2: Comparing Two Models
224
+
225
+ **Goal**: Determine which model produces better outputs
226
+
227
+ **Recommended Metrics**:
228
+ 1. Primary: Win rate (from pairwise comparison)
229
+ 2. Secondary: Position consistency (bias check)
230
+ 3. Diagnostic: Per-criterion breakdown
231
+
232
+ ```python
233
async def compare_models(model_a_outputs, model_b_outputs, prompts):
    """Run a position-swapped pairwise comparison per prompt and tally the winners.

    Declared ``async`` because the body awaits ``compare_with_position_swap``;
    ``await`` inside a plain ``def`` is a syntax error. Callers must await
    this coroutine (or run it with ``asyncio.run``).

    Args:
        model_a_outputs: Outputs of model A, one per prompt.
        model_b_outputs: Outputs of model B, in the same order.
        prompts: The prompts that produced those outputs.

    Returns:
        Dict with ``'a_wins'``, ``'b_wins'``, ``'ties'`` counts and the
        ``'position_consistency'`` rate (``0.0`` when there was nothing to
        compare).
    """
    results = []
    for a, b, p in zip(model_a_outputs, model_b_outputs, prompts):
        # NOTE(review): compare_with_position_swap is not defined in this
        # snippet; each result is expected to carry a 'winner' entry.
        comparison = await compare_with_position_swap(a, b, p)
        results.append(comparison)

    return {
        'a_wins': sum(1 for r in results if r['winner'] == 'A'),
        'b_wins': sum(1 for r in results if r['winner'] == 'B'),
        'ties': sum(1 for r in results if r['winner'] == 'TIE'),
        # Guard the empty case so an empty batch does not divide by zero.
        'position_consistency': position_consistency(results) if results else 0.0
    }
245
+ ```
246
+
247
+ ### Use Case 3: Quality Monitoring
248
+
249
+ **Goal**: Track evaluation quality over time
250
+
251
+ **Recommended Metrics**:
252
+ 1. Primary: Rolling agreement with human spot-checks
253
+ 2. Secondary: Score distribution stability
254
+ 3. Diagnostic: Bias indicators (position, length)
255
+
256
+ ```python
257
class QualityMonitor:
    """Rolling window of evaluations for tracking agreement with human spot-checks."""

    def __init__(self, window_size=100):
        """Create a monitor that keeps only the most recent ``window_size`` evaluations."""
        # Local import keeps the snippet self-contained; a bounded deque drops
        # the oldest entry automatically once the window is full.
        from collections import deque
        self.window = deque(maxlen=window_size)

    def add_evaluation(self, automated, human_spot_check=None):
        """Record one evaluation.

        Args:
            automated: Dict with at least a ``'response'`` entry (its length is
                stored) and, for ``get_metrics``, a ``'score'`` entry.
            human_spot_check: Optional dict with a ``'score'`` entry, or
                ``None`` when no human reviewed this item.
        """
        self.window.append({
            'automated': automated,
            'human': human_spot_check,
            'length': len(automated['response'])
        })

    def get_metrics(self):
        """Summarise the current window.

        Returns:
            ``{'insufficient_data': True}`` when fewer than 10 entries have a
            human spot-check; otherwise a dict with the automated/human
            ``'correlation'``, the ``'mean_difference'`` (automated minus
            human) and the ``'length_correlation'`` between response length and
            automated score across the whole window.
        """
        # Local import: numpy is only needed once there is enough data.
        import numpy as np

        # Filter to evaluations with human spot-checks
        with_human = [e for e in self.window if e['human'] is not None]

        if len(with_human) < 10:
            return {'insufficient_data': True}

        auto_scores = [e['automated']['score'] for e in with_human]
        human_scores = [e['human']['score'] for e in with_human]

        return {
            'correlation': spearmans_rho(auto_scores, human_scores),
            'mean_difference': np.mean([a - h for a, h in zip(auto_scores, human_scores)]),
            # Length bias is measured over every entry, not just spot-checked ones.
            'length_correlation': spearmans_rho(
                [e['length'] for e in self.window],
                [e['automated']['score'] for e in self.window]
            )
        }
286
+ ```
287
+
288
+ ## Interpreting Metric Results
289
+
290
+ ### Good Evaluation System Indicators
291
+
292
+ | Metric | Good | Acceptable | Concerning |
293
+ |--------|------|------------|------------|
294
+ | Spearman's ρ | > 0.8 | 0.6-0.8 | < 0.6 |
295
+ | Cohen's κ | > 0.7 | 0.5-0.7 | < 0.5 |
296
+ | Position consistency | > 0.9 | 0.8-0.9 | < 0.8 |
297
+ | Length correlation | < 0.2 | 0.2-0.4 | > 0.4 |
298
+
299
+ ### Warning Signs
300
+
301
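+ 1. **High agreement but low correlation**: Scores are probably concentrated in a narrow range or a single dominant class, which inflates raw agreement; check the score distribution and the judge's calibration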
302
+ 2. **Low position consistency**: Position bias affecting results
303
+ 3. **High length correlation**: Length bias inflating scores
304
+ 4. **Per-criterion variance**: Some criteria may be poorly defined
305
+
306
+ ## Reporting Template
307
+
308
+ ```markdown
309
+ ## Evaluation System Metrics Report
310
+
311
+ ### Human Agreement
312
+ - Spearman's ρ: 0.82 (p < 0.001)
313
+ - Cohen's κ: 0.74
314
+ - Sample size: 500 evaluations
315
+
316
+ ### Bias Indicators
317
+ - Position consistency: 91%
318
+ - Length-score correlation: 0.12
319
+
320
+ ### Per-Criterion Performance
321
+ | Criterion | Spearman's ρ | κ |
322
+ |-----------|--------------|---|
323
+ | Accuracy | 0.88 | 0.79 |
324
+ | Clarity | 0.76 | 0.68 |
325
+ | Completeness | 0.81 | 0.72 |
326
+
327
+ ### Recommendations
328
+ - All metrics within acceptable ranges
329
+ - Monitor "Clarity" criterion - lower agreement may indicate need for rubric refinement
330
+ ```
331
+
@@ -0,0 +1,337 @@
1
+ """
2
+ Advanced Evaluation Example
3
+
4
+ This script demonstrates the core evaluation patterns from the advanced-evaluation skill.
5
+ It uses plain Python with hard-coded example judge outputs (no model calls), so it runs in any Python environment without extra dependencies.
6
+ """
7
+
8
+ # =============================================================================
9
+ # DIRECT SCORING EXAMPLE
10
+ # =============================================================================
11
+
12
+ def direct_scoring_example():
13
+ """
14
+ Direct scoring: Rate a single response against defined criteria.
15
+ Best for objective criteria like accuracy, completeness, instruction following.
16
+ """
17
+
18
+ # Input
19
+ prompt = "Explain quantum entanglement to a high school student"
20
+ response = """
21
+ Quantum entanglement is like having two magical coins that are connected.
22
+ When you flip one and it lands on heads, the other instantly shows tails,
23
+ no matter how far apart they are. Scientists call this "spooky action at a distance."
24
+ """
25
+
26
+ criteria = [
27
+ {"name": "Accuracy", "description": "Scientific correctness", "weight": 0.4},
28
+ {"name": "Clarity", "description": "Understandable for audience", "weight": 0.3},
29
+ {"name": "Engagement", "description": "Interesting and memorable", "weight": 0.3}
30
+ ]
31
+
32
+ # System prompt for the evaluator
33
+ system_prompt = """You are an expert evaluator. Assess the response against each criterion.
34
+
35
+ For each criterion:
36
+ 1. Find specific evidence in the response
37
+ 2. Score according to the rubric (1-5 scale)
38
+ 3. Justify your score with evidence
39
+ 4. Suggest one specific improvement
40
+
41
+ Be objective and consistent. Base scores on explicit evidence."""
42
+
43
+ # User prompt structure
44
+ user_prompt = f"""## Original Prompt
45
+ {prompt}
46
+
47
+ ## Response to Evaluate
48
+ {response}
49
+
50
+ ## Criteria
51
+ 1. **Accuracy** (weight: 0.4): Scientific correctness
52
+ 2. **Clarity** (weight: 0.3): Understandable for audience
53
+ 3. **Engagement** (weight: 0.3): Interesting and memorable
54
+
55
+ ## Output Format
56
+ Respond with valid JSON:
57
+ {{
58
+ "scores": [
59
+ {{
60
+ "criterion": "Accuracy",
61
+ "score": 4,
62
+ "evidence": ["quote or observation"],
63
+ "justification": "why this score",
64
+ "improvement": "specific suggestion"
65
+ }}
66
+ ],
67
+ "summary": {{
68
+ "assessment": "overall quality summary",
69
+ "strengths": ["strength 1"],
70
+ "weaknesses": ["weakness 1"]
71
+ }}
72
+ }}"""
73
+
74
+ # Expected output structure
75
+ expected_output = {
76
+ "scores": [
77
+ {
78
+ "criterion": "Accuracy",
79
+ "score": 4,
80
+ "evidence": ["Correctly uses analogy", "Mentions spooky action at a distance"],
81
+ "justification": "Core concept is correct, analogy is appropriate",
82
+ "improvement": "Could mention it's a quantum mechanical phenomenon"
83
+ },
84
+ {
85
+ "criterion": "Clarity",
86
+ "score": 5,
87
+ "evidence": ["Simple coin analogy", "No jargon"],
88
+ "justification": "Appropriate for high school level",
89
+ "improvement": "None needed"
90
+ },
91
+ {
92
+ "criterion": "Engagement",
93
+ "score": 4,
94
+ "evidence": ["Magical coins", "Spooky action quote"],
95
+ "justification": "Memorable imagery and Einstein quote",
96
+ "improvement": "Could add a real-world application"
97
+ }
98
+ ],
99
+ "summary": {
100
+ "assessment": "Good explanation suitable for the target audience",
101
+ "strengths": ["Clear analogy", "Age-appropriate language"],
102
+ "weaknesses": ["Could be more comprehensive"]
103
+ }
104
+ }
105
+
106
+ # Calculate weighted score
107
+ total_weight = sum(c["weight"] for c in criteria)
108
+ weighted_score = sum(
109
+ s["score"] * next(c["weight"] for c in criteria if c["name"] == s["criterion"])
110
+ for s in expected_output["scores"]
111
+ ) / total_weight
112
+
113
+ print(f"Weighted Score: {weighted_score:.2f}/5")
114
+ return expected_output
115
+
116
+
117
+ # =============================================================================
118
+ # PAIRWISE COMPARISON WITH POSITION BIAS MITIGATION
119
+ # =============================================================================
120
+
121
+ def pairwise_comparison_example():
122
+ """
123
+ Pairwise comparison: Compare two responses and select the better one.
124
+ Includes position swapping to mitigate position bias.
125
+ Best for subjective preferences like tone, style, persuasiveness.
126
+ """
127
+
128
+ prompt = "Explain machine learning to a beginner"
129
+
130
+ response_a = """
131
+ Machine learning is a subset of artificial intelligence that enables
132
+ systems to learn and improve from experience without being explicitly
133
+ programmed. It uses statistical techniques to give computers the ability
134
+ to identify patterns in data.
135
+ """
136
+
137
+ response_b = """
138
+ Imagine teaching a dog a new trick. You show the dog what to do, give
139
+ treats when it's right, and eventually it learns. Machine learning works
140
+ similarly - we show computers lots of examples, tell them when they're
141
+ right, and they learn to recognize patterns on their own.
142
+ """
143
+
144
+ criteria = ["clarity", "accessibility", "accuracy"]
145
+
146
+ # System prompt emphasizing bias awareness
147
+ system_prompt = """You are an expert evaluator comparing two AI responses.
148
+
149
+ CRITICAL INSTRUCTIONS:
150
+ - Do NOT prefer responses because they are longer
151
+ - Do NOT prefer responses based on position (first vs second)
152
+ - Focus ONLY on quality according to the specified criteria
153
+ - Ties are acceptable when responses are genuinely equivalent"""
154
+
155
+ # First pass: A first, B second
156
+ def evaluate_pass(first_response, second_response, first_label, second_label):
157
+ user_prompt = f"""## Original Prompt
158
+ {prompt}
159
+
160
+ ## Response {first_label}
161
+ {first_response}
162
+
163
+ ## Response {second_label}
164
+ {second_response}
165
+
166
+ ## Comparison Criteria
167
+ {', '.join(criteria)}
168
+
169
+ ## Output Format
170
+ {{
171
+ "comparison": [
172
+ {{"criterion": "clarity", "winner": "A|B|TIE", "reasoning": "..."}}
173
+ ],
174
+ "result": {{
175
+ "winner": "A|B|TIE",
176
+ "confidence": 0.0-1.0,
177
+ "reasoning": "overall reasoning"
178
+ }}
179
+ }}"""
180
+ return user_prompt
181
+
182
+ # Position bias mitigation protocol
183
+ print("Pass 1: A in first position")
184
+ pass1_result = {"winner": "B", "confidence": 0.8}
185
+
186
+ print("Pass 2: B in first position (swapped)")
187
+ pass2_result = {"winner": "A", "confidence": 0.75} # A because B was first
188
+
189
+ # Map pass2 result back (swap labels)
190
+ def map_winner(winner):
191
+ return {"A": "B", "B": "A", "TIE": "TIE"}[winner]
192
+
193
+ pass2_mapped = map_winner(pass2_result["winner"])
194
+ print(f"Pass 2 mapped winner: {pass2_mapped}")
195
+
196
+ # Check consistency
197
+ consistent = pass1_result["winner"] == pass2_mapped
198
+
199
+ if consistent:
200
+ final_result = {
201
+ "winner": pass1_result["winner"],
202
+ "confidence": (pass1_result["confidence"] + pass2_result["confidence"]) / 2,
203
+ "position_consistent": True
204
+ }
205
+ else:
206
+ final_result = {
207
+ "winner": "TIE",
208
+ "confidence": 0.5,
209
+ "position_consistent": False,
210
+ "bias_detected": True
211
+ }
212
+
213
+ print(f"\nFinal Result: {final_result}")
214
+ return final_result
215
+
216
+
217
+ # =============================================================================
218
+ # RUBRIC GENERATION
219
+ # =============================================================================
220
+
221
+ def rubric_generation_example():
222
+ """
223
+ Generate a domain-specific scoring rubric.
224
+ Rubrics reduce evaluation variance by 40-60%.
225
+ """
226
+
227
+ criterion_name = "Code Readability"
228
+ criterion_description = "How easy the code is to understand and maintain"
229
+ domain = "software engineering"
230
+ scale = "1-5"
231
+ strictness = "balanced"
232
+
233
+ system_prompt = f"""You are an expert in creating evaluation rubrics.
234
+ Create clear, actionable rubrics with distinct boundaries between levels.
235
+
236
+ Strictness: {strictness}
237
+ - lenient: Lower bar for passing scores
238
+ - balanced: Fair, typical expectations
239
+ - strict: High standards, critical evaluation"""
240
+
241
+ user_prompt = f"""Create a scoring rubric for:
242
+
243
+ **Criterion**: {criterion_name}
244
+ **Description**: {criterion_description}
245
+ **Scale**: {scale}
246
+ **Domain**: {domain}
247
+
248
+ Generate:
249
+ 1. Clear descriptions for each score level
250
+ 2. Specific characteristics that define each level
251
+ 3. Brief example text for each level
252
+ 4. General scoring guidelines
253
+ 5. Edge cases with guidance"""
254
+
255
+ # Expected rubric structure
256
+ rubric = {
257
+ "criterion": criterion_name,
258
+ "scale": {"min": 1, "max": 5},
259
+ "levels": [
260
+ {
261
+ "score": 1,
262
+ "label": "Poor",
263
+ "description": "Code is difficult to understand without significant effort",
264
+ "characteristics": [
265
+ "No meaningful variable or function names",
266
+ "No comments or documentation",
267
+ "Deeply nested or convoluted logic"
268
+ ],
269
+ "example": "def f(x): return x[0]*x[1]+x[2]"
270
+ },
271
+ {
272
+ "score": 3,
273
+ "label": "Adequate",
274
+ "description": "Code is understandable with some effort",
275
+ "characteristics": [
276
+ "Most variables have meaningful names",
277
+ "Basic comments for complex sections",
278
+ "Logic is followable but could be cleaner"
279
+ ],
280
+ "example": "def calc_total(items): # calculate sum\n total = 0\n for i in items: total += i\n return total"
281
+ },
282
+ {
283
+ "score": 5,
284
+ "label": "Excellent",
285
+ "description": "Code is immediately clear and maintainable",
286
+ "characteristics": [
287
+ "All names are descriptive and consistent",
288
+ "Comprehensive documentation",
289
+ "Clean, modular structure"
290
+ ],
291
+ "example": "def calculate_total_price(items: List[Item]) -> Decimal:\n '''Calculate the total price of all items.'''\n return sum(item.price for item in items)"
292
+ }
293
+ ],
294
+ "scoring_guidelines": [
295
+ "Focus on readability, not cleverness",
296
+ "Consider the intended audience (team skill level)",
297
+ "Consistency matters more than style preference"
298
+ ],
299
+ "edge_cases": [
300
+ {
301
+ "situation": "Code uses domain-specific abbreviations",
302
+ "guidance": "Score based on readability for domain experts, not general audience"
303
+ },
304
+ {
305
+ "situation": "Code is auto-generated",
306
+ "guidance": "Apply same standards but note in evaluation"
307
+ }
308
+ ]
309
+ }
310
+
311
+ print("Generated Rubric:")
312
+ for level in rubric["levels"]:
313
+ print(f" {level['score']}: {level['label']} - {level['description']}")
314
+
315
+ return rubric
316
+
317
+
318
+ # =============================================================================
319
+ # MAIN
320
+ # =============================================================================
321
+
322
+ if __name__ == "__main__":
323
+ print("=" * 60)
324
+ print("DIRECT SCORING EXAMPLE")
325
+ print("=" * 60)
326
+ direct_scoring_example()
327
+
328
+ print("\n" + "=" * 60)
329
+ print("PAIRWISE COMPARISON EXAMPLE")
330
+ print("=" * 60)
331
+ pairwise_comparison_example()
332
+
333
+ print("\n" + "=" * 60)
334
+ print("RUBRIC GENERATION EXAMPLE")
335
+ print("=" * 60)
336
+ rubric_generation_example()
337
+