@shakudo/kaji-setup-external 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (411)
  1. package/README.md +155 -0
  2. package/assets/skills/ci-cd/.claude-plugin/plugin.json +8 -0
  3. package/assets/skills/ci-cd/SKILL.md +573 -0
  4. package/assets/skills/ci-cd/assets/templates/github-actions/docker-build.yml +164 -0
  5. package/assets/skills/ci-cd/assets/templates/github-actions/go-ci.yml +420 -0
  6. package/assets/skills/ci-cd/assets/templates/github-actions/node-ci.yml +313 -0
  7. package/assets/skills/ci-cd/assets/templates/github-actions/python-ci.yml +388 -0
  8. package/assets/skills/ci-cd/assets/templates/github-actions/security-scan.yml +416 -0
  9. package/assets/skills/ci-cd/assets/templates/gitlab-ci/docker-build.yml +298 -0
  10. package/assets/skills/ci-cd/assets/templates/gitlab-ci/go-ci.yml +548 -0
  11. package/assets/skills/ci-cd/assets/templates/gitlab-ci/node-ci.yml +334 -0
  12. package/assets/skills/ci-cd/assets/templates/gitlab-ci/python-ci.yml +472 -0
  13. package/assets/skills/ci-cd/assets/templates/gitlab-ci/security-scan.yml +479 -0
  14. package/assets/skills/ci-cd/references/best_practices.md +675 -0
  15. package/assets/skills/ci-cd/references/devsecops.md +862 -0
  16. package/assets/skills/ci-cd/references/optimization.md +651 -0
  17. package/assets/skills/ci-cd/references/security.md +611 -0
  18. package/assets/skills/ci-cd/references/troubleshooting.md +656 -0
  19. package/assets/skills/ci-cd/scripts/ci_health.py +301 -0
  20. package/assets/skills/ci-cd/scripts/pipeline_analyzer.py +440 -0
  21. package/assets/skills/context-optimization/CONTRIBUTING.md +78 -0
  22. package/assets/skills/context-optimization/LICENSE +22 -0
  23. package/assets/skills/context-optimization/README.md +228 -0
  24. package/assets/skills/context-optimization/SKILL.md +104 -0
  25. package/assets/skills/context-optimization/docs/agentskills.md +1264 -0
  26. package/assets/skills/context-optimization/docs/blogs.md +1230 -0
  27. package/assets/skills/context-optimization/docs/claude_research.md +85 -0
  28. package/assets/skills/context-optimization/docs/compression.md +298 -0
  29. package/assets/skills/context-optimization/docs/gemini_research.md +22 -0
  30. package/assets/skills/context-optimization/docs/hncapsule.md +92 -0
  31. package/assets/skills/context-optimization/docs/netflix_context.md +10 -0
  32. package/assets/skills/context-optimization/docs/vercel_tool.md +140 -0
  33. package/assets/skills/context-optimization/examples/book-sft-pipeline/README.md +78 -0
  34. package/assets/skills/context-optimization/examples/book-sft-pipeline/SKILL.md +380 -0
  35. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/README.md +168 -0
  36. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/dataset_sample.jsonl +5 -0
  37. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.04 AM.png +0 -0
  38. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.05.36 AM.png +0 -0
  39. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/pangram/Screenshot 2025-12-27 at 3.07.18 AM.png +0 -0
  40. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/sample_outputs.md +63 -0
  41. package/assets/skills/context-optimization/examples/book-sft-pipeline/examples/gertrude-stein/training_config.json +80 -0
  42. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/segmentation-strategies.md +324 -0
  43. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker-format.md +211 -0
  44. package/assets/skills/context-optimization/examples/book-sft-pipeline/references/tinker.txt +3176 -0
  45. package/assets/skills/context-optimization/examples/book-sft-pipeline/scripts/pipeline_example.py +187 -0
  46. package/assets/skills/context-optimization/examples/digital-brain-skill/AGENT.md +35 -0
  47. package/assets/skills/context-optimization/examples/digital-brain-skill/HOW-SKILLS-BUILT-THIS.md +407 -0
  48. package/assets/skills/context-optimization/examples/digital-brain-skill/README.md +209 -0
  49. package/assets/skills/context-optimization/examples/digital-brain-skill/SKILL.md +203 -0
  50. package/assets/skills/context-optimization/examples/digital-brain-skill/SKILLS-MAPPING.md +219 -0
  51. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/AGENTS.md +82 -0
  52. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/content_ideas.py +132 -0
  53. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/idea_to_draft.py +181 -0
  54. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/stale_contacts.py +139 -0
  55. package/assets/skills/context-optimization/examples/digital-brain-skill/agents/scripts/weekly_review.py +121 -0
  56. package/assets/skills/context-optimization/examples/digital-brain-skill/content/CONTENT.md +88 -0
  57. package/assets/skills/context-optimization/examples/digital-brain-skill/content/calendar.md +108 -0
  58. package/assets/skills/context-optimization/examples/digital-brain-skill/content/engagement.jsonl +2 -0
  59. package/assets/skills/context-optimization/examples/digital-brain-skill/content/ideas.jsonl +2 -0
  60. package/assets/skills/context-optimization/examples/digital-brain-skill/content/posts.jsonl +2 -0
  61. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/linkedin-post.md +102 -0
  62. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/newsletter.md +92 -0
  63. package/assets/skills/context-optimization/examples/digital-brain-skill/content/templates/thread.md +73 -0
  64. package/assets/skills/context-optimization/examples/digital-brain-skill/examples/content-workflow.md +204 -0
  65. package/assets/skills/context-optimization/examples/digital-brain-skill/examples/meeting-prep.md +243 -0
  66. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/IDENTITY.md +46 -0
  67. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/bio-variants.md +101 -0
  68. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/brand.md +165 -0
  69. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/content-generation.xml +46 -0
  70. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/prompts/reply-generator.xml +40 -0
  71. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/values.yaml +60 -0
  72. package/assets/skills/context-optimization/examples/digital-brain-skill/identity/voice.md +165 -0
  73. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/KNOWLEDGE.md +85 -0
  74. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/bookmarks.jsonl +2 -0
  75. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/competitors.md +117 -0
  76. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/learning.yaml +74 -0
  77. package/assets/skills/context-optimization/examples/digital-brain-skill/knowledge/research/_template.md +79 -0
  78. package/assets/skills/context-optimization/examples/digital-brain-skill/network/NETWORK.md +110 -0
  79. package/assets/skills/context-optimization/examples/digital-brain-skill/network/circles.yaml +80 -0
  80. package/assets/skills/context-optimization/examples/digital-brain-skill/network/contacts.jsonl +2 -0
  81. package/assets/skills/context-optimization/examples/digital-brain-skill/network/interactions.jsonl +2 -0
  82. package/assets/skills/context-optimization/examples/digital-brain-skill/network/intros.md +92 -0
  83. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/OPERATIONS.md +75 -0
  84. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/goals.yaml +83 -0
  85. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/meetings.jsonl +2 -0
  86. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/metrics.jsonl +2 -0
  87. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/reviews/_weekly_template.md +114 -0
  88. package/assets/skills/context-optimization/examples/digital-brain-skill/operations/todos.md +76 -0
  89. package/assets/skills/context-optimization/examples/digital-brain-skill/package.json +41 -0
  90. package/assets/skills/context-optimization/examples/digital-brain-skill/references/file-formats.md +386 -0
  91. package/assets/skills/context-optimization/examples/digital-brain-skill/scripts/install.sh +79 -0
  92. package/assets/skills/context-optimization/examples/interleaved_thinking/README.md +620 -0
  93. package/assets/skills/context-optimization/examples/interleaved_thinking/SKILL.md +221 -0
  94. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/agentthinking.md +63 -0
  95. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/interleavedthinking.md +610 -0
  96. package/assets/skills/context-optimization/examples/interleaved_thinking/docs/m2-1.md +224 -0
  97. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/01_basic_capture.py +76 -0
  98. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/02_tool_usage.py +187 -0
  99. package/assets/skills/context-optimization/examples/interleaved_thinking/examples/03_full_optimization.py +1222 -0
  100. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/SKILL.md +90 -0
  101. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimization_summary.json +9 -0
  102. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/optimized_prompt.txt +1 -0
  103. package/assets/skills/context-optimization/examples/interleaved_thinking/generated_skills/comprehensive-research-agent/references/patterns_found.json +205 -0
  104. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/final_prompt.txt +67 -0
  105. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/analysis.txt +48 -0
  106. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimization.txt +15 -0
  107. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/optimized_prompt.txt +1 -0
  108. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_1/trace.txt +178 -0
  109. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/analysis.txt +47 -0
  110. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_10/trace.txt +162 -0
  111. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/analysis.txt +48 -0
  112. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimization.txt +130 -0
  113. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/optimized_prompt.txt +72 -0
  114. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_2/trace.txt +156 -0
  115. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/analysis.txt +46 -0
  116. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimization.txt +147 -0
  117. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/optimized_prompt.txt +84 -0
  118. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_3/trace.txt +159 -0
  119. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/analysis.txt +46 -0
  120. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimization.txt +134 -0
  121. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/optimized_prompt.txt +67 -0
  122. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_4/trace.txt +165 -0
  123. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/analysis.txt +50 -0
  124. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimization.txt +135 -0
  125. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/optimized_prompt.txt +71 -0
  126. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_5/trace.txt +146 -0
  127. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/analysis.txt +15 -0
  128. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimization.txt +15 -0
  129. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/optimized_prompt.txt +1 -0
  130. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_6/trace.txt +147 -0
  131. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/analysis.txt +46 -0
  132. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimization.txt +103 -0
  133. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/optimized_prompt.txt +45 -0
  134. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_7/trace.txt +134 -0
  135. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/analysis.txt +47 -0
  136. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimization.txt +114 -0
  137. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/optimized_prompt.txt +60 -0
  138. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_8/trace.txt +135 -0
  139. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/analysis.txt +44 -0
  140. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimization.txt +106 -0
  141. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/optimized_prompt.txt +51 -0
  142. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/iteration_9/trace.txt +170 -0
  143. package/assets/skills/context-optimization/examples/interleaved_thinking/optimization_artifacts/summary.json +11 -0
  144. package/assets/skills/context-optimization/examples/interleaved_thinking/pyproject.toml +70 -0
  145. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/__init__.py +53 -0
  146. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/analyzer.py +465 -0
  147. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/capture.py +417 -0
  148. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/cli.py +271 -0
  149. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/loop.py +468 -0
  150. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/models.py +193 -0
  151. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/optimizer.py +449 -0
  152. package/assets/skills/context-optimization/examples/interleaved_thinking/reasoning_trace_optimizer/skill_generator.py +502 -0
  153. package/assets/skills/context-optimization/examples/interleaved_thinking/tests/__init__.py +1 -0
  154. package/assets/skills/context-optimization/examples/interleaved_thinking/tests/test_models.py +144 -0
  155. package/assets/skills/context-optimization/examples/llm-as-judge-skills/.prettierrc +8 -0
  156. package/assets/skills/context-optimization/examples/llm-as-judge-skills/CONTRIBUTING.md +78 -0
  157. package/assets/skills/context-optimization/examples/llm-as-judge-skills/LICENSE +21 -0
  158. package/assets/skills/context-optimization/examples/llm-as-judge-skills/README.md +659 -0
  159. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/evaluator-agent/evaluator-agent.md +177 -0
  160. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/index.md +114 -0
  161. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/orchestrator-agent/orchestrator-agent.md +205 -0
  162. package/assets/skills/context-optimization/examples/llm-as-judge-skills/agents/research-agent/research-agent.md +183 -0
  163. package/assets/skills/context-optimization/examples/llm-as-judge-skills/env.example +6 -0
  164. package/assets/skills/context-optimization/examples/llm-as-judge-skills/eslint.config.js +18 -0
  165. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/basic-evaluation.ts +89 -0
  166. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/full-evaluation-workflow.ts +136 -0
  167. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/generate-rubric.ts +67 -0
  168. package/assets/skills/context-optimization/examples/llm-as-judge-skills/examples/pairwise-comparison.ts +97 -0
  169. package/assets/skills/context-optimization/examples/llm-as-judge-skills/package.json +79 -0
  170. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/agent-system/orchestrator-prompt.md +197 -0
  171. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/direct-scoring-prompt.md +153 -0
  172. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/evaluation/pairwise-comparison-prompt.md +200 -0
  173. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/index.md +138 -0
  174. package/assets/skills/context-optimization/examples/llm-as-judge-skills/prompts/research/research-synthesis-prompt.md +171 -0
  175. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/context-fundamentals/context-fundamentals.md +114 -0
  176. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/index.md +79 -0
  177. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/llm-evaluator/llm-evaluator.md +77 -0
  178. package/assets/skills/context-optimization/examples/llm-as-judge-skills/skills/tool-design/tool-design.md +198 -0
  179. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/evaluator.ts +112 -0
  180. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/agents/index.ts +3 -0
  181. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/config/index.ts +18 -0
  182. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/index.ts +19 -0
  183. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts +164 -0
  184. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/generate-rubric.ts +161 -0
  185. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/index.ts +9 -0
  186. package/assets/skills/context-optimization/examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts +255 -0
  187. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/evaluation.test.ts +233 -0
  188. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/setup.ts +27 -0
  189. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tests/skills.test.ts +213 -0
  190. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/direct-score.md +159 -0
  191. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/generate-rubric.md +189 -0
  192. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/evaluation/pairwise-compare.md +182 -0
  193. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/index.md +141 -0
  194. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/orchestration/delegate-to-agent.md +171 -0
  195. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/read-url.md +162 -0
  196. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tools/research/web-search.md +128 -0
  197. package/assets/skills/context-optimization/examples/llm-as-judge-skills/tsconfig.json +26 -0
  198. package/assets/skills/context-optimization/examples/llm-as-judge-skills/vitest.config.ts +20 -0
  199. package/assets/skills/context-optimization/examples/x-to-book-system/PRD.md +644 -0
  200. package/assets/skills/context-optimization/examples/x-to-book-system/README.md +181 -0
  201. package/assets/skills/context-optimization/examples/x-to-book-system/SKILLS-MAPPING.md +187 -0
  202. package/assets/skills/context-optimization/researcher/example_output.md +75 -0
  203. package/assets/skills/context-optimization/researcher/llm-as-a-judge.md +362 -0
  204. package/assets/skills/context-optimization/skills/advanced-evaluation/SKILL.md +454 -0
  205. package/assets/skills/context-optimization/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
  206. package/assets/skills/context-optimization/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
  207. package/assets/skills/context-optimization/skills/advanced-evaluation/references/metrics-guide.md +331 -0
  208. package/assets/skills/context-optimization/skills/advanced-evaluation/scripts/evaluation_example.py +337 -0
  209. package/assets/skills/context-optimization/skills/bdi-mental-states/SKILL.md +295 -0
  210. package/assets/skills/context-optimization/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
  211. package/assets/skills/context-optimization/skills/bdi-mental-states/references/framework-integration.md +582 -0
  212. package/assets/skills/context-optimization/skills/bdi-mental-states/references/rdf-examples.md +315 -0
  213. package/assets/skills/context-optimization/skills/bdi-mental-states/references/sparql-competency.md +420 -0
  214. package/assets/skills/context-optimization/skills/context-compression/SKILL.md +265 -0
  215. package/assets/skills/context-optimization/skills/context-compression/references/evaluation-framework.md +213 -0
  216. package/assets/skills/context-optimization/skills/context-compression/scripts/compression_evaluator.py +658 -0
  217. package/assets/skills/context-optimization/skills/context-degradation/SKILL.md +231 -0
  218. package/assets/skills/context-optimization/skills/context-degradation/references/patterns.md +314 -0
  219. package/assets/skills/context-optimization/skills/context-degradation/scripts/degradation_detector.py +419 -0
  220. package/assets/skills/context-optimization/skills/context-fundamentals/SKILL.md +185 -0
  221. package/assets/skills/context-optimization/skills/context-fundamentals/references/context-components.md +283 -0
  222. package/assets/skills/context-optimization/skills/context-fundamentals/scripts/context_manager.py +370 -0
  223. package/assets/skills/context-optimization/skills/context-optimization/SKILL.md +179 -0
  224. package/assets/skills/context-optimization/skills/context-optimization/references/optimization_techniques.md +272 -0
  225. package/assets/skills/context-optimization/skills/context-optimization/scripts/compaction.py +379 -0
  226. package/assets/skills/context-optimization/skills/evaluation/SKILL.md +231 -0
  227. package/assets/skills/context-optimization/skills/evaluation/references/metrics.md +339 -0
  228. package/assets/skills/context-optimization/skills/evaluation/scripts/evaluator.py +474 -0
  229. package/assets/skills/context-optimization/skills/filesystem-context/SKILL.md +321 -0
  230. package/assets/skills/context-optimization/skills/filesystem-context/references/implementation-patterns.md +549 -0
  231. package/assets/skills/context-optimization/skills/filesystem-context/scripts/filesystem_context.py +353 -0
  232. package/assets/skills/context-optimization/skills/hosted-agents/SKILL.md +279 -0
  233. package/assets/skills/context-optimization/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
  234. package/assets/skills/context-optimization/skills/hosted-agents/scripts/sandbox_manager.py +495 -0
  235. package/assets/skills/context-optimization/skills/memory-systems/SKILL.md +221 -0
  236. package/assets/skills/context-optimization/skills/memory-systems/references/implementation.md +458 -0
  237. package/assets/skills/context-optimization/skills/memory-systems/scripts/memory_store.py +396 -0
  238. package/assets/skills/context-optimization/skills/multi-agent-patterns/SKILL.md +255 -0
  239. package/assets/skills/context-optimization/skills/multi-agent-patterns/references/frameworks.md +433 -0
  240. package/assets/skills/context-optimization/skills/multi-agent-patterns/scripts/coordination.py +439 -0
  241. package/assets/skills/context-optimization/skills/project-development/SKILL.md +342 -0
  242. package/assets/skills/context-optimization/skills/project-development/references/case-studies.md +388 -0
  243. package/assets/skills/context-optimization/skills/project-development/references/pipeline-patterns.md +610 -0
  244. package/assets/skills/context-optimization/skills/project-development/scripts/pipeline_template.py +677 -0
  245. package/assets/skills/context-optimization/skills/tool-design/SKILL.md +311 -0
  246. package/assets/skills/context-optimization/skills/tool-design/references/architectural_reduction.md +210 -0
  247. package/assets/skills/context-optimization/skills/tool-design/references/best_practices.md +176 -0
  248. package/assets/skills/context-optimization/skills/tool-design/scripts/description_generator.py +237 -0
  249. package/assets/skills/context-optimization/template/SKILL.md +98 -0
  250. package/assets/skills/dremio-analytics/SKILL.md +287 -0
  251. package/assets/skills/elevenlabs-voice/SKILL.md +269 -0
  252. package/assets/skills/git-workflow/SKILL.md +266 -0
  253. package/assets/skills/gitops-workflows/.claude-plugin/plugin.json +8 -0
  254. package/assets/skills/gitops-workflows/SKILL.md +568 -0
  255. package/assets/skills/gitops-workflows/assets/applicationsets/cluster-generator.yaml +32 -0
  256. package/assets/skills/gitops-workflows/assets/argocd/install-argocd-3.x.yaml +92 -0
  257. package/assets/skills/gitops-workflows/assets/flux/flux-bootstrap-github.sh +49 -0
  258. package/assets/skills/gitops-workflows/assets/flux/oci-helmrelease.yaml +38 -0
  259. package/assets/skills/gitops-workflows/assets/progressive-delivery/argo-rollouts-canary.yaml +62 -0
  260. package/assets/skills/gitops-workflows/assets/secrets/sops-age-config.yaml +33 -0
  261. package/assets/skills/gitops-workflows/references/argocd_vs_flux.md +243 -0
  262. package/assets/skills/gitops-workflows/references/best_practices.md +160 -0
  263. package/assets/skills/gitops-workflows/references/multi_cluster.md +80 -0
  264. package/assets/skills/gitops-workflows/references/oci_artifacts.md +290 -0
  265. package/assets/skills/gitops-workflows/references/progressive_delivery.md +94 -0
  266. package/assets/skills/gitops-workflows/references/repo_patterns.md +184 -0
  267. package/assets/skills/gitops-workflows/references/secret_management.md +213 -0
  268. package/assets/skills/gitops-workflows/references/troubleshooting.md +134 -0
  269. package/assets/skills/gitops-workflows/scripts/applicationset_generator.py +156 -0
  270. package/assets/skills/gitops-workflows/scripts/check_argocd_health.py +275 -0
  271. package/assets/skills/gitops-workflows/scripts/check_flux_health.py +418 -0
  272. package/assets/skills/gitops-workflows/scripts/oci_artifact_checker.py +150 -0
  273. package/assets/skills/gitops-workflows/scripts/promotion_validator.py +88 -0
  274. package/assets/skills/gitops-workflows/scripts/secret_audit.py +178 -0
  275. package/assets/skills/gitops-workflows/scripts/sync_drift_detector.py +144 -0
  276. package/assets/skills/gitops-workflows/scripts/validate_gitops_repo.py +299 -0
  277. package/assets/skills/iac-terraform/.claude-plugin/plugin.json +8 -0
  278. package/assets/skills/iac-terraform/SKILL.md +653 -0
  279. package/assets/skills/iac-terraform/assets/templates/MODULE_TEMPLATE.md +386 -0
  280. package/assets/skills/iac-terraform/assets/workflows/github-actions-terraform.yml +224 -0
  281. package/assets/skills/iac-terraform/assets/workflows/github-actions-terragrunt.yml +236 -0
  282. package/assets/skills/iac-terraform/assets/workflows/gitlab-ci-terraform.yml +184 -0
  283. package/assets/skills/iac-terraform/references/best_practices.md +709 -0
  284. package/assets/skills/iac-terraform/references/cost_optimization.md +665 -0
  285. package/assets/skills/iac-terraform/references/troubleshooting.md +635 -0
  286. package/assets/skills/iac-terraform/scripts/init_module.py +319 -0
  287. package/assets/skills/iac-terraform/scripts/inspect_state.py +232 -0
  288. package/assets/skills/iac-terraform/scripts/validate_module.py +227 -0
  289. package/assets/skills/k8s-troubleshooter/.claude-plugin/plugin.json +8 -0
  290. package/assets/skills/k8s-troubleshooter/SKILL.md +336 -0
  291. package/assets/skills/k8s-troubleshooter/references/common_issues.md +582 -0
  292. package/assets/skills/k8s-troubleshooter/references/helm_troubleshooting.md +708 -0
  293. package/assets/skills/k8s-troubleshooter/references/incident_response.md +466 -0
  294. package/assets/skills/k8s-troubleshooter/references/performance_troubleshooting.md +687 -0
  295. package/assets/skills/k8s-troubleshooter/scripts/check_namespace.py +500 -0
  296. package/assets/skills/k8s-troubleshooter/scripts/cluster_health.py +223 -0
  297. package/assets/skills/k8s-troubleshooter/scripts/diagnose_pod.py +157 -0
  298. package/assets/skills/mattermost-notify/SKILL.md +248 -0
  299. package/assets/skills/monitoring-observability/SKILL.md +869 -0
  300. package/assets/skills/monitoring-observability/assets/templates/otel-config/collector-config.yaml +227 -0
  301. package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/kubernetes-alerts.yml +293 -0
  302. package/assets/skills/monitoring-observability/assets/templates/prometheus-alerts/webapp-alerts.yml +243 -0
  303. package/assets/skills/monitoring-observability/assets/templates/runbooks/incident-runbook-template.md +409 -0
  304. package/assets/skills/monitoring-observability/monitoring-observability.skill +0 -0
  305. package/assets/skills/monitoring-observability/references/alerting_best_practices.md +609 -0
  306. package/assets/skills/monitoring-observability/references/datadog_migration.md +649 -0
  307. package/assets/skills/monitoring-observability/references/dql_promql_translation.md +756 -0
  308. package/assets/skills/monitoring-observability/references/logging_guide.md +775 -0
  309. package/assets/skills/monitoring-observability/references/metrics_design.md +406 -0
  310. package/assets/skills/monitoring-observability/references/slo_sla_guide.md +652 -0
  311. package/assets/skills/monitoring-observability/references/tool_comparison.md +697 -0
  312. package/assets/skills/monitoring-observability/references/tracing_guide.md +663 -0
  313. package/assets/skills/monitoring-observability/scripts/alert_quality_checker.py +315 -0
  314. package/assets/skills/monitoring-observability/scripts/analyze_metrics.py +279 -0
  315. package/assets/skills/monitoring-observability/scripts/dashboard_generator.py +395 -0
  316. package/assets/skills/monitoring-observability/scripts/datadog_cost_analyzer.py +477 -0
  317. package/assets/skills/monitoring-observability/scripts/health_check_validator.py +297 -0
  318. package/assets/skills/monitoring-observability/scripts/log_analyzer.py +321 -0
  319. package/assets/skills/monitoring-observability/scripts/slo_calculator.py +365 -0
  320. package/assets/skills/neo4j-graph-rag/SKILL.md +258 -0
  321. package/assets/skills/pagerduty-ops/SKILL.md +380 -0
  322. package/assets/skills/playwright/API_REFERENCE.md +653 -0
  323. package/assets/skills/playwright/SKILL.md +453 -0
  324. package/assets/skills/playwright/lib/helpers.js +441 -0
  325. package/assets/skills/playwright/package.json +26 -0
  326. package/assets/skills/playwright/run.js +228 -0
  327. package/assets/skills/project-memory/README.md +687 -0
  328. package/assets/skills/project-memory/SKILL.md +298 -0
  329. package/assets/skills/project-memory/references/bugs_template.md +41 -0
  330. package/assets/skills/project-memory/references/decisions_template.md +92 -0
  331. package/assets/skills/project-memory/references/issues_template.md +76 -0
  332. package/assets/skills/project-memory/references/key_facts_template.md +158 -0
  333. package/assets/skills/recruit-workflow/SKILL.md +276 -0
  334. package/assets/skills/recruit-workflow/references/email-templates.md +347 -0
  335. package/assets/skills/recruit-workflow/references/workflow-stages.md +395 -0
  336. package/assets/skills/recruit-workflow/scripts/clay_client.py +188 -0
  337. package/assets/skills/recruit-workflow/scripts/lever_client.py +197 -0
  338. package/assets/skills/recruit-workflow/scripts/mailgun_client.py +245 -0
  339. package/assets/skills/recruit-workflow/scripts/minio_client.py +426 -0
  340. package/assets/skills/shakudo-microservice/SKILL.md +215 -0
  341. package/assets/skills/tmux/SKILL.md +631 -0
  342. package/assets/skills/tmux/references/direct-socket-control.md +108 -0
  343. package/assets/skills/tmux/references/session-lifecycle.md +503 -0
  344. package/assets/skills/tmux/references/session-registry.md +1484 -0
  345. package/assets/skills/tmux/tools/cleanup-sessions.sh +263 -0
  346. package/assets/skills/tmux/tools/create-session.sh +224 -0
  347. package/assets/skills/tmux/tools/find-sessions.sh +262 -0
  348. package/assets/skills/tmux/tools/kill-session.sh +308 -0
  349. package/assets/skills/tmux/tools/lib/registry.sh +437 -0
  350. package/assets/skills/tmux/tools/lib/time_utils.sh +54 -0
  351. package/assets/skills/tmux/tools/list-sessions.sh +255 -0
  352. package/assets/skills/tmux/tools/pane-health.sh +424 -0
  353. package/assets/skills/tmux/tools/safe-send.sh +503 -0
  354. package/assets/skills/tmux/tools/wait-for-text.sh +260 -0
  355. package/assets/skills/twilio-sms/SKILL.md +508 -0
  356. package/assets/skills/zellij/SKILL.md +274 -0
  357. package/assets/skills/zellij/references/actions.md +558 -0
  358. package/assets/skills/zellij/references/layouts.md +424 -0
  359. package/bin/cli.ts +46 -0
  360. package/package.json +43 -0
  361. package/src/alias.ts +108 -0
  362. package/src/backup.ts +51 -0
  363. package/src/config.ts +115 -0
  364. package/src/dependencies.ts +163 -0
  365. package/src/errors.ts +77 -0
  366. package/src/index.ts +207 -0
  367. package/src/prompts.ts +142 -0
  368. package/src/schemas.ts +21 -0
  369. package/src/skills.ts +45 -0
  370. package/src/speckit.ts +116 -0
  371. package/src/types.ts +106 -0
  372. package/src/utils.ts +110 -0
  373. package/src/vibe-git.ts +50 -0
  374. package/templates/.specify/memory/constitution.md +109 -0
  375. package/templates/.specify/scripts/bash/check-prerequisites.sh +262 -0
  376. package/templates/.specify/scripts/bash/common.sh +670 -0
  377. package/templates/.specify/scripts/bash/create-new-feature.sh +594 -0
  378. package/templates/.specify/scripts/bash/create-worktree-feature.sh +401 -0
  379. package/templates/.specify/scripts/bash/init-workspace.sh +433 -0
  380. package/templates/.specify/scripts/bash/list-spec-worktrees.sh +198 -0
  381. package/templates/.specify/scripts/bash/setup-plan.sh +105 -0
  382. package/templates/.specify/scripts/bash/test-workspace-rollup.sh +175 -0
  383. package/templates/.specify/scripts/bash/update-agent-context.sh +799 -0
  384. package/templates/.specify/templates/agent-file-template.md +28 -0
  385. package/templates/.specify/templates/checklist-template.md +40 -0
  386. package/templates/.specify/templates/commands/analyze.md +197 -0
  387. package/templates/.specify/templates/commands/checklist.md +306 -0
  388. package/templates/.specify/templates/commands/clarify.md +194 -0
  389. package/templates/.specify/templates/commands/constitution.md +97 -0
  390. package/templates/.specify/templates/commands/implement.md +149 -0
  391. package/templates/.specify/templates/commands/plan.md +123 -0
  392. package/templates/.specify/templates/commands/projects.md +48 -0
  393. package/templates/.specify/templates/commands/rollup.md +66 -0
  394. package/templates/.specify/templates/commands/specify.md +275 -0
  395. package/templates/.specify/templates/commands/specs.md +71 -0
  396. package/templates/.specify/templates/commands/tasks.md +151 -0
  397. package/templates/.specify/templates/commands/taskstoissues.md +35 -0
  398. package/templates/.specify/templates/commands/workspace.md +128 -0
  399. package/templates/.specify/templates/plan-template.md +104 -0
  400. package/templates/.specify/templates/spec-template.md +115 -0
  401. package/templates/.specify/templates/tasks-template.md +251 -0
  402. package/templates/.specify/templates/workspace.yaml +110 -0
  403. package/templates/.specify/workspace.yaml +95 -0
  404. package/templates/AGENTS.md +460 -0
  405. package/templates/oh-my-opencode.json +27 -0
  406. package/templates/opencode.json +383 -0
  407. package/templates/package.json +10 -0
  408. package/templates/project-memory/bugs.md +16 -0
  409. package/templates/project-memory/decisions.md +22 -0
  410. package/templates/project-memory/issues.md +15 -0
  411. package/templates/project-memory/key_facts.md +26 -0
@@ -0,0 +1,659 @@
1
+ # LLM-as-a-Judge Skills
2
+
3
+ > A practical implementation of LLM evaluation skills built using insights from [Eugene Yan's LLM-Evaluators research](https://eugeneyan.com/writing/llm-evaluators/) and [Vercel AI SDK 6](https://vercel.com/blog/ai-sdk-6).
4
+
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![TypeScript](https://img.shields.io/badge/TypeScript-5.6-blue.svg)](https://www.typescriptlang.org/)
7
+ [![AI SDK](https://img.shields.io/badge/AI%20SDK-4.1-green.svg)](https://sdk.vercel.ai/)
8
+ [![Tests](https://img.shields.io/badge/Tests-19%20passed-brightgreen.svg)](#-test-results)
9
+
10
+ ## 🎯 Purpose
11
+
12
+ This repository demonstrates how to build **production-ready LLM evaluation skills** as part of the [Agent Skills for Context Engineering](https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering) project. It serves as a practical example of:
13
+
14
+ 1. **Skill Development**: How to transform research insights into executable agent skills
15
+ 2. **Tool Design**: Best practices for building AI tools with proper schemas and error handling
16
+ 3. **Evaluation Patterns**: Implementation of LLM-as-a-Judge patterns for quality assessment
17
+
18
+ ### Part of the Context Engineering Ecosystem
19
+
20
+ This project is an example implementation to be added to:
21
+ - 📁 [`Agent-Skills-for-Context-Engineering/examples/`](https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering/tree/main/examples)
22
+
23
+ It builds upon the foundational skills from:
24
+ - 📚 [`skills/context-fundamentals`](https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering/tree/main/skills/context-fundamentals) - Context engineering principles
25
+ - 🔧 [`skills/tool-design`](https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering/tree/main/skills/tool-design) - Tool design best practices
26
+
27
+ ---
28
+
29
+ ## 📖 Background & Research
30
+
31
+ ### The LLM-as-a-Judge Problem
32
+
33
+ Evaluating AI-generated content is challenging. Traditional n-gram overlap metrics (BLEU, ROUGE) often miss the qualities that matter in practice, such as factual accuracy, clarity, and instruction following. Eugene Yan's research on [LLM-Evaluators](https://eugeneyan.com/writing/llm-evaluators/) identifies practical patterns for using LLMs to judge LLM outputs.
34
+
35
+ **Key insights we implemented:**
36
+
37
+ | Insight | Implementation |
38
+ |---------|----------------|
39
+ | Direct scoring works best for objective criteria | `directScore` tool with rubric support |
40
+ | Pairwise comparison is more reliable for preferences | `pairwiseCompare` tool with position swapping |
41
+ | Position bias affects pairwise judgments | Automatic position swapping in comparisons |
42
+ | Chain-of-thought improves reliability | All evaluations require justification with evidence |
43
+ | Clear rubrics reduce variance | `generateRubric` tool for consistent standards |
44
+
45
+ ### Vercel AI SDK 6 Patterns
46
+
47
+ We leveraged AI SDK 6's new patterns:
48
+
49
+ - **Agent Abstraction**: Reusable `EvaluatorAgent` class with multiple capabilities
50
+ - **Type-safe Tools**: Zod schemas for all inputs/outputs
51
+ - **Structured Output**: JSON responses parsed and validated
52
+ - **Error Handling**: Graceful degradation when API calls fail
53
+
54
+ ---
55
+
56
+ ## 🏗️ What We Built
57
+
58
+ ### Architecture Overview
59
+
60
+ ```
61
+ ┌─────────────────────────────────────────────────────────────────────┐
62
+ │ LLM-as-a-Judge Skills │
63
+ ├─────────────────────────────────────────────────────────────────────┤
64
+ │ │
65
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │
66
+ │ │ Skills │ │ Prompts │ │ Tools │ │
67
+ │ │ (MD docs) │───▶│ (templates)│───▶│ (TypeScript impl) │ │
68
+ │ └─────────────┘ └─────────────┘ └─────────────────────────┘ │
69
+ │ │ │ │
70
+ │ │ ▼ │
71
+ │ │ ┌─────────────────────────┐ │
72
+ │ └─────────────────────────────▶│ EvaluatorAgent │ │
73
+ │ │ ├── score() │ │
74
+ │ │ ├── compare() │ │
75
+ │ │ ├── generateRubric() │ │
76
+ │ │ └── chat() │ │
77
+ │ └─────────────────────────┘ │
78
+ │ │ │
79
+ │ ▼ │
80
+ │ ┌─────────────────────────┐ │
81
+ │ │ OpenAI GPT-5.2 API │ │
82
+ │ └─────────────────────────┘ │
83
+ │ │
84
+ └─────────────────────────────────────────────────────────────────────┘
85
+ ```
86
+
87
+ ### Directory Structure
88
+
89
+ ```
90
+ llm-as-judge-skills/
91
+ ├── skills/ # Foundational knowledge (MD docs)
92
+ │ ├── llm-evaluator/ # LLM-as-a-Judge patterns
93
+ │ │ └── llm-evaluator.md # Evaluation methods, metrics, bias mitigation
94
+ │ ├── context-fundamentals/ # Context engineering principles
95
+ │ │ └── context-fundamentals.md # Managing context effectively
96
+ │ └── tool-design/ # Tool design best practices
97
+ │ └── tool-design.md # Schema design, error handling
98
+
99
+ ├── prompts/ # Prompt templates
100
+ │ ├── evaluation/
101
+ │ │ ├── direct-scoring-prompt.md # Scoring prompt template
102
+ │ │ └── pairwise-comparison-prompt.md # Comparison prompt template
103
+ │ ├── research/
104
+ │ │ └── research-synthesis-prompt.md
105
+ │ └── agent-system/
106
+ │ └── orchestrator-prompt.md
107
+
108
+ ├── tools/ # Tool documentation (MD)
109
+ │ ├── evaluation/
110
+ │ │ ├── direct-score.md # Direct scoring tool spec
111
+ │ │ ├── pairwise-compare.md # Pairwise comparison spec
112
+ │ │ └── generate-rubric.md # Rubric generation spec
113
+ │ ├── research/
114
+ │ │ ├── web-search.md
115
+ │ │ └── read-url.md
116
+ │ └── orchestration/
117
+ │ └── delegate-to-agent.md
118
+
119
+ ├── agents/ # Agent documentation (MD)
120
+ │ ├── evaluator-agent/
121
+ │ │ └── evaluator-agent.md
122
+ │ ├── research-agent/
123
+ │ │ └── research-agent.md
124
+ │ └── orchestrator-agent/
125
+ │ └── orchestrator-agent.md
126
+
127
+ ├── src/ # TypeScript implementation
128
+ │ ├── tools/evaluation/
129
+ │ │ ├── direct-score.ts # 165 lines - Direct scoring implementation
130
+ │ │ ├── pairwise-compare.ts # 255 lines - Pairwise with bias mitigation
131
+ │ │ └── generate-rubric.ts # 162 lines - Rubric generation
132
+ │ ├── agents/
133
+ │ │ └── evaluator.ts # 112 lines - EvaluatorAgent class
134
+ │ ├── config/
135
+ │ │ └── index.ts # Configuration and validation
136
+ │ └── index.ts # Main exports
137
+
138
+ ├── tests/ # Test suite
139
+ │ ├── evaluation.test.ts # 9 tests for tools
140
+ │ ├── skills.test.ts # 10 tests for skills
141
+ │ └── setup.ts # Test configuration
142
+
143
+ └── examples/ # Usage examples
144
+ ├── basic-evaluation.ts
145
+ ├── pairwise-comparison.ts
146
+ ├── generate-rubric.ts
147
+ └── full-evaluation-workflow.ts
148
+ ```
149
+
150
+ ---
151
+
152
+ ## 🔧 Core Tools Implemented
153
+
154
+ ### 1. Direct Score Tool (`directScore`)
155
+
156
+ **Purpose**: Evaluate a single response against defined criteria with numerical scores.
157
+
158
+ **When to Use**:
159
+ - Factual accuracy checks
160
+ - Instruction following assessment
161
+ - Content quality grading
162
+ - Compliance verification
163
+
164
+ **Implementation Highlights**:
165
+
166
+ ```typescript
167
+ // From src/tools/evaluation/direct-score.ts
168
+
169
+ const systemPrompt = `You are an expert evaluator. Assess the response against each criterion.
170
+ For each criterion:
171
+ 1. Find specific evidence in the response
172
+ 2. Score according to the rubric (1-5 scale)
173
+ 3. Justify your score
174
+ 4. Suggest one improvement
175
+
176
+ Be objective and consistent. Base scores on explicit evidence.`;
177
+ ```
178
+
179
+ **Key Features**:
180
+ - Weighted criteria support
181
+ - Chain-of-thought justification required
182
+ - Evidence extraction from response
183
+ - Improvement suggestions per criterion
184
+ - Configurable rubrics (1-3, 1-5, 1-10 scales)
185
+
186
+ **Example Usage**:
187
+
188
+ ```typescript
189
+ const result = await executeDirectScore({
190
+ response: 'Quantum entanglement is like having two magical coins...',
191
+ prompt: 'Explain quantum entanglement to a high school student',
192
+ criteria: [
193
+ { name: 'Accuracy', description: 'Scientific correctness', weight: 0.4 },
194
+ { name: 'Clarity', description: 'Understandable for audience', weight: 0.3 },
195
+ { name: 'Engagement', description: 'Interesting and memorable', weight: 0.3 }
196
+ ],
197
+ rubric: { scale: '1-5' }
198
+ });
199
+
200
+ // Output:
201
+ // {
202
+ // success: true,
203
+ // scores: [
204
+ // { criterion: 'Accuracy', score: 4, justification: '...', evidence: [...] },
205
+ // { criterion: 'Clarity', score: 5, justification: '...', evidence: [...] },
206
+ // { criterion: 'Engagement', score: 4, justification: '...', evidence: [...] }
207
+ // ],
208
+ // overallScore: 4.33,
209
+ // weightedScore: 4.3,
210
+ // summary: { assessment: '...', strengths: [...], weaknesses: [...] }
211
+ // }
212
+ ```
213
+
214
+ ---
215
+
216
+ ### 2. Pairwise Compare Tool (`pairwiseCompare`)
217
+
218
+ **Purpose**: Compare two responses and determine which is better, with position bias mitigation.
219
+
220
+ **When to Use**:
221
+ - A/B testing responses
222
+ - Preference evaluation
223
+ - Style and tone assessment
224
+ - Ranking quality differences
225
+
226
+ **Implementation Highlights**:
227
+
228
+ ```typescript
229
+ // Position bias mitigation: evaluate twice with swapped positions
230
+ if (input.swapPositions) {
231
+ // First pass: A first, B second
232
+ const pass1 = await evaluatePair(input.responseA, input.responseB, ...);
233
+
234
+ // Second pass: B first, A second
235
+ const pass2 = await evaluatePair(input.responseB, input.responseA, ...);
236
+
237
+ // Map pass2 result back and check consistency
238
+ const pass2WinnerMapped = pass2.winner === 'A' ? 'B' : pass2.winner === 'B' ? 'A' : 'TIE';
239
+ const consistent = pass1.winner === pass2WinnerMapped;
240
+
241
+ // If inconsistent, return TIE with lower confidence
242
+ if (!consistent) {
243
+ finalWinner = 'TIE';
244
+ finalConfidence = 0.5;
245
+ }
246
+ }
247
+ ```
248
+
249
+ **Key Features**:
250
+ - **Position Swapping**: Automatically runs evaluation twice with swapped positions
251
+ - **Consistency Check**: Detects when position affects judgment
252
+ - **Confidence Scoring**: 0-1 confidence based on consistency
253
+ - **Per-criterion Comparison**: Detailed breakdown for each aspect
254
+ - **Bias-aware Prompting**: Explicit instructions to ignore length and position
255
+
256
+ **Example Usage**:
257
+
258
+ ```typescript
259
+ const result = await executePairwiseCompare({
260
+ responseA: GOOD_RESPONSE,
261
+ responseB: POOR_RESPONSE,
262
+ prompt: 'Explain quantum entanglement',
263
+ criteria: ['accuracy', 'clarity', 'completeness', 'engagement'],
264
+ allowTie: true,
265
+ swapPositions: true // Enable position bias mitigation
266
+ });
267
+
268
+ // Output:
269
+ // {
270
+ // success: true,
271
+ // winner: 'A',
272
+ // confidence: 0.85,
273
+ // positionConsistency: { consistent: true, firstPassWinner: 'A', secondPassWinner: 'A' },
274
+ // comparison: [
275
+ // { criterion: 'accuracy', winner: 'A', reasoning: '...' },
276
+ // { criterion: 'clarity', winner: 'A', reasoning: '...' },
277
+ // ...
278
+ // ]
279
+ // }
280
+ ```
281
+
282
+ ---
283
+
284
+ ### 3. Generate Rubric Tool (`generateRubric`)
285
+
286
+ **Purpose**: Create detailed scoring rubrics for consistent evaluation standards.
287
+
288
+ **When to Use**:
289
+ - Establishing evaluation criteria
290
+ - Training human evaluators
291
+ - Ensuring consistency across evaluations
292
+ - Documenting quality standards
293
+
294
+ **Implementation Highlights**:
295
+
296
+ ```typescript
297
+ // Strictness affects the generated rubric:
298
+ // - lenient: Lower bar for passing scores
299
+ // - balanced: Fair, typical expectations
300
+ // - strict: High standards, critical evaluation
301
+
302
+ const userPrompt = `Create a scoring rubric for:
303
+ **Criterion**: ${input.criterionName}
304
+ **Description**: ${input.criterionDescription}
305
+ **Scale**: ${input.scale}
306
+ **Domain**: ${input.domain}
307
+
308
+ Generate:
309
+ 1. Clear descriptions for each score level
310
+ 2. Specific characteristics that define each level
311
+ 3. Brief example text for each level
312
+ 4. General scoring guidelines
313
+ 5. Edge cases with guidance`;
314
+ ```
315
+
316
+ **Key Features**:
317
+ - Domain-specific terminology
318
+ - Configurable strictness levels
319
+ - Example generation for each level
320
+ - Edge case guidance
321
+ - Scoring guidelines
322
+
323
+ **Example Usage**:
324
+
325
+ ```typescript
326
+ const result = await executeGenerateRubric({
327
+ criterionName: 'Code Readability',
328
+ criterionDescription: 'How easy the code is to understand and maintain',
329
+ scale: '1-5',
330
+ domain: 'software engineering',
331
+ includeExamples: true,
332
+ strictness: 'balanced'
333
+ });
334
+
335
+ // Output:
336
+ // {
337
+ // success: true,
338
+ // levels: [
339
+ // { score: 1, label: 'Poor', description: '...', characteristics: [...], example: '...' },
340
+ // { score: 2, label: 'Below Average', ... },
341
+ // { score: 3, label: 'Average', ... },
342
+ // { score: 4, label: 'Good', ... },
343
+ // { score: 5, label: 'Excellent', ... }
344
+ // ],
345
+ // scoringGuidelines: [...],
346
+ // edgeCases: [{ situation: '...', guidance: '...' }]
347
+ // }
348
+ ```
349
+
350
+ ---
351
+
352
+ ### 4. Evaluator Agent
353
+
354
+ **Purpose**: High-level agent that combines all evaluation tools with conversational capability.
355
+
356
+ **Implementation**:
357
+
358
+ ```typescript
359
+ export class EvaluatorAgent {
360
+ private model: string;
361
+ private temperature: number;
362
+
363
+ constructor(config?: EvaluatorAgentConfig) {
364
+ this.model = config?.model || 'gpt-5.2';
365
+ this.temperature = config?.temperature || 0.3;
366
+ }
367
+
368
+ // Score a response
369
+ async score(input: DirectScoreInput) { ... }
370
+
371
+ // Compare two responses
372
+ async compare(input: PairwiseCompareInput) { ... }
373
+
374
+ // Generate a rubric
375
+ async generateRubric(input: GenerateRubricInput) { ... }
376
+
377
+ // Full workflow: generate rubric then score
378
+ async evaluateWithGeneratedRubric(response, prompt, criteria) { ... }
379
+
380
+ // Chat-based evaluation
381
+ async chat(userMessage: string) { ... }
382
+ }
383
+ ```
384
+
385
+ ---
386
+
387
+ ## 📊 Test Results
388
+
389
+ All 19 tests pass. The output below is the actual log from our test run:
390
+
391
+ ### Test Output
392
+
393
+ ```
394
+ > readwren-agent-system@1.0.0 test
395
+ > vitest run --testTimeout=120000
396
+
397
+ RUN v2.1.9 /Users/muratcankoylan/app_readwren
398
+
399
+ ✓ tests/skills.test.ts (10 tests) 159317ms
400
+ ✓ LLM Evaluator Skill Tests > Direct Scoring Skill > should use chain-of-thought in scoring 4439ms
401
+ ✓ LLM Evaluator Skill Tests > Direct Scoring Skill > should handle multiple weighted criteria 7218ms
402
+ ✓ LLM Evaluator Skill Tests > Pairwise Comparison Skill > should mitigate position bias with swap 13002ms
403
+ ✓ LLM Evaluator Skill Tests > Pairwise Comparison Skill > should identify clear winner for quality difference 25914ms
404
+ ✓ LLM Evaluator Skill Tests > Rubric Generation Skill > should generate domain-specific rubrics 37165ms
405
+ ✓ LLM Evaluator Skill Tests > Rubric Generation Skill > should provide edge case guidance 29088ms
406
+ ✓ LLM Evaluator Skill Tests > Context Fundamentals Skill Application > should utilize provided context in evaluation 11133ms
407
+ ✓ Skill Input/Output Validation > should validate DirectScore input schema 4733ms
408
+ ✓ Skill Input/Output Validation > should validate PairwiseCompare output structure 4123ms
409
+ ✓ Skill Input/Output Validation > should validate GenerateRubric output structure 22500ms
410
+
411
+ ✓ tests/evaluation.test.ts (9 tests) 216353ms
412
+ ✓ Direct Score Tool > should score a response against criteria 13219ms
413
+ ✓ Direct Score Tool > should provide lower scores for poor responses 14834ms
414
+ ✓ Pairwise Compare Tool > should correctly identify the better response 29254ms
415
+ ✓ Pairwise Compare Tool > should handle similar responses appropriately 14418ms
416
+ ✓ Pairwise Compare Tool > should provide comparison details for each criterion 9931ms
417
+ ✓ Generate Rubric Tool > should generate a complete rubric 24106ms
418
+ ✓ Generate Rubric Tool > should respect strictness setting 57919ms
419
+ ✓ Evaluator Agent > should provide integrated evaluation workflow 48112ms
420
+ ✓ Evaluator Agent > should support chat-based evaluation 4558ms
421
+
422
+ Test Files 2 passed (2)
423
+ Tests 19 passed (19)
424
+ Start at 00:25:16
425
+ Duration 216.66s (transform 68ms, setup 32ms, collect 148ms, tests 375.67s, environment 0ms, prepare 105ms)
426
+ ```
427
+
428
+ ### Test Coverage Summary
429
+
430
+ | Test Category | Tests | Pass Rate | Avg Duration |
431
+ |--------------|-------|-----------|--------------|
432
+ | Direct Scoring | 4 | 100% | 9.9s |
433
+ | Pairwise Comparison | 5 | 100% | 18.5s |
433
+ | Rubric Generation | 4 | 100% | 37.1s |
434
+ | Context Integration | 1 | 100% | 11.1s |
435
+ | Agent Integration | 2 | 100% | 26.3s |
436
+ | Schema Validation | 3 | 100% | 10.5s |
438
+
439
+ ---
440
+
441
+ ## 📚 Key Learnings
442
+
443
+ ### 1. Position Bias is Real
444
+
445
+ During testing, we confirmed Eugene Yan's research findings:
446
+
447
+ ```
448
+ Test: "should mitigate position bias with swap" - 13002ms
449
+ Result: Position consistency check correctly detected and mitigated bias
450
+ ```
451
+
452
+ When comparing identical responses, the system correctly returns `TIE`. When comparing responses of clearly different quality, the winner is consistent across position swaps.
453
+
454
+ ### 2. Chain-of-Thought Improves Quality
455
+
456
+ Tests confirm that requiring justification produces more reliable evaluations:
457
+
458
+ ```
459
+ Test: "should use chain-of-thought in scoring" - 4439ms
460
+ Result: All scores include justifications >20 characters with specific evidence
461
+ ```
462
+
463
+ ### 3. Domain-Specific Rubrics Matter
464
+
465
+ The rubric generator adapts to the specified domain:
466
+
467
+ ```
468
+ Test: "should generate domain-specific rubrics" - 37165ms
469
+ Result: Software engineering rubric included terms like "variable", "function", "comment"
470
+ ```
471
+
472
+ ### 4. Weighted Criteria Enable Nuanced Evaluation
473
+
474
+ ```
475
+ Test: "should handle multiple weighted criteria" - 7218ms
476
+ Result: weightedScore differs from overallScore when weights are unequal
477
+ ```
478
+
479
+ ### 5. Context Affects Evaluation
480
+
481
+ Applying the context fundamentals skill proved valuable: supplying domain context changes how a response is scored.
482
+
483
+ ```
484
+ Test: "should utilize provided context in evaluation" - 11133ms
485
+ Result: Medical context allowed technical terminology to score well
486
+ ```
487
+
488
+ ---
489
+
490
+ ## 🚀 Quick Start
491
+
492
+ ### Installation
493
+
494
+ ```bash
495
+ git clone https://github.com/muratcankoylan/llm-as-judge-skills.git
496
+ cd llm-as-judge-skills
497
+ npm install
498
+ ```
499
+
500
+ ### Configuration
501
+
502
+ Create a `.env` file:
503
+
504
+ ```bash
505
+ OPENAI_API_KEY=your_openai_api_key_here
506
+ OPENAI_MODEL=gpt-5.2
507
+ ```
508
+
509
+ ### Run Tests
510
+
511
+ ```bash
512
+ npm test
513
+ ```
514
+
515
+ ### Basic Usage
516
+
517
+ ```typescript
518
+ import { EvaluatorAgent } from './src/agents/evaluator';
519
+
520
+ const agent = new EvaluatorAgent();
521
+
522
+ // Score a response
523
+ const scoreResult = await agent.score({
524
+ response: 'Your AI-generated response',
525
+ prompt: 'The original prompt',
526
+ criteria: [
527
+ { name: 'Accuracy', description: 'Factual correctness', weight: 1 }
528
+ ]
529
+ });
530
+
531
+ console.log(`Score: ${scoreResult.overallScore}/5`);
532
+
533
+ // Compare two responses
534
+ const compareResult = await agent.compare({
535
+ responseA: 'First response',
536
+ responseB: 'Second response',
537
+ prompt: 'The prompt',
538
+ criteria: ['quality', 'completeness'],
539
+ allowTie: true,
540
+ swapPositions: true
541
+ });
542
+
543
+ console.log(`Winner: ${compareResult.winner} (confidence: ${compareResult.confidence})`);
544
+ ```
545
+
546
+ ---
547
+
548
+ ## 🔗 Integration with Agent Skills Repository
549
+
550
+ This project is designed to be added to the examples section of the main repository:
551
+
552
+ ```
553
+ Agent-Skills-for-Context-Engineering/
554
+ ├── skills/
555
+ │ ├── context-fundamentals/ # Foundation (referenced by this project)
556
+ │ └── tool-design/ # Foundation (referenced by this project)
557
+ ├── examples/
558
+ │ └── llm-as-judge-skills/ # ← This project
559
+ │ ├── README.md
560
+ │ ├── skills/
561
+ │ ├── tools/
562
+ │ ├── agents/
563
+ │ └── src/
564
+ ```
565
+
566
+ ### How This Example Demonstrates the Framework
567
+
568
+ 1. **Skills → Prompts → Tools**: Shows the progression from knowledge (MD files) to executable code
569
+ 2. **Context Engineering**: Applies context fundamentals in evaluation prompts
570
+ 3. **Tool Design Patterns**: Implements Zod schemas, error handling, and clear interfaces
571
+ 4. **Agent Architecture**: Uses AI SDK patterns for agent abstraction
572
+
573
+ ---
574
+
575
+ ## 📋 API Reference
576
+
577
+ ### DirectScoreInput
578
+
579
+ ```typescript
580
+ interface DirectScoreInput {
581
+ response: string; // The response to evaluate
582
+ prompt: string; // Original prompt
583
+ context?: string; // Additional context
584
+ criteria: Array<{
585
+ name: string; // Criterion name
586
+ description: string; // What it measures
587
+ weight: number; // Relative importance (0-1)
588
+ }>;
589
+ rubric?: {
590
+ scale: '1-3' | '1-5' | '1-10';
591
+ levelDescriptions?: Record<string, string>;
592
+ };
593
+ }
594
+ ```
595
+
596
+ ### PairwiseCompareInput
597
+
598
+ ```typescript
599
+ interface PairwiseCompareInput {
600
+ responseA: string; // First response
601
+ responseB: string; // Second response
602
+ prompt: string; // Original prompt
603
+ context?: string; // Additional context
604
+ criteria: string[]; // Comparison aspects
605
+ allowTie?: boolean; // Allow tie verdict (default: true)
606
+ swapPositions?: boolean; // Mitigate position bias (default: true)
607
+ }
608
+ ```
609
+
610
+ ### GenerateRubricInput
611
+
612
+ ```typescript
613
+ interface GenerateRubricInput {
614
+ criterionName: string; // Name of criterion
615
+ criterionDescription: string; // What it measures
616
+ scale?: '1-3' | '1-5' | '1-10';
617
+ domain?: string; // Domain for terminology
618
+ includeExamples?: boolean; // Generate examples
619
+ strictness?: 'lenient' | 'balanced' | 'strict';
620
+ }
621
+ ```
622
+
623
+ ---
624
+
625
+ ## 🛠️ Development
626
+
627
+ ### Scripts
628
+
629
+ ```bash
630
+ npm run build # Compile TypeScript
631
+ npm run dev # Watch mode
632
+ npm test # Run tests
633
+ npm run lint # ESLint
634
+ npm run format # Prettier
635
+ npm run typecheck # Type check
636
+ ```
637
+
638
+ ### Adding New Tools
639
+
640
+ 1. Create `src/tools/<category>/<tool-name>.ts`
641
+ 2. Define input/output Zod schemas
642
+ 3. Implement execute function
643
+ 4. Export from `src/tools/<category>/index.ts`
644
+ 5. Add documentation in `tools/<category>/<tool-name>.md`
645
+ 6. Write tests
646
+
647
+ ---
648
+
649
+ ## 📄 License
650
+
651
+ MIT License - see [LICENSE](LICENSE) for details.
652
+
653
+ ---
654
+
655
+ ## 🙏 Acknowledgments
656
+
657
+ - [Eugene Yan](https://eugeneyan.com/writing/llm-evaluators/) - LLM-as-a-Judge research
658
+ - [Vercel AI SDK](https://sdk.vercel.ai/) - Agent patterns and tooling
659
+ - [Agent Skills for Context Engineering](https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering) - Foundation framework