@bastani/atomic 0.5.11-0 → 0.5.12-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (506)
  1. package/.agents/skills/adapt/SKILL.md +199 -0
  2. package/.agents/skills/advanced-evaluation/SKILL.md +402 -0
  3. package/.agents/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
  4. package/.agents/skills/advanced-evaluation/references/evaluation-pipeline.md +43 -0
  5. package/.agents/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
  6. package/.agents/skills/advanced-evaluation/references/metrics-guide.md +331 -0
  7. package/.agents/skills/advanced-evaluation/scripts/evaluation_example.py +392 -0
  8. package/.agents/skills/animate/SKILL.md +175 -0
  9. package/.agents/skills/arrange/SKILL.md +124 -0
  10. package/.agents/skills/audit/SKILL.md +148 -0
  11. package/.agents/skills/bdi-mental-states/SKILL.md +311 -0
  12. package/.agents/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
  13. package/.agents/skills/bdi-mental-states/references/framework-integration.md +582 -0
  14. package/.agents/skills/bdi-mental-states/references/rdf-examples.md +315 -0
  15. package/.agents/skills/bdi-mental-states/references/sparql-competency.md +420 -0
  16. package/.agents/skills/bolder/SKILL.md +117 -0
  17. package/.agents/skills/bun/SKILL.md +199 -0
  18. package/.agents/skills/clarify/SKILL.md +183 -0
  19. package/.agents/skills/colorize/SKILL.md +143 -0
  20. package/.agents/skills/context-compression/SKILL.md +272 -0
  21. package/.agents/skills/context-compression/references/evaluation-framework.md +213 -0
  22. package/.agents/skills/context-compression/scripts/compression_evaluator.py +862 -0
  23. package/.agents/skills/context-compression/tests/test_compression_evaluator.py +56 -0
  24. package/.agents/skills/context-degradation/SKILL.md +206 -0
  25. package/.agents/skills/context-degradation/references/patterns.md +314 -0
  26. package/.agents/skills/context-degradation/scripts/degradation_detector.py +614 -0
  27. package/.agents/skills/context-fundamentals/SKILL.md +201 -0
  28. package/.agents/skills/context-fundamentals/references/context-components.md +283 -0
  29. package/.agents/skills/context-fundamentals/scripts/context_manager.py +533 -0
  30. package/.agents/skills/context-optimization/SKILL.md +195 -0
  31. package/.agents/skills/context-optimization/references/optimization_techniques.md +272 -0
  32. package/.agents/skills/context-optimization/scripts/compaction.py +562 -0
  33. package/.agents/skills/create-spec/SKILL.md +244 -0
  34. package/.agents/skills/critique/SKILL.md +225 -0
  35. package/.agents/skills/critique/reference/cognitive-load.md +106 -0
  36. package/.agents/skills/critique/reference/heuristics-scoring.md +234 -0
  37. package/.agents/skills/critique/reference/personas.md +178 -0
  38. package/.agents/skills/delight/SKILL.md +304 -0
  39. package/.agents/skills/distill/SKILL.md +122 -0
  40. package/.agents/skills/docx/LICENSE.txt +30 -0
  41. package/.agents/skills/docx/SKILL.md +590 -0
  42. package/.agents/skills/docx/scripts/__init__.py +1 -0
  43. package/.agents/skills/docx/scripts/accept_changes.py +135 -0
  44. package/.agents/skills/docx/scripts/comment.py +318 -0
  45. package/.agents/skills/docx/scripts/office/helpers/__init__.py +0 -0
  46. package/.agents/skills/docx/scripts/office/helpers/merge_runs.py +199 -0
  47. package/.agents/skills/docx/scripts/office/helpers/simplify_redlines.py +197 -0
  48. package/.agents/skills/docx/scripts/office/pack.py +159 -0
  49. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  50. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  51. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  52. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  53. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  54. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  55. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  56. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  57. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  58. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  59. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  60. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  61. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  62. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  63. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  64. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  65. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  66. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  67. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  68. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  69. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  70. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  71. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  72. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  73. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  74. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  75. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  76. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  77. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  78. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  79. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  80. package/.agents/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
  81. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  82. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  83. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  84. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  85. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  86. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  87. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  88. package/.agents/skills/docx/scripts/office/soffice.py +183 -0
  89. package/.agents/skills/docx/scripts/office/unpack.py +132 -0
  90. package/.agents/skills/docx/scripts/office/validate.py +111 -0
  91. package/.agents/skills/docx/scripts/office/validators/__init__.py +15 -0
  92. package/.agents/skills/docx/scripts/office/validators/base.py +847 -0
  93. package/.agents/skills/docx/scripts/office/validators/docx.py +446 -0
  94. package/.agents/skills/docx/scripts/office/validators/pptx.py +275 -0
  95. package/.agents/skills/docx/scripts/office/validators/redlining.py +247 -0
  96. package/.agents/skills/docx/scripts/templates/comments.xml +3 -0
  97. package/.agents/skills/docx/scripts/templates/commentsExtended.xml +3 -0
  98. package/.agents/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
  99. package/.agents/skills/docx/scripts/templates/commentsIds.xml +3 -0
  100. package/.agents/skills/docx/scripts/templates/people.xml +3 -0
  101. package/.agents/skills/evaluation/SKILL.md +251 -0
  102. package/.agents/skills/evaluation/references/metrics.md +339 -0
  103. package/.agents/skills/evaluation/scripts/evaluator.py +627 -0
  104. package/.agents/skills/explain-code/SKILL.md +230 -0
  105. package/.agents/skills/extract/SKILL.md +91 -0
  106. package/.agents/skills/filesystem-context/SKILL.md +287 -0
  107. package/.agents/skills/filesystem-context/references/implementation-patterns.md +549 -0
  108. package/.agents/skills/filesystem-context/scripts/filesystem_context.py +425 -0
  109. package/.agents/skills/find-skills/SKILL.md +142 -0
  110. package/.agents/skills/frontend-design/SKILL.md +147 -0
  111. package/.agents/skills/frontend-design/reference/color-and-contrast.md +132 -0
  112. package/.agents/skills/frontend-design/reference/interaction-design.md +195 -0
  113. package/.agents/skills/frontend-design/reference/motion-design.md +99 -0
  114. package/.agents/skills/frontend-design/reference/responsive-design.md +114 -0
  115. package/.agents/skills/frontend-design/reference/spatial-design.md +100 -0
  116. package/.agents/skills/frontend-design/reference/typography.md +133 -0
  117. package/.agents/skills/frontend-design/reference/ux-writing.md +107 -0
  118. package/.agents/skills/gh-commit/SKILL.md +243 -0
  119. package/.agents/skills/gh-create-pr/SKILL.md +93 -0
  120. package/.agents/skills/harden/SKILL.md +354 -0
  121. package/.agents/skills/hosted-agents/SKILL.md +260 -0
  122. package/.agents/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
  123. package/.agents/skills/hosted-agents/scripts/sandbox_manager.py +590 -0
  124. package/.agents/skills/impeccable/SKILL.md +365 -0
  125. package/.agents/skills/impeccable/reference/color-and-contrast.md +105 -0
  126. package/.agents/skills/impeccable/reference/craft.md +70 -0
  127. package/.agents/skills/impeccable/reference/extract.md +70 -0
  128. package/.agents/skills/impeccable/reference/interaction-design.md +195 -0
  129. package/.agents/skills/impeccable/reference/motion-design.md +99 -0
  130. package/.agents/skills/impeccable/reference/responsive-design.md +114 -0
  131. package/.agents/skills/impeccable/reference/spatial-design.md +100 -0
  132. package/.agents/skills/impeccable/reference/typography.md +142 -0
  133. package/.agents/skills/impeccable/reference/ux-writing.md +107 -0
  134. package/.agents/skills/impeccable/scripts/cleanup-deprecated.mjs +214 -0
  135. package/.agents/skills/init/SKILL.md +138 -0
  136. package/.agents/skills/layout/SKILL.md +125 -0
  137. package/.agents/skills/liteparse/SKILL.md +222 -0
  138. package/.agents/skills/memory-systems/SKILL.md +219 -0
  139. package/.agents/skills/memory-systems/references/implementation.md +551 -0
  140. package/.agents/skills/memory-systems/scripts/memory_store.py +616 -0
  141. package/.agents/skills/multi-agent-patterns/SKILL.md +257 -0
  142. package/.agents/skills/multi-agent-patterns/references/frameworks.md +433 -0
  143. package/.agents/skills/multi-agent-patterns/scripts/coordination.py +613 -0
  144. package/.agents/skills/normalize/SKILL.md +70 -0
  145. package/.agents/skills/onboard/SKILL.md +245 -0
  146. package/.agents/skills/opentui/SKILL.md +201 -0
  147. package/.agents/skills/opentui/references/animation/REFERENCE.md +431 -0
  148. package/.agents/skills/opentui/references/components/REFERENCE.md +144 -0
  149. package/.agents/skills/opentui/references/components/code-diff.md +672 -0
  150. package/.agents/skills/opentui/references/components/containers.md +417 -0
  151. package/.agents/skills/opentui/references/components/inputs.md +531 -0
  152. package/.agents/skills/opentui/references/components/text-display.md +386 -0
  153. package/.agents/skills/opentui/references/core/REFERENCE.md +145 -0
  154. package/.agents/skills/opentui/references/core/api.md +543 -0
  155. package/.agents/skills/opentui/references/core/configuration.md +168 -0
  156. package/.agents/skills/opentui/references/core/gotchas.md +393 -0
  157. package/.agents/skills/opentui/references/core/patterns.md +449 -0
  158. package/.agents/skills/opentui/references/keyboard/REFERENCE.md +617 -0
  159. package/.agents/skills/opentui/references/layout/REFERENCE.md +337 -0
  160. package/.agents/skills/opentui/references/layout/patterns.md +444 -0
  161. package/.agents/skills/opentui/references/react/REFERENCE.md +174 -0
  162. package/.agents/skills/opentui/references/react/api.md +436 -0
  163. package/.agents/skills/opentui/references/react/configuration.md +302 -0
  164. package/.agents/skills/opentui/references/react/gotchas.md +443 -0
  165. package/.agents/skills/opentui/references/react/patterns.md +501 -0
  166. package/.agents/skills/opentui/references/solid/REFERENCE.md +201 -0
  167. package/.agents/skills/opentui/references/solid/api.md +564 -0
  168. package/.agents/skills/opentui/references/solid/configuration.md +316 -0
  169. package/.agents/skills/opentui/references/solid/gotchas.md +427 -0
  170. package/.agents/skills/opentui/references/solid/patterns.md +560 -0
  171. package/.agents/skills/opentui/references/testing/REFERENCE.md +614 -0
  172. package/.agents/skills/optimize/SKILL.md +266 -0
  173. package/.agents/skills/overdrive/SKILL.md +142 -0
  174. package/.agents/skills/pdf/LICENSE.txt +30 -0
  175. package/.agents/skills/pdf/SKILL.md +314 -0
  176. package/.agents/skills/pdf/forms.md +294 -0
  177. package/.agents/skills/pdf/reference.md +612 -0
  178. package/.agents/skills/pdf/scripts/check_bounding_boxes.py +65 -0
  179. package/.agents/skills/pdf/scripts/check_fillable_fields.py +11 -0
  180. package/.agents/skills/pdf/scripts/convert_pdf_to_images.py +33 -0
  181. package/.agents/skills/pdf/scripts/create_validation_image.py +37 -0
  182. package/.agents/skills/pdf/scripts/extract_form_field_info.py +122 -0
  183. package/.agents/skills/pdf/scripts/extract_form_structure.py +115 -0
  184. package/.agents/skills/pdf/scripts/fill_fillable_fields.py +98 -0
  185. package/.agents/skills/pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
  186. package/.agents/skills/playwright-cli/SKILL.md +344 -0
  187. package/.agents/skills/playwright-cli/references/element-attributes.md +23 -0
  188. package/.agents/skills/playwright-cli/references/playwright-tests.md +39 -0
  189. package/.agents/skills/playwright-cli/references/request-mocking.md +87 -0
  190. package/.agents/skills/playwright-cli/references/running-code.md +231 -0
  191. package/.agents/skills/playwright-cli/references/session-management.md +169 -0
  192. package/.agents/skills/playwright-cli/references/storage-state.md +275 -0
  193. package/.agents/skills/playwright-cli/references/test-generation.md +88 -0
  194. package/.agents/skills/playwright-cli/references/tracing.md +139 -0
  195. package/.agents/skills/playwright-cli/references/video-recording.md +143 -0
  196. package/.agents/skills/polish/SKILL.md +224 -0
  197. package/.agents/skills/pptx/LICENSE.txt +30 -0
  198. package/.agents/skills/pptx/SKILL.md +232 -0
  199. package/.agents/skills/pptx/editing.md +205 -0
  200. package/.agents/skills/pptx/pptxgenjs.md +420 -0
  201. package/.agents/skills/pptx/scripts/__init__.py +0 -0
  202. package/.agents/skills/pptx/scripts/add_slide.py +195 -0
  203. package/.agents/skills/pptx/scripts/clean.py +286 -0
  204. package/.agents/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  205. package/.agents/skills/pptx/scripts/office/helpers/merge_runs.py +199 -0
  206. package/.agents/skills/pptx/scripts/office/helpers/simplify_redlines.py +197 -0
  207. package/.agents/skills/pptx/scripts/office/pack.py +159 -0
  208. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  209. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  210. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  211. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  212. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  213. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  214. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  215. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  216. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  217. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  218. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  219. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  220. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  221. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  222. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  223. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  224. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  225. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  226. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  227. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  228. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  229. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  230. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  231. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  232. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  233. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  234. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  235. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  236. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  237. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  238. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  239. package/.agents/skills/pptx/scripts/office/schemas/mce/mc.xsd +75 -0
  240. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  241. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  242. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  243. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  244. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  245. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  246. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  247. package/.agents/skills/pptx/scripts/office/soffice.py +183 -0
  248. package/.agents/skills/pptx/scripts/office/unpack.py +132 -0
  249. package/.agents/skills/pptx/scripts/office/validate.py +111 -0
  250. package/.agents/skills/pptx/scripts/office/validators/__init__.py +15 -0
  251. package/.agents/skills/pptx/scripts/office/validators/base.py +847 -0
  252. package/.agents/skills/pptx/scripts/office/validators/docx.py +446 -0
  253. package/.agents/skills/pptx/scripts/office/validators/pptx.py +275 -0
  254. package/.agents/skills/pptx/scripts/office/validators/redlining.py +247 -0
  255. package/.agents/skills/pptx/scripts/thumbnail.py +289 -0
  256. package/.agents/skills/project-development/SKILL.md +291 -0
  257. package/.agents/skills/project-development/references/case-studies.md +388 -0
  258. package/.agents/skills/project-development/references/pipeline-patterns.md +610 -0
  259. package/.agents/skills/project-development/scripts/pipeline_template.py +796 -0
  260. package/.agents/skills/prompt-engineer/SKILL.md +263 -0
  261. package/.agents/skills/prompt-engineer/references/advanced_patterns.md +271 -0
  262. package/.agents/skills/prompt-engineer/references/core_prompting.md +137 -0
  263. package/.agents/skills/prompt-engineer/references/quality_improvement.md +193 -0
  264. package/.agents/skills/quieter/SKILL.md +103 -0
  265. package/.agents/skills/research-codebase/SKILL.md +227 -0
  266. package/.agents/skills/shape/SKILL.md +96 -0
  267. package/.agents/skills/skill-creator/LICENSE.txt +202 -0
  268. package/.agents/skills/skill-creator/SKILL.md +485 -0
  269. package/.agents/skills/skill-creator/agents/analyzer.md +274 -0
  270. package/.agents/skills/skill-creator/agents/comparator.md +202 -0
  271. package/.agents/skills/skill-creator/agents/grader.md +223 -0
  272. package/.agents/skills/skill-creator/assets/eval_review.html +146 -0
  273. package/.agents/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  274. package/.agents/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  275. package/.agents/skills/skill-creator/references/schemas.md +430 -0
  276. package/.agents/skills/skill-creator/scripts/__init__.py +0 -0
  277. package/.agents/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  278. package/.agents/skills/skill-creator/scripts/generate_report.py +326 -0
  279. package/.agents/skills/skill-creator/scripts/improve_description.py +247 -0
  280. package/.agents/skills/skill-creator/scripts/package_skill.py +136 -0
  281. package/.agents/skills/skill-creator/scripts/quick_validate.py +103 -0
  282. package/.agents/skills/skill-creator/scripts/run_eval.py +310 -0
  283. package/.agents/skills/skill-creator/scripts/run_loop.py +328 -0
  284. package/.agents/skills/skill-creator/scripts/utils.py +47 -0
  285. package/.agents/skills/sl-commit/SKILL.md +51 -0
  286. package/.agents/skills/sl-submit-diff/SKILL.md +55 -0
  287. package/.agents/skills/teach-impeccable/SKILL.md +71 -0
  288. package/.agents/skills/test-driven-development/SKILL.md +371 -0
  289. package/.agents/skills/test-driven-development/testing-anti-patterns.md +299 -0
  290. package/.agents/skills/tool-design/SKILL.md +271 -0
  291. package/.agents/skills/tool-design/references/architectural_reduction.md +210 -0
  292. package/.agents/skills/tool-design/references/best_practices.md +176 -0
  293. package/.agents/skills/tool-design/scripts/description_generator.py +528 -0
  294. package/.agents/skills/typescript-advanced-types/SKILL.md +719 -0
  295. package/.agents/skills/typescript-expert/SKILL.md +428 -0
  296. package/.agents/skills/typescript-expert/references/tsconfig-strict.json +92 -0
  297. package/.agents/skills/typescript-expert/references/typescript-cheatsheet.md +383 -0
  298. package/.agents/skills/typescript-expert/references/utility-types.ts +335 -0
  299. package/.agents/skills/typescript-expert/scripts/ts_diagnostic.py +203 -0
  300. package/.agents/skills/typescript-react-reviewer/SKILL.md +200 -0
  301. package/.agents/skills/typescript-react-reviewer/references/antipatterns.md +510 -0
  302. package/.agents/skills/typescript-react-reviewer/references/checklist.md +267 -0
  303. package/.agents/skills/typescript-react-reviewer/references/react19-patterns.md +305 -0
  304. package/.agents/skills/typeset/SKILL.md +116 -0
  305. package/.agents/skills/workflow-creator/SKILL.md +337 -0
  306. package/.agents/skills/workflow-creator/references/agent-sessions.md +789 -0
  307. package/.agents/skills/workflow-creator/references/computation-and-validation.md +224 -0
  308. package/.agents/skills/workflow-creator/references/control-flow.md +450 -0
  309. package/.agents/skills/workflow-creator/references/discovery-and-verification.md +156 -0
  310. package/.agents/skills/workflow-creator/references/failure-modes.md +732 -0
  311. package/.agents/skills/workflow-creator/references/getting-started.md +289 -0
  312. package/.agents/skills/workflow-creator/references/session-config.md +355 -0
  313. package/.agents/skills/workflow-creator/references/state-and-data-flow.md +374 -0
  314. package/.agents/skills/workflow-creator/references/user-input.md +206 -0
  315. package/.agents/skills/workflow-creator/references/workflow-inputs.md +274 -0
  316. package/.agents/skills/xlsx/LICENSE.txt +30 -0
  317. package/.agents/skills/xlsx/SKILL.md +292 -0
  318. package/.agents/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  319. package/.agents/skills/xlsx/scripts/office/helpers/merge_runs.py +199 -0
  320. package/.agents/skills/xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
  321. package/.agents/skills/xlsx/scripts/office/pack.py +159 -0
  322. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  323. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  324. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  325. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  326. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  327. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  328. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  329. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  330. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  331. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  332. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  333. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  334. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  335. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  336. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  337. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  338. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  339. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  340. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  341. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  342. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  343. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  344. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  345. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  346. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  347. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  348. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  349. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  350. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  351. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  352. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  353. package/.agents/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  354. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  355. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  356. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  357. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  358. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  359. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  360. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  361. package/.agents/skills/xlsx/scripts/office/soffice.py +183 -0
  362. package/.agents/skills/xlsx/scripts/office/unpack.py +132 -0
  363. package/.agents/skills/xlsx/scripts/office/validate.py +111 -0
  364. package/.agents/skills/xlsx/scripts/office/validators/__init__.py +15 -0
  365. package/.agents/skills/xlsx/scripts/office/validators/base.py +847 -0
  366. package/.agents/skills/xlsx/scripts/office/validators/docx.py +446 -0
  367. package/.agents/skills/xlsx/scripts/office/validators/pptx.py +275 -0
  368. package/.agents/skills/xlsx/scripts/office/validators/redlining.py +247 -0
  369. package/.agents/skills/xlsx/scripts/recalc.py +184 -0
  370. package/.claude/agents/reviewer.md +1 -0
  371. package/.github/agents/reviewer.md +1 -0
  372. package/.opencode/agents/reviewer.md +1 -0
  373. package/README.md +274 -169
  374. package/package.json +6 -7
  375. package/src/commands/cli/init/index.ts +2 -2
  376. package/src/commands/cli/init/scm.ts +7 -8
  377. package/src/commands/cli/workflow-command.test.ts +74 -0
  378. package/src/commands/cli/workflow.ts +7 -2
  379. package/src/scripts/bundle-configs.ts +128 -0
  380. package/src/sdk/components/compact-switcher.tsx +1 -1
  381. package/src/sdk/components/orchestrator-panel-store.ts +13 -0
  382. package/src/sdk/components/orchestrator-panel.tsx +10 -0
  383. package/src/sdk/components/statusline.tsx +13 -1
  384. package/src/sdk/providers/claude.ts +42 -0
  385. package/src/sdk/runtime/executor.ts +111 -32
  386. package/src/sdk/types.ts +7 -0
  387. package/src/sdk/workflows/builtin/ralph/claude/index.ts +132 -76
  388. package/src/sdk/workflows/builtin/ralph/copilot/index.ts +129 -71
  389. package/src/sdk/workflows/builtin/ralph/helpers/git.ts +184 -17
  390. package/src/sdk/workflows/builtin/ralph/helpers/prompts.ts +463 -79
  391. package/src/sdk/workflows/builtin/ralph/opencode/index.ts +124 -80
  392. package/src/services/system/auto-sync.ts +31 -51
  393. package/src/services/system/skills.ts +56 -60
  394. package/dist/lib/path-root-guard.d.ts +0 -4
  395. package/dist/lib/path-root-guard.d.ts.map +0 -1
  396. package/dist/sdk/components/color-utils.d.ts +0 -4
  397. package/dist/sdk/components/color-utils.d.ts.map +0 -1
  398. package/dist/sdk/components/compact-switcher.d.ts +0 -10
  399. package/dist/sdk/components/compact-switcher.d.ts.map +0 -1
  400. package/dist/sdk/components/connectors.d.ts +0 -15
  401. package/dist/sdk/components/connectors.d.ts.map +0 -1
  402. package/dist/sdk/components/connectors.test.d.ts +0 -2
  403. package/dist/sdk/components/connectors.test.d.ts.map +0 -1
  404. package/dist/sdk/components/edge.d.ts +0 -4
  405. package/dist/sdk/components/edge.d.ts.map +0 -1
  406. package/dist/sdk/components/error-boundary.d.ts +0 -23
  407. package/dist/sdk/components/error-boundary.d.ts.map +0 -1
  408. package/dist/sdk/components/graph-theme.d.ts +0 -17
  409. package/dist/sdk/components/graph-theme.d.ts.map +0 -1
  410. package/dist/sdk/components/header.d.ts +0 -3
  411. package/dist/sdk/components/header.d.ts.map +0 -1
  412. package/dist/sdk/components/hooks.d.ts +0 -15
  413. package/dist/sdk/components/hooks.d.ts.map +0 -1
  414. package/dist/sdk/components/layout.d.ts +0 -27
  415. package/dist/sdk/components/layout.d.ts.map +0 -1
  416. package/dist/sdk/components/layout.test.d.ts +0 -2
  417. package/dist/sdk/components/layout.test.d.ts.map +0 -1
  418. package/dist/sdk/components/node-card.d.ts +0 -10
  419. package/dist/sdk/components/node-card.d.ts.map +0 -1
  420. package/dist/sdk/components/orchestrator-panel-contexts.d.ts +0 -16
  421. package/dist/sdk/components/orchestrator-panel-contexts.d.ts.map +0 -1
  422. package/dist/sdk/components/orchestrator-panel-store.d.ts +0 -46
  423. package/dist/sdk/components/orchestrator-panel-store.d.ts.map +0 -1
  424. package/dist/sdk/components/orchestrator-panel-store.test.d.ts +0 -2
  425. package/dist/sdk/components/orchestrator-panel-store.test.d.ts.map +0 -1
  426. package/dist/sdk/components/orchestrator-panel-types.d.ts +0 -18
  427. package/dist/sdk/components/orchestrator-panel-types.d.ts.map +0 -1
  428. package/dist/sdk/components/orchestrator-panel.d.ts +0 -52
  429. package/dist/sdk/components/orchestrator-panel.d.ts.map +0 -1
  430. package/dist/sdk/components/session-graph-panel.d.ts +0 -7
  431. package/dist/sdk/components/session-graph-panel.d.ts.map +0 -1
  432. package/dist/sdk/components/status-helpers.d.ts +0 -6
  433. package/dist/sdk/components/status-helpers.d.ts.map +0 -1
  434. package/dist/sdk/components/statusline.d.ts +0 -7
  435. package/dist/sdk/components/statusline.d.ts.map +0 -1
  436. package/dist/sdk/components/workflow-picker-panel.d.ts +0 -123
  437. package/dist/sdk/components/workflow-picker-panel.d.ts.map +0 -1
  438. package/dist/sdk/define-workflow.d.ts +0 -78
  439. package/dist/sdk/define-workflow.d.ts.map +0 -1
  440. package/dist/sdk/define-workflow.test.d.ts +0 -2
  441. package/dist/sdk/define-workflow.test.d.ts.map +0 -1
  442. package/dist/sdk/errors.d.ts +0 -24
  443. package/dist/sdk/errors.d.ts.map +0 -1
  444. package/dist/sdk/errors.test.d.ts +0 -2
  445. package/dist/sdk/errors.test.d.ts.map +0 -1
  446. package/dist/sdk/index.d.ts +0 -13
  447. package/dist/sdk/index.d.ts.map +0 -1
  448. package/dist/sdk/providers/claude.d.ts +0 -170
  449. package/dist/sdk/providers/claude.d.ts.map +0 -1
  450. package/dist/sdk/providers/copilot.d.ts +0 -11
  451. package/dist/sdk/providers/copilot.d.ts.map +0 -1
  452. package/dist/sdk/providers/opencode.d.ts +0 -11
  453. package/dist/sdk/providers/opencode.d.ts.map +0 -1
  454. package/dist/sdk/runtime/discovery.d.ts +0 -86
  455. package/dist/sdk/runtime/discovery.d.ts.map +0 -1
  456. package/dist/sdk/runtime/executor-entry.d.ts +0 -11
  457. package/dist/sdk/runtime/executor-entry.d.ts.map +0 -1
  458. package/dist/sdk/runtime/executor.d.ts +0 -72
  459. package/dist/sdk/runtime/executor.d.ts.map +0 -1
  460. package/dist/sdk/runtime/executor.test.d.ts +0 -2
  461. package/dist/sdk/runtime/executor.test.d.ts.map +0 -1
  462. package/dist/sdk/runtime/graph-inference.d.ts +0 -35
  463. package/dist/sdk/runtime/graph-inference.d.ts.map +0 -1
  464. package/dist/sdk/runtime/loader.d.ts +0 -70
  465. package/dist/sdk/runtime/loader.d.ts.map +0 -1
  466. package/dist/sdk/runtime/panel.d.ts +0 -9
  467. package/dist/sdk/runtime/panel.d.ts.map +0 -1
  468. package/dist/sdk/runtime/theme.d.ts +0 -28
  469. package/dist/sdk/runtime/theme.d.ts.map +0 -1
  470. package/dist/sdk/runtime/tmux.d.ts +0 -297
  471. package/dist/sdk/runtime/tmux.d.ts.map +0 -1
  472. package/dist/sdk/types.d.ts +0 -295
  473. package/dist/sdk/types.d.ts.map +0 -1
  474. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts +0 -62
  475. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +0 -1
  476. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts +0 -46
  477. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +0 -1
  478. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts +0 -26
  479. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts.map +0 -1
  480. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts +0 -92
  481. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts.map +0 -1
  482. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts +0 -57
  483. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts.map +0 -1
  484. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts +0 -49
  485. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +0 -1
  486. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts +0 -14
  487. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts.map +0 -1
  488. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts +0 -14
  489. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts.map +0 -1
  490. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts +0 -17
  491. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts.map +0 -1
  492. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts +0 -119
  493. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts.map +0 -1
  494. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts +0 -20
  495. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts.map +0 -1
  496. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts +0 -14
  497. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts.map +0 -1
  498. package/dist/sdk/workflows/index.d.ts +0 -24
  499. package/dist/sdk/workflows/index.d.ts.map +0 -1
  500. package/dist/services/config/definitions.d.ts +0 -85
  501. package/dist/services/config/definitions.d.ts.map +0 -1
  502. package/dist/services/system/copy.d.ts +0 -77
  503. package/dist/services/system/copy.d.ts.map +0 -1
  504. package/dist/services/system/detect.d.ts +0 -75
  505. package/dist/services/system/detect.d.ts.map +0 -1
  506. package/tsconfig.json +0 -33
@@ -0,0 +1,288 @@
1
+ # Bias Mitigation Techniques for LLM Evaluation
2
+
3
+ This reference details specific techniques for mitigating known biases in LLM-as-a-Judge systems.
4
+
5
+ ## Position Bias
6
+
7
+ ### The Problem
8
+
9
+ In pairwise comparison, LLMs systematically prefer responses in certain positions. Research shows:
10
+ - GPT has mild first-position bias (~55% preference for first position in ties)
11
+ - Claude shows similar patterns
12
+ - Smaller models often show stronger bias
13
+
14
+ ### Mitigation: Position Swapping Protocol
15
+
16
+ ```python
17
async def position_swap_comparison(response_a, response_b, prompt, criteria):
    """Judge a pair in both presentation orders and reconcile the two verdicts.

    Returns a dict with ``winner`` and ``confidence``. When both passes agree
    (after relabelling the swapped pass), the shared winner is returned with
    the mean confidence; otherwise the result is a ``'TIE'`` at 0.5 confidence
    flagged with ``bias_detected``.
    """
    # Labels from the swapped pass refer to swapped positions; translate them back.
    relabel = {'A': 'B', 'B': 'A', 'TIE': 'TIE'}

    forward = await compare(response_a, response_b, prompt, criteria)
    swapped = await compare(response_b, response_a, prompt, criteria)

    swapped_winner = relabel[swapped['winner']]
    swapped_confidence = swapped['confidence']

    if forward['winner'] != swapped_winner:
        # The verdict flipped with the ordering, so position influenced the judge.
        return {
            'winner': 'TIE',
            'confidence': 0.5,
            'position_consistent': False,
            'bias_detected': True,
        }

    return {
        'winner': forward['winner'],
        'confidence': (forward['confidence'] + swapped_confidence) / 2,
        'position_consistent': True,
    }
45
+ ```
46
+
47
+ ### Alternative: Multiple Shuffles
48
+
49
+ For higher reliability, use multiple position orderings:
50
+
51
+ ```python
52
async def multi_shuffle_comparison(response_a, response_b, prompt, criteria, n_shuffles=3):
    """Run several comparisons with alternating order and take a majority vote.

    Even-numbered rounds present A first; odd-numbered rounds present B first
    and have their winner label translated back to the original naming.

    Returns a dict with ``winner``, ``confidence`` (share of rounds won by the
    leading label) and ``n_shuffles``. If several labels share the top vote
    count the result is ``'TIE'`` rather than an arbitrary pick.

    Raises:
        ValueError: if ``n_shuffles`` is less than 1.
    """
    from collections import Counter

    if n_shuffles < 1:
        raise ValueError("n_shuffles must be at least 1")

    relabel = {'A': 'B', 'B': 'A', 'TIE': 'TIE'}
    winners = []
    for i in range(n_shuffles):
        if i % 2 == 0:
            r = await compare(response_a, response_b, prompt, criteria)
            winners.append(r['winner'])
        else:
            r = await compare(response_b, response_a, prompt, criteria)
            # Translate without mutating the dict that compare() returned.
            winners.append(relabel[r['winner']])

    # Majority vote with deterministic tie handling: max(set(...)) would depend
    # on set iteration order when two labels have the same count.
    ranked = Counter(winners).most_common()
    top_count = ranked[0][1]
    leaders = [label for label, count in ranked if count == top_count]
    final_winner = leaders[0] if len(leaders) == 1 else 'TIE'
    agreement = top_count / len(winners)

    return {
        'winner': final_winner,
        'confidence': agreement,
        'n_shuffles': n_shuffles,
    }
72
+ ```
73
+
74
+ ## Length Bias
75
+
76
+ ### The Problem
77
+
78
+ LLMs tend to rate longer responses higher, regardless of quality. This manifests as:
79
+ - Verbose responses receiving inflated scores
80
+ - Concise but complete responses penalized
81
+ - Padding and repetition being rewarded
82
+
83
+ ### Mitigation: Explicit Prompting
84
+
85
+ Include anti-length-bias instructions in the prompt:
86
+
87
+ ```
88
+ CRITICAL EVALUATION GUIDELINES:
89
+ - Do NOT prefer responses because they are longer
90
+ - Concise, complete answers are as valuable as detailed ones
91
+ - Penalize unnecessary verbosity or repetition
92
+ - Focus on information density, not word count
93
+ ```
94
+
95
+ ### Mitigation: Length-Normalized Scoring
96
+
97
+ ```python
98
def length_normalized_score(score, response_length, target_length=500):
    """Adjust score based on response length.

    Responses longer than 2x ``target_length`` lose 0.1 per unit of ratio above
    2.0; responses shorter than 0.3x lose 0.5 per unit of ratio below 0.3.
    The penalized score is floored at 1, but the adjustment never raises a
    score that was already below that floor.

    Raises:
        ValueError: if ``target_length`` is not positive.
    """
    if target_length <= 0:
        raise ValueError("target_length must be positive")

    length_ratio = response_length / target_length

    if length_ratio > 2.0:
        # Penalize excessively long responses
        penalty = (length_ratio - 2.0) * 0.1
    elif length_ratio < 0.3:
        # Penalize excessively short responses
        penalty = (0.3 - length_ratio) * 0.5
    else:
        return score

    # A plain max(..., 1) would lift a sub-1 score up to 1; cap the floor at
    # the incoming score so a penalty can only lower the result.
    floor = min(score, 1)
    return max(score - penalty, floor)
112
+ ```
113
+
114
+ ### Mitigation: Separate Length Criterion
115
+
116
+ Make length a separate, explicit criterion so it's not implicitly rewarded:
117
+
118
+ ```python
119
+ criteria = [
120
+ {"name": "Accuracy", "description": "Factual correctness", "weight": 0.4},
121
+ {"name": "Completeness", "description": "Covers key points", "weight": 0.3},
122
+ {"name": "Conciseness", "description": "No unnecessary content", "weight": 0.3} # Explicit
123
+ ]
124
+ ```
125
+
126
+ ## Self-Enhancement Bias
127
+
128
+ ### The Problem
129
+
130
+ Models rate outputs generated by themselves (or similar models) higher than outputs from different models.
131
+
132
+ ### Mitigation: Cross-Model Evaluation
133
+
134
+ Use a different model family for evaluation than generation:
135
+
136
+ ```python
137
def get_evaluator_model(generator_model):
    """Select evaluator to avoid self-enhancement bias.

    Output from a GPT-family generator (case-insensitive substring match on
    ``'gpt'``) is judged by ``'claude-4-5-sonnet'``; every other generator,
    Claude included, is judged by ``'gpt-5.2'``.
    """
    if 'gpt' in generator_model.lower():
        return 'claude-4-5-sonnet'
    # The Claude branch and the default returned the same model, so they
    # collapse into one fall-through.
    return 'gpt-5.2'
145
+ ```
146
+
147
+ ### Mitigation: Blind Evaluation
148
+
149
+ Remove model attribution from responses before evaluation:
150
+
151
+ ```python
152
def anonymize_response(response, model_name):
    """Remove model-identifying patterns.

    Replaces each known self-identifying phrase in ``response`` with
    ``[REDACTED]`` (exact, case-sensitive substring match) and returns the
    resulting string.
    """
    patterns = []
    if model_name:
        # With an empty name the pattern would be "As ", which also matches
        # ordinary prose such as "As a rule", so it is only added when a name
        # is given.
        patterns.append(f"As {model_name}")
    patterns.extend([
        "I am an AI",
        "I don't have personal opinions",
        # Model-specific patterns
    ])

    anonymized = response
    for pattern in patterns:
        anonymized = anonymized.replace(pattern, "[REDACTED]")
    return anonymized
164
+ ```
165
+
166
+ ## Verbosity Bias
167
+
168
+ ### The Problem
169
+
170
+ Detailed explanations receive higher scores even when the extra detail is irrelevant or incorrect.
171
+
172
+ ### Mitigation: Relevance-Weighted Scoring
173
+
174
+ ```python
175
async def relevance_weighted_evaluation(response, prompt, criteria):
    """Score a response as the relevance-weighted mean of its relevant segments.

    Segments whose relevance is 0.5 or lower are ignored. The remaining
    segment scores are averaged with their relevance as the weight. Returns
    0.0 when no segment is relevant enough to count.
    """
    # First, assess relevance of each segment
    relevance_scores = await assess_relevance(response, prompt)

    segments = split_into_segments(response)
    weighted_total = 0.0
    weight_sum = 0.0
    for segment, relevance in zip(segments, relevance_scores):
        if relevance > 0.5:  # Only count relevant segments
            score = await evaluate_segment(segment, prompt, criteria)
            weighted_total += score * relevance
            weight_sum += relevance

    if weight_sum == 0:
        # Nothing passed the threshold; avoid dividing by zero.
        return 0.0

    # Normalize by the total weight, not the segment count, so a response made
    # only of relevant segments is not scaled down by its own weights.
    return weighted_total / weight_sum
188
+ ```
189
+
190
+ ### Mitigation: Rubric with Verbosity Penalty
191
+
192
+ Include explicit verbosity penalties in rubrics:
193
+
194
+ ```python
195
+ rubric_levels = [
196
+ {
197
+ "score": 5,
198
+ "description": "Complete and concise. All necessary information, nothing extraneous.",
199
+ "characteristics": ["Every sentence adds value", "No repetition", "Appropriately scoped"]
200
+ },
201
+ {
202
+ "score": 3,
203
+ "description": "Complete but verbose. Contains unnecessary detail or repetition.",
204
+ "characteristics": ["Main points covered", "Some tangents", "Could be more concise"]
205
+ },
206
+ # ... etc
207
+ ]
208
+ ```
209
+
210
+ ## Authority Bias
211
+
212
+ ### The Problem
213
+
214
+ Confident, authoritative tone is rated higher regardless of accuracy.
215
+
216
+ ### Mitigation: Evidence Requirement
217
+
218
+ Require explicit evidence for claims:
219
+
220
+ ```
221
+ For each claim in the response:
222
+ 1. Identify whether it's a factual claim
223
+ 2. Note if evidence or sources are provided
224
+ 3. Score based on verifiability, not confidence
225
+
226
+ IMPORTANT: Confident claims without evidence should NOT receive higher scores than
227
+ hedged claims with evidence.
228
+ ```
229
+
230
+ ### Mitigation: Fact-Checking Layer
231
+
232
+ Add a fact-checking step before scoring:
233
+
234
+ ```python
235
async def fact_checked_evaluation(response, prompt, criteria):
    """Scale the base evaluation score by the share of claims that verify.

    The returned score is ``base_score * (0.7 + 0.3 * accuracy_factor)``, so a
    response keeps at least 70% of its base score. When no claims are
    extracted the base score is returned unscaled.
    """
    # Extract claims
    claims = await extract_claims(response)

    # Fact-check each claim
    fact_check_results = await asyncio.gather(*[
        verify_claim(claim) for claim in claims
    ])

    if fact_check_results:
        accuracy_factor = sum(r['verified'] for r in fact_check_results) / len(fact_check_results)
    else:
        # No claims means nothing was contradicted; also avoids dividing by zero.
        accuracy_factor = 1.0

    base_score = await evaluate(response, prompt, criteria)
    return base_score * (0.7 + 0.3 * accuracy_factor)  # At least 70% of score
249
+ ```
250
+
251
+ ## Aggregate Bias Detection
252
+
253
+ Monitor for systematic biases in production:
254
+
255
+ ```python
256
class BiasMonitor:
    """Accumulate evaluation records and test them for systematic bias."""

    def __init__(self):
        self.evaluations = []

    def record(self, evaluation):
        """Store one evaluation record (a dict) for later analysis."""
        self.evaluations.append(evaluation)

    def detect_position_bias(self):
        """Detect if first position wins more often than expected.

        Compares the number of records with a truthy ``first_position_winner``
        against a 50% expectation and flags bias when ``|z| > 2``. With no
        recorded evaluations, reports no bias and a z-score of 0.0.
        """
        total = len(self.evaluations)
        if total == 0:
            # The z-score denominator would be zero.
            return {'bias_detected': False, 'z_score': 0.0}

        first_wins = sum(1 for e in self.evaluations if e['first_position_winner'])
        expected = total * 0.5
        # Binomial standard deviation sqrt(n * p * (1 - p)) with p = 0.5.
        std_dev = (expected * 0.5) ** 0.5
        z_score = (first_wins - expected) / std_dev
        return {'bias_detected': abs(z_score) > 2, 'z_score': z_score}

    def detect_length_bias(self):
        """Detect if longer responses score higher.

        Flags bias when the Spearman correlation between ``response_length``
        and ``score`` exceeds 0.3 with ``p < 0.05``. With fewer than two
        records the correlation is undefined, so no bias is reported and the
        correlation is NaN.
        """
        if len(self.evaluations) < 2:
            return {'bias_detected': False, 'correlation': float('nan')}

        from scipy.stats import spearmanr
        lengths = [e['response_length'] for e in self.evaluations]
        scores = [e['score'] for e in self.evaluations]
        corr, p_value = spearmanr(lengths, scores)
        return {'bias_detected': corr > 0.3 and p_value < 0.05, 'correlation': corr}
277
+ ```
278
+
279
+ ## Summary Table
280
+
281
+ | Bias | Primary Mitigation | Secondary Mitigation | Detection Method |
282
+ |------|-------------------|---------------------|------------------|
283
+ | Position | Position swapping | Multiple shuffles | Consistency check |
284
+ | Length | Explicit prompting | Length normalization | Length-score correlation |
285
+ | Self-enhancement | Cross-model evaluation | Anonymization | Model comparison study |
286
+ | Verbosity | Relevance weighting | Rubric penalties | Relevance scoring |
287
+ | Authority | Evidence requirement | Fact-checking layer | Confidence-accuracy correlation |
288
+
@@ -0,0 +1,43 @@
1
+ # Evaluation Pipeline Diagram
2
+
3
+ Visual layout of a production evaluation pipeline.
4
+
5
+ ```
6
+ ┌─────────────────────────────────────────────────┐
7
+ │ Evaluation Pipeline │
8
+ ├─────────────────────────────────────────────────┤
9
+ │ │
10
+ │ Input: Response + Prompt + Context │
11
+ │ │ │
12
+ │ ▼ │
13
+ │ ┌─────────────────────┐ │
14
+ │ │ Criteria Loader │ ◄── Rubrics, weights │
15
+ │ └──────────┬──────────┘ │
16
+ │ │ │
17
+ │ ▼ │
18
+ │ ┌─────────────────────┐ │
19
+ │ │ Primary Scorer │ ◄── Direct or Pairwise │
20
+ │ └──────────┬──────────┘ │
21
+ │ │ │
22
+ │ ▼ │
23
+ │ ┌─────────────────────┐ │
24
+ │ │ Bias Mitigation │ ◄── Position swap, etc. │
25
+ │ └──────────┬──────────┘ │
26
+ │ │ │
27
+ │ ▼ │
28
+ │ ┌─────────────────────┐ │
29
+ │ │ Confidence Scoring │ ◄── Calibration │
30
+ │ └──────────┬──────────┘ │
31
+ │ │ │
32
+ │ ▼ │
33
+ │ Output: Scores + Justifications + Confidence │
34
+ │ │
35
+ └─────────────────────────────────────────────────┘
36
+ ```
37
+
38
+ ## Pipeline Stages
39
+
40
+ 1. **Criteria Loader**: Loads rubrics and criterion weights from configuration
41
+ 2. **Primary Scorer**: Applies direct scoring or pairwise comparison
42
+ 3. **Bias Mitigation**: Runs position swaps, length normalization, and other debiasing
43
+ 4. **Confidence Scoring**: Calibrates confidence based on position consistency and evidence strength
@@ -0,0 +1,315 @@
1
+ # LLM-as-Judge Implementation Patterns
2
+
3
+ This reference provides detailed implementation patterns for building production-grade LLM evaluation systems.
4
+
5
+ ## Pattern 1: Structured Evaluation Pipeline
6
+
7
+ The most reliable evaluation systems follow a structured pipeline that separates concerns:
8
+
9
+ ```
10
+ Input Validation → Criteria Loading → Scoring → Bias Mitigation → Output Formatting
11
+ ```
12
+
13
+ ### Input Validation Layer
14
+
15
+ Before evaluation begins, validate:
16
+
17
+ 1. **Response presence**: Non-empty response to evaluate
18
+ 2. **Prompt presence**: Original prompt for context
19
+ 3. **Criteria validity**: At least one criterion with name and description
20
+ 4. **Weight normalization**: Weights sum to 1.0 (or normalize them)
21
+
22
+ ```python
23
def validate_input(response, prompt, criteria):
    """Validate evaluation inputs and normalize criterion weights in place.

    Each criterion dict gets its ``weight`` (default 1) divided by the total
    so the weights sum to 1.0. Returns ``None``.

    Raises:
        ValueError: if ``response`` or ``prompt`` is empty or whitespace,
            if ``criteria`` is empty, or if the weights do not sum to a
            positive number.
    """
    if not response or not response.strip():
        raise ValueError("Response cannot be empty")
    if not prompt or not prompt.strip():
        raise ValueError("Prompt cannot be empty")
    if not criteria:
        raise ValueError("At least one criterion required")

    # Normalize weights
    total_weight = sum(c.get('weight', 1) for c in criteria)
    if total_weight <= 0:
        # A zero total would divide by zero; a negative one would flip signs.
        raise ValueError("Criterion weights must sum to a positive number")
    for c in criteria:
        c['weight'] = c.get('weight', 1) / total_weight
35
+ ```
36
+
37
+ ### Criteria Loading Layer
38
+
39
+ Criteria should be loaded from configuration, not hardcoded:
40
+
41
+ ```python
42
class CriteriaLoader:
    """Look up evaluation criteria and rubric levels from loaded rubrics.

    NOTE(review): ``_load_rubrics`` is not defined in this snippet; it must be
    supplied (e.g. by a subclass) and presumably returns a dict. Confirm.
    """

    # Fallback for get_criteria(); without it an unknown task type raised
    # AttributeError because the attribute was never set.
    default_criteria = ()

    def __init__(self, rubric_path=None):
        self.rubrics = self._load_rubrics(rubric_path)

    def get_criteria(self, task_type):
        """Return the entry stored under ``task_type``, else ``default_criteria``."""
        return self.rubrics.get(task_type, self.default_criteria)

    def get_rubric(self, criterion_name):
        """Return the ``'levels'`` list stored for ``criterion_name``, or ``[]``."""
        return self.rubrics.get(criterion_name, {}).get('levels', [])
51
+ ```
52
+
53
+ ### Scoring Layer
54
+
55
+ The scoring layer handles the actual LLM call:
56
+
57
+ ```python
58
async def score_response(response, prompt, criteria, rubric, model):
    """Ask ``model`` to score ``response`` and return the parsed scores.

    The system prompt is built from the criteria and rubric, the user prompt
    from the response, original prompt and criteria.
    """
    generation = await generate_text(
        model=model,
        system=build_system_prompt(criteria, rubric),
        prompt=build_user_prompt(response, prompt, criteria),
        temperature=0.3,  # Lower temperature for consistency
    )
    return parse_scores(generation.text)
70
+ ```
71
+
72
+ ### Bias Mitigation Layer
73
+
74
+ For pairwise comparison, always include position swapping:
75
+
76
+ ```python
77
async def compare_with_bias_mitigation(response_a, response_b, prompt, criteria, model):
    """Compare a pair in both orders and report whether the verdicts agree.

    Returns a dict with ``winner``, ``confidence`` and ``consistent``. When the
    two passes disagree the result is a ``'TIE'`` at 0.5 confidence.
    """
    a_first = await compare_pair(response_a, response_b, prompt, criteria, model)
    b_first = await compare_pair(response_b, response_a, prompt, criteria, model)

    # Express the swapped pass in the original labelling (A→B, B→A, TIE→TIE).
    b_first_winner = map_winner(b_first.winner)

    if a_first.winner != b_first_winner:
        return {
            'winner': 'TIE',
            'confidence': 0.5,
            'consistent': False,
        }

    return {
        'winner': a_first.winner,
        'confidence': (a_first.confidence + b_first.confidence) / 2,
        'consistent': True,
    }
100
+ ```
101
+
102
+ ## Pattern 2: Hierarchical Evaluation
103
+
104
+ For complex evaluations, use a hierarchical approach:
105
+
106
+ ```
107
+ Quick Screen (cheap model) → Detailed Evaluation (expensive model) → Human Review (edge cases)
108
+ ```
109
+
110
+ ### Quick Screen Implementation
111
+
112
+ ```python
113
async def quick_screen(response, prompt, threshold=0.7):
    """Fast, cheap screening for obvious passes/fails.

    Returns ``(score, passed)`` where ``score`` is the model's reply parsed as
    a float and ``passed`` is ``score > threshold``.
    """
    reply = await generate_text(
        model='gpt-5.2',  # Cheaper model
        prompt=f"Rate 0-1 if this response adequately addresses the prompt:\n\nPrompt: {prompt}\n\nResponse: {response}",
        temperature=0,
    )
    # The reply text is expected to be a bare number; float() raises otherwise.
    score = float(reply.text.strip())
    passed = score > threshold
    return score, passed
122
+ ```
123
+
124
+ ### Detailed Evaluation
125
+
126
+ ```python
127
async def detailed_evaluation(response, prompt, criteria):
    """Full evaluation for borderline or important cases.

    Sends the detailed prompt to the model and returns the parsed scores.
    """
    generation = await generate_text(
        model='gpt-5.2',  # More capable model
        system=DETAILED_EVALUATION_PROMPT,
        prompt=build_detailed_prompt(response, prompt, criteria),
        temperature=0.3,
    )
    detailed_scores = parse_detailed_scores(generation.text)
    return detailed_scores
136
+ ```
137
+
138
+ ## Pattern 3: Panel of LLM Judges (PoLL)
139
+
140
+ For high-stakes evaluation, use multiple models:
141
+
142
+ ```python
143
async def poll_evaluation(response, prompt, criteria, models):
    """Aggregate judgments from multiple LLM judges.

    Scores the response with every model concurrently, then returns the
    aggregated scores, the agreement measure and the per-judge results.
    """
    judge_calls = [
        score_with_model(response, prompt, criteria, judge)
        for judge in models
    ]
    verdicts = await asyncio.gather(*judge_calls)

    return {
        'scores': aggregate_scores(verdicts),
        'agreement': calculate_agreement(verdicts),
        'individual_results': verdicts,
    }
161
+
162
def aggregate_scores(results):
    """Aggregate scores using median (robust to outliers).

    ``results`` is a list of dicts, each with a ``'scores'`` mapping of
    criterion name to numeric score. The criteria are taken from the first
    result. Returns ``{criterion: {'score': median, 'std': stdev}}``, with
    ``std`` set to 0 when there is only one judge, and ``{}`` when there are
    no results.
    """
    if not results:
        # results[0] would raise IndexError on an empty panel.
        return {}

    aggregated = {}
    for criterion in results[0]['scores']:
        values = [r['scores'][criterion] for r in results]
        aggregated[criterion] = {
            'score': statistics.median(values),
            # stdev needs at least two data points.
            'std': statistics.stdev(values) if len(values) > 1 else 0,
        }
    return aggregated
172
+ ```
173
+
174
+ ## Pattern 4: Confidence Calibration
175
+
176
+ Confidence scores should be calibrated to actual reliability:
177
+
178
+ ```python
179
def calibrate_confidence(raw_confidence, position_consistent, evidence_count):
    """Calibrate confidence based on multiple signals.

    Starts from ``raw_confidence``, multiplies by 0.6 when the judgment was
    not position-consistent, scales by a factor between 0.7 and 1.0 depending
    on ``evidence_count`` (saturating at 3), and caps the result at 0.99.
    """
    # Base confidence from model output
    calibrated = raw_confidence

    # Position consistency is a strong signal
    if not position_consistent:
        calibrated *= 0.6  # Significant reduction

    # More evidence = higher confidence. Clamp to [0, 1] so a negative count
    # cannot push the multiplier below 0.7 or flip the sign.
    evidence_factor = max(0.0, min(evidence_count / 3, 1.0))  # Cap at 3 pieces
    calibrated *= (0.7 + 0.3 * evidence_factor)

    return min(calibrated, 0.99)  # Never 100% confident
194
+ ```
195
+
196
+ ## Pattern 5: Output Formatting
197
+
198
+ Always return structured outputs with consistent schemas:
199
+
200
+ ```python
201
@dataclass
class ScoreResult:
    """Score assigned to a single criterion, with its supporting rationale."""

    criterion: str      # name of the criterion that was scored
    score: float        # score awarded
    max_score: float    # upper bound of the scale used for `score`
    justification: str  # explanation for the score
    evidence: List[str]  # evidence items backing the score
    improvement: str    # improvement suggestion
209
+
210
@dataclass
class EvaluationResult:
    """Top-level result of one evaluation run."""

    success: bool              # whether the evaluation completed
    scores: List[ScoreResult]  # per-criterion results
    overall_score: float       # unweighted aggregate of the criterion scores
    weighted_score: float      # weighted aggregate of the criterion scores
    summary: Dict[str, Any]    # summary data for the run
    metadata: Dict[str, Any]   # caller-supplied metadata
218
+
219
def format_output(scores, metadata) -> EvaluationResult:
    """Format evaluation results consistently.

    Wraps the per-criterion ``scores`` and ``metadata`` in an
    ``EvaluationResult``. ``overall_score`` is the plain mean of the criterion
    scores, or 0.0 when ``scores`` is empty.
    """
    # Guard the mean: an empty score list would otherwise divide by zero.
    overall = sum(s.score for s in scores) / len(scores) if scores else 0.0
    return EvaluationResult(
        success=True,
        scores=scores,
        overall_score=overall,
        weighted_score=calculate_weighted_score(scores),
        summary=generate_summary(scores),
        metadata=metadata,
    )
229
+ ```
230
+
231
+ ## Error Handling Patterns
232
+
233
+ ### Graceful Degradation
234
+
235
+ ```python
236
async def evaluate_with_fallback(response, prompt, criteria):
    """Run the full evaluation, degrading gracefully on known failures.

    A rate limit falls back to the simpler evaluation; a parse failure yields
    a dict with ``success`` set to False, the partial data and the error text.
    """
    try:
        outcome = await full_evaluation(response, prompt, criteria)
    except RateLimitError:
        # Fall back to simpler evaluation
        outcome = await simple_evaluation(response, prompt, criteria)
    except ParseError as parse_err:
        # Return partial results with error flag
        outcome = {
            'success': False,
            'partial_results': parse_err.partial_data,
            'error': str(parse_err),
        }
    return outcome
249
+ ```
250
+
251
+ ### Retry Logic
252
+
253
+ ```python
254
async def evaluate_with_retry(response, prompt, criteria, max_retries=3):
    """Evaluate with up to ``max_retries`` attempts and exponential backoff.

    Returns the first result accepted by ``is_valid_result``. An invalid
    result is retried immediately; a ``TransientError`` is retried after
    ``2 ** attempt`` seconds.

    Raises:
        EvaluationError: when every attempt fails, chained to the last
            ``TransientError`` if there was one.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            result = await evaluate(response, prompt, criteria)
            if is_valid_result(result):
                return result
        except TransientError as err:
            last_error = err
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the error.
            if attempt < max_retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

    raise EvaluationError("Max retries exceeded") from last_error
264
+ ```
265
+
266
+ ## Testing Patterns
267
+
268
+ ### Unit Tests for Parsing
269
+
270
+ ```python
271
def test_score_parsing():
    """A well-formed JSON payload parses into criterion/score fields."""
    parsed = parse_scores('{"scores": [{"criterion": "Accuracy", "score": 4}]}')
    first = parsed.scores[0]
    assert first.criterion == "Accuracy"
    assert first.score == 4
276
+
277
def test_malformed_output():
    """Non-JSON model output must surface as a ParseError."""
    with pytest.raises(ParseError):
        parse_scores('Invalid JSON')
281
+ ```
282
+
283
+ ### Integration Tests with Real API
284
+
285
+ ```python
286
@pytest.mark.integration
async def test_full_evaluation_pipeline():
    """End-to-end check: an accurate answer gets one high Accuracy score."""
    accuracy_only = [{"name": "Accuracy", "description": "Factual correctness", "weight": 1}]
    outcome = await evaluate(
        response="Water boils at 100°C at sea level.",
        prompt="At what temperature does water boil?",
        criteria=accuracy_only,
    )

    assert outcome.success
    assert len(outcome.scores) == 1
    assert outcome.scores[0].score >= 4  # Should score high for accurate response
297
+ ```
298
+
299
+ ### Bias Detection Tests
300
+
301
+ ```python
302
async def test_position_bias_mitigation():
    """Identical responses in both positions must tie consistently."""
    # Same response in both positions should tie
    result = await compare(
        response_a="Same response",
        response_b="Same response",
        prompt="Test prompt",
        criteria=["quality"],
        swap_positions=True,
    )

    assert result.winner == "TIE"
    # Assert truthiness directly instead of comparing with `== True` (PEP 8).
    assert result.consistent
314
+ ```
315
+