@bastani/atomic 0.6.8 → 0.7.0-2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (765) hide show
  1. package/bin/atomic +65 -0
  2. package/package.json +17 -82
  3. package/postinstall.mjs +47 -0
  4. package/.agents/skills/ado-commit/SKILL.md +0 -94
  5. package/.agents/skills/ado-create-pr/SKILL.md +0 -211
  6. package/.agents/skills/advanced-evaluation/SKILL.md +0 -404
  7. package/.agents/skills/advanced-evaluation/references/bias-mitigation.md +0 -288
  8. package/.agents/skills/advanced-evaluation/references/evaluation-pipeline.md +0 -43
  9. package/.agents/skills/advanced-evaluation/references/implementation-patterns.md +0 -315
  10. package/.agents/skills/advanced-evaluation/references/metrics-guide.md +0 -331
  11. package/.agents/skills/advanced-evaluation/scripts/evaluation_example.py +0 -392
  12. package/.agents/skills/ast-grep/SKILL.md +0 -325
  13. package/.agents/skills/ast-grep/references/rule_reference.md +0 -297
  14. package/.agents/skills/bdi-mental-states/SKILL.md +0 -313
  15. package/.agents/skills/bdi-mental-states/references/bdi-ontology-core.md +0 -207
  16. package/.agents/skills/bdi-mental-states/references/framework-integration.md +0 -582
  17. package/.agents/skills/bdi-mental-states/references/rdf-examples.md +0 -315
  18. package/.agents/skills/bdi-mental-states/references/sparql-competency.md +0 -420
  19. package/.agents/skills/bun/SKILL.md +0 -233
  20. package/.agents/skills/context-compression/SKILL.md +0 -274
  21. package/.agents/skills/context-compression/references/evaluation-framework.md +0 -213
  22. package/.agents/skills/context-compression/scripts/compression_evaluator.py +0 -862
  23. package/.agents/skills/context-compression/tests/test_compression_evaluator.py +0 -56
  24. package/.agents/skills/context-degradation/SKILL.md +0 -208
  25. package/.agents/skills/context-degradation/references/patterns.md +0 -314
  26. package/.agents/skills/context-degradation/scripts/degradation_detector.py +0 -614
  27. package/.agents/skills/context-fundamentals/SKILL.md +0 -203
  28. package/.agents/skills/context-fundamentals/references/context-components.md +0 -283
  29. package/.agents/skills/context-fundamentals/scripts/context_manager.py +0 -533
  30. package/.agents/skills/context-optimization/SKILL.md +0 -197
  31. package/.agents/skills/context-optimization/references/optimization_techniques.md +0 -272
  32. package/.agents/skills/context-optimization/scripts/compaction.py +0 -562
  33. package/.agents/skills/create-spec/SKILL.md +0 -249
  34. package/.agents/skills/docx/LICENSE.txt +0 -30
  35. package/.agents/skills/docx/SKILL.md +0 -592
  36. package/.agents/skills/docx/scripts/__init__.py +0 -1
  37. package/.agents/skills/docx/scripts/accept_changes.py +0 -135
  38. package/.agents/skills/docx/scripts/comment.py +0 -318
  39. package/.agents/skills/docx/scripts/office/helpers/__init__.py +0 -0
  40. package/.agents/skills/docx/scripts/office/helpers/merge_runs.py +0 -199
  41. package/.agents/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -197
  42. package/.agents/skills/docx/scripts/office/pack.py +0 -159
  43. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  44. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  45. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  46. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  47. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  48. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  49. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  50. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  51. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  52. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  53. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  54. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  55. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  56. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  57. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  58. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  59. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  60. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  61. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  62. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  63. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  64. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  65. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  66. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  67. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  68. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  69. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  70. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  71. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  72. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  73. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  74. package/.agents/skills/docx/scripts/office/schemas/mce/mc.xsd +0 -75
  75. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  76. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  77. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  78. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  79. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  80. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  81. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  82. package/.agents/skills/docx/scripts/office/soffice.py +0 -183
  83. package/.agents/skills/docx/scripts/office/unpack.py +0 -132
  84. package/.agents/skills/docx/scripts/office/validate.py +0 -111
  85. package/.agents/skills/docx/scripts/office/validators/__init__.py +0 -15
  86. package/.agents/skills/docx/scripts/office/validators/base.py +0 -847
  87. package/.agents/skills/docx/scripts/office/validators/docx.py +0 -446
  88. package/.agents/skills/docx/scripts/office/validators/pptx.py +0 -275
  89. package/.agents/skills/docx/scripts/office/validators/redlining.py +0 -247
  90. package/.agents/skills/docx/scripts/templates/comments.xml +0 -3
  91. package/.agents/skills/docx/scripts/templates/commentsExtended.xml +0 -3
  92. package/.agents/skills/docx/scripts/templates/commentsExtensible.xml +0 -3
  93. package/.agents/skills/docx/scripts/templates/commentsIds.xml +0 -3
  94. package/.agents/skills/docx/scripts/templates/people.xml +0 -3
  95. package/.agents/skills/evaluation/SKILL.md +0 -253
  96. package/.agents/skills/evaluation/references/metrics.md +0 -339
  97. package/.agents/skills/evaluation/scripts/evaluator.py +0 -627
  98. package/.agents/skills/explain-code/SKILL.md +0 -232
  99. package/.agents/skills/filesystem-context/SKILL.md +0 -289
  100. package/.agents/skills/filesystem-context/references/implementation-patterns.md +0 -549
  101. package/.agents/skills/filesystem-context/scripts/filesystem_context.py +0 -425
  102. package/.agents/skills/find-skills/SKILL.md +0 -144
  103. package/.agents/skills/gh-commit/SKILL.md +0 -245
  104. package/.agents/skills/gh-create-pr/SKILL.md +0 -95
  105. package/.agents/skills/hosted-agents/SKILL.md +0 -262
  106. package/.agents/skills/hosted-agents/references/infrastructure-patterns.md +0 -700
  107. package/.agents/skills/hosted-agents/scripts/sandbox_manager.py +0 -590
  108. package/.agents/skills/impeccable/SKILL.md +0 -178
  109. package/.agents/skills/impeccable/agents/openai.yaml +0 -4
  110. package/.agents/skills/impeccable/reference/adapt.md +0 -190
  111. package/.agents/skills/impeccable/reference/animate.md +0 -175
  112. package/.agents/skills/impeccable/reference/audit.md +0 -134
  113. package/.agents/skills/impeccable/reference/bolder.md +0 -113
  114. package/.agents/skills/impeccable/reference/brand.md +0 -114
  115. package/.agents/skills/impeccable/reference/clarify.md +0 -174
  116. package/.agents/skills/impeccable/reference/cognitive-load.md +0 -106
  117. package/.agents/skills/impeccable/reference/color-and-contrast.md +0 -105
  118. package/.agents/skills/impeccable/reference/colorize.md +0 -154
  119. package/.agents/skills/impeccable/reference/craft.md +0 -193
  120. package/.agents/skills/impeccable/reference/critique.md +0 -213
  121. package/.agents/skills/impeccable/reference/delight.md +0 -302
  122. package/.agents/skills/impeccable/reference/distill.md +0 -111
  123. package/.agents/skills/impeccable/reference/document.md +0 -427
  124. package/.agents/skills/impeccable/reference/extract.md +0 -70
  125. package/.agents/skills/impeccable/reference/harden.md +0 -347
  126. package/.agents/skills/impeccable/reference/heuristics-scoring.md +0 -234
  127. package/.agents/skills/impeccable/reference/interaction-design.md +0 -195
  128. package/.agents/skills/impeccable/reference/layout.md +0 -141
  129. package/.agents/skills/impeccable/reference/live.md +0 -594
  130. package/.agents/skills/impeccable/reference/motion-design.md +0 -109
  131. package/.agents/skills/impeccable/reference/onboard.md +0 -234
  132. package/.agents/skills/impeccable/reference/optimize.md +0 -258
  133. package/.agents/skills/impeccable/reference/overdrive.md +0 -130
  134. package/.agents/skills/impeccable/reference/personas.md +0 -178
  135. package/.agents/skills/impeccable/reference/polish.md +0 -232
  136. package/.agents/skills/impeccable/reference/product.md +0 -62
  137. package/.agents/skills/impeccable/reference/quieter.md +0 -99
  138. package/.agents/skills/impeccable/reference/responsive-design.md +0 -114
  139. package/.agents/skills/impeccable/reference/shape.md +0 -151
  140. package/.agents/skills/impeccable/reference/spatial-design.md +0 -100
  141. package/.agents/skills/impeccable/reference/teach.md +0 -156
  142. package/.agents/skills/impeccable/reference/typeset.md +0 -124
  143. package/.agents/skills/impeccable/reference/typography.md +0 -159
  144. package/.agents/skills/impeccable/reference/ux-writing.md +0 -107
  145. package/.agents/skills/impeccable/scripts/cleanup-deprecated.mjs +0 -284
  146. package/.agents/skills/impeccable/scripts/command-metadata.json +0 -94
  147. package/.agents/skills/impeccable/scripts/design-parser.mjs +0 -820
  148. package/.agents/skills/impeccable/scripts/detect-csp.mjs +0 -198
  149. package/.agents/skills/impeccable/scripts/is-generated.mjs +0 -69
  150. package/.agents/skills/impeccable/scripts/live-accept.mjs +0 -595
  151. package/.agents/skills/impeccable/scripts/live-browser.js +0 -4781
  152. package/.agents/skills/impeccable/scripts/live-inject.mjs +0 -445
  153. package/.agents/skills/impeccable/scripts/live-poll.mjs +0 -186
  154. package/.agents/skills/impeccable/scripts/live-server.mjs +0 -694
  155. package/.agents/skills/impeccable/scripts/live-wrap.mjs +0 -571
  156. package/.agents/skills/impeccable/scripts/live.mjs +0 -247
  157. package/.agents/skills/impeccable/scripts/load-context.mjs +0 -141
  158. package/.agents/skills/impeccable/scripts/modern-screenshot.umd.js +0 -14
  159. package/.agents/skills/impeccable/scripts/pin.mjs +0 -214
  160. package/.agents/skills/init/SKILL.md +0 -140
  161. package/.agents/skills/liteparse/SKILL.md +0 -223
  162. package/.agents/skills/memory-systems/SKILL.md +0 -221
  163. package/.agents/skills/memory-systems/references/implementation.md +0 -551
  164. package/.agents/skills/memory-systems/scripts/memory_store.py +0 -616
  165. package/.agents/skills/multi-agent-patterns/SKILL.md +0 -259
  166. package/.agents/skills/multi-agent-patterns/references/frameworks.md +0 -433
  167. package/.agents/skills/multi-agent-patterns/scripts/coordination.py +0 -613
  168. package/.agents/skills/opentui/SKILL.md +0 -202
  169. package/.agents/skills/opentui/references/animation/REFERENCE.md +0 -431
  170. package/.agents/skills/opentui/references/components/REFERENCE.md +0 -144
  171. package/.agents/skills/opentui/references/components/code-diff.md +0 -672
  172. package/.agents/skills/opentui/references/components/containers.md +0 -417
  173. package/.agents/skills/opentui/references/components/inputs.md +0 -531
  174. package/.agents/skills/opentui/references/components/text-display.md +0 -386
  175. package/.agents/skills/opentui/references/core/REFERENCE.md +0 -145
  176. package/.agents/skills/opentui/references/core/api.md +0 -543
  177. package/.agents/skills/opentui/references/core/configuration.md +0 -168
  178. package/.agents/skills/opentui/references/core/gotchas.md +0 -393
  179. package/.agents/skills/opentui/references/core/patterns.md +0 -449
  180. package/.agents/skills/opentui/references/keyboard/REFERENCE.md +0 -617
  181. package/.agents/skills/opentui/references/layout/REFERENCE.md +0 -337
  182. package/.agents/skills/opentui/references/layout/patterns.md +0 -444
  183. package/.agents/skills/opentui/references/react/REFERENCE.md +0 -174
  184. package/.agents/skills/opentui/references/react/api.md +0 -436
  185. package/.agents/skills/opentui/references/react/configuration.md +0 -302
  186. package/.agents/skills/opentui/references/react/gotchas.md +0 -443
  187. package/.agents/skills/opentui/references/react/patterns.md +0 -501
  188. package/.agents/skills/opentui/references/solid/REFERENCE.md +0 -201
  189. package/.agents/skills/opentui/references/solid/api.md +0 -564
  190. package/.agents/skills/opentui/references/solid/configuration.md +0 -316
  191. package/.agents/skills/opentui/references/solid/gotchas.md +0 -427
  192. package/.agents/skills/opentui/references/solid/patterns.md +0 -560
  193. package/.agents/skills/opentui/references/testing/REFERENCE.md +0 -614
  194. package/.agents/skills/pdf/LICENSE.txt +0 -30
  195. package/.agents/skills/pdf/SKILL.md +0 -316
  196. package/.agents/skills/pdf/forms.md +0 -294
  197. package/.agents/skills/pdf/reference.md +0 -612
  198. package/.agents/skills/pdf/scripts/check_bounding_boxes.py +0 -65
  199. package/.agents/skills/pdf/scripts/check_fillable_fields.py +0 -11
  200. package/.agents/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
  201. package/.agents/skills/pdf/scripts/create_validation_image.py +0 -37
  202. package/.agents/skills/pdf/scripts/extract_form_field_info.py +0 -122
  203. package/.agents/skills/pdf/scripts/extract_form_structure.py +0 -115
  204. package/.agents/skills/pdf/scripts/fill_fillable_fields.py +0 -98
  205. package/.agents/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
  206. package/.agents/skills/playwright-cli/SKILL.md +0 -390
  207. package/.agents/skills/playwright-cli/references/element-attributes.md +0 -23
  208. package/.agents/skills/playwright-cli/references/playwright-tests.md +0 -39
  209. package/.agents/skills/playwright-cli/references/request-mocking.md +0 -87
  210. package/.agents/skills/playwright-cli/references/running-code.md +0 -241
  211. package/.agents/skills/playwright-cli/references/session-management.md +0 -225
  212. package/.agents/skills/playwright-cli/references/spec-driven-testing.md +0 -305
  213. package/.agents/skills/playwright-cli/references/storage-state.md +0 -275
  214. package/.agents/skills/playwright-cli/references/test-generation.md +0 -134
  215. package/.agents/skills/playwright-cli/references/tracing.md +0 -139
  216. package/.agents/skills/playwright-cli/references/video-recording.md +0 -143
  217. package/.agents/skills/pptx/LICENSE.txt +0 -30
  218. package/.agents/skills/pptx/SKILL.md +0 -234
  219. package/.agents/skills/pptx/editing.md +0 -205
  220. package/.agents/skills/pptx/pptxgenjs.md +0 -420
  221. package/.agents/skills/pptx/scripts/__init__.py +0 -0
  222. package/.agents/skills/pptx/scripts/add_slide.py +0 -195
  223. package/.agents/skills/pptx/scripts/clean.py +0 -286
  224. package/.agents/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  225. package/.agents/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
  226. package/.agents/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
  227. package/.agents/skills/pptx/scripts/office/pack.py +0 -159
  228. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  229. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  230. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  231. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  232. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  233. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  234. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  235. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  236. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  237. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  238. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  239. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  240. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  241. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  242. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  243. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  244. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  245. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  246. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  247. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  248. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  249. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  250. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  251. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  252. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  253. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  254. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  255. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  256. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  257. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  258. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  259. package/.agents/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
  260. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  261. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  262. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  263. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  264. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  265. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  266. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  267. package/.agents/skills/pptx/scripts/office/soffice.py +0 -183
  268. package/.agents/skills/pptx/scripts/office/unpack.py +0 -132
  269. package/.agents/skills/pptx/scripts/office/validate.py +0 -111
  270. package/.agents/skills/pptx/scripts/office/validators/__init__.py +0 -15
  271. package/.agents/skills/pptx/scripts/office/validators/base.py +0 -847
  272. package/.agents/skills/pptx/scripts/office/validators/docx.py +0 -446
  273. package/.agents/skills/pptx/scripts/office/validators/pptx.py +0 -275
  274. package/.agents/skills/pptx/scripts/office/validators/redlining.py +0 -247
  275. package/.agents/skills/pptx/scripts/thumbnail.py +0 -289
  276. package/.agents/skills/project-development/SKILL.md +0 -293
  277. package/.agents/skills/project-development/references/case-studies.md +0 -388
  278. package/.agents/skills/project-development/references/pipeline-patterns.md +0 -610
  279. package/.agents/skills/project-development/scripts/pipeline_template.py +0 -796
  280. package/.agents/skills/prompt-engineer/SKILL.md +0 -265
  281. package/.agents/skills/prompt-engineer/references/advanced_patterns.md +0 -271
  282. package/.agents/skills/prompt-engineer/references/core_prompting.md +0 -137
  283. package/.agents/skills/prompt-engineer/references/quality_improvement.md +0 -193
  284. package/.agents/skills/research-codebase/SKILL.md +0 -229
  285. package/.agents/skills/ripgrep/SKILL.md +0 -384
  286. package/.agents/skills/skill-creator/LICENSE.txt +0 -202
  287. package/.agents/skills/skill-creator/SKILL.md +0 -487
  288. package/.agents/skills/skill-creator/agents/analyzer.md +0 -274
  289. package/.agents/skills/skill-creator/agents/comparator.md +0 -202
  290. package/.agents/skills/skill-creator/agents/grader.md +0 -223
  291. package/.agents/skills/skill-creator/assets/eval_review.html +0 -146
  292. package/.agents/skills/skill-creator/eval-viewer/generate_review.py +0 -471
  293. package/.agents/skills/skill-creator/eval-viewer/viewer.html +0 -1325
  294. package/.agents/skills/skill-creator/references/schemas.md +0 -430
  295. package/.agents/skills/skill-creator/scripts/__init__.py +0 -0
  296. package/.agents/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
  297. package/.agents/skills/skill-creator/scripts/generate_report.py +0 -326
  298. package/.agents/skills/skill-creator/scripts/improve_description.py +0 -247
  299. package/.agents/skills/skill-creator/scripts/package_skill.py +0 -136
  300. package/.agents/skills/skill-creator/scripts/quick_validate.py +0 -103
  301. package/.agents/skills/skill-creator/scripts/run_eval.py +0 -310
  302. package/.agents/skills/skill-creator/scripts/run_loop.py +0 -328
  303. package/.agents/skills/skill-creator/scripts/utils.py +0 -47
  304. package/.agents/skills/sl-commit/SKILL.md +0 -53
  305. package/.agents/skills/sl-submit-diff/SKILL.md +0 -57
  306. package/.agents/skills/tdd/SKILL.md +0 -111
  307. package/.agents/skills/tdd/deep-modules.md +0 -33
  308. package/.agents/skills/tdd/interface-design.md +0 -31
  309. package/.agents/skills/tdd/mocking.md +0 -59
  310. package/.agents/skills/tdd/refactoring.md +0 -10
  311. package/.agents/skills/tdd/tests.md +0 -61
  312. package/.agents/skills/tool-design/SKILL.md +0 -273
  313. package/.agents/skills/tool-design/references/architectural_reduction.md +0 -210
  314. package/.agents/skills/tool-design/references/best_practices.md +0 -176
  315. package/.agents/skills/tool-design/scripts/description_generator.py +0 -528
  316. package/.agents/skills/typescript-advanced-types/SKILL.md +0 -720
  317. package/.agents/skills/typescript-expert/SKILL.md +0 -434
  318. package/.agents/skills/typescript-expert/references/tsconfig-strict.json +0 -92
  319. package/.agents/skills/typescript-expert/references/typescript-cheatsheet.md +0 -383
  320. package/.agents/skills/typescript-expert/references/utility-types.ts +0 -335
  321. package/.agents/skills/typescript-expert/scripts/ts_diagnostic.py +0 -203
  322. package/.agents/skills/typescript-react-reviewer/SKILL.md +0 -201
  323. package/.agents/skills/typescript-react-reviewer/references/antipatterns.md +0 -510
  324. package/.agents/skills/typescript-react-reviewer/references/checklist.md +0 -267
  325. package/.agents/skills/typescript-react-reviewer/references/react19-patterns.md +0 -305
  326. package/.agents/skills/workflow-creator/SKILL.md +0 -553
  327. package/.agents/skills/workflow-creator/references/agent-sessions.md +0 -891
  328. package/.agents/skills/workflow-creator/references/agent-setup-recipe.md +0 -266
  329. package/.agents/skills/workflow-creator/references/computation-and-validation.md +0 -201
  330. package/.agents/skills/workflow-creator/references/control-flow.md +0 -470
  331. package/.agents/skills/workflow-creator/references/failure-modes.md +0 -1014
  332. package/.agents/skills/workflow-creator/references/getting-started.md +0 -392
  333. package/.agents/skills/workflow-creator/references/registry-and-validation.md +0 -141
  334. package/.agents/skills/workflow-creator/references/running-workflows.md +0 -418
  335. package/.agents/skills/workflow-creator/references/session-config.md +0 -431
  336. package/.agents/skills/workflow-creator/references/state-and-data-flow.md +0 -356
  337. package/.agents/skills/workflow-creator/references/user-input.md +0 -234
  338. package/.agents/skills/workflow-creator/references/workflow-inputs.md +0 -392
  339. package/.agents/skills/xlsx/LICENSE.txt +0 -30
  340. package/.agents/skills/xlsx/SKILL.md +0 -294
  341. package/.agents/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  342. package/.agents/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
  343. package/.agents/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
  344. package/.agents/skills/xlsx/scripts/office/pack.py +0 -159
  345. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  346. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  347. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  348. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  349. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  350. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  351. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  352. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  353. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  354. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  355. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  356. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  357. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  358. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  359. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  360. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  361. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  362. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  363. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  364. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  365. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  366. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  367. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  368. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  369. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  370. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  371. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  372. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  373. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  374. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  375. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  376. package/.agents/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
  377. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  378. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  379. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  380. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  381. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  382. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  383. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  384. package/.agents/skills/xlsx/scripts/office/soffice.py +0 -183
  385. package/.agents/skills/xlsx/scripts/office/unpack.py +0 -132
  386. package/.agents/skills/xlsx/scripts/office/validate.py +0 -111
  387. package/.agents/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  388. package/.agents/skills/xlsx/scripts/office/validators/base.py +0 -847
  389. package/.agents/skills/xlsx/scripts/office/validators/docx.py +0 -446
  390. package/.agents/skills/xlsx/scripts/office/validators/pptx.py +0 -275
  391. package/.agents/skills/xlsx/scripts/office/validators/redlining.py +0 -247
  392. package/.agents/skills/xlsx/scripts/recalc.py +0 -184
  393. package/.claude/agents/code-simplifier.md +0 -52
  394. package/.claude/agents/codebase-analyzer.md +0 -166
  395. package/.claude/agents/codebase-locator.md +0 -122
  396. package/.claude/agents/codebase-online-researcher.md +0 -148
  397. package/.claude/agents/codebase-pattern-finder.md +0 -247
  398. package/.claude/agents/codebase-research-analyzer.md +0 -179
  399. package/.claude/agents/codebase-research-locator.md +0 -145
  400. package/.claude/agents/debugger.md +0 -91
  401. package/.claude/agents/orchestrator.md +0 -19
  402. package/.claude/agents/planner.md +0 -295
  403. package/.claude/agents/reviewer.md +0 -98
  404. package/.claude/agents/worker.md +0 -165
  405. package/.claude/settings.json +0 -27
  406. package/.github/agents/code-simplifier.md +0 -52
  407. package/.github/agents/codebase-analyzer.md +0 -166
  408. package/.github/agents/codebase-locator.md +0 -122
  409. package/.github/agents/codebase-online-researcher.md +0 -146
  410. package/.github/agents/codebase-pattern-finder.md +0 -247
  411. package/.github/agents/codebase-research-analyzer.md +0 -179
  412. package/.github/agents/codebase-research-locator.md +0 -145
  413. package/.github/agents/debugger.md +0 -98
  414. package/.github/agents/orchestrator.md +0 -27
  415. package/.github/agents/planner.md +0 -305
  416. package/.github/agents/reviewer.md +0 -95
  417. package/.github/agents/worker.md +0 -237
  418. package/.github/lsp.json +0 -93
  419. package/.mcp.json +0 -20
  420. package/.opencode/agents/code-simplifier.md +0 -62
  421. package/.opencode/agents/codebase-analyzer.md +0 -171
  422. package/.opencode/agents/codebase-locator.md +0 -127
  423. package/.opencode/agents/codebase-online-researcher.md +0 -152
  424. package/.opencode/agents/codebase-pattern-finder.md +0 -252
  425. package/.opencode/agents/codebase-research-analyzer.md +0 -183
  426. package/.opencode/agents/codebase-research-locator.md +0 -149
  427. package/.opencode/agents/debugger.md +0 -99
  428. package/.opencode/agents/orchestrator.md +0 -27
  429. package/.opencode/agents/planner.md +0 -309
  430. package/.opencode/agents/reviewer.md +0 -103
  431. package/.opencode/agents/worker.md +0 -165
  432. package/.opencode/opencode.json +0 -25
  433. package/README.md +0 -1624
  434. package/assets/settings.schema.json +0 -51
  435. package/dist/commands/cli/claude-inflight-hook.d.ts +0 -100
  436. package/dist/commands/cli/claude-inflight-hook.d.ts.map +0 -1
  437. package/dist/commands/cli/claude-stop-hook.d.ts +0 -80
  438. package/dist/commands/cli/claude-stop-hook.d.ts.map +0 -1
  439. package/dist/lib/atomic-temp.d.ts +0 -8
  440. package/dist/lib/atomic-temp.d.ts.map +0 -1
  441. package/dist/lib/path-root-guard.d.ts +0 -4
  442. package/dist/lib/path-root-guard.d.ts.map +0 -1
  443. package/dist/lib/spawn.d.ts +0 -102
  444. package/dist/lib/spawn.d.ts.map +0 -1
  445. package/dist/lib/terminal-env.d.ts +0 -9
  446. package/dist/lib/terminal-env.d.ts.map +0 -1
  447. package/dist/sdk/components/attached-statusline.d.ts +0 -26
  448. package/dist/sdk/components/attached-statusline.d.ts.map +0 -1
  449. package/dist/sdk/components/color-utils.d.ts +0 -4
  450. package/dist/sdk/components/color-utils.d.ts.map +0 -1
  451. package/dist/sdk/components/compact-switcher.d.ts +0 -10
  452. package/dist/sdk/components/compact-switcher.d.ts.map +0 -1
  453. package/dist/sdk/components/connectors.d.ts +0 -16
  454. package/dist/sdk/components/connectors.d.ts.map +0 -1
  455. package/dist/sdk/components/edge.d.ts +0 -4
  456. package/dist/sdk/components/edge.d.ts.map +0 -1
  457. package/dist/sdk/components/error-boundary.d.ts +0 -23
  458. package/dist/sdk/components/error-boundary.d.ts.map +0 -1
  459. package/dist/sdk/components/graph-theme.d.ts +0 -18
  460. package/dist/sdk/components/graph-theme.d.ts.map +0 -1
  461. package/dist/sdk/components/header.d.ts +0 -3
  462. package/dist/sdk/components/header.d.ts.map +0 -1
  463. package/dist/sdk/components/hooks.d.ts +0 -15
  464. package/dist/sdk/components/hooks.d.ts.map +0 -1
  465. package/dist/sdk/components/layout.d.ts +0 -27
  466. package/dist/sdk/components/layout.d.ts.map +0 -1
  467. package/dist/sdk/components/node-card.d.ts +0 -10
  468. package/dist/sdk/components/node-card.d.ts.map +0 -1
  469. package/dist/sdk/components/orchestrator-panel-contexts.d.ts +0 -16
  470. package/dist/sdk/components/orchestrator-panel-contexts.d.ts.map +0 -1
  471. package/dist/sdk/components/orchestrator-panel-store.d.ts +0 -52
  472. package/dist/sdk/components/orchestrator-panel-store.d.ts.map +0 -1
  473. package/dist/sdk/components/orchestrator-panel-types.d.ts +0 -18
  474. package/dist/sdk/components/orchestrator-panel-types.d.ts.map +0 -1
  475. package/dist/sdk/components/orchestrator-panel.d.ts +0 -86
  476. package/dist/sdk/components/orchestrator-panel.d.ts.map +0 -1
  477. package/dist/sdk/components/renderer-background.d.ts +0 -9
  478. package/dist/sdk/components/renderer-background.d.ts.map +0 -1
  479. package/dist/sdk/components/session-graph-panel.d.ts +0 -7
  480. package/dist/sdk/components/session-graph-panel.d.ts.map +0 -1
  481. package/dist/sdk/components/status-helpers.d.ts +0 -6
  482. package/dist/sdk/components/status-helpers.d.ts.map +0 -1
  483. package/dist/sdk/components/statusline.d.ts +0 -5
  484. package/dist/sdk/components/statusline.d.ts.map +0 -1
  485. package/dist/sdk/components/tui-diagnostics.d.ts +0 -56
  486. package/dist/sdk/components/tui-diagnostics.d.ts.map +0 -1
  487. package/dist/sdk/components/workflow-picker-panel.d.ts +0 -126
  488. package/dist/sdk/components/workflow-picker-panel.d.ts.map +0 -1
  489. package/dist/sdk/define-workflow.d.ts +0 -107
  490. package/dist/sdk/define-workflow.d.ts.map +0 -1
  491. package/dist/sdk/errors.d.ts +0 -46
  492. package/dist/sdk/errors.d.ts.map +0 -1
  493. package/dist/sdk/index.d.ts +0 -26
  494. package/dist/sdk/index.d.ts.map +0 -1
  495. package/dist/sdk/primitives/inputs.d.ts +0 -36
  496. package/dist/sdk/primitives/inputs.d.ts.map +0 -1
  497. package/dist/sdk/primitives/metadata.d.ts +0 -40
  498. package/dist/sdk/primitives/metadata.d.ts.map +0 -1
  499. package/dist/sdk/primitives/run.d.ts +0 -57
  500. package/dist/sdk/primitives/run.d.ts.map +0 -1
  501. package/dist/sdk/primitives/sessions.d.ts +0 -128
  502. package/dist/sdk/primitives/sessions.d.ts.map +0 -1
  503. package/dist/sdk/providers/claude.d.ts +0 -392
  504. package/dist/sdk/providers/claude.d.ts.map +0 -1
  505. package/dist/sdk/providers/copilot.d.ts +0 -55
  506. package/dist/sdk/providers/copilot.d.ts.map +0 -1
  507. package/dist/sdk/providers/opencode.d.ts +0 -27
  508. package/dist/sdk/providers/opencode.d.ts.map +0 -1
  509. package/dist/sdk/registry.d.ts +0 -27
  510. package/dist/sdk/registry.d.ts.map +0 -1
  511. package/dist/sdk/runtime/attached-footer.d.ts +0 -31
  512. package/dist/sdk/runtime/attached-footer.d.ts.map +0 -1
  513. package/dist/sdk/runtime/cc-debounce.d.ts +0 -29
  514. package/dist/sdk/runtime/cc-debounce.d.ts.map +0 -1
  515. package/dist/sdk/runtime/executor-env.d.ts +0 -20
  516. package/dist/sdk/runtime/executor-env.d.ts.map +0 -1
  517. package/dist/sdk/runtime/executor.d.ts +0 -265
  518. package/dist/sdk/runtime/executor.d.ts.map +0 -1
  519. package/dist/sdk/runtime/graph-inference.d.ts +0 -35
  520. package/dist/sdk/runtime/graph-inference.d.ts.map +0 -1
  521. package/dist/sdk/runtime/orchestrator-entry.d.ts +0 -26
  522. package/dist/sdk/runtime/orchestrator-entry.d.ts.map +0 -1
  523. package/dist/sdk/runtime/panel.d.ts +0 -9
  524. package/dist/sdk/runtime/panel.d.ts.map +0 -1
  525. package/dist/sdk/runtime/port-discovery.d.ts +0 -71
  526. package/dist/sdk/runtime/port-discovery.d.ts.map +0 -1
  527. package/dist/sdk/runtime/status-writer.d.ts +0 -101
  528. package/dist/sdk/runtime/status-writer.d.ts.map +0 -1
  529. package/dist/sdk/runtime/theme.d.ts +0 -33
  530. package/dist/sdk/runtime/theme.d.ts.map +0 -1
  531. package/dist/sdk/runtime/tmux.d.ts +0 -307
  532. package/dist/sdk/runtime/tmux.d.ts.map +0 -1
  533. package/dist/sdk/runtime/version-compat.d.ts +0 -28
  534. package/dist/sdk/runtime/version-compat.d.ts.map +0 -1
  535. package/dist/sdk/types.d.ts +0 -435
  536. package/dist/sdk/types.d.ts.map +0 -1
  537. package/dist/sdk/worker-shared.d.ts +0 -42
  538. package/dist/sdk/worker-shared.d.ts.map +0 -1
  539. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts +0 -81
  540. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +0 -1
  541. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts +0 -37
  542. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +0 -1
  543. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/batching.d.ts +0 -43
  544. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/batching.d.ts.map +0 -1
  545. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts +0 -14
  546. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts.map +0 -1
  547. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts +0 -136
  548. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts.map +0 -1
  549. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts +0 -58
  550. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts.map +0 -1
  551. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scratch.d.ts +0 -43
  552. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scratch.d.ts.map +0 -1
  553. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts +0 -37
  554. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +0 -1
  555. package/dist/sdk/workflows/builtin/open-claude-design/claude/index.d.ts +0 -68
  556. package/dist/sdk/workflows/builtin/open-claude-design/claude/index.d.ts.map +0 -1
  557. package/dist/sdk/workflows/builtin/open-claude-design/copilot/index.d.ts +0 -56
  558. package/dist/sdk/workflows/builtin/open-claude-design/copilot/index.d.ts.map +0 -1
  559. package/dist/sdk/workflows/builtin/open-claude-design/helpers/constants.d.ts +0 -72
  560. package/dist/sdk/workflows/builtin/open-claude-design/helpers/constants.d.ts.map +0 -1
  561. package/dist/sdk/workflows/builtin/open-claude-design/helpers/design-system.d.ts +0 -46
  562. package/dist/sdk/workflows/builtin/open-claude-design/helpers/design-system.d.ts.map +0 -1
  563. package/dist/sdk/workflows/builtin/open-claude-design/helpers/export.d.ts +0 -32
  564. package/dist/sdk/workflows/builtin/open-claude-design/helpers/export.d.ts.map +0 -1
  565. package/dist/sdk/workflows/builtin/open-claude-design/helpers/import.d.ts +0 -33
  566. package/dist/sdk/workflows/builtin/open-claude-design/helpers/import.d.ts.map +0 -1
  567. package/dist/sdk/workflows/builtin/open-claude-design/helpers/prompts.d.ts +0 -106
  568. package/dist/sdk/workflows/builtin/open-claude-design/helpers/prompts.d.ts.map +0 -1
  569. package/dist/sdk/workflows/builtin/open-claude-design/helpers/scan.d.ts +0 -50
  570. package/dist/sdk/workflows/builtin/open-claude-design/helpers/scan.d.ts.map +0 -1
  571. package/dist/sdk/workflows/builtin/open-claude-design/helpers/validation.d.ts +0 -12
  572. package/dist/sdk/workflows/builtin/open-claude-design/helpers/validation.d.ts.map +0 -1
  573. package/dist/sdk/workflows/builtin/open-claude-design/opencode/index.d.ts +0 -58
  574. package/dist/sdk/workflows/builtin/open-claude-design/opencode/index.d.ts.map +0 -1
  575. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts +0 -37
  576. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts.map +0 -1
  577. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts +0 -34
  578. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts.map +0 -1
  579. package/dist/sdk/workflows/builtin/ralph/helpers/copilot-reviewer.d.ts +0 -25
  580. package/dist/sdk/workflows/builtin/ralph/helpers/copilot-reviewer.d.ts.map +0 -1
  581. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts +0 -69
  582. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts.map +0 -1
  583. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts +0 -266
  584. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts.map +0 -1
  585. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts +0 -24
  586. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts.map +0 -1
  587. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts +0 -33
  588. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts.map +0 -1
  589. package/dist/sdk/workflows/index.d.ts +0 -32
  590. package/dist/sdk/workflows/index.d.ts.map +0 -1
  591. package/dist/services/config/additional-instructions.d.ts +0 -67
  592. package/dist/services/config/additional-instructions.d.ts.map +0 -1
  593. package/dist/services/config/atomic-config.d.ts +0 -42
  594. package/dist/services/config/atomic-config.d.ts.map +0 -1
  595. package/dist/services/config/definitions.d.ts +0 -52
  596. package/dist/services/config/definitions.d.ts.map +0 -1
  597. package/dist/services/config/index.d.ts +0 -7
  598. package/dist/services/config/index.d.ts.map +0 -1
  599. package/dist/services/config/scm-sync.d.ts +0 -37
  600. package/dist/services/config/scm-sync.d.ts.map +0 -1
  601. package/dist/services/config/settings-schema.d.ts +0 -2
  602. package/dist/services/config/settings-schema.d.ts.map +0 -1
  603. package/dist/services/system/copy.d.ts +0 -84
  604. package/dist/services/system/copy.d.ts.map +0 -1
  605. package/dist/services/system/detect.d.ts +0 -75
  606. package/dist/services/system/detect.d.ts.map +0 -1
  607. package/dist/theme/colors.d.ts +0 -35
  608. package/dist/theme/colors.d.ts.map +0 -1
  609. package/src/cli.ts +0 -397
  610. package/src/commands/builtin-registry.ts +0 -37
  611. package/src/commands/cli/chat/index.test.ts +0 -252
  612. package/src/commands/cli/chat/index.ts +0 -430
  613. package/src/commands/cli/chat.ts +0 -8
  614. package/src/commands/cli/claude-ask-hook.test.ts +0 -128
  615. package/src/commands/cli/claude-ask-hook.ts +0 -84
  616. package/src/commands/cli/claude-inflight-hook.test.ts +0 -598
  617. package/src/commands/cli/claude-inflight-hook.ts +0 -359
  618. package/src/commands/cli/claude-session-start-hook.ts +0 -61
  619. package/src/commands/cli/claude-stop-hook.test.ts +0 -317
  620. package/src/commands/cli/claude-stop-hook.ts +0 -441
  621. package/src/commands/cli/completions.ts +0 -24
  622. package/src/commands/cli/config.ts +0 -80
  623. package/src/commands/cli/footer.tsx +0 -248
  624. package/src/commands/cli/init/index.ts +0 -41
  625. package/src/commands/cli/init/onboarding.ts +0 -61
  626. package/src/commands/cli/init.ts +0 -8
  627. package/src/commands/cli/management-commands.ts +0 -112
  628. package/src/commands/cli/session.test.ts +0 -830
  629. package/src/commands/cli/session.ts +0 -447
  630. package/src/commands/cli/workflow-command.test.ts +0 -618
  631. package/src/commands/cli/workflow-inputs.test.ts +0 -353
  632. package/src/commands/cli/workflow-inputs.ts +0 -266
  633. package/src/commands/cli/workflow-list.test.ts +0 -235
  634. package/src/commands/cli/workflow-list.ts +0 -0
  635. package/src/commands/cli/workflow-status.test.ts +0 -451
  636. package/src/commands/cli/workflow-status.ts +0 -330
  637. package/src/commands/cli/workflow.ts +0 -196
  638. package/src/completions/bash.ts +0 -102
  639. package/src/completions/fish.ts +0 -136
  640. package/src/completions/index.ts +0 -7
  641. package/src/completions/powershell.ts +0 -195
  642. package/src/completions/zsh.ts +0 -150
  643. package/src/lib/atomic-temp.test.ts +0 -86
  644. package/src/lib/atomic-temp.ts +0 -62
  645. package/src/lib/common-ignore.ts +0 -46
  646. package/src/lib/merge.ts +0 -103
  647. package/src/lib/path-root-guard.ts +0 -38
  648. package/src/lib/spawn.test.ts +0 -109
  649. package/src/lib/spawn.ts +0 -678
  650. package/src/lib/terminal-env.test.ts +0 -343
  651. package/src/lib/terminal-env.ts +0 -100
  652. package/src/scripts/bump-version.ts +0 -94
  653. package/src/scripts/bundle-configs.ts +0 -116
  654. package/src/scripts/clean-dist.test.ts +0 -53
  655. package/src/scripts/clean-dist.ts +0 -37
  656. package/src/scripts/constants-base.ts +0 -14
  657. package/src/scripts/constants.ts +0 -35
  658. package/src/sdk/components/attached-statusline.tsx +0 -86
  659. package/src/sdk/components/color-utils.ts +0 -20
  660. package/src/sdk/components/compact-switcher.tsx +0 -78
  661. package/src/sdk/components/connectors.test.ts +0 -707
  662. package/src/sdk/components/connectors.ts +0 -160
  663. package/src/sdk/components/edge.tsx +0 -13
  664. package/src/sdk/components/error-boundary.tsx +0 -38
  665. package/src/sdk/components/graph-theme.ts +0 -37
  666. package/src/sdk/components/header.tsx +0 -85
  667. package/src/sdk/components/hooks.ts +0 -21
  668. package/src/sdk/components/layout.test.ts +0 -1245
  669. package/src/sdk/components/layout.ts +0 -223
  670. package/src/sdk/components/node-card.tsx +0 -91
  671. package/src/sdk/components/orchestrator-panel-contexts.ts +0 -35
  672. package/src/sdk/components/orchestrator-panel-store.test.ts +0 -847
  673. package/src/sdk/components/orchestrator-panel-store.ts +0 -187
  674. package/src/sdk/components/orchestrator-panel-types.ts +0 -23
  675. package/src/sdk/components/orchestrator-panel.tsx +0 -262
  676. package/src/sdk/components/renderer-background.ts +0 -49
  677. package/src/sdk/components/session-graph-panel.tsx +0 -471
  678. package/src/sdk/components/status-helpers.ts +0 -33
  679. package/src/sdk/components/statusline.tsx +0 -68
  680. package/src/sdk/components/tui-diagnostics.ts +0 -273
  681. package/src/sdk/components/workflow-picker-panel.tsx +0 -1613
  682. package/src/sdk/define-workflow.test.ts +0 -354
  683. package/src/sdk/define-workflow.ts +0 -275
  684. package/src/sdk/errors.test.ts +0 -83
  685. package/src/sdk/errors.ts +0 -77
  686. package/src/sdk/index.test.ts +0 -92
  687. package/src/sdk/index.ts +0 -101
  688. package/src/sdk/primitives/inputs.ts +0 -48
  689. package/src/sdk/primitives/metadata.ts +0 -63
  690. package/src/sdk/primitives/run.ts +0 -81
  691. package/src/sdk/primitives/sessions.test.ts +0 -594
  692. package/src/sdk/primitives/sessions.ts +0 -328
  693. package/src/sdk/providers/claude.ts +0 -1450
  694. package/src/sdk/providers/copilot.test.ts +0 -365
  695. package/src/sdk/providers/copilot.ts +0 -185
  696. package/src/sdk/providers/headless-hil-policy.test.ts +0 -211
  697. package/src/sdk/providers/opencode.ts +0 -88
  698. package/src/sdk/registry.ts +0 -132
  699. package/src/sdk/runtime/attached-footer.ts +0 -155
  700. package/src/sdk/runtime/cc-debounce.ts +0 -104
  701. package/src/sdk/runtime/executor-env.ts +0 -45
  702. package/src/sdk/runtime/executor.test.ts +0 -1321
  703. package/src/sdk/runtime/executor.ts +0 -2136
  704. package/src/sdk/runtime/graph-inference.ts +0 -50
  705. package/src/sdk/runtime/orchestrator-entry.ts +0 -110
  706. package/src/sdk/runtime/panel.tsx +0 -9
  707. package/src/sdk/runtime/port-discovery.test.ts +0 -573
  708. package/src/sdk/runtime/port-discovery.ts +0 -496
  709. package/src/sdk/runtime/status-writer.test.ts +0 -245
  710. package/src/sdk/runtime/status-writer.ts +0 -201
  711. package/src/sdk/runtime/theme.ts +0 -71
  712. package/src/sdk/runtime/tmux.conf +0 -112
  713. package/src/sdk/runtime/tmux.ts +0 -785
  714. package/src/sdk/runtime/version-compat.ts +0 -68
  715. package/src/sdk/types.ts +0 -548
  716. package/src/sdk/worker-shared.test.ts +0 -163
  717. package/src/sdk/worker-shared.ts +0 -155
  718. package/src/sdk/workflows/builtin/deep-research-codebase/claude/index.ts +0 -569
  719. package/src/sdk/workflows/builtin/deep-research-codebase/copilot/index.ts +0 -481
  720. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/batching.ts +0 -65
  721. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.ts +0 -24
  722. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/ignore-by-default.d.ts +0 -8
  723. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.ts +0 -958
  724. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/scout.ts +0 -505
  725. package/src/sdk/workflows/builtin/deep-research-codebase/helpers/scratch.ts +0 -115
  726. package/src/sdk/workflows/builtin/deep-research-codebase/opencode/index.ts +0 -530
  727. package/src/sdk/workflows/builtin/open-claude-design/claude/index.ts +0 -500
  728. package/src/sdk/workflows/builtin/open-claude-design/copilot/index.ts +0 -508
  729. package/src/sdk/workflows/builtin/open-claude-design/helpers/constants.ts +0 -159
  730. package/src/sdk/workflows/builtin/open-claude-design/helpers/design-system.ts +0 -88
  731. package/src/sdk/workflows/builtin/open-claude-design/helpers/export.ts +0 -193
  732. package/src/sdk/workflows/builtin/open-claude-design/helpers/import.ts +0 -52
  733. package/src/sdk/workflows/builtin/open-claude-design/helpers/prompts.ts +0 -1110
  734. package/src/sdk/workflows/builtin/open-claude-design/helpers/scan.ts +0 -117
  735. package/src/sdk/workflows/builtin/open-claude-design/helpers/validation.ts +0 -38
  736. package/src/sdk/workflows/builtin/open-claude-design/opencode/index.ts +0 -610
  737. package/src/sdk/workflows/builtin/ralph/claude/index.ts +0 -272
  738. package/src/sdk/workflows/builtin/ralph/copilot/index.ts +0 -298
  739. package/src/sdk/workflows/builtin/ralph/helpers/copilot-reviewer.ts +0 -105
  740. package/src/sdk/workflows/builtin/ralph/helpers/git.ts +0 -201
  741. package/src/sdk/workflows/builtin/ralph/helpers/prompts.ts +0 -1108
  742. package/src/sdk/workflows/builtin/ralph/helpers/review.ts +0 -33
  743. package/src/sdk/workflows/builtin/ralph/opencode/index.ts +0 -290
  744. package/src/sdk/workflows/index.ts +0 -116
  745. package/src/services/config/additional-instructions.ts +0 -273
  746. package/src/services/config/atomic-config.ts +0 -210
  747. package/src/services/config/atomic-global-config.ts +0 -348
  748. package/src/services/config/config-path.ts +0 -19
  749. package/src/services/config/definitions.ts +0 -125
  750. package/src/services/config/index.ts +0 -7
  751. package/src/services/config/scm-sync.ts +0 -185
  752. package/src/services/config/settings-schema.ts +0 -2
  753. package/src/services/config/settings.ts +0 -144
  754. package/src/services/system/agents.ts +0 -95
  755. package/src/services/system/auth.test.ts +0 -343
  756. package/src/services/system/auth.ts +0 -140
  757. package/src/services/system/auto-sync.ts +0 -128
  758. package/src/services/system/copy.ts +0 -392
  759. package/src/services/system/detect.ts +0 -161
  760. package/src/services/system/file-lock.ts +0 -289
  761. package/src/services/system/install-ui.ts +0 -296
  762. package/src/services/system/skills.ts +0 -58
  763. package/src/theme/colors.ts +0 -96
  764. package/src/theme/logo.ts +0 -123
  765. package/src/version.ts +0 -7
@@ -1,253 +0,0 @@
1
- ---
2
- name: evaluation
3
- description: This skill should be used when the user asks to "evaluate agent performance", "build test framework", "measure agent quality", "create evaluation rubrics", or mentions LLM-as-judge, multi-dimensional evaluation, agent testing, or quality gates for agent pipelines. Part of the context engineering skill suite — also activates when the user mentions "context engineering" or "context-engineering" in the context of measuring agent effectiveness.
4
- metadata:
5
- provider: atomic
6
- ---
7
-
8
- # Evaluation Methods for Agent Systems
9
-
10
- Evaluate agent systems differently from traditional software because agents make dynamic decisions, are non-deterministic between runs, and often lack single correct answers. Build evaluation frameworks that account for these characteristics, provide actionable feedback, catch regressions, and validate that context engineering choices achieve intended effects.
11
-
12
- ## When to Activate
13
-
14
- Activate this skill when:
15
- - Testing agent performance systematically
16
- - Validating context engineering choices
17
- - Measuring improvements over time
18
- - Catching regressions before deployment
19
- - Building quality gates for agent pipelines
20
- - Comparing different agent configurations
21
- - Evaluating production systems continuously
22
-
23
- ## Core Concepts
24
-
25
- Focus evaluation on outcomes rather than execution paths, because agents may find alternative valid routes to goals. Judge whether the agent achieves the right outcome via a reasonable process, not whether it followed a specific sequence of steps.
26
-
27
- Use multi-dimensional rubrics instead of single scores because one number hides critical failures in specific dimensions. Capture factual accuracy, completeness, citation accuracy, source quality, and tool efficiency as separate dimensions, then weight them for the use case.
28
-
29
- Deploy LLM-as-judge for scalable evaluation across large test sets while supplementing with human review to catch edge cases, hallucinations, and subtle biases that automated evaluation misses.
30
-
31
- **Performance Drivers: The 95% Finding**
32
-
33
- Apply the BrowseComp research finding when designing evaluation budgets: three factors explain 95% of browsing agent performance variance.
34
-
35
- | Factor | Variance Explained | Implication |
36
- |--------|-------------------|-------------|
37
- | Token usage | 80% | More tokens = better performance |
38
- | Number of tool calls | ~10% | More exploration helps |
39
- | Model choice | ~5% | Better models multiply efficiency |
40
-
41
- Act on these implications when designing evaluations:
42
- - **Set realistic token budgets**: Evaluate agents with production-realistic token limits, not unlimited resources, because token usage drives 80% of variance.
43
- - **Prioritize model upgrades over token increases**: Upgrading model versions provides larger gains than doubling token budgets on previous versions because better models use tokens more efficiently.
44
- - **Validate multi-agent architectures**: The finding supports distributing work across agents with separate context windows, so evaluate multi-agent setups against single-agent baselines.
45
-
46
- ## Detailed Topics
47
-
48
- ### Evaluation Challenges
49
-
50
- **Handle Non-Determinism and Multiple Valid Paths**
51
-
52
- Design evaluations that tolerate path variation because agents may take completely different valid paths to reach goals. One agent might search three sources while another searches ten; both may produce correct answers. Avoid checking for specific steps. Instead, define outcome criteria (correctness, completeness, quality) and score against those, treating the execution path as informational rather than evaluative.
53
-
54
- **Test Context-Dependent Failures**
55
-
56
- Evaluate across a range of complexity levels and interaction lengths because agent failures often depend on context in subtle ways. An agent might succeed on simple queries but fail on complex ones, work well with one tool set but fail with another, or degrade after extended interaction as context accumulates. Include simple, medium, complex, and very complex test cases to surface these patterns.
57
-
58
- **Score Composite Quality Dimensions Separately**
59
-
60
- Break agent quality into separate dimensions (factual accuracy, completeness, coherence, tool efficiency, process quality) and score each independently because an agent might score high on accuracy but low on efficiency, or vice versa. Then compute weighted aggregates tuned to use-case priorities. This approach reveals which dimensions need improvement rather than averaging away the signal.
61
-
62
- ### Evaluation Rubric Design
63
-
64
- **Build Multi-Dimensional Rubrics**
65
-
66
- Define rubrics covering key dimensions with descriptive levels from excellent to failed. Include these core dimensions and adapt weights per use case:
67
-
68
- - Factual accuracy: Claims match ground truth (weight heavily for knowledge tasks)
69
- - Completeness: Output covers requested aspects (weight heavily for research tasks)
70
- - Citation accuracy: Citations match claimed sources (weight for trust-sensitive contexts)
71
- - Source quality: Uses appropriate primary sources (weight for authoritative outputs)
72
- - Tool efficiency: Uses right tools a reasonable number of times (weight for cost-sensitive systems)
73
-
74
- **Convert Rubrics to Numeric Scores**
75
-
76
- Map dimension assessments to numeric scores (0.0 to 1.0), apply per-dimension weights, and calculate weighted overall scores. Set passing thresholds based on use-case requirements, typically 0.7 for general use and 0.9 for high-stakes applications. Store individual dimension scores alongside the aggregate because the breakdown drives targeted improvement.
77
-
78
- ### Evaluation Methodologies
79
-
80
- **Use LLM-as-Judge for Scale**
81
-
82
- Build LLM-based evaluation prompts that include: clear task description, the agent output under test, ground truth when available, an evaluation scale with explicit level descriptions, and a request for structured judgment with reasoning. LLM judges provide consistent, scalable evaluation across large test sets. Use a different model family than the agent being evaluated to avoid self-enhancement bias.
83
-
84
- **Supplement with Human Evaluation**
85
-
86
- Route edge cases, unusual queries, and a random sample of production traffic to human reviewers because humans notice hallucinated answers, system failures, and subtle biases that automated evaluation misses. Track patterns across human reviews to identify systematic issues and feed findings back into automated evaluation criteria.
87
-
88
- **Apply End-State Evaluation for Stateful Agents**
89
-
90
- For agents that mutate persistent state (files, databases, configurations), evaluate whether the final state matches expectations rather than how the agent got there. Define expected end-state assertions and verify them programmatically after each test run.
91
-
92
- ### Test Set Design
93
-
94
- **Select Representative Samples**
95
-
96
- Start with small samples (20-30 cases) during early development when changes have dramatic impacts and low-hanging fruit is abundant. Scale to 50+ cases for reliable signal as the system matures. Sample from real usage patterns, add known edge cases, and ensure coverage across complexity levels.
97
-
98
- **Stratify by Complexity**
99
-
100
- Structure test sets across complexity levels to prevent easy examples from inflating scores:
101
- - Simple: single tool call, factual lookup
102
- - Medium: multiple tool calls, comparison logic
103
- - Complex: many tool calls, significant ambiguity
104
- - Very complex: extended interaction, deep reasoning, synthesis
105
-
106
- Report scores per stratum alongside overall scores to reveal where the agent actually struggles.
107
-
108
- ### Context Engineering Evaluation
109
-
110
- **Validate Context Strategies Systematically**
111
-
112
- Run agents with different context strategies on the same test set and compare quality scores, token usage, and efficiency metrics. This isolates the effect of context engineering from other variables and prevents anecdote-driven decisions.
113
-
114
- **Run Degradation Tests**
115
-
116
- Test how context degradation affects performance by running agents at different context sizes. Identify performance cliffs where context becomes problematic and establish safe operating limits. Feed these limits back into context management strategies.
117
-
118
- ### Continuous Evaluation
119
-
120
- **Build Automated Evaluation Pipelines**
121
-
122
- Integrate evaluation into the development workflow so evaluations run automatically on agent changes. Track results over time, compare versions, and block deployments that regress on key metrics.
123
-
124
- **Monitor Production Quality**
125
-
126
- Sample production interactions and evaluate them continuously. Set alerts for quality drops below warning (0.85 pass rate) and critical (0.70 pass rate) thresholds. Maintain dashboards showing trend analysis over time windows to detect gradual degradation.
127
-
128
- ## Practical Guidance
129
-
130
- ### Building Evaluation Frameworks
131
-
132
- Follow this sequence to build an evaluation framework, because skipping early steps leads to measurements that do not reflect real quality:
133
-
134
- 1. Define quality dimensions relevant to the use case before writing any evaluation code, because dimensions chosen later tend to reflect what is easy to measure rather than what matters.
135
- 2. Create rubrics with clear, descriptive level definitions so evaluators (human or LLM) produce consistent scores.
136
- 3. Build test sets from real usage patterns and edge cases, stratified by complexity, with at least 50 cases for reliable signal.
137
- 4. Implement automated evaluation pipelines that run on every significant change.
138
- 5. Establish baseline metrics before making changes so improvements can be measured against a known reference.
139
- 6. Run evaluations on all significant changes and compare against the baseline.
140
- 7. Track metrics over time for trend analysis because gradual degradation is harder to notice than sudden drops.
141
- 8. Supplement automated evaluation with human review on a regular cadence.
142
-
143
- ### Avoiding Evaluation Pitfalls
144
-
145
- Guard against these common failures that undermine evaluation reliability:
146
-
147
- - **Overfitting to specific paths**: Evaluate outcomes, not specific steps, because agents find novel valid paths.
148
- - **Ignoring edge cases**: Include diverse test scenarios covering the full complexity spectrum.
149
- - **Single-metric obsession**: Use multi-dimensional rubrics because a single score hides dimension-specific failures.
150
- - **Neglecting context effects**: Test with realistic context sizes and histories rather than clean-room conditions.
151
- - **Skipping human evaluation**: Automated evaluation misses subtle issues that humans catch reliably.
152
-
153
- ## Examples
154
-
155
- **Example 1: Simple Evaluation**
156
- ```python
157
- def evaluate_agent_response(response, expected):
158
- rubric = load_rubric()
159
- scores = {}
160
- for dimension, config in rubric.items():
161
- scores[dimension] = assess_dimension(response, expected, dimension)
162
- overall = weighted_average(scores, config["weights"])
163
- return {"passed": overall >= 0.7, "scores": scores}
164
- ```
165
-
166
- **Example 2: Test Set Structure**
167
-
168
- Test sets should span multiple complexity levels to ensure comprehensive evaluation:
169
-
170
- ```python
171
- test_set = [
172
- {
173
- "name": "simple_lookup",
174
- "input": "What is the capital of France?",
175
- "expected": {"type": "fact", "answer": "Paris"},
176
- "complexity": "simple",
177
- "description": "Single tool call, factual lookup"
178
- },
179
- {
180
- "name": "medium_query",
181
- "input": "Compare the revenue of Apple and Microsoft last quarter",
182
- "complexity": "medium",
183
- "description": "Multiple tool calls, comparison logic"
184
- },
185
- {
186
- "name": "multi_step_reasoning",
187
- "input": "Analyze sales data from Q1-Q4 and create a summary report with trends",
188
- "complexity": "complex",
189
- "description": "Many tool calls, aggregation, analysis"
190
- },
191
- {
192
- "name": "research_synthesis",
193
- "input": "Research emerging AI technologies, evaluate their potential impact, and recommend adoption strategy",
194
- "complexity": "very_complex",
195
- "description": "Extended interaction, deep reasoning, synthesis"
196
- }
197
- ]
198
- ```
199
-
200
- ## Guidelines
201
-
202
- 1. Use multi-dimensional rubrics, not single metrics
203
- 2. Evaluate outcomes, not specific execution paths
204
- 3. Cover complexity levels from simple to complex
205
- 4. Test with realistic context sizes and histories
206
- 5. Run evaluations continuously, not just before release
207
- 6. Supplement LLM evaluation with human review
208
- 7. Track metrics over time for trend detection
209
- 8. Set clear pass/fail thresholds based on use case
210
-
211
- ## Gotchas
212
-
213
- 1. **Overfitting evals to specific code paths**: Tests pass but the agent fails on slight input variations. Write eval criteria against outcomes and semantics, not surface patterns, and rotate test inputs periodically.
214
- 2. **LLM-judge self-enhancement bias**: Models rate their own outputs higher than independent judges do. Use a different model family as the evaluation judge than the model being evaluated.
215
- 3. **Test set contamination**: Eval examples leak into training data or prompt templates, inflating scores. Keep eval sets versioned and separate from any data used in prompts or fine-tuning.
216
- 4. **Metric gaming**: Optimizing for the metric rather than actual quality produces agents that score well but disappoint users. Cross-validate automated metrics against human judgments regularly.
217
- 5. **Single-dimension scoring**: One aggregate number hides critical failures in specific dimensions. Always report per-dimension scores alongside the overall score, and fail the eval if any single dimension falls below its minimum threshold.
218
- 6. **Eval set too small**: Fewer than 50 examples produces unreliable signal with high variance between runs. Scale the eval set to at least 50 cases and report confidence intervals.
219
- 7. **Not stratifying by difficulty**: Easy examples inflate overall scores, masking failures on hard cases. Report scores per complexity stratum and weight the overall score to prevent easy-case dominance.
220
- 8. **Treating eval as one-time**: Evaluation must be continuous, not a launch gate. Agent quality drifts as models update, tools change, and usage patterns evolve. Run evals on every change and on a regular production cadence.
221
-
222
- ## Integration
223
-
224
- This skill connects to all other skills as a cross-cutting concern:
225
-
226
- - context-fundamentals - Evaluating context usage
227
- - context-degradation - Detecting degradation
228
- - context-optimization - Measuring optimization effectiveness
229
- - multi-agent-patterns - Evaluating coordination
230
- - tool-design - Evaluating tool effectiveness
231
- - memory-systems - Evaluating memory quality
232
-
233
- ## References
234
-
235
- Internal reference:
236
- - [Metrics Reference](./references/metrics.md) - Read when: designing specific evaluation metrics, choosing scoring scales, or implementing weighted rubric calculations
237
-
238
- Internal skills:
239
- - All other skills connect to evaluation for quality measurement
240
-
241
- External resources:
242
- - LLM evaluation benchmarks - Read when: selecting or building benchmark suites for agent comparison
243
- - Agent evaluation research papers - Read when: adopting new evaluation methodologies or validating current approach
244
- - Production monitoring practices - Read when: setting up alerting, dashboards, or sampling strategies for live systems
245
-
246
- ---
247
-
248
- ## Skill Metadata
249
-
250
- **Created**: 2025-12-20
251
- **Last Updated**: 2026-03-17
252
- **Author**: Agent Skills for Context Engineering Contributors
253
- **Version**: 1.1.0
@@ -1,339 +0,0 @@
1
- # Evaluation Reference: Metrics and Implementation
2
-
3
- This document provides implementation details for evaluation metrics and evaluation systems.
4
-
5
- ## Core Metric Definitions
6
-
7
- ### Factual Accuracy
8
-
9
- Factual accuracy measures whether claims in agent output match ground truth.
10
-
11
- ```
12
- Excellent (1.0): All claims verified against ground truth, no errors
13
- Good (0.8): Minor errors that do not affect main conclusions
14
- Acceptable (0.6): Major claims correct, minor inaccuracies present
15
- Poor (0.3): Significant factual errors in key claims
16
- Failed (0.0): Fundamental factual errors that invalidate output
17
- ```
18
-
19
- Calculation approach:
20
- - Extract claims from output
21
- - Verify each claim against ground truth
22
- - Weight claims by importance (major claims more weight)
23
- - Calculate weighted average of claim accuracy
24
-
25
- ### Completeness
26
-
27
- Completeness measures whether output covers all requested aspects.
28
-
29
- ```
30
- Excellent (1.0): All requested aspects thoroughly covered
31
- Good (0.8): Most aspects covered with minor gaps
32
- Acceptable (0.6): Key aspects covered, some gaps
33
- Poor (0.3): Major aspects missing from output
34
- Failed (0.0): Fundamental aspects not addressed
35
- ```
36
-
37
- ### Citation Accuracy
38
-
39
- Citation accuracy measures whether cited sources match claimed sources.
40
-
41
- ```
42
- Excellent (1.0): All citations accurate and complete
43
- Good (0.8): Minor citation formatting issues
44
- Acceptable (0.6): Major citations accurate
45
- Poor (0.3): Significant citation problems
46
- Failed (0.0): Citations missing or completely incorrect
47
- ```
48
-
49
- ### Source Quality
50
-
51
- Source quality measures whether appropriate primary sources were used.
52
-
53
- ```
54
- Excellent (1.0): Primary authoritative sources
55
- Good (0.8): Mostly primary sources with some secondary
56
- Acceptable (0.6): Mix of primary and secondary sources
57
- Poor (0.3): Mostly secondary or unreliable sources
58
- Failed (0.0): No credible sources cited
59
- ```
60
-
61
- ### Tool Efficiency
62
-
63
- Tool efficiency measures whether the agent used appropriate tools a reasonable number of times.
64
-
65
- ```
66
- Excellent (1.0): Optimal tool selection and call count
67
- Good (0.8): Good tool selection with minor inefficiencies
68
- Acceptable (0.6): Appropriate tools with some redundancy
69
- Poor (0.3): Wrong tools or excessive call counts
70
- Failed (0.0): Severe tool misuse or extremely excessive calls
71
- ```
72
-
73
- ## Rubric Implementation
74
-
75
- ```python
76
- EVALUATION_DIMENSIONS = {
77
- "factual_accuracy": {
78
- "weight": 0.30,
79
- "description": "Claims match ground truth",
80
- "levels": {
81
- "excellent": 1.0,
82
- "good": 0.8,
83
- "acceptable": 0.6,
84
- "poor": 0.3,
85
- "failed": 0.0
86
- }
87
- },
88
- "completeness": {
89
- "weight": 0.25,
90
- "description": "All requested aspects covered",
91
- "levels": {
92
- "excellent": 1.0,
93
- "good": 0.8,
94
- "acceptable": 0.6,
95
- "poor": 0.3,
96
- "failed": 0.0
97
- }
98
- },
99
- "citation_accuracy": {
100
- "weight": 0.15,
101
- "description": "Citations match sources",
102
- "levels": {
103
- "excellent": 1.0,
104
- "good": 0.8,
105
- "acceptable": 0.6,
106
- "poor": 0.3,
107
- "failed": 0.0
108
- }
109
- },
110
- "source_quality": {
111
- "weight": 0.10,
112
- "description": "Appropriate primary sources used",
113
- "levels": {
114
- "excellent": 1.0,
115
- "good": 0.8,
116
- "acceptable": 0.6,
117
- "poor": 0.3,
118
- "failed": 0.0
119
- }
120
- },
121
- "tool_efficiency": {
122
- "weight": 0.20,
123
- "description": "Right tools used reasonably",
124
- "levels": {
125
- "excellent": 1.0,
126
- "good": 0.8,
127
- "acceptable": 0.6,
128
- "poor": 0.3,
129
- "failed": 0.0
130
- }
131
- }
132
- }
133
-
134
- def calculate_overall_score(dimension_scores, rubric):
135
- """Calculate weighted overall score from dimension scores."""
136
- total_weight = 0
137
- weighted_sum = 0
138
-
139
- for dimension, score in dimension_scores.items():
140
- if dimension in rubric:
141
- weight = rubric[dimension]["weight"]
142
- weighted_sum += score * weight
143
- total_weight += weight
144
-
145
- return weighted_sum / total_weight if total_weight > 0 else 0
146
- ```
147
-
148
- ## Test Set Management
149
-
150
- ```python
151
- class TestSet:
152
- def __init__(self, name):
153
- self.name = name
154
- self.tests = []
155
- self.tags = {}
156
-
157
- def add_test(self, test_case):
158
- """Add test case to test set."""
159
- self.tests.append(test_case)
160
-
161
- # Index by tags
162
- for tag in test_case.get("tags", []):
163
- if tag not in self.tags:
164
- self.tags[tag] = []
165
- self.tags[tag].append(len(self.tests) - 1)
166
-
167
- def filter(self, **criteria):
168
- """Filter tests by criteria."""
169
- filtered = []
170
- for test in self.tests:
171
- match = True
172
- for key, value in criteria.items():
173
- if test.get(key) != value:
174
- match = False
175
- break
176
- if match:
177
- filtered.append(test)
178
- return filtered
179
-
180
- def get_complexity_distribution(self):
181
- """Get distribution of tests by complexity."""
182
- distribution = {}
183
- for test in self.tests:
184
- complexity = test.get("complexity", "medium")
185
- distribution[complexity] = distribution.get(complexity, 0) + 1
186
- return distribution
187
- ```
188
-
189
- ## Evaluation Runner
190
-
191
- ```python
192
- class EvaluationRunner:
193
- def __init__(self, test_set, rubric, agent):
194
- self.test_set = test_set
195
- self.rubric = rubric
196
- self.agent = agent
197
- self.results = []
198
-
199
- def run_all(self, verbose=False):
200
- """Run evaluation on all tests."""
201
- self.results = []
202
-
203
- for i, test in enumerate(self.test_set.tests):
204
- if verbose:
205
- print(f"Running test {i+1}/{len(self.test_set.tests)}")
206
-
207
- result = self.run_test(test)
208
- self.results.append(result)
209
-
210
- return self.summarize()
211
-
212
- def run_test(self, test):
213
- """Run single evaluation test."""
214
- # Get agent output
215
- output = self.agent.run(test["input"])
216
-
217
- # Evaluate
218
- evaluation = self.evaluate_output(output, test)
219
-
220
- return {
221
- "test": test,
222
- "output": output,
223
- "evaluation": evaluation
224
- }
225
-
226
- def evaluate_output(self, output, test):
227
- """Evaluate agent output against test."""
228
- ground_truth = test.get("expected", {})
229
-
230
- dimension_scores = {}
231
- for dimension, config in self.rubric.items():
232
- score = self.evaluate_dimension(
233
- output, ground_truth, dimension, config
234
- )
235
- dimension_scores[dimension] = score
236
-
237
- overall = calculate_overall_score(dimension_scores, self.rubric)
238
-
239
- return {
240
- "overall_score": overall,
241
- "dimension_scores": dimension_scores,
242
- "passed": overall >= 0.7
243
- }
244
-
245
- def summarize(self):
246
- """Summarize evaluation results."""
247
- if not self.results:
248
- return {"error": "No results"}
249
-
250
- passed = sum(1 for r in self.results if r["evaluation"]["passed"])
251
-
252
- dimension_totals = {}
253
- for dimension in self.rubric.keys():
254
- dimension_totals[dimension] = {
255
- "total": 0,
256
- "count": 0
257
- }
258
-
259
- for result in self.results:
260
- for dimension, score in result["evaluation"]["dimension_scores"].items():
261
- if dimension in dimension_totals:
262
- dimension_totals[dimension]["total"] += score
263
- dimension_totals[dimension]["count"] += 1
264
-
265
- dimension_averages = {}
266
- for dimension, data in dimension_totals.items():
267
- if data["count"] > 0:
268
- dimension_averages[dimension] = data["total"] / data["count"]
269
-
270
- return {
271
- "total_tests": len(self.results),
272
- "passed": passed,
273
- "failed": len(self.results) - passed,
274
- "pass_rate": passed / len(self.results) if self.results else 0,
275
- "dimension_averages": dimension_averages,
276
- "failures": [
277
- r for r in self.results
278
- if not r["evaluation"]["passed"]
279
- ]
280
- }
281
- ```
282
-
283
- ## Production Monitoring
284
-
285
- ```python
286
- class ProductionMonitor:
287
- def __init__(self, sample_rate=0.01):
288
- self.sample_rate = sample_rate
289
- self.samples = []
290
- self.alert_thresholds = {
291
- "pass_rate_warning": 0.85,
292
- "pass_rate_critical": 0.70
293
- }
294
-
295
- def sample_and_evaluate(self, query, output):
296
- """Sample production interaction for evaluation."""
297
- if random.random() > self.sample_rate:
298
- return None
299
-
300
- evaluation = evaluate_output(output, {}, EVALUATION_RUBRIC)
301
-
302
- sample = {
303
- "query": query[:200],
304
- "output_preview": output[:200],
305
- "score": evaluation["overall_score"],
306
- "passed": evaluation["passed"],
307
- "timestamp": current_timestamp()
308
- }
309
-
310
- self.samples.append(sample)
311
- return sample
312
-
313
- def get_metrics(self):
314
- """Calculate current metrics from samples."""
315
- if not self.samples:
316
- return {"status": "insufficient_data"}
317
-
318
- passed = sum(1 for s in self.samples if s["passed"])
319
- pass_rate = passed / len(self.samples)
320
-
321
- avg_score = sum(s["score"] for s in self.samples) / len(self.samples)
322
-
323
- return {
324
- "sample_count": len(self.samples),
325
- "pass_rate": pass_rate,
326
- "average_score": avg_score,
327
- "status": self._get_status(pass_rate)
328
- }
329
-
330
- def _get_status(self, pass_rate):
331
- """Get status based on pass rate."""
332
- if pass_rate < self.alert_thresholds["pass_rate_critical"]:
333
- return "critical"
334
- elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
335
- return "warning"
336
- else:
337
- return "healthy"
338
- ```
339
-