@bastani/atomic 0.5.11 → 0.5.12-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (506)
  1. package/.agents/skills/adapt/SKILL.md +199 -0
  2. package/.agents/skills/advanced-evaluation/SKILL.md +402 -0
  3. package/.agents/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
  4. package/.agents/skills/advanced-evaluation/references/evaluation-pipeline.md +43 -0
  5. package/.agents/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
  6. package/.agents/skills/advanced-evaluation/references/metrics-guide.md +331 -0
  7. package/.agents/skills/advanced-evaluation/scripts/evaluation_example.py +392 -0
  8. package/.agents/skills/animate/SKILL.md +175 -0
  9. package/.agents/skills/arrange/SKILL.md +124 -0
  10. package/.agents/skills/audit/SKILL.md +148 -0
  11. package/.agents/skills/bdi-mental-states/SKILL.md +311 -0
  12. package/.agents/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
  13. package/.agents/skills/bdi-mental-states/references/framework-integration.md +582 -0
  14. package/.agents/skills/bdi-mental-states/references/rdf-examples.md +315 -0
  15. package/.agents/skills/bdi-mental-states/references/sparql-competency.md +420 -0
  16. package/.agents/skills/bolder/SKILL.md +117 -0
  17. package/.agents/skills/bun/SKILL.md +199 -0
  18. package/.agents/skills/clarify/SKILL.md +183 -0
  19. package/.agents/skills/colorize/SKILL.md +143 -0
  20. package/.agents/skills/context-compression/SKILL.md +272 -0
  21. package/.agents/skills/context-compression/references/evaluation-framework.md +213 -0
  22. package/.agents/skills/context-compression/scripts/compression_evaluator.py +862 -0
  23. package/.agents/skills/context-compression/tests/test_compression_evaluator.py +56 -0
  24. package/.agents/skills/context-degradation/SKILL.md +206 -0
  25. package/.agents/skills/context-degradation/references/patterns.md +314 -0
  26. package/.agents/skills/context-degradation/scripts/degradation_detector.py +614 -0
  27. package/.agents/skills/context-fundamentals/SKILL.md +201 -0
  28. package/.agents/skills/context-fundamentals/references/context-components.md +283 -0
  29. package/.agents/skills/context-fundamentals/scripts/context_manager.py +533 -0
  30. package/.agents/skills/context-optimization/SKILL.md +195 -0
  31. package/.agents/skills/context-optimization/references/optimization_techniques.md +272 -0
  32. package/.agents/skills/context-optimization/scripts/compaction.py +562 -0
  33. package/.agents/skills/create-spec/SKILL.md +244 -0
  34. package/.agents/skills/critique/SKILL.md +225 -0
  35. package/.agents/skills/critique/reference/cognitive-load.md +106 -0
  36. package/.agents/skills/critique/reference/heuristics-scoring.md +234 -0
  37. package/.agents/skills/critique/reference/personas.md +178 -0
  38. package/.agents/skills/delight/SKILL.md +304 -0
  39. package/.agents/skills/distill/SKILL.md +122 -0
  40. package/.agents/skills/docx/LICENSE.txt +30 -0
  41. package/.agents/skills/docx/SKILL.md +590 -0
  42. package/.agents/skills/docx/scripts/__init__.py +1 -0
  43. package/.agents/skills/docx/scripts/accept_changes.py +135 -0
  44. package/.agents/skills/docx/scripts/comment.py +318 -0
  45. package/.agents/skills/docx/scripts/office/helpers/__init__.py +0 -0
  46. package/.agents/skills/docx/scripts/office/helpers/merge_runs.py +199 -0
  47. package/.agents/skills/docx/scripts/office/helpers/simplify_redlines.py +197 -0
  48. package/.agents/skills/docx/scripts/office/pack.py +159 -0
  49. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  50. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  51. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  52. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  53. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  54. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  55. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  56. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  57. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  58. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  59. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  60. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  61. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  62. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  63. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  64. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  65. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  66. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  67. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  68. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  69. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  70. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  71. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  72. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  73. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  74. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  75. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  76. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  77. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  78. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  79. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  80. package/.agents/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
  81. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  82. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  83. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  84. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  85. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  86. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  87. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  88. package/.agents/skills/docx/scripts/office/soffice.py +183 -0
  89. package/.agents/skills/docx/scripts/office/unpack.py +132 -0
  90. package/.agents/skills/docx/scripts/office/validate.py +111 -0
  91. package/.agents/skills/docx/scripts/office/validators/__init__.py +15 -0
  92. package/.agents/skills/docx/scripts/office/validators/base.py +847 -0
  93. package/.agents/skills/docx/scripts/office/validators/docx.py +446 -0
  94. package/.agents/skills/docx/scripts/office/validators/pptx.py +275 -0
  95. package/.agents/skills/docx/scripts/office/validators/redlining.py +247 -0
  96. package/.agents/skills/docx/scripts/templates/comments.xml +3 -0
  97. package/.agents/skills/docx/scripts/templates/commentsExtended.xml +3 -0
  98. package/.agents/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
  99. package/.agents/skills/docx/scripts/templates/commentsIds.xml +3 -0
  100. package/.agents/skills/docx/scripts/templates/people.xml +3 -0
  101. package/.agents/skills/evaluation/SKILL.md +251 -0
  102. package/.agents/skills/evaluation/references/metrics.md +339 -0
  103. package/.agents/skills/evaluation/scripts/evaluator.py +627 -0
  104. package/.agents/skills/explain-code/SKILL.md +230 -0
  105. package/.agents/skills/extract/SKILL.md +91 -0
  106. package/.agents/skills/filesystem-context/SKILL.md +287 -0
  107. package/.agents/skills/filesystem-context/references/implementation-patterns.md +549 -0
  108. package/.agents/skills/filesystem-context/scripts/filesystem_context.py +425 -0
  109. package/.agents/skills/find-skills/SKILL.md +142 -0
  110. package/.agents/skills/frontend-design/SKILL.md +147 -0
  111. package/.agents/skills/frontend-design/reference/color-and-contrast.md +132 -0
  112. package/.agents/skills/frontend-design/reference/interaction-design.md +195 -0
  113. package/.agents/skills/frontend-design/reference/motion-design.md +99 -0
  114. package/.agents/skills/frontend-design/reference/responsive-design.md +114 -0
  115. package/.agents/skills/frontend-design/reference/spatial-design.md +100 -0
  116. package/.agents/skills/frontend-design/reference/typography.md +133 -0
  117. package/.agents/skills/frontend-design/reference/ux-writing.md +107 -0
  118. package/.agents/skills/gh-commit/SKILL.md +243 -0
  119. package/.agents/skills/gh-create-pr/SKILL.md +93 -0
  120. package/.agents/skills/harden/SKILL.md +354 -0
  121. package/.agents/skills/hosted-agents/SKILL.md +260 -0
  122. package/.agents/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
  123. package/.agents/skills/hosted-agents/scripts/sandbox_manager.py +590 -0
  124. package/.agents/skills/impeccable/SKILL.md +365 -0
  125. package/.agents/skills/impeccable/reference/color-and-contrast.md +105 -0
  126. package/.agents/skills/impeccable/reference/craft.md +70 -0
  127. package/.agents/skills/impeccable/reference/extract.md +70 -0
  128. package/.agents/skills/impeccable/reference/interaction-design.md +195 -0
  129. package/.agents/skills/impeccable/reference/motion-design.md +99 -0
  130. package/.agents/skills/impeccable/reference/responsive-design.md +114 -0
  131. package/.agents/skills/impeccable/reference/spatial-design.md +100 -0
  132. package/.agents/skills/impeccable/reference/typography.md +142 -0
  133. package/.agents/skills/impeccable/reference/ux-writing.md +107 -0
  134. package/.agents/skills/impeccable/scripts/cleanup-deprecated.mjs +214 -0
  135. package/.agents/skills/init/SKILL.md +138 -0
  136. package/.agents/skills/layout/SKILL.md +125 -0
  137. package/.agents/skills/liteparse/SKILL.md +222 -0
  138. package/.agents/skills/memory-systems/SKILL.md +219 -0
  139. package/.agents/skills/memory-systems/references/implementation.md +551 -0
  140. package/.agents/skills/memory-systems/scripts/memory_store.py +616 -0
  141. package/.agents/skills/multi-agent-patterns/SKILL.md +257 -0
  142. package/.agents/skills/multi-agent-patterns/references/frameworks.md +433 -0
  143. package/.agents/skills/multi-agent-patterns/scripts/coordination.py +613 -0
  144. package/.agents/skills/normalize/SKILL.md +70 -0
  145. package/.agents/skills/onboard/SKILL.md +245 -0
  146. package/.agents/skills/opentui/SKILL.md +201 -0
  147. package/.agents/skills/opentui/references/animation/REFERENCE.md +431 -0
  148. package/.agents/skills/opentui/references/components/REFERENCE.md +144 -0
  149. package/.agents/skills/opentui/references/components/code-diff.md +672 -0
  150. package/.agents/skills/opentui/references/components/containers.md +417 -0
  151. package/.agents/skills/opentui/references/components/inputs.md +531 -0
  152. package/.agents/skills/opentui/references/components/text-display.md +386 -0
  153. package/.agents/skills/opentui/references/core/REFERENCE.md +145 -0
  154. package/.agents/skills/opentui/references/core/api.md +543 -0
  155. package/.agents/skills/opentui/references/core/configuration.md +168 -0
  156. package/.agents/skills/opentui/references/core/gotchas.md +393 -0
  157. package/.agents/skills/opentui/references/core/patterns.md +449 -0
  158. package/.agents/skills/opentui/references/keyboard/REFERENCE.md +617 -0
  159. package/.agents/skills/opentui/references/layout/REFERENCE.md +337 -0
  160. package/.agents/skills/opentui/references/layout/patterns.md +444 -0
  161. package/.agents/skills/opentui/references/react/REFERENCE.md +174 -0
  162. package/.agents/skills/opentui/references/react/api.md +436 -0
  163. package/.agents/skills/opentui/references/react/configuration.md +302 -0
  164. package/.agents/skills/opentui/references/react/gotchas.md +443 -0
  165. package/.agents/skills/opentui/references/react/patterns.md +501 -0
  166. package/.agents/skills/opentui/references/solid/REFERENCE.md +201 -0
  167. package/.agents/skills/opentui/references/solid/api.md +564 -0
  168. package/.agents/skills/opentui/references/solid/configuration.md +316 -0
  169. package/.agents/skills/opentui/references/solid/gotchas.md +427 -0
  170. package/.agents/skills/opentui/references/solid/patterns.md +560 -0
  171. package/.agents/skills/opentui/references/testing/REFERENCE.md +614 -0
  172. package/.agents/skills/optimize/SKILL.md +266 -0
  173. package/.agents/skills/overdrive/SKILL.md +142 -0
  174. package/.agents/skills/pdf/LICENSE.txt +30 -0
  175. package/.agents/skills/pdf/SKILL.md +314 -0
  176. package/.agents/skills/pdf/forms.md +294 -0
  177. package/.agents/skills/pdf/reference.md +612 -0
  178. package/.agents/skills/pdf/scripts/check_bounding_boxes.py +65 -0
  179. package/.agents/skills/pdf/scripts/check_fillable_fields.py +11 -0
  180. package/.agents/skills/pdf/scripts/convert_pdf_to_images.py +33 -0
  181. package/.agents/skills/pdf/scripts/create_validation_image.py +37 -0
  182. package/.agents/skills/pdf/scripts/extract_form_field_info.py +122 -0
  183. package/.agents/skills/pdf/scripts/extract_form_structure.py +115 -0
  184. package/.agents/skills/pdf/scripts/fill_fillable_fields.py +98 -0
  185. package/.agents/skills/pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
  186. package/.agents/skills/playwright-cli/SKILL.md +344 -0
  187. package/.agents/skills/playwright-cli/references/element-attributes.md +23 -0
  188. package/.agents/skills/playwright-cli/references/playwright-tests.md +39 -0
  189. package/.agents/skills/playwright-cli/references/request-mocking.md +87 -0
  190. package/.agents/skills/playwright-cli/references/running-code.md +231 -0
  191. package/.agents/skills/playwright-cli/references/session-management.md +169 -0
  192. package/.agents/skills/playwright-cli/references/storage-state.md +275 -0
  193. package/.agents/skills/playwright-cli/references/test-generation.md +88 -0
  194. package/.agents/skills/playwright-cli/references/tracing.md +139 -0
  195. package/.agents/skills/playwright-cli/references/video-recording.md +143 -0
  196. package/.agents/skills/polish/SKILL.md +224 -0
  197. package/.agents/skills/pptx/LICENSE.txt +30 -0
  198. package/.agents/skills/pptx/SKILL.md +232 -0
  199. package/.agents/skills/pptx/editing.md +205 -0
  200. package/.agents/skills/pptx/pptxgenjs.md +420 -0
  201. package/.agents/skills/pptx/scripts/__init__.py +0 -0
  202. package/.agents/skills/pptx/scripts/add_slide.py +195 -0
  203. package/.agents/skills/pptx/scripts/clean.py +286 -0
  204. package/.agents/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  205. package/.agents/skills/pptx/scripts/office/helpers/merge_runs.py +199 -0
  206. package/.agents/skills/pptx/scripts/office/helpers/simplify_redlines.py +197 -0
  207. package/.agents/skills/pptx/scripts/office/pack.py +159 -0
  208. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  209. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  210. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  211. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  212. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  213. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  214. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  215. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  216. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  217. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  218. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  219. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  220. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  221. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  222. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  223. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  224. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  225. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  226. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  227. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  228. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  229. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  230. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  231. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  232. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  233. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  234. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  235. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  236. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  237. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  238. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  239. package/.agents/skills/pptx/scripts/office/schemas/mce/mc.xsd +75 -0
  240. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  241. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  242. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  243. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  244. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  245. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  246. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  247. package/.agents/skills/pptx/scripts/office/soffice.py +183 -0
  248. package/.agents/skills/pptx/scripts/office/unpack.py +132 -0
  249. package/.agents/skills/pptx/scripts/office/validate.py +111 -0
  250. package/.agents/skills/pptx/scripts/office/validators/__init__.py +15 -0
  251. package/.agents/skills/pptx/scripts/office/validators/base.py +847 -0
  252. package/.agents/skills/pptx/scripts/office/validators/docx.py +446 -0
  253. package/.agents/skills/pptx/scripts/office/validators/pptx.py +275 -0
  254. package/.agents/skills/pptx/scripts/office/validators/redlining.py +247 -0
  255. package/.agents/skills/pptx/scripts/thumbnail.py +289 -0
  256. package/.agents/skills/project-development/SKILL.md +291 -0
  257. package/.agents/skills/project-development/references/case-studies.md +388 -0
  258. package/.agents/skills/project-development/references/pipeline-patterns.md +610 -0
  259. package/.agents/skills/project-development/scripts/pipeline_template.py +796 -0
  260. package/.agents/skills/prompt-engineer/SKILL.md +263 -0
  261. package/.agents/skills/prompt-engineer/references/advanced_patterns.md +271 -0
  262. package/.agents/skills/prompt-engineer/references/core_prompting.md +137 -0
  263. package/.agents/skills/prompt-engineer/references/quality_improvement.md +193 -0
  264. package/.agents/skills/quieter/SKILL.md +103 -0
  265. package/.agents/skills/research-codebase/SKILL.md +227 -0
  266. package/.agents/skills/shape/SKILL.md +96 -0
  267. package/.agents/skills/skill-creator/LICENSE.txt +202 -0
  268. package/.agents/skills/skill-creator/SKILL.md +485 -0
  269. package/.agents/skills/skill-creator/agents/analyzer.md +274 -0
  270. package/.agents/skills/skill-creator/agents/comparator.md +202 -0
  271. package/.agents/skills/skill-creator/agents/grader.md +223 -0
  272. package/.agents/skills/skill-creator/assets/eval_review.html +146 -0
  273. package/.agents/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  274. package/.agents/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  275. package/.agents/skills/skill-creator/references/schemas.md +430 -0
  276. package/.agents/skills/skill-creator/scripts/__init__.py +0 -0
  277. package/.agents/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  278. package/.agents/skills/skill-creator/scripts/generate_report.py +326 -0
  279. package/.agents/skills/skill-creator/scripts/improve_description.py +247 -0
  280. package/.agents/skills/skill-creator/scripts/package_skill.py +136 -0
  281. package/.agents/skills/skill-creator/scripts/quick_validate.py +103 -0
  282. package/.agents/skills/skill-creator/scripts/run_eval.py +310 -0
  283. package/.agents/skills/skill-creator/scripts/run_loop.py +328 -0
  284. package/.agents/skills/skill-creator/scripts/utils.py +47 -0
  285. package/.agents/skills/sl-commit/SKILL.md +51 -0
  286. package/.agents/skills/sl-submit-diff/SKILL.md +55 -0
  287. package/.agents/skills/teach-impeccable/SKILL.md +71 -0
  288. package/.agents/skills/test-driven-development/SKILL.md +371 -0
  289. package/.agents/skills/test-driven-development/testing-anti-patterns.md +299 -0
  290. package/.agents/skills/tool-design/SKILL.md +271 -0
  291. package/.agents/skills/tool-design/references/architectural_reduction.md +210 -0
  292. package/.agents/skills/tool-design/references/best_practices.md +176 -0
  293. package/.agents/skills/tool-design/scripts/description_generator.py +528 -0
  294. package/.agents/skills/typescript-advanced-types/SKILL.md +719 -0
  295. package/.agents/skills/typescript-expert/SKILL.md +428 -0
  296. package/.agents/skills/typescript-expert/references/tsconfig-strict.json +92 -0
  297. package/.agents/skills/typescript-expert/references/typescript-cheatsheet.md +383 -0
  298. package/.agents/skills/typescript-expert/references/utility-types.ts +335 -0
  299. package/.agents/skills/typescript-expert/scripts/ts_diagnostic.py +203 -0
  300. package/.agents/skills/typescript-react-reviewer/SKILL.md +200 -0
  301. package/.agents/skills/typescript-react-reviewer/references/antipatterns.md +510 -0
  302. package/.agents/skills/typescript-react-reviewer/references/checklist.md +267 -0
  303. package/.agents/skills/typescript-react-reviewer/references/react19-patterns.md +305 -0
  304. package/.agents/skills/typeset/SKILL.md +116 -0
  305. package/.agents/skills/workflow-creator/SKILL.md +337 -0
  306. package/.agents/skills/workflow-creator/references/agent-sessions.md +789 -0
  307. package/.agents/skills/workflow-creator/references/computation-and-validation.md +224 -0
  308. package/.agents/skills/workflow-creator/references/control-flow.md +450 -0
  309. package/.agents/skills/workflow-creator/references/discovery-and-verification.md +156 -0
  310. package/.agents/skills/workflow-creator/references/failure-modes.md +732 -0
  311. package/.agents/skills/workflow-creator/references/getting-started.md +289 -0
  312. package/.agents/skills/workflow-creator/references/session-config.md +355 -0
  313. package/.agents/skills/workflow-creator/references/state-and-data-flow.md +374 -0
  314. package/.agents/skills/workflow-creator/references/user-input.md +206 -0
  315. package/.agents/skills/workflow-creator/references/workflow-inputs.md +274 -0
  316. package/.agents/skills/xlsx/LICENSE.txt +30 -0
  317. package/.agents/skills/xlsx/SKILL.md +292 -0
  318. package/.agents/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  319. package/.agents/skills/xlsx/scripts/office/helpers/merge_runs.py +199 -0
  320. package/.agents/skills/xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
  321. package/.agents/skills/xlsx/scripts/office/pack.py +159 -0
  322. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  323. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  324. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  325. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  326. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  327. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  328. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  329. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  330. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  331. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  332. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  333. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  334. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  335. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  336. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  337. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  338. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  339. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  340. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  341. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  342. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  343. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  344. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  345. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  346. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  347. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  348. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  349. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  350. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  351. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  352. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  353. package/.agents/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  354. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  355. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  356. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  357. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  358. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  359. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  360. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  361. package/.agents/skills/xlsx/scripts/office/soffice.py +183 -0
  362. package/.agents/skills/xlsx/scripts/office/unpack.py +132 -0
  363. package/.agents/skills/xlsx/scripts/office/validate.py +111 -0
  364. package/.agents/skills/xlsx/scripts/office/validators/__init__.py +15 -0
  365. package/.agents/skills/xlsx/scripts/office/validators/base.py +847 -0
  366. package/.agents/skills/xlsx/scripts/office/validators/docx.py +446 -0
  367. package/.agents/skills/xlsx/scripts/office/validators/pptx.py +275 -0
  368. package/.agents/skills/xlsx/scripts/office/validators/redlining.py +247 -0
  369. package/.agents/skills/xlsx/scripts/recalc.py +184 -0
  370. package/.claude/agents/reviewer.md +1 -0
  371. package/.github/agents/reviewer.md +1 -0
  372. package/.opencode/agents/reviewer.md +1 -0
  373. package/README.md +274 -169
  374. package/package.json +6 -7
  375. package/src/commands/cli/init/index.ts +2 -2
  376. package/src/commands/cli/init/scm.ts +7 -8
  377. package/src/commands/cli/workflow-command.test.ts +74 -0
  378. package/src/commands/cli/workflow.ts +7 -2
  379. package/src/scripts/bundle-configs.ts +128 -0
  380. package/src/sdk/components/compact-switcher.tsx +1 -1
  381. package/src/sdk/components/orchestrator-panel-store.ts +13 -0
  382. package/src/sdk/components/orchestrator-panel.tsx +10 -0
  383. package/src/sdk/components/statusline.tsx +13 -1
  384. package/src/sdk/providers/claude.ts +42 -0
  385. package/src/sdk/runtime/executor.ts +111 -32
  386. package/src/sdk/types.ts +7 -0
  387. package/src/sdk/workflows/builtin/ralph/claude/index.ts +132 -76
  388. package/src/sdk/workflows/builtin/ralph/copilot/index.ts +129 -71
  389. package/src/sdk/workflows/builtin/ralph/helpers/git.ts +184 -17
  390. package/src/sdk/workflows/builtin/ralph/helpers/prompts.ts +463 -79
  391. package/src/sdk/workflows/builtin/ralph/opencode/index.ts +124 -80
  392. package/src/services/system/auto-sync.ts +31 -51
  393. package/src/services/system/skills.ts +56 -60
  394. package/dist/lib/path-root-guard.d.ts +0 -4
  395. package/dist/lib/path-root-guard.d.ts.map +0 -1
  396. package/dist/sdk/components/color-utils.d.ts +0 -4
  397. package/dist/sdk/components/color-utils.d.ts.map +0 -1
  398. package/dist/sdk/components/compact-switcher.d.ts +0 -10
  399. package/dist/sdk/components/compact-switcher.d.ts.map +0 -1
  400. package/dist/sdk/components/connectors.d.ts +0 -15
  401. package/dist/sdk/components/connectors.d.ts.map +0 -1
  402. package/dist/sdk/components/connectors.test.d.ts +0 -2
  403. package/dist/sdk/components/connectors.test.d.ts.map +0 -1
  404. package/dist/sdk/components/edge.d.ts +0 -4
  405. package/dist/sdk/components/edge.d.ts.map +0 -1
  406. package/dist/sdk/components/error-boundary.d.ts +0 -23
  407. package/dist/sdk/components/error-boundary.d.ts.map +0 -1
  408. package/dist/sdk/components/graph-theme.d.ts +0 -17
  409. package/dist/sdk/components/graph-theme.d.ts.map +0 -1
  410. package/dist/sdk/components/header.d.ts +0 -3
  411. package/dist/sdk/components/header.d.ts.map +0 -1
  412. package/dist/sdk/components/hooks.d.ts +0 -15
  413. package/dist/sdk/components/hooks.d.ts.map +0 -1
  414. package/dist/sdk/components/layout.d.ts +0 -27
  415. package/dist/sdk/components/layout.d.ts.map +0 -1
  416. package/dist/sdk/components/layout.test.d.ts +0 -2
  417. package/dist/sdk/components/layout.test.d.ts.map +0 -1
  418. package/dist/sdk/components/node-card.d.ts +0 -10
  419. package/dist/sdk/components/node-card.d.ts.map +0 -1
  420. package/dist/sdk/components/orchestrator-panel-contexts.d.ts +0 -16
  421. package/dist/sdk/components/orchestrator-panel-contexts.d.ts.map +0 -1
  422. package/dist/sdk/components/orchestrator-panel-store.d.ts +0 -46
  423. package/dist/sdk/components/orchestrator-panel-store.d.ts.map +0 -1
  424. package/dist/sdk/components/orchestrator-panel-store.test.d.ts +0 -2
  425. package/dist/sdk/components/orchestrator-panel-store.test.d.ts.map +0 -1
  426. package/dist/sdk/components/orchestrator-panel-types.d.ts +0 -18
  427. package/dist/sdk/components/orchestrator-panel-types.d.ts.map +0 -1
  428. package/dist/sdk/components/orchestrator-panel.d.ts +0 -52
  429. package/dist/sdk/components/orchestrator-panel.d.ts.map +0 -1
  430. package/dist/sdk/components/session-graph-panel.d.ts +0 -7
  431. package/dist/sdk/components/session-graph-panel.d.ts.map +0 -1
  432. package/dist/sdk/components/status-helpers.d.ts +0 -6
  433. package/dist/sdk/components/status-helpers.d.ts.map +0 -1
  434. package/dist/sdk/components/statusline.d.ts +0 -7
  435. package/dist/sdk/components/statusline.d.ts.map +0 -1
  436. package/dist/sdk/components/workflow-picker-panel.d.ts +0 -123
  437. package/dist/sdk/components/workflow-picker-panel.d.ts.map +0 -1
  438. package/dist/sdk/define-workflow.d.ts +0 -78
  439. package/dist/sdk/define-workflow.d.ts.map +0 -1
  440. package/dist/sdk/define-workflow.test.d.ts +0 -2
  441. package/dist/sdk/define-workflow.test.d.ts.map +0 -1
  442. package/dist/sdk/errors.d.ts +0 -24
  443. package/dist/sdk/errors.d.ts.map +0 -1
  444. package/dist/sdk/errors.test.d.ts +0 -2
  445. package/dist/sdk/errors.test.d.ts.map +0 -1
  446. package/dist/sdk/index.d.ts +0 -13
  447. package/dist/sdk/index.d.ts.map +0 -1
  448. package/dist/sdk/providers/claude.d.ts +0 -170
  449. package/dist/sdk/providers/claude.d.ts.map +0 -1
  450. package/dist/sdk/providers/copilot.d.ts +0 -11
  451. package/dist/sdk/providers/copilot.d.ts.map +0 -1
  452. package/dist/sdk/providers/opencode.d.ts +0 -11
  453. package/dist/sdk/providers/opencode.d.ts.map +0 -1
  454. package/dist/sdk/runtime/discovery.d.ts +0 -86
  455. package/dist/sdk/runtime/discovery.d.ts.map +0 -1
  456. package/dist/sdk/runtime/executor-entry.d.ts +0 -11
  457. package/dist/sdk/runtime/executor-entry.d.ts.map +0 -1
  458. package/dist/sdk/runtime/executor.d.ts +0 -72
  459. package/dist/sdk/runtime/executor.d.ts.map +0 -1
  460. package/dist/sdk/runtime/executor.test.d.ts +0 -2
  461. package/dist/sdk/runtime/executor.test.d.ts.map +0 -1
  462. package/dist/sdk/runtime/graph-inference.d.ts +0 -35
  463. package/dist/sdk/runtime/graph-inference.d.ts.map +0 -1
  464. package/dist/sdk/runtime/loader.d.ts +0 -70
  465. package/dist/sdk/runtime/loader.d.ts.map +0 -1
  466. package/dist/sdk/runtime/panel.d.ts +0 -9
  467. package/dist/sdk/runtime/panel.d.ts.map +0 -1
  468. package/dist/sdk/runtime/theme.d.ts +0 -28
  469. package/dist/sdk/runtime/theme.d.ts.map +0 -1
  470. package/dist/sdk/runtime/tmux.d.ts +0 -297
  471. package/dist/sdk/runtime/tmux.d.ts.map +0 -1
  472. package/dist/sdk/types.d.ts +0 -295
  473. package/dist/sdk/types.d.ts.map +0 -1
  474. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts +0 -62
  475. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +0 -1
  476. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts +0 -46
  477. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +0 -1
  478. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts +0 -26
  479. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts.map +0 -1
  480. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts +0 -92
  481. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts.map +0 -1
  482. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts +0 -57
  483. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts.map +0 -1
  484. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts +0 -49
  485. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +0 -1
  486. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts +0 -14
  487. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts.map +0 -1
  488. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts +0 -14
  489. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts.map +0 -1
  490. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts +0 -17
  491. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts.map +0 -1
  492. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts +0 -119
  493. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts.map +0 -1
  494. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts +0 -20
  495. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts.map +0 -1
  496. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts +0 -14
  497. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts.map +0 -1
  498. package/dist/sdk/workflows/index.d.ts +0 -24
  499. package/dist/sdk/workflows/index.d.ts.map +0 -1
  500. package/dist/services/config/definitions.d.ts +0 -85
  501. package/dist/services/config/definitions.d.ts.map +0 -1
  502. package/dist/services/system/copy.d.ts +0 -77
  503. package/dist/services/system/copy.d.ts.map +0 -1
  504. package/dist/services/system/detect.d.ts +0 -75
  505. package/dist/services/system/detect.d.ts.map +0 -1
  506. package/tsconfig.json +0 -33
@@ -0,0 +1,627 @@
1
+ """Agent Evaluation Framework for context-engineered agent systems.
2
+
3
+ Use when: building evaluation pipelines, scoring agent outputs against
4
+ multi-dimensional rubrics, managing test sets, or monitoring production
5
+ agent quality. Provides composable classes that can be used independently
6
+ or wired together into a full evaluation pipeline.
7
+
8
+ Typical usage::
9
+
10
+ evaluator = AgentEvaluator()
11
+ test_set = TestSet("my_tests").create_standard_tests()
12
+ runner = EvaluationRunner(evaluator, test_set)
13
+ summary = runner.run_all(verbose=True)
14
+ print(summary)
15
+ """
16
+
17
+ from typing import Dict, List, Any, Optional
18
+ from dataclasses import dataclass
19
+ from enum import Enum
20
+ import time
21
+
22
+ __all__ = [
23
+ "ScoreLevel",
24
+ "RubricDimension",
25
+ "DEFAULT_RUBRIC",
26
+ "AgentEvaluator",
27
+ "TestSet",
28
+ "EvaluationRunner",
29
+ "ProductionMonitor",
30
+ ]
31
+
32
+
33
class ScoreLevel(Enum):
    """Numeric anchors for the five qualitative score levels.

    Use when: mapping qualitative judgments to numeric scores.
    """

    # Listed from best to worst; each member's value is its numeric score.
    EXCELLENT = 1.0
    GOOD = 0.8
    ACCEPTABLE = 0.6
    POOR = 0.3
    FAILED = 0.0
41
+
42
+
43
@dataclass
class RubricDimension:
    """One scored axis of an evaluation rubric.

    Use when: defining custom rubric dimensions beyond the defaults.
    """

    # Identifier of the dimension.
    name: str
    # Factor this dimension's score is multiplied by in the weighted total.
    weight: float
    # Human-readable statement of what the dimension measures.
    description: str
    # level_name -> description
    levels: Dict[str, str]
54
+
55
+
56
# Built-in rubric. Each entry is keyed by its own ``name``; the five weights
# (0.30 + 0.25 + 0.15 + 0.10 + 0.20) add up to 1.0.
DEFAULT_RUBRIC: Dict[str, RubricDimension] = {
    dimension.name: dimension
    for dimension in (
        RubricDimension(
            name="factual_accuracy",
            weight=0.30,
            description="Claims in output match ground truth",
            levels={
                "excellent": "All claims verified, no errors",
                "good": "Minor errors not affecting main conclusions",
                "acceptable": "Major claims correct, minor inaccuracies",
                "poor": "Significant factual errors",
                "failed": "Fundamental factual errors",
            },
        ),
        RubricDimension(
            name="completeness",
            weight=0.25,
            description="Output covers all requested aspects",
            levels={
                "excellent": "All aspects thoroughly covered",
                "good": "Most aspects covered, minor gaps",
                "acceptable": "Key aspects covered, some gaps",
                "poor": "Major aspects missing",
                "failed": "Fundamental aspects missing",
            },
        ),
        RubricDimension(
            name="citation_accuracy",
            weight=0.15,
            description="Citations match claimed sources",
            levels={
                "excellent": "All citations accurate and complete",
                "good": "Minor citation issues",
                "acceptable": "Major citations accurate",
                "poor": "Significant citation problems",
                "failed": "Citations missing or incorrect",
            },
        ),
        RubricDimension(
            name="source_quality",
            weight=0.10,
            description="Uses appropriate primary sources",
            levels={
                "excellent": "Primary sources, authoritative",
                "good": "Mostly primary, some secondary",
                "acceptable": "Mix of primary and secondary",
                "poor": "Mostly secondary or unreliable",
                "failed": "No credible sources",
            },
        ),
        RubricDimension(
            name="tool_efficiency",
            weight=0.20,
            description="Uses right tools reasonable number of times",
            levels={
                "excellent": "Optimal tool selection and count",
                "good": "Good tool selection, minor inefficiencies",
                "acceptable": "Appropriate tools, some redundancy",
                "poor": "Wrong tools or excessive calls",
                "failed": "Severe tool misuse",
            },
        ),
    )
}
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Evaluation Engine
122
+ # ---------------------------------------------------------------------------
123
+
124
+
125
class AgentEvaluator:
    """Main evaluation engine for agent outputs.

    Use when: scoring a single agent output against a multi-dimensional rubric.
    Instantiate with a custom rubric or rely on ``DEFAULT_RUBRIC``.

    Args:
        rubric: Mapping of dimension name to ``RubricDimension``. ``None`` or
            an empty mapping selects ``DEFAULT_RUBRIC``.
        pass_threshold: Minimum weighted overall score for a result to count
            as passed. Defaults to 0.7.

    Attributes:
        evaluation_history: Every result returned by :meth:`evaluate`, in
            call order.
    """

    # Bracketed citation markers: "[1]", "[Author 2024]", "[Smith et al. 2024]",
    # "[Smith & Jones 2024]" and "[source ...]" / "[ref ...]" / "[cite ...]".
    # "et al." may be followed directly by the year; a second author name is
    # only mandatory after "&".
    _CITATION_PATTERN = (
        r"\[\d+\]"
        r"|\[[A-Z][a-z]+(?:\s+et al\.?(?:\s+[A-Z][a-z]+)?|\s+&\s+[A-Z][a-z]+)?\s*[\d,]+\]"
        r"|\[(?:source|ref|cite)[^\]]*\]"
    )

    # Phrases treated as informal attribution when no bracketed marker exists.
    _ATTRIBUTION_MARKERS = ("according to", "cited in", "reported by")

    def __init__(
        self,
        rubric: Optional[Dict[str, "RubricDimension"]] = None,
        pass_threshold: float = 0.7,
    ) -> None:
        self.rubric: Dict[str, "RubricDimension"] = rubric or DEFAULT_RUBRIC
        self.pass_threshold: float = pass_threshold
        self.evaluation_history: List[Dict[str, Any]] = []

    def evaluate(
        self,
        task: Dict[str, Any],
        output: str,
        ground_truth: Optional[Dict[str, Any]] = None,
        tool_calls: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """Evaluate agent output against task requirements.

        Use when: you have a single (task, output) pair and need per-dimension
        scores plus an overall pass/fail verdict.

        Args:
            task: Task description; the keys read are ``type``,
                ``requirements`` and ``requires_citations``.
            output: Text produced by the agent.
            ground_truth: Optional reference data; ``key_claims`` is read.
            tool_calls: Optional list of the tool calls the agent made.

        Returns:
            Dict with ``overall_score`` (sum of score * weight over the
            rubric), ``dimension_scores`` (score, weight and level per
            dimension), ``passed`` and ``timestamp``. The same dict is
            appended to ``evaluation_history``.
        """
        scores: Dict[str, Dict[str, Any]] = {}

        for dimension_name, dimension in self.rubric.items():
            score = self._evaluate_dimension(
                dimension=dimension,
                task=task,
                output=output,
                ground_truth=ground_truth,
                tool_calls=tool_calls,
            )
            scores[dimension_name] = {
                "score": score,
                "weight": dimension.weight,
                "level": self._score_to_level(score),
            }

        # Weighted overall; weights are used as given (not normalised).
        overall: float = sum(
            entry["score"] * self.rubric[name].weight
            for name, entry in scores.items()
        )

        result: Dict[str, Any] = {
            "overall_score": overall,
            "dimension_scores": scores,
            "passed": overall >= self.pass_threshold,
            "timestamp": time.time(),
        }

        self.evaluation_history.append(result)
        return result

    def _evaluate_dimension(
        self,
        dimension: "RubricDimension",
        task: Dict[str, Any],
        output: str,
        ground_truth: Optional[Dict[str, Any]] = None,
        tool_calls: Optional[List[Dict[str, Any]]] = None,
    ) -> float:
        """Evaluate a single dimension with keyword heuristics.

        Use when: extending the evaluator with custom dimension logic.
        In production, replace heuristics with LLM judgment or human evaluation.

        Returns:
            A score between 0 and 1. Dimensions whose name is not one of the
            five built-in names score 0.5.
        """
        output_lower: str = output.lower()
        task_type: str = task.get("type", "")

        if dimension.name == "factual_accuracy":
            if ground_truth:
                return self._check_factual_accuracy(output, ground_truth)
            return 0.7  # Default assumption when nothing can be checked

        if dimension.name == "completeness":
            required: List[str] = task.get("requirements", [])
            if required:
                # Fraction of requirement strings found verbatim
                # (case-insensitively) in the output.
                covered = sum(1 for r in required if r.lower() in output_lower)
                return covered / len(required)
            return 0.8

        if dimension.name == "citation_accuracy":
            if task.get("requires_citations"):
                return self._score_citations(output)
            return 0.8  # Citations not required

        if dimension.name == "source_quality":
            quality_markers = ["according to", "reported by", "data from", "study"]
            quality_count = sum(1 for m in quality_markers if m in output_lower)
            return min(1.0, 0.5 + quality_count * 0.1)

        if dimension.name == "tool_efficiency":
            if tool_calls:
                expected_count = self._estimate_expected_tools(task_type)
                actual_count = len(tool_calls)
                if actual_count <= expected_count:
                    return 1.0
                if actual_count <= expected_count * 1.5:
                    return 0.7
                return 0.4
            return 0.8  # No tool calls needed or recorded

        return 0.5  # Default

    def _score_citations(self, output: str) -> float:
        """Score citation presence: 1.0 bracketed, 0.7 attribution phrase, else 0.4."""
        import re

        if re.search(self._CITATION_PATTERN, output):
            return 1.0
        output_lower = output.lower()
        if any(marker in output_lower for marker in self._ATTRIBUTION_MARKERS):
            return 0.7
        return 0.4

    def _check_factual_accuracy(
        self, output: str, ground_truth: Dict[str, Any]
    ) -> float:
        """Check output against ground truth.

        Use when: ground truth key_claims are available for comparison.

        Returns:
            1.0 when every claim appears in the output (case-insensitive
            substring match), 0.8 at >= 70% of claims, 0.6 at >= 50%, else
            0.3. Returns 0.7 when there is no ground truth or no
            ``key_claims``.
        """
        if not ground_truth:
            return 0.7

        key_claims: List[str] = ground_truth.get("key_claims", [])
        if not key_claims:
            return 0.7

        output_lower: str = output.lower()
        matched: int = sum(1 for claim in key_claims if claim.lower() in output_lower)

        if matched == len(key_claims):
            return 1.0
        if matched >= len(key_claims) * 0.7:
            return 0.8
        if matched >= len(key_claims) * 0.5:
            return 0.6
        return 0.3

    def _estimate_expected_tools(self, task_type: str) -> int:
        """Estimate expected tool count for task type (1 for unknown types)."""
        estimates: Dict[str, int] = {
            "research": 3,
            "create": 2,
            "analyze": 2,
            "general": 1,
        }
        return estimates.get(task_type, 1)

    def _score_to_level(self, score: float) -> str:
        """Convert numeric score to level name."""
        if score >= 0.9:
            return "excellent"
        if score >= 0.7:
            return "good"
        if score >= 0.5:
            return "acceptable"
        if score >= 0.25:
            return "poor"
        return "failed"
291
+
292
+
293
+ # ---------------------------------------------------------------------------
294
+ # Test Set Management
295
+ # ---------------------------------------------------------------------------
296
+
297
+
298
class TestSet:
    """Named collection of evaluation test cases with a tag index.

    Use when: building, filtering, or analyzing collections of evaluation
    test cases. Supports tag-based indexing and complexity distribution
    analysis.

    Attributes:
        name: Label given at construction.
        tests: Test-case dicts in insertion order.
        tags: Maps each tag to the positions (indices into ``tests``) of the
            test cases carrying it.
    """

    def __init__(self, name: str) -> None:
        self.name: str = name
        self.tests: List[Dict[str, Any]] = []
        self.tags: Dict[str, List[int]] = {}

    def add_test(self, test: Dict[str, Any]) -> None:
        """Append one test case and index it under each of its tags.

        Use when: incrementally building a test set from individual cases.
        """
        position: int = len(self.tests)  # index the new case will occupy
        self.tests.append(test)

        for label in test.get("tags", []):
            self.tags.setdefault(label, []).append(position)

    def filter(self, **criteria: Any) -> List[Dict[str, Any]]:
        """Return the test cases whose fields equal every given criterion.

        Use when: selecting a subset of tests matching specific field values.
        With no criteria, every test case is returned.
        """
        return [
            case
            for case in self.tests
            if not any(case.get(field) != wanted for field, wanted in criteria.items())
        ]

    def get_complexity_distribution(self) -> Dict[str, int]:
        """Count test cases per ``complexity`` value.

        Use when: verifying test set balance across difficulty levels.
        Cases without a ``complexity`` key are counted as ``"medium"``.
        """
        tally: Dict[str, int] = {}
        for case in self.tests:
            bucket: str = case.get("complexity", "medium")
            tally.setdefault(bucket, 0)
            tally[bucket] += 1
        return tally

    def create_standard_tests(self) -> "TestSet":
        """Add three built-in cases (simple, medium, complex) and return self.

        Use when: bootstrapping a test set quickly for initial development.
        """
        simple_lookup: Dict[str, Any] = {
            "name": "simple_lookup",
            "input": "What is the capital of France?",
            "expected": {"type": "fact", "answer": "Paris"},
            "complexity": "simple",
            "tags": ["knowledge", "simple"],
        }
        context_retrieval: Dict[str, Any] = {
            "name": "context_retrieval",
            "input": "Based on the user preferences, recommend a restaurant",
            "context": {
                "user_preferences": {
                    "cuisine": "Italian",
                    "price_range": "moderate",
                }
            },
            "complexity": "medium",
            "tags": ["retrieval", "reasoning"],
        }
        multi_step_reasoning: Dict[str, Any] = {
            "name": "multi_step_reasoning",
            "input": "Analyze the sales data and create a summary report",
            "complexity": "complex",
            "tags": ["analysis", "multi-step"],
        }

        for case in (simple_lookup, context_retrieval, multi_step_reasoning):
            self.add_test(case)

        return self
388
+
389
+
390
+ # ---------------------------------------------------------------------------
391
+ # Evaluation Runner
392
+ # ---------------------------------------------------------------------------
393
+
394
+
395
class EvaluationRunner:
    """Run evaluations across an entire test set and produce summaries.

    Use when: executing a full evaluation pass over a test set, comparing
    agent versions, or generating evaluation reports.

    Attributes:
        evaluator: Object whose ``evaluate`` method scores each output and
            whose ``rubric`` lists the dimension names.
        test_set: Object whose ``tests`` list holds the test-case dicts.
        results: Per-test results of the most recent :meth:`run_all`.
    """

    # Label used in progress output and failure reports for a test case that
    # has no "name" key (``TestSet.add_test`` does not require one).
    _UNNAMED = "<unnamed>"

    def __init__(self, evaluator: "AgentEvaluator", test_set: "TestSet") -> None:
        self.evaluator: "AgentEvaluator" = evaluator
        self.test_set: "TestSet" = test_set
        self.results: List[Dict[str, Any]] = []

    def run_all(self, verbose: bool = False) -> Dict[str, Any]:
        """Run evaluation on all tests in the test set.

        Use when: performing a complete evaluation pass.

        Args:
            verbose: When true, print a progress line before each test.

        Returns:
            The summary produced by :meth:`summarize`. Any results from a
            previous run are discarded first.
        """
        self.results = []
        total = len(self.test_set.tests)

        for position, test in enumerate(self.test_set.tests, start=1):
            if verbose:
                label = test.get("name", self._UNNAMED)
                print(f"Running test {position}/{total}: {label}")

            self.results.append(self.run_test(test))

        return self.summarize()

    def run_test(self, test: Dict[str, Any]) -> Dict[str, Any]:
        """Run a single evaluation test.

        Use when: evaluating an individual test case outside of a full run.
        In production, replace the simulated output with actual agent execution.

        Returns:
            Dict with the ``test``, the simulated ``output``, the evaluator's
            ``evaluation`` dict and its ``passed`` flag.
        """
        # In production, run actual agent; here the output is simulated.
        output: str = f"Simulated output for: {test.get('input', '')}"

        evaluation: Dict[str, Any] = self.evaluator.evaluate(
            task=test,
            output=output,
            ground_truth=test.get("expected"),
            tool_calls=[],
        )

        return {
            "test": test,
            "output": output,
            "evaluation": evaluation,
            "passed": evaluation["passed"],
        }

    def summarize(self) -> Dict[str, Any]:
        """Summarize evaluation results with per-dimension averages.

        Use when: generating a report after a full evaluation run.

        Returns:
            ``{"error": "No results"}`` when nothing has been run; otherwise
            totals, pass rate, the mean score per dimension, and one
            ``{"test", "score"}`` entry per failed test.
        """
        if not self.results:
            return {"error": "No results"}

        total: int = len(self.results)
        passed: int = sum(1 for r in self.results if r["passed"])

        # [running total, count] per dimension, seeded in rubric order.
        # setdefault keeps a dimension that is absent from the current rubric
        # from raising KeyError.
        tallies: Dict[str, List[float]] = {
            dim_name: [0.0, 0] for dim_name in self.evaluator.rubric
        }
        for result in self.results:
            for dim_name, entry in result["evaluation"]["dimension_scores"].items():
                tally = tallies.setdefault(dim_name, [0.0, 0])
                tally[0] += entry["score"]
                tally[1] += 1

        dimension_averages: Dict[str, float] = {
            dim_name: score_sum / count
            for dim_name, (score_sum, count) in tallies.items()
            if count
        }

        return {
            "total_tests": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total,
            "dimension_averages": dimension_averages,
            "failures": [
                {
                    "test": r["test"].get("name", self._UNNAMED),
                    "score": r["evaluation"]["overall_score"],
                }
                for r in self.results
                if not r["passed"]
            ],
        }
489
+
490
+
491
+ # ---------------------------------------------------------------------------
492
+ # Production Monitoring
493
+ # ---------------------------------------------------------------------------
494
+
495
+
496
class ProductionMonitor:
    """Track sampled production interactions and derive health metrics.

    Use when: setting up continuous quality monitoring for a deployed agent.
    Samples interactions at a configurable rate and tracks pass rate, average
    score, and alert status.

    Attributes:
        sample_rate: Threshold a random draw is compared against in
            :meth:`should_sample`.
        samples: Recorded sample dicts, in recording order.
        alert_thresholds: Pass-rate limits for the "warning" (0.85) and
            "critical" (0.70) states.
    """

    def __init__(self, sample_rate: float = 0.01) -> None:
        import random

        self.alert_thresholds: Dict[str, float] = {
            "pass_rate_warning": 0.85,
            "pass_rate_critical": 0.70,
        }
        self.samples: List[Dict[str, Any]] = []
        self._rng: random.Random = random.Random()
        self.sample_rate: float = sample_rate

    def should_sample(self) -> bool:
        """Return True when a fresh random draw falls below ``sample_rate``.

        Use when: deciding at request time whether to evaluate this interaction.
        """
        draw = self._rng.random()
        return draw < self.sample_rate

    def record_sample(
        self, query: str, output: str, evaluation: Dict[str, Any]
    ) -> None:
        """Store a truncated record of one evaluated interaction.

        Use when: storing evaluated production interactions for trend analysis.
        Query and output are cut to their first 200 characters; a missing
        ``overall_score`` is stored as 0 and a missing ``passed`` as False.
        """
        self.samples.append(
            {
                "query": query[:200],
                "output_preview": output[:200],
                "score": evaluation.get("overall_score", 0),
                "passed": evaluation.get("passed", False),
                "timestamp": time.time(),
            }
        )

    def get_metrics(self) -> Dict[str, Any]:
        """Compute pass rate, average score, status and alerts from the samples.

        Use when: checking production health or generating monitoring reports.
        Returns ``{"status": "insufficient_data"}`` when nothing was recorded.
        """
        sample_count = len(self.samples)
        if sample_count == 0:
            return {"status": "insufficient_data"}

        pass_count: int = len([entry for entry in self.samples if entry["passed"]])
        pass_rate: float = pass_count / sample_count
        avg_score: float = sum(entry["score"] for entry in self.samples) / sample_count

        # The critical limit is checked first, so it wins when both apply.
        if pass_rate < self.alert_thresholds["pass_rate_critical"]:
            status = "critical"
        elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
            status = "warning"
        else:
            status = "healthy"

        return {
            "sample_count": sample_count,
            "pass_rate": pass_rate,
            "average_score": avg_score,
            "status": status,
            "alerts": self._generate_alerts(pass_rate, avg_score),
        }

    def _generate_alerts(
        self, pass_rate: float, avg_score: float
    ) -> List[Dict[str, str]]:
        """Build the alert list: at most one pass-rate alert plus a quality alert."""
        critical_limit = self.alert_thresholds["pass_rate_critical"]
        raised: List[Dict[str, str]] = []

        if pass_rate < critical_limit:
            raised.append(
                {
                    "type": "critical",
                    "message": f"Pass rate ({pass_rate:.2f}) below critical threshold",
                }
            )
        elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
            raised.append(
                {
                    "type": "warning",
                    "message": f"Pass rate ({pass_rate:.2f}) below warning threshold",
                }
            )

        # Independent of the pass-rate alerts above.
        if avg_score < 0.6:
            raised.append(
                {
                    "type": "quality",
                    "message": f"Average score ({avg_score:.2f}) indicates quality issues",
                }
            )

        return raised
594
+
595
+
596
+ # ---------------------------------------------------------------------------
597
+ # CLI entry point
598
+ # ---------------------------------------------------------------------------
599
+
600
if __name__ == "__main__":
    # Demo: score the three standard test cases with the default rubric and
    # print the resulting summary.
    print("=== Agent Evaluation Framework Demo ===\n")

    # 1. Create evaluator with default rubric
    demo_evaluator = AgentEvaluator()
    print(f"Rubric dimensions: {list(demo_evaluator.rubric)}\n")

    # 2. Build a standard test set
    demo_tests = TestSet("demo").create_standard_tests()
    print(f"Test set: {demo_tests.name}")
    print(f"Test count: {len(demo_tests.tests)}")
    print(f"Complexity distribution: {demo_tests.get_complexity_distribution()}\n")

    # 3. Run evaluation
    report = EvaluationRunner(demo_evaluator, demo_tests).run_all(verbose=True)

    print("\n--- Summary ---")
    print(f"Total: {report['total_tests']}")
    print(f"Passed: {report['passed']}")
    print(f"Failed: {report['failed']}")
    print(f"Pass rate: {report['pass_rate']:.1%}")
    print(f"Dimension averages: {report['dimension_averages']}")

    failed_cases = report["failures"]
    if failed_cases:
        print("\nFailures:")
        for failure in failed_cases:
            print(f" - {failure['test']}: {failure['score']:.2f}")