@bastani/atomic 0.5.11 → 0.5.12-1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (507) hide show
  1. package/.agents/skills/adapt/SKILL.md +199 -0
  2. package/.agents/skills/advanced-evaluation/SKILL.md +402 -0
  3. package/.agents/skills/advanced-evaluation/references/bias-mitigation.md +288 -0
  4. package/.agents/skills/advanced-evaluation/references/evaluation-pipeline.md +43 -0
  5. package/.agents/skills/advanced-evaluation/references/implementation-patterns.md +315 -0
  6. package/.agents/skills/advanced-evaluation/references/metrics-guide.md +331 -0
  7. package/.agents/skills/advanced-evaluation/scripts/evaluation_example.py +392 -0
  8. package/.agents/skills/animate/SKILL.md +175 -0
  9. package/.agents/skills/arrange/SKILL.md +124 -0
  10. package/.agents/skills/audit/SKILL.md +148 -0
  11. package/.agents/skills/bdi-mental-states/SKILL.md +311 -0
  12. package/.agents/skills/bdi-mental-states/references/bdi-ontology-core.md +207 -0
  13. package/.agents/skills/bdi-mental-states/references/framework-integration.md +582 -0
  14. package/.agents/skills/bdi-mental-states/references/rdf-examples.md +315 -0
  15. package/.agents/skills/bdi-mental-states/references/sparql-competency.md +420 -0
  16. package/.agents/skills/bolder/SKILL.md +117 -0
  17. package/.agents/skills/bun/SKILL.md +199 -0
  18. package/.agents/skills/clarify/SKILL.md +183 -0
  19. package/.agents/skills/colorize/SKILL.md +143 -0
  20. package/.agents/skills/context-compression/SKILL.md +272 -0
  21. package/.agents/skills/context-compression/references/evaluation-framework.md +213 -0
  22. package/.agents/skills/context-compression/scripts/compression_evaluator.py +862 -0
  23. package/.agents/skills/context-compression/tests/test_compression_evaluator.py +56 -0
  24. package/.agents/skills/context-degradation/SKILL.md +206 -0
  25. package/.agents/skills/context-degradation/references/patterns.md +314 -0
  26. package/.agents/skills/context-degradation/scripts/degradation_detector.py +614 -0
  27. package/.agents/skills/context-fundamentals/SKILL.md +201 -0
  28. package/.agents/skills/context-fundamentals/references/context-components.md +283 -0
  29. package/.agents/skills/context-fundamentals/scripts/context_manager.py +533 -0
  30. package/.agents/skills/context-optimization/SKILL.md +195 -0
  31. package/.agents/skills/context-optimization/references/optimization_techniques.md +272 -0
  32. package/.agents/skills/context-optimization/scripts/compaction.py +562 -0
  33. package/.agents/skills/create-spec/SKILL.md +244 -0
  34. package/.agents/skills/critique/SKILL.md +225 -0
  35. package/.agents/skills/critique/reference/cognitive-load.md +106 -0
  36. package/.agents/skills/critique/reference/heuristics-scoring.md +234 -0
  37. package/.agents/skills/critique/reference/personas.md +178 -0
  38. package/.agents/skills/delight/SKILL.md +304 -0
  39. package/.agents/skills/distill/SKILL.md +122 -0
  40. package/.agents/skills/docx/LICENSE.txt +30 -0
  41. package/.agents/skills/docx/SKILL.md +590 -0
  42. package/.agents/skills/docx/scripts/__init__.py +1 -0
  43. package/.agents/skills/docx/scripts/accept_changes.py +135 -0
  44. package/.agents/skills/docx/scripts/comment.py +318 -0
  45. package/.agents/skills/docx/scripts/office/helpers/__init__.py +0 -0
  46. package/.agents/skills/docx/scripts/office/helpers/merge_runs.py +199 -0
  47. package/.agents/skills/docx/scripts/office/helpers/simplify_redlines.py +197 -0
  48. package/.agents/skills/docx/scripts/office/pack.py +159 -0
  49. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  50. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  51. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  52. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  53. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  54. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  55. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  56. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  57. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  58. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  59. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  60. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  61. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  62. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  63. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  64. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  65. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  66. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  67. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  68. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  69. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  70. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  71. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  72. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  73. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  74. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  75. package/.agents/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  76. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  77. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  78. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  79. package/.agents/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  80. package/.agents/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
  81. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  82. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  83. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  84. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  85. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  86. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  87. package/.agents/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  88. package/.agents/skills/docx/scripts/office/soffice.py +183 -0
  89. package/.agents/skills/docx/scripts/office/unpack.py +132 -0
  90. package/.agents/skills/docx/scripts/office/validate.py +111 -0
  91. package/.agents/skills/docx/scripts/office/validators/__init__.py +15 -0
  92. package/.agents/skills/docx/scripts/office/validators/base.py +847 -0
  93. package/.agents/skills/docx/scripts/office/validators/docx.py +446 -0
  94. package/.agents/skills/docx/scripts/office/validators/pptx.py +275 -0
  95. package/.agents/skills/docx/scripts/office/validators/redlining.py +247 -0
  96. package/.agents/skills/docx/scripts/templates/comments.xml +3 -0
  97. package/.agents/skills/docx/scripts/templates/commentsExtended.xml +3 -0
  98. package/.agents/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
  99. package/.agents/skills/docx/scripts/templates/commentsIds.xml +3 -0
  100. package/.agents/skills/docx/scripts/templates/people.xml +3 -0
  101. package/.agents/skills/evaluation/SKILL.md +251 -0
  102. package/.agents/skills/evaluation/references/metrics.md +339 -0
  103. package/.agents/skills/evaluation/scripts/evaluator.py +627 -0
  104. package/.agents/skills/explain-code/SKILL.md +230 -0
  105. package/.agents/skills/extract/SKILL.md +91 -0
  106. package/.agents/skills/filesystem-context/SKILL.md +287 -0
  107. package/.agents/skills/filesystem-context/references/implementation-patterns.md +549 -0
  108. package/.agents/skills/filesystem-context/scripts/filesystem_context.py +425 -0
  109. package/.agents/skills/find-skills/SKILL.md +142 -0
  110. package/.agents/skills/frontend-design/SKILL.md +147 -0
  111. package/.agents/skills/frontend-design/reference/color-and-contrast.md +132 -0
  112. package/.agents/skills/frontend-design/reference/interaction-design.md +195 -0
  113. package/.agents/skills/frontend-design/reference/motion-design.md +99 -0
  114. package/.agents/skills/frontend-design/reference/responsive-design.md +114 -0
  115. package/.agents/skills/frontend-design/reference/spatial-design.md +100 -0
  116. package/.agents/skills/frontend-design/reference/typography.md +133 -0
  117. package/.agents/skills/frontend-design/reference/ux-writing.md +107 -0
  118. package/.agents/skills/gh-commit/SKILL.md +243 -0
  119. package/.agents/skills/gh-create-pr/SKILL.md +93 -0
  120. package/.agents/skills/harden/SKILL.md +354 -0
  121. package/.agents/skills/hosted-agents/SKILL.md +260 -0
  122. package/.agents/skills/hosted-agents/references/infrastructure-patterns.md +700 -0
  123. package/.agents/skills/hosted-agents/scripts/sandbox_manager.py +590 -0
  124. package/.agents/skills/impeccable/SKILL.md +365 -0
  125. package/.agents/skills/impeccable/reference/color-and-contrast.md +105 -0
  126. package/.agents/skills/impeccable/reference/craft.md +70 -0
  127. package/.agents/skills/impeccable/reference/extract.md +70 -0
  128. package/.agents/skills/impeccable/reference/interaction-design.md +195 -0
  129. package/.agents/skills/impeccable/reference/motion-design.md +99 -0
  130. package/.agents/skills/impeccable/reference/responsive-design.md +114 -0
  131. package/.agents/skills/impeccable/reference/spatial-design.md +100 -0
  132. package/.agents/skills/impeccable/reference/typography.md +142 -0
  133. package/.agents/skills/impeccable/reference/ux-writing.md +107 -0
  134. package/.agents/skills/impeccable/scripts/cleanup-deprecated.mjs +214 -0
  135. package/.agents/skills/init/SKILL.md +138 -0
  136. package/.agents/skills/layout/SKILL.md +125 -0
  137. package/.agents/skills/liteparse/SKILL.md +222 -0
  138. package/.agents/skills/memory-systems/SKILL.md +219 -0
  139. package/.agents/skills/memory-systems/references/implementation.md +551 -0
  140. package/.agents/skills/memory-systems/scripts/memory_store.py +616 -0
  141. package/.agents/skills/multi-agent-patterns/SKILL.md +257 -0
  142. package/.agents/skills/multi-agent-patterns/references/frameworks.md +433 -0
  143. package/.agents/skills/multi-agent-patterns/scripts/coordination.py +613 -0
  144. package/.agents/skills/normalize/SKILL.md +70 -0
  145. package/.agents/skills/onboard/SKILL.md +245 -0
  146. package/.agents/skills/opentui/SKILL.md +201 -0
  147. package/.agents/skills/opentui/references/animation/REFERENCE.md +431 -0
  148. package/.agents/skills/opentui/references/components/REFERENCE.md +144 -0
  149. package/.agents/skills/opentui/references/components/code-diff.md +672 -0
  150. package/.agents/skills/opentui/references/components/containers.md +417 -0
  151. package/.agents/skills/opentui/references/components/inputs.md +531 -0
  152. package/.agents/skills/opentui/references/components/text-display.md +386 -0
  153. package/.agents/skills/opentui/references/core/REFERENCE.md +145 -0
  154. package/.agents/skills/opentui/references/core/api.md +543 -0
  155. package/.agents/skills/opentui/references/core/configuration.md +168 -0
  156. package/.agents/skills/opentui/references/core/gotchas.md +393 -0
  157. package/.agents/skills/opentui/references/core/patterns.md +449 -0
  158. package/.agents/skills/opentui/references/keyboard/REFERENCE.md +617 -0
  159. package/.agents/skills/opentui/references/layout/REFERENCE.md +337 -0
  160. package/.agents/skills/opentui/references/layout/patterns.md +444 -0
  161. package/.agents/skills/opentui/references/react/REFERENCE.md +174 -0
  162. package/.agents/skills/opentui/references/react/api.md +436 -0
  163. package/.agents/skills/opentui/references/react/configuration.md +302 -0
  164. package/.agents/skills/opentui/references/react/gotchas.md +443 -0
  165. package/.agents/skills/opentui/references/react/patterns.md +501 -0
  166. package/.agents/skills/opentui/references/solid/REFERENCE.md +201 -0
  167. package/.agents/skills/opentui/references/solid/api.md +564 -0
  168. package/.agents/skills/opentui/references/solid/configuration.md +316 -0
  169. package/.agents/skills/opentui/references/solid/gotchas.md +427 -0
  170. package/.agents/skills/opentui/references/solid/patterns.md +560 -0
  171. package/.agents/skills/opentui/references/testing/REFERENCE.md +614 -0
  172. package/.agents/skills/optimize/SKILL.md +266 -0
  173. package/.agents/skills/overdrive/SKILL.md +142 -0
  174. package/.agents/skills/pdf/LICENSE.txt +30 -0
  175. package/.agents/skills/pdf/SKILL.md +314 -0
  176. package/.agents/skills/pdf/forms.md +294 -0
  177. package/.agents/skills/pdf/reference.md +612 -0
  178. package/.agents/skills/pdf/scripts/check_bounding_boxes.py +65 -0
  179. package/.agents/skills/pdf/scripts/check_fillable_fields.py +11 -0
  180. package/.agents/skills/pdf/scripts/convert_pdf_to_images.py +33 -0
  181. package/.agents/skills/pdf/scripts/create_validation_image.py +37 -0
  182. package/.agents/skills/pdf/scripts/extract_form_field_info.py +122 -0
  183. package/.agents/skills/pdf/scripts/extract_form_structure.py +115 -0
  184. package/.agents/skills/pdf/scripts/fill_fillable_fields.py +98 -0
  185. package/.agents/skills/pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
  186. package/.agents/skills/playwright-cli/SKILL.md +344 -0
  187. package/.agents/skills/playwright-cli/references/element-attributes.md +23 -0
  188. package/.agents/skills/playwright-cli/references/playwright-tests.md +39 -0
  189. package/.agents/skills/playwright-cli/references/request-mocking.md +87 -0
  190. package/.agents/skills/playwright-cli/references/running-code.md +231 -0
  191. package/.agents/skills/playwright-cli/references/session-management.md +169 -0
  192. package/.agents/skills/playwright-cli/references/storage-state.md +275 -0
  193. package/.agents/skills/playwright-cli/references/test-generation.md +88 -0
  194. package/.agents/skills/playwright-cli/references/tracing.md +139 -0
  195. package/.agents/skills/playwright-cli/references/video-recording.md +143 -0
  196. package/.agents/skills/polish/SKILL.md +224 -0
  197. package/.agents/skills/pptx/LICENSE.txt +30 -0
  198. package/.agents/skills/pptx/SKILL.md +232 -0
  199. package/.agents/skills/pptx/editing.md +205 -0
  200. package/.agents/skills/pptx/pptxgenjs.md +420 -0
  201. package/.agents/skills/pptx/scripts/__init__.py +0 -0
  202. package/.agents/skills/pptx/scripts/add_slide.py +195 -0
  203. package/.agents/skills/pptx/scripts/clean.py +286 -0
  204. package/.agents/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  205. package/.agents/skills/pptx/scripts/office/helpers/merge_runs.py +199 -0
  206. package/.agents/skills/pptx/scripts/office/helpers/simplify_redlines.py +197 -0
  207. package/.agents/skills/pptx/scripts/office/pack.py +159 -0
  208. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  209. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  210. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  211. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  212. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  213. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  214. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  215. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  216. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  217. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  218. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  219. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  220. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  221. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  222. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  223. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  224. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  225. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  226. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  227. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  228. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  229. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  230. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  231. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  232. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  233. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  234. package/.agents/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  235. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  236. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  237. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  238. package/.agents/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  239. package/.agents/skills/pptx/scripts/office/schemas/mce/mc.xsd +75 -0
  240. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  241. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  242. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  243. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  244. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  245. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  246. package/.agents/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  247. package/.agents/skills/pptx/scripts/office/soffice.py +183 -0
  248. package/.agents/skills/pptx/scripts/office/unpack.py +132 -0
  249. package/.agents/skills/pptx/scripts/office/validate.py +111 -0
  250. package/.agents/skills/pptx/scripts/office/validators/__init__.py +15 -0
  251. package/.agents/skills/pptx/scripts/office/validators/base.py +847 -0
  252. package/.agents/skills/pptx/scripts/office/validators/docx.py +446 -0
  253. package/.agents/skills/pptx/scripts/office/validators/pptx.py +275 -0
  254. package/.agents/skills/pptx/scripts/office/validators/redlining.py +247 -0
  255. package/.agents/skills/pptx/scripts/thumbnail.py +289 -0
  256. package/.agents/skills/project-development/SKILL.md +291 -0
  257. package/.agents/skills/project-development/references/case-studies.md +388 -0
  258. package/.agents/skills/project-development/references/pipeline-patterns.md +610 -0
  259. package/.agents/skills/project-development/scripts/pipeline_template.py +796 -0
  260. package/.agents/skills/prompt-engineer/SKILL.md +263 -0
  261. package/.agents/skills/prompt-engineer/references/advanced_patterns.md +271 -0
  262. package/.agents/skills/prompt-engineer/references/core_prompting.md +137 -0
  263. package/.agents/skills/prompt-engineer/references/quality_improvement.md +193 -0
  264. package/.agents/skills/quieter/SKILL.md +103 -0
  265. package/.agents/skills/research-codebase/SKILL.md +227 -0
  266. package/.agents/skills/shape/SKILL.md +96 -0
  267. package/.agents/skills/skill-creator/LICENSE.txt +202 -0
  268. package/.agents/skills/skill-creator/SKILL.md +485 -0
  269. package/.agents/skills/skill-creator/agents/analyzer.md +274 -0
  270. package/.agents/skills/skill-creator/agents/comparator.md +202 -0
  271. package/.agents/skills/skill-creator/agents/grader.md +223 -0
  272. package/.agents/skills/skill-creator/assets/eval_review.html +146 -0
  273. package/.agents/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  274. package/.agents/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  275. package/.agents/skills/skill-creator/references/schemas.md +430 -0
  276. package/.agents/skills/skill-creator/scripts/__init__.py +0 -0
  277. package/.agents/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  278. package/.agents/skills/skill-creator/scripts/generate_report.py +326 -0
  279. package/.agents/skills/skill-creator/scripts/improve_description.py +247 -0
  280. package/.agents/skills/skill-creator/scripts/package_skill.py +136 -0
  281. package/.agents/skills/skill-creator/scripts/quick_validate.py +103 -0
  282. package/.agents/skills/skill-creator/scripts/run_eval.py +310 -0
  283. package/.agents/skills/skill-creator/scripts/run_loop.py +328 -0
  284. package/.agents/skills/skill-creator/scripts/utils.py +47 -0
  285. package/.agents/skills/sl-commit/SKILL.md +51 -0
  286. package/.agents/skills/sl-submit-diff/SKILL.md +55 -0
  287. package/.agents/skills/teach-impeccable/SKILL.md +71 -0
  288. package/.agents/skills/test-driven-development/SKILL.md +371 -0
  289. package/.agents/skills/test-driven-development/testing-anti-patterns.md +299 -0
  290. package/.agents/skills/tool-design/SKILL.md +271 -0
  291. package/.agents/skills/tool-design/references/architectural_reduction.md +210 -0
  292. package/.agents/skills/tool-design/references/best_practices.md +176 -0
  293. package/.agents/skills/tool-design/scripts/description_generator.py +528 -0
  294. package/.agents/skills/typescript-advanced-types/SKILL.md +719 -0
  295. package/.agents/skills/typescript-expert/SKILL.md +428 -0
  296. package/.agents/skills/typescript-expert/references/tsconfig-strict.json +92 -0
  297. package/.agents/skills/typescript-expert/references/typescript-cheatsheet.md +383 -0
  298. package/.agents/skills/typescript-expert/references/utility-types.ts +335 -0
  299. package/.agents/skills/typescript-expert/scripts/ts_diagnostic.py +203 -0
  300. package/.agents/skills/typescript-react-reviewer/SKILL.md +200 -0
  301. package/.agents/skills/typescript-react-reviewer/references/antipatterns.md +510 -0
  302. package/.agents/skills/typescript-react-reviewer/references/checklist.md +267 -0
  303. package/.agents/skills/typescript-react-reviewer/references/react19-patterns.md +305 -0
  304. package/.agents/skills/typeset/SKILL.md +116 -0
  305. package/.agents/skills/workflow-creator/SKILL.md +337 -0
  306. package/.agents/skills/workflow-creator/references/agent-sessions.md +789 -0
  307. package/.agents/skills/workflow-creator/references/computation-and-validation.md +224 -0
  308. package/.agents/skills/workflow-creator/references/control-flow.md +450 -0
  309. package/.agents/skills/workflow-creator/references/discovery-and-verification.md +156 -0
  310. package/.agents/skills/workflow-creator/references/failure-modes.md +732 -0
  311. package/.agents/skills/workflow-creator/references/getting-started.md +289 -0
  312. package/.agents/skills/workflow-creator/references/session-config.md +355 -0
  313. package/.agents/skills/workflow-creator/references/state-and-data-flow.md +374 -0
  314. package/.agents/skills/workflow-creator/references/user-input.md +206 -0
  315. package/.agents/skills/workflow-creator/references/workflow-inputs.md +274 -0
  316. package/.agents/skills/xlsx/LICENSE.txt +30 -0
  317. package/.agents/skills/xlsx/SKILL.md +292 -0
  318. package/.agents/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  319. package/.agents/skills/xlsx/scripts/office/helpers/merge_runs.py +199 -0
  320. package/.agents/skills/xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
  321. package/.agents/skills/xlsx/scripts/office/pack.py +159 -0
  322. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  323. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  324. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  325. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  326. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  327. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  328. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  329. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  330. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  331. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  332. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  333. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  334. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  335. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  336. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  337. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  338. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  339. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  340. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  341. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  342. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  343. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  344. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  345. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  346. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  347. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  348. package/.agents/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  349. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  350. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  351. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  352. package/.agents/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  353. package/.agents/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  354. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  355. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  356. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  357. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  358. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  359. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  360. package/.agents/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  361. package/.agents/skills/xlsx/scripts/office/soffice.py +183 -0
  362. package/.agents/skills/xlsx/scripts/office/unpack.py +132 -0
  363. package/.agents/skills/xlsx/scripts/office/validate.py +111 -0
  364. package/.agents/skills/xlsx/scripts/office/validators/__init__.py +15 -0
  365. package/.agents/skills/xlsx/scripts/office/validators/base.py +847 -0
  366. package/.agents/skills/xlsx/scripts/office/validators/docx.py +446 -0
  367. package/.agents/skills/xlsx/scripts/office/validators/pptx.py +275 -0
  368. package/.agents/skills/xlsx/scripts/office/validators/redlining.py +247 -0
  369. package/.agents/skills/xlsx/scripts/recalc.py +184 -0
  370. package/.claude/agents/reviewer.md +1 -0
  371. package/.github/agents/reviewer.md +1 -0
  372. package/.opencode/agents/reviewer.md +1 -0
  373. package/README.md +274 -169
  374. package/package.json +6 -7
  375. package/src/commands/cli/init/index.ts +2 -2
  376. package/src/commands/cli/init/scm.ts +7 -8
  377. package/src/commands/cli/workflow-command.test.ts +74 -0
  378. package/src/commands/cli/workflow.ts +7 -2
  379. package/src/scripts/bundle-configs.ts +128 -0
  380. package/src/sdk/components/compact-switcher.tsx +1 -1
  381. package/src/sdk/components/orchestrator-panel-store.ts +13 -0
  382. package/src/sdk/components/orchestrator-panel.tsx +10 -0
  383. package/src/sdk/components/statusline.tsx +13 -1
  384. package/src/sdk/components/workflow-picker-panel.tsx +407 -296
  385. package/src/sdk/providers/claude.ts +50 -0
  386. package/src/sdk/runtime/executor.ts +111 -32
  387. package/src/sdk/types.ts +7 -0
  388. package/src/sdk/workflows/builtin/ralph/claude/index.ts +132 -76
  389. package/src/sdk/workflows/builtin/ralph/copilot/index.ts +129 -71
  390. package/src/sdk/workflows/builtin/ralph/helpers/git.ts +184 -17
  391. package/src/sdk/workflows/builtin/ralph/helpers/prompts.ts +463 -79
  392. package/src/sdk/workflows/builtin/ralph/opencode/index.ts +124 -80
  393. package/src/services/system/auto-sync.ts +31 -51
  394. package/src/services/system/skills.ts +56 -60
  395. package/dist/lib/path-root-guard.d.ts +0 -4
  396. package/dist/lib/path-root-guard.d.ts.map +0 -1
  397. package/dist/sdk/components/color-utils.d.ts +0 -4
  398. package/dist/sdk/components/color-utils.d.ts.map +0 -1
  399. package/dist/sdk/components/compact-switcher.d.ts +0 -10
  400. package/dist/sdk/components/compact-switcher.d.ts.map +0 -1
  401. package/dist/sdk/components/connectors.d.ts +0 -15
  402. package/dist/sdk/components/connectors.d.ts.map +0 -1
  403. package/dist/sdk/components/connectors.test.d.ts +0 -2
  404. package/dist/sdk/components/connectors.test.d.ts.map +0 -1
  405. package/dist/sdk/components/edge.d.ts +0 -4
  406. package/dist/sdk/components/edge.d.ts.map +0 -1
  407. package/dist/sdk/components/error-boundary.d.ts +0 -23
  408. package/dist/sdk/components/error-boundary.d.ts.map +0 -1
  409. package/dist/sdk/components/graph-theme.d.ts +0 -17
  410. package/dist/sdk/components/graph-theme.d.ts.map +0 -1
  411. package/dist/sdk/components/header.d.ts +0 -3
  412. package/dist/sdk/components/header.d.ts.map +0 -1
  413. package/dist/sdk/components/hooks.d.ts +0 -15
  414. package/dist/sdk/components/hooks.d.ts.map +0 -1
  415. package/dist/sdk/components/layout.d.ts +0 -27
  416. package/dist/sdk/components/layout.d.ts.map +0 -1
  417. package/dist/sdk/components/layout.test.d.ts +0 -2
  418. package/dist/sdk/components/layout.test.d.ts.map +0 -1
  419. package/dist/sdk/components/node-card.d.ts +0 -10
  420. package/dist/sdk/components/node-card.d.ts.map +0 -1
  421. package/dist/sdk/components/orchestrator-panel-contexts.d.ts +0 -16
  422. package/dist/sdk/components/orchestrator-panel-contexts.d.ts.map +0 -1
  423. package/dist/sdk/components/orchestrator-panel-store.d.ts +0 -46
  424. package/dist/sdk/components/orchestrator-panel-store.d.ts.map +0 -1
  425. package/dist/sdk/components/orchestrator-panel-store.test.d.ts +0 -2
  426. package/dist/sdk/components/orchestrator-panel-store.test.d.ts.map +0 -1
  427. package/dist/sdk/components/orchestrator-panel-types.d.ts +0 -18
  428. package/dist/sdk/components/orchestrator-panel-types.d.ts.map +0 -1
  429. package/dist/sdk/components/orchestrator-panel.d.ts +0 -52
  430. package/dist/sdk/components/orchestrator-panel.d.ts.map +0 -1
  431. package/dist/sdk/components/session-graph-panel.d.ts +0 -7
  432. package/dist/sdk/components/session-graph-panel.d.ts.map +0 -1
  433. package/dist/sdk/components/status-helpers.d.ts +0 -6
  434. package/dist/sdk/components/status-helpers.d.ts.map +0 -1
  435. package/dist/sdk/components/statusline.d.ts +0 -7
  436. package/dist/sdk/components/statusline.d.ts.map +0 -1
  437. package/dist/sdk/components/workflow-picker-panel.d.ts +0 -123
  438. package/dist/sdk/components/workflow-picker-panel.d.ts.map +0 -1
  439. package/dist/sdk/define-workflow.d.ts +0 -78
  440. package/dist/sdk/define-workflow.d.ts.map +0 -1
  441. package/dist/sdk/define-workflow.test.d.ts +0 -2
  442. package/dist/sdk/define-workflow.test.d.ts.map +0 -1
  443. package/dist/sdk/errors.d.ts +0 -24
  444. package/dist/sdk/errors.d.ts.map +0 -1
  445. package/dist/sdk/errors.test.d.ts +0 -2
  446. package/dist/sdk/errors.test.d.ts.map +0 -1
  447. package/dist/sdk/index.d.ts +0 -13
  448. package/dist/sdk/index.d.ts.map +0 -1
  449. package/dist/sdk/providers/claude.d.ts +0 -170
  450. package/dist/sdk/providers/claude.d.ts.map +0 -1
  451. package/dist/sdk/providers/copilot.d.ts +0 -11
  452. package/dist/sdk/providers/copilot.d.ts.map +0 -1
  453. package/dist/sdk/providers/opencode.d.ts +0 -11
  454. package/dist/sdk/providers/opencode.d.ts.map +0 -1
  455. package/dist/sdk/runtime/discovery.d.ts +0 -86
  456. package/dist/sdk/runtime/discovery.d.ts.map +0 -1
  457. package/dist/sdk/runtime/executor-entry.d.ts +0 -11
  458. package/dist/sdk/runtime/executor-entry.d.ts.map +0 -1
  459. package/dist/sdk/runtime/executor.d.ts +0 -72
  460. package/dist/sdk/runtime/executor.d.ts.map +0 -1
  461. package/dist/sdk/runtime/executor.test.d.ts +0 -2
  462. package/dist/sdk/runtime/executor.test.d.ts.map +0 -1
  463. package/dist/sdk/runtime/graph-inference.d.ts +0 -35
  464. package/dist/sdk/runtime/graph-inference.d.ts.map +0 -1
  465. package/dist/sdk/runtime/loader.d.ts +0 -70
  466. package/dist/sdk/runtime/loader.d.ts.map +0 -1
  467. package/dist/sdk/runtime/panel.d.ts +0 -9
  468. package/dist/sdk/runtime/panel.d.ts.map +0 -1
  469. package/dist/sdk/runtime/theme.d.ts +0 -28
  470. package/dist/sdk/runtime/theme.d.ts.map +0 -1
  471. package/dist/sdk/runtime/tmux.d.ts +0 -297
  472. package/dist/sdk/runtime/tmux.d.ts.map +0 -1
  473. package/dist/sdk/types.d.ts +0 -295
  474. package/dist/sdk/types.d.ts.map +0 -1
  475. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts +0 -62
  476. package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +0 -1
  477. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts +0 -46
  478. package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +0 -1
  479. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts +0 -26
  480. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/heuristic.d.ts.map +0 -1
  481. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts +0 -92
  482. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts.map +0 -1
  483. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts +0 -57
  484. package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts.map +0 -1
  485. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts +0 -49
  486. package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +0 -1
  487. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts +0 -14
  488. package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts.map +0 -1
  489. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts +0 -14
  490. package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts.map +0 -1
  491. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts +0 -17
  492. package/dist/sdk/workflows/builtin/ralph/helpers/git.d.ts.map +0 -1
  493. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts +0 -119
  494. package/dist/sdk/workflows/builtin/ralph/helpers/prompts.d.ts.map +0 -1
  495. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts +0 -20
  496. package/dist/sdk/workflows/builtin/ralph/helpers/review.d.ts.map +0 -1
  497. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts +0 -14
  498. package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts.map +0 -1
  499. package/dist/sdk/workflows/index.d.ts +0 -24
  500. package/dist/sdk/workflows/index.d.ts.map +0 -1
  501. package/dist/services/config/definitions.d.ts +0 -85
  502. package/dist/services/config/definitions.d.ts.map +0 -1
  503. package/dist/services/system/copy.d.ts +0 -77
  504. package/dist/services/system/copy.d.ts.map +0 -1
  505. package/dist/services/system/detect.d.ts +0 -75
  506. package/dist/services/system/detect.d.ts.map +0 -1
  507. package/tsconfig.json +0 -33
@@ -0,0 +1,862 @@
1
+ """
2
+ Context Compression Evaluation
3
+
4
+ Public API for evaluating context compression quality using probe-based
5
+ assessment. This module provides three composable components:
6
+
7
+ - **ProbeGenerator**: Extracts factual claims, file operations, and decisions
8
+ from conversation history, then generates typed probes for evaluation.
9
+ Use when: building a compression evaluation pipeline and needing to
10
+ automatically derive test questions from raw conversation history.
11
+
12
+ - **CompressionEvaluator**: Scores probe responses against a multi-dimensional
13
+ rubric (accuracy, context awareness, artifact trail, completeness,
14
+ continuity, instruction following). Use when: comparing compression methods
15
+ or validating that a compression strategy preserves critical information.
16
+
17
+ - **StructuredSummarizer**: Implements anchored iterative summarization with
18
+ explicit sections for session intent, file tracking, decisions, and next
19
+ steps. Use when: compressing long-running coding sessions where file
20
+ tracking and decision rationale must survive compression.
21
+
22
+ Top-level convenience function:
23
+ - **evaluate_compression_quality**: End-to-end pipeline that generates probes,
24
+ collects model responses, evaluates them, and returns a scored summary with
25
+ recommendations. Use when: running a one-shot compression quality check
26
+ without wiring up individual components.
27
+
28
+ PRODUCTION NOTES:
29
+ - The LLM judge calls are stubbed for demonstration. Production systems
30
+ should implement actual API calls to a frontier model.
31
+ - Token estimation uses simplified heuristics. Production systems should
32
+ use model-specific tokenizers.
33
+ - Ground truth extraction uses pattern matching. Production systems may
34
+ benefit from more sophisticated fact extraction.
35
+ """
36
+
37
+ from dataclasses import dataclass, field
38
+ from typing import List, Dict, Optional, Callable
39
+ from enum import Enum
40
+ import json
41
+ import re
42
+
43
+ __all__ = [
44
+ "ProbeType",
45
+ "Probe",
46
+ "CriterionResult",
47
+ "EvaluationResult",
48
+ "RUBRIC_CRITERIA",
49
+ "ProbeGenerator",
50
+ "CompressionEvaluator",
51
+ "StructuredSummarizer",
52
+ "evaluate_compression_quality",
53
+ ]
54
+
55
+
56
class ProbeType(Enum):
    """Categories of probe question used to assess compression quality.

    Each member's value is the lowercase form of its name.
    """

    # Asked about the error or issue that started the session.
    RECALL = "recall"
    # Asked about which files were touched and how.
    ARTIFACT = "artifact"
    # Asked about what should happen next.
    CONTINUATION = "continuation"
    # Asked about key decisions and their rationale.
    DECISION = "decision"
62
+
63
+
64
@dataclass
class Probe:
    """One question used to test what information survived compression.

    Use when: building evaluation inputs for CompressionEvaluator. A probe
    targets one category of information that compression may have dropped.
    """

    # Category of information the question targets.
    probe_type: ProbeType
    # Question text put to the model.
    question: str
    # Expected answer when one could be derived from the history, else None.
    ground_truth: Optional[str] = None
    # Free-form label for the part of the session the probe refers to.
    context_reference: Optional[str] = None
76
+
77
+
78
@dataclass
class CriterionResult:
    """Score and justification for one rubric criterion."""

    # Identifier of the rubric criterion that was scored.
    criterion_id: str
    # Numeric score awarded for the criterion.
    score: float
    # Short explanation of how the score was reached.
    reasoning: str
84
+
85
+
86
@dataclass
class EvaluationResult:
    """Full outcome of evaluating one probe response.

    Bundles the individual criterion scores with the per-dimension
    aggregates and the single overall score derived from them.
    """

    # The probe that was asked.
    probe: Probe
    # The model response that was scored.
    response: str
    # One entry per rubric criterion applied to the response.
    criterion_results: List[CriterionResult]
    # Overall score across the scored dimensions.
    aggregate_score: float
    # Dimension name -> score; a fresh dict per instance by default.
    dimension_scores: Dict[str, float] = field(default_factory=dict)
98
+
99
+
100
+ # Evaluation Rubrics
101
+
102
# Rubric: dimension name -> criteria. Each criterion has a unique "id", the
# "question" a judge answers, and a "weight" used when averaging criterion
# scores within its dimension. Dimension order is significant to consumers
# that iterate this mapping, so keep it stable.
RUBRIC_CRITERIA: Dict[str, List[Dict]] = {
    "accuracy": [
        {"id": "accuracy_factual",
         "question": "Are facts, file paths, and technical details correct?",
         "weight": 0.6},
        {"id": "accuracy_technical",
         "question": "Are code references and technical concepts correct?",
         "weight": 0.4},
    ],
    "context_awareness": [
        {"id": "context_conversation_state",
         "question": "Does the response reflect current conversation state?",
         "weight": 0.5},
        {"id": "context_artifact_state",
         "question": "Does the response reflect which files/artifacts were accessed?",
         "weight": 0.5},
    ],
    "artifact_trail": [
        {"id": "artifact_files_created",
         "question": "Does the agent know which files were created?",
         "weight": 0.3},
        {"id": "artifact_files_modified",
         "question": "Does the agent know which files were modified?",
         "weight": 0.4},
        {"id": "artifact_key_details",
         "question": "Does the agent remember function names, variable names, error messages?",
         "weight": 0.3},
    ],
    "completeness": [
        {"id": "completeness_coverage",
         "question": "Does the response address all parts of the question?",
         "weight": 0.6},
        {"id": "completeness_depth",
         "question": "Is sufficient detail provided?",
         "weight": 0.4},
    ],
    "continuity": [
        {"id": "continuity_work_state",
         "question": "Can the agent continue without re-fetching information?",
         "weight": 0.4},
        {"id": "continuity_todo_state",
         "question": "Does the agent maintain awareness of pending tasks?",
         "weight": 0.3},
        {"id": "continuity_reasoning",
         "question": "Does the agent retain rationale behind previous decisions?",
         "weight": 0.3},
    ],
    "instruction_following": [
        {"id": "instruction_format",
         "question": "Does the response follow the requested format?",
         "weight": 0.5},
        {"id": "instruction_constraints",
         "question": "Does the response respect stated constraints?",
         "weight": 0.5},
    ],
}
186
+
187
+
188
class ProbeGenerator:
    """Generate typed probes from conversation history.

    Use when: automatically deriving evaluation questions from raw
    conversation history at compression points. Facts, file operations,
    and decisions are extracted once, at construction time, via pattern
    matching; generate_probes() then produces at most one probe per
    category.

    For production systems, replace the regex-based extraction with
    an LLM-based extractor for higher recall.
    """

    # Error signatures. Deliberately NOT anchored with \b: messages such as
    # "TypeError: ..." must still match on their "Error: ..." suffix.
    _ERROR_PATTERNS = (
        re.compile(r"error[:\s]+(.+?)(?:\n|$)", re.IGNORECASE),
        re.compile(r"(\d{3})\s+(Unauthorized|Not Found|Internal Server Error)", re.IGNORECASE),
        re.compile(r"exception[:\s]+(.+?)(?:\n|$)", re.IGNORECASE),
    )

    # Next-step markers, listed in priority order.
    _NEXT_STEP_PATTERNS = (
        re.compile(r"next[:\s]+(.+?)(?:\n|$)", re.IGNORECASE),
        re.compile(r"TODO[:\s]+(.+?)(?:\n|$)", re.IGNORECASE),
        re.compile(r"remaining[:\s]+(.+?)(?:\n|$)", re.IGNORECASE),
    )

    # (operation label, pattern) pairs. Order matters: a path is recorded
    # under the first operation that mentions it, so "modified" outranks
    # "created", which outranks "read". The leading \b stops verbs matching
    # inside longer words (e.g. "unchanged config.py").
    _FILE_OPERATIONS = (
        ("modified", re.compile(r"\b(?:modified|changed|updated|edited)\s+([^\s]+\.[a-z]+)", re.IGNORECASE)),
        ("created", re.compile(r"\b(?:created|added)\s+([^\s]+\.[a-z]+)", re.IGNORECASE)),
        ("read", re.compile(r"\b(?:read|examined|opened)\s+([^\s]+\.[a-z]+)", re.IGNORECASE)),
    )

    # (context label, pattern) pairs for decision statements.
    _DECISION_PATTERNS = (
        ("decided to", re.compile(r"decided to\s+(.+?)(?:\n|$)", re.IGNORECASE)),
        ("chose", re.compile(r"chose\s+(.+?)(?:\n|$)", re.IGNORECASE)),
        ("going with", re.compile(r"going with\s+(.+?)(?:\n|$)", re.IGNORECASE)),
        ("will use", re.compile(r"will use\s+(.+?)(?:\n|$)", re.IGNORECASE)),
    )

    # Upper bound on the number of decisions kept as ground truth.
    _MAX_DECISIONS = 5

    def __init__(self, conversation_history: str) -> None:
        """Store the history and run all extractors over it.

        Args:
            conversation_history: Full, uncompressed conversation text.
        """
        self.history = conversation_history
        self.extracted_facts = self._extract_facts()
        self.extracted_files = self._extract_files()
        self.extracted_decisions = self._extract_decisions()

    def generate_probes(self) -> List[Probe]:
        """Generate all probe types for evaluation.

        Use when: preparing evaluation inputs at a compression point.

        Returns:
            Probes in the order recall, artifact, continuation, decision.
            The continuation probe is always present; the others are
            emitted only when the matching extraction found something.
            A probe's ground_truth may be None when the specific fact it
            targets was not found in the history.
        """
        probes: List[Probe] = []

        # Recall probe
        if self.extracted_facts:
            probes.append(Probe(
                probe_type=ProbeType.RECALL,
                question="What was the original error or issue that started this session?",
                ground_truth=self.extracted_facts.get("original_error"),
                context_reference="session_start",
            ))

        # Artifact probe
        if self.extracted_files:
            probes.append(Probe(
                probe_type=ProbeType.ARTIFACT,
                question="Which files have we modified? Describe what changed in each.",
                ground_truth=json.dumps(self.extracted_files),
                context_reference="file_operations",
            ))

        # Continuation probe (unconditional)
        probes.append(Probe(
            probe_type=ProbeType.CONTINUATION,
            question="What should we do next?",
            ground_truth=self.extracted_facts.get("next_steps"),
            context_reference="task_state",
        ))

        # Decision probe
        if self.extracted_decisions:
            probes.append(Probe(
                probe_type=ProbeType.DECISION,
                question="What key decisions did we make and why?",
                ground_truth=json.dumps(self.extracted_decisions),
                context_reference="decision_points",
            ))

        return probes

    def _extract_facts(self) -> Dict[str, str]:
        """Extract factual claims from history.

        Returns:
            Dict that may contain "original_error" and/or "next_steps",
            each holding the full matched text with surrounding
            whitespace stripped.
        """
        facts: Dict[str, str] = {}

        # The recall probe asks for the *original* error, so take whichever
        # error signature occurs earliest in the history rather than the
        # first pattern in the list that happens to match anywhere.
        # min() keeps the earlier-listed pattern on a positional tie.
        error_hits = [
            hit
            for hit in (pattern.search(self.history) for pattern in self._ERROR_PATTERNS)
            if hit
        ]
        if error_hits:
            earliest = min(error_hits, key=lambda hit: hit.start())
            facts["original_error"] = earliest.group(0).strip()

        # Next steps: first pattern (in priority order) that matches wins.
        for pattern in self._NEXT_STEP_PATTERNS:
            hit = pattern.search(self.history)
            if hit:
                facts["next_steps"] = hit.group(0).strip()
                break

        return facts

    def _extract_files(self) -> List[Dict[str, str]]:
        """Extract file operations from history.

        Returns:
            One {"path", "operation"} dict per distinct path, where
            operation is "modified", "created" or "read".
        """
        files: List[Dict[str, str]] = []
        seen_paths = set()

        for operation, pattern in self._FILE_OPERATIONS:
            for path in pattern.findall(self.history):
                if path in seen_paths:
                    continue
                seen_paths.add(path)
                files.append({"path": path, "operation": operation})

        return files

    def _extract_decisions(self) -> List[Dict[str, str]]:
        """Extract decision points from history.

        Returns:
            Up to five {"decision", "context"} dicts, grouped by phrase in
            the order the phrases are listed; "context" is the phrase that
            introduced the decision.
        """
        decisions: List[Dict[str, str]] = []

        for context, pattern in self._DECISION_PATTERNS:
            for text in pattern.findall(self.history):
                decisions.append({"decision": text.strip(), "context": context})

        return decisions[:self._MAX_DECISIONS]
326
+
327
+
328
class CompressionEvaluator:
    """Evaluate compression quality using probes and LLM judge.

    Use when: comparing compression methods or validating that a specific
    compression pass preserved critical information. Scores responses
    across six dimensions (accuracy, context awareness, artifact trail,
    completeness, continuity, instruction following) and produces an
    aggregate quality score.

    The evaluate() method is the primary entry point. Call it once per
    probe, then call get_summary() to retrieve aggregated results.
    """

    def __init__(self, model: str = "gpt-5.2") -> None:
        """Create an evaluator with an empty result history.

        Args:
            model: Name of the judge model. It is only stored; the
                scoring in this class is a local heuristic stub.
        """
        self.model = model
        self.results: List[EvaluationResult] = []

    def evaluate(self,
                 probe: Probe,
                 response: str,
                 compressed_context: str) -> EvaluationResult:
        """Evaluate a single probe response against the rubric.

        Use when: scoring how well a model's response (given compressed
        context) answers a probe question. The result is also appended
        to self.results so get_summary() can aggregate it.

        Args:
            probe: The probe question with expected ground truth.
            response: The model's response to evaluate.
            compressed_context: The compressed context that was provided
                to the model when generating the response.

        Returns:
            EvaluationResult with scores and reasoning across all
            applicable dimensions.
        """
        criteria = self._get_criteria_for_probe(probe.probe_type)

        criterion_results: List[CriterionResult] = [
            self._evaluate_criterion(criterion, probe, response, compressed_context)
            for criterion in criteria
        ]

        dimension_scores = self._calculate_dimension_scores(criterion_results)

        # Unweighted mean over the dimensions that were actually scored.
        aggregate_score = (
            sum(dimension_scores.values()) / len(dimension_scores)
            if dimension_scores else 0.0
        )

        evaluation = EvaluationResult(
            probe=probe,
            response=response,
            criterion_results=criterion_results,
            aggregate_score=aggregate_score,
            dimension_scores=dimension_scores,
        )

        self.results.append(evaluation)
        return evaluation

    def get_summary(self) -> Dict:
        """Get summary of all evaluation results.

        Use when: all probes have been evaluated and an aggregate
        report is needed to compare methods or make a go/no-go
        decision on a compression strategy.

        Returns:
            Dictionary with total evaluations, average score,
            per-dimension averages, and weakest/strongest dimensions.
            When nothing has been evaluated yet, a dictionary with a
            single "error" key is returned instead.
        """
        if not self.results:
            return {"error": "No evaluations performed"}

        avg_score = sum(r.aggregate_score for r in self.results) / len(self.results)

        # Each dimension is averaged over the evaluations that scored it;
        # not every probe type exercises every dimension.
        dimension_totals: Dict[str, float] = {}
        dimension_counts: Dict[str, int] = {}
        for evaluation in self.results:
            for dim, score in evaluation.dimension_scores.items():
                dimension_totals[dim] = dimension_totals.get(dim, 0) + score
                dimension_counts[dim] = dimension_counts.get(dim, 0) + 1

        avg_dimensions = {
            dim: total / dimension_counts[dim]
            for dim, total in dimension_totals.items()
        }

        return {
            "total_evaluations": len(self.results),
            "average_score": avg_score,
            "dimension_averages": avg_dimensions,
            "weakest_dimension": min(avg_dimensions, key=avg_dimensions.get) if avg_dimensions else None,
            "strongest_dimension": max(avg_dimensions, key=avg_dimensions.get) if avg_dimensions else None,
        }

    def _get_criteria_for_probe(self, probe_type: ProbeType) -> List[Dict]:
        """Get relevant criteria for probe type.

        Every probe is scored on accuracy, completeness and instruction
        following; the probe type adds its own dimension(s) in between.
        """
        criteria: List[Dict] = []

        # All probes get accuracy and completeness
        criteria.extend(RUBRIC_CRITERIA["accuracy"])
        criteria.extend(RUBRIC_CRITERIA["completeness"])

        # Add type-specific criteria
        if probe_type == ProbeType.ARTIFACT:
            criteria.extend(RUBRIC_CRITERIA["artifact_trail"])
        elif probe_type == ProbeType.CONTINUATION:
            criteria.extend(RUBRIC_CRITERIA["continuity"])
        elif probe_type == ProbeType.RECALL:
            criteria.extend(RUBRIC_CRITERIA["context_awareness"])
        elif probe_type == ProbeType.DECISION:
            criteria.extend(RUBRIC_CRITERIA["context_awareness"])
            criteria.extend(RUBRIC_CRITERIA["continuity"])

        criteria.extend(RUBRIC_CRITERIA["instruction_following"])

        return criteria

    def _evaluate_criterion(self,
                            criterion: Dict,
                            probe: Probe,
                            response: str,
                            context: str) -> CriterionResult:
        """
        Evaluate a single criterion using LLM judge.

        PRODUCTION NOTE: This is a stub implementation.
        Production systems should call the actual LLM API:

        ```python
        result = openai.chat.completions.create(
            model="gpt-5.2",
            messages=[
                {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                {"role": "user", "content": self._format_judge_input(criterion, probe, response, context)}
            ]
        )
        return self._parse_judge_output(result)
        ```

        The stub ignores `context` and scores from the response text and
        the probe's ground truth only.
        """
        # Stub implementation - in production, call LLM judge
        score = self._heuristic_score(criterion, response, probe.ground_truth)
        reasoning = f"Evaluated {criterion['id']} based on response content."

        return CriterionResult(
            criterion_id=criterion["id"],
            score=score,
            reasoning=reasoning,
        )

    def _heuristic_score(self,
                         criterion: Dict,
                         response: str,
                         ground_truth: Optional[str]) -> float:
        """
        Heuristic scoring for demonstration.

        Production systems should use LLM judge instead. The score does
        not depend on `criterion`; it starts at 3.0, is adjusted for
        response length, file-extension mentions and ground-truth
        overlap, and is clamped to the range 0.0-5.0.
        """
        score = 3.0  # Base score

        # Adjust based on response length and content
        if len(response) < 50:
            score -= 1.0  # Too short
        elif len(response) > 500:
            score += 0.5  # Detailed

        # Check for technical content
        if any(ext in response for ext in [".ts", ".py", ".js", ".md"]):
            score += 0.5  # Contains file references

        overlap_ratio = self._ground_truth_overlap_ratio(response, ground_truth)
        if overlap_ratio >= 0.75:
            score += 1.0
        elif overlap_ratio >= 0.4:
            score += 0.5
        elif ground_truth:
            # Penalise only when there was a ground truth to miss.
            score -= 0.5

        return min(5.0, max(0.0, score))

    def _ground_truth_overlap_ratio(self,
                                    response: str,
                                    ground_truth: Optional[str]) -> float:
        """Return the fraction of ground-truth terms found in the response.

        Matching is case-insensitive substring containment. Returns 0.0
        when there is no ground truth.
        """
        if not ground_truth:
            return 0.0

        response_lower = response.lower()

        terms = self._extract_ground_truth_terms(ground_truth)
        if not terms:
            # JSON that contained no string values (e.g. a bare number):
            # fall back to matching the raw text as a whole.
            return 1.0 if ground_truth.strip().lower() in response_lower else 0.0

        matches = sum(1 for term in terms if term in response_lower)
        return matches / len(terms)

    def _extract_ground_truth_terms(self, ground_truth: str) -> List[str]:
        """Split a ground-truth string into lowercase terms to look for.

        JSON input yields every distinct non-empty string value found in
        its nested dicts and lists, in first-seen order. Non-JSON input
        yields the whole text as a single term.
        """
        try:
            parsed = json.loads(ground_truth)
        except json.JSONDecodeError:
            # Normalise exactly like structured terms so stray surrounding
            # whitespace cannot defeat the substring match.
            plain = ground_truth.strip().lower()
            return [plain] if plain else []

        terms: List[str] = []

        def collect(value) -> None:
            # Depth-first walk; only string leaves become terms.
            if isinstance(value, str):
                normalized = value.strip().lower()
                if normalized:
                    terms.append(normalized)
            elif isinstance(value, dict):
                for nested in value.values():
                    collect(nested)
            elif isinstance(value, list):
                for nested in value:
                    collect(nested)

        collect(parsed)
        # dict.fromkeys de-duplicates while preserving order.
        return list(dict.fromkeys(terms))

    def _calculate_dimension_scores(self,
                                    criterion_results: List[CriterionResult]) -> Dict[str, float]:
        """Calculate dimension scores from criterion results.

        Each dimension's score is the weighted average of the results
        belonging to it. Dimensions with no results are omitted.
        """
        # Group scores by criterion id once instead of rescanning the
        # result list for every dimension.
        scores_by_id: Dict[str, List[float]] = {}
        for outcome in criterion_results:
            scores_by_id.setdefault(outcome.criterion_id, []).append(outcome.score)

        dimension_scores: Dict[str, float] = {}

        for dimension, criteria in RUBRIC_CRITERIA.items():
            weighted_sum = 0.0
            total_weight = 0.0
            scored = False
            for criterion in criteria:
                for score in scores_by_id.get(criterion["id"], ()):
                    scored = True
                    weighted_sum += score * criterion["weight"]
                    # Count the weight once per result so numerator and
                    # denominator agree even if a criterion was scored
                    # more than once.
                    total_weight += criterion["weight"]

            if scored:
                dimension_scores[dimension] = weighted_sum / total_weight if total_weight > 0 else 0.0

        return dimension_scores
582
+
583
+
584
class StructuredSummarizer:
    """Generate structured summaries with explicit sections.

    Use when: implementing anchored iterative summarization for
    long-running coding sessions. Maintains a persistent summary
    with dedicated sections for session intent, file modifications,
    decisions, current state, and next steps.

    Call update_from_span() each time a new content span is truncated.
    The summarizer merges new information into existing sections rather
    than regenerating, preventing cumulative detail loss.

    NOTE: the built-in regex extractor only fills the file and decision
    sections. Intent, current state and next steps keep their placeholder
    text unless self.sections is populated by other means.
    """

    TEMPLATE = """## Session Intent
{intent}

## Files Modified
{files_modified}

## Files Read (Not Modified)
{files_read}

## Decisions Made
{decisions}

## Current State
{current_state}

## Next Steps
{next_steps}
"""

    # Verb, path, then an OPTIONAL description on the same line. The
    # separator class excludes newlines so a bare "modified a.py" neither
    # steals the following line as its description nor forces the path to
    # backtrack and lose part of its extension. The leading \b stops verbs
    # matching inside longer words (e.g. "unchanged config.py").
    _MODIFIED_RE = re.compile(
        r"\b(?:modified|changed|updated|fixed)\s+([^\s]+\.[a-z]+)[ \t:]*([^\n]*)",
        re.IGNORECASE,
    )
    _READ_RE = re.compile(
        r"\b(?:read|examined|opened|checked)\s+([^\s]+\.[a-z]+)",
        re.IGNORECASE,
    )
    # "decided to" is listed before "decided" so the captured text does not
    # start with a dangling "to".
    _DECISION_RE = re.compile(
        r"(?:decided to|decided|chose|going with|will use)\s+(.+?)(?:\n|$)",
        re.IGNORECASE,
    )

    def __init__(self) -> None:
        """Start with every section empty."""
        self.sections: Dict = {
            "intent": "",
            "files_modified": [],
            "files_read": [],
            "decisions": [],
            "current_state": "",
            "next_steps": []
        }

    def update_from_span(self, new_content: str) -> str:
        """Update summary from newly truncated content span.

        Use when: a compression trigger fires and a portion of
        conversation history is about to be discarded. Pass the
        content that will be truncated; the summarizer extracts
        structured information and merges it with prior state.

        Args:
            new_content: The conversation span being truncated.

        Returns:
            Formatted summary string; sections with no content show a
            placeholder ("None", "Not specified" or "In progress").
        """
        self._merge_sections(self._extract_from_content(new_content))
        return self._format_summary()

    def _extract_from_content(self, content: str) -> Dict:
        """Extract structured information from content.

        Returns:
            Dict with the same keys as self.sections. Only
            "files_modified", "files_read" and "decisions" are filled.
        """
        extracted: Dict = {
            "intent": "",
            "files_modified": [],
            "files_read": [],
            "decisions": [],
            "current_state": "",
            "next_steps": []
        }

        # File modifications; descriptions are capped at 100 characters.
        for match in self._MODIFIED_RE.finditer(content):
            extracted["files_modified"].append({
                "path": match.group(1),
                "change": match.group(2).strip()[:100]
            })

        # File reads, skipping anything this span also modified.
        modified_here = {entry["path"] for entry in extracted["files_modified"]}
        for match in self._READ_RE.finditer(content):
            file_path = match.group(1)
            if file_path not in modified_here:
                extracted["files_read"].append(file_path)

        # Decisions, capped at 150 characters each.
        for match in self._DECISION_RE.finditer(content):
            extracted["decisions"].append(match.group(1).strip()[:150])

        return extracted

    def _merge_sections(self, new_info: Dict) -> None:
        """Merge new information with existing sections.

        A path appears at most once under files_modified, and never under
        both files_modified and files_read: modification takes precedence
        regardless of which span reported which operation.
        """
        # Update intent if empty
        if new_info["intent"] and not self.sections["intent"]:
            self.sections["intent"] = new_info["intent"]

        modified = self.sections["files_modified"]
        read = self.sections["files_read"]

        # Index kept current while merging, so duplicates inside a single
        # span are caught as well as duplicates across spans.
        modified_by_path = {entry["path"]: entry for entry in modified}
        for file_info in new_info["files_modified"]:
            known = modified_by_path.get(file_info["path"])
            if known is None:
                entry = dict(file_info)  # copy: never alias the caller's dict
                modified.append(entry)
                modified_by_path[entry["path"]] = entry
            elif not known["change"] and file_info["change"]:
                # First description wins, but fill in a missing one.
                known["change"] = file_info["change"]

        # A file read earlier and modified since is no longer "not modified".
        read[:] = [path for path in read if path not in modified_by_path]

        # Merge read files
        for file_path in new_info["files_read"]:
            if file_path not in modified_by_path and file_path not in read:
                read.append(file_path)

        # Append decisions
        self.sections["decisions"].extend(new_info["decisions"])

        # Update current state (latest wins)
        if new_info["current_state"]:
            self.sections["current_state"] = new_info["current_state"]

        # Merge next steps
        self.sections["next_steps"].extend(new_info["next_steps"])

    def _format_summary(self) -> str:
        """Format sections into summary string.

        Only the five most recent decisions and next steps are rendered.
        """
        files_modified_str = "\n".join(
            f"- {f['path']}: {f['change']}" if f["change"] else f"- {f['path']}"
            for f in self.sections["files_modified"]
        ) or "None"

        files_read_str = "\n".join(
            f"- {f}" for f in self.sections["files_read"]
        ) or "None"

        decisions_str = "\n".join(
            f"- {d}" for d in self.sections["decisions"][-5:]  # Keep last 5
        ) or "None"

        next_steps_str = "\n".join(
            f"{i+1}. {s}" for i, s in enumerate(self.sections["next_steps"][-5:])
        ) or "None"

        return self.TEMPLATE.format(
            intent=self.sections["intent"] or "Not specified",
            files_modified=files_modified_str,
            files_read=files_read_str,
            decisions=decisions_str,
            current_state=self.sections["current_state"] or "In progress",
            next_steps=next_steps_str
        )
736
+
737
+
738
def evaluate_compression_quality(
    original_history: str,
    compressed_context: str,
    model_response_fn: Callable[[str, str], str],
    quality_threshold: float = 3.5,
) -> Dict:
    """Evaluate compression quality for a conversation end-to-end.

    Use when: running a one-shot quality check on a compression pass.
    Generates probes from the original history, collects a model response
    for each probe using only the compressed context, evaluates each
    response, and returns the evaluator's summary extended with a
    ``recommendations`` list.

    Args:
        original_history: The full conversation before compression.
        compressed_context: The compressed version to evaluate.
        model_response_fn: Callable that takes (compressed_context, question)
            and returns the model's response string.
        quality_threshold: Average score below which a "less aggressive
            compression" recommendation is added. Defaults to 3.5.

    Returns:
        The dictionary produced by ``CompressionEvaluator.get_summary()``
        with an added ``"recommendations"`` key holding a (possibly empty)
        list of recommendation strings.
    """
    # Generate probes
    probes = ProbeGenerator(original_history).generate_probes()

    # Evaluate each probe against a response produced from the compressed
    # context only.
    evaluator = CompressionEvaluator()
    for probe in probes:
        response = model_response_fn(compressed_context, probe.question)
        evaluator.evaluate(probe, response, compressed_context)

    # Get summary
    summary = evaluator.get_summary()

    # Add recommendations
    recommendations = []

    if summary.get("weakest_dimension") == "artifact_trail":
        recommendations.append(
            "Consider implementing separate artifact tracking outside compression"
        )

    # A summary without "average_score" is treated as a score of 0 and
    # therefore triggers this recommendation for any positive threshold.
    if summary.get("average_score", 0) < quality_threshold:
        recommendations.append(
            "Compression quality is below threshold - consider less aggressive compression"
        )

    summary["recommendations"] = recommendations
    return summary
791
+
792
+
793
if __name__ == "__main__":
    # Demo: generate probes and evaluate a sample compression

    sample_history = """
    User reported error: 401 Unauthorized on /api/auth/login endpoint.
    Examined auth.controller.ts - JWT generation looks correct.
    Examined middleware/cors.ts - no issues found.
    Modified config/redis.ts: Fixed connection pooling configuration.
    Modified services/session.service.ts: Added retry logic for transient failures.
    Decided to use Redis connection pool instead of per-request connections.
    Modified tests/auth.test.ts: Updated mock setup for new config.
    14 tests passing, 2 failing (mock setup issues).
    Next: Fix remaining test failures in session service mocks.
    """

    sample_compressed = """
    ## Session Intent
    Debug 401 Unauthorized on /api/auth/login.

    ## Root Cause
    Stale Redis connection in session store.

    ## Files Modified
    - config/redis.ts: Fixed connection pooling
    - services/session.service.ts: Added retry logic
    - tests/auth.test.ts: Updated mock setup

    ## Test Status
    14 passing, 2 failing

    ## Next Steps
    1. Fix remaining test failures
    """

    # Stub model response function: answers by keyword in the question and
    # ignores the context, so the demo needs no real model.
    def mock_model_response(context: str, question: str) -> str:
        if "error" in question.lower():
            return "The original error was a 401 Unauthorized on /api/auth/login."
        if "files" in question.lower():
            return "Modified config/redis.ts, services/session.service.ts, tests/auth.test.ts."
        if "next" in question.lower():
            return "Fix remaining test failures in session service mocks."
        if "decision" in question.lower():
            return "Decided to use Redis connection pool instead of per-request connections."
        return "No specific information available."

    # Run evaluation
    result = evaluate_compression_quality(
        original_history=sample_history,
        compressed_context=sample_compressed,
        model_response_fn=mock_model_response,
    )

    # Every summary key is read with .get so the demo still prints a report
    # (instead of raising KeyError) if the summary lacks a key.
    print("=== Compression Quality Evaluation ===")
    print(f"Total evaluations: {result.get('total_evaluations', 0)}")
    print(f"Average score: {result.get('average_score', 0.0):.2f}")
    print()
    print("Dimension averages:")
    for dim, score in result.get("dimension_averages", {}).items():
        print(f"  {dim}: {score:.2f}")
    print()
    print(f"Weakest dimension: {result.get('weakest_dimension')}")
    print(f"Strongest dimension: {result.get('strongest_dimension')}")
    print()
    if result.get("recommendations"):
        print("Recommendations:")
        for rec in result["recommendations"]:
            print(f"  - {rec}")
    else:
        print("No recommendations - compression quality looks acceptable.")