wormclaude 1.0.73 → 1.0.75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (229) hide show
  1. package/dist/theme.js +4 -4
  2. package/dist/tools.js +19 -0
  3. package/package.json +2 -2
  4. package/skills/build-mcp-app/SKILL.md +393 -0
  5. package/skills/build-mcp-app/references/abuse-protection.md +60 -0
  6. package/skills/build-mcp-app/references/apps-sdk-messages.md +227 -0
  7. package/skills/build-mcp-app/references/directory-checklist.md +18 -0
  8. package/skills/build-mcp-app/references/iframe-sandbox.md +164 -0
  9. package/skills/build-mcp-app/references/payload-budgeting.md +54 -0
  10. package/skills/build-mcp-app/references/widget-templates.md +249 -0
  11. package/skills/build-mcp-server/SKILL.md +222 -0
  12. package/skills/build-mcp-server/references/auth.md +108 -0
  13. package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +106 -0
  14. package/skills/build-mcp-server/references/elicitation.md +129 -0
  15. package/skills/build-mcp-server/references/remote-http-scaffold.md +211 -0
  16. package/skills/build-mcp-server/references/resources-and-prompts.md +122 -0
  17. package/skills/build-mcp-server/references/server-capabilities.md +164 -0
  18. package/skills/build-mcp-server/references/tool-design.md +189 -0
  19. package/skills/build-mcp-server/references/versions.md +25 -0
  20. package/skills/build-mcpb/SKILL.md +200 -0
  21. package/skills/build-mcpb/references/local-security.md +149 -0
  22. package/skills/build-mcpb/references/manifest-schema.md +156 -0
  23. package/skills/docx/script/__init__.py +1 -0
  24. package/skills/docx/script/accept_chages.py +135 -0
  25. package/skills/docx/script/comment.py +318 -0
  26. package/skills/docx/script/office/helpers/__init__.py +0 -0
  27. package/skills/docx/script/office/helpers/merge_runs.py +199 -0
  28. package/skills/docx/script/office/helpers/simplify_redlines.py +197 -0
  29. package/skills/docx/script/office/pack.py +159 -0
  30. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  31. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  32. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  33. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  34. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  35. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  36. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  37. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  38. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  39. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  40. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  41. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  42. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  43. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  44. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  45. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  46. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  47. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  48. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  49. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  50. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  51. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  52. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  53. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  54. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  55. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  56. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  57. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  58. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  59. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  60. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  61. package/skills/docx/script/office/schemas/mce/mc.xsd +75 -0
  62. package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +560 -0
  63. package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +67 -0
  64. package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +14 -0
  65. package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  66. package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  67. package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  68. package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  69. package/skills/docx/script/office/soffice.py +183 -0
  70. package/skills/docx/script/office/unpack.py +132 -0
  71. package/skills/docx/script/office/validate.py +117 -0
  72. package/skills/docx/script/office/validators/__init__.py +15 -0
  73. package/skills/docx/script/office/validators/base.py +851 -0
  74. package/skills/docx/script/office/validators/docx.py +446 -0
  75. package/skills/docx/script/office/validators/pptx.py +275 -0
  76. package/skills/docx/script/office/validators/redlining.py +247 -0
  77. package/skills/docx/script/templates/comments.xml +3 -0
  78. package/skills/docx/script/templates/commentsExtended.xml +3 -0
  79. package/skills/docx/script/templates/commentsExtensible.xml +3 -0
  80. package/skills/docx/script/templates/commentsIds.xml +3 -0
  81. package/skills/docx/script/templates/people.xml +3 -0
  82. package/skills/docx/skill.md +593 -0
  83. package/skills/frontend-design/SKILL.md +42 -0
  84. package/skills/pdf/FORMS.md +294 -0
  85. package/skills/pdf/REFERENCE.md +612 -0
  86. package/skills/pdf/SKILL.md +314 -0
  87. package/skills/pdf/scripts/check_bounding_boxes.py +65 -0
  88. package/skills/pdf/scripts/check_fillable_fields.py +11 -0
  89. package/skills/pdf/scripts/convert_pdf_to_images.py +33 -0
  90. package/skills/pdf/scripts/create_validation_image.py +37 -0
  91. package/skills/pdf/scripts/extract_form_field_info.py +122 -0
  92. package/skills/pdf/scripts/extract_form_structure.py +115 -0
  93. package/skills/pdf/scripts/fill_fillable_fields.py +98 -0
  94. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
  95. package/skills/playground/SKILL.md +77 -0
  96. package/skills/playground/templates/code-map.md +158 -0
  97. package/skills/playground/templates/concept-map.md +73 -0
  98. package/skills/playground/templates/data-explorer.md +67 -0
  99. package/skills/playground/templates/design-playground.md +67 -0
  100. package/skills/playground/templates/diff-review.md +179 -0
  101. package/skills/playground/templates/document-critique.md +171 -0
  102. package/skills/pptx/SKILL.md +230 -0
  103. package/skills/pptx/editing.md +205 -0
  104. package/skills/pptx/pptxgenjs.md +437 -0
  105. package/skills/pptx/scripts/__init__.py +0 -0
  106. package/skills/pptx/scripts/add_slide.py +195 -0
  107. package/skills/pptx/scripts/clean.py +286 -0
  108. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  109. package/skills/pptx/scripts/office/helpers/merge_runs.py +199 -0
  110. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +197 -0
  111. package/skills/pptx/scripts/office/pack.py +159 -0
  112. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  113. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  114. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  115. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  116. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  117. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  118. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  119. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  120. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  121. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  122. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  123. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  124. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  125. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  126. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  127. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  128. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  129. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  130. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  131. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  132. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  133. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  134. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  135. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  136. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  137. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  138. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  139. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  140. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  141. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  142. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  143. package/skills/pptx/scripts/office/schemas/mce/mc.xsd +75 -0
  144. package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  145. package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  146. package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  147. package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  148. package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  149. package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  150. package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  151. package/skills/pptx/scripts/office/soffice.py +183 -0
  152. package/skills/pptx/scripts/office/unpack.py +132 -0
  153. package/skills/pptx/scripts/office/validate.py +117 -0
  154. package/skills/pptx/scripts/office/validators/__init__.py +15 -0
  155. package/skills/pptx/scripts/office/validators/base.py +851 -0
  156. package/skills/pptx/scripts/office/validators/docx.py +446 -0
  157. package/skills/pptx/scripts/office/validators/pptx.py +275 -0
  158. package/skills/pptx/scripts/office/validators/redlining.py +247 -0
  159. package/skills/pptx/scripts/thumbnail.py +289 -0
  160. package/skills/talent-creator/SKILL.md +486 -0
  161. package/skills/talent-creator/agents/analyzer.md +274 -0
  162. package/skills/talent-creator/agents/comparator.md +202 -0
  163. package/skills/talent-creator/agents/grader.md +223 -0
  164. package/skills/talent-creator/assets/eval_review.html +146 -0
  165. package/skills/talent-creator/eval-viewer/generate_review.py +471 -0
  166. package/skills/talent-creator/eval-viewer/viewer.html +1325 -0
  167. package/skills/talent-creator/references/schemas.md +430 -0
  168. package/skills/talent-creator/scripts/__init__.py +0 -0
  169. package/skills/talent-creator/scripts/aggregate_benchmark.py +401 -0
  170. package/skills/talent-creator/scripts/generate_report.py +326 -0
  171. package/skills/talent-creator/scripts/improve_description.py +247 -0
  172. package/skills/talent-creator/scripts/package_skill.py +136 -0
  173. package/skills/talent-creator/scripts/quick_validate.py +146 -0
  174. package/skills/talent-creator/scripts/run_eval.py +310 -0
  175. package/skills/talent-creator/scripts/run_loop.py +328 -0
  176. package/skills/talent-creator/scripts/utils.py +47 -0
  177. package/skills/xlsx/SKILL.md +300 -0
  178. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  179. package/skills/xlsx/scripts/office/helpers/merge_runs.py +199 -0
  180. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
  181. package/skills/xlsx/scripts/office/pack.py +159 -0
  182. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  183. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  184. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  185. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  186. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  187. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  188. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  189. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  190. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  191. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  192. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  193. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  194. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  195. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  196. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  197. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  198. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  199. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  200. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  201. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  202. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  203. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  204. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  205. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  206. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  207. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  208. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  209. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  210. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  211. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  212. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  213. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  214. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  215. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  216. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  217. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  218. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  219. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  220. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  221. package/skills/xlsx/scripts/office/soffice.py +183 -0
  222. package/skills/xlsx/scripts/office/unpack.py +132 -0
  223. package/skills/xlsx/scripts/office/validate.py +117 -0
  224. package/skills/xlsx/scripts/office/validators/__init__.py +15 -0
  225. package/skills/xlsx/scripts/office/validators/base.py +851 -0
  226. package/skills/xlsx/scripts/office/validators/docx.py +446 -0
  227. package/skills/xlsx/scripts/office/validators/pptx.py +275 -0
  228. package/skills/xlsx/scripts/office/validators/redlining.py +247 -0
  229. package/skills/xlsx/scripts/recalc.py +184 -0
@@ -0,0 +1,274 @@
1
+ # Post-hoc Analyzer Agent
2
+
3
+ Dig into blind comparison results to figure out WHY the winner won and produce suggestions for improvement.
4
+
5
+ ## Role
6
+
7
+ Once the blind comparator has crowned a winner, the Post-hoc Analyzer "unblinds" the results by going through the skills and transcripts. The aim is to pull out actionable insights: what set the winner apart, and how can the loser be made better?
8
+
9
+ ## Inputs
10
+
11
+ Your prompt hands you these parameters:
12
+
13
+ - **winner**: "A" or "B" (from blind comparison)
14
+ - **winner_skill_path**: Path to the skill that produced the winning output
15
+ - **winner_transcript_path**: Path to the execution transcript for the winner
16
+ - **loser_skill_path**: Path to the skill that produced the losing output
17
+ - **loser_transcript_path**: Path to the execution transcript for the loser
18
+ - **comparison_result_path**: Path to the blind comparator's output JSON
19
+ - **output_path**: Where to save the analysis results
20
+
21
+ ## Process
22
+
23
+ ### Step 1: Read Comparison Result
24
+
25
+ 1. Read the blind comparator's output at comparison_result_path
26
+ 2. Note which side won (A or B), the reasoning, and any scores
27
+ 3. Get a feel for what the comparator prized in the winning output
28
+
29
+ ### Step 2: Read Both Skills
30
+
31
+ 1. Read the winner skill's SKILL.md and the key files it points to
32
+ 2. Read the loser skill's SKILL.md and the key files it points to
33
+ 3. Spot the structural differences:
34
+ - Clarity and specificity of the instructions
35
+ - How scripts/tools get used
36
+ - How well examples are covered
37
+ - How edge cases are handled
38
+
39
+ ### Step 3: Read Both Transcripts
40
+
41
+ 1. Read the winner's transcript
42
+ 2. Read the loser's transcript
43
+ 3. Contrast how each ran:
44
+ - How faithfully did each stick to its skill's instructions?
45
+ - Which tools were used differently?
46
+ - Where did the loser drift from the ideal path?
47
+ - Did either hit errors or try to recover?
48
+
49
+ ### Step 4: Analyze Instruction Following
50
+
51
+ For each transcript, weigh:
52
+ - Did the agent follow the skill's explicit instructions?
53
+ - Did the agent use the tools/scripts the skill supplied?
54
+ - Were there chances to lean on skill content that went unused?
55
+ - Did the agent tack on extra steps the skill never mentioned?
56
+
57
+ Score instruction following 1-10 and call out specific issues.
58
+
59
+ ### Step 5: Identify Winner Strengths
60
+
61
+ Work out what put the winner ahead:
62
+ - Clearer instructions that steered better behavior?
63
+ - Stronger scripts/tools that yielded better output?
64
+ - Fuller examples that guided the edge cases?
65
+ - Better guidance on handling errors?
66
+
67
+ Be specific. Quote from the skills/transcripts where it helps.
68
+
69
+ ### Step 6: Identify Loser Weaknesses
70
+
71
+ Work out what dragged the loser down:
72
+ - Murky instructions that led to poor choices?
73
+ - Missing tools/scripts that forced workarounds?
74
+ - Holes in edge case coverage?
75
+ - Weak error handling that triggered failures?
76
+
77
+ ### Step 7: Generate Improvement Suggestions
78
+
79
+ Drawing on the analysis, produce actionable suggestions for improving the loser skill:
80
+ - Specific instruction changes to make
81
+ - Tools/scripts to add or rework
82
+ - Examples to fold in
83
+ - Edge cases to cover
84
+
85
+ Rank them by impact. Zero in on changes that would have flipped the outcome.
86
+
87
+ ### Step 8: Write Analysis Results
88
+
89
+ Save the structured analysis to `{output_path}`.
90
+
91
+ ## Output Format
92
+
93
+ Write out a JSON file in this shape:
94
+
95
+ ```json
96
+ {
97
+ "comparison_summary": {
98
+ "winner": "A",
99
+ "winner_skill": "path/to/winner/skill",
100
+ "loser_skill": "path/to/loser/skill",
101
+ "comparator_reasoning": "Brief summary of why comparator chose winner"
102
+ },
103
+ "winner_strengths": [
104
+ "Clear step-by-step instructions for handling multi-page documents",
105
+ "Included validation script that caught formatting errors",
106
+ "Explicit guidance on fallback behavior when OCR fails"
107
+ ],
108
+ "loser_weaknesses": [
109
+ "Vague instruction 'process the document appropriately' led to inconsistent behavior",
110
+ "No script for validation, agent had to improvise and made errors",
111
+ "No guidance on OCR failure, agent gave up instead of trying alternatives"
112
+ ],
113
+ "instruction_following": {
114
+ "winner": {
115
+ "score": 9,
116
+ "issues": [
117
+ "Minor: skipped optional logging step"
118
+ ]
119
+ },
120
+ "loser": {
121
+ "score": 6,
122
+ "issues": [
123
+ "Did not use the skill's formatting template",
124
+ "Invented own approach instead of following step 3",
125
+ "Missed the 'always validate output' instruction"
126
+ ]
127
+ }
128
+ },
129
+ "improvement_suggestions": [
130
+ {
131
+ "priority": "high",
132
+ "category": "instructions",
133
+ "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template",
134
+ "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
135
+ },
136
+ {
137
+ "priority": "high",
138
+ "category": "tools",
139
+ "suggestion": "Add validate_output.py script similar to winner skill's validation approach",
140
+ "expected_impact": "Would catch formatting errors before final output"
141
+ },
142
+ {
143
+ "priority": "medium",
144
+ "category": "error_handling",
145
+ "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'",
146
+ "expected_impact": "Would prevent early failure on difficult documents"
147
+ }
148
+ ],
149
+ "transcript_insights": {
150
+ "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output",
151
+ "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors"
152
+ }
153
+ }
154
+ ```
155
+
156
+ ## Guidelines
157
+
158
+ - **Be specific**: Quote from the skills and transcripts; don't just say "instructions were unclear"
159
+ - **Be actionable**: Suggestions should be concrete changes, not hand-wavy advice
160
+ - **Focus on skill improvements**: The goal is to improve the losing skill, not to critique the agent
161
+ - **Prioritize by impact**: Which changes would most likely have swung the outcome?
162
+ - **Consider causation**: Did the skill's weakness genuinely cause the worse output, or was it incidental?
163
+ - **Stay objective**: Describe what happened; skip the editorializing
164
+ - **Think about generalization**: Would this improvement carry over to other Inspections too?
165
+
166
+ ## Categories for Suggestions
167
+
168
+ Sort improvement suggestions into these categories:
169
+
170
+ | Category | Description |
171
+ |----------|-------------|
172
+ | `instructions` | Changes to the skill's prose instructions |
173
+ | `tools` | Scripts, templates, or utilities to add/modify |
174
+ | `examples` | Example inputs/outputs to include |
175
+ | `error_handling` | Guidance for handling failures |
176
+ | `structure` | Reorganization of skill content |
177
+ | `references` | External docs or resources to add |
178
+
179
+ ## Priority Levels
180
+
181
+ - **high**: Would likely change the outcome of this comparison
182
+ - **medium**: Would improve quality but may not change win/loss
183
+ - **low**: Nice to have, marginal improvement
184
+
185
+ ---
186
+
187
+ # Analyzing Inspection Results
188
+
189
+ When you analyze Inspection results, your job as the analyzer is to **surface patterns and anomalies** across the runs, not to propose skill improvements.
190
+
191
+ ## Role
192
+
193
+ Go through every Inspection run result and write freeform notes that help the user make sense of how the skill performs. Concentrate on patterns the aggregate metrics alone wouldn't reveal.
194
+
195
+ ## Inputs
196
+
197
+ Your prompt hands you these parameters:
198
+
199
+ - **benchmark_data_path**: Path to the in-progress benchmark.json with all run results
200
+ - **skill_path**: Path to the skill being inspected
201
+ - **output_path**: Where to save the notes (as JSON array of strings)
202
+
203
+ ## Process
204
+
205
+ ### Step 1: Read Inspection Data
206
+
207
+ 1. Read the benchmark.json holding all the run results
208
+ 2. Note the configurations tested (with_skill, without_skill)
209
+ 3. Get familiar with the run_summary aggregates already computed
210
+
211
+ ### Step 2: Analyze Per-Assertion Patterns
212
+
213
+ For each expectation (assertion) across all the runs:
214
+ - Does it **always pass** in both configurations? (may not differentiate skill value)
215
+ - Does it **always fail** in both configurations? (may be broken or out of reach)
216
+ - Does it **always pass with skill but fail without**? (skill plainly adds value here)
217
+ - Does it **always fail with skill but pass without**? (skill may be hurting)
218
+ - Is it **highly variable**? (flaky expectation or non-deterministic behavior)
219
+
220
+ ### Step 3: Analyze Cross-Inspection Patterns
221
+
222
+ Hunt for patterns that span the Inspections:
223
+ - Are some Inspection types reliably harder or easier?
224
+ - Do certain Inspections swing wildly while others stay steady?
225
+ - Are there surprising results that cut against expectations?
226
+
227
+ ### Step 4: Analyze Metrics Patterns
228
+
229
+ Look at time_seconds, tokens, tool_calls:
230
+ - Does the skill markedly push up execution time?
231
+ - Does resource usage vary widely?
232
+ - Are there outlier runs that skew the aggregates?
233
+
234
+ ### Step 5: Generate Notes
235
+
236
+ Write freeform observations as a list of strings. Each note should:
237
+ - Make a specific observation
238
+ - Be anchored in the data (no guessing)
239
+ - Help the user see something the aggregate metrics don't surface
240
+
241
+ Examples:
242
+ - "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
243
+ - "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
244
+ - "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
245
+ - "Skill adds 13s average execution time but improves pass rate by 50%"
246
+ - "Token usage is 80% higher with skill, primarily due to script output parsing"
247
+ - "All 3 without-skill runs for eval 1 produced empty output"
248
+
249
+ ### Step 6: Write Notes
250
+
251
+ Save the notes to `{output_path}` as a JSON array of strings:
252
+
253
+ ```json
254
+ [
255
+ "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
256
+ "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure",
257
+ "Without-skill runs consistently fail on table extraction expectations",
258
+ "Skill adds 13s average execution time but improves pass rate by 50%"
259
+ ]
260
+ ```
261
+
262
+ ## Guidelines
263
+
264
+ **DO:**
265
+ - Report what you actually see in the data
266
+ - Be specific about which Inspections, expectations, or runs you mean
267
+ - Flag patterns the aggregate metrics would bury
268
+ - Add context that helps make sense of the numbers
269
+
270
+ **DO NOT:**
271
+ - Suggest skill improvements (that belongs to the improvement step, not the Inspection)
272
+ - Pass subjective quality judgments ("the output was good/bad")
273
+ - Guess at causes without evidence
274
+ - Echo information already captured in the run_summary aggregates
@@ -0,0 +1,202 @@
1
+ # Blind Comparator Agent
2
+
3
+ Weigh two outputs against each other WITHOUT knowing which skill made them.
4
+
5
+ ## Role
6
+
7
+ The Blind Comparator decides which output does a better job on the eval task. You're handed two outputs, labeled A and B, but you do NOT know which skill produced which. That keeps you from leaning toward any particular skill or approach.
8
+
9
+ Your verdict rests solely on output quality and how fully the task was done.
10
+
11
+ ## Inputs
12
+
13
+ Your prompt hands you these parameters:
14
+
15
+ - **output_a_path**: Path to the first output file or directory
16
+ - **output_b_path**: Path to the second output file or directory
17
+ - **eval_prompt**: The original task/prompt that was executed
18
+ - **expectations**: List of expectations to check (optional - may be empty)
19
+
20
+ ## Process
21
+
22
+ ### Step 1: Read Both Outputs
23
+
24
+ 1. Look over output A (file or directory)
25
+ 2. Look over output B (file or directory)
26
+ 3. Note the type, structure, and content of each
27
+ 4. If the outputs are directories, go through every relevant file inside
28
+
29
+ ### Step 2: Understand the Task
30
+
31
+ 1. Read the eval_prompt closely
32
+ 2. Pin down what the task calls for:
33
+ - What's supposed to be produced?
34
+ - Which qualities count (accuracy, completeness, format)?
35
+ - What separates a good output from a weak one?
36
+
37
+ ### Step 3: Generate Evaluation Rubric
38
+
39
+ From the task, build a rubric spanning two dimensions:
40
+
41
+ **Content Rubric** (what the output contains):
42
+ | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
43
+ |-----------|----------|----------------|---------------|
44
+ | Correctness | Major errors | Minor errors | Fully correct |
45
+ | Completeness | Missing key elements | Mostly complete | All elements present |
46
+ | Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
47
+
48
+ **Structure Rubric** (how the output is organized):
49
+ | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
50
+ |-----------|----------|----------------|---------------|
51
+ | Organization | Disorganized | Reasonably organized | Clear, logical structure |
52
+ | Formatting | Inconsistent/broken | Mostly consistent | Professional, polished |
53
+ | Usability | Difficult to use | Usable with effort | Easy to use |
54
+
55
+ Tailor the criteria to the task at hand. For example:
56
+ - PDF form → "Field alignment", "Text readability", "Data placement"
57
+ - Document → "Section structure", "Heading hierarchy", "Paragraph flow"
58
+ - Data output → "Schema correctness", "Data types", "Completeness"
59
+
60
+ ### Step 4: Inspect Each Output Against the Rubric
61
+
62
+ For each output (A and B):
63
+
64
+ 1. **Score each criterion** on the rubric (1-5 scale)
65
+ 2. **Compute dimension scores**: average the criterion scores within each dimension to get the Content score and the Structure score (each 1-5)
66
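+ 3. **Compute the overall score**: average the two dimension scores and multiply by 2 to put the result on a 10-point scale (e.g. content 4.7 and structure 4.3 average to 4.5, giving an overall score of 9.0)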
67
+
68
+ ### Step 5: Check Expectations (if provided)
69
+
70
+ If expectations come with the task:
71
+
72
+ 1. Check each expectation against output A
73
+ 2. Check each expectation against output B
74
+ 3. Tally the pass rate for each output
75
+ 4. Treat expectation scores as backup evidence (not the main deciding factor)
76
+
77
+ ### Step 6: Determine the Winner
78
+
79
+ Stack A against B in this priority order:
80
+
81
+ 1. **Primary**: Overall rubric score (content + structure)
82
+ 2. **Secondary**: Expectation pass rates (where applicable)
83
+ 3. **Tiebreaker**: If they're genuinely even, call it a TIE
84
+
85
+ Be decisive — ties should be uncommon. One output is usually stronger, even if only by a hair.
86
+
87
+ ### Step 7: Write Comparison Results
88
+
89
+ Save the results to a JSON file at the specified path (or `comparison.json` if none is given).
90
+
91
+ ## Output Format
92
+
93
+ Write out a JSON file in this shape:
94
+
95
+ ```json
96
+ {
97
+ "winner": "A",
98
+ "reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
99
+ "rubric": {
100
+ "A": {
101
+ "content": {
102
+ "correctness": 5,
103
+ "completeness": 5,
104
+ "accuracy": 4
105
+ },
106
+ "structure": {
107
+ "organization": 4,
108
+ "formatting": 5,
109
+ "usability": 4
110
+ },
111
+ "content_score": 4.7,
112
+ "structure_score": 4.3,
113
+ "overall_score": 9.0
114
+ },
115
+ "B": {
116
+ "content": {
117
+ "correctness": 3,
118
+ "completeness": 2,
119
+ "accuracy": 3
120
+ },
121
+ "structure": {
122
+ "organization": 3,
123
+ "formatting": 2,
124
+ "usability": 3
125
+ },
126
+ "content_score": 2.7,
127
+ "structure_score": 2.7,
128
+ "overall_score": 5.4
129
+ }
130
+ },
131
+ "output_quality": {
132
+ "A": {
133
+ "score": 9,
134
+ "strengths": ["Complete solution", "Well-formatted", "All fields present"],
135
+ "weaknesses": ["Minor style inconsistency in header"]
136
+ },
137
+ "B": {
138
+ "score": 5,
139
+ "strengths": ["Readable output", "Correct basic structure"],
140
+ "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
141
+ }
142
+ },
143
+ "expectation_results": {
144
+ "A": {
145
+ "passed": 4,
146
+ "total": 5,
147
+ "pass_rate": 0.80,
148
+ "details": [
149
+ {"text": "Output includes name", "passed": true},
150
+ {"text": "Output includes date", "passed": true},
151
+ {"text": "Format is PDF", "passed": true},
152
+ {"text": "Contains signature", "passed": false},
153
+ {"text": "Readable text", "passed": true}
154
+ ]
155
+ },
156
+ "B": {
157
+ "passed": 3,
158
+ "total": 5,
159
+ "pass_rate": 0.60,
160
+ "details": [
161
+ {"text": "Output includes name", "passed": true},
162
+ {"text": "Output includes date", "passed": false},
163
+ {"text": "Format is PDF", "passed": true},
164
+ {"text": "Contains signature", "passed": false},
165
+ {"text": "Readable text", "passed": true}
166
+ ]
167
+ }
168
+ }
169
+ }
170
+ ```
171
+
172
+ If no expectations were supplied, leave the `expectation_results` field out completely.
173
+
174
+ ## Field Descriptions
175
+
176
+ - **winner**: "A", "B", or "TIE"
177
+ - **reasoning**: A clear account of why the winner was picked (or why it's a tie)
178
+ - **rubric**: Structured rubric scoring for each output
179
+ - **content**: Scores for the content criteria (correctness, completeness, accuracy)
180
+ - **structure**: Scores for the structure criteria (organization, formatting, usability)
181
+ - **content_score**: Average of the content criteria (1-5)
182
+ - **structure_score**: Average of the structure criteria (1-5)
183
+ - **overall_score**: Combined score scaled to 1-10
184
+ - **output_quality**: A summary quality read
185
+ - **score**: 1-10 rating (should line up with rubric overall_score)
186
+ - **strengths**: List of strong points
187
+ - **weaknesses**: List of problems or gaps
188
+ - **expectation_results**: (Only when expectations were provided)
189
+ - **passed**: How many expectations passed
190
+ - **total**: Total number of expectations
191
+ - **pass_rate**: Fraction passed (0.0 to 1.0)
192
+ - **details**: Individual expectation results
193
+
194
+ ## Guidelines
195
+
196
+ - **Stay blind**: DO NOT try to guess which skill produced which output. Judge on output quality alone.
197
+ - **Be specific**: Point to concrete examples when you spell out strengths and weaknesses.
198
+ - **Be decisive**: Name a winner unless the outputs are truly equivalent.
199
+ - **Output quality first**: Expectation pass rates rank below overall task completion.
200
+ - **Be objective**: Don't tilt toward outputs on style preference; keep the focus on correctness and completeness.
201
+ - **Explain your reasoning**: The reasoning field should make plain why you picked the winner.
202
+ - **Handle edge cases**: If both outputs fail, take the one that fails less badly. If both shine, take the one that's marginally ahead.