wormclaude 1.0.119 → 1.0.121

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/dist/theme.js +1 -1
  2. package/dist/tui.js +6 -1
  3. package/package.json +1 -1
  4. package/skills/build-mcp-app/SKILL.md +0 -393
  5. package/skills/build-mcp-app/references/abuse-protection.md +0 -60
  6. package/skills/build-mcp-app/references/apps-sdk-messages.md +0 -227
  7. package/skills/build-mcp-app/references/directory-checklist.md +0 -18
  8. package/skills/build-mcp-app/references/iframe-sandbox.md +0 -164
  9. package/skills/build-mcp-app/references/payload-budgeting.md +0 -54
  10. package/skills/build-mcp-app/references/widget-templates.md +0 -249
  11. package/skills/build-mcp-server/SKILL.md +0 -222
  12. package/skills/build-mcp-server/references/auth.md +0 -108
  13. package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +0 -106
  14. package/skills/build-mcp-server/references/elicitation.md +0 -129
  15. package/skills/build-mcp-server/references/remote-http-scaffold.md +0 -211
  16. package/skills/build-mcp-server/references/resources-and-prompts.md +0 -122
  17. package/skills/build-mcp-server/references/server-capabilities.md +0 -164
  18. package/skills/build-mcp-server/references/tool-design.md +0 -189
  19. package/skills/build-mcp-server/references/versions.md +0 -25
  20. package/skills/build-mcpb/SKILL.md +0 -200
  21. package/skills/build-mcpb/references/local-security.md +0 -149
  22. package/skills/build-mcpb/references/manifest-schema.md +0 -156
  23. package/skills/docx/script/__init__.py +0 -1
  24. package/skills/docx/script/accept_chages.py +0 -135
  25. package/skills/docx/script/comment.py +0 -318
  26. package/skills/docx/script/office/helpers/__init__.py +0 -0
  27. package/skills/docx/script/office/helpers/merge_runs.py +0 -199
  28. package/skills/docx/script/office/helpers/simplify_redlines.py +0 -197
  29. package/skills/docx/script/office/pack.py +0 -159
  30. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  31. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  32. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  33. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  34. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  35. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  36. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  37. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  38. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  39. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  40. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  41. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  42. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  43. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  44. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  45. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  46. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  47. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  48. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  49. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  50. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  51. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  52. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  53. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  54. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  55. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  56. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  57. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  58. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  59. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  60. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  61. package/skills/docx/script/office/schemas/mce/mc.xsd +0 -75
  62. package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +0 -560
  63. package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +0 -67
  64. package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +0 -14
  65. package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  66. package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  67. package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  68. package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  69. package/skills/docx/script/office/soffice.py +0 -183
  70. package/skills/docx/script/office/unpack.py +0 -132
  71. package/skills/docx/script/office/validate.py +0 -117
  72. package/skills/docx/script/office/validators/__init__.py +0 -15
  73. package/skills/docx/script/office/validators/base.py +0 -851
  74. package/skills/docx/script/office/validators/docx.py +0 -446
  75. package/skills/docx/script/office/validators/pptx.py +0 -275
  76. package/skills/docx/script/office/validators/redlining.py +0 -247
  77. package/skills/docx/script/templates/comments.xml +0 -3
  78. package/skills/docx/script/templates/commentsExtended.xml +0 -3
  79. package/skills/docx/script/templates/commentsExtensible.xml +0 -3
  80. package/skills/docx/script/templates/commentsIds.xml +0 -3
  81. package/skills/docx/script/templates/people.xml +0 -3
  82. package/skills/docx/skill.md +0 -593
  83. package/skills/explain.md +0 -14
  84. package/skills/frontend-design/SKILL.md +0 -42
  85. package/skills/pdf/FORMS.md +0 -294
  86. package/skills/pdf/REFERENCE.md +0 -612
  87. package/skills/pdf/SKILL.md +0 -314
  88. package/skills/pdf/scripts/check_bounding_boxes.py +0 -65
  89. package/skills/pdf/scripts/check_fillable_fields.py +0 -11
  90. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
  91. package/skills/pdf/scripts/create_validation_image.py +0 -37
  92. package/skills/pdf/scripts/extract_form_field_info.py +0 -122
  93. package/skills/pdf/scripts/extract_form_structure.py +0 -115
  94. package/skills/pdf/scripts/fill_fillable_fields.py +0 -98
  95. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
  96. package/skills/playground/SKILL.md +0 -77
  97. package/skills/playground/templates/code-map.md +0 -158
  98. package/skills/playground/templates/concept-map.md +0 -73
  99. package/skills/playground/templates/data-explorer.md +0 -67
  100. package/skills/playground/templates/design-playground.md +0 -67
  101. package/skills/playground/templates/diff-review.md +0 -179
  102. package/skills/playground/templates/document-critique.md +0 -171
  103. package/skills/pptx/SKILL.md +0 -230
  104. package/skills/pptx/editing.md +0 -205
  105. package/skills/pptx/pptxgenjs.md +0 -437
  106. package/skills/pptx/scripts/__init__.py +0 -0
  107. package/skills/pptx/scripts/add_slide.py +0 -195
  108. package/skills/pptx/scripts/clean.py +0 -286
  109. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  110. package/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
  111. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
  112. package/skills/pptx/scripts/office/pack.py +0 -159
  113. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  114. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  115. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  116. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  117. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  118. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  119. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  120. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  121. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  122. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  123. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  124. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  125. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  126. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  127. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  128. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  129. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  130. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  131. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  132. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  133. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  134. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  135. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  136. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  137. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  138. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  139. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  140. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  141. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  142. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  143. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  144. package/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
  145. package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  146. package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  147. package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  148. package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  149. package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  150. package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  151. package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  152. package/skills/pptx/scripts/office/soffice.py +0 -183
  153. package/skills/pptx/scripts/office/unpack.py +0 -132
  154. package/skills/pptx/scripts/office/validate.py +0 -117
  155. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  156. package/skills/pptx/scripts/office/validators/base.py +0 -851
  157. package/skills/pptx/scripts/office/validators/docx.py +0 -446
  158. package/skills/pptx/scripts/office/validators/pptx.py +0 -275
  159. package/skills/pptx/scripts/office/validators/redlining.py +0 -247
  160. package/skills/pptx/scripts/thumbnail.py +0 -289
  161. package/skills/recon.md +0 -16
  162. package/skills/security-audit/SKILL.md +0 -26
  163. package/skills/talent-creator/SKILL.md +0 -486
  164. package/skills/talent-creator/agents/analyzer.md +0 -274
  165. package/skills/talent-creator/agents/comparator.md +0 -202
  166. package/skills/talent-creator/agents/grader.md +0 -223
  167. package/skills/talent-creator/assets/eval_review.html +0 -146
  168. package/skills/talent-creator/eval-viewer/generate_review.py +0 -471
  169. package/skills/talent-creator/eval-viewer/viewer.html +0 -1325
  170. package/skills/talent-creator/references/schemas.md +0 -430
  171. package/skills/talent-creator/scripts/__init__.py +0 -0
  172. package/skills/talent-creator/scripts/aggregate_benchmark.py +0 -401
  173. package/skills/talent-creator/scripts/generate_report.py +0 -326
  174. package/skills/talent-creator/scripts/improve_description.py +0 -247
  175. package/skills/talent-creator/scripts/package_skill.py +0 -136
  176. package/skills/talent-creator/scripts/quick_validate.py +0 -146
  177. package/skills/talent-creator/scripts/run_eval.py +0 -310
  178. package/skills/talent-creator/scripts/run_loop.py +0 -328
  179. package/skills/talent-creator/scripts/utils.py +0 -47
  180. package/skills/xlsx/SKILL.md +0 -300
  181. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  182. package/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
  183. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
  184. package/skills/xlsx/scripts/office/pack.py +0 -159
  185. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  186. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  187. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  188. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  189. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  190. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  191. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  192. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  193. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  194. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  195. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  196. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  197. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  198. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  199. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  200. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  201. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  202. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  203. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  204. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  205. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  206. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  207. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  208. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  209. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  210. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  211. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  212. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  213. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  214. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  215. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  216. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
  217. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  218. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  219. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  220. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  221. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  222. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  223. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  224. package/skills/xlsx/scripts/office/soffice.py +0 -183
  225. package/skills/xlsx/scripts/office/unpack.py +0 -132
  226. package/skills/xlsx/scripts/office/validate.py +0 -117
  227. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  228. package/skills/xlsx/scripts/office/validators/base.py +0 -851
  229. package/skills/xlsx/scripts/office/validators/docx.py +0 -446
  230. package/skills/xlsx/scripts/office/validators/pptx.py +0 -275
  231. package/skills/xlsx/scripts/office/validators/redlining.py +0 -247
  232. package/skills/xlsx/scripts/recalc.py +0 -184
@@ -1,401 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Aggregate individual run results into benchmark summary statistics.
4
-
5
- Reads grading.json files from run directories and produces:
6
- - run_summary with mean, stddev, min, max for each metric
7
- - delta between with_skill and without_skill configurations
8
-
9
- Usage:
10
- python aggregate_benchmark.py <benchmark_dir>
11
-
12
- Example:
13
- python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
14
-
15
- The script supports two directory layouts:
16
-
17
- Workspace layout (from skill-creator iterations):
18
- <benchmark_dir>/
19
- └── eval-N/
20
- ├── with_skill/
21
- │ ├── run-1/grading.json
22
- │ └── run-2/grading.json
23
- └── without_skill/
24
- ├── run-1/grading.json
25
- └── run-2/grading.json
26
-
27
- Legacy layout (with runs/ subdirectory):
28
- <benchmark_dir>/
29
- └── runs/
30
- └── eval-N/
31
- ├── with_skill/
32
- │ └── run-1/grading.json
33
- └── without_skill/
34
- └── run-1/grading.json
35
- """
36
-
37
- import argparse
38
- import json
39
- import math
40
- import sys
41
- from datetime import datetime, timezone
42
- from pathlib import Path
43
-
44
-
45
def calculate_stats(values: list[float]) -> dict:
    """Summarise a list of numbers as mean, sample stddev, min and max.

    An empty list yields all-zero statistics. The standard deviation uses the
    ``n - 1`` denominator and is reported as 0.0 when only one value is given.
    Every figure is rounded to four decimal places.
    """
    if not values:
        return dict.fromkeys(("mean", "stddev", "min", "max"), 0.0)

    count = len(values)
    average = sum(values) / count

    # A single observation has no spread to measure.
    spread = 0.0
    if count > 1:
        squared_gaps = [(value - average) ** 2 for value in values]
        spread = math.sqrt(sum(squared_gaps) / (count - 1))

    figures = (
        ("mean", average),
        ("stddev", spread),
        ("min", min(values)),
        ("max", max(values)),
    )
    return {label: round(figure, 4) for label, figure in figures}
65
-
66
-
67
def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Eval directories (``eval-*``) are looked up under ``<benchmark_dir>/runs``
    when that path exists, otherwise directly under ``benchmark_dir``. Every
    sub-directory of an eval directory that contains ``run-*`` entries is
    treated as a configuration.

    Args:
        benchmark_dir: Root directory of the benchmark.

    Returns:
        Dict keyed by config name (e.g. "with_skill"/"without_skill", or
        "new_skill"/"old_skill"), each containing a list of run result dicts
        with the keys ``eval_id``, ``run_number``, ``pass_rate``, ``passed``,
        ``failed``, ``total``, ``time_seconds``, ``tokens``, ``tool_calls``,
        ``errors``, ``expectations`` and ``notes``. An empty dict is returned
        when no eval directories are found.

    Unreadable or malformed ``grading.json`` files are reported with a warning
    and skipped instead of aborting the whole aggregation.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif any(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    required_fields = ("text", "passed", "evidence")
    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        # The glob also matches plain files (e.g. "eval-notes.txt"); iterating
        # one of those would raise NotADirectoryError.
        if not eval_dir.is_dir():
            continue

        eval_id = _resolve_eval_id(eval_dir, eval_idx)

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            run_dirs = sorted(config_dir.glob("run-*"))
            if not run_dirs:
                continue
            config_runs = results.setdefault(config_dir.name, [])

            for position, run_dir in enumerate(run_dirs, start=1):
                # Fall back to the 1-based position for names such as
                # "run-final" that carry no numeric suffix.
                run_number = _numeric_suffix(run_dir.name, position)
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    grading = _read_json(grading_file)
                except OSError as e:
                    print(f"Warning: Could not read {grading_file}: {e}")
                    continue
                except ValueError as e:
                    # json.JSONDecodeError and UnicodeDecodeError both derive
                    # from ValueError.
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue
                if not isinstance(grading, dict):
                    print(f"Warning: Expected a JSON object in {grading_file}")
                    continue

                # Extract metrics
                summary = _as_dict(grading.get("summary"))
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = _as_dict(grading.get("timing"))
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    try:
                        timing_data = _as_dict(_read_json(timing_file))
                    except (OSError, ValueError) as e:
                        # Timing is optional: report the problem, keep the run.
                        print(f"Warning: Could not read {timing_file}: {e}")
                    else:
                        result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                        result["tokens"] = timing_data.get("total_tokens", 0)

                # Extract metrics if available
                metrics = _as_dict(grading.get("execution_metrics"))
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    # No token count available: output size is used as a stand-in.
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations") or []
                for exp in raw_expectations:
                    if not isinstance(exp, dict) or any(field not in exp for field in required_fields):
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Extract notes from user_notes_summary
                notes_summary = _as_dict(grading.get("user_notes_summary"))
                notes = []
                for section in ("uncertainties", "needs_review", "workarounds"):
                    notes.extend(notes_summary.get(section) or [])
                result["notes"] = notes

                config_runs.append(result)

    return results


def _read_json(path: Path):
    """Parse *path* as UTF-8 encoded JSON and return the decoded value."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _as_dict(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _numeric_suffix(name: str, fallback: int) -> int:
    """Return the integer after the first '-' in *name*, or *fallback*."""
    try:
        return int(name.split("-")[1])
    except (IndexError, ValueError):
        return fallback


def _resolve_eval_id(eval_dir: Path, eval_idx: int):
    """Pick the eval id from eval_metadata.json, the directory name, or *eval_idx*."""
    metadata_path = eval_dir / "eval_metadata.json"
    if not metadata_path.exists():
        return _numeric_suffix(eval_dir.name, eval_idx)
    try:
        metadata = _read_json(metadata_path)
    except (OSError, ValueError):
        return eval_idx
    return _as_dict(metadata).get("eval_id", eval_idx)
174
-
175
-
176
def aggregate_results(results: dict) -> dict:
    """
    Condense per-run results into summary statistics per configuration.

    For every configuration in ``results`` the pass rate, duration and token
    figures of its runs are summarised with ``calculate_stats``. A "delta"
    entry holds the signed, pre-formatted difference between the means of the
    first and the second configuration; a missing configuration counts as 0.
    """
    config_names = list(results.keys())
    run_summary = {}

    for name in config_names:
        entries = results.get(name, [])

        if entries:
            samples = {
                "pass_rate": [entry["pass_rate"] for entry in entries],
                "time_seconds": [entry["time_seconds"] for entry in entries],
                "tokens": [entry.get("tokens", 0) for entry in entries],
            }
            run_summary[name] = {
                metric: calculate_stats(series) for metric, series in samples.items()
            }
        else:
            # Configuration without any usable run: report flat zeros.
            run_summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    # The first configuration is compared against the second one.
    primary = run_summary.get(config_names[0], {}) if config_names else {}
    baseline = run_summary.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def mean_gap(metric: str):
        return primary.get(metric, {}).get("mean", 0) - baseline.get(metric, {}).get("mean", 0)

    run_summary["delta"] = {
        "pass_rate": format(mean_gap("pass_rate"), "+.2f"),
        "time_seconds": format(mean_gap("time_seconds"), "+.1f"),
        "tokens": format(mean_gap("tokens"), "+.0f"),
    }

    return run_summary
225
-
226
-
227
def generate_benchmark(
    benchmark_dir: Path,
    skill_name: str = "",
    skill_path: str = "",
    runs_per_configuration: "int | None" = None,
) -> dict:
    """
    Generate complete benchmark.json content from run results.

    Args:
        benchmark_dir: Directory passed on to ``load_run_results``.
        skill_name: Name recorded in the metadata; a placeholder is used
            when empty.
        skill_path: Path recorded in the metadata; a placeholder is used
            when empty.
        runs_per_configuration: Number of runs per configuration to record.
            When omitted, the value is derived from the loaded data as the
            largest number of runs found for any (configuration, eval) pair.
            It was previously hard-coded to 3 regardless of the data.

    Returns:
        Dict with the keys ``metadata``, ``runs``, ``run_summary`` and
        ``notes`` (the latter left empty for the analyzer to fill in).
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    runs_per_pair: dict = {}
    for config, config_runs in results.items():
        for result in config_runs:
            pair = (config, result["eval_id"])
            runs_per_pair[pair] = runs_per_pair.get(pair, 0) + 1
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0),
                },
                "expectations": result["expectations"],
                "notes": result["notes"],
            })

    if runs_per_configuration is None:
        runs_per_configuration = max(runs_per_pair.values(), default=0)

    # Determine eval IDs from results. Ids read from eval_metadata.json may be
    # strings while fallback ids are integers, so sort numbers first and
    # everything else by its text form instead of comparing mixed types.
    def _eval_sort_key(eval_id):
        if isinstance(eval_id, (int, float)):
            return (0, eval_id, "")
        return (1, 0, str(eval_id))

    eval_ids = sorted({eval_id for _, eval_id in runs_per_pair}, key=_eval_sort_key)

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": runs_per_configuration,
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": [],  # To be filled by analyzer
    }

    return benchmark
279
-
280
-
281
def generate_markdown(benchmark: dict) -> str:
    """Render the benchmark data as a human-readable Markdown report.

    The report has a title, model/date/evals header lines, a summary table
    comparing the first two configurations of ``run_summary`` and, when the
    benchmark carries notes, a bulleted "Notes" section. Missing
    configurations are shown under placeholder names with zero values.
    """
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Configuration names, excluding the synthetic "delta" entry and padded
    # with placeholders so that two columns can always be rendered.
    names = [key for key in run_summary if key != "delta"]
    names.extend(["config_a", "config_b"][len(names):])
    first, second = names[0], names[1]

    def heading(name: str) -> str:
        return name.replace("_", " ").title()

    evals_text = ", ".join(str(eval_id) for eval_id in metadata["evals_run"])
    lines = [
        "# Skill Benchmark: {}".format(metadata["skill_name"]),
        "",
        "**Model**: {}".format(metadata["executor_model"]),
        "**Date**: {}".format(metadata["timestamp"]),
        "**Evals**: {} ({} runs each per configuration)".format(
            evals_text, metadata["runs_per_configuration"]
        ),
        "",
        "## Summary",
        "",
        "| Metric | {} | {} | Delta |".format(heading(first), heading(second)),
        "|--------|------------|---------------|-------|",
    ]

    left = run_summary.get(first, {})
    right = run_summary.get(second, {})
    delta = run_summary.get("delta", {})

    def cell(summary: dict, metric: str, spec: str, unit: str = "", scale=None) -> str:
        # Formats "mean ± stddev" for one configuration and metric.
        stats = summary.get(metric, {})
        mean, stddev = stats.get("mean", 0), stats.get("stddev", 0)
        if scale is not None:
            mean, stddev = mean * scale, stddev * scale
        return f"{mean:{spec}}{unit} ± {stddev:{spec}}{unit}"

    table = [
        (
            "Pass Rate",
            cell(left, "pass_rate", ".0f", "%", 100),
            cell(right, "pass_rate", ".0f", "%", 100),
            delta.get("pass_rate", "—"),
        ),
        (
            "Time",
            cell(left, "time_seconds", ".1f", "s"),
            cell(right, "time_seconds", ".1f", "s"),
            "{}s".format(delta.get("time_seconds", "—")),
        ),
        (
            "Tokens",
            cell(left, "tokens", ".0f"),
            cell(right, "tokens", ".0f"),
            delta.get("tokens", "—"),
        ),
    ]
    for label, left_cell, right_cell, change in table:
        lines.append(f"| {label} | {left_cell} | {right_cell} | {change} |")

    # Notes section
    notes = benchmark.get("notes")
    if notes:
        lines += ["", "## Notes", ""]
        lines += [f"- {note}" for note in notes]

    return "\n".join(lines)
336
-
337
-
338
def main():
    """Command-line entry point.

    Parses the arguments, builds the benchmark from ``benchmark_dir``, writes
    it as JSON plus a Markdown report next to it, and prints a short pass-rate
    summary. Exits with status 1 when ``benchmark_dir`` is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    # A regular file would pass an exists() check and only fail later on.
    if not args.benchmark_dir.is_dir():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")
    if output_md == output_json:
        # "-o report.md" would otherwise make the Markdown report overwrite
        # the JSON file that was just written.
        output_md = output_json.with_name(output_json.name + ".md")
    output_json.parent.mkdir(parents=True, exist_ok=True)

    # Write benchmark.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md (contains non-ASCII characters such as "±" and "—",
    # so the encoding must not depend on the platform default)
    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f"  {label}: {pr*100:.1f}% pass rate")
    print(f"  Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()