wormclaude 1.0.119 → 1.0.121

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232)
  1. package/dist/theme.js +1 -1
  2. package/dist/tui.js +6 -1
  3. package/package.json +1 -1
  4. package/skills/build-mcp-app/SKILL.md +0 -393
  5. package/skills/build-mcp-app/references/abuse-protection.md +0 -60
  6. package/skills/build-mcp-app/references/apps-sdk-messages.md +0 -227
  7. package/skills/build-mcp-app/references/directory-checklist.md +0 -18
  8. package/skills/build-mcp-app/references/iframe-sandbox.md +0 -164
  9. package/skills/build-mcp-app/references/payload-budgeting.md +0 -54
  10. package/skills/build-mcp-app/references/widget-templates.md +0 -249
  11. package/skills/build-mcp-server/SKILL.md +0 -222
  12. package/skills/build-mcp-server/references/auth.md +0 -108
  13. package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +0 -106
  14. package/skills/build-mcp-server/references/elicitation.md +0 -129
  15. package/skills/build-mcp-server/references/remote-http-scaffold.md +0 -211
  16. package/skills/build-mcp-server/references/resources-and-prompts.md +0 -122
  17. package/skills/build-mcp-server/references/server-capabilities.md +0 -164
  18. package/skills/build-mcp-server/references/tool-design.md +0 -189
  19. package/skills/build-mcp-server/references/versions.md +0 -25
  20. package/skills/build-mcpb/SKILL.md +0 -200
  21. package/skills/build-mcpb/references/local-security.md +0 -149
  22. package/skills/build-mcpb/references/manifest-schema.md +0 -156
  23. package/skills/docx/script/__init__.py +0 -1
  24. package/skills/docx/script/accept_chages.py +0 -135
  25. package/skills/docx/script/comment.py +0 -318
  26. package/skills/docx/script/office/helpers/__init__.py +0 -0
  27. package/skills/docx/script/office/helpers/merge_runs.py +0 -199
  28. package/skills/docx/script/office/helpers/simplify_redlines.py +0 -197
  29. package/skills/docx/script/office/pack.py +0 -159
  30. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  31. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  32. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  33. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  34. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  35. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  36. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  37. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  38. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  39. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  40. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  41. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  42. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  43. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  44. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  45. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  46. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  47. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  48. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  49. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  50. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  51. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  52. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  53. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  54. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  55. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  56. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  57. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  58. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  59. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  60. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  61. package/skills/docx/script/office/schemas/mce/mc.xsd +0 -75
  62. package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +0 -560
  63. package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +0 -67
  64. package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +0 -14
  65. package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  66. package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  67. package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  68. package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  69. package/skills/docx/script/office/soffice.py +0 -183
  70. package/skills/docx/script/office/unpack.py +0 -132
  71. package/skills/docx/script/office/validate.py +0 -117
  72. package/skills/docx/script/office/validators/__init__.py +0 -15
  73. package/skills/docx/script/office/validators/base.py +0 -851
  74. package/skills/docx/script/office/validators/docx.py +0 -446
  75. package/skills/docx/script/office/validators/pptx.py +0 -275
  76. package/skills/docx/script/office/validators/redlining.py +0 -247
  77. package/skills/docx/script/templates/comments.xml +0 -3
  78. package/skills/docx/script/templates/commentsExtended.xml +0 -3
  79. package/skills/docx/script/templates/commentsExtensible.xml +0 -3
  80. package/skills/docx/script/templates/commentsIds.xml +0 -3
  81. package/skills/docx/script/templates/people.xml +0 -3
  82. package/skills/docx/skill.md +0 -593
  83. package/skills/explain.md +0 -14
  84. package/skills/frontend-design/SKILL.md +0 -42
  85. package/skills/pdf/FORMS.md +0 -294
  86. package/skills/pdf/REFERENCE.md +0 -612
  87. package/skills/pdf/SKILL.md +0 -314
  88. package/skills/pdf/scripts/check_bounding_boxes.py +0 -65
  89. package/skills/pdf/scripts/check_fillable_fields.py +0 -11
  90. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
  91. package/skills/pdf/scripts/create_validation_image.py +0 -37
  92. package/skills/pdf/scripts/extract_form_field_info.py +0 -122
  93. package/skills/pdf/scripts/extract_form_structure.py +0 -115
  94. package/skills/pdf/scripts/fill_fillable_fields.py +0 -98
  95. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
  96. package/skills/playground/SKILL.md +0 -77
  97. package/skills/playground/templates/code-map.md +0 -158
  98. package/skills/playground/templates/concept-map.md +0 -73
  99. package/skills/playground/templates/data-explorer.md +0 -67
  100. package/skills/playground/templates/design-playground.md +0 -67
  101. package/skills/playground/templates/diff-review.md +0 -179
  102. package/skills/playground/templates/document-critique.md +0 -171
  103. package/skills/pptx/SKILL.md +0 -230
  104. package/skills/pptx/editing.md +0 -205
  105. package/skills/pptx/pptxgenjs.md +0 -437
  106. package/skills/pptx/scripts/__init__.py +0 -0
  107. package/skills/pptx/scripts/add_slide.py +0 -195
  108. package/skills/pptx/scripts/clean.py +0 -286
  109. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  110. package/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
  111. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
  112. package/skills/pptx/scripts/office/pack.py +0 -159
  113. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  114. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  115. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  116. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  117. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  118. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  119. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  120. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  121. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  122. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  123. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  124. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  125. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  126. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  127. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  128. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  129. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  130. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  131. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  132. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  133. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  134. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  135. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  136. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  137. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  138. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  139. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  140. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  141. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  142. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  143. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  144. package/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
  145. package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  146. package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  147. package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  148. package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  149. package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  150. package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  151. package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  152. package/skills/pptx/scripts/office/soffice.py +0 -183
  153. package/skills/pptx/scripts/office/unpack.py +0 -132
  154. package/skills/pptx/scripts/office/validate.py +0 -117
  155. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  156. package/skills/pptx/scripts/office/validators/base.py +0 -851
  157. package/skills/pptx/scripts/office/validators/docx.py +0 -446
  158. package/skills/pptx/scripts/office/validators/pptx.py +0 -275
  159. package/skills/pptx/scripts/office/validators/redlining.py +0 -247
  160. package/skills/pptx/scripts/thumbnail.py +0 -289
  161. package/skills/recon.md +0 -16
  162. package/skills/security-audit/SKILL.md +0 -26
  163. package/skills/talent-creator/SKILL.md +0 -486
  164. package/skills/talent-creator/agents/analyzer.md +0 -274
  165. package/skills/talent-creator/agents/comparator.md +0 -202
  166. package/skills/talent-creator/agents/grader.md +0 -223
  167. package/skills/talent-creator/assets/eval_review.html +0 -146
  168. package/skills/talent-creator/eval-viewer/generate_review.py +0 -471
  169. package/skills/talent-creator/eval-viewer/viewer.html +0 -1325
  170. package/skills/talent-creator/references/schemas.md +0 -430
  171. package/skills/talent-creator/scripts/__init__.py +0 -0
  172. package/skills/talent-creator/scripts/aggregate_benchmark.py +0 -401
  173. package/skills/talent-creator/scripts/generate_report.py +0 -326
  174. package/skills/talent-creator/scripts/improve_description.py +0 -247
  175. package/skills/talent-creator/scripts/package_skill.py +0 -136
  176. package/skills/talent-creator/scripts/quick_validate.py +0 -146
  177. package/skills/talent-creator/scripts/run_eval.py +0 -310
  178. package/skills/talent-creator/scripts/run_loop.py +0 -328
  179. package/skills/talent-creator/scripts/utils.py +0 -47
  180. package/skills/xlsx/SKILL.md +0 -300
  181. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  182. package/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
  183. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
  184. package/skills/xlsx/scripts/office/pack.py +0 -159
  185. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  186. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  187. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  188. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  189. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  190. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  191. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  192. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  193. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  194. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  195. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  196. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  197. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  198. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  199. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  200. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  201. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  202. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  203. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  204. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  205. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  206. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  207. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  208. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  209. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  210. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  211. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  212. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  213. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  214. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  215. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  216. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
  217. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  218. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  219. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  220. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  221. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  222. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  223. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  224. package/skills/xlsx/scripts/office/soffice.py +0 -183
  225. package/skills/xlsx/scripts/office/unpack.py +0 -132
  226. package/skills/xlsx/scripts/office/validate.py +0 -117
  227. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  228. package/skills/xlsx/scripts/office/validators/base.py +0 -851
  229. package/skills/xlsx/scripts/office/validators/docx.py +0 -446
  230. package/skills/xlsx/scripts/office/validators/pptx.py +0 -275
  231. package/skills/xlsx/scripts/office/validators/redlining.py +0 -247
  232. package/skills/xlsx/scripts/recalc.py +0 -184
@@ -1,223 +0,0 @@
1
- # Grader Agent
2
-
3
- Judge expectations against an execution transcript and its outputs.
4
-
5
- ## Role
6
-
7
- The Grader goes over a transcript and the output files, then rules each expectation pass or fail. Back every ruling with clear evidence.
8
-
9
- You wear two hats: grade the outputs, and critique the Inspections themselves. A pass on a flimsy assertion is worse than useless — it breeds false confidence. When you spot an assertion that's trivially met, or an important outcome no assertion checks, say so.
10
-
11
- ## Inputs
12
-
13
- Your prompt hands you these parameters:
14
-
15
- - **expectations**: List of expectations to inspect (strings)
16
- - **transcript_path**: Path to the execution transcript (markdown file)
17
- - **outputs_dir**: Directory containing output files from execution
18
-
19
- ## Process
20
-
21
- ### Step 1: Read the Transcript
22
-
23
- 1. Read the transcript file end to end
24
- 2. Note the Inspection prompt, the execution steps, and the final result
25
- 3. Pick out any issues or errors that got documented
26
-
27
- ### Step 2: Examine Output Files
28
-
29
- 1. List the files in outputs_dir
30
- 2. Read/examine each file that bears on the expectations. If the outputs aren't plain text, use the inspection tools your prompt provides — don't just take the transcript's word for what the executor produced.
31
- 3. Note the contents, structure, and quality
32
-
33
- ### Step 3: Inspect Each Assertion
34
-
35
- For each expectation:
36
-
37
- 1. **Hunt for evidence** in the transcript and outputs
38
- 2. **Settle the verdict**:
39
- - **PASS**: Clear evidence the expectation holds AND that evidence reflects real task completion, not just surface-level compliance
40
- - **FAIL**: No evidence, or evidence that contradicts the expectation, or evidence that's only skin-deep (e.g., right filename but empty/wrong content)
41
- 3. **Cite the evidence**: Quote the exact text or describe what you found
42
-
43
- ### Step 4: Extract and Verify Claims
44
-
45
- Beyond the predefined expectations, tease out the implicit claims in the outputs and check them:
46
-
47
- 1. **Pull out claims** from the transcript and outputs:
48
- - Factual statements ("The form has 12 fields")
49
- - Process claims ("Used pypdf to fill the form")
50
- - Quality claims ("All fields were filled correctly")
51
-
52
- 2. **Verify each claim**:
53
- - **Factual claims**: Can be checked against the outputs or external sources
54
- - **Process claims**: Can be confirmed from the transcript
55
- - **Quality claims**: Judge whether the claim is warranted
56
-
57
- 3. **Flag unverifiable claims**: Note claims you can't confirm with what's available
58
-
59
- This catches problems the predefined expectations might let slip.
60
-
61
- ### Step 5: Read User Notes
62
-
63
- If `{outputs_dir}/user_notes.md` exists:
64
- 1. Read it and note any uncertainties or issues the executor flagged
65
- 2. Pull the relevant concerns into the grading output
66
- 3. These can expose problems even when the expectations pass
67
-
68
- ### Step 6: Critique the Inspections
69
-
70
- After grading, mull over whether the Inspections themselves could be sharper. Only raise suggestions when there's a genuine gap.
71
-
72
- Good suggestions test outcomes that matter — assertions that are hard to satisfy without actually doing the work right. Think about what makes an assertion *discriminating*: it passes when the skill truly succeeds and fails when it doesn't.
73
-
74
- Suggestions worth raising:
75
- - An assertion that passed but would also pass for a plainly wrong output (e.g., checking that a filename exists but not its contents)
76
- - An important outcome you saw — good or bad — that no assertion touches at all
77
- - An assertion that can't really be verified from the available outputs
78
-
79
- Hold the bar high. The aim is to flag things the Inspection author would call a "good catch," not to nitpick every assertion.
80
-
81
- ### Step 7: Write Grading Results
82
-
83
- Save the results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
84
-
85
- ## Grading Criteria
86
-
87
- **PASS when**:
88
- - The transcript or outputs clearly show the expectation is true
89
- - Specific evidence can be cited
90
- - The evidence carries real substance, not just surface compliance (e.g., a file exists AND holds the correct content, not merely the right filename)
91
-
92
- **FAIL when**:
93
- - No evidence turns up for the expectation
94
- - Evidence contradicts the expectation
95
- - The expectation can't be verified from what's available
96
- - The evidence is only skin-deep — the assertion is technically met but the underlying task outcome is wrong or incomplete
97
- - The output seems to meet the assertion by luck rather than by actually doing the work
98
-
99
- **When uncertain**: The burden of proof to pass rests on the expectation.
100
-
101
- ### Step 8: Read Executor Metrics and Timing
102
-
103
- 1. If `{outputs_dir}/metrics.json` exists, read it and fold it into the grading output
104
- 2. If `{outputs_dir}/../timing.json` exists, read it and include the timing data
105
-
106
- ## Output Format
107
-
108
- Write out a JSON file in this shape:
109
-
110
- ```json
111
- {
112
- "expectations": [
113
- {
114
- "text": "The output includes the name 'John Smith'",
115
- "passed": true,
116
- "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
117
- },
118
- {
119
- "text": "The spreadsheet has a SUM formula in cell B10",
120
- "passed": false,
121
- "evidence": "No spreadsheet was created. The output was a text file."
122
- },
123
- {
124
- "text": "The assistant used the skill's OCR script",
125
- "passed": true,
126
- "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'"
127
- }
128
- ],
129
- "summary": {
130
- "passed": 2,
131
- "failed": 1,
132
- "total": 3,
133
- "pass_rate": 0.67
134
- },
135
- "execution_metrics": {
136
- "tool_calls": {
137
- "Read": 5,
138
- "Write": 2,
139
- "Bash": 8
140
- },
141
- "total_tool_calls": 15,
142
- "total_steps": 6,
143
- "errors_encountered": 0,
144
- "output_chars": 12450,
145
- "transcript_chars": 3200
146
- },
147
- "timing": {
148
- "executor_duration_seconds": 165.0,
149
- "grader_duration_seconds": 26.0,
150
- "total_duration_seconds": 191.0
151
- },
152
- "claims": [
153
- {
154
- "claim": "The form has 12 fillable fields",
155
- "type": "factual",
156
- "verified": true,
157
- "evidence": "Counted 12 fields in field_info.json"
158
- },
159
- {
160
- "claim": "All required fields were populated",
161
- "type": "quality",
162
- "verified": false,
163
- "evidence": "Reference section was left blank despite data being available"
164
- }
165
- ],
166
- "user_notes_summary": {
167
- "uncertainties": ["Used 2023 data, may be stale"],
168
- "needs_review": [],
169
- "workarounds": ["Fell back to text overlay for non-fillable fields"]
170
- },
171
- "eval_feedback": {
172
- "suggestions": [
173
- {
174
- "assertion": "The output includes the name 'John Smith'",
175
- "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input"
176
- },
177
- {
178
- "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught"
179
- }
180
- ],
181
- "overall": "Assertions check presence but not correctness. Consider adding content verification."
182
- }
183
- }
184
- ```
185
-
186
- ## Field Descriptions
187
-
188
- - **expectations**: Array of graded expectations
189
- - **text**: The original expectation text
190
- - **passed**: Boolean - true when the expectation passes
191
- - **evidence**: A specific quote or description backing the verdict
192
- - **summary**: Aggregate statistics
193
- - **passed**: Number of expectations that passed
194
- - **failed**: Number of expectations that failed
195
- - **total**: Total expectations inspected
196
- - **pass_rate**: Fraction passed (0.0 to 1.0)
197
- - **execution_metrics**: Carried over from the executor's metrics.json (when available)
198
- - **output_chars**: Total character count of the output files (a stand-in for tokens)
199
- - **transcript_chars**: Character count of the transcript
200
- - **timing**: Wall clock timing from timing.json (when available)
201
- - **executor_duration_seconds**: Time spent in the executor subagent
202
- - **total_duration_seconds**: Total elapsed time for the run
203
- - **claims**: Claims extracted from the output and verified
204
- - **claim**: The statement under check
205
- - **type**: "factual", "process", or "quality"
206
- - **verified**: Boolean - whether the claim holds up
207
- - **evidence**: Evidence for or against it
208
- - **user_notes_summary**: Issues the executor flagged
209
- - **uncertainties**: Things the executor wasn't sure about
210
- - **needs_review**: Items that need a human's eyes
211
- - **workarounds**: Spots where the skill didn't behave as expected
212
- - **eval_feedback**: Improvement suggestions for the Inspections (only when warranted)
213
- - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it ties to
214
- - **overall**: A short assessment — can be "No suggestions, Inspections look solid" when there's nothing to flag
215
-
216
- ## Guidelines
217
-
218
- - **Be objective**: Ground verdicts in evidence, not assumptions
219
- - **Be specific**: Quote the exact text that supports your verdict
220
- - **Be thorough**: Check both the transcript and the output files
221
- - **Be consistent**: Hold every expectation to the same standard
222
- - **Explain failures**: Make it clear why the evidence fell short
223
- - **No partial credit**: Each expectation is pass or fail, never halfway
@@ -1,146 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Eval Set Review - __SKILL_NAME_PLACEHOLDER__</title>
7
- <link rel="preconnect" href="https://fonts.googleapis.com">
8
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
- <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
10
- <style>
11
- * { box-sizing: border-box; margin: 0; padding: 0; }
12
- body { font-family: 'Lora', Georgia, serif; background: #faf9f5; padding: 2rem; color: #141413; }
13
- h1 { font-family: 'Poppins', sans-serif; margin-bottom: 0.5rem; font-size: 1.5rem; }
14
- .description { color: #b0aea5; margin-bottom: 1.5rem; font-style: italic; max-width: 900px; }
15
- .controls { margin-bottom: 1rem; display: flex; gap: 0.5rem; }
16
- .btn { font-family: 'Poppins', sans-serif; padding: 0.5rem 1rem; border: none; border-radius: 6px; cursor: pointer; font-size: 0.875rem; font-weight: 500; }
17
- .btn-add { background: #6a9bcc; color: white; }
18
- .btn-add:hover { background: #5889b8; }
19
- .btn-export { background: #d97757; color: white; }
20
- .btn-export:hover { background: #c4613f; }
21
- table { width: 100%; max-width: 1100px; border-collapse: collapse; background: white; border-radius: 6px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }
22
- th { font-family: 'Poppins', sans-serif; background: #141413; color: #faf9f5; padding: 0.75rem 1rem; text-align: left; font-size: 0.875rem; }
23
- td { padding: 0.75rem 1rem; border-bottom: 1px solid #e8e6dc; vertical-align: top; }
24
- tr:nth-child(even) td { background: #faf9f5; }
25
- tr:hover td { background: #f3f1ea; }
26
- .section-header td { background: #e8e6dc; font-family: 'Poppins', sans-serif; font-weight: 500; font-size: 0.8rem; color: #141413; text-transform: uppercase; letter-spacing: 0.05em; }
27
- .query-input { width: 100%; padding: 0.4rem; border: 1px solid #e8e6dc; border-radius: 4px; font-size: 0.875rem; font-family: 'Lora', Georgia, serif; resize: vertical; min-height: 60px; }
28
- .query-input:focus { outline: none; border-color: #d97757; box-shadow: 0 0 0 2px rgba(217,119,87,0.15); }
29
- .toggle { position: relative; display: inline-block; width: 44px; height: 24px; }
30
- .toggle input { opacity: 0; width: 0; height: 0; }
31
- .toggle .slider { position: absolute; inset: 0; background: #b0aea5; border-radius: 24px; cursor: pointer; transition: 0.2s; }
32
- .toggle .slider::before { content: ""; position: absolute; width: 18px; height: 18px; left: 3px; bottom: 3px; background: white; border-radius: 50%; transition: 0.2s; }
33
- .toggle input:checked + .slider { background: #d97757; }
34
- .toggle input:checked + .slider::before { transform: translateX(20px); }
35
- .btn-delete { background: #c44; color: white; padding: 0.3rem 0.6rem; border: none; border-radius: 4px; cursor: pointer; font-size: 0.75rem; font-family: 'Poppins', sans-serif; }
36
- .btn-delete:hover { background: #a33; }
37
- .summary { margin-top: 1rem; color: #b0aea5; font-size: 0.875rem; }
38
- </style>
39
- </head>
40
- <body>
41
- <h1>Eval Set Review: <span id="skill-name">__SKILL_NAME_PLACEHOLDER__</span></h1>
42
- <p class="description">Current description: <span id="skill-desc">__SKILL_DESCRIPTION_PLACEHOLDER__</span></p>
43
-
44
- <div class="controls">
45
- <button class="btn btn-add" onclick="addRow()">+ Add Query</button>
46
- <button class="btn btn-export" onclick="exportEvalSet()">Export Eval Set</button>
47
- </div>
48
-
49
- <table>
50
- <thead>
51
- <tr>
52
- <th style="width:65%">Query</th>
53
- <th style="width:18%">Should Trigger</th>
54
- <th style="width:10%">Actions</th>
55
- </tr>
56
- </thead>
57
- <tbody id="eval-body"></tbody>
58
- </table>
59
-
60
- <p class="summary" id="summary"></p>
61
-
62
- <script>
63
- const EVAL_DATA = __EVAL_DATA_PLACEHOLDER__;
64
-
65
- let evalItems = [...EVAL_DATA];
66
-
67
- function render() {
68
- const tbody = document.getElementById('eval-body');
69
- tbody.innerHTML = '';
70
-
71
- // Sort: should-trigger first, then should-not-trigger
72
- const sorted = evalItems
73
- .map((item, origIdx) => ({ ...item, origIdx }))
74
- .sort((a, b) => (b.should_trigger ? 1 : 0) - (a.should_trigger ? 1 : 0));
75
-
76
- let lastGroup = null;
77
- sorted.forEach(item => {
78
- const group = item.should_trigger ? 'trigger' : 'no-trigger';
79
- if (group !== lastGroup) {
80
- const headerRow = document.createElement('tr');
81
- headerRow.className = 'section-header';
82
- headerRow.innerHTML = `<td colspan="3">${item.should_trigger ? 'Should Trigger' : 'Should NOT Trigger'}</td>`;
83
- tbody.appendChild(headerRow);
84
- lastGroup = group;
85
- }
86
-
87
- const idx = item.origIdx;
88
- const tr = document.createElement('tr');
89
- tr.innerHTML = `
90
- <td><textarea class="query-input" onchange="updateQuery(${idx}, this.value)">${escapeHtml(item.query)}</textarea></td>
91
- <td>
92
- <label class="toggle">
93
- <input type="checkbox" ${item.should_trigger ? 'checked' : ''} onchange="updateTrigger(${idx}, this.checked)">
94
- <span class="slider"></span>
95
- </label>
96
- <span style="margin-left:8px;font-size:0.8rem;color:#b0aea5">${item.should_trigger ? 'Yes' : 'No'}</span>
97
- </td>
98
- <td><button class="btn-delete" onclick="deleteRow(${idx})">Delete</button></td>
99
- `;
100
- tbody.appendChild(tr);
101
- });
102
- updateSummary();
103
- }
104
-
105
- function escapeHtml(text) {
106
- const div = document.createElement('div');
107
- div.textContent = text;
108
- return div.innerHTML;
109
- }
110
-
111
- function updateQuery(idx, value) { evalItems[idx].query = value; updateSummary(); }
112
- function updateTrigger(idx, value) { evalItems[idx].should_trigger = value; render(); }
113
- function deleteRow(idx) { evalItems.splice(idx, 1); render(); }
114
-
115
- function addRow() {
116
- evalItems.push({ query: '', should_trigger: true });
117
- render();
118
- const inputs = document.querySelectorAll('.query-input');
119
- inputs[inputs.length - 1].focus();
120
- }
121
-
122
- function updateSummary() {
123
- const trigger = evalItems.filter(i => i.should_trigger).length;
124
- const noTrigger = evalItems.filter(i => !i.should_trigger).length;
125
- document.getElementById('summary').textContent =
126
- `${evalItems.length} queries total: ${trigger} should trigger, ${noTrigger} should not trigger`;
127
- }
128
-
129
- function exportEvalSet() {
130
- const valid = evalItems.filter(i => i.query.trim() !== '');
131
- const data = valid.map(i => ({ query: i.query.trim(), should_trigger: i.should_trigger }));
132
- const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
133
- const url = URL.createObjectURL(blob);
134
- const a = document.createElement('a');
135
- a.href = url;
136
- a.download = 'eval_set.json';
137
- document.body.appendChild(a);
138
- a.click();
139
- document.body.removeChild(a);
140
- URL.revokeObjectURL(url);
141
- }
142
-
143
- render();
144
- </script>
145
- </body>
146
- </html>