wormclaude 1.0.119 → 1.0.121
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/theme.js +1 -1
- package/dist/tui.js +6 -1
- package/package.json +1 -1
- package/skills/build-mcp-app/SKILL.md +0 -393
- package/skills/build-mcp-app/references/abuse-protection.md +0 -60
- package/skills/build-mcp-app/references/apps-sdk-messages.md +0 -227
- package/skills/build-mcp-app/references/directory-checklist.md +0 -18
- package/skills/build-mcp-app/references/iframe-sandbox.md +0 -164
- package/skills/build-mcp-app/references/payload-budgeting.md +0 -54
- package/skills/build-mcp-app/references/widget-templates.md +0 -249
- package/skills/build-mcp-server/SKILL.md +0 -222
- package/skills/build-mcp-server/references/auth.md +0 -108
- package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +0 -106
- package/skills/build-mcp-server/references/elicitation.md +0 -129
- package/skills/build-mcp-server/references/remote-http-scaffold.md +0 -211
- package/skills/build-mcp-server/references/resources-and-prompts.md +0 -122
- package/skills/build-mcp-server/references/server-capabilities.md +0 -164
- package/skills/build-mcp-server/references/tool-design.md +0 -189
- package/skills/build-mcp-server/references/versions.md +0 -25
- package/skills/build-mcpb/SKILL.md +0 -200
- package/skills/build-mcpb/references/local-security.md +0 -149
- package/skills/build-mcpb/references/manifest-schema.md +0 -156
- package/skills/docx/script/__init__.py +0 -1
- package/skills/docx/script/accept_chages.py +0 -135
- package/skills/docx/script/comment.py +0 -318
- package/skills/docx/script/office/helpers/__init__.py +0 -0
- package/skills/docx/script/office/helpers/merge_runs.py +0 -199
- package/skills/docx/script/office/helpers/simplify_redlines.py +0 -197
- package/skills/docx/script/office/pack.py +0 -159
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
- package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
- package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
- package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
- package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
- package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
- package/skills/docx/script/office/schemas/mce/mc.xsd +0 -75
- package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +0 -560
- package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +0 -67
- package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +0 -14
- package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
- package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
- package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
- package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
- package/skills/docx/script/office/soffice.py +0 -183
- package/skills/docx/script/office/unpack.py +0 -132
- package/skills/docx/script/office/validate.py +0 -117
- package/skills/docx/script/office/validators/__init__.py +0 -15
- package/skills/docx/script/office/validators/base.py +0 -851
- package/skills/docx/script/office/validators/docx.py +0 -446
- package/skills/docx/script/office/validators/pptx.py +0 -275
- package/skills/docx/script/office/validators/redlining.py +0 -247
- package/skills/docx/script/templates/comments.xml +0 -3
- package/skills/docx/script/templates/commentsExtended.xml +0 -3
- package/skills/docx/script/templates/commentsExtensible.xml +0 -3
- package/skills/docx/script/templates/commentsIds.xml +0 -3
- package/skills/docx/script/templates/people.xml +0 -3
- package/skills/docx/skill.md +0 -593
- package/skills/explain.md +0 -14
- package/skills/frontend-design/SKILL.md +0 -42
- package/skills/pdf/FORMS.md +0 -294
- package/skills/pdf/REFERENCE.md +0 -612
- package/skills/pdf/SKILL.md +0 -314
- package/skills/pdf/scripts/check_bounding_boxes.py +0 -65
- package/skills/pdf/scripts/check_fillable_fields.py +0 -11
- package/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
- package/skills/pdf/scripts/create_validation_image.py +0 -37
- package/skills/pdf/scripts/extract_form_field_info.py +0 -122
- package/skills/pdf/scripts/extract_form_structure.py +0 -115
- package/skills/pdf/scripts/fill_fillable_fields.py +0 -98
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
- package/skills/playground/SKILL.md +0 -77
- package/skills/playground/templates/code-map.md +0 -158
- package/skills/playground/templates/concept-map.md +0 -73
- package/skills/playground/templates/data-explorer.md +0 -67
- package/skills/playground/templates/design-playground.md +0 -67
- package/skills/playground/templates/diff-review.md +0 -179
- package/skills/playground/templates/document-critique.md +0 -171
- package/skills/pptx/SKILL.md +0 -230
- package/skills/pptx/editing.md +0 -205
- package/skills/pptx/pptxgenjs.md +0 -437
- package/skills/pptx/scripts/__init__.py +0 -0
- package/skills/pptx/scripts/add_slide.py +0 -195
- package/skills/pptx/scripts/clean.py +0 -286
- package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
- package/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
- package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
- package/skills/pptx/scripts/office/pack.py +0 -159
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
- package/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
- package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
- package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
- package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
- package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
- package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
- package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
- package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
- package/skills/pptx/scripts/office/soffice.py +0 -183
- package/skills/pptx/scripts/office/unpack.py +0 -132
- package/skills/pptx/scripts/office/validate.py +0 -117
- package/skills/pptx/scripts/office/validators/__init__.py +0 -15
- package/skills/pptx/scripts/office/validators/base.py +0 -851
- package/skills/pptx/scripts/office/validators/docx.py +0 -446
- package/skills/pptx/scripts/office/validators/pptx.py +0 -275
- package/skills/pptx/scripts/office/validators/redlining.py +0 -247
- package/skills/pptx/scripts/thumbnail.py +0 -289
- package/skills/recon.md +0 -16
- package/skills/security-audit/SKILL.md +0 -26
- package/skills/talent-creator/SKILL.md +0 -486
- package/skills/talent-creator/agents/analyzer.md +0 -274
- package/skills/talent-creator/agents/comparator.md +0 -202
- package/skills/talent-creator/agents/grader.md +0 -223
- package/skills/talent-creator/assets/eval_review.html +0 -146
- package/skills/talent-creator/eval-viewer/generate_review.py +0 -471
- package/skills/talent-creator/eval-viewer/viewer.html +0 -1325
- package/skills/talent-creator/references/schemas.md +0 -430
- package/skills/talent-creator/scripts/__init__.py +0 -0
- package/skills/talent-creator/scripts/aggregate_benchmark.py +0 -401
- package/skills/talent-creator/scripts/generate_report.py +0 -326
- package/skills/talent-creator/scripts/improve_description.py +0 -247
- package/skills/talent-creator/scripts/package_skill.py +0 -136
- package/skills/talent-creator/scripts/quick_validate.py +0 -146
- package/skills/talent-creator/scripts/run_eval.py +0 -310
- package/skills/talent-creator/scripts/run_loop.py +0 -328
- package/skills/talent-creator/scripts/utils.py +0 -47
- package/skills/xlsx/SKILL.md +0 -300
- package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
- package/skills/xlsx/scripts/office/pack.py +0 -159
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
- package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
- package/skills/xlsx/scripts/office/soffice.py +0 -183
- package/skills/xlsx/scripts/office/unpack.py +0 -132
- package/skills/xlsx/scripts/office/validate.py +0 -117
- package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
- package/skills/xlsx/scripts/office/validators/base.py +0 -851
- package/skills/xlsx/scripts/office/validators/docx.py +0 -446
- package/skills/xlsx/scripts/office/validators/pptx.py +0 -275
- package/skills/xlsx/scripts/office/validators/redlining.py +0 -247
- package/skills/xlsx/scripts/recalc.py +0 -184
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
# Grader Agent
|
|
2
|
-
|
|
3
|
-
Judge expectations against an execution transcript and its outputs.
|
|
4
|
-
|
|
5
|
-
## Role
|
|
6
|
-
|
|
7
|
-
The Grader goes over a transcript and the output files, then rules each expectation pass or fail. Back every ruling with clear evidence.
|
|
8
|
-
|
|
9
|
-
You wear two hats: grade the outputs, and critique the Inspections themselves. A pass on a flimsy assertion is worse than useless — it breeds false confidence. When you spot an assertion that's trivially met, or an important outcome no assertion checks, say so.
|
|
10
|
-
|
|
11
|
-
## Inputs
|
|
12
|
-
|
|
13
|
-
Your prompt hands you these parameters:
|
|
14
|
-
|
|
15
|
-
- **expectations**: List of expectations to inspect (strings)
|
|
16
|
-
- **transcript_path**: Path to the execution transcript (markdown file)
|
|
17
|
-
- **outputs_dir**: Directory containing output files from execution
|
|
18
|
-
|
|
19
|
-
## Process
|
|
20
|
-
|
|
21
|
-
### Step 1: Read the Transcript
|
|
22
|
-
|
|
23
|
-
1. Read the transcript file end to end
|
|
24
|
-
2. Note the Inspection prompt, the execution steps, and the final result
|
|
25
|
-
3. Pick out any issues or errors that got documented
|
|
26
|
-
|
|
27
|
-
### Step 2: Examine Output Files
|
|
28
|
-
|
|
29
|
-
1. List the files in outputs_dir
|
|
30
|
-
2. Read/examine each file that bears on the expectations. If the outputs aren't plain text, use the inspection tools your prompt provides — don't just take the transcript's word for what the executor produced.
|
|
31
|
-
3. Note the contents, structure, and quality
|
|
32
|
-
|
|
33
|
-
### Step 3: Inspect Each Assertion
|
|
34
|
-
|
|
35
|
-
For each expectation:
|
|
36
|
-
|
|
37
|
-
1. **Hunt for evidence** in the transcript and outputs
|
|
38
|
-
2. **Settle the verdict**:
|
|
39
|
-
- **PASS**: Clear evidence the expectation holds AND that evidence reflects real task completion, not just surface-level compliance
|
|
40
|
-
- **FAIL**: No evidence, or evidence that contradicts the expectation, or evidence that's only skin-deep (e.g., right filename but empty/wrong content)
|
|
41
|
-
3. **Cite the evidence**: Quote the exact text or describe what you found
|
|
42
|
-
|
|
43
|
-
### Step 4: Extract and Verify Claims
|
|
44
|
-
|
|
45
|
-
Beyond the predefined expectations, tease out the implicit claims in the outputs and check them:
|
|
46
|
-
|
|
47
|
-
1. **Pull out claims** from the transcript and outputs:
|
|
48
|
-
- Factual statements ("The form has 12 fields")
|
|
49
|
-
- Process claims ("Used pypdf to fill the form")
|
|
50
|
-
- Quality claims ("All fields were filled correctly")
|
|
51
|
-
|
|
52
|
-
2. **Verify each claim**:
|
|
53
|
-
- **Factual claims**: Can be checked against the outputs or external sources
|
|
54
|
-
- **Process claims**: Can be confirmed from the transcript
|
|
55
|
-
- **Quality claims**: Judge whether the claim is warranted
|
|
56
|
-
|
|
57
|
-
3. **Flag unverifiable claims**: Note claims you can't confirm with what's available
|
|
58
|
-
|
|
59
|
-
This catches problems the predefined expectations might let slip.
|
|
60
|
-
|
|
61
|
-
### Step 5: Read User Notes
|
|
62
|
-
|
|
63
|
-
If `{outputs_dir}/user_notes.md` exists:
|
|
64
|
-
1. Read it and note any uncertainties or issues the executor flagged
|
|
65
|
-
2. Pull the relevant concerns into the grading output
|
|
66
|
-
3. These can expose problems even when the expectations pass
|
|
67
|
-
|
|
68
|
-
### Step 6: Critique the Inspections
|
|
69
|
-
|
|
70
|
-
After grading, mull over whether the Inspections themselves could be sharper. Only raise suggestions when there's a genuine gap.
|
|
71
|
-
|
|
72
|
-
Good suggestions test outcomes that matter — assertions that are hard to satisfy without actually doing the work right. Think about what makes an assertion *discriminating*: it passes when the skill truly succeeds and fails when it doesn't.
|
|
73
|
-
|
|
74
|
-
Suggestions worth raising:
|
|
75
|
-
- An assertion that passed but would also pass for a plainly wrong output (e.g., checking that a filename exists but not its contents)
|
|
76
|
-
- An important outcome you saw — good or bad — that no assertion touches at all
|
|
77
|
-
- An assertion that can't really be verified from the available outputs
|
|
78
|
-
|
|
79
|
-
Hold the bar high. The aim is to flag things the Inspection author would call a "good catch," not to nitpick every assertion.
|
|
80
|
-
|
|
81
|
-
### Step 7: Write Grading Results
|
|
82
|
-
|
|
83
|
-
Save the results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). Complete Step 8 before writing, so the executor metrics and timing data are included in the file.
|
|
84
|
-
|
|
85
|
-
## Grading Criteria
|
|
86
|
-
|
|
87
|
-
**PASS when**:
|
|
88
|
-
- The transcript or outputs clearly show the expectation is true
|
|
89
|
-
- Specific evidence can be cited
|
|
90
|
-
- The evidence carries real substance, not just surface compliance (e.g., a file exists AND holds the correct content, not merely the right filename)
|
|
91
|
-
|
|
92
|
-
**FAIL when**:
|
|
93
|
-
- No evidence turns up for the expectation
|
|
94
|
-
- Evidence contradicts the expectation
|
|
95
|
-
- The expectation can't be verified from what's available
|
|
96
|
-
- The evidence is only skin-deep — the assertion is technically met but the underlying task outcome is wrong or incomplete
|
|
97
|
-
- The output seems to meet the assertion by luck rather than by actually doing the work
|
|
98
|
-
|
|
99
|
-
**When uncertain**: The burden of proof rests on the expectation — if the evidence does not clearly establish it, mark it FAIL.
|
|
100
|
-
|
|
101
|
-
### Step 8: Read Executor Metrics and Timing
|
|
102
|
-
|
|
103
|
-
1. If `{outputs_dir}/metrics.json` exists, read it and fold it into the grading output
|
|
104
|
-
2. If `{outputs_dir}/../timing.json` exists, read it and include the timing data
|
|
105
|
-
|
|
106
|
-
## Output Format
|
|
107
|
-
|
|
108
|
-
Write out a JSON file in this shape:
|
|
109
|
-
|
|
110
|
-
```json
|
|
111
|
-
{
|
|
112
|
-
"expectations": [
|
|
113
|
-
{
|
|
114
|
-
"text": "The output includes the name 'John Smith'",
|
|
115
|
-
"passed": true,
|
|
116
|
-
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
|
|
117
|
-
},
|
|
118
|
-
{
|
|
119
|
-
"text": "The spreadsheet has a SUM formula in cell B10",
|
|
120
|
-
"passed": false,
|
|
121
|
-
"evidence": "No spreadsheet was created. The output was a text file."
|
|
122
|
-
},
|
|
123
|
-
{
|
|
124
|
-
"text": "The assistant used the skill's OCR script",
|
|
125
|
-
"passed": true,
|
|
126
|
-
"evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'"
|
|
127
|
-
}
|
|
128
|
-
],
|
|
129
|
-
"summary": {
|
|
130
|
-
"passed": 2,
|
|
131
|
-
"failed": 1,
|
|
132
|
-
"total": 3,
|
|
133
|
-
"pass_rate": 0.67
|
|
134
|
-
},
|
|
135
|
-
"execution_metrics": {
|
|
136
|
-
"tool_calls": {
|
|
137
|
-
"Read": 5,
|
|
138
|
-
"Write": 2,
|
|
139
|
-
"Bash": 8
|
|
140
|
-
},
|
|
141
|
-
"total_tool_calls": 15,
|
|
142
|
-
"total_steps": 6,
|
|
143
|
-
"errors_encountered": 0,
|
|
144
|
-
"output_chars": 12450,
|
|
145
|
-
"transcript_chars": 3200
|
|
146
|
-
},
|
|
147
|
-
"timing": {
|
|
148
|
-
"executor_duration_seconds": 165.0,
|
|
149
|
-
"grader_duration_seconds": 26.0,
|
|
150
|
-
"total_duration_seconds": 191.0
|
|
151
|
-
},
|
|
152
|
-
"claims": [
|
|
153
|
-
{
|
|
154
|
-
"claim": "The form has 12 fillable fields",
|
|
155
|
-
"type": "factual",
|
|
156
|
-
"verified": true,
|
|
157
|
-
"evidence": "Counted 12 fields in field_info.json"
|
|
158
|
-
},
|
|
159
|
-
{
|
|
160
|
-
"claim": "All required fields were populated",
|
|
161
|
-
"type": "quality",
|
|
162
|
-
"verified": false,
|
|
163
|
-
"evidence": "Reference section was left blank despite data being available"
|
|
164
|
-
}
|
|
165
|
-
],
|
|
166
|
-
"user_notes_summary": {
|
|
167
|
-
"uncertainties": ["Used 2023 data, may be stale"],
|
|
168
|
-
"needs_review": [],
|
|
169
|
-
"workarounds": ["Fell back to text overlay for non-fillable fields"]
|
|
170
|
-
},
|
|
171
|
-
"eval_feedback": {
|
|
172
|
-
"suggestions": [
|
|
173
|
-
{
|
|
174
|
-
"assertion": "The output includes the name 'John Smith'",
|
|
175
|
-
"reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input"
|
|
176
|
-
},
|
|
177
|
-
{
|
|
178
|
-
"reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught"
|
|
179
|
-
}
|
|
180
|
-
],
|
|
181
|
-
"overall": "Assertions check presence but not correctness. Consider adding content verification."
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
```
|
|
185
|
-
|
|
186
|
-
## Field Descriptions
|
|
187
|
-
|
|
188
|
-
- **expectations**: Array of graded expectations
|
|
189
|
-
- **text**: The original expectation text
|
|
190
|
-
- **passed**: Boolean - true when the expectation passes
|
|
191
|
-
- **evidence**: A specific quote or description backing the verdict
|
|
192
|
-
- **summary**: Aggregate statistics
|
|
193
|
-
- **passed**: Number of expectations that passed
|
|
194
|
-
- **failed**: Number of expectations that failed
|
|
195
|
-
- **total**: Total expectations inspected
|
|
196
|
-
- **pass_rate**: Fraction passed (0.0 to 1.0)
|
|
197
|
-
- **execution_metrics**: Carried over from the executor's metrics.json (when available)
|
|
198
|
-
- **output_chars**: Total character count of the output files (a stand-in for tokens)
|
|
199
|
-
- **transcript_chars**: Character count of the transcript
|
|
200
|
-
- **timing**: Wall clock timing from timing.json (when available)
|
|
201
|
-
- **executor_duration_seconds**: Time spent in the executor subagent (`grader_duration_seconds` is the corresponding time spent grading)
|
|
202
|
-
- **total_duration_seconds**: Total elapsed time for the run
|
|
203
|
-
- **claims**: Claims extracted from the output and verified
|
|
204
|
-
- **claim**: The statement under check
|
|
205
|
-
- **type**: "factual", "process", or "quality"
|
|
206
|
-
- **verified**: Boolean - whether the claim holds up
|
|
207
|
-
- **evidence**: Evidence for or against it
|
|
208
|
-
- **user_notes_summary**: Issues the executor flagged
|
|
209
|
-
- **uncertainties**: Things the executor wasn't sure about
|
|
210
|
-
- **needs_review**: Items that need a human's eyes
|
|
211
|
-
- **workarounds**: Workarounds the executor used in spots where the skill didn't behave as expected
|
|
212
|
-
- **eval_feedback**: Improvement suggestions for the Inspections (only when warranted)
|
|
213
|
-
- **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it ties to
|
|
214
|
-
- **overall**: A short assessment — can be "No suggestions, Inspections look solid" when there's nothing to flag
|
|
215
|
-
|
|
216
|
-
## Guidelines
|
|
217
|
-
|
|
218
|
-
- **Be objective**: Ground verdicts in evidence, not assumptions
|
|
219
|
-
- **Be specific**: Quote the exact text that supports your verdict
|
|
220
|
-
- **Be thorough**: Check both the transcript and the output files
|
|
221
|
-
- **Be consistent**: Hold every expectation to the same standard
|
|
222
|
-
- **Explain failures**: Make it clear why the evidence fell short
|
|
223
|
-
- **No partial credit**: Each expectation is pass or fail, never halfway
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="en">
|
|
3
|
-
<head>
|
|
4
|
-
<meta charset="UTF-8">
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
-
<title>Eval Set Review - __SKILL_NAME_PLACEHOLDER__</title>
|
|
7
|
-
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
8
|
-
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
9
|
-
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
|
|
10
|
-
<style>
|
|
11
|
-
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
12
|
-
body { font-family: 'Lora', Georgia, serif; background: #faf9f5; padding: 2rem; color: #141413; }
|
|
13
|
-
h1 { font-family: 'Poppins', sans-serif; margin-bottom: 0.5rem; font-size: 1.5rem; }
|
|
14
|
-
.description { color: #b0aea5; margin-bottom: 1.5rem; font-style: italic; max-width: 900px; }
|
|
15
|
-
.controls { margin-bottom: 1rem; display: flex; gap: 0.5rem; }
|
|
16
|
-
.btn { font-family: 'Poppins', sans-serif; padding: 0.5rem 1rem; border: none; border-radius: 6px; cursor: pointer; font-size: 0.875rem; font-weight: 500; }
|
|
17
|
-
.btn-add { background: #6a9bcc; color: white; }
|
|
18
|
-
.btn-add:hover { background: #5889b8; }
|
|
19
|
-
.btn-export { background: #d97757; color: white; }
|
|
20
|
-
.btn-export:hover { background: #c4613f; }
|
|
21
|
-
table { width: 100%; max-width: 1100px; border-collapse: collapse; background: white; border-radius: 6px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }
|
|
22
|
-
th { font-family: 'Poppins', sans-serif; background: #141413; color: #faf9f5; padding: 0.75rem 1rem; text-align: left; font-size: 0.875rem; }
|
|
23
|
-
td { padding: 0.75rem 1rem; border-bottom: 1px solid #e8e6dc; vertical-align: top; }
|
|
24
|
-
tr:nth-child(even) td { background: #faf9f5; }
|
|
25
|
-
tr:hover td { background: #f3f1ea; }
|
|
26
|
-
.section-header td { background: #e8e6dc; font-family: 'Poppins', sans-serif; font-weight: 500; font-size: 0.8rem; color: #141413; text-transform: uppercase; letter-spacing: 0.05em; }
|
|
27
|
-
.query-input { width: 100%; padding: 0.4rem; border: 1px solid #e8e6dc; border-radius: 4px; font-size: 0.875rem; font-family: 'Lora', Georgia, serif; resize: vertical; min-height: 60px; }
|
|
28
|
-
.query-input:focus { outline: none; border-color: #d97757; box-shadow: 0 0 0 2px rgba(217,119,87,0.15); }
|
|
29
|
-
.toggle { position: relative; display: inline-block; width: 44px; height: 24px; }
|
|
30
|
-
.toggle input { opacity: 0; width: 0; height: 0; }
|
|
31
|
-
.toggle .slider { position: absolute; inset: 0; background: #b0aea5; border-radius: 24px; cursor: pointer; transition: 0.2s; }
|
|
32
|
-
.toggle .slider::before { content: ""; position: absolute; width: 18px; height: 18px; left: 3px; bottom: 3px; background: white; border-radius: 50%; transition: 0.2s; }
|
|
33
|
-
.toggle input:checked + .slider { background: #d97757; }
|
|
34
|
-
.toggle input:checked + .slider::before { transform: translateX(20px); }
|
|
35
|
-
.btn-delete { background: #c44; color: white; padding: 0.3rem 0.6rem; border: none; border-radius: 4px; cursor: pointer; font-size: 0.75rem; font-family: 'Poppins', sans-serif; }
|
|
36
|
-
.btn-delete:hover { background: #a33; }
|
|
37
|
-
.summary { margin-top: 1rem; color: #b0aea5; font-size: 0.875rem; }
|
|
38
|
-
</style>
|
|
39
|
-
</head>
|
|
40
|
-
<body>
|
|
41
|
-
<h1>Eval Set Review: <span id="skill-name">__SKILL_NAME_PLACEHOLDER__</span></h1>
|
|
42
|
-
<p class="description">Current description: <span id="skill-desc">__SKILL_DESCRIPTION_PLACEHOLDER__</span></p>
|
|
43
|
-
|
|
44
|
-
<div class="controls">
|
|
45
|
-
<button class="btn btn-add" onclick="addRow()">+ Add Query</button>
|
|
46
|
-
<button class="btn btn-export" onclick="exportEvalSet()">Export Eval Set</button>
|
|
47
|
-
</div>
|
|
48
|
-
|
|
49
|
-
<table>
|
|
50
|
-
<thead>
|
|
51
|
-
<tr>
|
|
52
|
-
<th style="width:65%">Query</th>
|
|
53
|
-
<th style="width:18%">Should Trigger</th>
|
|
54
|
-
<th style="width:10%">Actions</th>
|
|
55
|
-
</tr>
|
|
56
|
-
</thead>
|
|
57
|
-
<tbody id="eval-body"></tbody>
|
|
58
|
-
</table>
|
|
59
|
-
|
|
60
|
-
<p class="summary" id="summary"></p>
|
|
61
|
-
|
|
62
|
-
<script>
|
|
63
|
-
// Initial eval entries. __EVAL_DATA_PLACEHOLDER__ is a template token, presumably
// replaced with an array literal before this page is opened (TODO confirm against
// the generator). It must be iterable because it is spread into evalItems.
const EVAL_DATA = __EVAL_DATA_PLACEHOLDER__;
|
|
64
|
-
|
|
65
|
-
// Working list that the UI edits. This is a shallow copy: adding or removing rows
// leaves EVAL_DATA untouched, but the entry objects themselves are shared with it.
let evalItems = [...EVAL_DATA];
|
|
66
|
-
|
|
67
|
-
/**
 * Rebuild the table body (#eval-body) from `evalItems`.
 *
 * Should-trigger entries are listed first, then the rest; each group is
 * preceded by a `section-header` row. The inline handlers are given the
 * entry's index in `evalItems` (not its display position), so edits and
 * deletes address the right entry even though rows are reordered for display.
 * Ends by refreshing the summary line.
 */
function render() {
  const body = document.getElementById('eval-body');
  body.innerHTML = '';

  // Remember each entry's position in evalItems before reordering for display.
  const ordered = evalItems.map((item, origIdx) => ({ ...item, origIdx }));
  ordered.sort((a, b) => (b.should_trigger ? 1 : 0) - (a.should_trigger ? 1 : 0));

  let previousGroup = null;
  for (const entry of ordered) {
    const currentGroup = entry.should_trigger ? 'trigger' : 'no-trigger';

    // Emit a heading row whenever the group changes.
    if (currentGroup !== previousGroup) {
      const heading = document.createElement('tr');
      heading.className = 'section-header';
      heading.innerHTML = `<td colspan="3">${entry.should_trigger ? 'Should Trigger' : 'Should NOT Trigger'}</td>`;
      body.appendChild(heading);
      previousGroup = currentGroup;
    }

    const idx = entry.origIdx;
    const row = document.createElement('tr');
    // The query text is escaped because it is interpolated into markup.
    row.innerHTML = `
<td><textarea class="query-input" onchange="updateQuery(${idx}, this.value)">${escapeHtml(entry.query)}</textarea></td>
<td>
<label class="toggle">
<input type="checkbox" ${entry.should_trigger ? 'checked' : ''} onchange="updateTrigger(${idx}, this.checked)">
<span class="slider"></span>
</label>
<span style="margin-left:8px;font-size:0.8rem;color:#b0aea5">${entry.should_trigger ? 'Yes' : 'No'}</span>
</td>
<td><button class="btn-delete" onclick="deleteRow(${idx})">Delete</button></td>
`;
    body.appendChild(row);
  }

  updateSummary();
}
|
|
104
|
-
|
|
105
|
-
/**
 * Escape a string for safe interpolation into HTML markup.
 *
 * Uses the browser's own serializer: the value is assigned as plain text to
 * a detached <div> and read back as markup.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The text with HTML-significant characters escaped.
 */
function escapeHtml(text) {
  const scratch = document.createElement('div');
  scratch.textContent = text;
  const { innerHTML: escaped } = scratch;
  return escaped;
}
|
|
110
|
-
|
|
111
|
-
/**
 * Store edited query text on the entry at `idx` and refresh the summary line.
 * The table itself is not re-rendered.
 *
 * @param {number} idx - Index of the entry in `evalItems`.
 * @param {string} value - New query text.
 */
function updateQuery(idx, value) {
  const entry = evalItems[idx];
  entry.query = value;
  updateSummary();
}
|
|
112
|
-
/**
 * Set the should-trigger flag on the entry at `idx`, then re-render the table
 * so the row is shown under the matching section.
 *
 * @param {number} idx - Index of the entry in `evalItems`.
 * @param {boolean} value - New should-trigger state.
 */
function updateTrigger(idx, value) {
  const entry = evalItems[idx];
  entry.should_trigger = value;
  render();
}
|
|
113
|
-
/**
 * Remove the entry at `idx` from `evalItems` and re-render the table.
 *
 * @param {number} idx - Index of the entry in `evalItems`.
 */
function deleteRow(idx) {
  evalItems.splice(idx, 1);
  render();
}
|
|
114
|
-
|
|
115
|
-
/**
 * Append a blank should-trigger entry, re-render, and focus its textarea.
 *
 * render() lists should-trigger rows first and keeps their relative order, so
 * the new entry is displayed as the last row of the should-trigger group,
 * not as the last row of the table. Its textarea is therefore at display
 * position (number of should-trigger entries - 1).
 */
function addRow() {
  evalItems.push({ query: '', should_trigger: true });
  render();

  const triggerCount = evalItems.filter((item) => item.should_trigger).length;
  const inputs = document.querySelectorAll('.query-input');
  const target = inputs[triggerCount - 1];
  // Guard in case the rendered rows do not match evalItems.
  if (target) {
    target.focus();
  }
}
|
|
121
|
-
|
|
122
|
-
/**
 * Write the entry counts (total, should trigger, should not trigger) into
 * the #summary element.
 */
function updateSummary() {
  let positives = 0;
  let negatives = 0;
  for (const entry of evalItems) {
    if (entry.should_trigger) {
      positives += 1;
    } else {
      negatives += 1;
    }
  }

  const total = evalItems.length;
  document.getElementById('summary').textContent =
    `${total} queries total: ${positives} should trigger, ${negatives} should not trigger`;
}
|
|
128
|
-
|
|
129
|
-
/**
 * Download the current eval set as `eval_set.json`.
 *
 * Entries whose query is empty after trimming are left out; the exported
 * objects carry only the trimmed `query` and `should_trigger`. The download
 * is triggered by clicking a temporary anchor that points at a Blob URL.
 */
function exportEvalSet() {
  const rows = [];
  for (const entry of evalItems) {
    const text = entry.query.trim();
    if (text !== '') {
      rows.push({ query: text, should_trigger: entry.should_trigger });
    }
  }

  const payload = JSON.stringify(rows, null, 2);
  const objectUrl = URL.createObjectURL(new Blob([payload], { type: 'application/json' }));

  // Temporary anchor: attach, click to start the download, detach.
  const link = document.createElement('a');
  link.href = objectUrl;
  link.download = 'eval_set.json';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);

  URL.revokeObjectURL(objectUrl);
}
|
|
142
|
-
|
|
143
|
-
// Initial draw of the table when the script runs.
render();
|
|
144
|
-
</script>
|
|
145
|
-
</body>
|
|
146
|
-
</html>
|