@cleocode/cleo 2026.3.20 → 2026.3.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. package/dist/cli/index.js +46156 -42192
  2. package/dist/cli/index.js.map +4 -4
  3. package/dist/mcp/index.js +37601 -36260
  4. package/dist/mcp/index.js.map +4 -4
  5. package/drizzle-brain.config.ts +7 -0
  6. package/drizzle-nexus.config.ts +7 -0
  7. package/drizzle-tasks.config.ts +7 -0
  8. package/migrations/drizzle-brain/20260301230215_workable_spitfire/migration.sql +68 -0
  9. package/migrations/drizzle-brain/20260301230215_workable_spitfire/snapshot.json +651 -0
  10. package/migrations/drizzle-brain/20260302050325_unknown_justin_hammer/migration.sql +23 -0
  11. package/migrations/drizzle-brain/20260302050325_unknown_justin_hammer/snapshot.json +884 -0
  12. package/migrations/drizzle-brain/20260302061755_unusual_jamie_braddock/migration.sql +2 -0
  13. package/migrations/drizzle-brain/20260302061755_unusual_jamie_braddock/snapshot.json +908 -0
  14. package/migrations/drizzle-brain/20260302193548_luxuriant_glorian/migration.sql +20 -0
  15. package/migrations/drizzle-brain/20260302193548_luxuriant_glorian/snapshot.json +1078 -0
  16. package/migrations/drizzle-brain/20260304045002_white_thunderbolt_ross/migration.sql +16 -0
  17. package/migrations/drizzle-brain/20260304045002_white_thunderbolt_ross/snapshot.json +1233 -0
  18. package/migrations/drizzle-nexus/20260305070805_quick_ted_forrester/migration.sql +46 -0
  19. package/migrations/drizzle-nexus/20260305070805_quick_ted_forrester/snapshot.json +461 -0
  20. package/migrations/drizzle-tasks/20260308024513_oval_king_bedlam/migration.sql +32 -0
  21. package/migrations/drizzle-tasks/20260308024513_oval_king_bedlam/snapshot.json +3727 -0
  22. package/package.json +22 -5
  23. package/packages/ct-skills/skills/ct-cleo/SKILL.md +344 -81
  24. package/packages/ct-skills/skills/ct-grade/SKILL.md +20 -4
  25. package/packages/ct-skills/skills/ct-grade/agents/analysis-reporter.md +203 -0
  26. package/packages/ct-skills/skills/ct-grade/agents/blind-comparator.md +157 -0
  27. package/packages/ct-skills/skills/ct-grade/agents/scenario-runner.md +134 -0
  28. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  29. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  30. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  31. package/packages/ct-skills/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  32. package/packages/ct-skills/skills/ct-grade/eval-viewer/viewer.html +219 -0
  33. package/packages/ct-skills/skills/ct-grade/evals/evals.json +94 -0
  34. package/packages/ct-skills/skills/ct-grade/references/ab-test-methodology.md +150 -0
  35. package/packages/ct-skills/skills/ct-grade/references/domains.md +137 -0
  36. package/packages/ct-skills/skills/ct-grade/references/grade-spec.md +236 -0
  37. package/packages/ct-skills/skills/ct-grade/references/scenario-playbook.md +234 -0
  38. package/packages/ct-skills/skills/ct-grade/references/token-tracking.md +120 -0
  39. package/packages/ct-skills/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  40. package/packages/ct-skills/skills/ct-grade/scripts/generate_report.py +283 -0
  41. package/packages/ct-skills/skills/ct-grade/scripts/run_ab_test.py +504 -0
  42. package/packages/ct-skills/skills/ct-grade/scripts/run_all.py +287 -0
  43. package/packages/ct-skills/skills/ct-grade/scripts/setup_run.py +183 -0
  44. package/packages/ct-skills/skills/ct-grade/scripts/token_tracker.py +630 -0
  45. package/packages/ct-skills/skills/ct-grade-v2-1/SKILL.md +237 -0
  46. package/packages/ct-skills/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  47. package/packages/ct-skills/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  48. package/packages/ct-skills/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  49. package/packages/ct-skills/skills/ct-grade-v2-1/evals/evals.json +74 -0
  50. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  51. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  52. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  53. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  54. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  55. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  56. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  57. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  58. package/packages/ct-skills/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  59. package/packages/ct-skills/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  60. package/packages/ct-skills/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  61. package/packages/ct-skills/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  62. package/packages/ct-skills/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  63. package/packages/ct-skills/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  64. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  65. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  66. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  67. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  68. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  69. package/packages/ct-skills/skills/ct-orchestrator/SKILL.md +1 -29
  70. package/packages/ct-skills/skills/ct-orchestrator/manifest-entry.json +19 -0
  71. package/packages/ct-skills/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  72. package/packages/ct-skills/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  73. package/packages/ct-skills/skills/ct-skill-creator/SKILL.md +0 -12
  74. package/packages/ct-skills/skills/ct-skill-creator/agents/analyzer.md +276 -0
  75. package/packages/ct-skills/skills/ct-skill-creator/agents/comparator.md +204 -0
  76. package/packages/ct-skills/skills/ct-skill-creator/agents/grader.md +225 -0
  77. package/packages/ct-skills/skills/ct-skill-creator/assets/eval_review.html +146 -0
  78. package/packages/ct-skills/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  79. package/packages/ct-skills/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  80. package/packages/ct-skills/skills/ct-skill-creator/manifest-entry.json +17 -0
  81. package/packages/ct-skills/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  82. package/packages/ct-skills/skills/ct-skill-creator/references/frontmatter.md +83 -0
  83. package/packages/ct-skills/skills/ct-skill-creator/references/invocation-control.md +165 -0
  84. package/packages/ct-skills/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  85. package/packages/ct-skills/skills/ct-skill-creator/references/schemas.md +430 -0
  86. package/packages/ct-skills/skills/ct-skill-creator/scripts/__init__.py +1 -0
  87. package/packages/ct-skills/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  88. package/packages/ct-skills/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  89. package/packages/ct-skills/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  90. package/packages/ct-skills/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  91. package/packages/ct-skills/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  92. package/packages/ct-skills/skills/ct-skill-creator/scripts/utils.py +47 -0
  93. package/packages/ct-skills/skills/ct-skill-validator/SKILL.md +178 -0
  94. package/packages/ct-skills/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  95. package/packages/ct-skills/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  96. package/packages/ct-skills/skills/ct-skill-validator/evals/eval_set.json +14 -0
  97. package/packages/ct-skills/skills/ct-skill-validator/evals/evals.json +52 -0
  98. package/packages/ct-skills/skills/ct-skill-validator/manifest-entry.json +20 -0
  99. package/packages/ct-skills/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  100. package/packages/ct-skills/skills/ct-skill-validator/references/validation-rules.md +168 -0
  101. package/packages/ct-skills/skills/ct-skill-validator/scripts/__init__.py +0 -0
  102. package/packages/ct-skills/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  103. package/packages/ct-skills/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  104. package/packages/ct-skills/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  105. package/packages/ct-skills/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  106. package/packages/ct-skills/skills/ct-skill-validator/scripts/validate.py +422 -0
  107. /package/{drizzle → migrations/drizzle-tasks}/20260224040019_baseline/migration.sql +0 -0
  108. /package/{drizzle → migrations/drizzle-tasks}/20260224040019_baseline/snapshot.json +0 -0
  109. /package/{drizzle → migrations/drizzle-tasks}/20260224040238_add-audit-log/migration.sql +0 -0
  110. /package/{drizzle → migrations/drizzle-tasks}/20260224040238_add-audit-log/snapshot.json +0 -0
  111. /package/{drizzle → migrations/drizzle-tasks}/20260224144602_closed_grim_reaper/migration.sql +0 -0
  112. /package/{drizzle → migrations/drizzle-tasks}/20260224144602_closed_grim_reaper/snapshot.json +0 -0
  113. /package/{drizzle → migrations/drizzle-tasks}/20260225024442_sync-lifecycle-enums-and-arch-decisions/migration.sql +0 -0
  114. /package/{drizzle → migrations/drizzle-tasks}/20260225024442_sync-lifecycle-enums-and-arch-decisions/snapshot.json +0 -0
  115. /package/{drizzle → migrations/drizzle-tasks}/20260227014821_adr-system-and-status-registry/migration.sql +0 -0
  116. /package/{drizzle → migrations/drizzle-tasks}/20260227014821_adr-system-and-status-registry/snapshot.json +0 -0
  117. /package/{drizzle → migrations/drizzle-tasks}/20260227021231_add-cancelled-pipeline-status/migration.sql +0 -0
  118. /package/{drizzle → migrations/drizzle-tasks}/20260227021231_add-cancelled-pipeline-status/snapshot.json +0 -0
  119. /package/{drizzle → migrations/drizzle-tasks}/20260227022417_adr-cognitive-search-fields/migration.sql +0 -0
  120. /package/{drizzle → migrations/drizzle-tasks}/20260227022417_adr-cognitive-search-fields/snapshot.json +0 -0
  121. /package/{drizzle → migrations/drizzle-tasks}/20260227172236_freezing_grey_gargoyle/migration.sql +0 -0
  122. /package/{drizzle → migrations/drizzle-tasks}/20260227172236_freezing_grey_gargoyle/snapshot.json +0 -0
  123. /package/{drizzle → migrations/drizzle-tasks}/20260227183444_fix-orphaned-parent-ids/migration.sql +0 -0
  124. /package/{drizzle → migrations/drizzle-tasks}/20260227183444_fix-orphaned-parent-ids/snapshot.json +0 -0
  125. /package/{drizzle → migrations/drizzle-tasks}/20260227183521_parent-id-on-delete-set-null/migration.sql +0 -0
  126. /package/{drizzle → migrations/drizzle-tasks}/20260227183521_parent-id-on-delete-set-null/snapshot.json +0 -0
  127. /package/{drizzle → migrations/drizzle-tasks}/20260227200430_numerous_mysterio/migration.sql +0 -0
  128. /package/{drizzle → migrations/drizzle-tasks}/20260227200430_numerous_mysterio/snapshot.json +0 -0
  129. /package/{drizzle → migrations/drizzle-tasks}/20260227235745_add-audit-log-dispatch-columns/migration.sql +0 -0
  130. /package/{drizzle → migrations/drizzle-tasks}/20260227235745_add-audit-log-dispatch-columns/snapshot.json +0 -0
  131. /package/{drizzle → migrations/drizzle-tasks}/20260301053344_careless_changeling/migration.sql +0 -0
  132. /package/{drizzle → migrations/drizzle-tasks}/20260301053344_careless_changeling/snapshot.json +0 -0
  133. /package/{drizzle → migrations/drizzle-tasks}/20260301175940_futuristic_eternity/migration.sql +0 -0
  134. /package/{drizzle → migrations/drizzle-tasks}/20260301175940_futuristic_eternity/snapshot.json +0 -0
  135. /package/{drizzle → migrations/drizzle-tasks}/20260301180528_update-task-relations-check-constraint/migration.sql +0 -0
  136. /package/{drizzle → migrations/drizzle-tasks}/20260301180528_update-task-relations-check-constraint/snapshot.json +0 -0
  137. /package/{drizzle → migrations/drizzle-tasks}/20260302163443_free_silk_fever/migration.sql +0 -0
  138. /package/{drizzle → migrations/drizzle-tasks}/20260302163443_free_silk_fever/snapshot.json +0 -0
  139. /package/{drizzle → migrations/drizzle-tasks}/20260302163457_robust_johnny_storm/migration.sql +0 -0
  140. /package/{drizzle → migrations/drizzle-tasks}/20260302163457_robust_johnny_storm/snapshot.json +0 -0
  141. /package/{drizzle → migrations/drizzle-tasks}/20260302163511_late_sphinx/migration.sql +0 -0
  142. /package/{drizzle → migrations/drizzle-tasks}/20260302163511_late_sphinx/snapshot.json +0 -0
  143. /package/{drizzle → migrations/drizzle-tasks}/20260305011924_cheerful_mongu/migration.sql +0 -0
  144. /package/{drizzle → migrations/drizzle-tasks}/20260305011924_cheerful_mongu/snapshot.json +0 -0
  145. /package/{drizzle → migrations/drizzle-tasks}/20260305203927_demonic_storm/migration.sql +0 -0
  146. /package/{drizzle → migrations/drizzle-tasks}/20260305203927_demonic_storm/snapshot.json +0 -0
  147. /package/{drizzle → migrations/drizzle-tasks}/20260306001243_spooky_rage/migration.sql +0 -0
  148. /package/{drizzle → migrations/drizzle-tasks}/20260306001243_spooky_rage/snapshot.json +0 -0
  149. /package/{drizzle → migrations/drizzle-tasks}/20260306193138_young_morbius/migration.sql +0 -0
  150. /package/{drizzle → migrations/drizzle-tasks}/20260306193138_young_morbius/snapshot.json +0 -0
  151. /package/{drizzle → migrations/drizzle-tasks}/20260306194959_sticky_captain_flint/migration.sql +0 -0
  152. /package/{drizzle → migrations/drizzle-tasks}/20260306194959_sticky_captain_flint/snapshot.json +0 -0
@@ -0,0 +1,204 @@
1
+ # Blind Comparator Agent
2
+
3
+ Compare two outputs WITHOUT knowing which skill produced them.
4
+
5
+ ## Role
6
+
7
+ The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach.
8
+
9
+ Your judgment is based purely on output quality and task completion.
10
+
11
+ ## Inputs
12
+
13
+ You receive these parameters in your prompt:
14
+
15
+ - **output_a_path**: Path to the first output file or directory
16
+ - **output_b_path**: Path to the second output file or directory
17
+ - **eval_prompt**: The original task/prompt that was executed
18
+ - **expectations**: List of expectations to check (optional - may be empty)
19
+
20
+ ## Process
21
+
22
+ ### Step 1: Read Both Outputs
23
+
24
+ 1. Examine output A (file or directory)
25
+ 2. Examine output B (file or directory)
26
+ 3. Note the type, structure, and content of each
27
+ 4. If outputs are directories, examine all relevant files inside
28
+
29
+ ### Step 2: Understand the Task
30
+
31
+ 1. Read the eval_prompt carefully
32
+ 2. Identify what the task requires:
33
+ - What should be produced?
34
+ - What qualities matter (accuracy, completeness, format)?
35
+ - What would distinguish a good output from a poor one?
36
+
37
+ ### Step 3: Generate Evaluation Rubric
38
+
39
+ Based on the task, generate a rubric with two dimensions:
40
+
41
+ **Content Rubric** (what the output contains):
42
+ | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
43
+ |-----------|----------|----------------|---------------|
44
+ | Correctness | Major errors | Minor errors | Fully correct |
45
+ | Completeness | Missing key elements | Mostly complete | All elements present |
46
+ | Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
47
+
48
+ **Structure Rubric** (how the output is organized):
49
+ | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
50
+ |-----------|----------|----------------|---------------|
51
+ | Organization | Disorganized | Reasonably organized | Clear, logical structure |
52
+ | Formatting | Inconsistent/broken | Mostly consistent | Professional, polished |
53
+ | Usability | Difficult to use | Usable with effort | Easy to use |
54
+
55
+ Adapt criteria to the specific task. For example:
56
+ - PDF form → "Field alignment", "Text readability", "Data placement"
57
+ - Document → "Section structure", "Heading hierarchy", "Paragraph flow"
58
+ - Data output → "Schema correctness", "Data types", "Completeness"
59
+
60
+ ### Step 4: Evaluate Each Output Against the Rubric
61
+
62
+ For each output (A and B):
63
+
64
+ 1. **Score each criterion** on the rubric (1-5 scale)
65
+ 2. **Calculate dimension scores**: Content score and Structure score, each the average of its criteria (1-5)
66
+ 3. **Calculate overall score**: Average of the two dimension scores, multiplied by 2 to scale it to a 10-point score
67
+
68
+ ### Step 5: Check Expectations (if provided)
69
+
70
+ If expectations are provided:
71
+
72
+ 1. Check each expectation against output A
73
+ 2. Check each expectation against output B
74
+ 3. Count pass rates for each output
75
+ 4. Use expectation scores as secondary evidence (not the primary decision factor)
76
+
77
+ ### Step 6: Determine the Winner
78
+
79
+ Compare A and B based on (in priority order):
80
+
81
+ 1. **Primary**: Overall rubric score (content + structure)
82
+ 2. **Secondary**: Expectation pass rates (if applicable)
83
+ 3. **Tiebreaker**: If truly equal, declare a TIE
84
+
85
+ Be decisive - ties should be rare. One output is usually better, even if marginally.
86
+
87
+ ### Step 7: Write Comparison Results
88
+
89
+ Save results to a JSON file at the path specified (or `comparison.json` if not specified).
90
+
91
+ ## Output Format
92
+
93
+ Write a JSON file with this structure:
94
+
95
+ ```json
96
+ {
97
+ "winner": "A",
98
+ "reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
99
+ "rubric": {
100
+ "A": {
101
+ "content": {
102
+ "correctness": 5,
103
+ "completeness": 5,
104
+ "accuracy": 4
105
+ },
106
+ "structure": {
107
+ "organization": 4,
108
+ "formatting": 5,
109
+ "usability": 4
110
+ },
111
+ "content_score": 4.7,
112
+ "structure_score": 4.3,
113
+ "overall_score": 9.0
114
+ },
115
+ "B": {
116
+ "content": {
117
+ "correctness": 3,
118
+ "completeness": 2,
119
+ "accuracy": 3
120
+ },
121
+ "structure": {
122
+ "organization": 3,
123
+ "formatting": 2,
124
+ "usability": 3
125
+ },
126
+ "content_score": 2.7,
127
+ "structure_score": 2.7,
128
+ "overall_score": 5.4
129
+ }
130
+ },
131
+ "output_quality": {
132
+ "A": {
133
+ "score": 9,
134
+ "strengths": ["Complete solution", "Well-formatted", "All fields present"],
135
+ "weaknesses": ["Minor style inconsistency in header"]
136
+ },
137
+ "B": {
138
+ "score": 5,
139
+ "strengths": ["Readable output", "Correct basic structure"],
140
+ "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
141
+ }
142
+ },
143
+ "expectation_results": {
144
+ "A": {
145
+ "passed": 4,
146
+ "total": 5,
147
+ "pass_rate": 0.80,
148
+ "details": [
149
+ {"text": "Output includes name", "passed": true},
150
+ {"text": "Output includes date", "passed": true},
151
+ {"text": "Format is PDF", "passed": true},
152
+ {"text": "Contains signature", "passed": false},
153
+ {"text": "Readable text", "passed": true}
154
+ ]
155
+ },
156
+ "B": {
157
+ "passed": 3,
158
+ "total": 5,
159
+ "pass_rate": 0.60,
160
+ "details": [
161
+ {"text": "Output includes name", "passed": true},
162
+ {"text": "Output includes date", "passed": false},
163
+ {"text": "Format is PDF", "passed": true},
164
+ {"text": "Contains signature", "passed": false},
165
+ {"text": "Readable text", "passed": true}
166
+ ]
167
+ }
168
+ }
169
+ }
170
+ ```
171
+
172
+ If no expectations were provided, omit the `expectation_results` field entirely.
173
+
174
+ ## Field Descriptions
175
+
176
+ - **winner**: "A", "B", or "TIE"
177
+ - **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie)
178
+ - **rubric**: Structured rubric evaluation for each output
179
+ - **content**: Scores for content criteria (correctness, completeness, accuracy)
180
+ - **structure**: Scores for structure criteria (organization, formatting, usability)
181
+ - **content_score**: Average of content criteria (1-5)
182
+ - **structure_score**: Average of structure criteria (1-5)
183
+ - **overall_score**: Combined score scaled to 1-10
184
+ - **output_quality**: Summary quality assessment
185
+ - **score**: 1-10 rating (should match rubric overall_score, rounded to a whole number)
186
+ - **strengths**: List of positive aspects
187
+ - **weaknesses**: List of issues or shortcomings
188
+ - **expectation_results**: (Only if expectations provided)
189
+ - **passed**: Number of expectations that passed
190
+ - **total**: Total number of expectations
191
+ - **pass_rate**: Fraction passed (0.0 to 1.0)
192
+ - **details**: Individual expectation results
193
+
194
+ ## Guidelines
195
+
196
+ - **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality.
197
+ - **Be specific**: Cite specific examples when explaining strengths and weaknesses.
198
+ - **Be decisive**: Choose a winner unless outputs are genuinely equivalent.
199
+ - **Output quality first**: Expectation pass rates are secondary to overall task completion.
200
+ - **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness.
201
+ - **Explain your reasoning**: The reasoning field should make it clear why you chose the winner.
202
+ - **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better.
203
+
204
+ See [references/schemas.md](../references/schemas.md) for the complete comparison.json schema definition.
@@ -0,0 +1,225 @@
1
+ # Grader Agent
2
+
3
+ Evaluate expectations against an execution transcript and outputs.
4
+
5
+ ## Role
6
+
7
+ The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment.
8
+
9
+ You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so.
10
+
11
+ ## Inputs
12
+
13
+ You receive these parameters in your prompt:
14
+
15
+ - **expectations**: List of expectations to evaluate (strings)
16
+ - **transcript_path**: Path to the execution transcript (markdown file)
17
+ - **outputs_dir**: Directory containing output files from execution
18
+
19
+ ## Process
20
+
21
+ ### Step 1: Read the Transcript
22
+
23
+ 1. Read the transcript file completely
24
+ 2. Note the eval prompt, execution steps, and final result
25
+ 3. Identify any issues or errors documented
26
+
27
+ ### Step 2: Examine Output Files
28
+
29
+ 1. List files in outputs_dir
30
+ 2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced.
31
+ 3. Note contents, structure, and quality
32
+
33
+ ### Step 3: Evaluate Each Assertion
34
+
35
+ For each expectation:
36
+
37
+ 1. **Search for evidence** in the transcript and outputs
38
+ 2. **Determine verdict**:
39
+ - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance
40
+ - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content)
41
+ 3. **Cite the evidence**: Quote the specific text or describe what you found
42
+
43
+ ### Step 4: Extract and Verify Claims
44
+
45
+ Beyond the predefined expectations, extract implicit claims from the outputs and verify them:
46
+
47
+ 1. **Extract claims** from the transcript and outputs:
48
+ - Factual statements ("The form has 12 fields")
49
+ - Process claims ("Used pypdf to fill the form")
50
+ - Quality claims ("All fields were filled correctly")
51
+
52
+ 2. **Verify each claim**:
53
+ - **Factual claims**: Can be checked against the outputs or external sources
54
+ - **Process claims**: Can be verified from the transcript
55
+ - **Quality claims**: Evaluate whether the claim is justified
56
+
57
+ 3. **Flag unverifiable claims**: Note claims that cannot be verified with available information
58
+
59
+ This catches issues that predefined expectations might miss.
60
+
61
+ ### Step 5: Read User Notes
62
+
63
+ If `{outputs_dir}/user_notes.md` exists:
64
+ 1. Read it and note any uncertainties or issues flagged by the executor
65
+ 2. Include relevant concerns in the grading output
66
+ 3. These may reveal problems even when expectations pass
67
+
68
+ ### Step 6: Critique the Evals
69
+
70
+ After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap.
71
+
72
+ Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
73
+
74
+ Suggestions worth raising:
75
+ - An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
76
+ - An important outcome you observed — good or bad — that no assertion covers at all
77
+ - An assertion that can't actually be verified from the available outputs
78
+
79
+ Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion.
80
+
81
+ ### Step 7: Write Grading Results
82
+
83
+ Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). Complete Step 8 first, because the metrics and timing data it reads are part of this file.
84
+
85
+ ## Grading Criteria
86
+
87
+ **PASS when**:
88
+ - The transcript or outputs clearly demonstrate the expectation is true
89
+ - Specific evidence can be cited
90
+ - The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
91
+
92
+ **FAIL when**:
93
+ - No evidence found for the expectation
94
+ - Evidence contradicts the expectation
95
+ - The expectation cannot be verified from available information
96
+ - The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete
97
+ - The output appears to meet the assertion by coincidence rather than by actually doing the work
98
+
99
+ **When uncertain**: Mark the expectation as FAIL — the burden of proof is on passing.
100
+
101
+ ### Step 8: Read Executor Metrics and Timing
102
+
103
+ 1. If `{outputs_dir}/metrics.json` exists, read it and include its contents in the grading output as `execution_metrics`
104
+ 2. If `{outputs_dir}/../timing.json` exists, read it and include its timing data in the grading output as `timing`
105
+
106
+ ## Output Format
107
+
108
+ Write a JSON file with this structure:
109
+
110
+ ```json
111
+ {
112
+ "expectations": [
113
+ {
114
+ "text": "The output includes the name 'John Smith'",
115
+ "passed": true,
116
+ "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
117
+ },
118
+ {
119
+ "text": "The spreadsheet has a SUM formula in cell B10",
120
+ "passed": false,
121
+ "evidence": "No spreadsheet was created. The output was a text file."
122
+ },
123
+ {
124
+ "text": "The assistant used the skill's OCR script",
125
+ "passed": true,
126
+ "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'"
127
+ }
128
+ ],
129
+ "summary": {
130
+ "passed": 2,
131
+ "failed": 1,
132
+ "total": 3,
133
+ "pass_rate": 0.67
134
+ },
135
+ "execution_metrics": {
136
+ "tool_calls": {
137
+ "Read": 5,
138
+ "Write": 2,
139
+ "Bash": 8
140
+ },
141
+ "total_tool_calls": 15,
142
+ "total_steps": 6,
143
+ "errors_encountered": 0,
144
+ "output_chars": 12450,
145
+ "transcript_chars": 3200
146
+ },
147
+ "timing": {
148
+ "executor_duration_seconds": 165.0,
149
+ "grader_duration_seconds": 26.0,
150
+ "total_duration_seconds": 191.0
151
+ },
152
+ "claims": [
153
+ {
154
+ "claim": "The form has 12 fillable fields",
155
+ "type": "factual",
156
+ "verified": true,
157
+ "evidence": "Counted 12 fields in field_info.json"
158
+ },
159
+ {
160
+ "claim": "All required fields were populated",
161
+ "type": "quality",
162
+ "verified": false,
163
+ "evidence": "Reference section was left blank despite data being available"
164
+ }
165
+ ],
166
+ "user_notes_summary": {
167
+ "uncertainties": ["Used 2023 data, may be stale"],
168
+ "needs_review": [],
169
+ "workarounds": ["Fell back to text overlay for non-fillable fields"]
170
+ },
171
+ "eval_feedback": {
172
+ "suggestions": [
173
+ {
174
+ "assertion": "The output includes the name 'John Smith'",
175
+ "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input"
176
+ },
177
+ {
178
+ "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught"
179
+ }
180
+ ],
181
+ "overall": "Assertions check presence but not correctness. Consider adding content verification."
182
+ }
183
+ }
184
+ ```
185
+
186
+ ## Field Descriptions
187
+
188
+ - **expectations**: Array of graded expectations
189
+ - **text**: The original expectation text
190
+ - **passed**: Boolean - true if expectation passes
191
+ - **evidence**: Specific quote or description supporting the verdict
192
+ - **summary**: Aggregate statistics
193
+ - **passed**: Count of passed expectations
194
+ - **failed**: Count of failed expectations
195
+ - **total**: Total expectations evaluated
196
+ - **pass_rate**: Fraction passed (0.0 to 1.0)
197
+ - **execution_metrics**: Copied from executor's metrics.json (if available)
198
+ - **output_chars**: Total character count of output files (proxy for tokens)
199
+ - **transcript_chars**: Character count of transcript
200
+ - **timing**: Wall clock timing from timing.json (if available)
201
+ - **executor_duration_seconds**: Time spent in executor subagent
202
+ - **total_duration_seconds**: Total elapsed time for the run
203
+ - **claims**: Extracted and verified claims from the output
204
+ - **claim**: The statement being verified
205
+ - **type**: "factual", "process", or "quality"
206
+ - **verified**: Boolean - whether the claim holds
207
+ - **evidence**: Supporting or contradicting evidence
208
+ - **user_notes_summary**: Issues flagged by the executor
209
+ - **uncertainties**: Things the executor wasn't sure about
210
+ - **needs_review**: Items requiring human attention
211
+ - **workarounds**: Places where the skill didn't work as expected
212
+ - **eval_feedback**: Improvement suggestions for the evals (only when warranted)
213
+ - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to
214
+ - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag
215
+
216
+ ## Guidelines
217
+
218
+ - **Be objective**: Base verdicts on evidence, not assumptions
219
+ - **Be specific**: Quote the exact text that supports your verdict
220
+ - **Be thorough**: Check both transcript and output files
221
+ - **Be consistent**: Apply the same standard to each expectation
222
+ - **Explain failures**: Make it clear why evidence was insufficient
223
+ - **No partial credit**: Each expectation is pass or fail, not partial
224
+
225
+ See [references/schemas.md](../references/schemas.md) for the complete grading.json schema definition.
@@ -0,0 +1,146 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Eval Set Review - __SKILL_NAME_PLACEHOLDER__</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
+ <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
10
+ <style>
11
+ * { box-sizing: border-box; margin: 0; padding: 0; }
12
+ body { font-family: 'Lora', Georgia, serif; background: #faf9f5; padding: 2rem; color: #141413; }
13
+ h1 { font-family: 'Poppins', sans-serif; margin-bottom: 0.5rem; font-size: 1.5rem; }
14
+ .description { color: #b0aea5; margin-bottom: 1.5rem; font-style: italic; max-width: 900px; }
15
+ .controls { margin-bottom: 1rem; display: flex; gap: 0.5rem; }
16
+ .btn { font-family: 'Poppins', sans-serif; padding: 0.5rem 1rem; border: none; border-radius: 6px; cursor: pointer; font-size: 0.875rem; font-weight: 500; }
17
+ .btn-add { background: #6a9bcc; color: white; }
18
+ .btn-add:hover { background: #5889b8; }
19
+ .btn-export { background: #d97757; color: white; }
20
+ .btn-export:hover { background: #c4613f; }
21
+ table { width: 100%; max-width: 1100px; border-collapse: collapse; background: white; border-radius: 6px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }
22
+ th { font-family: 'Poppins', sans-serif; background: #141413; color: #faf9f5; padding: 0.75rem 1rem; text-align: left; font-size: 0.875rem; }
23
+ td { padding: 0.75rem 1rem; border-bottom: 1px solid #e8e6dc; vertical-align: top; }
24
+ tr:nth-child(even) td { background: #faf9f5; }
25
+ tr:hover td { background: #f3f1ea; }
26
+ .section-header td { background: #e8e6dc; font-family: 'Poppins', sans-serif; font-weight: 500; font-size: 0.8rem; color: #141413; text-transform: uppercase; letter-spacing: 0.05em; }
27
+ .query-input { width: 100%; padding: 0.4rem; border: 1px solid #e8e6dc; border-radius: 4px; font-size: 0.875rem; font-family: 'Lora', Georgia, serif; resize: vertical; min-height: 60px; }
28
+ .query-input:focus { outline: none; border-color: #d97757; box-shadow: 0 0 0 2px rgba(217,119,87,0.15); }
29
+ .toggle { position: relative; display: inline-block; width: 44px; height: 24px; }
30
+ .toggle input { opacity: 0; width: 0; height: 0; }
31
+ .toggle .slider { position: absolute; inset: 0; background: #b0aea5; border-radius: 24px; cursor: pointer; transition: 0.2s; }
32
+ .toggle .slider::before { content: ""; position: absolute; width: 18px; height: 18px; left: 3px; bottom: 3px; background: white; border-radius: 50%; transition: 0.2s; }
33
+ .toggle input:checked + .slider { background: #d97757; }
34
+ .toggle input:checked + .slider::before { transform: translateX(20px); }
35
+ .btn-delete { background: #c44; color: white; padding: 0.3rem 0.6rem; border: none; border-radius: 4px; cursor: pointer; font-size: 0.75rem; font-family: 'Poppins', sans-serif; }
36
+ .btn-delete:hover { background: #a33; }
37
+ .summary { margin-top: 1rem; color: #b0aea5; font-size: 0.875rem; }
38
+ </style>
39
+ </head>
40
+ <body>
41
+ <h1>Eval Set Review: <span id="skill-name">__SKILL_NAME_PLACEHOLDER__</span></h1>
42
+ <p class="description">Current description: <span id="skill-desc">__SKILL_DESCRIPTION_PLACEHOLDER__</span></p>
43
+
44
+ <div class="controls">
45
+ <button class="btn btn-add" onclick="addRow()">+ Add Query</button>
46
+ <button class="btn btn-export" onclick="exportEvalSet()">Export Eval Set</button>
47
+ </div>
48
+
49
+ <table>
50
+ <thead>
51
+ <tr>
52
+ <th style="width:65%">Query</th>
53
+ <th style="width:18%">Should Trigger</th>
54
+ <th style="width:10%">Actions</th>
55
+ </tr>
56
+ </thead>
57
+ <tbody id="eval-body"></tbody>
58
+ </table>
59
+
60
+ <p class="summary" id="summary"></p>
61
+
62
+ <script>
63
+ const EVAL_DATA = __EVAL_DATA_PLACEHOLDER__;
64
+
65
+ let evalItems = [...EVAL_DATA];
66
+
67
/**
 * Rebuild the eval table body from `evalItems`.
 *
 * Rows are grouped under a section header: items whose `should_trigger` is
 * truthy come first, the rest follow. Each row's inline handlers receive the
 * item's index in `evalItems` (not its position in the sorted view), so edits
 * and deletes always address the right entry. Ends by refreshing the summary.
 */
function render() {
  const tbody = document.getElementById('eval-body');
  tbody.innerHTML = '';

  // Remember each item's index in evalItems before sorting; the comparator
  // only separates the two groups.
  const ordered = evalItems
    .map((item, origIdx) => ({ ...item, origIdx }))
    .sort((a, b) => (b.should_trigger ? 1 : 0) - (a.should_trigger ? 1 : 0));

  let previousGroup = null;
  for (const entry of ordered) {
    const isTrigger = Boolean(entry.should_trigger);
    const currentGroup = isTrigger ? 'trigger' : 'no-trigger';

    // Emit a section header the first time each group is encountered.
    if (currentGroup !== previousGroup) {
      const headerRow = document.createElement('tr');
      headerRow.className = 'section-header';
      headerRow.innerHTML = `<td colspan="3">${isTrigger ? 'Should Trigger' : 'Should NOT Trigger'}</td>`;
      tbody.appendChild(headerRow);
      previousGroup = currentGroup;
    }

    const idx = entry.origIdx;
    const checkedAttr = isTrigger ? 'checked' : '';
    const triggerLabel = isTrigger ? 'Yes' : 'No';

    const tr = document.createElement('tr');
    tr.innerHTML = `
      <td><textarea class="query-input" onchange="updateQuery(${idx}, this.value)">${escapeHtml(entry.query)}</textarea></td>
      <td>
        <label class="toggle">
          <input type="checkbox" ${checkedAttr} onchange="updateTrigger(${idx}, this.checked)">
          <span class="slider"></span>
        </label>
        <span style="margin-left:8px;font-size:0.8rem;color:#b0aea5">${triggerLabel}</span>
      </td>
      <td><button class="btn-delete" onclick="deleteRow(${idx})">Delete</button></td>
    `;
    tbody.appendChild(tr);
  }

  updateSummary();
}
104
+
105
/**
 * Escape a value for safe interpolation into HTML markup.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with character references, so the
 * result is safe in element content and inside quoted attribute values.
 * Implemented with plain string replacement, so no DOM node is created per
 * call and the function has no dependency on `document`.
 *
 * @param {*} text - Value to escape; non-strings are converted with String().
 * @returns {string} Escaped text; `null` and `undefined` yield ''.
 */
function escapeHtml(text) {
  if (text === null || text === undefined) {
    return '';
  }
  const replacements = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(text).replace(/[&<>"']/g, (ch) => replacements[ch]);
}
110
+
111
/**
 * Store edited query text on the item at `idx` and refresh the summary line.
 * The table is not re-rendered, so the textarea being edited stays in place.
 *
 * @param {number} idx - Index of the item in `evalItems`.
 * @param {string} value - New query text from the textarea.
 */
function updateQuery(idx, value) {
  const target = evalItems[idx];
  target.query = value;
  updateSummary();
}
112
/**
 * Set the `should_trigger` flag on the item at `idx`, then re-render the
 * table so the row moves into the matching section.
 *
 * @param {number} idx - Index of the item in `evalItems`.
 * @param {boolean} value - New checkbox state.
 */
function updateTrigger(idx, value) {
  const target = evalItems[idx];
  target.should_trigger = value;
  render();
}
113
/**
 * Remove the item at `idx` from `evalItems` in place and re-render the table.
 *
 * @param {number} idx - Index of the item in `evalItems`.
 */
function deleteRow(idx) {
  const removeCount = 1;
  evalItems.splice(idx, removeCount);
  render();
}
114
+
115
/**
 * Append a blank should-trigger query, re-render, and focus its textarea.
 *
 * render() lists should-trigger rows before should-not-trigger rows and keeps
 * items of the same group in `evalItems` order. The new item is the last
 * entry of `evalItems` and is a should-trigger item, so its textarea is the
 * last one of the trigger group, not necessarily the last textarea in the
 * document. Its DOM position is therefore (number of should-trigger items - 1).
 */
function addRow() {
  evalItems.push({ query: '', should_trigger: true });
  render();

  const triggerCount = evalItems.filter((item) => item.should_trigger).length;
  const inputs = document.querySelectorAll('.query-input');
  const newInput = inputs[triggerCount - 1];
  // Guard in case the rendered rows do not match evalItems for any reason.
  if (newInput) {
    newInput.focus();
  }
}
121
+
122
/**
 * Write the totals line below the table: overall item count plus how many
 * items are marked should-trigger versus should-not-trigger.
 */
function updateSummary() {
  let trigger = 0;
  let noTrigger = 0;
  for (const item of evalItems) {
    if (item.should_trigger) {
      trigger += 1;
    } else {
      noTrigger += 1;
    }
  }

  const summaryEl = document.getElementById('summary');
  summaryEl.textContent = `${evalItems.length} queries total: ${trigger} should trigger, ${noTrigger} should not trigger`;
}
128
+
129
/**
 * Download the current eval set as `eval_set.json`.
 *
 * Each exported entry is `{ query, should_trigger }` with the query trimmed.
 * Items whose query is blank after trimming are skipped. Items whose query is
 * missing or not a string are skipped as well, instead of throwing on
 * `.trim()` and aborting the whole export.
 *
 * The download is triggered through a temporary anchor element pointing at a
 * Blob URL; the anchor is removed and the URL revoked afterwards.
 */
function exportEvalSet() {
  const data = evalItems
    .map((item) => ({
      query: typeof item.query === 'string' ? item.query.trim() : '',
      should_trigger: item.should_trigger,
    }))
    .filter((entry) => entry.query !== '');

  const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
  const url = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.href = url;
  a.download = 'eval_set.json';
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
142
+
143
+ render();
144
+ </script>
145
+ </body>
146
+ </html>