@biggora/claude-plugins 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/package.json +1 -1
- package/registry/registry.json +319 -244
- package/specs/coding.md +24 -0
- package/specs/pod.md +2 -0
- package/src/skills/captcha/README.md +221 -0
- package/src/skills/captcha/SKILL.md +355 -0
- package/src/skills/captcha/references/captcha-types.md +254 -0
- package/src/skills/captcha/references/services.md +172 -0
- package/src/skills/captcha/references/stealth.md +238 -0
- package/src/skills/captcha/scripts/solve_captcha.py +323 -0
- package/src/skills/captcha/scripts/solve_image_grid.py +350 -0
- package/src/skills/google-merchant-api/SKILL.md +581 -0
- package/src/skills/google-merchant-api/references/accounts.md +247 -0
- package/src/skills/google-merchant-api/references/content-api-legacy.md +216 -0
- package/src/skills/google-merchant-api/references/datasources.md +233 -0
- package/src/skills/google-merchant-api/references/inventories.md +201 -0
- package/src/skills/google-merchant-api/references/migration.md +267 -0
- package/src/skills/google-merchant-api/references/products.md +316 -0
- package/src/skills/google-merchant-api/references/promotions.md +201 -0
- package/src/skills/google-merchant-api/references/reports.md +240 -0
- package/src/skills/lv-aggregators-api/SKILL.md +113 -0
- package/src/skills/lv-aggregators-api/references/integration-guide.md +368 -0
- package/src/skills/lv-aggregators-api/references/kurpirkt.md +103 -0
- package/src/skills/lv-aggregators-api/references/salidzini.md +122 -0
- package/src/skills/tailwindcss-best-practices/SKILL.md +180 -0
- package/src/skills/tailwindcss-best-practices/references/best-practices-utility-patterns.md +87 -0
- package/src/skills/tailwindcss-best-practices/references/core-installation.md +109 -0
- package/src/skills/tailwindcss-best-practices/references/core-preflight.md +200 -0
- package/src/skills/tailwindcss-best-practices/references/core-responsive.md +163 -0
- package/src/skills/tailwindcss-best-practices/references/core-source-detection.md +114 -0
- package/src/skills/tailwindcss-best-practices/references/core-theme.md +108 -0
- package/src/skills/tailwindcss-best-practices/references/core-utility-classes.md +59 -0
- package/src/skills/tailwindcss-best-practices/references/core-variants.md +204 -0
- package/src/skills/tailwindcss-best-practices/references/effects-form-controls.md +76 -0
- package/src/skills/tailwindcss-best-practices/references/effects-mask.md +91 -0
- package/src/skills/tailwindcss-best-practices/references/effects-scroll-snap.md +59 -0
- package/src/skills/tailwindcss-best-practices/references/effects-text-shadow.md +78 -0
- package/src/skills/tailwindcss-best-practices/references/effects-transition-animation.md +80 -0
- package/src/skills/tailwindcss-best-practices/references/effects-visibility-interactivity.md +82 -0
- package/src/skills/tailwindcss-best-practices/references/features-content-detection.md +175 -0
- package/src/skills/tailwindcss-best-practices/references/features-custom-styles.md +203 -0
- package/src/skills/tailwindcss-best-practices/references/features-dark-mode.md +137 -0
- package/src/skills/tailwindcss-best-practices/references/features-functions-directives.md +241 -0
- package/src/skills/tailwindcss-best-practices/references/features-upgrade.md +160 -0
- package/src/skills/tailwindcss-best-practices/references/layout-aspect-ratio.md +39 -0
- package/src/skills/tailwindcss-best-practices/references/layout-columns.md +80 -0
- package/src/skills/tailwindcss-best-practices/references/layout-display.md +110 -0
- package/src/skills/tailwindcss-best-practices/references/layout-flexbox.md +112 -0
- package/src/skills/tailwindcss-best-practices/references/layout-grid.md +87 -0
- package/src/skills/tailwindcss-best-practices/references/layout-height.md +97 -0
- package/src/skills/tailwindcss-best-practices/references/layout-inset.md +103 -0
- package/src/skills/tailwindcss-best-practices/references/layout-logical-properties.md +92 -0
- package/src/skills/tailwindcss-best-practices/references/layout-margin.md +126 -0
- package/src/skills/tailwindcss-best-practices/references/layout-min-max-sizing.md +63 -0
- package/src/skills/tailwindcss-best-practices/references/layout-object-fit-position.md +64 -0
- package/src/skills/tailwindcss-best-practices/references/layout-overflow.md +57 -0
- package/src/skills/tailwindcss-best-practices/references/layout-padding.md +77 -0
- package/src/skills/tailwindcss-best-practices/references/layout-position.md +85 -0
- package/src/skills/tailwindcss-best-practices/references/layout-tables.md +67 -0
- package/src/skills/tailwindcss-best-practices/references/layout-width.md +102 -0
- package/src/skills/tailwindcss-best-practices/references/transform-base.md +68 -0
- package/src/skills/tailwindcss-best-practices/references/transform-rotate.md +70 -0
- package/src/skills/tailwindcss-best-practices/references/transform-scale.md +83 -0
- package/src/skills/tailwindcss-best-practices/references/transform-skew.md +62 -0
- package/src/skills/tailwindcss-best-practices/references/transform-translate.md +77 -0
- package/src/skills/tailwindcss-best-practices/references/typography-font-text.md +142 -0
- package/src/skills/tailwindcss-best-practices/references/typography-list-style.md +65 -0
- package/src/skills/tailwindcss-best-practices/references/typography-text-align.md +60 -0
- package/src/skills/tailwindcss-best-practices/references/visual-background.md +76 -0
- package/src/skills/tailwindcss-best-practices/references/visual-border.md +108 -0
- package/src/skills/tailwindcss-best-practices/references/visual-effects.md +111 -0
- package/src/skills/tailwindcss-best-practices/references/visual-svg.md +82 -0
- package/src/skills/test-mobile-app/SKILL.md +11 -6
- package/src/skills/test-mobile-app/scripts/analyze_apk.py +15 -4
- package/src/skills/test-mobile-app/scripts/check_environment.py +5 -5
- package/src/skills/test-mobile-app/scripts/run_tests.py +1 -1
- package/src/skills/test-web-ui/SKILL.md +264 -84
- package/src/skills/test-web-ui/scripts/discover.py +25 -12
- package/src/skills/test-web-ui/scripts/run_tests.py +3 -2
- package/src/skills/vite-best-practices/SKILL.md +115 -0
- package/src/skills/vite-best-practices/references/build-and-ssr.md +255 -0
- package/src/skills/vite-best-practices/references/core-config.md +231 -0
- package/src/skills/vite-best-practices/references/core-features.md +222 -0
- package/src/skills/vite-best-practices/references/core-plugin-api.md +294 -0
- package/src/skills/vite-best-practices/references/environment-api.md +108 -0
- package/src/skills/vite-best-practices/references/rolldown-migration.md +242 -0
- package/codex-cli-workspace/iteration-1/benchmark.json +0 -122
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/eval_metadata.json +0 -13
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/grading.json +0 -52
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/outputs/response.md +0 -163
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/grading.json +0 -58
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/outputs/response.md +0 -151
- package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/eval_metadata.json +0 -13
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/grading.json +0 -52
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -86
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/grading.json +0 -58
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -164
- package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/eval_metadata.json +0 -13
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/grading.json +0 -52
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/outputs/response.md +0 -130
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/grading.json +0 -64
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/outputs/response.md +0 -209
- package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/timing.json +0 -5
- package/codex-cli-workspace/iteration-1/review.html +0 -1325
- package/gemini-cli-workspace/iteration-1/benchmark.json +0 -86
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/eval_metadata.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/outputs/response.md +0 -401
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/outputs/response.md +0 -405
- package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/eval_metadata.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -212
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -427
- package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/eval_metadata.json +0 -32
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/grading.json +0 -32
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/outputs/response.md +0 -171
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/grading.json +0 -32
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/outputs/response.md +0 -199
- package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-1/review.html +0 -1325
- package/gemini-cli-workspace/iteration-2/benchmark.json +0 -173
- package/gemini-cli-workspace/iteration-2/benchmark.md +0 -28
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/eval_metadata.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/outputs/response.md +0 -195
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/outputs/response.md +0 -377
- package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/eval_metadata.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -127
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/grading.json +0 -37
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -164
- package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/eval_metadata.json +0 -32
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/grading.json +0 -32
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/outputs/response.md +0 -91
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/grading.json +0 -32
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/outputs/response.md +0 -112
- package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/timing.json +0 -5
- package/gemini-cli-workspace/iteration-2/eval-viewer.html +0 -1325
- package/screen-recording-workspace/evals.json +0 -41
- package/screen-recording-workspace/iteration-1/benchmark.json +0 -102
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/eval_metadata.json +0 -31
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/outputs/demo.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/outputs/demo.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/eval_metadata.json +0 -31
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/outputs/region_capture.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/outputs/region_capture.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/eval_metadata.json +0 -31
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/outputs/fallback_recording.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/grading.json +0 -11
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/outputs/fallback_recording.mp4 +0 -0
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/outputs/record_screen.py +0 -67
- package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/timing.json +0 -5
- package/screen-recording-workspace/iteration-1/review.html +0 -1325
- package/src/skills/codex-cli/evals/evals.json +0 -47
- package/src/skills/gemini-cli/evals/evals.json +0 -46
- package/src/skills/tm-search/evals/evals.json +0 -23
|
@@ -1,1325 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="en">
|
|
3
|
-
<head>
|
|
4
|
-
<meta charset="UTF-8">
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
-
<title>Eval Review</title>
|
|
7
|
-
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
8
|
-
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
9
|
-
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
|
|
10
|
-
<script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>
|
|
11
|
-
<style>
|
|
12
|
-
:root {
|
|
13
|
-
--bg: #faf9f5;
|
|
14
|
-
--surface: #ffffff;
|
|
15
|
-
--border: #e8e6dc;
|
|
16
|
-
--text: #141413;
|
|
17
|
-
--text-muted: #b0aea5;
|
|
18
|
-
--accent: #d97757;
|
|
19
|
-
--accent-hover: #c4613f;
|
|
20
|
-
--green: #788c5d;
|
|
21
|
-
--green-bg: #eef2e8;
|
|
22
|
-
--red: #c44;
|
|
23
|
-
--red-bg: #fceaea;
|
|
24
|
-
--header-bg: #141413;
|
|
25
|
-
--header-text: #faf9f5;
|
|
26
|
-
--radius: 6px;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
30
|
-
|
|
31
|
-
body {
|
|
32
|
-
font-family: 'Lora', Georgia, serif;
|
|
33
|
-
background: var(--bg);
|
|
34
|
-
color: var(--text);
|
|
35
|
-
height: 100vh;
|
|
36
|
-
display: flex;
|
|
37
|
-
flex-direction: column;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
/* ---- Header ---- */
|
|
41
|
-
.header {
|
|
42
|
-
background: var(--header-bg);
|
|
43
|
-
color: var(--header-text);
|
|
44
|
-
padding: 1rem 2rem;
|
|
45
|
-
display: flex;
|
|
46
|
-
justify-content: space-between;
|
|
47
|
-
align-items: center;
|
|
48
|
-
flex-shrink: 0;
|
|
49
|
-
}
|
|
50
|
-
.header h1 {
|
|
51
|
-
font-family: 'Poppins', sans-serif;
|
|
52
|
-
font-size: 1.25rem;
|
|
53
|
-
font-weight: 600;
|
|
54
|
-
}
|
|
55
|
-
.header .instructions {
|
|
56
|
-
font-size: 0.8rem;
|
|
57
|
-
opacity: 0.7;
|
|
58
|
-
margin-top: 0.25rem;
|
|
59
|
-
}
|
|
60
|
-
.header .progress {
|
|
61
|
-
font-size: 0.875rem;
|
|
62
|
-
opacity: 0.8;
|
|
63
|
-
text-align: right;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/* ---- Main content ---- */
|
|
67
|
-
.main {
|
|
68
|
-
flex: 1;
|
|
69
|
-
overflow-y: auto;
|
|
70
|
-
padding: 1.5rem 2rem;
|
|
71
|
-
display: flex;
|
|
72
|
-
flex-direction: column;
|
|
73
|
-
gap: 1.25rem;
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
/* ---- Sections ---- */
|
|
77
|
-
.section {
|
|
78
|
-
background: var(--surface);
|
|
79
|
-
border: 1px solid var(--border);
|
|
80
|
-
border-radius: var(--radius);
|
|
81
|
-
flex-shrink: 0;
|
|
82
|
-
}
|
|
83
|
-
.section-header {
|
|
84
|
-
font-family: 'Poppins', sans-serif;
|
|
85
|
-
padding: 0.75rem 1rem;
|
|
86
|
-
font-size: 0.75rem;
|
|
87
|
-
font-weight: 500;
|
|
88
|
-
text-transform: uppercase;
|
|
89
|
-
letter-spacing: 0.05em;
|
|
90
|
-
color: var(--text-muted);
|
|
91
|
-
border-bottom: 1px solid var(--border);
|
|
92
|
-
background: var(--bg);
|
|
93
|
-
}
|
|
94
|
-
.section-body {
|
|
95
|
-
padding: 1rem;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/* ---- Config badge ---- */
|
|
99
|
-
.config-badge {
|
|
100
|
-
display: inline-block;
|
|
101
|
-
padding: 0.2rem 0.625rem;
|
|
102
|
-
border-radius: 9999px;
|
|
103
|
-
font-family: 'Poppins', sans-serif;
|
|
104
|
-
font-size: 0.6875rem;
|
|
105
|
-
font-weight: 600;
|
|
106
|
-
text-transform: uppercase;
|
|
107
|
-
letter-spacing: 0.03em;
|
|
108
|
-
margin-left: 0.75rem;
|
|
109
|
-
vertical-align: middle;
|
|
110
|
-
}
|
|
111
|
-
.config-badge.config-primary {
|
|
112
|
-
background: rgba(33, 150, 243, 0.12);
|
|
113
|
-
color: #1976d2;
|
|
114
|
-
}
|
|
115
|
-
.config-badge.config-baseline {
|
|
116
|
-
background: rgba(255, 193, 7, 0.15);
|
|
117
|
-
color: #f57f17;
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
/* ---- Prompt ---- */
|
|
121
|
-
.prompt-text {
|
|
122
|
-
white-space: pre-wrap;
|
|
123
|
-
font-size: 0.9375rem;
|
|
124
|
-
line-height: 1.6;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
/* ---- Outputs ---- */
|
|
128
|
-
.output-file {
|
|
129
|
-
border: 1px solid var(--border);
|
|
130
|
-
border-radius: var(--radius);
|
|
131
|
-
overflow: hidden;
|
|
132
|
-
}
|
|
133
|
-
.output-file + .output-file {
|
|
134
|
-
margin-top: 1rem;
|
|
135
|
-
}
|
|
136
|
-
.output-file-header {
|
|
137
|
-
padding: 0.5rem 0.75rem;
|
|
138
|
-
font-size: 0.8rem;
|
|
139
|
-
font-weight: 600;
|
|
140
|
-
color: var(--text-muted);
|
|
141
|
-
background: var(--bg);
|
|
142
|
-
border-bottom: 1px solid var(--border);
|
|
143
|
-
font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
|
|
144
|
-
display: flex;
|
|
145
|
-
justify-content: space-between;
|
|
146
|
-
align-items: center;
|
|
147
|
-
}
|
|
148
|
-
.output-file-header .dl-btn {
|
|
149
|
-
font-size: 0.7rem;
|
|
150
|
-
color: var(--accent);
|
|
151
|
-
text-decoration: none;
|
|
152
|
-
cursor: pointer;
|
|
153
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
|
154
|
-
font-weight: 500;
|
|
155
|
-
opacity: 0.8;
|
|
156
|
-
}
|
|
157
|
-
.output-file-header .dl-btn:hover {
|
|
158
|
-
opacity: 1;
|
|
159
|
-
text-decoration: underline;
|
|
160
|
-
}
|
|
161
|
-
.output-file-content {
|
|
162
|
-
padding: 0.75rem;
|
|
163
|
-
overflow-x: auto;
|
|
164
|
-
}
|
|
165
|
-
.output-file-content pre {
|
|
166
|
-
font-size: 0.8125rem;
|
|
167
|
-
line-height: 1.5;
|
|
168
|
-
white-space: pre-wrap;
|
|
169
|
-
word-break: break-word;
|
|
170
|
-
font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
|
|
171
|
-
}
|
|
172
|
-
.output-file-content img {
|
|
173
|
-
max-width: 100%;
|
|
174
|
-
height: auto;
|
|
175
|
-
border-radius: 4px;
|
|
176
|
-
}
|
|
177
|
-
.output-file-content iframe {
|
|
178
|
-
width: 100%;
|
|
179
|
-
height: 600px;
|
|
180
|
-
border: none;
|
|
181
|
-
}
|
|
182
|
-
.output-file-content table {
|
|
183
|
-
border-collapse: collapse;
|
|
184
|
-
font-size: 0.8125rem;
|
|
185
|
-
width: 100%;
|
|
186
|
-
}
|
|
187
|
-
.output-file-content table td,
|
|
188
|
-
.output-file-content table th {
|
|
189
|
-
border: 1px solid var(--border);
|
|
190
|
-
padding: 0.375rem 0.5rem;
|
|
191
|
-
text-align: left;
|
|
192
|
-
}
|
|
193
|
-
.output-file-content table th {
|
|
194
|
-
background: var(--bg);
|
|
195
|
-
font-weight: 600;
|
|
196
|
-
}
|
|
197
|
-
.output-file-content .download-link {
|
|
198
|
-
display: inline-flex;
|
|
199
|
-
align-items: center;
|
|
200
|
-
gap: 0.5rem;
|
|
201
|
-
padding: 0.5rem 1rem;
|
|
202
|
-
background: var(--bg);
|
|
203
|
-
border: 1px solid var(--border);
|
|
204
|
-
border-radius: 4px;
|
|
205
|
-
color: var(--accent);
|
|
206
|
-
text-decoration: none;
|
|
207
|
-
font-size: 0.875rem;
|
|
208
|
-
cursor: pointer;
|
|
209
|
-
}
|
|
210
|
-
.output-file-content .download-link:hover {
|
|
211
|
-
background: var(--border);
|
|
212
|
-
}
|
|
213
|
-
.empty-state {
|
|
214
|
-
color: var(--text-muted);
|
|
215
|
-
font-style: italic;
|
|
216
|
-
padding: 2rem;
|
|
217
|
-
text-align: center;
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
/* ---- Feedback ---- */
|
|
221
|
-
.prev-feedback {
|
|
222
|
-
background: var(--bg);
|
|
223
|
-
border: 1px solid var(--border);
|
|
224
|
-
border-radius: 4px;
|
|
225
|
-
padding: 0.625rem 0.75rem;
|
|
226
|
-
margin-top: 0.75rem;
|
|
227
|
-
font-size: 0.8125rem;
|
|
228
|
-
color: var(--text-muted);
|
|
229
|
-
line-height: 1.5;
|
|
230
|
-
}
|
|
231
|
-
.prev-feedback-label {
|
|
232
|
-
font-size: 0.7rem;
|
|
233
|
-
font-weight: 600;
|
|
234
|
-
text-transform: uppercase;
|
|
235
|
-
letter-spacing: 0.04em;
|
|
236
|
-
margin-bottom: 0.25rem;
|
|
237
|
-
color: var(--text-muted);
|
|
238
|
-
}
|
|
239
|
-
.feedback-textarea {
|
|
240
|
-
width: 100%;
|
|
241
|
-
min-height: 100px;
|
|
242
|
-
padding: 0.75rem;
|
|
243
|
-
border: 1px solid var(--border);
|
|
244
|
-
border-radius: 4px;
|
|
245
|
-
font-family: inherit;
|
|
246
|
-
font-size: 0.9375rem;
|
|
247
|
-
line-height: 1.5;
|
|
248
|
-
resize: vertical;
|
|
249
|
-
color: var(--text);
|
|
250
|
-
}
|
|
251
|
-
.feedback-textarea:focus {
|
|
252
|
-
outline: none;
|
|
253
|
-
border-color: var(--accent);
|
|
254
|
-
box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
|
|
255
|
-
}
|
|
256
|
-
.feedback-status {
|
|
257
|
-
font-size: 0.75rem;
|
|
258
|
-
color: var(--text-muted);
|
|
259
|
-
margin-top: 0.5rem;
|
|
260
|
-
min-height: 1.1em;
|
|
261
|
-
}
|
|
262
|
-
|
|
263
|
-
/* ---- Grades (collapsible) ---- */
|
|
264
|
-
.grades-toggle {
|
|
265
|
-
display: flex;
|
|
266
|
-
align-items: center;
|
|
267
|
-
cursor: pointer;
|
|
268
|
-
user-select: none;
|
|
269
|
-
}
|
|
270
|
-
.grades-toggle:hover {
|
|
271
|
-
color: var(--accent);
|
|
272
|
-
}
|
|
273
|
-
.grades-toggle .arrow {
|
|
274
|
-
margin-right: 0.5rem;
|
|
275
|
-
transition: transform 0.15s;
|
|
276
|
-
font-size: 0.75rem;
|
|
277
|
-
}
|
|
278
|
-
.grades-toggle .arrow.open {
|
|
279
|
-
transform: rotate(90deg);
|
|
280
|
-
}
|
|
281
|
-
.grades-content {
|
|
282
|
-
display: none;
|
|
283
|
-
margin-top: 0.75rem;
|
|
284
|
-
}
|
|
285
|
-
.grades-content.open {
|
|
286
|
-
display: block;
|
|
287
|
-
}
|
|
288
|
-
.grades-summary {
|
|
289
|
-
font-size: 0.875rem;
|
|
290
|
-
margin-bottom: 0.75rem;
|
|
291
|
-
display: flex;
|
|
292
|
-
align-items: center;
|
|
293
|
-
gap: 0.5rem;
|
|
294
|
-
}
|
|
295
|
-
.grade-badge {
|
|
296
|
-
display: inline-block;
|
|
297
|
-
padding: 0.125rem 0.5rem;
|
|
298
|
-
border-radius: 9999px;
|
|
299
|
-
font-size: 0.75rem;
|
|
300
|
-
font-weight: 600;
|
|
301
|
-
}
|
|
302
|
-
.grade-pass { background: var(--green-bg); color: var(--green); }
|
|
303
|
-
.grade-fail { background: var(--red-bg); color: var(--red); }
|
|
304
|
-
.assertion-list {
|
|
305
|
-
list-style: none;
|
|
306
|
-
}
|
|
307
|
-
.assertion-item {
|
|
308
|
-
padding: 0.625rem 0;
|
|
309
|
-
border-bottom: 1px solid var(--border);
|
|
310
|
-
font-size: 0.8125rem;
|
|
311
|
-
}
|
|
312
|
-
.assertion-item:last-child { border-bottom: none; }
|
|
313
|
-
.assertion-status {
|
|
314
|
-
font-weight: 600;
|
|
315
|
-
margin-right: 0.5rem;
|
|
316
|
-
}
|
|
317
|
-
.assertion-status.pass { color: var(--green); }
|
|
318
|
-
.assertion-status.fail { color: var(--red); }
|
|
319
|
-
.assertion-evidence {
|
|
320
|
-
color: var(--text-muted);
|
|
321
|
-
font-size: 0.75rem;
|
|
322
|
-
margin-top: 0.25rem;
|
|
323
|
-
padding-left: 1.5rem;
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
/* ---- View tabs ---- */
|
|
327
|
-
.view-tabs {
|
|
328
|
-
display: flex;
|
|
329
|
-
gap: 0;
|
|
330
|
-
padding: 0 2rem;
|
|
331
|
-
background: var(--bg);
|
|
332
|
-
border-bottom: 1px solid var(--border);
|
|
333
|
-
flex-shrink: 0;
|
|
334
|
-
}
|
|
335
|
-
.view-tab {
|
|
336
|
-
font-family: 'Poppins', sans-serif;
|
|
337
|
-
padding: 0.625rem 1.25rem;
|
|
338
|
-
font-size: 0.8125rem;
|
|
339
|
-
font-weight: 500;
|
|
340
|
-
cursor: pointer;
|
|
341
|
-
border: none;
|
|
342
|
-
background: none;
|
|
343
|
-
color: var(--text-muted);
|
|
344
|
-
border-bottom: 2px solid transparent;
|
|
345
|
-
transition: all 0.15s;
|
|
346
|
-
}
|
|
347
|
-
.view-tab:hover { color: var(--text); }
|
|
348
|
-
.view-tab.active {
|
|
349
|
-
color: var(--accent);
|
|
350
|
-
border-bottom-color: var(--accent);
|
|
351
|
-
}
|
|
352
|
-
.view-panel { display: none; }
|
|
353
|
-
.view-panel.active { display: flex; flex-direction: column; flex: 1; overflow: hidden; }
|
|
354
|
-
|
|
355
|
-
/* ---- Benchmark view ---- */
|
|
356
|
-
.benchmark-view {
|
|
357
|
-
padding: 1.5rem 2rem;
|
|
358
|
-
overflow-y: auto;
|
|
359
|
-
flex: 1;
|
|
360
|
-
}
|
|
361
|
-
.benchmark-table {
|
|
362
|
-
border-collapse: collapse;
|
|
363
|
-
background: var(--surface);
|
|
364
|
-
border: 1px solid var(--border);
|
|
365
|
-
border-radius: var(--radius);
|
|
366
|
-
font-size: 0.8125rem;
|
|
367
|
-
width: 100%;
|
|
368
|
-
margin-bottom: 1.5rem;
|
|
369
|
-
}
|
|
370
|
-
.benchmark-table th, .benchmark-table td {
|
|
371
|
-
padding: 0.625rem 0.75rem;
|
|
372
|
-
text-align: left;
|
|
373
|
-
border: 1px solid var(--border);
|
|
374
|
-
}
|
|
375
|
-
.benchmark-table th {
|
|
376
|
-
font-family: 'Poppins', sans-serif;
|
|
377
|
-
background: var(--header-bg);
|
|
378
|
-
color: var(--header-text);
|
|
379
|
-
font-weight: 500;
|
|
380
|
-
font-size: 0.75rem;
|
|
381
|
-
text-transform: uppercase;
|
|
382
|
-
letter-spacing: 0.04em;
|
|
383
|
-
}
|
|
384
|
-
.benchmark-table tr:hover { background: var(--bg); }
|
|
385
|
-
.benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }
|
|
386
|
-
.benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }
|
|
387
|
-
.benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }
|
|
388
|
-
.benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }
|
|
389
|
-
.benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }
|
|
390
|
-
.benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }
|
|
391
|
-
.benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }
|
|
392
|
-
.benchmark-delta-positive { color: var(--green); font-weight: 600; }
|
|
393
|
-
.benchmark-delta-negative { color: var(--red); font-weight: 600; }
|
|
394
|
-
.benchmark-notes {
|
|
395
|
-
background: var(--surface);
|
|
396
|
-
border: 1px solid var(--border);
|
|
397
|
-
border-radius: var(--radius);
|
|
398
|
-
padding: 1rem;
|
|
399
|
-
}
|
|
400
|
-
.benchmark-notes h3 {
|
|
401
|
-
font-family: 'Poppins', sans-serif;
|
|
402
|
-
font-size: 0.875rem;
|
|
403
|
-
margin-bottom: 0.75rem;
|
|
404
|
-
}
|
|
405
|
-
.benchmark-notes ul {
|
|
406
|
-
list-style: disc;
|
|
407
|
-
padding-left: 1.25rem;
|
|
408
|
-
}
|
|
409
|
-
.benchmark-notes li {
|
|
410
|
-
font-size: 0.8125rem;
|
|
411
|
-
line-height: 1.6;
|
|
412
|
-
margin-bottom: 0.375rem;
|
|
413
|
-
}
|
|
414
|
-
.benchmark-empty {
|
|
415
|
-
color: var(--text-muted);
|
|
416
|
-
font-style: italic;
|
|
417
|
-
text-align: center;
|
|
418
|
-
padding: 3rem;
|
|
419
|
-
}
|
|
420
|
-
|
|
421
|
-
/* ---- Navigation ---- */
|
|
422
|
-
.nav {
|
|
423
|
-
display: flex;
|
|
424
|
-
justify-content: space-between;
|
|
425
|
-
align-items: center;
|
|
426
|
-
padding: 1rem 2rem;
|
|
427
|
-
border-top: 1px solid var(--border);
|
|
428
|
-
background: var(--surface);
|
|
429
|
-
flex-shrink: 0;
|
|
430
|
-
}
|
|
431
|
-
.nav-btn {
|
|
432
|
-
font-family: 'Poppins', sans-serif;
|
|
433
|
-
padding: 0.5rem 1.25rem;
|
|
434
|
-
border: 1px solid var(--border);
|
|
435
|
-
border-radius: var(--radius);
|
|
436
|
-
background: var(--surface);
|
|
437
|
-
cursor: pointer;
|
|
438
|
-
font-size: 0.875rem;
|
|
439
|
-
font-weight: 500;
|
|
440
|
-
color: var(--text);
|
|
441
|
-
transition: all 0.15s;
|
|
442
|
-
}
|
|
443
|
-
.nav-btn:hover:not(:disabled) {
|
|
444
|
-
background: var(--bg);
|
|
445
|
-
border-color: var(--text-muted);
|
|
446
|
-
}
|
|
447
|
-
.nav-btn:disabled {
|
|
448
|
-
opacity: 0.4;
|
|
449
|
-
cursor: not-allowed;
|
|
450
|
-
}
|
|
451
|
-
.done-btn {
|
|
452
|
-
font-family: 'Poppins', sans-serif;
|
|
453
|
-
padding: 0.5rem 1.5rem;
|
|
454
|
-
border: 1px solid var(--border);
|
|
455
|
-
border-radius: var(--radius);
|
|
456
|
-
background: var(--surface);
|
|
457
|
-
color: var(--text);
|
|
458
|
-
cursor: pointer;
|
|
459
|
-
font-size: 0.875rem;
|
|
460
|
-
font-weight: 500;
|
|
461
|
-
transition: all 0.15s;
|
|
462
|
-
}
|
|
463
|
-
.done-btn:hover {
|
|
464
|
-
background: var(--bg);
|
|
465
|
-
border-color: var(--text-muted);
|
|
466
|
-
}
|
|
467
|
-
.done-btn.ready {
|
|
468
|
-
border: none;
|
|
469
|
-
background: var(--accent);
|
|
470
|
-
color: white;
|
|
471
|
-
font-weight: 600;
|
|
472
|
-
}
|
|
473
|
-
.done-btn.ready:hover {
|
|
474
|
-
background: var(--accent-hover);
|
|
475
|
-
}
|
|
476
|
-
/* ---- Done overlay ---- */
|
|
477
|
-
.done-overlay {
|
|
478
|
-
display: none;
|
|
479
|
-
position: fixed;
|
|
480
|
-
inset: 0;
|
|
481
|
-
background: rgba(0, 0, 0, 0.5);
|
|
482
|
-
z-index: 100;
|
|
483
|
-
justify-content: center;
|
|
484
|
-
align-items: center;
|
|
485
|
-
}
|
|
486
|
-
.done-overlay.visible {
|
|
487
|
-
display: flex;
|
|
488
|
-
}
|
|
489
|
-
.done-card {
|
|
490
|
-
background: var(--surface);
|
|
491
|
-
border-radius: 12px;
|
|
492
|
-
padding: 2rem 3rem;
|
|
493
|
-
text-align: center;
|
|
494
|
-
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
|
|
495
|
-
max-width: 500px;
|
|
496
|
-
}
|
|
497
|
-
.done-card h2 {
|
|
498
|
-
font-size: 1.5rem;
|
|
499
|
-
margin-bottom: 0.5rem;
|
|
500
|
-
}
|
|
501
|
-
.done-card p {
|
|
502
|
-
color: var(--text-muted);
|
|
503
|
-
margin-bottom: 1.5rem;
|
|
504
|
-
line-height: 1.5;
|
|
505
|
-
}
|
|
506
|
-
.done-card .btn-row {
|
|
507
|
-
display: flex;
|
|
508
|
-
gap: 0.5rem;
|
|
509
|
-
justify-content: center;
|
|
510
|
-
}
|
|
511
|
-
.done-card button {
|
|
512
|
-
padding: 0.5rem 1.25rem;
|
|
513
|
-
border: 1px solid var(--border);
|
|
514
|
-
border-radius: var(--radius);
|
|
515
|
-
background: var(--surface);
|
|
516
|
-
cursor: pointer;
|
|
517
|
-
font-size: 0.875rem;
|
|
518
|
-
}
|
|
519
|
-
.done-card button:hover {
|
|
520
|
-
background: var(--bg);
|
|
521
|
-
}
|
|
522
|
-
/* ---- Toast ---- */
|
|
523
|
-
.toast {
|
|
524
|
-
position: fixed;
|
|
525
|
-
bottom: 5rem;
|
|
526
|
-
left: 50%;
|
|
527
|
-
transform: translateX(-50%);
|
|
528
|
-
background: var(--header-bg);
|
|
529
|
-
color: var(--header-text);
|
|
530
|
-
padding: 0.625rem 1.25rem;
|
|
531
|
-
border-radius: var(--radius);
|
|
532
|
-
font-size: 0.875rem;
|
|
533
|
-
opacity: 0;
|
|
534
|
-
transition: opacity 0.3s;
|
|
535
|
-
pointer-events: none;
|
|
536
|
-
z-index: 200;
|
|
537
|
-
}
|
|
538
|
-
.toast.visible {
|
|
539
|
-
opacity: 1;
|
|
540
|
-
}
|
|
541
|
-
</style>
|
|
542
|
-
</head>
|
|
543
|
-
<body>
|
|
544
|
-
<div id="app" style="height:100vh; display:flex; flex-direction:column;">
|
|
545
|
-
<div class="header">
|
|
546
|
-
<div>
|
|
547
|
-
<h1>Eval Review: <span id="skill-name"></span></h1>
|
|
548
|
-
<div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
|
|
549
|
-
</div>
|
|
550
|
-
<div class="progress" id="progress"></div>
|
|
551
|
-
</div>
|
|
552
|
-
|
|
553
|
-
<!-- View tabs (only shown when benchmark data exists) -->
|
|
554
|
-
<div class="view-tabs" id="view-tabs" style="display:none;">
|
|
555
|
-
<button class="view-tab active" onclick="switchView('outputs')">Outputs</button>
|
|
556
|
-
<button class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
|
|
557
|
-
</div>
|
|
558
|
-
|
|
559
|
-
<!-- Outputs panel (qualitative review) -->
|
|
560
|
-
<div class="view-panel active" id="panel-outputs">
|
|
561
|
-
<div class="main">
|
|
562
|
-
<!-- Prompt -->
|
|
563
|
-
<div class="section">
|
|
564
|
-
<div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
|
|
565
|
-
<div class="section-body">
|
|
566
|
-
<div class="prompt-text" id="prompt-text"></div>
|
|
567
|
-
</div>
|
|
568
|
-
</div>
|
|
569
|
-
|
|
570
|
-
<!-- Outputs -->
|
|
571
|
-
<div class="section">
|
|
572
|
-
<div class="section-header">Output</div>
|
|
573
|
-
<div class="section-body" id="outputs-body">
|
|
574
|
-
<div class="empty-state">No output files found</div>
|
|
575
|
-
</div>
|
|
576
|
-
</div>
|
|
577
|
-
|
|
578
|
-
<!-- Previous Output (collapsible) -->
|
|
579
|
-
<div class="section" id="prev-outputs-section" style="display:none;">
|
|
580
|
-
<div class="section-header">
|
|
581
|
-
<div class="grades-toggle" onclick="togglePrevOutputs()">
|
|
582
|
-
<span class="arrow" id="prev-outputs-arrow">▶</span>
|
|
583
|
-
Previous Output
|
|
584
|
-
</div>
|
|
585
|
-
</div>
|
|
586
|
-
<div class="grades-content" id="prev-outputs-content"></div>
|
|
587
|
-
</div>
|
|
588
|
-
|
|
589
|
-
<!-- Grades (collapsible) -->
|
|
590
|
-
<div class="section" id="grades-section" style="display:none;">
|
|
591
|
-
<div class="section-header">
|
|
592
|
-
<div class="grades-toggle" onclick="toggleGrades()">
|
|
593
|
-
<span class="arrow" id="grades-arrow">▶</span>
|
|
594
|
-
Formal Grades
|
|
595
|
-
</div>
|
|
596
|
-
</div>
|
|
597
|
-
<div class="grades-content" id="grades-content"></div>
|
|
598
|
-
</div>
|
|
599
|
-
|
|
600
|
-
<!-- Feedback -->
|
|
601
|
-
<div class="section">
|
|
602
|
-
<div class="section-header">Your Feedback</div>
|
|
603
|
-
<div class="section-body">
|
|
604
|
-
<textarea
|
|
605
|
-
class="feedback-textarea"
|
|
606
|
-
id="feedback"
|
|
607
|
-
placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
|
|
608
|
-
></textarea>
|
|
609
|
-
<div class="feedback-status" id="feedback-status"></div>
|
|
610
|
-
<div class="prev-feedback" id="prev-feedback" style="display:none;">
|
|
611
|
-
<div class="prev-feedback-label">Previous feedback</div>
|
|
612
|
-
<div id="prev-feedback-text"></div>
|
|
613
|
-
</div>
|
|
614
|
-
</div>
|
|
615
|
-
</div>
|
|
616
|
-
</div>
|
|
617
|
-
|
|
618
|
-
<div class="nav" id="outputs-nav">
|
|
619
|
-
<button class="nav-btn" id="prev-btn" onclick="navigate(-1)">← Previous</button>
|
|
620
|
-
<button class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
|
|
621
|
-
<button class="nav-btn" id="next-btn" onclick="navigate(1)">Next →</button>
|
|
622
|
-
</div>
|
|
623
|
-
</div><!-- end panel-outputs -->
|
|
624
|
-
|
|
625
|
-
<!-- Benchmark panel (quantitative stats) -->
|
|
626
|
-
<div class="view-panel" id="panel-benchmark">
|
|
627
|
-
<div class="benchmark-view" id="benchmark-content">
|
|
628
|
-
<div class="benchmark-empty">No benchmark data available. Run a benchmark to see quantitative results here.</div>
|
|
629
|
-
</div>
|
|
630
|
-
</div>
|
|
631
|
-
</div>
|
|
632
|
-
|
|
633
|
-
<!-- Done overlay -->
|
|
634
|
-
<div class="done-overlay" id="done-overlay">
|
|
635
|
-
<div class="done-card">
|
|
636
|
-
<h2>Review Complete</h2>
|
|
637
|
-
<p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
|
|
638
|
-
<div class="btn-row">
|
|
639
|
-
<button onclick="closeDoneDialog()">OK</button>
|
|
640
|
-
</div>
|
|
641
|
-
</div>
|
|
642
|
-
</div>
|
|
643
|
-
|
|
644
|
-
<!-- Toast -->
|
|
645
|
-
<div class="toast" id="toast"></div>
|
|
646
|
-
|
|
647
|
-
<script>
|
|
648
|
-
// ---- Embedded data (injected by generate_review.py) ----
|
|
649
|
-
const EMBEDDED_DATA = {"skill_name": "gemini-cli", "runs": [{"id": "eval-1-cicd-setup-with_skill", "prompt": "I want to set up gemini cli in my CI/CD pipeline on GitHub Actions so it can automatically review PRs. We use an API key stored in GitHub secrets. How do I set this up with structured JSON output so I can parse the review?", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\nHere is a complete guide to configuring Gemini CLI in your CI/CD pipeline to automatically review pull requests with structured JSON output.\n\n---\n\n## Prerequisites\n\n1. A Gemini API key (get one at https://aistudio.google.com/apikey).\n2. The API key stored as a GitHub Actions secret named `GEMINI_API_KEY`.\n\n---\n\n## Step 1: Basic GitHub Actions Workflow\n\nCreate `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n pull-requests: write\n contents: read\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout code\n uses: actions/checkout@v4\n with:\n fetch-depth: 0 # Full history so we can diff against base\n\n - name: Set up Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @google/gemini-cli\n\n - name: Get PR diff\n id: diff\n run: |\n git fetch origin ${{ github.base_ref }}\n git diff origin/${{ github.base_ref }}...HEAD > /tmp/pr-diff.txt\n echo \"diff_size=$(wc -c < /tmp/pr-diff.txt)\" >> \"$GITHUB_OUTPUT\"\n\n - name: Run Gemini PR Review\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n REVIEW_JSON=$(cat /tmp/pr-diff.txt | gemini -p \"You are a senior code reviewer. Review the following pull request diff for:\n 1. Bugs and logical errors\n 2. Security vulnerabilities\n 3. Performance issues\n 4. 
Code style and best practices\n\n Provide your review as structured text with sections for each category.\n If no issues are found in a category, say 'No issues found.'\n End with an overall summary and a verdict: APPROVE, REQUEST_CHANGES, or COMMENT.\" \\\n --output-format json)\n\n # Extract the response text from the JSON envelope\n REVIEW_TEXT=$(echo \"$REVIEW_JSON\" | jq -r '.response')\n\n # Extract token stats for logging\n TOTAL_TOKENS=$(echo \"$REVIEW_JSON\" | jq '.stats.models | to_entries[0].value.tokens.total // 0')\n echo \"Tokens used: $TOTAL_TOKENS\"\n\n # Save review to file (handles multi-line safely)\n echo \"$REVIEW_TEXT\" > /tmp/review-output.txt\n\n # Save for the summary\n echo \"$REVIEW_TEXT\" >> \"$GITHUB_STEP_SUMMARY\"\n\n - name: Post review as PR comment\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n const review = fs.readFileSync('/tmp/review-output.txt', 'utf8');\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body: `## Gemini Code Review\\n\\n${review}\\n\\n---\\n*Automated review by Gemini CLI*`\n });\n```\n\nThis workflow triggers on every PR event, runs the diff through Gemini CLI with `--output-format json`, extracts the response with `jq`, and posts it as a PR comment.\n\n---\n\n## Step 2: Advanced Workflow with Full JSON Parsing\n\nIf you want to parse the structured JSON output more thoroughly -- for example, to extract token usage, check for errors, or conditionally fail the build -- use this expanded version:\n\n```yaml\nname: Gemini PR Review (Advanced)\n\non:\n pull_request:\n types: [opened, synchronize]\n\npermissions:\n pull-requests: write\n contents: read\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Setup Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI and 
jq\n run: |\n npm install -g @google/gemini-cli\n # jq is pre-installed on ubuntu-latest, but just in case:\n which jq || sudo apt-get install -y jq\n\n - name: Run structured review\n id: gemini\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n\n # Skip review if diff is empty\n if [ -z \"$DIFF\" ]; then\n echo \"No changes to review.\"\n echo \"skip=true\" >> \"$GITHUB_OUTPUT\"\n exit 0\n fi\n\n # Run Gemini with JSON output\n RESULT=$(echo \"$DIFF\" | gemini -p \"You are a code review bot. Analyze this PR diff and respond with ONLY a valid JSON object (no markdown fences) using this exact schema:\n {\n \\\"verdict\\\": \\\"APPROVE\\\" | \\\"REQUEST_CHANGES\\\" | \\\"COMMENT\\\",\n \\\"summary\\\": \\\"Brief overall summary\\\",\n \\\"bugs\\\": [{ \\\"file\\\": \\\"path\\\", \\\"line\\\": number, \\\"description\\\": \\\"...\\\" }],\n \\\"security\\\": [{ \\\"file\\\": \\\"path\\\", \\\"severity\\\": \\\"low|medium|high|critical\\\", \\\"description\\\": \\\"...\\\" }],\n \\\"performance\\\": [{ \\\"file\\\": \\\"path\\\", \\\"description\\\": \\\"...\\\" }],\n \\\"suggestions\\\": [{ \\\"file\\\": \\\"path\\\", \\\"description\\\": \\\"...\\\" }]\n }\n Return empty arrays if no issues found in a category.\" \\\n --output-format json)\n\n # The outer JSON has a .response field containing the model's text.\n # The model's text itself is a JSON string we need to parse.\n REVIEW_RESPONSE=$(echo \"$RESULT\" | jq -r '.response')\n\n # Parse the inner JSON (the actual review)\n echo \"$REVIEW_RESPONSE\" | jq '.' 
> /tmp/review.json\n\n # Extract fields for downstream steps\n VERDICT=$(cat /tmp/review.json | jq -r '.verdict')\n SUMMARY=$(cat /tmp/review.json | jq -r '.summary')\n BUG_COUNT=$(cat /tmp/review.json | jq '.bugs | length')\n SECURITY_COUNT=$(cat /tmp/review.json | jq '.security | length')\n CRITICAL_COUNT=$(cat /tmp/review.json | jq '[.security[] | select(.severity == \"critical\")] | length')\n\n echo \"verdict=$VERDICT\" >> \"$GITHUB_OUTPUT\"\n echo \"bug_count=$BUG_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"security_count=$SECURITY_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"critical_count=$CRITICAL_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"skip=false\" >> \"$GITHUB_OUTPUT\"\n\n # Log token usage from the Gemini stats envelope\n TOKENS=$(echo \"$RESULT\" | jq '.stats.models | to_entries[0].value.tokens.total // 0')\n LATENCY=$(echo \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalLatencyMs // 0')\n echo \"Tokens used: $TOKENS, Latency: ${LATENCY}ms\"\n\n - name: Format and post review comment\n if: steps.gemini.outputs.skip != 'true'\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n const review = JSON.parse(fs.readFileSync('/tmp/review.json', 'utf8'));\n\n let body = `## Gemini Code Review\\n\\n`;\n body += `**Verdict:** ${review.verdict}\\n\\n`;\n body += `**Summary:** ${review.summary}\\n\\n`;\n\n if (review.bugs.length > 0) {\n body += `### Bugs (${review.bugs.length})\\n`;\n for (const bug of review.bugs) {\n body += `- **${bug.file}** (line ${bug.line}): ${bug.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.security.length > 0) {\n body += `### Security Issues (${review.security.length})\\n`;\n for (const issue of review.security) {\n const emoji = issue.severity === 'critical' ? '\u00f0\u0178\u201d\u00b4' :\n issue.severity === 'high' ? '\u00f0\u0178\u0178\u00a0' :\n issue.severity === 'medium' ? 
'\u00f0\u0178\u0178\u00a1' : '\u00f0\u0178\u0178\u00a2';\n body += `- ${emoji} **[${issue.severity.toUpperCase()}]** ${issue.file}: ${issue.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.performance.length > 0) {\n body += `### Performance (${review.performance.length})\\n`;\n for (const perf of review.performance) {\n body += `- **${perf.file}**: ${perf.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.suggestions.length > 0) {\n body += `### Suggestions (${review.suggestions.length})\\n`;\n for (const sug of review.suggestions) {\n body += `- **${sug.file}**: ${sug.description}\\n`;\n }\n body += '\\n';\n }\n\n body += `---\\n*Automated review by Gemini CLI*`;\n\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body\n });\n\n - name: Fail on critical security issues\n if: steps.gemini.outputs.critical_count != '0' && steps.gemini.outputs.skip != 'true'\n run: |\n echo \"::error::Gemini found ${{ steps.gemini.outputs.critical_count }} critical security issue(s). 
Review required.\"\n exit 1\n```\n\n---\n\n## Understanding the JSON Output\n\nGemini CLI's `--output-format json` wraps the model's response in an envelope with this schema:\n\n```json\n{\n \"response\": \"The model's text response (string)\",\n \"stats\": {\n \"models\": {\n \"gemini-2.5-pro\": {\n \"api\": {\n \"totalRequests\": 2,\n \"totalErrors\": 0,\n \"totalLatencyMs\": 5053\n },\n \"tokens\": {\n \"prompt\": 24939,\n \"candidates\": 20,\n \"total\": 25113,\n \"cached\": 21263,\n \"thoughts\": 154,\n \"tool\": 0\n }\n }\n },\n \"tools\": {\n \"totalCalls\": 1,\n \"totalSuccess\": 1,\n \"totalFail\": 0,\n \"totalDurationMs\": 1881\n },\n \"files\": {\n \"totalLinesAdded\": 5,\n \"totalLinesRemoved\": 2\n }\n }\n}\n```\n\nTo parse this in your pipeline:\n\n```bash\n# Get the model's text response\necho \"$RESULT\" | jq -r '.response'\n\n# Get total tokens used\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.tokens.total'\n\n# Get API latency\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalLatencyMs'\n\n# Check for API errors\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalErrors'\n```\n\n---\n\n## Exit Codes for CI Logic\n\nGemini CLI returns meaningful exit codes you can use for conditional logic:\n\n| Exit Code | Meaning |\n|-----------|---------|\n| 0 | Success |\n| 1 | General error |\n| 2 | Authentication error (bad or missing API key) |\n| 3 | Rate limit / quota exceeded |\n\nExample usage:\n\n```bash\ngemini -p \"Review this code\" --output-format json > /tmp/result.json 2>/dev/null\nEXIT_CODE=$?\n\nif [ $EXIT_CODE -eq 2 ]; then\n echo \"::error::Gemini API key is invalid or missing. Check your GEMINI_API_KEY secret.\"\n exit 1\nelif [ $EXIT_CODE -eq 3 ]; then\n echo \"::warning::Gemini rate limit reached. Skipping review.\"\n exit 0\nelif [ $EXIT_CODE -ne 0 ]; then\n echo \"::warning::Gemini CLI failed with exit code $EXIT_CODE. 
Skipping review.\"\n exit 0\nfi\n```\n\n---\n\n## Tips and Best Practices\n\n1. **Use `--output-format json` (not `stream-json`)** for CI. The `json` format gives you a single complete JSON object after all processing is done. The `stream-json` format emits newline-delimited JSON events in real-time, which is harder to parse in a script.\n\n2. **Authentication**: Set `GEMINI_API_KEY` as an environment variable. Gemini CLI picks it up automatically. No `settings.json` or `.env` file is needed in CI.\n\n3. **Model selection**: Add `-m gemini-2.5-flash` for faster, cheaper reviews or `-m gemini-2.5-pro` for deeper analysis:\n ```bash\n gemini -m gemini-2.5-flash -p \"Review this diff\" --output-format json\n ```\n\n4. **Large diffs**: Gemini models support up to 1M tokens of context. For very large PRs, consider filtering the diff to only changed files that matter:\n ```bash\n git diff origin/main...HEAD -- '*.py' '*.js' '*.ts' | gemini -p \"Review\" --output-format json\n ```\n\n5. **File references with `@` syntax**: You can reference specific files directly in your prompt:\n ```bash\n gemini -p \"Review @./src/auth.py and @./src/middleware.py for security issues\" --output-format json\n ```\n\n6. **Do NOT use `--yolo` for reviews**. The `--yolo` flag auto-accepts all tool actions (file writes, shell commands). For a read-only review, you do not need it. Only use `--yolo` if you want Gemini to make actual code changes.\n\n7. **Rate limits on the free tier**: 60 requests/minute, 1,000 requests/day. For high-volume repos, consider using a paid API key or adding concurrency limits to your workflow.\n\n8. 
**Debug logging**: If reviews are producing unexpected output, add `--debug-log` to capture full API traces:\n ```bash\n gemini -p \"Review this code\" --output-format json --debug-log ./gemini-debug.jsonl\n ```\n Then upload `gemini-debug.jsonl` as a workflow artifact for inspection.\n\n---\n\n## Minimal Quick-Start Example\n\nIf you just want the simplest possible setup to get started:\n\n```yaml\nname: Gemini Review\non: [pull_request]\npermissions:\n pull-requests: write\n contents: read\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/checkout@v4\n with:\n fetch-depth: 0\n - run: npm install -g @google/gemini-cli\n - name: Review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n REVIEW=$(git diff origin/${{ github.base_ref }}...HEAD | gemini -p \"Review these changes for bugs and issues\" --output-format json | jq -r '.response')\n echo \"$REVIEW\" >> $GITHUB_STEP_SUMMARY\n```\n\nThis pipes the PR diff into Gemini CLI, extracts the review text from the JSON output with `jq`, and writes it to the GitHub Actions step summary where it is visible directly on the workflow run page.\n"}], "grading": {"eval_id": 1, "eval_name": "cicd-setup", "configuration": "with_skill", "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": true, "evidence": "Uses `-p` flag throughout: `gemini -p \"...\" --output-format json`"}, {"text": "Includes --output-format json flag for structured output", "passed": true, "evidence": "Correctly uses `--output-format json` in all examples"}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Shows `GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}` in workflow env block"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Provides 3 complete workflows: basic, advanced, and minimal quick-start"}, {"text": "Shows how to parse JSON response (e.g., 
with jq .response)", "passed": true, "evidence": "Shows `jq -r '.response'` and detailed stats extraction with jq"}, {"text": "Does not invent non-existent CLI flags or options", "passed": true, "evidence": "All flags used (-p, --output-format json, -m, --yolo, --debug-log) are real Gemini CLI flags"}]}}, {"id": "eval-1-cicd-setup-without_skill", "prompt": "I want to set up gemini cli in my CI/CD pipeline on GitHub Actions so it can automatically review PRs. We use an API key stored in GitHub secrets. How do I set this up with structured JSON output so I can parse the review?", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\nThis guide covers how to configure Gemini CLI in a GitHub Actions CI/CD pipeline to automatically review pull requests, using an API key from GitHub Secrets and structured JSON output for parsing.\n\n## Prerequisites\n\n- A Google AI API key (from [Google AI Studio](https://aistudio.google.com/apikey))\n- The API key stored as a GitHub repository secret (e.g., `GEMINI_API_KEY`)\n- A GitHub repository with pull request workflows\n\n## Step 1: Store Your API Key in GitHub Secrets\n\n1. Go to your repository on GitHub.\n2. Navigate to **Settings > Secrets and variables > Actions**.\n3. Click **New repository secret**.\n4. 
Name it `GEMINI_API_KEY` and paste your API key as the value.\n\n## Step 2: Create the GitHub Actions Workflow\n\nCreate a file at `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout repository\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Set up Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @anthropic-ai/gemini-cli || npm install -g gemini-cli\n\n - name: Get PR diff\n id: diff\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n # Write diff to a file to avoid shell escaping issues\n echo \"$DIFF\" > /tmp/pr_diff.txt\n\n - name: Get changed files list\n id: files\n run: |\n git fetch origin ${{ github.base_ref }}\n FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)\n echo \"$FILES\" > /tmp/changed_files.txt\n\n - name: Run Gemini CLI review\n id: gemini-review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n DIFF=$(cat /tmp/pr_diff.txt)\n FILES=$(cat /tmp/changed_files.txt)\n\n PROMPT=$(cat <<'PROMPT_EOF'\n You are a senior code reviewer. 
Analyze the following pull request diff and provide a structured review.\n\n Changed files:\n FILE_LIST_PLACEHOLDER\n\n Diff:\n DIFF_PLACEHOLDER\n\n Respond ONLY with valid JSON in this exact format (no markdown fencing, no extra text):\n {\n \"summary\": \"Brief summary of the changes\",\n \"risk_level\": \"low|medium|high\",\n \"issues\": [\n {\n \"file\": \"path/to/file\",\n \"line\": 42,\n \"severity\": \"error|warning|suggestion\",\n \"message\": \"Description of the issue\",\n \"suggestion\": \"How to fix it\"\n }\n ],\n \"positive_aspects\": [\"List of things done well\"],\n \"approval_recommendation\": \"approve|request_changes|comment\"\n }\n PROMPT_EOF\n )\n\n # Replace placeholders\n PROMPT=\"${PROMPT/FILE_LIST_PLACEHOLDER/$FILES}\"\n PROMPT=\"${PROMPT/DIFF_PLACEHOLDER/$DIFF}\"\n\n # Run Gemini CLI with the prompt\n # Using the --json flag if available, or parsing output directly\n RESPONSE=$(echo \"$PROMPT\" | gemini \\\n --model gemini-2.0-flash \\\n --api-key \"$GEMINI_API_KEY\" \\\n 2>/dev/null) || true\n\n # If gemini CLI is not available, fall back to direct API call\n if [ -z \"$RESPONSE\" ]; then\n RESPONSE=$(curl -s \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${GEMINI_API_KEY}\" \\\n -H 'Content-Type: application/json' \\\n -d \"$(jq -n --arg prompt \"$PROMPT\" '{\n \"contents\": [{\"parts\": [{\"text\": $prompt}]}],\n \"generationConfig\": {\n \"responseMimeType\": \"application/json\",\n \"temperature\": 0.2\n }\n }')\" | jq -r '.candidates[0].content.parts[0].text')\n fi\n\n echo \"$RESPONSE\" > /tmp/review_result.json\n\n # Validate JSON\n if jq empty /tmp/review_result.json 2>/dev/null; then\n echo \"valid_json=true\" >> $GITHUB_OUTPUT\n else\n echo \"valid_json=false\" >> $GITHUB_OUTPUT\n echo \"Raw response was not valid JSON, attempting extraction...\"\n # Try to extract JSON from markdown fenced blocks\n sed -n '/^```json/,/^```$/p' /tmp/review_result.json | sed '1d;$d' > 
/tmp/review_cleaned.json\n if jq empty /tmp/review_cleaned.json 2>/dev/null; then\n mv /tmp/review_cleaned.json /tmp/review_result.json\n echo \"valid_json=true\" >> $GITHUB_OUTPUT\n fi\n fi\n\n - name: Parse and post review\n if: success()\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n REVIEW_FILE=\"/tmp/review_result.json\"\n\n if ! jq empty \"$REVIEW_FILE\" 2>/dev/null; then\n echo \"Failed to get valid JSON review. Posting raw output as comment.\"\n gh pr comment ${{ github.event.pull_request.number }} \\\n --body \"## Gemini Review\n\n > Could not parse structured output. Raw response attached.\n\n \\`\\`\\`\n $(cat \"$REVIEW_FILE\")\n \\`\\`\\`\"\n exit 0\n fi\n\n # Extract fields from JSON\n SUMMARY=$(jq -r '.summary' \"$REVIEW_FILE\")\n RISK=$(jq -r '.risk_level' \"$REVIEW_FILE\")\n APPROVAL=$(jq -r '.approval_recommendation' \"$REVIEW_FILE\")\n POSITIVES=$(jq -r '.positive_aspects // [] | map(\"- \" + .) | join(\"\\n\")' \"$REVIEW_FILE\")\n ISSUE_COUNT=$(jq -r '.issues | length' \"$REVIEW_FILE\")\n\n # Build issue table\n ISSUES_MD=\"\"\n if [ \"$ISSUE_COUNT\" -gt 0 ]; then\n ISSUES_MD=\"### Issues Found ($ISSUE_COUNT)\n\n | Severity | File | Line | Message | Suggestion |\n |----------|------|------|---------|------------|\n \"\n ISSUES_MD+=$(jq -r '.issues[] | \"| \\(.severity) | `\\(.file)` | \\(.line) | \\(.message) | \\(.suggestion) |\"' \"$REVIEW_FILE\")\n else\n ISSUES_MD=\"### No issues found\"\n fi\n\n # Build risk badge\n case \"$RISK\" in\n low) RISK_BADGE=\"\" ;;\n medium) RISK_BADGE=\"\" ;;\n high) RISK_BADGE=\"\" ;;\n *) RISK_BADGE=\"Risk: $RISK\" ;;\n esac\n\n # Compose comment\n COMMENT=$(cat <<EOF\n ## Gemini Automated PR Review\n\n $RISK_BADGE\n\n ### Summary\n $SUMMARY\n\n $ISSUES_MD\n\n ### Positive Aspects\n $POSITIVES\n\n ---\n **Recommendation:** \\`$APPROVAL\\`\n EOF\n )\n\n # Post comment to PR\n gh pr comment ${{ github.event.pull_request.number }} --body \"$COMMENT\"\n```\n\n## Step 3: Direct API Approach with 
Structured JSON Output (Recommended)\n\nIf the Gemini CLI does not reliably produce JSON, use the Gemini REST API directly with `responseMimeType` set to `application/json`. This forces the model to return valid JSON. Here is a standalone workflow that uses `curl` and `jq`:\n\n```yaml\nname: Gemini PR Review (API Direct)\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Generate review via Gemini API\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD | head -c 30000)\n FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)\n\n # Build the prompt\n PROMPT=\"You are a code reviewer. Review this PR diff. Changed files: ${FILES}. Diff: ${DIFF}. Return a JSON object with keys: summary (string), risk_level (low/medium/high), issues (array of {file, line, severity, message, suggestion}), positive_aspects (array of strings), approval_recommendation (approve/request_changes/comment).\"\n\n # Call Gemini API with structured output\n RESPONSE=$(curl -sf \\\n \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${GEMINI_API_KEY}\" \\\n -H 'Content-Type: application/json' \\\n -d \"$(jq -n --arg prompt \"$PROMPT\" '{\n contents: [{parts: [{text: $prompt}]}],\n generationConfig: {\n responseMimeType: \"application/json\",\n responseSchema: {\n type: \"object\",\n properties: {\n summary: {type: \"string\"},\n risk_level: {type: \"string\", enum: [\"low\", \"medium\", \"high\"]},\n issues: {\n type: \"array\",\n items: {\n type: \"object\",\n properties: {\n file: {type: \"string\"},\n line: {type: \"integer\"},\n severity: {type: \"string\", enum: [\"error\", \"warning\", \"suggestion\"]},\n message: 
{type: \"string\"},\n suggestion: {type: \"string\"}\n },\n required: [\"file\", \"severity\", \"message\"]\n }\n },\n positive_aspects: {type: \"array\", items: {type: \"string\"}},\n approval_recommendation: {type: \"string\", enum: [\"approve\", \"request_changes\", \"comment\"]}\n },\n required: [\"summary\", \"risk_level\", \"issues\", \"approval_recommendation\"]\n },\n temperature: 0.2\n }\n }')\")\n\n # Extract the text content from the API response\n REVIEW=$(echo \"$RESPONSE\" | jq -r '.candidates[0].content.parts[0].text')\n echo \"$REVIEW\" > /tmp/review.json\n\n # Validate\n jq empty /tmp/review.json\n\n - name: Post review comment\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n REVIEW_FILE=\"/tmp/review.json\"\n\n SUMMARY=$(jq -r '.summary' \"$REVIEW_FILE\")\n RISK=$(jq -r '.risk_level' \"$REVIEW_FILE\")\n APPROVAL=$(jq -r '.approval_recommendation' \"$REVIEW_FILE\")\n\n # Format issues as markdown\n ISSUES=$(jq -r '\n if (.issues | length) > 0 then\n \"| Severity | File | Line | Issue | Fix |\\n|---|---|---|---|---|\\n\" +\n (.issues[] | \"| \\(.severity) | `\\(.file)` | \\(.line // \"N/A\") | \\(.message) | \\(.suggestion // \"N/A\") |\")\n else\n \"_No issues found._\"\n end\n ' \"$REVIEW_FILE\")\n\n POSITIVES=$(jq -r '(.positive_aspects // []) | map(\"- \" + .) | join(\"\\n\")' \"$REVIEW_FILE\")\n\n BODY=$(cat <<EOF\n ## Automated PR Review (Gemini)\n\n **Risk Level:** \\`${RISK}\\` | **Recommendation:** \\`${APPROVAL}\\`\n\n ### Summary\n ${SUMMARY}\n\n ### Issues\n ${ISSUES}\n\n ### What looks good\n ${POSITIVES}\n EOF\n )\n\n gh pr comment \"${{ github.event.pull_request.number }}\" --body \"$BODY\"\n```\n\n## Step 4: Using Gemini CLI with `--json` Flag (If Supported)\n\nSome versions of the Gemini CLI support a `--json` output flag. 
If your version does, the invocation simplifies to:\n\n```bash\ngemini --model gemini-2.0-flash \\\n --api-key \"$GEMINI_API_KEY\" \\\n --json \\\n --prompt \"Review this code diff and return structured JSON: $DIFF\"\n```\n\nCheck your installed version's help output (`gemini --help`) to confirm available flags.\n\n## Step 5: Parsing the JSON Output in Downstream Steps\n\nOnce you have valid JSON in a file, you can use it in subsequent pipeline steps:\n\n```yaml\n - name: Fail on high-risk reviews\n run: |\n RISK=$(jq -r '.risk_level' /tmp/review.json)\n ERRORS=$(jq '[.issues[] | select(.severity == \"error\")] | length' /tmp/review.json)\n\n echo \"Risk level: $RISK\"\n echo \"Error count: $ERRORS\"\n\n if [ \"$RISK\" = \"high\" ] || [ \"$ERRORS\" -gt 0 ]; then\n echo \"::error::PR review found high risk or errors. Please address the issues.\"\n exit 1\n fi\n```\n\nYou can also use it in a matrix strategy or pass data between jobs:\n\n```yaml\n - name: Export review data\n id: export\n run: |\n echo \"risk=$(jq -r '.risk_level' /tmp/review.json)\" >> $GITHUB_OUTPUT\n echo \"approval=$(jq -r '.approval_recommendation' /tmp/review.json)\" >> $GITHUB_OUTPUT\n echo \"issue_count=$(jq '.issues | length' /tmp/review.json)\" >> $GITHUB_OUTPUT\n```\n\nThen reference in later steps or jobs:\n\n```yaml\n - name: Auto-approve if clean\n if: steps.export.outputs.approval == 'approve' && steps.export.outputs.issue_count == '0'\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n gh pr review \"${{ github.event.pull_request.number }}\" --approve --body \"Automated review passed with no issues.\"\n```\n\n## Key Points\n\n1. **`responseMimeType: \"application/json\"`** in the Gemini API `generationConfig` forces the model to return valid JSON. This is the most reliable way to get structured output.\n\n2. **`responseSchema`** lets you define the exact JSON schema you expect. The model will conform to it. This eliminates the need for post-processing or regex extraction.\n\n3. 
**Diff size limits**: Git diffs can be very large. Truncate them (e.g., `head -c 30000`) to stay within token limits. For large PRs, consider reviewing changed files individually.\n\n4. **Error handling**: Always validate the JSON output with `jq empty` before trying to parse it. Have a fallback path that posts raw output as a comment.\n\n5. **Rate limits**: The Gemini API has rate limits. For high-traffic repos, add retry logic or use the paid tier.\n\n6. **Security**: The `GEMINI_API_KEY` secret is never exposed in logs because GitHub Actions automatically masks secrets in output. The `GITHUB_TOKEN` is provided automatically by Actions with the permissions declared in the workflow.\n\n## Troubleshooting\n\n- **\"jq: parse error\"**: The model returned non-JSON text. Use `responseMimeType: \"application/json\"` to fix this.\n- **Empty response**: Check that `GEMINI_API_KEY` is set correctly in secrets and that the model name is valid.\n- **Diff too large**: Truncate the diff or split the review across multiple API calls, one per file.\n- **Permission denied on PR comment**: Ensure `permissions: pull-requests: write` is set in the workflow.\n"}], "grading": {"eval_id": 1, "eval_name": "cicd-setup", "configuration": "without_skill", "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": false, "evidence": "Pipes prompt via stdin (`echo \"$PROMPT\" | gemini`) instead of using -p flag. Does not demonstrate the correct headless invocation."}, {"text": "Includes --output-format json flag for structured output", "passed": false, "evidence": "Mentions `--json` in Step 4 but says 'if supported' and 'check your version'. 
Does not use the correct `--output-format json` flag."}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Shows `GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}` in env block"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Provides multiple complete workflow YAML files"}, {"text": "Shows how to parse JSON response (e.g., with jq .response)", "passed": false, "evidence": "Parses the REST API response structure (candidates[0].content.parts[0].text) rather than Gemini CLI's JSON envelope (.response). Falls back to curl-based API calls."}, {"text": "Does not invent non-existent CLI flags or options", "passed": false, "evidence": "Uses `--api-key` flag (doesn't exist), `--json` flag (doesn't exist), and references wrong npm package `@anthropic-ai/gemini-cli`"}]}}, {"id": "eval-2-mcp-server-config-with_skill", "prompt": "I need to connect a custom MCP server I built in Python to gemini cli. The server needs an API key from my environment. Also I want to restrict which tools it can expose. How do I configure this?", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\nYou need to configure your server in `settings.json`, pass the API key via the `env` field, and use `includeTools` to restrict which tools are exposed. Here is the full walkthrough.\n\n---\n\n## 1. Configure the MCP Server in settings.json\n\nEdit (or create) `~/.gemini/settings.json` for a global configuration, or `<your-project>/.gemini/settings.json` for project-scoped configuration. 
Add your server under the `mcpServers` key:\n\n```json\n{\n \"mcpServers\": {\n \"myPythonServer\": {\n \"command\": \"python\",\n \"args\": [\"-m\", \"my_mcp_server\"],\n \"cwd\": \"./mcp_tools\",\n \"env\": {\n \"MY_API_KEY\": \"$MY_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"search_records\", \"get_status\"]\n }\n }\n}\n```\n\n### What each field does\n\n| Field | Type | Description |\n|---|---|---|\n| `command` | string | The executable to run. Use `\"python\"` (or `\"python3\"` depending on your system). |\n| `args` | string[] | Arguments passed to the command. `[\"-m\", \"my_mcp_server\"]` runs your package as a module. You can also use a script path like `[\"./server.py\"]`. |\n| `cwd` | string | Working directory for the server process. Optional -- set this if your server needs to run from a specific directory. |\n| `env` | object | Environment variables passed to the server process. Use the `\"$VAR_NAME\"` syntax to reference variables from your shell environment. |\n| `timeout` | number | Connection timeout in milliseconds. Default is 10000 (10 seconds). Increase if your server takes longer to start. |\n| `trust` | boolean | When `true`, skips all tool confirmation dialogs. Keep this `false` unless you fully control and trust the server. |\n| `includeTools` | string[] | **Whitelist** -- only these tools will be exposed to Gemini. If set, all other tools from the server are hidden. |\n| `excludeTools` | string[] | **Blacklist** -- these specific tools will be hidden. All others remain available. |\n\n**Important:** `includeTools` and `excludeTools` are mutually exclusive approaches. Use one or the other:\n- Use `includeTools` when your server exposes many tools but you only want a few available.\n- Use `excludeTools` when you want most tools but need to block specific dangerous ones.\n\n---\n\n## 2. Pass the API Key Securely\n\nNever hardcode secrets directly in `settings.json`. 
Instead, use the `\"$VAR_NAME\"` pattern to reference environment variables from your shell:\n\n```json\n\"env\": {\n \"MY_API_KEY\": \"$MY_API_KEY\"\n}\n```\n\nThen set the variable in your shell before running Gemini:\n\n```bash\nexport MY_API_KEY=\"sk-your-actual-api-key-here\"\ngemini\n```\n\nAlternatively, you can store the variable in a `.env` file that Gemini loads automatically:\n\n- `~/.gemini/.env` -- global env file\n- `./.gemini/.env` -- project-level env file\n\nExample `.gemini/.env`:\n\n```\nMY_API_KEY=sk-your-actual-api-key-here\n```\n\n**Security note:** Gemini CLI automatically redacts sensitive env vars (names containing `TOKEN`, `SECRET`, `KEY`, etc.) from MCP processes unless you explicitly list them in the `env` field. So you must include the variable in `env` for it to reach your server.\n\n---\n\n## 3. Full Working Example\n\nSuppose you have a Python MCP server at `~/mcp-servers/analytics_server.py` that exposes five tools (`query_data`, `generate_report`, `delete_records`, `export_csv`, `run_migration`), and you only want `query_data` and `generate_report` available. The server needs a `ANALYTICS_API_KEY`.\n\n### settings.json\n\n```json\n{\n \"mcpServers\": {\n \"analytics\": {\n \"command\": \"python3\",\n \"args\": [\"analytics_server.py\"],\n \"cwd\": \"/home/user/mcp-servers\",\n \"env\": {\n \"ANALYTICS_API_KEY\": \"$ANALYTICS_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"query_data\", \"generate_report\"]\n }\n }\n}\n```\n\n### Shell setup\n\n```bash\nexport ANALYTICS_API_KEY=\"your-key-here\"\ngemini\n```\n\n### Using the server in a Gemini session\n\nOnce inside the interactive session, you can verify the server connected:\n\n```\n/mcp list\n/mcp status\n```\n\nThen invoke the tools by addressing the server:\n\n```\n> @analytics Query monthly active users for Q4 2025\n> @analytics Generate a report on user retention trends\n```\n\n---\n\n## 4. 
Alternative: Use excludeTools Instead\n\nIf you want all tools available except specific dangerous ones:\n\n```json\n{\n \"mcpServers\": {\n \"analytics\": {\n \"command\": \"python3\",\n \"args\": [\"analytics_server.py\"],\n \"cwd\": \"/home/user/mcp-servers\",\n \"env\": {\n \"ANALYTICS_API_KEY\": \"$ANALYTICS_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"excludeTools\": [\"delete_records\", \"run_migration\"]\n }\n }\n}\n```\n\nThis exposes everything except `delete_records` and `run_migration`.\n\n---\n\n## 5. Adding the Server via CLI (Alternative to Manual Editing)\n\nYou can also add MCP servers from the command line:\n\n```bash\ngemini mcp add analytics --command \"python3 analytics_server.py\"\n```\n\nHowever, the CLI `add` command provides less control over fields like `env`, `includeTools`, and `cwd`. For full configuration, editing `settings.json` directly is recommended.\n\nTo manage servers later:\n\n```bash\ngemini mcp list # list all configured servers\ngemini mcp remove analytics # remove a server\n```\n\nOr from within an interactive session:\n\n```\n/mcp list\n/mcp enable analytics\n/mcp disable analytics\n```\n\n---\n\n## 6. Minimal Python MCP Server Skeleton\n\nFor reference, here is the basic structure of a Python MCP server that your configuration would launch:\n\n```python\n# analytics_server.py\nimport os\nimport json\nimport sys\n\n# Read the API key passed through the env config\napi_key = os.environ.get(\"ANALYTICS_API_KEY\")\nif not api_key:\n print(\"Error: ANALYTICS_API_KEY not set\", file=sys.stderr)\n sys.exit(1)\n\n# Your MCP server implementation here\n# The server communicates over stdio using the MCP protocol\n# Tools like query_data, generate_report, etc. are registered\n# and handled according to the MCP specification.\n```\n\n---\n\n## Summary\n\n1. Add your server to `mcpServers` in `~/.gemini/settings.json` (or project-level).\n2. 
Pass the API key using `\"env\": { \"MY_API_KEY\": \"$MY_API_KEY\" }` -- never hardcode it.\n3. Restrict tools with `\"includeTools\": [\"tool1\", \"tool2\"]` (whitelist) or `\"excludeTools\": [\"bad_tool\"]` (blacklist).\n4. Set the actual environment variable in your shell or in a `.gemini/.env` file.\n5. Verify the connection with `/mcp list` and `/mcp status` inside an interactive session.\n"}], "grading": {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "with_skill", "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "Mentions both `~/.gemini/settings.json` (global) and `<project>/.gemini/settings.json` (project-scoped)"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Complete mcpServers config with command, args, cwd, env, timeout, trust, includeTools fields"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": true, "evidence": "Shows `\"MY_API_KEY\": \"$MY_API_KEY\"` and explains the $VAR_NAME pattern explicitly"}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": true, "evidence": "Explains both includeTools (whitelist) and excludeTools (blacklist) with examples and guidance on when to use each"}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": true, "evidence": "Warns to never hardcode secrets, explains trust field, mentions auto-redaction of sensitive env vars"}, {"text": "Uses correct Python command (python/python3) in the command field", "passed": true, "evidence": "Uses both `python` and `python3` with note about system differences"}]}}, {"id": "eval-2-mcp-server-config-without_skill", "prompt": "I need to connect a custom MCP server I built in Python to gemini cli. The server needs an API key from my environment. Also I want to restrict which tools it can expose. 
How do I configure this?", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\n## Overview\n\nGemini CLI supports MCP (Model Context Protocol) servers through its settings configuration. You can connect a custom Python MCP server, pass environment variables (like API keys), and restrict which tools are exposed.\n\n## 1. Build Your Python MCP Server\n\nFirst, make sure your Python MCP server uses the official MCP SDK. Here is a minimal example:\n\n```python\n# my_mcp_server.py\nimport os\nfrom mcp.server.fastmcp import FastMCP\n\n# Read the API key from environment\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nif not API_KEY:\n raise ValueError(\"MY_API_KEY environment variable is required\")\n\nmcp = FastMCP(\"my-custom-server\")\n\n@mcp.tool()\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API using the configured API key.\"\"\"\n # Use API_KEY here for your requests\n return f\"Results for '{query}' using key ending in ...{API_KEY[-4:]}\"\n\n@mcp.tool()\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n@mcp.tool()\ndef internal_debug(info: str) -> str:\n \"\"\"Internal debug tool - you may want to hide this from Gemini.\"\"\"\n return f\"Debug: {info}\"\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\nInstall the dependency:\n\n```bash\npip install mcp\n```\n\n## 2. Configure Gemini CLI to Use Your MCP Server\n\nGemini CLI reads MCP server configuration from a `settings.json` file. 
There are two levels:\n\n- **Global (user-level):** `~/.gemini/settings.json`\n- **Project-level:** `.gemini/settings.json` in your project root\n\n### Basic Configuration\n\nCreate or edit `~/.gemini/settings.json` (or `.gemini/settings.json` in your project):\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"cwd\": \"/absolute/path/to/your/server/directory\",\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\n### Configuration Fields Explained\n\n| Field | Description |\n|-------|-------------|\n| `command` | The executable to run (e.g., `python`, `python3`, `node`, `npx`) |\n| `args` | Array of arguments passed to the command |\n| `cwd` | Working directory for the server process (optional but recommended) |\n| `env` | Environment variables passed to the server process |\n\n### Using a Virtual Environment\n\nIf your MCP server has its own virtual environment:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"/path/to/venv/bin/python\",\n \"args\": [\"/absolute/path/to/my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\nOn Windows, use the Windows path to the venv Python:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"C:\\\\path\\\\to\\\\venv\\\\Scripts\\\\python.exe\",\n \"args\": [\"C:\\\\path\\\\to\\\\my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\n## 3. 
Passing the API Key Securely\n\nThere are several approaches to handle the API key:\n\n### Option A: Inline in settings.json (simplest, but key is in plaintext)\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"sk-abc123your-actual-key\"\n }\n }\n }\n}\n```\n\n### Option B: Reference a shell variable at launch time\n\nSet the environment variable in your shell profile (`.bashrc`, `.zshrc`, etc.):\n\n```bash\nexport MY_API_KEY=\"sk-abc123your-actual-key\"\n```\n\nThen in `settings.json`, you can omit the `env` block entirely if the variable is already in your shell environment. The child process inherits the parent environment. However, to be explicit:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"${MY_API_KEY}\"\n }\n }\n }\n}\n```\n\n> **Note:** Whether `${MY_API_KEY}` variable interpolation is supported depends on the Gemini CLI version. If it is not supported, you may need to use a wrapper script approach (see Option C).\n\n### Option C: Wrapper script (most flexible)\n\nCreate a wrapper script `run_server.sh`:\n\n```bash\n#!/bin/bash\n# run_server.sh\n# Load API key from a secure location\nsource ~/.secrets/my_api_keys.env\nexport MY_API_KEY\nexec python /path/to/my_mcp_server.py \"$@\"\n```\n\nThen configure:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"bash\",\n \"args\": [\"/path/to/run_server.sh\"]\n }\n }\n}\n```\n\n## 4. Restricting Which Tools Are Exposed\n\nThere are two approaches to restricting tools: server-side and client-side.\n\n### Approach A: Server-Side Restriction (Recommended)\n\nOnly define the tools you want exposed in your MCP server. 
Simply remove or comment out any tools you do not want Gemini to access:\n\n```python\n# my_mcp_server.py - only expose the tools you want\nfrom mcp.server.fastmcp import FastMCP\nimport os\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nmcp = FastMCP(\"my-custom-server\")\n\n@mcp.tool()\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API.\"\"\"\n return f\"Results for '{query}'\"\n\n@mcp.tool()\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n# internal_debug is NOT registered - Gemini will never see it\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\n### Approach B: Conditional Tool Registration\n\nUse an environment variable or config file to control which tools are registered:\n\n```python\n# my_mcp_server.py - conditionally register tools\nfrom mcp.server.fastmcp import FastMCP\nimport os\nimport json\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nALLOWED_TOOLS = os.environ.get(\"ALLOWED_TOOLS\", \"\").split(\",\")\n\nmcp = FastMCP(\"my-custom-server\")\n\ndef register_if_allowed(tool_name):\n \"\"\"Decorator factory that only registers a tool if it's in the allowed list.\"\"\"\n def decorator(func):\n if not ALLOWED_TOOLS or not ALLOWED_TOOLS[0]:\n # No restriction set, register all tools\n return mcp.tool()(func)\n elif tool_name in ALLOWED_TOOLS:\n return mcp.tool()(func)\n return func\n return decorator\n\n@register_if_allowed(\"fetch_data\")\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API.\"\"\"\n return f\"Results for '{query}'\"\n\n@register_if_allowed(\"analyze_data\")\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n@register_if_allowed(\"internal_debug\")\ndef internal_debug(info: str) -> str:\n \"\"\"Internal debug tool.\"\"\"\n return f\"Debug: {info}\"\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\nThen in your `settings.json`, pass the allowed tools 
list:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-here\",\n \"ALLOWED_TOOLS\": \"fetch_data,analyze_data\"\n }\n }\n }\n}\n```\n\nThis way, `internal_debug` is never registered and Gemini cannot call it.\n\n### Approach C: Gemini CLI Tool Filtering (if supported)\n\nSome MCP clients support an `allowedTools` or `toolFilter` field in the server configuration. If Gemini CLI supports this, the configuration would look like:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-here\"\n },\n \"toolFilter\": {\n \"allow\": [\"fetch_data\", \"analyze_data\"]\n }\n }\n }\n}\n```\n\n> **Note:** The exact field name and syntax for client-side tool filtering varies by MCP client implementation. Check the Gemini CLI documentation for the exact syntax. If Gemini CLI does not support this field, use the server-side approaches (A or B) described above.\n\n## 5. 
Complete Working Example\n\n### Directory Structure\n\n```\nmy-mcp-project/\n my_mcp_server.py\n requirements.txt\n .gemini/\n settings.json\n```\n\n### requirements.txt\n\n```\nmcp>=1.0.0\n```\n\n### my_mcp_server.py\n\n```python\nimport os\nfrom mcp.server.fastmcp import FastMCP\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nif not API_KEY:\n raise ValueError(\"MY_API_KEY environment variable must be set\")\n\nALLOWED_TOOLS = [t.strip() for t in os.environ.get(\"ALLOWED_TOOLS\", \"\").split(\",\") if t.strip()]\n\nmcp = FastMCP(\"my-custom-server\")\n\n\ndef expose(tool_name):\n \"\"\"Only register the tool if it passes the allowlist filter.\"\"\"\n def decorator(func):\n if not ALLOWED_TOOLS or tool_name in ALLOWED_TOOLS:\n return mcp.tool()(func)\n return func\n return decorator\n\n\n@expose(\"fetch_data\")\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API using the configured API key.\"\"\"\n # Your real implementation here\n return f\"Fetched results for '{query}'\"\n\n\n@expose(\"analyze_data\")\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data and return insights.\"\"\"\n return f\"Analysis complete for: {data}\"\n\n\n@expose(\"dangerous_reset\")\ndef dangerous_reset() -> str:\n \"\"\"Reset all data - restricted tool.\"\"\"\n return \"Data reset complete\"\n\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\n### .gemini/settings.json\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"cwd\": \"/absolute/path/to/my-mcp-project\",\n \"env\": {\n \"MY_API_KEY\": \"sk-your-api-key-here\",\n \"ALLOWED_TOOLS\": \"fetch_data,analyze_data\"\n }\n }\n }\n}\n```\n\nWith this configuration:\n- Gemini CLI launches your Python MCP server as a subprocess\n- The `MY_API_KEY` environment variable is passed securely to the server process\n- Only `fetch_data` and `analyze_data` are registered as tools; `dangerous_reset` is excluded\n- Gemini can discover 
and invoke the allowed tools during conversation\n\n## 6. Testing Your Server\n\nBefore connecting to Gemini CLI, test your server standalone:\n\n```bash\n# Test that the server starts correctly\nMY_API_KEY=\"test-key\" python my_mcp_server.py\n\n# Or with tool restrictions\nMY_API_KEY=\"test-key\" ALLOWED_TOOLS=\"fetch_data\" python my_mcp_server.py\n```\n\nYou can also use the MCP Inspector tool to verify your server works:\n\n```bash\nnpx @modelcontextprotocol/inspector python my_mcp_server.py\n```\n\nThen launch Gemini CLI in your project directory. It will automatically pick up the `.gemini/settings.json` and start your MCP server.\n\n## 7. Troubleshooting\n\n| Issue | Solution |\n|-------|----------|\n| Server not found | Use absolute paths in `command` and `args` |\n| API key not available | Verify the `env` block in settings.json; check with a print/log statement |\n| Tools not showing up | Confirm the tool functions are decorated with `@mcp.tool()` and not filtered out |\n| Permission errors | Ensure the Python executable and script have correct permissions |\n| Server crashes on start | Run the server manually from the terminal first to see error output |\n| Wrong Python version | Use the full path to the correct Python interpreter in `command` |\n"}], "grading": {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "without_skill", "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "Mentions both `~/.gemini/settings.json` and `.gemini/settings.json` in project root"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Shows mcpServers config with command, args, cwd, env fields"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": false, "evidence": "Hardcodes API key directly in env (`\"MY_API_KEY\": \"sk-abc123your-actual-key\"`). 
Uses `${MY_API_KEY}` syntax but adds caveat that it might not be supported. Does not show the correct `$VAR` pattern."}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": false, "evidence": "Does not mention includeTools or excludeTools. Instead suggests server-side tool filtering and uses non-existent `toolFilter` field with `allow` sub-field."}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": false, "evidence": "Does not mention the trust field. Actually recommends hardcoding as Option A. Does not mention env var redaction behavior."}, {"text": "Uses correct Python command (python/python3) in the command field", "passed": true, "evidence": "Uses `python` in the command field correctly"}]}}, {"id": "eval-3-custom-slash-command-with_skill", "prompt": "How do I create a custom slash command in gemini cli that runs git diff --cached and asks the AI to write a commit message? I want it available in all my projects.", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Creating a Global Custom Slash Command for AI Commit Messages in Gemini CLI\n\nYou can create a custom slash command by writing a `.toml` file in the global commands directory. 
Since you want it available in all your projects, place it under `~/.gemini/commands/`.\n\n## Step 1: Create the Commands Directory\n\n```bash\nmkdir -p ~/.gemini/commands\n```\n\n## Step 2: Create the Slash Command TOML File\n\nCreate the file `~/.gemini/commands/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff --cached}\n\"\"\"\n```\n\n### How This Works\n\n- The `prompt` field defines the instruction sent to Gemini when the command is invoked.\n- `!{git diff --cached}` is a **shell execution block** -- Gemini CLI runs the shell command inside `!{...}` and inlines its output into the prompt before sending it to the model.\n- `description` is optional but shows up when you run `/help` in an interactive session.\n\n## Step 3: Use the Command\n\nStart Gemini CLI in any project that has staged git changes:\n\n```bash\ncd your-project\ngit add -A # or stage specific files\ngemini\n```\n\nThen in the interactive session, type:\n\n```\n/commit\n```\n\nGemini will execute `git diff --cached`, include the diff in the prompt, and return a conventional commit message.\n\n## Namespaced Alternative\n\nIf you prefer a namespaced command like `/git:commit`, use a subdirectory:\n\n```bash\nmkdir -p ~/.gemini/commands/git\n```\n\nCreate `~/.gemini/commands/git/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff 
--cached}\n\"\"\"\n```\n\nThen invoke it with:\n\n```\n/git:commit\n```\n\n## Using It with Arguments\n\nYou can also make the command accept arguments with `{{args}}` to allow the user to pass additional context:\n\n```toml\ndescription = \"Generate a commit message from staged changes with optional context\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nAdditional context from the user: {{args}}\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff --cached}\n\"\"\"\n```\n\nThis lets you provide extra hints:\n\n```\n/commit focus on the auth refactoring\n```\n\n## Fully Automated Shell Script (Bonus)\n\nIf you want to go beyond interactive mode and automate the entire commit flow from your shell, you can create a shell script. Note that custom slash commands currently do **not** work in headless mode (`-p` flag), so for automation you need to embed the prompt directly:\n\n```bash\n#!/bin/bash\n# Save as ~/bin/ai-commit or anywhere on your PATH\n\nDIFF=$(git diff --cached)\n\nif [ -z \"$DIFF\" ]; then\n echo \"Error: Nothing staged. Run 'git add' first.\"\n exit 1\nfi\n\nMSG=$(echo \"$DIFF\" | gemini -p \"Write a concise conventional commit message (type(scope): description format, max 72 chars). Only output the message, nothing else.\" --output-format json | jq -r '.response')\n\nif [ -z \"$MSG\" ]; then\n echo \"Error: Failed to generate commit message.\"\n exit 1\nfi\n\necho \"Proposed commit message:\"\necho \" $MSG\"\necho \"\"\nread -p \"Commit with this message? 
[y/N] \" confirm\n\nif [[ \"$confirm\" =~ ^[Yy]$ ]]; then\n git commit -m \"$MSG\"\n echo \"Committed.\"\nelse\n echo \"Aborted.\"\nfi\n```\n\nMake it executable:\n\n```bash\nchmod +x ~/bin/ai-commit\n```\n\nThen use it from any project:\n\n```bash\ngit add -A\nai-commit\n```\n\n## Summary of File Locations\n\n| Path | Scope | Invocation |\n|---|---|---|\n| `~/.gemini/commands/commit.toml` | Global (all projects) | `/commit` |\n| `~/.gemini/commands/git/commit.toml` | Global, namespaced | `/git:commit` |\n| `<project>/.gemini/commands/commit.toml` | Project only | `/commit` |\n\nThe global location (`~/.gemini/commands/`) is what makes the command available across all your projects without any per-project setup.\n"}], "grading": {"eval_id": 3, "eval_name": "custom-slash-command", "configuration": "with_skill", "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": true, "evidence": "Correctly points to `~/.gemini/commands/commit.toml` for global scope"}, {"text": "Shows .toml file format with prompt field", "passed": true, "evidence": "Full .toml example with description and prompt fields"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": true, "evidence": "Correctly uses `!{git diff --cached}` in the prompt field"}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": true, "evidence": "Explicitly warns: 'custom slash commands currently do not work in headless mode (`-p` flag)' and provides shell script alternative"}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": true, "evidence": "Multiple complete .toml examples including basic, namespaced, and with-arguments variants"}]}}, {"id": "eval-3-custom-slash-command-without_skill", "prompt": "How do I create a custom slash command in gemini cli that runs git diff --cached and asks the AI to write a commit message? 
I want it available in all my projects.", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Creating a Custom Slash Command in Gemini CLI for Commit Messages\n\n## Overview\n\nGemini CLI supports custom slash commands through its extensions system. To make a command available across all your projects, you place it in the global configuration directory at `~/.gemini/extensions/`. Extensions are defined using `gemini_extension.json` manifest files that describe the command and its behavior.\n\n## Step-by-Step Setup\n\n### 1. Create the Global Extensions Directory\n\n```bash\nmkdir -p ~/.gemini/extensions/commit-message\n```\n\n### 2. Create the Shell Script\n\nCreate `~/.gemini/extensions/commit-message/generate-commit-msg.sh`:\n\n```bash\n#!/bin/bash\n# generate-commit-msg.sh\n# Gets the staged diff and outputs it for the AI to process\n\nDIFF=$(git diff --cached)\n\nif [ -z \"$DIFF\" ]; then\n echo \"ERROR: No staged changes found. Stage your changes with 'git add' first.\"\n exit 1\nfi\n\necho \"Here is the staged git diff:\"\necho \"\"\necho '```diff'\necho \"$DIFF\"\necho '```'\necho \"\"\necho \"Please write a concise, conventional commit message for these changes.\"\necho \"Follow the Conventional Commits format: type(scope): description\"\necho \"\"\necho \"Types: feat, fix, docs, style, refactor, perf, test, build, ci, chore\"\necho \"\"\necho \"Provide:\"\necho \"1. A single-line subject (max 72 chars)\"\necho \"2. A blank line\"\necho \"3. An optional body explaining the 'why' behind the changes\"\n```\n\nMake it executable:\n\n```bash\nchmod +x ~/.gemini/extensions/commit-message/generate-commit-msg.sh\n```\n\n### 3. 
Create the Extension Manifest\n\nCreate `~/.gemini/extensions/commit-message/gemini_extension.json`:\n\n```json\n{\n \"name\": \"commit-message\",\n \"description\": \"Generate a commit message from staged changes using git diff --cached\",\n \"version\": \"1.0.0\",\n \"commands\": [\n {\n \"name\": \"commit-msg\",\n \"description\": \"Analyze staged git changes and generate a conventional commit message\",\n \"action\": {\n \"type\": \"shell\",\n \"command\": \"~/.gemini/extensions/commit-message/generate-commit-msg.sh\"\n }\n }\n ]\n}\n```\n\n### 4. Usage\n\nOnce configured, open Gemini CLI in any project and type:\n\n```\n/commit-msg\n```\n\nThe command will:\n1. Run `git diff --cached` to capture your staged changes\n2. Pass the diff output into the conversation context\n3. Ask Gemini to generate a conventional commit message based on the diff\n\n## Alternative Approach: Using GEMINI.md with a Prompt-Based Workflow\n\nIf extensions are not behaving as expected in your version of Gemini CLI, you can achieve a similar result using a global `GEMINI.md` instruction file combined with a simple shell alias.\n\n### 1. Create a Global GEMINI.md\n\nCreate or edit `~/.gemini/GEMINI.md`:\n\n```markdown\n# Custom Commands\n\n## Commit Message Generation\n\nWhen I say \"generate commit message\" or provide a git diff, follow these rules:\n\n1. Analyze the diff carefully\n2. Write a commit message in Conventional Commits format\n3. Use the format: type(scope): short description\n4. Include a body if the changes are complex\n5. Keep the subject line under 72 characters\n6. Use imperative mood (\"add feature\" not \"added feature\")\n```\n\n### 2. Create a Shell Helper Script\n\nAdd to your `~/.bashrc` or `~/.zshrc`:\n\n```bash\n# Gemini CLI commit message helper\ngcm() {\n local diff\n diff=$(git diff --cached)\n\n if [ -z \"$diff\" ]; then\n echo \"No staged changes. 
Use 'git add' to stage files first.\"\n return 1\n fi\n\n echo \"Staged diff:\"\n echo '```diff'\n echo \"$diff\"\n echo '```'\n echo \"\"\n echo \"Generate a conventional commit message for the above changes.\"\n}\n```\n\nThen in Gemini CLI, you can run:\n\n```\n@run gcm\n```\n\nOr pipe the output directly:\n\n```bash\n# From your regular terminal, pipe into gemini\ngcm | gemini\n```\n\n## Alternative Approach: Inline Tool Use\n\nGemini CLI allows you to invoke shell commands inline. You can simply type within a Gemini CLI session:\n\n```\nPlease run `git diff --cached` and write me a commit message based on the output.\n```\n\nSince Gemini CLI has the ability to execute shell commands (with your approval), it will run the diff, read the output, and generate a commit message. This requires no configuration at all, but it is a manual process each time rather than a single slash command.\n\n## Applying the Generated Commit Message\n\nOnce Gemini generates a commit message you like, you can apply it directly:\n\n```bash\ngit commit -m \"feat(auth): add OAuth2 token refresh logic\n\nImplement automatic token refresh when access tokens expire.\nUses refresh token rotation for improved security.\"\n```\n\nOr ask Gemini CLI to run the commit for you:\n\n```\nPlease commit with that message.\n```\n\n## Directory Structure Summary\n\n```\n~/.gemini/\n GEMINI.md # Global instructions (optional)\n extensions/\n commit-message/\n gemini_extension.json # Extension manifest\n generate-commit-msg.sh # Shell script for the command\n```\n\n## Tips\n\n- **Scope control**: Placing extensions in `~/.gemini/extensions/` makes them global. 
For project-specific commands, place them in `.gemini/extensions/` within the project root.\n- **Customizing the prompt**: Edit the echo statements at the end of `generate-commit-msg.sh` to adjust the instructions given to the AI (e.g., require a specific commit style, add ticket number references, etc.).\n- **Large diffs**: If the staged diff is very large, consider truncating it or summarizing file-level changes to stay within context limits. You can modify the script to include `--stat` output alongside a truncated diff.\n- **Security**: The shell script only reads data (via `git diff --cached`). It does not modify your repository or make commits on its own.\n\n## Troubleshooting\n\n- **\"Command not found\"**: Make sure the shell script is executable (`chmod +x`) and the path in `gemini_extension.json` is correct.\n- **\"No staged changes\"**: Run `git add <files>` before invoking the command.\n- **Extension not loading**: Verify the JSON manifest is valid (`cat ~/.gemini/extensions/commit-message/gemini_extension.json | python3 -m json.tool`) and restart Gemini CLI.\n- **Path issues on Windows**: Use forward slashes or the `%USERPROFILE%` equivalent in paths. The extensions directory would be at `%USERPROFILE%\\.gemini\\extensions\\`.\n"}], "grading": {"eval_id": 3, "eval_name": "custom-slash-command", "configuration": "without_skill", "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": false, "evidence": "Points to `~/.gemini/extensions/commit-message/` instead. Does not know about the commands/ directory or .toml approach."}, {"text": "Shows .toml file format with prompt field", "passed": false, "evidence": "Uses gemini_extension.json manifest with commands array instead of .toml format"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": false, "evidence": "Creates a separate shell script to execute git diff. 
Does not know about the !{...} inline shell execution syntax."}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": false, "evidence": "No mention of headless mode limitation for slash commands"}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": false, "evidence": "Provides a complete extension example, but it uses the wrong mechanism entirely (extensions vs custom commands)"}]}}], "previous_feedback": {}, "previous_outputs": {}, "benchmark": {"skill_name": "gemini-cli", "iteration": 1, "configurations": [{"name": "with_skill", "evals": [{"eval_name": "cicd-setup", "pass_rate": 1.0, "passed": 6, "total": 6, "tokens": 73417, "duration_seconds": 92.4}, {"eval_name": "mcp-server-config", "pass_rate": 1.0, "passed": 6, "total": 6, "tokens": 67196, "duration_seconds": 64.1}, {"eval_name": "custom-slash-command", "pass_rate": 1.0, "passed": 5, "total": 5, "tokens": 70530, "duration_seconds": 49.5}], "aggregate": {"mean_pass_rate": 1.0, "mean_tokens": 70381, "mean_duration_seconds": 68.7}}, {"name": "without_skill", "evals": [{"eval_name": "cicd-setup", "pass_rate": 0.33, "passed": 2, "total": 6, "tokens": 60035, "duration_seconds": 77.9}, {"eval_name": "mcp-server-config", "pass_rate": 0.33, "passed": 2, "total": 6, "tokens": 59258, "duration_seconds": 65.4}, {"eval_name": "custom-slash-command", "pass_rate": 0.0, "passed": 0, "total": 5, "tokens": 57648, "duration_seconds": 56.7}], "aggregate": {"mean_pass_rate": 0.22, "mean_tokens": 58980, "mean_duration_seconds": 66.7}}], "delta": {"pass_rate": "+0.78", "tokens": "+11401 (19% more)", "duration": "+2.0s (3% more)"}, "analyst_observations": ["The skill provides a massive accuracy improvement (100% vs 22% pass rate) with only marginal cost increase (~19% more tokens, ~3% more time).", "Eval 3 (custom slash command) shows the largest gap: the baseline gets the entire mechanism wrong (uses extensions instead of .toml commands), scoring 0/5.", "Eval 
1 (CI/CD) baseline hallucinates non-existent flags (--api-key, --json) and even the wrong npm package name (@anthropic-ai/gemini-cli).", "Eval 2 (MCP config) baseline doesn't know about includeTools/excludeTools and recommends hardcoding API keys, which is a security anti-pattern.", "All assertions are discriminating - none pass for both configurations, confirming the skill provides genuine value over baseline knowledge."]}};
|
|
650
|
-
|
|
651
|
-
// ---- State ----
|
|
652
|
-
// Draft feedback typed by the reviewer, keyed by run id (run_id -> feedback text).
const feedbackMap = {};
// Zero-based index into EMBEDDED_DATA.runs of the run currently on screen.
let currentIndex = 0;
// Indices of the runs that have been displayed at least once.
const visitedRuns = new Set();
|
|
655
|
-
|
|
656
|
-
// ---- Init ----
|
|
657
|
-
/**
 * Boots the review view: optionally restores saved feedback, shows the first
 * run and wires up debounced auto-save on the feedback textarea.
 *
 * Saved feedback is only fetched when the embedded data carries neither
 * previous feedback nor previous outputs. When either is present, the
 * feedback.json on disk belongs to the prior iteration and must not pre-fill
 * the textarea.
 *
 * @returns {Promise<void>}
 */
async function init() {
  const priorFeedback = EMBEDDED_DATA.previous_feedback || {};
  const priorOutputs = EMBEDDED_DATA.previous_outputs || {};
  const isFollowUpIteration =
    Object.keys(priorFeedback).length > 0 || Object.keys(priorOutputs).length > 0;

  if (!isFollowUpIteration) {
    try {
      const response = await fetch("/api/feedback");
      const saved = await response.json();
      if (saved.reviews) {
        for (const review of saved.reviews) {
          feedbackMap[review.run_id] = review.feedback;
        }
      }
    } catch {
      // Best effort: first run, no feedback yet.
    }
  }

  document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
  showRun(0);

  // Auto-save 800 ms after the reviewer stops typing.
  const feedbackBox = document.getElementById("feedback");
  let pendingSave = null;
  const onInput = () => {
    clearTimeout(pendingSave);
    document.getElementById("feedback-status").textContent = "";
    pendingSave = setTimeout(() => saveCurrentFeedback(), 800);
  };
  feedbackBox.addEventListener("input", onInput);
}
|
|
686
|
-
|
|
687
|
-
// ---- Navigation ----
|
|
688
|
-
/**
 * Moves the view by `delta` runs, saving the current feedback first.
 * Does nothing when the target index falls outside the list of runs.
 *
 * @param {number} delta - Offset relative to the current run (e.g. -1 or 1).
 */
function navigate(delta) {
  const target = currentIndex + delta;
  const inRange = target >= 0 && target < EMBEDDED_DATA.runs.length;
  if (!inRange) return;

  saveCurrentFeedback();
  showRun(target);
}
|
|
695
|
-
|
|
696
|
-
/**
 * Disables the "previous" button on the first run and the "next" button on
 * the last run; enables them otherwise.
 */
function updateNavButtons() {
  const prevButton = document.getElementById("prev-btn");
  prevButton.disabled = currentIndex === 0;

  const nextButton = document.getElementById("next-btn");
  const lastIndex = EMBEDDED_DATA.runs.length - 1;
  nextButton.disabled = currentIndex === lastIndex;
}
|
|
701
|
-
|
|
702
|
-
// ---- Show a run ----
|
|
703
|
-
/**
 * Displays the run at `index`: progress text, prompt, configuration badge,
 * outputs, previous outputs, grades, previous feedback and the saved draft
 * feedback. Also records the run as visited and scrolls the main pane to top.
 *
 * @param {number} index - Zero-based index into EMBEDDED_DATA.runs.
 */
function showRun(index) {
  currentIndex = index;
  const allRuns = EMBEDDED_DATA.runs;
  const run = allRuns[index];
  const byId = (id) => document.getElementById(id);

  byId("progress").textContent = `${index + 1} of ${allRuns.length}`;
  byId("prompt-text").textContent = run.prompt;

  // The badge is derived from a known configuration name inside the run id.
  const badge = byId("config-badge");
  const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
  if (!configMatch) {
    badge.style.display = "none";
  } else {
    const [, config] = configMatch;
    const baselineConfigs = ["without_skill", "old_skill"];
    const variant = baselineConfigs.includes(config) ? "config-baseline" : "config-primary";
    badge.textContent = config.replace(/_/g, " ");
    badge.className = "config-badge " + variant;
    badge.style.display = "inline-block";
  }

  renderOutputs(run);
  renderPrevOutputs(run);
  renderGrades(run);

  // Feedback from the previous iteration, shown only when it exists.
  const priorFeedback = (EMBEDDED_DATA.previous_feedback || {})[run.id];
  const priorPanel = byId("prev-feedback");
  if (priorFeedback) {
    byId("prev-feedback-text").textContent = priorFeedback;
    priorPanel.style.display = "block";
  } else {
    priorPanel.style.display = "none";
  }

  // Restore the draft for this run and clear the save indicator.
  byId("feedback").value = feedbackMap[run.id] || "";
  byId("feedback-status").textContent = "";

  updateNavButtons();

  // Promote the done button once every run has been seen.
  visitedRuns.add(index);
  const doneBtn = byId("done-btn");
  if (visitedRuns.size >= allRuns.length) {
    doneBtn.classList.add("ready");
  }

  document.querySelector(".main").scrollTop = 0;
}
|
|
762
|
-
|
|
763
|
-
// ---- Render outputs ----
|
|
764
|
-
/**
 * Rebuilds the "outputs-body" container with one card per output file of the
 * run. Every card has a header (file name plus download link) and a preview
 * whose form depends on `file.type`: text, image, pdf, xlsx, binary or error.
 * Unknown types get an empty preview area.
 *
 * @param {{outputs?: Array<object>}} run - Run whose output files are shown.
 */
function renderOutputs(run) {
  const container = document.getElementById("outputs-body");
  container.innerHTML = "";

  const files = run.outputs || [];
  if (files.length === 0) {
    container.innerHTML = '<div class="empty-state">No output files</div>';
    return;
  }

  // Fills `preview` with the element(s) appropriate for the file type.
  const fillPreview = (preview, file) => {
    switch (file.type) {
      case "text": {
        const pre = document.createElement("pre");
        pre.textContent = file.content;
        preview.appendChild(pre);
        break;
      }
      case "image": {
        const img = document.createElement("img");
        img.src = file.data_uri;
        img.alt = file.name;
        preview.appendChild(img);
        break;
      }
      case "pdf": {
        const frame = document.createElement("iframe");
        frame.src = file.data_uri;
        preview.appendChild(frame);
        break;
      }
      case "xlsx":
        renderXlsx(preview, file.data_b64);
        break;
      case "binary": {
        const link = document.createElement("a");
        link.className = "download-link";
        link.href = file.data_uri;
        link.download = file.name;
        link.textContent = "Download " + file.name;
        preview.appendChild(link);
        break;
      }
      case "error": {
        const pre = document.createElement("pre");
        pre.textContent = file.content;
        pre.style.color = "var(--red)";
        preview.appendChild(pre);
        break;
      }
      default:
        break;
    }
  };

  for (const file of files) {
    const card = document.createElement("div");
    card.className = "output-file";

    // The header with a download link is shown for every file type.
    const header = document.createElement("div");
    header.className = "output-file-header";

    const label = document.createElement("span");
    label.textContent = file.name;
    header.appendChild(label);

    const download = document.createElement("a");
    download.className = "dl-btn";
    download.textContent = "Download";
    download.download = file.name;
    download.href = getDownloadUri(file);
    header.appendChild(download);

    card.appendChild(header);

    const preview = document.createElement("div");
    preview.className = "output-file-content";
    fillPreview(preview, file);

    card.appendChild(preview);
    container.appendChild(card);
  }
}
|
|
828
|
-
|
|
829
|
-
// ---- XLSX rendering via SheetJS ----
|
|
830
|
-
/**
 * Renders a base64-encoded spreadsheet into `container` using the global
 * SheetJS `XLSX` object. Each sheet becomes an HTML table; when the workbook
 * has more than one sheet, every table is preceded by a "Sheet: <name>" label.
 * On any failure the container's text is replaced by an error message.
 *
 * @param {HTMLElement} container - Element that receives the rendered sheets.
 * @param {string} b64Data - Workbook bytes, base64-encoded.
 */
function renderXlsx(container, b64Data) {
  try {
    // Decode base64 into raw bytes for XLSX.read.
    const binary = atob(b64Data);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i += 1) {
      bytes[i] = binary.charCodeAt(i);
    }
    const workbook = XLSX.read(bytes, { type: "array" });

    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];

      if (workbook.SheetNames.length > 1) {
        const label = document.createElement("div");
        label.style.cssText =
          "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
        label.textContent = "Sheet: " + sheetName;
        container.appendChild(label);
      }

      const tableHtml = XLSX.utils.sheet_to_html(sheet, { editable: false });
      const holder = document.createElement("div");
      holder.innerHTML = tableHtml;
      container.appendChild(holder);
    }
  } catch (err) {
    container.textContent = "Error rendering spreadsheet: " + err.message;
  }
}
|
|
856
|
-
|
|
857
|
-
// ---- Grades ----
|
|
858
|
-
/**
 * Renders the grading summary and assertion list for a run into the
 * collapsible grades section, which is reset to its collapsed state. The
 * section is hidden when the run has no grading data.
 *
 * When the summary carries no pass rate, the badge shows "?" with neutral
 * styling instead of the red failure style. The summary counts are
 * HTML-escaped like the rest of the interpolated data.
 *
 * @param {{grading?: {summary?: object, expectations?: Array<object>}}} run
 */
function renderGrades(run) {
  const section = document.getElementById("grades-section");
  const content = document.getElementById("grades-content");

  if (!run.grading) {
    section.style.display = "none";
    return;
  }

  const grading = run.grading;
  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("grades-arrow").classList.remove("open");

  const summary = grading.summary || {};
  const expectations = grading.expectations || [];

  let html = '<div style="padding: 1rem;">';

  // Summary line
  const hasPassRate = summary.pass_rate != null;
  const passRate = hasPassRate ? Math.round(summary.pass_rate * 100) + "%" : "?";
  let badgeClass = "";
  if (hasPassRate) {
    // >= 80% is a pass, 50-80% is neutral, anything else is a fail.
    badgeClass = summary.pass_rate >= 0.8 ? "grade-pass" : summary.pass_rate >= 0.5 ? "" : "grade-fail";
  }
  html += '<div class="grades-summary">';
  html += '<span class="grade-badge ' + badgeClass + '">' + passRate + '</span>';
  html += '<span>' + escapeHtml(String(summary.passed || 0)) + ' passed, '
    + escapeHtml(String(summary.failed || 0)) + ' failed of '
    + escapeHtml(String(summary.total || 0)) + '</span>';
  html += '</div>';

  // Assertions list
  html += '<ul class="assertion-list">';
  for (const exp of expectations) {
    const statusClass = exp.passed ? "pass" : "fail";
    const statusIcon = exp.passed ? "\u2713" : "\u2717";
    html += '<li class="assertion-item">';
    html += '<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>';
    html += '<span>' + escapeHtml(exp.text) + '</span>';
    if (exp.evidence) {
      html += '<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>';
    }
    html += '</li>';
  }
  html += '</ul>';

  html += '</div>';
  content.innerHTML = html;
}
|
|
906
|
-
|
|
907
|
-
/** Expands or collapses the grades section together with its arrow. */
function toggleGrades() {
  for (const id of ["grades-content", "grades-arrow"]) {
    document.getElementById(id).classList.toggle("open");
  }
}
|
|
913
|
-
|
|
914
|
-
// ---- Previous outputs (collapsible) ----
|
|
915
|
-
/**
 * Renders the previous iteration's output files for a run into the
 * collapsible "previous outputs" section, which is reset to its collapsed
 * state. The section is hidden when no previous outputs exist for the run.
 *
 * Files of type "error" are shown as red preformatted text, matching how the
 * current outputs are rendered, instead of an empty preview box.
 *
 * @param {{id: string}} run - Run whose previous outputs are looked up by id.
 */
function renderPrevOutputs(run) {
  const section = document.getElementById("prev-outputs-section");
  const content = document.getElementById("prev-outputs-content");
  const prevOutputs = (EMBEDDED_DATA.previous_outputs || {})[run.id];

  if (!prevOutputs || prevOutputs.length === 0) {
    section.style.display = "none";
    return;
  }

  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("prev-outputs-arrow").classList.remove("open");

  // Render the files into the content area
  content.innerHTML = "";
  const wrapper = document.createElement("div");
  wrapper.style.padding = "1rem";

  for (const file of prevOutputs) {
    const fileDiv = document.createElement("div");
    fileDiv.className = "output-file";

    // Header: file name plus download link, shown for every file type.
    const header = document.createElement("div");
    header.className = "output-file-header";
    const nameSpan = document.createElement("span");
    nameSpan.textContent = file.name;
    header.appendChild(nameSpan);
    const dlBtn = document.createElement("a");
    dlBtn.className = "dl-btn";
    dlBtn.textContent = "Download";
    dlBtn.download = file.name;
    dlBtn.href = getDownloadUri(file);
    header.appendChild(dlBtn);
    fileDiv.appendChild(header);

    const fc = document.createElement("div");
    fc.className = "output-file-content";

    if (file.type === "text") {
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      fc.appendChild(pre);
    } else if (file.type === "image") {
      const img = document.createElement("img");
      img.src = file.data_uri;
      img.alt = file.name;
      fc.appendChild(img);
    } else if (file.type === "pdf") {
      const iframe = document.createElement("iframe");
      iframe.src = file.data_uri;
      fc.appendChild(iframe);
    } else if (file.type === "xlsx") {
      renderXlsx(fc, file.data_b64);
    } else if (file.type === "binary") {
      const a = document.createElement("a");
      a.className = "download-link";
      a.href = file.data_uri;
      a.download = file.name;
      a.textContent = "Download " + file.name;
      fc.appendChild(a);
    } else if (file.type === "error") {
      // Same presentation as error files among the current outputs.
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      pre.style.color = "var(--red)";
      fc.appendChild(pre);
    }

    fileDiv.appendChild(fc);
    wrapper.appendChild(fileDiv);
  }

  content.appendChild(wrapper);
}
|
|
985
|
-
|
|
986
|
-
/** Expands or collapses the previous-outputs section together with its arrow. */
function togglePrevOutputs() {
  for (const id of ["prev-outputs-content", "prev-outputs-arrow"]) {
    document.getElementById(id).classList.toggle("open");
  }
}
|
|
992
|
-
|
|
993
|
-
// ---- Feedback (saved to server -> feedback.json) ----
|
|
994
|
-
/**
 * Stores the textarea content for the current run in `feedbackMap` (blank
 * text removes the entry) and POSTs all non-blank feedback to /api/feedback
 * with status "in_progress".
 *
 * The status label reads "Saved" only when the server answers with a 2xx
 * status. `fetch` resolves for HTTP error responses too, so a non-OK
 * response is treated like an unreachable server and the label falls back to
 * "Will download on submit".
 */
function saveCurrentFeedback() {
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;

  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // Build reviews array from map
  const reviews = [];
  for (const [run_id, feedback] of Object.entries(feedbackMap)) {
    if (feedback.trim()) {
      reviews.push({ run_id, feedback, timestamp: new Date().toISOString() });
    }
  }

  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ reviews, status: "in_progress" }),
  }).then((resp) => {
    // An HTTP error status does not reject the promise; check it explicitly.
    if (!resp.ok) {
      throw new Error("Feedback save failed with HTTP " + resp.status);
    }
    document.getElementById("feedback-status").textContent = "Saved";
  }).catch(() => {
    // Static mode or server unavailable — no-op on auto-save,
    // feedback will be downloaded on final submit
    document.getElementById("feedback-status").textContent = "Will download on submit";
  });
}
|
|
1024
|
-
|
|
1025
|
-
// ---- Done ----
|
|
1026
|
-
/**
 * Finalises the review: stores the current textarea content, then POSTs one
 * entry per run (blank feedback included) with status "complete" and shows
 * the done overlay.
 *
 * If the server cannot be reached, or answers with a non-2xx status, the
 * payload is downloaded as feedback.json instead so the feedback is not
 * lost. `fetch` resolves for HTTP error responses, so the status is checked
 * explicitly.
 */
function showDoneDialog() {
  // Save current textarea to feedbackMap (but don't POST yet)
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;
  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // POST once with status: complete — include ALL runs so the model
  // can distinguish "no feedback" (looks good) from "not reviewed"
  const reviews = [];
  const ts = new Date().toISOString();
  for (const r of EMBEDDED_DATA.runs) {
    reviews.push({ run_id: r.id, feedback: feedbackMap[r.id] || "", timestamp: ts });
  }
  const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);
  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  }).then((resp) => {
    // Route HTTP error responses to the download fallback below.
    if (!resp.ok) {
      throw new Error("Feedback submit failed with HTTP " + resp.status);
    }
    document.getElementById("done-overlay").classList.add("visible");
  }).catch(() => {
    // Server not available (static mode) or it rejected the request —
    // download as file
    const blob = new Blob([payload], { type: "application/json" });
    const url = URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.href = url;
    a.download = "feedback.json";
    a.click();
    URL.revokeObjectURL(url);
    document.getElementById("done-overlay").classList.add("visible");
  });
}
|
|
1062
|
-
|
|
1063
|
-
/**
 * Hides the done overlay. Feedback is re-saved first, which resets the
 * stored status back to "in_progress".
 */
function closeDoneDialog() {
  saveCurrentFeedback();
  const overlay = document.getElementById("done-overlay");
  overlay.classList.remove("visible");
}
|
|
1068
|
-
|
|
1069
|
-
// ---- Toast ----
|
|
1070
|
-
/**
 * Shows a transient toast with the given message and hides it again after
 * two seconds.
 *
 * @param {string} message - Text to display.
 */
function showToast(message) {
  const toastEl = document.getElementById("toast");
  const hide = () => toastEl.classList.remove("visible");

  toastEl.textContent = message;
  toastEl.classList.add("visible");
  setTimeout(hide, 2000);
}
|
|
1076
|
-
|
|
1077
|
-
// ---- Keyboard nav ----
|
|
1078
|
-
// Arrow keys move between runs: Left/Up goes back, Right/Down goes forward.
document.addEventListener("keydown", (event) => {
  // Leave keystrokes alone while the reviewer is typing in a textarea.
  if (event.target.tagName === "TEXTAREA") return;

  const stepByKey = new Map([
    ["ArrowLeft", -1],
    ["ArrowUp", -1],
    ["ArrowRight", 1],
    ["ArrowDown", 1],
  ]);
  const step = stepByKey.get(event.key);
  if (step === undefined) return;

  event.preventDefault();
  navigate(step);
});
|
|
1090
|
-
|
|
1091
|
-
// ---- Util ----
|
|
1092
|
-
/**
 * Picks the href used by a file's download link, in order of preference:
 * the file's own data URI, a base64 octet-stream data URI, a percent-encoded
 * plain-text data URI for text files, and "#" when none of those apply.
 *
 * @param {{data_uri?: string, data_b64?: string, type?: string, content?: string}} file
 * @returns {string} URI suitable for an anchor's `href`.
 */
function getDownloadUri(file) {
  let uri = "#";
  if (file.data_uri) {
    uri = file.data_uri;
  } else if (file.data_b64) {
    uri = `data:application/octet-stream;base64,${file.data_b64}`;
  } else if (file.type === "text") {
    uri = `data:text/plain;charset=utf-8,${encodeURIComponent(file.content)}`;
  }
  return uri;
}
|
|
1098
|
-
|
|
1099
|
-
/**
 * Escapes a value for safe insertion into HTML markup, both as element text
 * and inside quoted attribute values.
 *
 * Double and single quotes are escaped as well as &, < and >, because
 * callers interpolate the result inside attributes such as title="...".
 * No-break spaces are emitted as &nbsp;. null and undefined yield an empty
 * string; other values are converted with String().
 *
 * @param {*} text - Value to escape.
 * @returns {string} HTML-safe representation of the value.
 */
function escapeHtml(text) {
  if (text == null) return "";
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
    "\u00A0": "&nbsp;",
  };
  return String(text).replace(/[&<>"'\u00A0]/g, (ch) => entities[ch]);
}
|
|
1104
|
-
|
|
1105
|
-
// ---- View switching ----
|
|
1106
|
-
/**
 * Activates the tab and panel for `view` and deactivates all others. The tab
 * is located through its inline onclick attribute; the panel through the id
 * "panel-<view>".
 *
 * @param {string} view - View identifier.
 */
function switchView(view) {
  for (const tab of document.querySelectorAll(".view-tab")) {
    tab.classList.remove("active");
  }
  for (const panel of document.querySelectorAll(".view-panel")) {
    panel.classList.remove("active");
  }

  const tabSelector = `[onclick="switchView('${view}')"]`;
  const activeTab = document.querySelector(tabSelector);
  activeTab.classList.add("active");

  const activePanel = document.getElementById("panel-" + view);
  activePanel.classList.add("active");
}
|
|
1112
|
-
|
|
1113
|
-
// ---- Benchmark rendering ----
|
|
1114
|
-
/**
 * Renders the benchmark tab from EMBEDDED_DATA.benchmark: a header line, a
 * summary table comparing the first two configurations, a per-eval breakdown
 * (one table of runs with average rows, plus a per-assertion table when
 * expectations are present) and the analysis notes. Does nothing when no
 * benchmark data is embedded.
 *
 * Every value taken from the data is HTML-escaped before interpolation,
 * including quotes, so the evidence text inside the title="..." attribute
 * cannot break out of it. Eval ids sort numerically when they are numbers
 * and by natural string order otherwise.
 */
function renderBenchmark() {
  const data = EMBEDDED_DATA.benchmark;
  if (!data) return;

  // Show the tabs
  document.getElementById("view-tabs").style.display = "flex";

  const container = document.getElementById("benchmark-content");
  const summary = data.run_summary || {};
  const metadata = data.metadata || {};
  const notes = data.notes || [];

  // Escape for text and attribute context. The quote replacements are a
  // no-op when escapeHtml has already escaped them.
  const esc = (value) =>
    escapeHtml(value).replace(/"/g, "&quot;").replace(/'/g, "&#39;");

  // "with_skill" -> "With Skill"
  const toLabel = (name) =>
    String(name).replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());

  // Formats a {mean, stddev} stat as "mean ± stddev" (percentages when pct).
  function fmtStat(stat, pct) {
    if (!stat) return "—";
    const suffix = pct ? "%" : "";
    const m = pct ? (stat.mean * 100).toFixed(0) : stat.mean.toFixed(1);
    const s = pct ? (stat.stddev * 100).toFixed(0) : stat.stddev.toFixed(1);
    return m + suffix + " ± " + s + suffix;
  }

  // CSS class for a delta cell, based on the sign of the parsed value.
  function deltaClass(val) {
    if (!val) return "";
    const n = parseFloat(val);
    if (n > 0) return "benchmark-delta-positive";
    if (n < 0) return "benchmark-delta-negative";
    return "";
  }

  // Numeric ids keep numeric order; anything else uses natural string order.
  const compareEvalIds = (a, b) =>
    typeof a === "number" && typeof b === "number"
      ? a - b
      : String(a).localeCompare(String(b), undefined, { numeric: true });

  let html = "";

  // Header
  html += "<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>";
  html += "<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>";
  if (metadata.skill_name) html += "<strong>" + esc(metadata.skill_name) + "</strong> — ";
  if (metadata.timestamp) html += esc(metadata.timestamp) + " — ";
  if (metadata.evals_run) html += "Evals: " + esc(metadata.evals_run.join(", ")) + " — ";
  html += esc(metadata.runs_per_configuration || "?") + " runs per configuration";
  html += "</p>";

  // Summary table
  html += '<table class="benchmark-table">';

  // Discover config names dynamically (everything except "delta")
  const configs = Object.keys(summary).filter((k) => k !== "delta");
  const configA = configs[0] || "config_a";
  const configB = configs[1] || "config_b";
  const labelA = toLabel(configA);
  const labelB = toLabel(configB);
  const a = summary[configA] || {};
  const b = summary[configB] || {};
  const delta = summary.delta || {};

  html += "<thead><tr><th>Metric</th><th>" + esc(labelA) + "</th><th>" + esc(labelB) + "</th><th>Delta</th></tr></thead>";
  html += "<tbody>";

  html += "<tr><td><strong>Pass Rate</strong></td>";
  html += "<td>" + fmtStat(a.pass_rate, true) + "</td>";
  html += "<td>" + fmtStat(b.pass_rate, true) + "</td>";
  html += '<td class="' + deltaClass(delta.pass_rate) + '">' + esc(delta.pass_rate || "—") + "</td></tr>";

  // Time (only show row if data exists)
  if (a.time_seconds || b.time_seconds) {
    html += "<tr><td><strong>Time (s)</strong></td>";
    html += "<td>" + fmtStat(a.time_seconds, false) + "</td>";
    html += "<td>" + fmtStat(b.time_seconds, false) + "</td>";
    html += '<td class="' + deltaClass(delta.time_seconds) + '">' + (delta.time_seconds ? esc(delta.time_seconds) + "s" : "—") + "</td></tr>";
  }

  // Tokens (only show row if data exists)
  if (a.tokens || b.tokens) {
    html += "<tr><td><strong>Tokens</strong></td>";
    html += "<td>" + fmtStat(a.tokens, false) + "</td>";
    html += "<td>" + fmtStat(b.tokens, false) + "</td>";
    html += '<td class="' + deltaClass(delta.tokens) + '">' + esc(delta.tokens || "—") + "</td></tr>";
  }

  html += "</tbody></table>";

  // Per-eval breakdown (if runs data available)
  const runs = data.runs || [];
  if (runs.length > 0) {
    const evalIds = [...new Set(runs.map((r) => r.eval_id))].sort(compareEvalIds);

    html += "<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>";

    const hasTime = runs.some((r) => r.result && r.result.time_seconds != null);
    const hasErrors = runs.some((r) => r.result && r.result.errors > 0);

    for (const evalId of evalIds) {
      const evalRuns = runs.filter((r) => r.eval_id === evalId);
      const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;

      html += "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" + esc(evalName) + "</h4>";
      html += '<table class="benchmark-table">';
      html += "<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>";
      if (hasTime) html += "<th>Time (s)</th>";
      if (hasErrors) html += "<th>Crashes During Execution</th>";
      html += "</tr></thead>";
      html += "<tbody>";

      // Group by config and render with average rows
      const configGroups = [...new Set(evalRuns.map((r) => r.configuration))];
      for (let ci = 0; ci < configGroups.length; ci++) {
        const config = configGroups[ci];
        const configRuns = evalRuns.filter((r) => r.configuration === config);
        if (configRuns.length === 0) continue;

        const rowClass = ci === 0 ? "benchmark-row-with" : "benchmark-row-without";
        const configLabel = esc(toLabel(config));

        for (const run of configRuns) {
          const r = run.result || {};
          const prClass = r.pass_rate >= 0.8 ? "benchmark-delta-positive" : r.pass_rate < 0.5 ? "benchmark-delta-negative" : "";
          html += '<tr class="' + rowClass + '">';
          html += "<td>" + configLabel + "</td>";
          html += "<td>" + esc(run.run_number) + "</td>";
          html += '<td class="' + prClass + '">' + ((r.pass_rate || 0) * 100).toFixed(0) + "% (" + esc(r.passed || 0) + "/" + esc(r.total || 0) + ")</td>";
          if (hasTime) html += "<td>" + (r.time_seconds != null ? r.time_seconds.toFixed(1) : "—") + "</td>";
          if (hasErrors) html += "<td>" + esc(r.errors || 0) + "</td>";
          html += "</tr>";
        }

        // Average row
        const rates = configRuns.map((r) => (r.result || {}).pass_rate || 0);
        const avgRate = rates.reduce((sum, rate) => sum + rate, 0) / rates.length;
        const avgPrClass = avgRate >= 0.8 ? "benchmark-delta-positive" : avgRate < 0.5 ? "benchmark-delta-negative" : "";
        html += '<tr class="benchmark-row-avg ' + rowClass + '">';
        html += "<td>" + configLabel + "</td>";
        html += "<td>Avg</td>";
        html += '<td class="' + avgPrClass + '">' + (avgRate * 100).toFixed(0) + "%</td>";
        if (hasTime) {
          const times = configRuns.map((r) => (r.result || {}).time_seconds).filter((t) => t != null);
          html += "<td>" + (times.length ? (times.reduce((sum, t) => sum + t, 0) / times.length).toFixed(1) : "—") + "</td>";
        }
        if (hasErrors) html += "<td></td>";
        html += "</tr>";
      }
      html += "</tbody></table>";

      // Per-assertion detail for this eval
      const runsWithExpectations = {};
      for (const config of configGroups) {
        runsWithExpectations[config] = evalRuns.filter((r) => r.configuration === config && r.expectations && r.expectations.length > 0);
      }
      const hasAnyExpectations = Object.values(runsWithExpectations).some((group) => group.length > 0);
      if (hasAnyExpectations) {
        // Collect all unique assertion texts across all configs
        const allAssertions = [];
        const seen = new Set();
        for (const config of configGroups) {
          for (const run of runsWithExpectations[config]) {
            for (const exp of (run.expectations || [])) {
              if (!seen.has(exp.text)) {
                seen.add(exp.text);
                allAssertions.push(exp.text);
              }
            }
          }
        }

        html += '<table class="benchmark-table" style="margin-top: 0.5rem;">';
        html += "<thead><tr><th>Assertion</th>";
        for (const config of configGroups) {
          html += "<th>" + esc(toLabel(config)) + "</th>";
        }
        html += "</tr></thead><tbody>";

        for (const assertionText of allAssertions) {
          html += "<tr><td>" + esc(assertionText) + "</td>";

          for (const config of configGroups) {
            html += "<td>";
            for (const run of runsWithExpectations[config]) {
              const exp = (run.expectations || []).find((e) => e.text === assertionText);
              if (exp) {
                const cls = exp.passed ? "benchmark-delta-positive" : "benchmark-delta-negative";
                const icon = exp.passed ? "\u2713" : "\u2717";
                html += '<span class="' + cls + '" title="Run ' + esc(run.run_number) + ': ' + esc(exp.evidence || "") + '">' + icon + "</span> ";
              } else {
                html += "— ";
              }
            }
            html += "</td>";
          }
          html += "</tr>";
        }
        html += "</tbody></table>";
      }
    }
  }

  // Notes
  if (notes.length > 0) {
    html += '<div class="benchmark-notes">';
    html += "<h3>Analysis Notes</h3>";
    html += "<ul>";
    for (const note of notes) {
      html += "<li>" + esc(note) + "</li>";
    }
    html += "</ul></div>";
  }

  container.innerHTML = html;
}
|
|
1319
|
-
|
|
1320
|
-
// ---- Start ----
|
|
1321
|
-
// Start the review UI. init() is async, so attach a handler to its promise
// instead of leaving it floating; a start-up failure is then logged with
// context. The benchmark tab is rendered independently of init's outcome.
init().catch((err) => {
  console.error("Review UI failed to initialise:", err);
});
renderBenchmark();
|
|
1323
|
-
</script>
|
|
1324
|
-
</body>
|
|
1325
|
-
</html>
|