@biggora/claude-plugins 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/README.md +11 -4
  2. package/package.json +1 -1
  3. package/registry/registry.json +319 -244
  4. package/specs/coding.md +24 -0
  5. package/specs/pod.md +2 -0
  6. package/src/skills/captcha/README.md +221 -0
  7. package/src/skills/captcha/SKILL.md +355 -0
  8. package/src/skills/captcha/references/captcha-types.md +254 -0
  9. package/src/skills/captcha/references/services.md +172 -0
  10. package/src/skills/captcha/references/stealth.md +238 -0
  11. package/src/skills/captcha/scripts/solve_captcha.py +323 -0
  12. package/src/skills/captcha/scripts/solve_image_grid.py +350 -0
  13. package/src/skills/google-merchant-api/SKILL.md +581 -0
  14. package/src/skills/google-merchant-api/references/accounts.md +247 -0
  15. package/src/skills/google-merchant-api/references/content-api-legacy.md +216 -0
  16. package/src/skills/google-merchant-api/references/datasources.md +233 -0
  17. package/src/skills/google-merchant-api/references/inventories.md +201 -0
  18. package/src/skills/google-merchant-api/references/migration.md +267 -0
  19. package/src/skills/google-merchant-api/references/products.md +316 -0
  20. package/src/skills/google-merchant-api/references/promotions.md +201 -0
  21. package/src/skills/google-merchant-api/references/reports.md +240 -0
  22. package/src/skills/lv-aggregators-api/SKILL.md +113 -0
  23. package/src/skills/lv-aggregators-api/references/integration-guide.md +368 -0
  24. package/src/skills/lv-aggregators-api/references/kurpirkt.md +103 -0
  25. package/src/skills/lv-aggregators-api/references/salidzini.md +122 -0
  26. package/src/skills/tailwindcss-best-practices/SKILL.md +180 -0
  27. package/src/skills/tailwindcss-best-practices/references/best-practices-utility-patterns.md +87 -0
  28. package/src/skills/tailwindcss-best-practices/references/core-installation.md +109 -0
  29. package/src/skills/tailwindcss-best-practices/references/core-preflight.md +200 -0
  30. package/src/skills/tailwindcss-best-practices/references/core-responsive.md +163 -0
  31. package/src/skills/tailwindcss-best-practices/references/core-source-detection.md +114 -0
  32. package/src/skills/tailwindcss-best-practices/references/core-theme.md +108 -0
  33. package/src/skills/tailwindcss-best-practices/references/core-utility-classes.md +59 -0
  34. package/src/skills/tailwindcss-best-practices/references/core-variants.md +204 -0
  35. package/src/skills/tailwindcss-best-practices/references/effects-form-controls.md +76 -0
  36. package/src/skills/tailwindcss-best-practices/references/effects-mask.md +91 -0
  37. package/src/skills/tailwindcss-best-practices/references/effects-scroll-snap.md +59 -0
  38. package/src/skills/tailwindcss-best-practices/references/effects-text-shadow.md +78 -0
  39. package/src/skills/tailwindcss-best-practices/references/effects-transition-animation.md +80 -0
  40. package/src/skills/tailwindcss-best-practices/references/effects-visibility-interactivity.md +82 -0
  41. package/src/skills/tailwindcss-best-practices/references/features-content-detection.md +175 -0
  42. package/src/skills/tailwindcss-best-practices/references/features-custom-styles.md +203 -0
  43. package/src/skills/tailwindcss-best-practices/references/features-dark-mode.md +137 -0
  44. package/src/skills/tailwindcss-best-practices/references/features-functions-directives.md +241 -0
  45. package/src/skills/tailwindcss-best-practices/references/features-upgrade.md +160 -0
  46. package/src/skills/tailwindcss-best-practices/references/layout-aspect-ratio.md +39 -0
  47. package/src/skills/tailwindcss-best-practices/references/layout-columns.md +80 -0
  48. package/src/skills/tailwindcss-best-practices/references/layout-display.md +110 -0
  49. package/src/skills/tailwindcss-best-practices/references/layout-flexbox.md +112 -0
  50. package/src/skills/tailwindcss-best-practices/references/layout-grid.md +87 -0
  51. package/src/skills/tailwindcss-best-practices/references/layout-height.md +97 -0
  52. package/src/skills/tailwindcss-best-practices/references/layout-inset.md +103 -0
  53. package/src/skills/tailwindcss-best-practices/references/layout-logical-properties.md +92 -0
  54. package/src/skills/tailwindcss-best-practices/references/layout-margin.md +126 -0
  55. package/src/skills/tailwindcss-best-practices/references/layout-min-max-sizing.md +63 -0
  56. package/src/skills/tailwindcss-best-practices/references/layout-object-fit-position.md +64 -0
  57. package/src/skills/tailwindcss-best-practices/references/layout-overflow.md +57 -0
  58. package/src/skills/tailwindcss-best-practices/references/layout-padding.md +77 -0
  59. package/src/skills/tailwindcss-best-practices/references/layout-position.md +85 -0
  60. package/src/skills/tailwindcss-best-practices/references/layout-tables.md +67 -0
  61. package/src/skills/tailwindcss-best-practices/references/layout-width.md +102 -0
  62. package/src/skills/tailwindcss-best-practices/references/transform-base.md +68 -0
  63. package/src/skills/tailwindcss-best-practices/references/transform-rotate.md +70 -0
  64. package/src/skills/tailwindcss-best-practices/references/transform-scale.md +83 -0
  65. package/src/skills/tailwindcss-best-practices/references/transform-skew.md +62 -0
  66. package/src/skills/tailwindcss-best-practices/references/transform-translate.md +77 -0
  67. package/src/skills/tailwindcss-best-practices/references/typography-font-text.md +142 -0
  68. package/src/skills/tailwindcss-best-practices/references/typography-list-style.md +65 -0
  69. package/src/skills/tailwindcss-best-practices/references/typography-text-align.md +60 -0
  70. package/src/skills/tailwindcss-best-practices/references/visual-background.md +76 -0
  71. package/src/skills/tailwindcss-best-practices/references/visual-border.md +108 -0
  72. package/src/skills/tailwindcss-best-practices/references/visual-effects.md +111 -0
  73. package/src/skills/tailwindcss-best-practices/references/visual-svg.md +82 -0
  74. package/src/skills/test-mobile-app/SKILL.md +11 -6
  75. package/src/skills/test-mobile-app/scripts/analyze_apk.py +15 -4
  76. package/src/skills/test-mobile-app/scripts/check_environment.py +5 -5
  77. package/src/skills/test-mobile-app/scripts/run_tests.py +1 -1
  78. package/src/skills/test-web-ui/SKILL.md +264 -84
  79. package/src/skills/test-web-ui/scripts/discover.py +25 -12
  80. package/src/skills/test-web-ui/scripts/run_tests.py +3 -2
  81. package/src/skills/vite-best-practices/SKILL.md +115 -0
  82. package/src/skills/vite-best-practices/references/build-and-ssr.md +255 -0
  83. package/src/skills/vite-best-practices/references/core-config.md +231 -0
  84. package/src/skills/vite-best-practices/references/core-features.md +222 -0
  85. package/src/skills/vite-best-practices/references/core-plugin-api.md +294 -0
  86. package/src/skills/vite-best-practices/references/environment-api.md +108 -0
  87. package/src/skills/vite-best-practices/references/rolldown-migration.md +242 -0
  88. package/codex-cli-workspace/iteration-1/benchmark.json +0 -122
  89. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/eval_metadata.json +0 -13
  90. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/grading.json +0 -52
  91. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/outputs/response.md +0 -163
  92. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/with_skill/timing.json +0 -5
  93. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/grading.json +0 -58
  94. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/outputs/response.md +0 -151
  95. package/codex-cli-workspace/iteration-1/eval-1-ci-integration/without_skill/timing.json +0 -5
  96. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/eval_metadata.json +0 -13
  97. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/grading.json +0 -52
  98. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -86
  99. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/timing.json +0 -5
  100. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/grading.json +0 -58
  101. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -164
  102. package/codex-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/timing.json +0 -5
  103. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/eval_metadata.json +0 -13
  104. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/grading.json +0 -52
  105. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/outputs/response.md +0 -130
  106. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/with_skill/timing.json +0 -5
  107. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/grading.json +0 -64
  108. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/outputs/response.md +0 -209
  109. package/codex-cli-workspace/iteration-1/eval-3-profiles-troubleshooting/without_skill/timing.json +0 -5
  110. package/codex-cli-workspace/iteration-1/review.html +0 -1325
  111. package/gemini-cli-workspace/iteration-1/benchmark.json +0 -86
  112. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/eval_metadata.json +0 -37
  113. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/grading.json +0 -37
  114. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/outputs/response.md +0 -401
  115. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/with_skill/timing.json +0 -5
  116. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/grading.json +0 -37
  117. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/outputs/response.md +0 -405
  118. package/gemini-cli-workspace/iteration-1/eval-1-cicd-setup/without_skill/timing.json +0 -5
  119. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/eval_metadata.json +0 -37
  120. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/grading.json +0 -37
  121. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -212
  122. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/with_skill/timing.json +0 -5
  123. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/grading.json +0 -37
  124. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -427
  125. package/gemini-cli-workspace/iteration-1/eval-2-mcp-server-config/without_skill/timing.json +0 -5
  126. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/eval_metadata.json +0 -32
  127. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/grading.json +0 -32
  128. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/outputs/response.md +0 -171
  129. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/with_skill/timing.json +0 -5
  130. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/grading.json +0 -32
  131. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/outputs/response.md +0 -199
  132. package/gemini-cli-workspace/iteration-1/eval-3-custom-slash-command/without_skill/timing.json +0 -5
  133. package/gemini-cli-workspace/iteration-1/review.html +0 -1325
  134. package/gemini-cli-workspace/iteration-2/benchmark.json +0 -173
  135. package/gemini-cli-workspace/iteration-2/benchmark.md +0 -28
  136. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/eval_metadata.json +0 -37
  137. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/grading.json +0 -37
  138. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/outputs/response.md +0 -195
  139. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/with_skill/timing.json +0 -5
  140. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/grading.json +0 -37
  141. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/outputs/response.md +0 -377
  142. package/gemini-cli-workspace/iteration-2/eval-1-cicd-setup/without_skill/timing.json +0 -5
  143. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/eval_metadata.json +0 -37
  144. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/grading.json +0 -37
  145. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/outputs/response.md +0 -127
  146. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/with_skill/timing.json +0 -5
  147. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/grading.json +0 -37
  148. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/outputs/response.md +0 -164
  149. package/gemini-cli-workspace/iteration-2/eval-2-mcp-server-config/without_skill/timing.json +0 -5
  150. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/eval_metadata.json +0 -32
  151. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/grading.json +0 -32
  152. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/outputs/response.md +0 -91
  153. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/with_skill/timing.json +0 -5
  154. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/grading.json +0 -32
  155. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/outputs/response.md +0 -112
  156. package/gemini-cli-workspace/iteration-2/eval-3-custom-slash-command/without_skill/timing.json +0 -5
  157. package/gemini-cli-workspace/iteration-2/eval-viewer.html +0 -1325
  158. package/screen-recording-workspace/evals.json +0 -41
  159. package/screen-recording-workspace/iteration-1/benchmark.json +0 -102
  160. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/eval_metadata.json +0 -31
  161. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/grading.json +0 -11
  162. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/outputs/demo.mp4 +0 -0
  163. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/with_skill/timing.json +0 -5
  164. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/grading.json +0 -11
  165. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/outputs/demo.mp4 +0 -0
  166. package/screen-recording-workspace/iteration-1/eval-0-fullscreen/without_skill/timing.json +0 -5
  167. package/screen-recording-workspace/iteration-1/eval-1-region-audio/eval_metadata.json +0 -31
  168. package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/grading.json +0 -11
  169. package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/outputs/region_capture.mp4 +0 -0
  170. package/screen-recording-workspace/iteration-1/eval-1-region-audio/with_skill/timing.json +0 -5
  171. package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/grading.json +0 -11
  172. package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/outputs/region_capture.mp4 +0 -0
  173. package/screen-recording-workspace/iteration-1/eval-1-region-audio/without_skill/timing.json +0 -5
  174. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/eval_metadata.json +0 -31
  175. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/grading.json +0 -11
  176. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/outputs/fallback_recording.mp4 +0 -0
  177. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/with_skill/timing.json +0 -5
  178. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/grading.json +0 -11
  179. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/outputs/fallback_recording.mp4 +0 -0
  180. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/outputs/record_screen.py +0 -67
  181. package/screen-recording-workspace/iteration-1/eval-2-python-fallback/without_skill/timing.json +0 -5
  182. package/screen-recording-workspace/iteration-1/review.html +0 -1325
  183. package/src/skills/codex-cli/evals/evals.json +0 -47
  184. package/src/skills/gemini-cli/evals/evals.json +0 -46
  185. package/src/skills/tm-search/evals/evals.json +0 -23
@@ -1,1325 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Eval Review</title>
7
- <link rel="preconnect" href="https://fonts.googleapis.com">
8
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
- <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
10
- <script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>
11
- <style>
12
- :root {
13
- --bg: #faf9f5;
14
- --surface: #ffffff;
15
- --border: #e8e6dc;
16
- --text: #141413;
17
- --text-muted: #b0aea5;
18
- --accent: #d97757;
19
- --accent-hover: #c4613f;
20
- --green: #788c5d;
21
- --green-bg: #eef2e8;
22
- --red: #c44;
23
- --red-bg: #fceaea;
24
- --header-bg: #141413;
25
- --header-text: #faf9f5;
26
- --radius: 6px;
27
- }
28
-
29
- * { box-sizing: border-box; margin: 0; padding: 0; }
30
-
31
- body {
32
- font-family: 'Lora', Georgia, serif;
33
- background: var(--bg);
34
- color: var(--text);
35
- height: 100vh;
36
- display: flex;
37
- flex-direction: column;
38
- }
39
-
40
- /* ---- Header ---- */
41
- .header {
42
- background: var(--header-bg);
43
- color: var(--header-text);
44
- padding: 1rem 2rem;
45
- display: flex;
46
- justify-content: space-between;
47
- align-items: center;
48
- flex-shrink: 0;
49
- }
50
- .header h1 {
51
- font-family: 'Poppins', sans-serif;
52
- font-size: 1.25rem;
53
- font-weight: 600;
54
- }
55
- .header .instructions {
56
- font-size: 0.8rem;
57
- opacity: 0.7;
58
- margin-top: 0.25rem;
59
- }
60
- .header .progress {
61
- font-size: 0.875rem;
62
- opacity: 0.8;
63
- text-align: right;
64
- }
65
-
66
- /* ---- Main content ---- */
67
- .main {
68
- flex: 1;
69
- overflow-y: auto;
70
- padding: 1.5rem 2rem;
71
- display: flex;
72
- flex-direction: column;
73
- gap: 1.25rem;
74
- }
75
-
76
- /* ---- Sections ---- */
77
- .section {
78
- background: var(--surface);
79
- border: 1px solid var(--border);
80
- border-radius: var(--radius);
81
- flex-shrink: 0;
82
- }
83
- .section-header {
84
- font-family: 'Poppins', sans-serif;
85
- padding: 0.75rem 1rem;
86
- font-size: 0.75rem;
87
- font-weight: 500;
88
- text-transform: uppercase;
89
- letter-spacing: 0.05em;
90
- color: var(--text-muted);
91
- border-bottom: 1px solid var(--border);
92
- background: var(--bg);
93
- }
94
- .section-body {
95
- padding: 1rem;
96
- }
97
-
98
- /* ---- Config badge ---- */
99
- .config-badge {
100
- display: inline-block;
101
- padding: 0.2rem 0.625rem;
102
- border-radius: 9999px;
103
- font-family: 'Poppins', sans-serif;
104
- font-size: 0.6875rem;
105
- font-weight: 600;
106
- text-transform: uppercase;
107
- letter-spacing: 0.03em;
108
- margin-left: 0.75rem;
109
- vertical-align: middle;
110
- }
111
- .config-badge.config-primary {
112
- background: rgba(33, 150, 243, 0.12);
113
- color: #1976d2;
114
- }
115
- .config-badge.config-baseline {
116
- background: rgba(255, 193, 7, 0.15);
117
- color: #f57f17;
118
- }
119
-
120
- /* ---- Prompt ---- */
121
- .prompt-text {
122
- white-space: pre-wrap;
123
- font-size: 0.9375rem;
124
- line-height: 1.6;
125
- }
126
-
127
- /* ---- Outputs ---- */
128
- .output-file {
129
- border: 1px solid var(--border);
130
- border-radius: var(--radius);
131
- overflow: hidden;
132
- }
133
- .output-file + .output-file {
134
- margin-top: 1rem;
135
- }
136
- .output-file-header {
137
- padding: 0.5rem 0.75rem;
138
- font-size: 0.8rem;
139
- font-weight: 600;
140
- color: var(--text-muted);
141
- background: var(--bg);
142
- border-bottom: 1px solid var(--border);
143
- font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
144
- display: flex;
145
- justify-content: space-between;
146
- align-items: center;
147
- }
148
- .output-file-header .dl-btn {
149
- font-size: 0.7rem;
150
- color: var(--accent);
151
- text-decoration: none;
152
- cursor: pointer;
153
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
154
- font-weight: 500;
155
- opacity: 0.8;
156
- }
157
- .output-file-header .dl-btn:hover {
158
- opacity: 1;
159
- text-decoration: underline;
160
- }
161
- .output-file-content {
162
- padding: 0.75rem;
163
- overflow-x: auto;
164
- }
165
- .output-file-content pre {
166
- font-size: 0.8125rem;
167
- line-height: 1.5;
168
- white-space: pre-wrap;
169
- word-break: break-word;
170
- font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
171
- }
172
- .output-file-content img {
173
- max-width: 100%;
174
- height: auto;
175
- border-radius: 4px;
176
- }
177
- .output-file-content iframe {
178
- width: 100%;
179
- height: 600px;
180
- border: none;
181
- }
182
- .output-file-content table {
183
- border-collapse: collapse;
184
- font-size: 0.8125rem;
185
- width: 100%;
186
- }
187
- .output-file-content table td,
188
- .output-file-content table th {
189
- border: 1px solid var(--border);
190
- padding: 0.375rem 0.5rem;
191
- text-align: left;
192
- }
193
- .output-file-content table th {
194
- background: var(--bg);
195
- font-weight: 600;
196
- }
197
- .output-file-content .download-link {
198
- display: inline-flex;
199
- align-items: center;
200
- gap: 0.5rem;
201
- padding: 0.5rem 1rem;
202
- background: var(--bg);
203
- border: 1px solid var(--border);
204
- border-radius: 4px;
205
- color: var(--accent);
206
- text-decoration: none;
207
- font-size: 0.875rem;
208
- cursor: pointer;
209
- }
210
- .output-file-content .download-link:hover {
211
- background: var(--border);
212
- }
213
- .empty-state {
214
- color: var(--text-muted);
215
- font-style: italic;
216
- padding: 2rem;
217
- text-align: center;
218
- }
219
-
220
- /* ---- Feedback ---- */
221
- .prev-feedback {
222
- background: var(--bg);
223
- border: 1px solid var(--border);
224
- border-radius: 4px;
225
- padding: 0.625rem 0.75rem;
226
- margin-top: 0.75rem;
227
- font-size: 0.8125rem;
228
- color: var(--text-muted);
229
- line-height: 1.5;
230
- }
231
- .prev-feedback-label {
232
- font-size: 0.7rem;
233
- font-weight: 600;
234
- text-transform: uppercase;
235
- letter-spacing: 0.04em;
236
- margin-bottom: 0.25rem;
237
- color: var(--text-muted);
238
- }
239
- .feedback-textarea {
240
- width: 100%;
241
- min-height: 100px;
242
- padding: 0.75rem;
243
- border: 1px solid var(--border);
244
- border-radius: 4px;
245
- font-family: inherit;
246
- font-size: 0.9375rem;
247
- line-height: 1.5;
248
- resize: vertical;
249
- color: var(--text);
250
- }
251
- .feedback-textarea:focus {
252
- outline: none;
253
- border-color: var(--accent);
254
- box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
255
- }
256
- .feedback-status {
257
- font-size: 0.75rem;
258
- color: var(--text-muted);
259
- margin-top: 0.5rem;
260
- min-height: 1.1em;
261
- }
262
-
263
- /* ---- Grades (collapsible) ---- */
264
- .grades-toggle {
265
- display: flex;
266
- align-items: center;
267
- cursor: pointer;
268
- user-select: none;
269
- }
270
- .grades-toggle:hover {
271
- color: var(--accent);
272
- }
273
- .grades-toggle .arrow {
274
- margin-right: 0.5rem;
275
- transition: transform 0.15s;
276
- font-size: 0.75rem;
277
- }
278
- .grades-toggle .arrow.open {
279
- transform: rotate(90deg);
280
- }
281
- .grades-content {
282
- display: none;
283
- margin-top: 0.75rem;
284
- }
285
- .grades-content.open {
286
- display: block;
287
- }
288
- .grades-summary {
289
- font-size: 0.875rem;
290
- margin-bottom: 0.75rem;
291
- display: flex;
292
- align-items: center;
293
- gap: 0.5rem;
294
- }
295
- .grade-badge {
296
- display: inline-block;
297
- padding: 0.125rem 0.5rem;
298
- border-radius: 9999px;
299
- font-size: 0.75rem;
300
- font-weight: 600;
301
- }
302
- .grade-pass { background: var(--green-bg); color: var(--green); }
303
- .grade-fail { background: var(--red-bg); color: var(--red); }
304
- .assertion-list {
305
- list-style: none;
306
- }
307
- .assertion-item {
308
- padding: 0.625rem 0;
309
- border-bottom: 1px solid var(--border);
310
- font-size: 0.8125rem;
311
- }
312
- .assertion-item:last-child { border-bottom: none; }
313
- .assertion-status {
314
- font-weight: 600;
315
- margin-right: 0.5rem;
316
- }
317
- .assertion-status.pass { color: var(--green); }
318
- .assertion-status.fail { color: var(--red); }
319
- .assertion-evidence {
320
- color: var(--text-muted);
321
- font-size: 0.75rem;
322
- margin-top: 0.25rem;
323
- padding-left: 1.5rem;
324
- }
325
-
326
- /* ---- View tabs ---- */
327
- .view-tabs {
328
- display: flex;
329
- gap: 0;
330
- padding: 0 2rem;
331
- background: var(--bg);
332
- border-bottom: 1px solid var(--border);
333
- flex-shrink: 0;
334
- }
335
- .view-tab {
336
- font-family: 'Poppins', sans-serif;
337
- padding: 0.625rem 1.25rem;
338
- font-size: 0.8125rem;
339
- font-weight: 500;
340
- cursor: pointer;
341
- border: none;
342
- background: none;
343
- color: var(--text-muted);
344
- border-bottom: 2px solid transparent;
345
- transition: all 0.15s;
346
- }
347
- .view-tab:hover { color: var(--text); }
348
- .view-tab.active {
349
- color: var(--accent);
350
- border-bottom-color: var(--accent);
351
- }
352
- .view-panel { display: none; }
353
- .view-panel.active { display: flex; flex-direction: column; flex: 1; overflow: hidden; }
354
-
355
- /* ---- Benchmark view ---- */
356
- .benchmark-view {
357
- padding: 1.5rem 2rem;
358
- overflow-y: auto;
359
- flex: 1;
360
- }
361
- .benchmark-table {
362
- border-collapse: collapse;
363
- background: var(--surface);
364
- border: 1px solid var(--border);
365
- border-radius: var(--radius);
366
- font-size: 0.8125rem;
367
- width: 100%;
368
- margin-bottom: 1.5rem;
369
- }
370
- .benchmark-table th, .benchmark-table td {
371
- padding: 0.625rem 0.75rem;
372
- text-align: left;
373
- border: 1px solid var(--border);
374
- }
375
- .benchmark-table th {
376
- font-family: 'Poppins', sans-serif;
377
- background: var(--header-bg);
378
- color: var(--header-text);
379
- font-weight: 500;
380
- font-size: 0.75rem;
381
- text-transform: uppercase;
382
- letter-spacing: 0.04em;
383
- }
384
- .benchmark-table tr:hover { background: var(--bg); }
385
- .benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }
386
- .benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }
387
- .benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }
388
- .benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }
389
- .benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }
390
- .benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }
391
- .benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }
392
- .benchmark-delta-positive { color: var(--green); font-weight: 600; }
393
- .benchmark-delta-negative { color: var(--red); font-weight: 600; }
394
- .benchmark-notes {
395
- background: var(--surface);
396
- border: 1px solid var(--border);
397
- border-radius: var(--radius);
398
- padding: 1rem;
399
- }
400
- .benchmark-notes h3 {
401
- font-family: 'Poppins', sans-serif;
402
- font-size: 0.875rem;
403
- margin-bottom: 0.75rem;
404
- }
405
- .benchmark-notes ul {
406
- list-style: disc;
407
- padding-left: 1.25rem;
408
- }
409
- .benchmark-notes li {
410
- font-size: 0.8125rem;
411
- line-height: 1.6;
412
- margin-bottom: 0.375rem;
413
- }
414
- .benchmark-empty {
415
- color: var(--text-muted);
416
- font-style: italic;
417
- text-align: center;
418
- padding: 3rem;
419
- }
420
-
421
- /* ---- Navigation ---- */
422
- .nav {
423
- display: flex;
424
- justify-content: space-between;
425
- align-items: center;
426
- padding: 1rem 2rem;
427
- border-top: 1px solid var(--border);
428
- background: var(--surface);
429
- flex-shrink: 0;
430
- }
431
- .nav-btn {
432
- font-family: 'Poppins', sans-serif;
433
- padding: 0.5rem 1.25rem;
434
- border: 1px solid var(--border);
435
- border-radius: var(--radius);
436
- background: var(--surface);
437
- cursor: pointer;
438
- font-size: 0.875rem;
439
- font-weight: 500;
440
- color: var(--text);
441
- transition: all 0.15s;
442
- }
443
- .nav-btn:hover:not(:disabled) {
444
- background: var(--bg);
445
- border-color: var(--text-muted);
446
- }
447
- .nav-btn:disabled {
448
- opacity: 0.4;
449
- cursor: not-allowed;
450
- }
451
- .done-btn {
452
- font-family: 'Poppins', sans-serif;
453
- padding: 0.5rem 1.5rem;
454
- border: 1px solid var(--border);
455
- border-radius: var(--radius);
456
- background: var(--surface);
457
- color: var(--text);
458
- cursor: pointer;
459
- font-size: 0.875rem;
460
- font-weight: 500;
461
- transition: all 0.15s;
462
- }
463
- .done-btn:hover {
464
- background: var(--bg);
465
- border-color: var(--text-muted);
466
- }
467
- .done-btn.ready {
468
- border: none;
469
- background: var(--accent);
470
- color: white;
471
- font-weight: 600;
472
- }
473
- .done-btn.ready:hover {
474
- background: var(--accent-hover);
475
- }
476
- /* ---- Done overlay ---- */
477
- .done-overlay {
478
- display: none;
479
- position: fixed;
480
- inset: 0;
481
- background: rgba(0, 0, 0, 0.5);
482
- z-index: 100;
483
- justify-content: center;
484
- align-items: center;
485
- }
486
- .done-overlay.visible {
487
- display: flex;
488
- }
489
- .done-card {
490
- background: var(--surface);
491
- border-radius: 12px;
492
- padding: 2rem 3rem;
493
- text-align: center;
494
- box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
495
- max-width: 500px;
496
- }
497
- .done-card h2 {
498
- font-size: 1.5rem;
499
- margin-bottom: 0.5rem;
500
- }
501
- .done-card p {
502
- color: var(--text-muted);
503
- margin-bottom: 1.5rem;
504
- line-height: 1.5;
505
- }
506
- .done-card .btn-row {
507
- display: flex;
508
- gap: 0.5rem;
509
- justify-content: center;
510
- }
511
- .done-card button {
512
- padding: 0.5rem 1.25rem;
513
- border: 1px solid var(--border);
514
- border-radius: var(--radius);
515
- background: var(--surface);
516
- cursor: pointer;
517
- font-size: 0.875rem;
518
- }
519
- .done-card button:hover {
520
- background: var(--bg);
521
- }
522
- /* ---- Toast ---- */
523
- .toast {
524
- position: fixed;
525
- bottom: 5rem;
526
- left: 50%;
527
- transform: translateX(-50%);
528
- background: var(--header-bg);
529
- color: var(--header-text);
530
- padding: 0.625rem 1.25rem;
531
- border-radius: var(--radius);
532
- font-size: 0.875rem;
533
- opacity: 0;
534
- transition: opacity 0.3s;
535
- pointer-events: none;
536
- z-index: 200;
537
- }
538
- .toast.visible {
539
- opacity: 1;
540
- }
541
- </style>
542
- </head>
543
- <body>
544
- <div id="app" style="height:100vh; display:flex; flex-direction:column;">
545
- <div class="header">
546
- <div>
547
- <h1>Eval Review: <span id="skill-name"></span></h1>
548
- <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
549
- </div>
550
- <div class="progress" id="progress"></div>
551
- </div>
552
-
553
- <!-- View tabs (only shown when benchmark data exists) -->
554
- <div class="view-tabs" id="view-tabs" style="display:none;">
555
- <button class="view-tab active" onclick="switchView('outputs')">Outputs</button>
556
- <button class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
557
- </div>
558
-
559
- <!-- Outputs panel (qualitative review) -->
560
- <div class="view-panel active" id="panel-outputs">
561
- <div class="main">
562
- <!-- Prompt -->
563
- <div class="section">
564
- <div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
565
- <div class="section-body">
566
- <div class="prompt-text" id="prompt-text"></div>
567
- </div>
568
- </div>
569
-
570
- <!-- Outputs -->
571
- <div class="section">
572
- <div class="section-header">Output</div>
573
- <div class="section-body" id="outputs-body">
574
- <div class="empty-state">No output files found</div>
575
- </div>
576
- </div>
577
-
578
- <!-- Previous Output (collapsible) -->
579
- <div class="section" id="prev-outputs-section" style="display:none;">
580
- <div class="section-header">
581
- <div class="grades-toggle" onclick="togglePrevOutputs()">
582
- <span class="arrow" id="prev-outputs-arrow">&#9654;</span>
583
- Previous Output
584
- </div>
585
- </div>
586
- <div class="grades-content" id="prev-outputs-content"></div>
587
- </div>
588
-
589
- <!-- Grades (collapsible) -->
590
- <div class="section" id="grades-section" style="display:none;">
591
- <div class="section-header">
592
- <div class="grades-toggle" onclick="toggleGrades()">
593
- <span class="arrow" id="grades-arrow">&#9654;</span>
594
- Formal Grades
595
- </div>
596
- </div>
597
- <div class="grades-content" id="grades-content"></div>
598
- </div>
599
-
600
- <!-- Feedback -->
601
- <div class="section">
602
- <div class="section-header">Your Feedback</div>
603
- <div class="section-body">
604
- <textarea
605
- class="feedback-textarea"
606
- id="feedback"
607
- placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
608
- ></textarea>
609
- <div class="feedback-status" id="feedback-status"></div>
610
- <div class="prev-feedback" id="prev-feedback" style="display:none;">
611
- <div class="prev-feedback-label">Previous feedback</div>
612
- <div id="prev-feedback-text"></div>
613
- </div>
614
- </div>
615
- </div>
616
- </div>
617
-
618
- <div class="nav" id="outputs-nav">
619
- <button class="nav-btn" id="prev-btn" onclick="navigate(-1)">&#8592; Previous</button>
620
- <button class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
621
- <button class="nav-btn" id="next-btn" onclick="navigate(1)">Next &#8594;</button>
622
- </div>
623
- </div><!-- end panel-outputs -->
624
-
625
- <!-- Benchmark panel (quantitative stats) -->
626
- <div class="view-panel" id="panel-benchmark">
627
- <div class="benchmark-view" id="benchmark-content">
628
- <div class="benchmark-empty">No benchmark data available. Run a benchmark to see quantitative results here.</div>
629
- </div>
630
- </div>
631
- </div>
632
-
633
- <!-- Done overlay -->
634
- <div class="done-overlay" id="done-overlay">
635
- <div class="done-card">
636
- <h2>Review Complete</h2>
637
- <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
638
- <div class="btn-row">
639
- <button onclick="closeDoneDialog()">OK</button>
640
- </div>
641
- </div>
642
- </div>
643
-
644
- <!-- Toast -->
645
- <div class="toast" id="toast"></div>
646
-
647
- <script>
648
- // ---- Embedded data (injected by generate_review.py) ----
649
- const EMBEDDED_DATA = {"skill_name": "gemini-cli", "runs": [{"id": "eval-1-cicd-setup-with_skill", "prompt": "I want to set up gemini cli in my CI/CD pipeline on GitHub Actions so it can automatically review PRs. We use an API key stored in GitHub secrets. How do I set this up with structured JSON output so I can parse the review?", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\n## Prerequisites\n\n1. A Gemini API key (get one at https://aistudio.google.com/apikey)\n2. The API key stored as a GitHub Actions secret named `GEMINI_API_KEY`\n\n## GitHub Actions Workflow\n\nCreate a file at `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize]\n\npermissions:\n pull-requests: write\n contents: read\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout code\n uses: actions/checkout@v4\n with:\n fetch-depth: 0 # Full history so we can diff against base\n\n - name: Set up Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @google/gemini-cli\n\n - name: Run AI Code Review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n # Get the diff between the PR branch and the base branch\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n\n if [ -z \"$DIFF\" ]; then\n echo \"No changes to review.\"\n exit 0\n fi\n\n # Run Gemini CLI with structured JSON output\n RESULT=$(echo \"$DIFF\" | gemini -p \"You are a senior code reviewer. Review the following pull request diff for:\n 1. Bugs and logical errors\n 2. Security vulnerabilities\n 3. Performance issues\n 4. Code style and best practices\n\n Provide your review in a structured format with sections for each category. 
For each issue found, include the file name, line context, severity (critical/warning/info), and a suggested fix.\n\n If no issues are found in a category, say so explicitly.\" \\\n --output-format json)\n\n # Extract just the response text\n REVIEW=$(echo \"$RESULT\" | jq -r '.response')\n\n # Extract token usage stats for logging\n TOKENS=$(echo \"$RESULT\" | jq '.stats.models | to_entries[0].value.tokens.total // \"unknown\"')\n echo \"Tokens used: $TOKENS\"\n\n # Write the review to the GitHub Step Summary\n echo \"## Gemini Code Review\" >> $GITHUB_STEP_SUMMARY\n echo \"\" >> $GITHUB_STEP_SUMMARY\n echo \"$REVIEW\" >> $GITHUB_STEP_SUMMARY\n\n # Save the full JSON result for downstream steps\n echo \"$RESULT\" > review-result.json\n\n - name: Post review as PR comment\n if: success()\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n REVIEW=$(jq -r '.response' review-result.json)\n TOKENS=$(jq '.stats.models | to_entries[0].value.tokens.total // \"unknown\"' review-result.json)\n\n COMMENT_BODY=$(cat <<EOF\n ## Gemini Automated Code Review\n\n $REVIEW\n\n ---\n *Reviewed by Gemini CLI | Tokens used: $TOKENS*\n EOF\n )\n\n gh pr comment ${{ github.event.pull_request.number }} --body \"$COMMENT_BODY\"\n```\n\n## How It Works\n\n### Authentication\n\nThe workflow uses the `GEMINI_API_KEY` environment variable, which is pulled from GitHub Secrets. Gemini CLI automatically detects this variable -- no additional configuration is needed. The free tier allows 1,000 requests per day, which is sufficient for most PR review workflows.\n\n### Structured JSON Output\n\nThe `--output-format json` flag causes Gemini CLI to return a single JSON object once all processing is complete. 
The schema looks like this:\n\n```json\n{\n \"response\": \"The review text content...\",\n \"stats\": {\n \"models\": {\n \"gemini-2.5-pro\": {\n \"tokens\": {\n \"prompt\": 24939,\n \"candidates\": 20,\n \"total\": 25113\n },\n \"api\": {\n \"totalRequests\": 2,\n \"totalErrors\": 0,\n \"totalLatencyMs\": 5053\n }\n }\n },\n \"tools\": {\n \"totalCalls\": 1,\n \"totalSuccess\": 1,\n \"totalFail\": 0\n },\n \"files\": {\n \"totalLinesAdded\": 0,\n \"totalLinesRemoved\": 0\n }\n }\n}\n```\n\nYou extract the review text with `jq -r '.response'` and can inspect token usage or tool call stats from the `stats` object.\n\n### Exit Codes\n\nGemini CLI uses these exit codes, which you can use for conditional logic:\n\n| Code | Meaning |\n|------|---------|\n| 0 | Success |\n| 1 | General error |\n| 42 | Input error (invalid arguments, missing config) |\n| 53 | Turn limit exceeded |\n\n## Parsing the JSON Output in a Downstream Step\n\nIf you need to parse the review in a separate step (for example, to fail the build on critical findings), you can do it like this:\n\n```yaml\n - name: Check for critical issues\n run: |\n REVIEW=$(jq -r '.response' review-result.json)\n\n # Simple check: fail if the review mentions critical severity\n if echo \"$REVIEW\" | grep -qi \"critical\"; then\n echo \"::error::Critical issues found in code review\"\n exit 1\n fi\n```\n\n## Advanced: Using a GEMINI.md for Consistent Review Context\n\nCreate a `.gemini/GEMINI.md` file in your repository root to give Gemini persistent context about your project's standards:\n\n```markdown\n# Project Review Guidelines\n\n## Code Standards\n- All functions must have JSDoc comments\n- Use 2-space indentation\n- No console.log in production code\n\n## Security Requirements\n- Never hardcode secrets or API keys\n- All user input must be validated\n- SQL queries must use parameterized statements\n```\n\nGemini CLI automatically loads this file when running in your repo directory, so the reviewer will apply 
your project-specific rules during every PR review.\n\n## Known Limitations\n\n- Custom slash commands (`.toml` files) do not work in headless/non-interactive mode. You must embed the full prompt text directly in your script, as shown above.\n- Extensions are also unavailable in headless mode.\n- The free tier is rate-limited to 60 requests/minute and 1,000 requests/day. For high-volume repos, consider using a paid API key.\n- Very large diffs may exceed the model's context window. Consider filtering the diff to only changed files that matter (e.g., exclude lockfiles and generated code).\n"}], "grading": {"eval_id": 1, "eval_name": "cicd-setup", "config": "with_skill", "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": true, "evidence": "Line 53: gemini -p \"You are a senior code reviewer...\" with --output-format json"}, {"text": "Includes --output-format json flag for structured output", "passed": true, "evidence": "Line 62: --output-format json flag used correctly"}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Line 42: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} in env block"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Complete workflow YAML at .github/workflows/gemini-pr-review.yml with checkout, node setup, install, review, and comment steps"}, {"text": "Shows how to parse JSON response (e.g., with jq .response)", "passed": true, "evidence": "Line 65: REVIEW=$(echo \"$RESULT\" | jq -r '.response') and line 68: token stats extraction with jq"}, {"text": "Does not invent non-existent CLI flags or options", "passed": true, "evidence": "Only uses verified flags: -p and --output-format json. Exit codes match corrected values (0, 1, 42, 53)."}]}}, {"id": "eval-1-cicd-setup-without_skill", "prompt": "I want to set up gemini cli in my CI/CD pipeline on GitHub Actions so it can automatically review PRs. 
We use an API key stored in GitHub secrets. How do I set this up with structured JSON output so I can parse the review?", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\n## Overview\n\nThis guide covers how to configure Gemini CLI in a GitHub Actions CI/CD pipeline to automatically review pull requests, using an API key from GitHub Secrets and structured JSON output for parsing.\n\n## Prerequisites\n\n1. A Google AI API key (from Google AI Studio)\n2. The API key stored as a GitHub Actions secret (e.g., `GEMINI_API_KEY`)\n3. A GitHub repository with pull requests to review\n\n## Step 1: Store Your API Key in GitHub Secrets\n\n1. Go to your repository on GitHub\n2. Navigate to **Settings > Secrets and variables > Actions**\n3. Click **New repository secret**\n4. Name it `GEMINI_API_KEY` and paste your API key\n5. Click **Add secret**\n\n## Step 2: Create the GitHub Actions Workflow\n\nCreate a file at `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout code\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Setup Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @google/gemini-cli\n\n - name: Get PR diff\n id: diff\n run: |\n git diff origin/${{ github.base_ref }}...HEAD > pr_diff.txt\n echo \"diff_size=$(wc -c < pr_diff.txt)\" >> $GITHUB_OUTPUT\n\n - name: Run Gemini Review\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n DIFF_CONTENT=$(cat pr_diff.txt)\n\n # Create the prompt requesting structured JSON output\n cat > review_prompt.txt << 'PROMPT_EOF'\n You are a code reviewer. 
Analyze the following PR diff and provide your review as a JSON object with this exact structure:\n\n {\n \"summary\": \"Brief overall summary of the changes\",\n \"risk_level\": \"low|medium|high\",\n \"issues\": [\n {\n \"severity\": \"critical|warning|suggestion\",\n \"file\": \"path/to/file\",\n \"line\": 0,\n \"description\": \"Description of the issue\",\n \"suggestion\": \"Suggested fix\"\n }\n ],\n \"positives\": [\"List of good practices observed\"],\n \"approved\": true\n }\n\n Respond ONLY with valid JSON, no markdown fences, no extra text.\n\n Here is the diff:\n PROMPT_EOF\n\n # Append the diff content\n cat pr_diff.txt >> review_prompt.txt\n\n # Run Gemini CLI with the prompt\n # Use --json flag if available, otherwise parse text output\n REVIEW_OUTPUT=$(gemini -p \"$(cat review_prompt.txt)\" 2>/dev/null || true)\n\n # Save raw output for debugging\n echo \"$REVIEW_OUTPUT\" > review_output.json\n\n # Validate JSON\n if echo \"$REVIEW_OUTPUT\" | jq . > /dev/null 2>&1; then\n echo \"valid_json=true\" >> $GITHUB_OUTPUT\n else\n echo \"valid_json=false\" >> $GITHUB_OUTPUT\n # Try to extract JSON from the response (in case of markdown fences)\n REVIEW_OUTPUT=$(echo \"$REVIEW_OUTPUT\" | sed -n '/^{/,/^}/p' | head -1)\n echo \"$REVIEW_OUTPUT\" > review_output.json\n fi\n\n - name: Parse and Post Review\n if: always()\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n\n let review;\n try {\n const rawOutput = fs.readFileSync('review_output.json', 'utf8');\n // Strip markdown code fences if present\n const cleaned = rawOutput\n .replace(/```json\\n?/g, '')\n .replace(/```\\n?/g, '')\n .trim();\n review = JSON.parse(cleaned);\n } catch (e) {\n console.log('Failed to parse review JSON:', e.message);\n // Post a fallback comment\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body: '## Gemini Review\\n\\n_Review output could not be parsed as 
JSON. Raw output saved as artifact._'\n });\n return;\n }\n\n // Build the review comment\n let body = `## Gemini PR Review\\n\\n`;\n body += `**Summary:** ${review.summary}\\n\\n`;\n body += `**Risk Level:** ${review.risk_level}\\n\\n`;\n\n if (review.issues && review.issues.length > 0) {\n body += `### Issues Found (${review.issues.length})\\n\\n`;\n for (const issue of review.issues) {\n const emoji = issue.severity === 'critical' ? '\u00f0\u0178\u201d\u00b4' :\n issue.severity === 'warning' ? '\u00f0\u0178\u0178\u00a1' : '\u00f0\u0178\u201d\u00b5';\n body += `${emoji} **${issue.severity.toUpperCase()}** - \\`${issue.file}\\``;\n if (issue.line) body += ` (line ${issue.line})`;\n body += `\\n`;\n body += `> ${issue.description}\\n`;\n if (issue.suggestion) {\n body += `> **Suggestion:** ${issue.suggestion}\\n`;\n }\n body += `\\n`;\n }\n } else {\n body += `### No Issues Found\\n\\n`;\n }\n\n if (review.positives && review.positives.length > 0) {\n body += `### Positives\\n\\n`;\n for (const positive of review.positives) {\n body += `- ${positive}\\n`;\n }\n body += `\\n`;\n }\n\n body += `---\\n_Automated review by Gemini CLI_`;\n\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body: body\n });\n\n - name: Upload Review Artifact\n if: always()\n uses: actions/upload-artifact@v4\n with:\n name: gemini-review\n path: review_output.json\n retention-days: 7\n```\n\n## Step 3: Alternative Approach Using the Gemini API Directly (More Reliable for JSON)\n\nIf you want more control over structured output, you can call the Gemini API directly with `curl` instead of relying on the CLI. 
This gives you access to the `response_mime_type` parameter for guaranteed JSON:\n\n```yaml\n - name: Run Gemini Review via API\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n DIFF_CONTENT=$(cat pr_diff.txt | jq -Rs .)\n\n # Build the API request with structured output\n cat > request.json << EOF\n {\n \"contents\": [{\n \"parts\": [{\n \"text\": \"You are a code reviewer. Analyze this PR diff and provide a review.\\n\\nDiff:\\n${DIFF_CONTENT}\"\n }]\n }],\n \"generationConfig\": {\n \"responseMimeType\": \"application/json\",\n \"responseSchema\": {\n \"type\": \"object\",\n \"properties\": {\n \"summary\": { \"type\": \"string\" },\n \"risk_level\": { \"type\": \"string\", \"enum\": [\"low\", \"medium\", \"high\"] },\n \"issues\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"severity\": { \"type\": \"string\", \"enum\": [\"critical\", \"warning\", \"suggestion\"] },\n \"file\": { \"type\": \"string\" },\n \"line\": { \"type\": \"integer\" },\n \"description\": { \"type\": \"string\" },\n \"suggestion\": { \"type\": \"string\" }\n },\n \"required\": [\"severity\", \"file\", \"description\"]\n }\n },\n \"positives\": {\n \"type\": \"array\",\n \"items\": { \"type\": \"string\" }\n },\n \"approved\": { \"type\": \"boolean\" }\n },\n \"required\": [\"summary\", \"risk_level\", \"issues\", \"approved\"]\n }\n }\n }\n EOF\n\n # Call the Gemini API\n curl -s \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${GEMINI_API_KEY}\" \\\n -H 'Content-Type: application/json' \\\n -d @request.json \\\n -o api_response.json\n\n # Extract the text content from the API response\n jq -r '.candidates[0].content.parts[0].text' api_response.json > review_output.json\n```\n\n## Step 4: Handling Large Diffs\n\nFor large PRs, you may hit token limits. 
Add a truncation step:\n\n```yaml\n - name: Prepare diff (truncated if needed)\n run: |\n # Limit diff to ~100KB to stay within token limits\n if [ $(wc -c < pr_diff.txt) -gt 100000 ]; then\n head -c 100000 pr_diff.txt > pr_diff_truncated.txt\n echo -e \"\\n\\n[DIFF TRUNCATED - showing first 100KB]\" >> pr_diff_truncated.txt\n mv pr_diff_truncated.txt pr_diff.txt\n fi\n```\n\n## Key Configuration Notes\n\n### Authentication\n\n- Gemini CLI uses the `GEMINI_API_KEY` environment variable for authentication\n- In GitHub Actions, pass it via `env:` from secrets -- never hardcode it\n\n### Structured JSON Output\n\nThere are two approaches for getting structured JSON:\n\n1. **Prompt-based** (CLI approach): Ask the model to return JSON in your prompt. Less reliable but simpler. Include \"Respond ONLY with valid JSON\" in your prompt.\n\n2. **Schema-based** (API approach): Use `responseMimeType: \"application/json\"` with a `responseSchema` in the API request. This guarantees valid JSON output conforming to your schema. This is the more reliable approach for CI/CD pipelines.\n\n### Parsing the Output\n\nAlways include fallback handling when parsing JSON from LLM output:\n\n```bash\n# Strip potential markdown code fences\nCLEAN_JSON=$(echo \"$RAW_OUTPUT\" | sed 's/^```json//; s/^```//; /^$/d')\n\n# Validate with jq\nif echo \"$CLEAN_JSON\" | jq . 
> /dev/null 2>&1; then\n echo \"Valid JSON\"\nelse\n echo \"Invalid JSON - falling back\"\nfi\n```\n\n### Cost and Rate Limiting\n\n- Consider adding rate limiting or caching to avoid excessive API calls\n- Use `concurrency` in your workflow to prevent parallel runs:\n\n```yaml\nconcurrency:\n group: gemini-review-${{ github.event.pull_request.number }}\n cancel-in-progress: true\n```\n\n### Security Considerations\n\n- Never log or echo the API key in workflow output\n- Use `pull_requests: write` permission (minimum required)\n- Consider limiting the workflow to specific file types or paths:\n\n```yaml\non:\n pull_request:\n paths:\n - '**.js'\n - '**.ts'\n - '**.py'\n```\n\n## Complete Minimal Example\n\nFor a quick start, here is the simplest possible setup:\n\n```yaml\nname: Gemini Review\non:\n pull_request:\n types: [opened, synchronize]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Setup and Review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n npm install -g @google/gemini-cli\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n REVIEW=$(gemini -p \"Review this code diff. Return JSON with keys: summary (string), issues (array of {severity, file, description}), approved (boolean). Only output JSON. 
Diff: $DIFF\")\n echo \"$REVIEW\" > review.json\n\n - name: Post Comment\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n const raw = fs.readFileSync('review.json', 'utf8');\n const clean = raw.replace(/```json\\n?/g, '').replace(/```\\n?/g, '').trim();\n let review;\n try { review = JSON.parse(clean); } catch(e) {\n return github.rest.issues.createComment({\n ...context.repo, issue_number: context.issue.number,\n body: '## Gemini Review\\nFailed to parse output.'\n });\n }\n let body = `## Gemini Review\\n**Summary:** ${review.summary}\\n\\n`;\n if (review.issues?.length) {\n body += review.issues.map(i => `- **${i.severity}** \\`${i.file}\\`: ${i.description}`).join('\\n');\n } else { body += 'No issues found.'; }\n await github.rest.issues.createComment({\n ...context.repo, issue_number: context.issue.number, body\n });\n```\n\nThis setup will automatically run Gemini CLI on every PR, parse the structured JSON review, and post the results as a PR comment.\n"}], "grading": {"eval_id": 1, "eval_name": "cicd-setup", "config": "without_skill", "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": true, "evidence": "Line 96: gemini -p \"$(cat review_prompt.txt)\" and line 351: gemini -p \"Review this code diff...\""}, {"text": "Includes --output-format json flag for structured output", "passed": false, "evidence": "Does not use --output-format json. Instead relies on prompt-based JSON (asking model to return JSON in prompt text). 
Line 95 comment mentions '--json flag if available' which is not a real Gemini CLI flag."}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Line 62: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} in env block"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Complete workflow YAML provided with checkout, node setup, install, review, and post-comment steps"}, {"text": "Shows how to parse JSON response (e.g., with jq .response)", "passed": true, "evidence": "Uses jq for JSON validation (line 102) and parsing (line 248: jq -r '.candidates[0].content.parts[0].text'). Different schema than CLI --output-format json but still demonstrates jq parsing."}, {"text": "Does not invent non-existent CLI flags or options", "passed": false, "evidence": "Line 95: comment references '--json flag if available' which is not a real Gemini CLI flag. Should use --output-format json instead."}]}}, {"id": "eval-2-mcp-server-config-with_skill", "prompt": "I need to connect a custom MCP server I built in Python to gemini cli. The server needs an API key from my environment. Also I want to restrict which tools it can expose. How do I configure this?", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\n## Configuration\n\nAdd your server to `~/.gemini/settings.json` (global) or `<project>/.gemini/settings.json` (project-scoped) under the `mcpServers` key:\n\n```json\n{\n \"mcpServers\": {\n \"myPythonServer\": {\n \"command\": \"python\",\n \"args\": [\"-m\", \"my_mcp_server\"],\n \"cwd\": \"./mcp_tools\",\n \"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"tool_a\", \"tool_b\"]\n }\n }\n}\n```\n\n## Breakdown of each field\n\n| Field | Purpose |\n|---|---|\n| `command` | The executable to run. Use `\"python\"` (or `\"python3\"` depending on your system). 
|\n| `args` | Arguments passed to the command. Use `[\"-m\", \"my_mcp_server\"]` to run as a module, or `[\"./path/to/server.py\"]` to run a script directly. |\n| `cwd` | Optional working directory for the server process. |\n| `env` | Environment variables passed to the server. Use the `\"$VAR_NAME\"` syntax to reference variables from your shell environment rather than hardcoding secrets. |\n| `timeout` | Connection timeout in milliseconds. Defaults to 600000 (10 minutes). Set lower (e.g., 15000) for servers that should start quickly. |\n| `trust` | When `false` (the default), Gemini CLI will prompt you for confirmation before executing tools from this server. Set to `true` only for servers you fully control and trust. |\n| `includeTools` | A whitelist array. Only the tools listed here will be exposed to Gemini. All other tools from the server are hidden. |\n| `excludeTools` | A blacklist array. The listed tools are hidden; everything else is exposed. |\n\nUse `includeTools` or `excludeTools`, not both. `includeTools` is the safer choice when you want to tightly restrict what the model can call.\n\n## Passing the API key securely\n\nNever hardcode secrets in `settings.json`. 
Instead, set the variable in your shell environment:\n\n```bash\nexport MY_API_KEY=\"sk-your-actual-key\"\n```\n\nThen reference it with the dollar-sign prefix in the config:\n\n```json\n\"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n}\n```\n\nGemini CLI automatically redacts environment variable values that match patterns like `*TOKEN*`, `*SECRET*`, `*KEY*` from MCP process output, so your key will not leak into logs.\n\nYou can also store the variable in `~/.gemini/.env` or `<project>/.gemini/.env` and it will be loaded automatically.\n\n## Restricting exposed tools\n\nTo allow only specific tools (whitelist approach):\n\n```json\n\"includeTools\": [\"query_data\", \"list_items\"]\n```\n\nTo block specific tools while allowing everything else (blacklist approach):\n\n```json\n\"excludeTools\": [\"delete_all\", \"admin_reset\"]\n```\n\n## Verifying the connection\n\nAfter configuring, start Gemini CLI and check the server status:\n\n```bash\ngemini\n```\n\nThen inside the interactive session:\n\n```\n/mcp list\n/mcp status\n```\n\nYou can also manage servers from the command line:\n\n```bash\ngemini mcp list\ngemini mcp remove myPythonServer\n```\n\n## Invoking your server's tools\n\nIn the interactive session, reference the server by name with the `@` prefix:\n\n```\n@myPythonServer query all active users\n```\n\nGemini CLI will route the request to the appropriate tool exposed by your MCP server.\n\n## Complete example\n\nAssuming your Python MCP server is at `./tools/my_server.py` and exposes tools named `search`, `fetch_details`, and `delete_record`, and you only want to allow `search` and `fetch_details`:\n\n```json\n{\n \"mcpServers\": {\n \"myPythonServer\": {\n \"command\": \"python\",\n \"args\": [\"./tools/my_server.py\"],\n \"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"search\", \"fetch_details\"]\n }\n }\n}\n```\n\nThis configuration starts the Python server, passes the API key from your 
environment, sets a 15-second connection timeout, requires confirmation prompts for tool calls, and only exposes the `search` and `fetch_details` tools to Gemini.\n"}], "grading": {"eval_id": 2, "eval_name": "mcp-server-config", "config": "with_skill", "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "Opens with: 'Add your server to ~/.gemini/settings.json (global) or <project>/.gemini/settings.json (project-scoped)'"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Full JSON config block with mcpServers containing command, args, cwd, env, timeout, trust, includeTools"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": true, "evidence": "Shows \"API_KEY\": \"$MY_API_KEY\" pattern and explains the dollar-sign prefix syntax"}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": true, "evidence": "Dedicated section explaining both includeTools (whitelist) and excludeTools (blacklist) with examples"}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": true, "evidence": "Explains trust field (default false, requires confirmation), warns never to hardcode secrets, mentions automatic redaction of env vars"}, {"text": "Uses correct Python command (python/python3) in the command field for the MCP server", "passed": true, "evidence": "Uses \"command\": \"python\" consistently across examples"}]}}, {"id": "eval-2-mcp-server-config-without_skill", "prompt": "I need to connect a custom MCP server I built in Python to gemini cli. The server needs an API key from my environment. Also I want to restrict which tools it can expose. 
How do I configure this?", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\n## Configuration File Location\n\nGemini CLI reads MCP server configuration from `settings.json` at two levels:\n\n- **Global (user-level):** `~/.gemini/settings.json` -- applies to all Gemini CLI sessions\n- **Project-level:** `.gemini/settings.json` in your project root -- applies only when running from that project\n\nProject-level settings override global ones.\n\n## Full Configuration Example\n\nHere is how to configure a custom Python MCP server with an API key from your environment and tool filtering:\n\n```json\n{\n \"mcpServers\": {\n \"my-python-server\": {\n \"command\": \"python\",\n \"args\": [\"path/to/your/server.py\"],\n \"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n },\n \"cwd\": \"/path/to/server/directory\",\n \"timeout\": 30000,\n \"includeTools\": [\"allowed_tool_1\", \"allowed_tool_2\", \"allowed_tool_3\"]\n }\n }\n}\n```\n\n## Explanation of Each Property\n\n### Required (choose one transport)\n\n| Property | Description |\n|----------|-------------|\n| `command` | Path to the executable for stdio transport (e.g., `\"python\"`, `\"node\"`) |\n| `url` | SSE endpoint URL for remote servers (e.g., `\"http://localhost:8080/sse\"`) |\n| `httpUrl` | HTTP streaming endpoint URL |\n\n### Optional Properties\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `args` | string[] | Command-line arguments passed to the server process |\n| `env` | object | Environment variables to pass to the server (supports expansion) |\n| `cwd` | string | Working directory for the server process |\n| `timeout` | number | Request timeout in milliseconds (default: 600,000ms / 10 minutes) |\n| `trust` | boolean | When `true`, bypasses tool confirmation prompts |\n| `includeTools` | string[] | Allowlist -- only these tools will be available |\n| `excludeTools` | string[] | Blocklist -- these 
tools will be hidden from the model |\n| `headers` | object | Custom HTTP headers (for remote transports) |\n\n## Passing the API Key via Environment Variables\n\nGemini CLI supports environment variable expansion in the `env` block using two syntaxes:\n\n```json\n\"env\": {\n \"API_KEY\": \"$MY_API_KEY\",\n \"OTHER_SECRET\": \"${ANOTHER_ENV_VAR}\"\n}\n```\n\nBoth `$VAR_NAME` and `${VAR_NAME}` are supported. At runtime, Gemini CLI reads the value of `MY_API_KEY` from your host shell environment and passes it to the MCP server process.\n\n**Security note:** Gemini CLI automatically redacts environment variables to prevent leakage. However, variables you explicitly declare in the `env` block are considered trusted (since you intentionally configured them) and will not be redacted when passed to that specific server. You must explicitly list any environment variable that the server needs -- variables are not inherited automatically.\n\n## Restricting Exposed Tools\n\nGemini CLI provides two complementary mechanisms for tool filtering:\n\n### Allowlist with `includeTools`\n\nOnly the tools listed here will be available from that server. All other tools the server exposes will be hidden:\n\n```json\n\"includeTools\": [\"query_data\", \"get_status\", \"run_analysis\"]\n```\n\n### Blocklist with `excludeTools`\n\nTools listed here will be hidden, even if the server exposes them:\n\n```json\n\"excludeTools\": [\"delete_records\", \"admin_reset\"]\n```\n\n### Precedence Rule\n\n`excludeTools` takes precedence over `includeTools`. 
If a tool appears in both lists, it will be excluded.\n\n### Choosing an Approach\n\n- Use `includeTools` when your server exposes many tools but you only want a specific subset available (allowlist approach).\n- Use `excludeTools` when you want most tools available but need to block a few dangerous ones (blocklist approach).\n- You can combine both for fine-grained control.\n\n## Complete Practical Example\n\nSuppose you built a Python MCP server at `~/mcp-servers/my_api_server.py` that exposes tools `fetch_data`, `analyze`, `delete_all`, and `export_csv`. You want to pass your API key and only allow `fetch_data` and `analyze`.\n\nCreate or edit `~/.gemini/settings.json`:\n\n```json\n{\n \"mcpServers\": {\n \"my-api-server\": {\n \"command\": \"python\",\n \"args\": [\"/home/user/mcp-servers/my_api_server.py\"],\n \"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n },\n \"timeout\": 30000,\n \"includeTools\": [\"fetch_data\", \"analyze\"]\n }\n }\n}\n```\n\nMake sure `MY_API_KEY` is set in your shell environment (e.g., in `~/.bashrc` or `~/.zshrc`):\n\n```bash\nexport MY_API_KEY=\"your-secret-key-here\"\n```\n\n### Using FastMCP\n\nIf your Python server uses FastMCP (the recommended approach for Python MCP servers), the configuration would look like:\n\n```json\n{\n \"mcpServers\": {\n \"my-api-server\": {\n \"command\": \"uv\",\n \"args\": [\"run\", \"--with\", \"fastmcp\", \"fastmcp\", \"run\", \"my_server.py\"],\n \"cwd\": \"/home/user/mcp-servers\",\n \"env\": {\n \"API_KEY\": \"$MY_API_KEY\"\n },\n \"includeTools\": [\"fetch_data\", \"analyze\"]\n }\n }\n}\n```\n\n### Using the CLI to Add a Server\n\nYou can also add a server via the command line instead of editing `settings.json` manually:\n\n```bash\ngemini mcp add my-api-server -e API_KEY=$MY_API_KEY python /home/user/mcp-servers/my_api_server.py\n```\n\nNote: Tool filtering (`includeTools`/`excludeTools`) must be configured by editing `settings.json` directly, as the CLI `add` command may not support these 
options as flags.\n\n## Sources\n\n- [MCP servers with the Gemini CLI (official docs)](https://google-gemini.github.io/gemini-cli/docs/tools/mcp-server.html)\n- [Gemini CLI configuration reference](https://google-gemini.github.io/gemini-cli/docs/get-started/configuration.html)\n- [Gemini CLI + FastMCP integration (Google Developers Blog)](https://developers.googleblog.com/en/gemini-cli-fastmcp-simplifying-mcp-server-development/)\n- [MCP server docs on GitHub](https://github.com/google-gemini/gemini-cli/blob/main/docs/tools/mcp-server.md)\n"}], "grading": {"eval_id": 2, "eval_name": "mcp-server-config", "config": "without_skill", "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "States '~/.gemini/settings.json' (global) and '.gemini/settings.json' (project-level)"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Full JSON config with mcpServers, command, args, env, cwd, timeout, includeTools"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": true, "evidence": "Shows both $MY_API_KEY and ${ANOTHER_ENV_VAR} syntax with explanation"}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": true, "evidence": "Dedicated sections for both includeTools (allowlist) and excludeTools (blocklist) with examples and precedence rules"}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": true, "evidence": "Mentions trust field, automatic redaction, env variable security note about trusted declarations"}, {"text": "Uses correct Python command (python/python3) in the command field for the MCP server", "passed": true, "evidence": "Uses \"command\": \"python\" consistently, also shows uv-based FastMCP alternative"}]}}, {"id": "eval-3-custom-slash-command-with_skill", "prompt": "How do I create a custom slash command in gemini cli that runs git 
diff --cached and asks the AI to write a commit message? I want it available in all my projects.", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Creating a Global Custom Slash Command for Commit Messages in Gemini CLI\n\nTo create a custom slash command that runs `git diff --cached` and asks the AI to write a commit message, available across all your projects, create a single `.toml` file in your user-scoped commands directory.\n\n## Step-by-Step\n\n### 1. Create the commands directory (if it doesn't exist)\n\n```bash\nmkdir -p ~/.gemini/commands\n```\n\n### 2. Create the slash command file\n\nCreate the file `~/.gemini/commands/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nBased on the following staged git diff, write a concise and descriptive commit message following conventional commit format (e.g., feat:, fix:, refactor:, docs:, chore:).\n\nThe first line should be a short summary (50 chars or less). If needed, add a blank line followed by a more detailed explanation.\n\nStaged changes:\n!{git diff --cached}\n\"\"\"\n```\n\n### How it works\n\n- **File location**: `~/.gemini/commands/commit.toml` -- placing it under `~/.gemini/commands/` makes it user-scoped, meaning it is available in every project you open with Gemini CLI.\n- **`!{git diff --cached}`**: The `!{...}` syntax is shell execution interpolation. When the slash command runs, Gemini CLI executes `git diff --cached` and substitutes the output directly into the prompt before sending it to the model.\n- **`description`**: Optional but recommended. It shows up when you run `/help` to list available commands.\n- **`prompt`**: The full prompt template sent to the model, with the shell output injected.\n\n### 3. 
Use the command\n\nStart an interactive Gemini CLI session in any git repository:\n\n```bash\ngemini\n```\n\nThen stage your changes and invoke the command:\n\n```\n> /commit\n```\n\nGemini will run `git diff --cached`, inject the diff into the prompt, and return a suggested commit message.\n\n## Variations\n\n### Accept an argument for extra context\n\nIf you want to optionally pass additional context (e.g., a ticket number or description):\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nBased on the following staged git diff, write a concise commit message.\n\nAdditional context from the user: {{args}}\n\nStaged changes:\n!{git diff --cached}\n\"\"\"\n```\n\nUsage: `/commit JIRA-1234 add user validation`\n\nThe `{{args}}` placeholder captures everything typed after `/commit`.\n\n### Namespaced command\n\nIf you prefer organizing under a `git` namespace, place the file at:\n\n```\n~/.gemini/commands/git/commit.toml\n```\n\nThen invoke it as `/git:commit` in the interactive session.\n\n## Important notes\n\n- Custom slash commands work in **interactive mode only**. They are not available in headless mode (`-p` flag). 
For headless commit message generation, pipe the diff directly:\n ```bash\n git diff --cached | gemini -p \"Write a concise commit message\" --output-format json | jq -r '.response'\n ```\n- The command runs in whatever directory you launched `gemini` from, so `git diff --cached` will pick up the correct repository context automatically.\n- If there are no staged changes, the diff output will be empty, and the model will tell you there is nothing to generate a message for.\n"}], "grading": {"eval_id": 3, "eval_name": "custom-slash-command", "config": "with_skill", "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": true, "evidence": "States 'Create the file ~/.gemini/commands/commit.toml' and explains ~/.gemini/commands/ is user-scoped"}, {"text": "Shows .toml file format with prompt field", "passed": true, "evidence": "Complete .toml example with description and prompt fields using triple-quoted string"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": true, "evidence": "Uses !{git diff --cached} in the prompt field and explains the !{...} syntax is shell execution interpolation"}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": true, "evidence": "Important notes section: 'Custom slash commands work in interactive mode only. They are not available in headless mode (-p flag).'"}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": true, "evidence": "Complete commit.toml file with description and prompt fields, ready to copy-paste"}]}}, {"id": "eval-3-custom-slash-command-without_skill", "prompt": "How do I create a custom slash command in gemini cli that runs git diff --cached and asks the AI to write a commit message? 
I want it available in all my projects.", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Creating a Custom Slash Command in Gemini CLI for Commit Messages\n\nGemini CLI supports custom slash commands defined as `.toml` files. To make a command available across all your projects, place the file in the **global** commands directory: `~/.gemini/commands/`.\n\n## Step-by-Step Setup\n\n### 1. Create the commands directory (if it does not exist)\n\n```bash\nmkdir -p ~/.gemini/commands\n```\n\n### 2. Create the command file\n\nCreate the file `~/.gemini/commands/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"Review the following staged git diff and write a clear, conventional commit message for it. Follow the Conventional Commits format (e.g., feat:, fix:, refactor:, docs:, chore:). Include a concise subject line (max 72 characters) and, if the changes are non-trivial, a body that explains the \"why\" behind the changes.\n\n```diff\n!{git diff --cached}\n```\"\"\"\n```\n\nThis creates the slash command `/commit`.\n\n### How It Works\n\n- **`description`** -- A one-line summary shown when you run `/help` inside Gemini CLI.\n- **`prompt`** -- The full prompt sent to the Gemini model. The `!{git diff --cached}` syntax is a **shell command injection block**. When the command runs, Gemini CLI executes `git diff --cached` in your shell, captures the output, and injects it directly into the prompt at that location.\n- **Security note** -- The first time a shell command inside `!{...}` runs, Gemini CLI will prompt you for confirmation before executing it.\n\n### 3. Reload commands (if Gemini CLI is already running)\n\nInside an active Gemini CLI session, run:\n\n```\n/commands reload\n```\n\nThis picks up new or modified `.toml` files without restarting the CLI.\n\n### 4. 
Usage\n\nStage your changes and invoke the command:\n\n```bash\ngit add -p # stage your changes\ngemini # start Gemini CLI\n```\n\nThen inside the Gemini CLI session:\n\n```\n/commit\n```\n\nGemini will read your staged diff, analyze the changes, and generate a commit message for you.\n\n## Namespaced Variant\n\nIf you prefer organizing your commands by category, use subdirectories. For example, placing the file at:\n\n```\n~/.gemini/commands/git/commit.toml\n```\n\nmakes it available as `/git:commit` instead of `/commit`. Subdirectory separators are converted to colons in the command name.\n\n## Adding Arguments Support\n\nYou can extend the command to accept optional context using `{{args}}`:\n\n```toml\ndescription = \"Generate a commit message from staged changes with optional context\"\nprompt = \"\"\"Review the following staged git diff and write a clear, conventional commit message. Follow the Conventional Commits format (e.g., feat:, fix:, refactor:).\n\nAdditional context from the developer: {{args}}\n\n```diff\n!{git diff --cached}\n```\"\"\"\n```\n\nThis allows you to provide extra context when invoking the command:\n\n```\n/commit This change addresses the performance regression in the auth module\n```\n\nThe `{{args}}` placeholder gets replaced with whatever text you type after the command name. 
Inside `!{...}` blocks, arguments are automatically shell-escaped for safety.\n\n## Summary of Key Concepts\n\n| Concept | Detail |\n|---|---|\n| Global location | `~/.gemini/commands/` |\n| Project location | `<project-root>/.gemini/commands/` |\n| File format | TOML (`.toml` extension) |\n| Shell execution | `!{command}` syntax, requires confirmation |\n| Arguments | `{{args}}` placeholder |\n| File injection | `@{path/to/file}` syntax |\n| Namespacing | Subdirectories map to colon-separated names |\n| Precedence | Project commands override global commands with the same name |\n\n## Sources\n\n- [Custom commands - Gemini CLI Documentation](https://geminicli.com/docs/cli/custom-commands/)\n- [Gemini CLI: Custom slash commands - Google Cloud Blog](https://cloud.google.com/blog/topics/developers-practitioners/gemini-cli-custom-slash-commands)\n- [CLI Commands - Gemini CLI](https://google-gemini.github.io/gemini-cli/docs/cli/commands.html)\n- [Gemini CLI Tutorial Series Part 7: Custom slash commands - Medium](https://medium.com/google-cloud/gemini-cli-tutorial-series-part-7-custom-slash-commands-64c06195294b)\n"}], "grading": {"eval_id": 3, "eval_name": "custom-slash-command", "config": "without_skill", "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": true, "evidence": "States '~/.gemini/commands/' as global location in both instructions and summary table"}, {"text": "Shows .toml file format with prompt field", "passed": true, "evidence": "Complete .toml example with description and prompt fields"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": true, "evidence": "Uses !{git diff --cached} in the prompt field and explains the shell command injection block syntax"}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": false, "evidence": "No mention of headless mode limitations anywhere in the response. 
Missing this critical caveat."}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": true, "evidence": "Complete commit.toml file at ~/.gemini/commands/commit.toml with description and prompt fields"}]}}], "previous_feedback": {}, "previous_outputs": {"eval-1-cicd-setup-with_skill": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\nHere is a complete guide to configuring Gemini CLI in your CI/CD pipeline to automatically review pull requests with structured JSON output.\n\n---\n\n## Prerequisites\n\n1. A Gemini API key (get one at https://aistudio.google.com/apikey).\n2. The API key stored as a GitHub Actions secret named `GEMINI_API_KEY`.\n\n---\n\n## Step 1: Basic GitHub Actions Workflow\n\nCreate `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n pull-requests: write\n contents: read\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout code\n uses: actions/checkout@v4\n with:\n fetch-depth: 0 # Full history so we can diff against base\n\n - name: Set up Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @google/gemini-cli\n\n - name: Get PR diff\n id: diff\n run: |\n git fetch origin ${{ github.base_ref }}\n git diff origin/${{ github.base_ref }}...HEAD > /tmp/pr-diff.txt\n echo \"diff_size=$(wc -c < /tmp/pr-diff.txt)\" >> \"$GITHUB_OUTPUT\"\n\n - name: Run Gemini PR Review\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n REVIEW_JSON=$(cat /tmp/pr-diff.txt | gemini -p \"You are a senior code reviewer. Review the following pull request diff for:\n 1. Bugs and logical errors\n 2. Security vulnerabilities\n 3. Performance issues\n 4. 
Code style and best practices\n\n Provide your review as structured text with sections for each category.\n If no issues are found in a category, say 'No issues found.'\n End with an overall summary and a verdict: APPROVE, REQUEST_CHANGES, or COMMENT.\" \\\n --output-format json)\n\n # Extract the response text from the JSON envelope\n REVIEW_TEXT=$(echo \"$REVIEW_JSON\" | jq -r '.response')\n\n # Extract token stats for logging\n TOTAL_TOKENS=$(echo \"$REVIEW_JSON\" | jq '.stats.models | to_entries[0].value.tokens.total // 0')\n echo \"Tokens used: $TOTAL_TOKENS\"\n\n # Save review to file (handles multi-line safely)\n echo \"$REVIEW_TEXT\" > /tmp/review-output.txt\n\n # Save for the summary\n echo \"$REVIEW_TEXT\" >> \"$GITHUB_STEP_SUMMARY\"\n\n - name: Post review as PR comment\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n const review = fs.readFileSync('/tmp/review-output.txt', 'utf8');\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body: `## Gemini Code Review\\n\\n${review}\\n\\n---\\n*Automated review by Gemini CLI*`\n });\n```\n\nThis workflow triggers on every PR event, runs the diff through Gemini CLI with `--output-format json`, extracts the response with `jq`, and posts it as a PR comment.\n\n---\n\n## Step 2: Advanced Workflow with Full JSON Parsing\n\nIf you want to parse the structured JSON output more thoroughly -- for example, to extract token usage, check for errors, or conditionally fail the build -- use this expanded version:\n\n```yaml\nname: Gemini PR Review (Advanced)\n\non:\n pull_request:\n types: [opened, synchronize]\n\npermissions:\n pull-requests: write\n contents: read\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Setup Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI and 
jq\n run: |\n npm install -g @google/gemini-cli\n # jq is pre-installed on ubuntu-latest, but just in case:\n which jq || sudo apt-get install -y jq\n\n - name: Run structured review\n id: gemini\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n\n # Skip review if diff is empty\n if [ -z \"$DIFF\" ]; then\n echo \"No changes to review.\"\n echo \"skip=true\" >> \"$GITHUB_OUTPUT\"\n exit 0\n fi\n\n # Run Gemini with JSON output\n RESULT=$(echo \"$DIFF\" | gemini -p \"You are a code review bot. Analyze this PR diff and respond with ONLY a valid JSON object (no markdown fences) using this exact schema:\n {\n \\\"verdict\\\": \\\"APPROVE\\\" | \\\"REQUEST_CHANGES\\\" | \\\"COMMENT\\\",\n \\\"summary\\\": \\\"Brief overall summary\\\",\n \\\"bugs\\\": [{ \\\"file\\\": \\\"path\\\", \\\"line\\\": number, \\\"description\\\": \\\"...\\\" }],\n \\\"security\\\": [{ \\\"file\\\": \\\"path\\\", \\\"severity\\\": \\\"low|medium|high|critical\\\", \\\"description\\\": \\\"...\\\" }],\n \\\"performance\\\": [{ \\\"file\\\": \\\"path\\\", \\\"description\\\": \\\"...\\\" }],\n \\\"suggestions\\\": [{ \\\"file\\\": \\\"path\\\", \\\"description\\\": \\\"...\\\" }]\n }\n Return empty arrays if no issues found in a category.\" \\\n --output-format json)\n\n # The outer JSON has a .response field containing the model's text.\n # The model's text itself is a JSON string we need to parse.\n REVIEW_RESPONSE=$(echo \"$RESULT\" | jq -r '.response')\n\n # Parse the inner JSON (the actual review)\n echo \"$REVIEW_RESPONSE\" | jq '.' 
> /tmp/review.json\n\n # Extract fields for downstream steps\n VERDICT=$(cat /tmp/review.json | jq -r '.verdict')\n SUMMARY=$(cat /tmp/review.json | jq -r '.summary')\n BUG_COUNT=$(cat /tmp/review.json | jq '.bugs | length')\n SECURITY_COUNT=$(cat /tmp/review.json | jq '.security | length')\n CRITICAL_COUNT=$(cat /tmp/review.json | jq '[.security[] | select(.severity == \"critical\")] | length')\n\n echo \"verdict=$VERDICT\" >> \"$GITHUB_OUTPUT\"\n echo \"bug_count=$BUG_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"security_count=$SECURITY_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"critical_count=$CRITICAL_COUNT\" >> \"$GITHUB_OUTPUT\"\n echo \"skip=false\" >> \"$GITHUB_OUTPUT\"\n\n # Log token usage from the Gemini stats envelope\n TOKENS=$(echo \"$RESULT\" | jq '.stats.models | to_entries[0].value.tokens.total // 0')\n LATENCY=$(echo \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalLatencyMs // 0')\n echo \"Tokens used: $TOKENS, Latency: ${LATENCY}ms\"\n\n - name: Format and post review comment\n if: steps.gemini.outputs.skip != 'true'\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n const review = JSON.parse(fs.readFileSync('/tmp/review.json', 'utf8'));\n\n let body = `## Gemini Code Review\\n\\n`;\n body += `**Verdict:** ${review.verdict}\\n\\n`;\n body += `**Summary:** ${review.summary}\\n\\n`;\n\n if (review.bugs.length > 0) {\n body += `### Bugs (${review.bugs.length})\\n`;\n for (const bug of review.bugs) {\n body += `- **${bug.file}** (line ${bug.line}): ${bug.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.security.length > 0) {\n body += `### Security Issues (${review.security.length})\\n`;\n for (const issue of review.security) {\n const emoji = issue.severity === 'critical' ? '\u00f0\u0178\u201d\u00b4' :\n issue.severity === 'high' ? '\u00f0\u0178\u0178\u00a0' :\n issue.severity === 'medium' ? 
'\u00f0\u0178\u0178\u00a1' : '\u00f0\u0178\u0178\u00a2';\n body += `- ${emoji} **[${issue.severity.toUpperCase()}]** ${issue.file}: ${issue.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.performance.length > 0) {\n body += `### Performance (${review.performance.length})\\n`;\n for (const perf of review.performance) {\n body += `- **${perf.file}**: ${perf.description}\\n`;\n }\n body += '\\n';\n }\n\n if (review.suggestions.length > 0) {\n body += `### Suggestions (${review.suggestions.length})\\n`;\n for (const sug of review.suggestions) {\n body += `- **${sug.file}**: ${sug.description}\\n`;\n }\n body += '\\n';\n }\n\n body += `---\\n*Automated review by Gemini CLI*`;\n\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body\n });\n\n - name: Fail on critical security issues\n if: steps.gemini.outputs.critical_count != '0' && steps.gemini.outputs.skip != 'true'\n run: |\n echo \"::error::Gemini found ${{ steps.gemini.outputs.critical_count }} critical security issue(s). 
Review required.\"\n exit 1\n```\n\n---\n\n## Understanding the JSON Output\n\nGemini CLI's `--output-format json` wraps the model's response in an envelope with this schema:\n\n```json\n{\n \"response\": \"The model's text response (string)\",\n \"stats\": {\n \"models\": {\n \"gemini-2.5-pro\": {\n \"api\": {\n \"totalRequests\": 2,\n \"totalErrors\": 0,\n \"totalLatencyMs\": 5053\n },\n \"tokens\": {\n \"prompt\": 24939,\n \"candidates\": 20,\n \"total\": 25113,\n \"cached\": 21263,\n \"thoughts\": 154,\n \"tool\": 0\n }\n }\n },\n \"tools\": {\n \"totalCalls\": 1,\n \"totalSuccess\": 1,\n \"totalFail\": 0,\n \"totalDurationMs\": 1881\n },\n \"files\": {\n \"totalLinesAdded\": 5,\n \"totalLinesRemoved\": 2\n }\n }\n}\n```\n\nTo parse this in your pipeline:\n\n```bash\n# Get the model's text response\necho \"$RESULT\" | jq -r '.response'\n\n# Get total tokens used\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.tokens.total'\n\n# Get API latency\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalLatencyMs'\n\n# Check for API errors\necho \"$RESULT\" | jq '.stats.models | to_entries[0].value.api.totalErrors'\n```\n\n---\n\n## Exit Codes for CI Logic\n\nGemini CLI returns meaningful exit codes you can use for conditional logic:\n\n| Exit Code | Meaning |\n|-----------|---------|\n| 0 | Success |\n| 1 | General error |\n| 2 | Authentication error (bad or missing API key) |\n| 3 | Rate limit / quota exceeded |\n\nExample usage:\n\n```bash\ngemini -p \"Review this code\" --output-format json > /tmp/result.json 2>/dev/null\nEXIT_CODE=$?\n\nif [ $EXIT_CODE -eq 2 ]; then\n echo \"::error::Gemini API key is invalid or missing. Check your GEMINI_API_KEY secret.\"\n exit 1\nelif [ $EXIT_CODE -eq 3 ]; then\n echo \"::warning::Gemini rate limit reached. Skipping review.\"\n exit 0\nelif [ $EXIT_CODE -ne 0 ]; then\n echo \"::warning::Gemini CLI failed with exit code $EXIT_CODE. 
Skipping review.\"\n exit 0\nfi\n```\n\n---\n\n## Tips and Best Practices\n\n1. **Use `--output-format json` (not `stream-json`)** for CI. The `json` format gives you a single complete JSON object after all processing is done. The `stream-json` format emits newline-delimited JSON events in real-time, which is harder to parse in a script.\n\n2. **Authentication**: Set `GEMINI_API_KEY` as an environment variable. Gemini CLI picks it up automatically. No `settings.json` or `.env` file is needed in CI.\n\n3. **Model selection**: Add `-m gemini-2.5-flash` for faster, cheaper reviews or `-m gemini-2.5-pro` for deeper analysis:\n ```bash\n gemini -m gemini-2.5-flash -p \"Review this diff\" --output-format json\n ```\n\n4. **Large diffs**: Gemini models support up to 1M tokens of context. For very large PRs, consider filtering the diff to only changed files that matter:\n ```bash\n git diff origin/main...HEAD -- '*.py' '*.js' '*.ts' | gemini -p \"Review\" --output-format json\n ```\n\n5. **File references with `@` syntax**: You can reference specific files directly in your prompt:\n ```bash\n gemini -p \"Review @./src/auth.py and @./src/middleware.py for security issues\" --output-format json\n ```\n\n6. **Do NOT use `--yolo` for reviews**. The `--yolo` flag auto-accepts all tool actions (file writes, shell commands). For a read-only review, you do not need it. Only use `--yolo` if you want Gemini to make actual code changes.\n\n7. **Rate limits on the free tier**: 60 requests/minute, 1,000 requests/day. For high-volume repos, consider using a paid API key or adding concurrency limits to your workflow.\n\n8. 
**Debug logging**: If reviews are producing unexpected output, add `--debug-log` to capture full API traces:\n ```bash\n gemini -p \"Review this code\" --output-format json --debug-log ./gemini-debug.jsonl\n ```\n Then upload `gemini-debug.jsonl` as a workflow artifact for inspection.\n\n---\n\n## Minimal Quick-Start Example\n\nIf you just want the simplest possible setup to get started:\n\n```yaml\nname: Gemini Review\non: [pull_request]\npermissions:\n pull-requests: write\n contents: read\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/checkout@v4\n with:\n fetch-depth: 0\n - run: npm install -g @google/gemini-cli\n - name: Review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n REVIEW=$(git diff origin/${{ github.base_ref }}...HEAD | gemini -p \"Review these changes for bugs and issues\" --output-format json | jq -r '.response')\n echo \"$REVIEW\" >> $GITHUB_STEP_SUMMARY\n```\n\nThis pipes the PR diff into Gemini CLI, extracts the review text from the JSON output with `jq`, and writes it to the GitHub Actions step summary where it is visible directly on the workflow run page.\n"}], "eval-1-cicd-setup-without_skill": [{"name": "response.md", "type": "text", "content": "# Setting Up Gemini CLI for Automated PR Reviews in GitHub Actions\n\nThis guide covers how to configure Gemini CLI in a GitHub Actions CI/CD pipeline to automatically review pull requests, using an API key from GitHub Secrets and structured JSON output for parsing.\n\n## Prerequisites\n\n- A Google AI API key (from [Google AI Studio](https://aistudio.google.com/apikey))\n- The API key stored as a GitHub repository secret (e.g., `GEMINI_API_KEY`)\n- A GitHub repository with pull request workflows\n\n## Step 1: Store Your API Key in GitHub Secrets\n\n1. Go to your repository on GitHub.\n2. Navigate to **Settings > Secrets and variables > Actions**.\n3. Click **New repository secret**.\n4. 
Name it `GEMINI_API_KEY` and paste your API key as the value.\n\n## Step 2: Create the GitHub Actions Workflow\n\nCreate a file at `.github/workflows/gemini-pr-review.yml`:\n\n```yaml\nname: Gemini PR Review\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout repository\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Set up Node.js\n uses: actions/setup-node@v4\n with:\n node-version: '20'\n\n - name: Install Gemini CLI\n run: npm install -g @anthropic-ai/gemini-cli || npm install -g gemini-cli\n\n - name: Get PR diff\n id: diff\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD)\n # Write diff to a file to avoid shell escaping issues\n echo \"$DIFF\" > /tmp/pr_diff.txt\n\n - name: Get changed files list\n id: files\n run: |\n git fetch origin ${{ github.base_ref }}\n FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)\n echo \"$FILES\" > /tmp/changed_files.txt\n\n - name: Run Gemini CLI review\n id: gemini-review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n DIFF=$(cat /tmp/pr_diff.txt)\n FILES=$(cat /tmp/changed_files.txt)\n\n PROMPT=$(cat <<'PROMPT_EOF'\n You are a senior code reviewer. 
Analyze the following pull request diff and provide a structured review.\n\n Changed files:\n FILE_LIST_PLACEHOLDER\n\n Diff:\n DIFF_PLACEHOLDER\n\n Respond ONLY with valid JSON in this exact format (no markdown fencing, no extra text):\n {\n \"summary\": \"Brief summary of the changes\",\n \"risk_level\": \"low|medium|high\",\n \"issues\": [\n {\n \"file\": \"path/to/file\",\n \"line\": 42,\n \"severity\": \"error|warning|suggestion\",\n \"message\": \"Description of the issue\",\n \"suggestion\": \"How to fix it\"\n }\n ],\n \"positive_aspects\": [\"List of things done well\"],\n \"approval_recommendation\": \"approve|request_changes|comment\"\n }\n PROMPT_EOF\n )\n\n # Replace placeholders\n PROMPT=\"${PROMPT/FILE_LIST_PLACEHOLDER/$FILES}\"\n PROMPT=\"${PROMPT/DIFF_PLACEHOLDER/$DIFF}\"\n\n # Run Gemini CLI with the prompt\n # Using the --json flag if available, or parsing output directly\n RESPONSE=$(echo \"$PROMPT\" | gemini \\\n --model gemini-2.0-flash \\\n --api-key \"$GEMINI_API_KEY\" \\\n 2>/dev/null) || true\n\n # If gemini CLI is not available, fall back to direct API call\n if [ -z \"$RESPONSE\" ]; then\n RESPONSE=$(curl -s \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${GEMINI_API_KEY}\" \\\n -H 'Content-Type: application/json' \\\n -d \"$(jq -n --arg prompt \"$PROMPT\" '{\n \"contents\": [{\"parts\": [{\"text\": $prompt}]}],\n \"generationConfig\": {\n \"responseMimeType\": \"application/json\",\n \"temperature\": 0.2\n }\n }')\" | jq -r '.candidates[0].content.parts[0].text')\n fi\n\n echo \"$RESPONSE\" > /tmp/review_result.json\n\n # Validate JSON\n if jq empty /tmp/review_result.json 2>/dev/null; then\n echo \"valid_json=true\" >> $GITHUB_OUTPUT\n else\n echo \"valid_json=false\" >> $GITHUB_OUTPUT\n echo \"Raw response was not valid JSON, attempting extraction...\"\n # Try to extract JSON from markdown fenced blocks\n sed -n '/^```json/,/^```$/p' /tmp/review_result.json | sed '1d;$d' > 
/tmp/review_cleaned.json\n if jq empty /tmp/review_cleaned.json 2>/dev/null; then\n mv /tmp/review_cleaned.json /tmp/review_result.json\n echo \"valid_json=true\" >> $GITHUB_OUTPUT\n fi\n fi\n\n - name: Parse and post review\n if: success()\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n REVIEW_FILE=\"/tmp/review_result.json\"\n\n if ! jq empty \"$REVIEW_FILE\" 2>/dev/null; then\n echo \"Failed to get valid JSON review. Posting raw output as comment.\"\n gh pr comment ${{ github.event.pull_request.number }} \\\n --body \"## Gemini Review\n\n > Could not parse structured output. Raw response attached.\n\n \\`\\`\\`\n $(cat \"$REVIEW_FILE\")\n \\`\\`\\`\"\n exit 0\n fi\n\n # Extract fields from JSON\n SUMMARY=$(jq -r '.summary' \"$REVIEW_FILE\")\n RISK=$(jq -r '.risk_level' \"$REVIEW_FILE\")\n APPROVAL=$(jq -r '.approval_recommendation' \"$REVIEW_FILE\")\n POSITIVES=$(jq -r '.positive_aspects // [] | map(\"- \" + .) | join(\"\\n\")' \"$REVIEW_FILE\")\n ISSUE_COUNT=$(jq -r '.issues | length' \"$REVIEW_FILE\")\n\n # Build issue table\n ISSUES_MD=\"\"\n if [ \"$ISSUE_COUNT\" -gt 0 ]; then\n ISSUES_MD=\"### Issues Found ($ISSUE_COUNT)\n\n | Severity | File | Line | Message | Suggestion |\n |----------|------|------|---------|------------|\n \"\n ISSUES_MD+=$(jq -r '.issues[] | \"| \\(.severity) | `\\(.file)` | \\(.line) | \\(.message) | \\(.suggestion) |\"' \"$REVIEW_FILE\")\n else\n ISSUES_MD=\"### No issues found\"\n fi\n\n # Build risk badge\n case \"$RISK\" in\n low) RISK_BADGE=\"![Risk: Low](https://img.shields.io/badge/Risk-Low-green)\" ;;\n medium) RISK_BADGE=\"![Risk: Medium](https://img.shields.io/badge/Risk-Medium-yellow)\" ;;\n high) RISK_BADGE=\"![Risk: High](https://img.shields.io/badge/Risk-High-red)\" ;;\n *) RISK_BADGE=\"Risk: $RISK\" ;;\n esac\n\n # Compose comment\n COMMENT=$(cat <<EOF\n ## Gemini Automated PR Review\n\n $RISK_BADGE\n\n ### Summary\n $SUMMARY\n\n $ISSUES_MD\n\n ### Positive Aspects\n $POSITIVES\n\n ---\n **Recommendation:** 
\\`$APPROVAL\\`\n EOF\n )\n\n # Post comment to PR\n gh pr comment ${{ github.event.pull_request.number }} --body \"$COMMENT\"\n```\n\n## Step 3: Direct API Approach with Structured JSON Output (Recommended)\n\nIf the Gemini CLI does not reliably produce JSON, use the Gemini REST API directly with `responseMimeType` set to `application/json`. This forces the model to return valid JSON. Here is a standalone workflow that uses `curl` and `jq`:\n\n```yaml\nname: Gemini PR Review (API Direct)\n\non:\n pull_request:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: read\n pull-requests: write\n\njobs:\n review:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout\n uses: actions/checkout@v4\n with:\n fetch-depth: 0\n\n - name: Generate review via Gemini API\n id: review\n env:\n GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}\n run: |\n git fetch origin ${{ github.base_ref }}\n DIFF=$(git diff origin/${{ github.base_ref }}...HEAD | head -c 30000)\n FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD)\n\n # Build the prompt\n PROMPT=\"You are a code reviewer. Review this PR diff. Changed files: ${FILES}. Diff: ${DIFF}. 
Return a JSON object with keys: summary (string), risk_level (low/medium/high), issues (array of {file, line, severity, message, suggestion}), positive_aspects (array of strings), approval_recommendation (approve/request_changes/comment).\"\n\n # Call Gemini API with structured output\n RESPONSE=$(curl -sf \\\n \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${GEMINI_API_KEY}\" \\\n -H 'Content-Type: application/json' \\\n -d \"$(jq -n --arg prompt \"$PROMPT\" '{\n contents: [{parts: [{text: $prompt}]}],\n generationConfig: {\n responseMimeType: \"application/json\",\n responseSchema: {\n type: \"object\",\n properties: {\n summary: {type: \"string\"},\n risk_level: {type: \"string\", enum: [\"low\", \"medium\", \"high\"]},\n issues: {\n type: \"array\",\n items: {\n type: \"object\",\n properties: {\n file: {type: \"string\"},\n line: {type: \"integer\"},\n severity: {type: \"string\", enum: [\"error\", \"warning\", \"suggestion\"]},\n message: {type: \"string\"},\n suggestion: {type: \"string\"}\n },\n required: [\"file\", \"severity\", \"message\"]\n }\n },\n positive_aspects: {type: \"array\", items: {type: \"string\"}},\n approval_recommendation: {type: \"string\", enum: [\"approve\", \"request_changes\", \"comment\"]}\n },\n required: [\"summary\", \"risk_level\", \"issues\", \"approval_recommendation\"]\n },\n temperature: 0.2\n }\n }')\")\n\n # Extract the text content from the API response\n REVIEW=$(echo \"$RESPONSE\" | jq -r '.candidates[0].content.parts[0].text')\n echo \"$REVIEW\" > /tmp/review.json\n\n # Validate\n jq empty /tmp/review.json\n\n - name: Post review comment\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n REVIEW_FILE=\"/tmp/review.json\"\n\n SUMMARY=$(jq -r '.summary' \"$REVIEW_FILE\")\n RISK=$(jq -r '.risk_level' \"$REVIEW_FILE\")\n APPROVAL=$(jq -r '.approval_recommendation' \"$REVIEW_FILE\")\n\n # Format issues as markdown\n ISSUES=$(jq -r '\n if (.issues | length) > 0 then\n 
\"| Severity | File | Line | Issue | Fix |\\n|---|---|---|---|---|\\n\" +\n (.issues[] | \"| \\(.severity) | `\\(.file)` | \\(.line // \"N/A\") | \\(.message) | \\(.suggestion // \"N/A\") |\")\n else\n \"_No issues found._\"\n end\n ' \"$REVIEW_FILE\")\n\n POSITIVES=$(jq -r '(.positive_aspects // []) | map(\"- \" + .) | join(\"\\n\")' \"$REVIEW_FILE\")\n\n BODY=$(cat <<EOF\n ## Automated PR Review (Gemini)\n\n **Risk Level:** \\`${RISK}\\` | **Recommendation:** \\`${APPROVAL}\\`\n\n ### Summary\n ${SUMMARY}\n\n ### Issues\n ${ISSUES}\n\n ### What looks good\n ${POSITIVES}\n EOF\n )\n\n gh pr comment \"${{ github.event.pull_request.number }}\" --body \"$BODY\"\n```\n\n## Step 4: Using Gemini CLI with `--json` Flag (If Supported)\n\nSome versions of the Gemini CLI support a `--json` output flag. If your version does, the invocation simplifies to:\n\n```bash\ngemini --model gemini-2.0-flash \\\n --api-key \"$GEMINI_API_KEY\" \\\n --json \\\n --prompt \"Review this code diff and return structured JSON: $DIFF\"\n```\n\nCheck your installed version's help output (`gemini --help`) to confirm available flags.\n\n## Step 5: Parsing the JSON Output in Downstream Steps\n\nOnce you have valid JSON in a file, you can use it in subsequent pipeline steps:\n\n```yaml\n - name: Fail on high-risk reviews\n run: |\n RISK=$(jq -r '.risk_level' /tmp/review.json)\n ERRORS=$(jq '[.issues[] | select(.severity == \"error\")] | length' /tmp/review.json)\n\n echo \"Risk level: $RISK\"\n echo \"Error count: $ERRORS\"\n\n if [ \"$RISK\" = \"high\" ] || [ \"$ERRORS\" -gt 0 ]; then\n echo \"::error::PR review found high risk or errors. 
Please address the issues.\"\n exit 1\n fi\n```\n\nYou can also use it in a matrix strategy or pass data between jobs:\n\n```yaml\n - name: Export review data\n id: export\n run: |\n echo \"risk=$(jq -r '.risk_level' /tmp/review.json)\" >> $GITHUB_OUTPUT\n echo \"approval=$(jq -r '.approval_recommendation' /tmp/review.json)\" >> $GITHUB_OUTPUT\n echo \"issue_count=$(jq '.issues | length' /tmp/review.json)\" >> $GITHUB_OUTPUT\n```\n\nThen reference in later steps or jobs:\n\n```yaml\n - name: Auto-approve if clean\n if: steps.export.outputs.approval == 'approve' && steps.export.outputs.issue_count == '0'\n env:\n GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n run: |\n gh pr review \"${{ github.event.pull_request.number }}\" --approve --body \"Automated review passed with no issues.\"\n```\n\n## Key Points\n\n1. **`responseMimeType: \"application/json\"`** in the Gemini API `generationConfig` forces the model to return valid JSON. This is the most reliable way to get structured output.\n\n2. **`responseSchema`** lets you define the exact JSON schema you expect. The model will conform to it. This eliminates the need for post-processing or regex extraction.\n\n3. **Diff size limits**: Git diffs can be very large. Truncate them (e.g., `head -c 30000`) to stay within token limits. For large PRs, consider reviewing changed files individually.\n\n4. **Error handling**: Always validate the JSON output with `jq empty` before trying to parse it. Have a fallback path that posts raw output as a comment.\n\n5. **Rate limits**: The Gemini API has rate limits. For high-traffic repos, add retry logic or use the paid tier.\n\n6. **Security**: The `GEMINI_API_KEY` secret is never exposed in logs because GitHub Actions automatically masks secrets in output. The `GITHUB_TOKEN` is provided automatically by Actions with the permissions declared in the workflow.\n\n## Troubleshooting\n\n- **\"jq: parse error\"**: The model returned non-JSON text. 
Use `responseMimeType: \"application/json\"` to fix this.\n- **Empty response**: Check that `GEMINI_API_KEY` is set correctly in secrets and that the model name is valid.\n- **Diff too large**: Truncate the diff or split the review across multiple API calls, one per file.\n- **Permission denied on PR comment**: Ensure `permissions: pull-requests: write` is set in the workflow.\n"}], "eval-2-mcp-server-config-with_skill": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\nYou need to configure your server in `settings.json`, pass the API key via the `env` field, and use `includeTools` to restrict which tools are exposed. Here is the full walkthrough.\n\n---\n\n## 1. Configure the MCP Server in settings.json\n\nEdit (or create) `~/.gemini/settings.json` for a global configuration, or `<your-project>/.gemini/settings.json` for project-scoped configuration. Add your server under the `mcpServers` key:\n\n```json\n{\n \"mcpServers\": {\n \"myPythonServer\": {\n \"command\": \"python\",\n \"args\": [\"-m\", \"my_mcp_server\"],\n \"cwd\": \"./mcp_tools\",\n \"env\": {\n \"MY_API_KEY\": \"$MY_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"search_records\", \"get_status\"]\n }\n }\n}\n```\n\n### What each field does\n\n| Field | Type | Description |\n|---|---|---|\n| `command` | string | The executable to run. Use `\"python\"` (or `\"python3\"` depending on your system). |\n| `args` | string[] | Arguments passed to the command. `[\"-m\", \"my_mcp_server\"]` runs your package as a module. You can also use a script path like `[\"./server.py\"]`. |\n| `cwd` | string | Working directory for the server process. Optional -- set this if your server needs to run from a specific directory. |\n| `env` | object | Environment variables passed to the server process. Use the `\"$VAR_NAME\"` syntax to reference variables from your shell environment. 
|\n| `timeout` | number | Connection timeout in milliseconds. Default is 10000 (10 seconds). Increase if your server takes longer to start. |\n| `trust` | boolean | When `true`, skips all tool confirmation dialogs. Keep this `false` unless you fully control and trust the server. |\n| `includeTools` | string[] | **Whitelist** -- only these tools will be exposed to Gemini. If set, all other tools from the server are hidden. |\n| `excludeTools` | string[] | **Blacklist** -- these specific tools will be hidden. All others remain available. |\n\n**Important:** `includeTools` and `excludeTools` are alternative filtering approaches, and you normally need only one of them (if a tool is listed in both, `excludeTools` takes precedence):\n- Use `includeTools` when your server exposes many tools but you only want a few available.\n- Use `excludeTools` when you want most tools but need to block specific dangerous ones.\n\n---\n\n## 2. Pass the API Key Securely\n\nNever hardcode secrets directly in `settings.json`. Instead, use the `\"$VAR_NAME\"` pattern to reference environment variables from your shell:\n\n```json\n\"env\": {\n \"MY_API_KEY\": \"$MY_API_KEY\"\n}\n```\n\nThen set the variable in your shell before running Gemini:\n\n```bash\nexport MY_API_KEY=\"sk-your-actual-api-key-here\"\ngemini\n```\n\nAlternatively, you can store the variable in a `.env` file that Gemini loads automatically:\n\n- `~/.gemini/.env` -- global env file\n- `./.gemini/.env` -- project-level env file\n\nExample `.gemini/.env`:\n\n```\nMY_API_KEY=sk-your-actual-api-key-here\n```\n\n**Security note:** Gemini CLI automatically redacts sensitive env vars (names containing `TOKEN`, `SECRET`, `KEY`, etc.) from MCP processes unless you explicitly list them in the `env` field. So you must include the variable in `env` for it to reach your server.\n\n---\n\n## 3. 
Full Working Example\n\nSuppose you have a Python MCP server at `~/mcp-servers/analytics_server.py` that exposes five tools (`query_data`, `generate_report`, `delete_records`, `export_csv`, `run_migration`), and you only want `query_data` and `generate_report` available. The server needs an `ANALYTICS_API_KEY`.\n\n### settings.json\n\n```json\n{\n \"mcpServers\": {\n \"analytics\": {\n \"command\": \"python3\",\n \"args\": [\"analytics_server.py\"],\n \"cwd\": \"/home/user/mcp-servers\",\n \"env\": {\n \"ANALYTICS_API_KEY\": \"$ANALYTICS_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"includeTools\": [\"query_data\", \"generate_report\"]\n }\n }\n}\n```\n\n### Shell setup\n\n```bash\nexport ANALYTICS_API_KEY=\"your-key-here\"\ngemini\n```\n\n### Using the server in a Gemini session\n\nOnce inside the interactive session, you can verify the server connected:\n\n```\n/mcp list\n/mcp status\n```\n\nThen invoke the tools by addressing the server:\n\n```\n> @analytics Query monthly active users for Q4 2025\n> @analytics Generate a report on user retention trends\n```\n\n---\n\n## 4. Alternative: Use excludeTools Instead\n\nIf you want all tools available except specific dangerous ones:\n\n```json\n{\n \"mcpServers\": {\n \"analytics\": {\n \"command\": \"python3\",\n \"args\": [\"analytics_server.py\"],\n \"cwd\": \"/home/user/mcp-servers\",\n \"env\": {\n \"ANALYTICS_API_KEY\": \"$ANALYTICS_API_KEY\"\n },\n \"timeout\": 15000,\n \"trust\": false,\n \"excludeTools\": [\"delete_records\", \"run_migration\"]\n }\n }\n}\n```\n\nThis exposes everything except `delete_records` and `run_migration`.\n\n---\n\n## 5. Adding the Server via CLI (Alternative to Manual Editing)\n\nYou can also add MCP servers from the command line:\n\n```bash\ngemini mcp add analytics --command \"python3 analytics_server.py\"\n```\n\nHowever, the CLI `add` command provides less control over fields like `env`, `includeTools`, and `cwd`. 
For full configuration, editing `settings.json` directly is recommended.\n\nTo manage servers later:\n\n```bash\ngemini mcp list # list all configured servers\ngemini mcp remove analytics # remove a server\n```\n\nOr from within an interactive session:\n\n```\n/mcp list\n/mcp enable analytics\n/mcp disable analytics\n```\n\n---\n\n## 6. Minimal Python MCP Server Skeleton\n\nFor reference, here is the basic structure of a Python MCP server that your configuration would launch:\n\n```python\n# analytics_server.py\nimport os\nimport json\nimport sys\n\n# Read the API key passed through the env config\napi_key = os.environ.get(\"ANALYTICS_API_KEY\")\nif not api_key:\n print(\"Error: ANALYTICS_API_KEY not set\", file=sys.stderr)\n sys.exit(1)\n\n# Your MCP server implementation here\n# The server communicates over stdio using the MCP protocol\n# Tools like query_data, generate_report, etc. are registered\n# and handled according to the MCP specification.\n```\n\n---\n\n## Summary\n\n1. Add your server to `mcpServers` in `~/.gemini/settings.json` (or project-level).\n2. Pass the API key using `\"env\": { \"MY_API_KEY\": \"$MY_API_KEY\" }` -- never hardcode it.\n3. Restrict tools with `\"includeTools\": [\"tool1\", \"tool2\"]` (whitelist) or `\"excludeTools\": [\"bad_tool\"]` (blacklist).\n4. Set the actual environment variable in your shell or in a `.gemini/.env` file.\n5. Verify the connection with `/mcp list` and `/mcp status` inside an interactive session.\n"}], "eval-2-mcp-server-config-without_skill": [{"name": "response.md", "type": "text", "content": "# Connecting a Custom Python MCP Server to Gemini CLI\n\n## Overview\n\nGemini CLI supports MCP (Model Context Protocol) servers through its settings configuration. You can connect a custom Python MCP server, pass environment variables (like API keys), and restrict which tools are exposed.\n\n## 1. Build Your Python MCP Server\n\nFirst, make sure your Python MCP server uses the official MCP SDK. 
Here is a minimal example:\n\n```python\n# my_mcp_server.py\nimport os\nfrom mcp.server.fastmcp import FastMCP\n\n# Read the API key from environment\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nif not API_KEY:\n raise ValueError(\"MY_API_KEY environment variable is required\")\n\nmcp = FastMCP(\"my-custom-server\")\n\n@mcp.tool()\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API using the configured API key.\"\"\"\n # Use API_KEY here for your requests\n return f\"Results for '{query}' using key ending in ...{API_KEY[-4:]}\"\n\n@mcp.tool()\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n@mcp.tool()\ndef internal_debug(info: str) -> str:\n \"\"\"Internal debug tool - you may want to hide this from Gemini.\"\"\"\n return f\"Debug: {info}\"\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\nInstall the dependency:\n\n```bash\npip install mcp\n```\n\n## 2. Configure Gemini CLI to Use Your MCP Server\n\nGemini CLI reads MCP server configuration from a `settings.json` file. 
There are two levels:\n\n- **Global (user-level):** `~/.gemini/settings.json`\n- **Project-level:** `.gemini/settings.json` in your project root\n\n### Basic Configuration\n\nCreate or edit `~/.gemini/settings.json` (or `.gemini/settings.json` in your project):\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"cwd\": \"/absolute/path/to/your/server/directory\",\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\n### Configuration Fields Explained\n\n| Field | Description |\n|-------|-------------|\n| `command` | The executable to run (e.g., `python`, `python3`, `node`, `npx`) |\n| `args` | Array of arguments passed to the command |\n| `cwd` | Working directory for the server process (optional but recommended) |\n| `env` | Environment variables passed to the server process |\n\n### Using a Virtual Environment\n\nIf your MCP server has its own virtual environment:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"/path/to/venv/bin/python\",\n \"args\": [\"/absolute/path/to/my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\nOn Windows, use the Windows path to the venv Python:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"C:\\\\path\\\\to\\\\venv\\\\Scripts\\\\python.exe\",\n \"args\": [\"C:\\\\path\\\\to\\\\my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-value-here\"\n }\n }\n }\n}\n```\n\n## 3. 
Passing the API Key Securely\n\nThere are several approaches to handle the API key:\n\n### Option A: Inline in settings.json (simplest, but key is in plaintext)\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"sk-abc123your-actual-key\"\n }\n }\n }\n}\n```\n\n### Option B: Reference a shell variable at launch time\n\nSet the environment variable in your shell profile (`.bashrc`, `.zshrc`, etc.):\n\n```bash\nexport MY_API_KEY=\"sk-abc123your-actual-key\"\n```\n\nThen in `settings.json`, you can omit the `env` block entirely if the variable is already in your shell environment. The child process inherits the parent environment. However, to be explicit:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"${MY_API_KEY}\"\n }\n }\n }\n}\n```\n\n> **Note:** Whether `${MY_API_KEY}` variable interpolation is supported depends on the Gemini CLI version. If it is not supported, you may need to use a wrapper script approach (see Option C).\n\n### Option C: Wrapper script (most flexible)\n\nCreate a wrapper script `run_server.sh`:\n\n```bash\n#!/bin/bash\n# run_server.sh\n# Load API key from a secure location\nsource ~/.secrets/my_api_keys.env\nexport MY_API_KEY\nexec python /path/to/my_mcp_server.py \"$@\"\n```\n\nThen configure:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"bash\",\n \"args\": [\"/path/to/run_server.sh\"]\n }\n }\n}\n```\n\n## 4. Restricting Which Tools Are Exposed\n\nThere are two approaches to restricting tools: server-side and client-side.\n\n### Approach A: Server-Side Restriction (Recommended)\n\nOnly define the tools you want exposed in your MCP server. 
Simply remove or comment out any tools you do not want Gemini to access:\n\n```python\n# my_mcp_server.py - only expose the tools you want\nfrom mcp.server.fastmcp import FastMCP\nimport os\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nmcp = FastMCP(\"my-custom-server\")\n\n@mcp.tool()\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API.\"\"\"\n return f\"Results for '{query}'\"\n\n@mcp.tool()\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n# internal_debug is NOT registered - Gemini will never see it\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\n### Approach B: Conditional Tool Registration\n\nUse an environment variable or config file to control which tools are registered:\n\n```python\n# my_mcp_server.py - conditionally register tools\nfrom mcp.server.fastmcp import FastMCP\nimport os\nimport json\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nALLOWED_TOOLS = os.environ.get(\"ALLOWED_TOOLS\", \"\").split(\",\")\n\nmcp = FastMCP(\"my-custom-server\")\n\ndef register_if_allowed(tool_name):\n \"\"\"Decorator factory that only registers a tool if it's in the allowed list.\"\"\"\n def decorator(func):\n if not ALLOWED_TOOLS or not ALLOWED_TOOLS[0]:\n # No restriction set, register all tools\n return mcp.tool()(func)\n elif tool_name in ALLOWED_TOOLS:\n return mcp.tool()(func)\n return func\n return decorator\n\n@register_if_allowed(\"fetch_data\")\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API.\"\"\"\n return f\"Results for '{query}'\"\n\n@register_if_allowed(\"analyze_data\")\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data.\"\"\"\n return f\"Analysis of: {data}\"\n\n@register_if_allowed(\"internal_debug\")\ndef internal_debug(info: str) -> str:\n \"\"\"Internal debug tool.\"\"\"\n return f\"Debug: {info}\"\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\nThen in your `settings.json`, pass the allowed tools 
list:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-here\",\n \"ALLOWED_TOOLS\": \"fetch_data,analyze_data\"\n }\n }\n }\n}\n```\n\nThis way, `internal_debug` is never registered and Gemini cannot call it.\n\n### Approach C: Gemini CLI Tool Filtering (if supported)\n\nSome MCP clients support an `allowedTools` or `toolFilter` field in the server configuration. If Gemini CLI supports this, the configuration would look like:\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"env\": {\n \"MY_API_KEY\": \"your-api-key-here\"\n },\n \"toolFilter\": {\n \"allow\": [\"fetch_data\", \"analyze_data\"]\n }\n }\n }\n}\n```\n\n> **Note:** The exact field name and syntax for client-side tool filtering varies by MCP client implementation. Check the Gemini CLI documentation for the exact syntax. If Gemini CLI does not support this field, use the server-side approaches (A or B) described above.\n\n## 5. 
Complete Working Example\n\n### Directory Structure\n\n```\nmy-mcp-project/\n my_mcp_server.py\n requirements.txt\n .gemini/\n settings.json\n```\n\n### requirements.txt\n\n```\nmcp>=1.0.0\n```\n\n### my_mcp_server.py\n\n```python\nimport os\nfrom mcp.server.fastmcp import FastMCP\n\nAPI_KEY = os.environ.get(\"MY_API_KEY\")\nif not API_KEY:\n raise ValueError(\"MY_API_KEY environment variable must be set\")\n\nALLOWED_TOOLS = [t.strip() for t in os.environ.get(\"ALLOWED_TOOLS\", \"\").split(\",\") if t.strip()]\n\nmcp = FastMCP(\"my-custom-server\")\n\n\ndef expose(tool_name):\n \"\"\"Only register the tool if it passes the allowlist filter.\"\"\"\n def decorator(func):\n if not ALLOWED_TOOLS or tool_name in ALLOWED_TOOLS:\n return mcp.tool()(func)\n return func\n return decorator\n\n\n@expose(\"fetch_data\")\ndef fetch_data(query: str) -> str:\n \"\"\"Fetch data from the external API using the configured API key.\"\"\"\n # Your real implementation here\n return f\"Fetched results for '{query}'\"\n\n\n@expose(\"analyze_data\")\ndef analyze_data(data: str) -> str:\n \"\"\"Analyze the provided data and return insights.\"\"\"\n return f\"Analysis complete for: {data}\"\n\n\n@expose(\"dangerous_reset\")\ndef dangerous_reset() -> str:\n \"\"\"Reset all data - restricted tool.\"\"\"\n return \"Data reset complete\"\n\n\nif __name__ == \"__main__\":\n mcp.run()\n```\n\n### .gemini/settings.json\n\n```json\n{\n \"mcpServers\": {\n \"my-custom-server\": {\n \"command\": \"python\",\n \"args\": [\"my_mcp_server.py\"],\n \"cwd\": \"/absolute/path/to/my-mcp-project\",\n \"env\": {\n \"MY_API_KEY\": \"sk-your-api-key-here\",\n \"ALLOWED_TOOLS\": \"fetch_data,analyze_data\"\n }\n }\n }\n}\n```\n\nWith this configuration:\n- Gemini CLI launches your Python MCP server as a subprocess\n- The `MY_API_KEY` environment variable is passed securely to the server process\n- Only `fetch_data` and `analyze_data` are registered as tools; `dangerous_reset` is excluded\n- Gemini can discover 
and invoke the allowed tools during conversation\n\n## 6. Testing Your Server\n\nBefore connecting to Gemini CLI, test your server standalone:\n\n```bash\n# Test that the server starts correctly\nMY_API_KEY=\"test-key\" python my_mcp_server.py\n\n# Or with tool restrictions\nMY_API_KEY=\"test-key\" ALLOWED_TOOLS=\"fetch_data\" python my_mcp_server.py\n```\n\nYou can also use the MCP Inspector tool to verify your server works:\n\n```bash\nnpx @modelcontextprotocol/inspector python my_mcp_server.py\n```\n\nThen launch Gemini CLI in your project directory. It will automatically pick up the `.gemini/settings.json` and start your MCP server.\n\n## 7. Troubleshooting\n\n| Issue | Solution |\n|-------|----------|\n| Server not found | Use absolute paths in `command` and `args` |\n| API key not available | Verify the `env` block in settings.json; check with a print/log statement |\n| Tools not showing up | Confirm the tool functions are decorated with `@mcp.tool()` and not filtered out |\n| Permission errors | Ensure the Python executable and script have correct permissions |\n| Server crashes on start | Run the server manually from the terminal first to see error output |\n| Wrong Python version | Use the full path to the correct Python interpreter in `command` |\n"}], "eval-3-custom-slash-command-with_skill": [{"name": "response.md", "type": "text", "content": "# Creating a Global Custom Slash Command for AI Commit Messages in Gemini CLI\n\nYou can create a custom slash command by writing a `.toml` file in the global commands directory. 
Since you want it available in all your projects, place it under `~/.gemini/commands/`.\n\n## Step 1: Create the Commands Directory\n\n```bash\nmkdir -p ~/.gemini/commands\n```\n\n## Step 2: Create the Slash Command TOML File\n\nCreate the file `~/.gemini/commands/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff --cached}\n\"\"\"\n```\n\n### How This Works\n\n- The `prompt` field defines the instruction sent to Gemini when the command is invoked.\n- `!{git diff --cached}` is a **shell execution block** -- Gemini CLI runs the shell command inside `!{...}` and inlines its output into the prompt before sending it to the model.\n- `description` is optional but shows up when you run `/help` in an interactive session.\n\n## Step 3: Use the Command\n\nStart Gemini CLI in any project that has staged git changes:\n\n```bash\ncd your-project\ngit add -A # or stage specific files\ngemini\n```\n\nThen in the interactive session, type:\n\n```\n/commit\n```\n\nGemini will execute `git diff --cached`, include the diff in the prompt, and return a conventional commit message.\n\n## Namespaced Alternative\n\nIf you prefer a namespaced command like `/git:commit`, use a subdirectory:\n\n```bash\nmkdir -p ~/.gemini/commands/git\n```\n\nCreate `~/.gemini/commands/git/commit.toml`:\n\n```toml\ndescription = \"Generate a commit message from staged changes\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff 
--cached}\n\"\"\"\n```\n\nThen invoke it with:\n\n```\n/git:commit\n```\n\n## Using It with Arguments\n\nYou can also make the command accept arguments with `{{args}}` to allow the user to pass additional context:\n\n```toml\ndescription = \"Generate a commit message from staged changes with optional context\"\nprompt = \"\"\"\nWrite a concise conventional commit message for the following staged changes.\n\nUse the format: <type>(<scope>): <description>\n\nWhere type is one of: feat, fix, docs, style, refactor, test, chore\n\nAdditional context from the user: {{args}}\n\nOnly output the commit message itself, nothing else.\n\nStaged diff:\n!{git diff --cached}\n\"\"\"\n```\n\nThis lets you provide extra hints:\n\n```\n/commit focus on the auth refactoring\n```\n\n## Fully Automated Shell Script (Bonus)\n\nIf you want to go beyond interactive mode and automate the entire commit flow from your shell, you can create a shell script. Note that custom slash commands currently do **not** work in headless mode (`-p` flag), so for automation you need to embed the prompt directly:\n\n```bash\n#!/bin/bash\n# Save as ~/bin/ai-commit or anywhere on your PATH\n\nDIFF=$(git diff --cached)\n\nif [ -z \"$DIFF\" ]; then\n echo \"Error: Nothing staged. Run 'git add' first.\"\n exit 1\nfi\n\nMSG=$(echo \"$DIFF\" | gemini -p \"Write a concise conventional commit message (type(scope): description format, max 72 chars). Only output the message, nothing else.\" --output-format json | jq -r '.response')\n\nif [ -z \"$MSG\" ]; then\n echo \"Error: Failed to generate commit message.\"\n exit 1\nfi\n\necho \"Proposed commit message:\"\necho \" $MSG\"\necho \"\"\nread -p \"Commit with this message? 
[y/N] \" confirm\n\nif [[ \"$confirm\" =~ ^[Yy]$ ]]; then\n git commit -m \"$MSG\"\n echo \"Committed.\"\nelse\n echo \"Aborted.\"\nfi\n```\n\nMake it executable:\n\n```bash\nchmod +x ~/bin/ai-commit\n```\n\nThen use it from any project:\n\n```bash\ngit add -A\nai-commit\n```\n\n## Summary of File Locations\n\n| Path | Scope | Invocation |\n|---|---|---|\n| `~/.gemini/commands/commit.toml` | Global (all projects) | `/commit` |\n| `~/.gemini/commands/git/commit.toml` | Global, namespaced | `/git:commit` |\n| `<project>/.gemini/commands/commit.toml` | Project only | `/commit` |\n\nThe global location (`~/.gemini/commands/`) is what makes the command available across all your projects without any per-project setup.\n"}], "eval-3-custom-slash-command-without_skill": [{"name": "response.md", "type": "text", "content": "# Creating a Custom Slash Command in Gemini CLI for Commit Messages\n\n## Overview\n\nGemini CLI supports custom slash commands through its extensions system. To make a command available across all your projects, you place it in the global configuration directory at `~/.gemini/extensions/`. Extensions are defined using `gemini_extension.json` manifest files that describe the command and its behavior.\n\n## Step-by-Step Setup\n\n### 1. Create the Global Extensions Directory\n\n```bash\nmkdir -p ~/.gemini/extensions/commit-message\n```\n\n### 2. Create the Shell Script\n\nCreate `~/.gemini/extensions/commit-message/generate-commit-msg.sh`:\n\n```bash\n#!/bin/bash\n# generate-commit-msg.sh\n# Gets the staged diff and outputs it for the AI to process\n\nDIFF=$(git diff --cached)\n\nif [ -z \"$DIFF\" ]; then\n echo \"ERROR: No staged changes found. 
Stage your changes with 'git add' first.\"\n exit 1\nfi\n\necho \"Here is the staged git diff:\"\necho \"\"\necho '```diff'\necho \"$DIFF\"\necho '```'\necho \"\"\necho \"Please write a concise, conventional commit message for these changes.\"\necho \"Follow the Conventional Commits format: type(scope): description\"\necho \"\"\necho \"Types: feat, fix, docs, style, refactor, perf, test, build, ci, chore\"\necho \"\"\necho \"Provide:\"\necho \"1. A single-line subject (max 72 chars)\"\necho \"2. A blank line\"\necho \"3. An optional body explaining the 'why' behind the changes\"\n```\n\nMake it executable:\n\n```bash\nchmod +x ~/.gemini/extensions/commit-message/generate-commit-msg.sh\n```\n\n### 3. Create the Extension Manifest\n\nCreate `~/.gemini/extensions/commit-message/gemini_extension.json`:\n\n```json\n{\n \"name\": \"commit-message\",\n \"description\": \"Generate a commit message from staged changes using git diff --cached\",\n \"version\": \"1.0.0\",\n \"commands\": [\n {\n \"name\": \"commit-msg\",\n \"description\": \"Analyze staged git changes and generate a conventional commit message\",\n \"action\": {\n \"type\": \"shell\",\n \"command\": \"~/.gemini/extensions/commit-message/generate-commit-msg.sh\"\n }\n }\n ]\n}\n```\n\n### 4. Usage\n\nOnce configured, open Gemini CLI in any project and type:\n\n```\n/commit-msg\n```\n\nThe command will:\n1. Run `git diff --cached` to capture your staged changes\n2. Pass the diff output into the conversation context\n3. Ask Gemini to generate a conventional commit message based on the diff\n\n## Alternative Approach: Using GEMINI.md with a Prompt-Based Workflow\n\nIf extensions are not behaving as expected in your version of Gemini CLI, you can achieve a similar result using a global `GEMINI.md` instruction file combined with a simple shell alias.\n\n### 1. 
Create a Global GEMINI.md\n\nCreate or edit `~/.gemini/GEMINI.md`:\n\n```markdown\n# Custom Commands\n\n## Commit Message Generation\n\nWhen I say \"generate commit message\" or provide a git diff, follow these rules:\n\n1. Analyze the diff carefully\n2. Write a commit message in Conventional Commits format\n3. Use the format: type(scope): short description\n4. Include a body if the changes are complex\n5. Keep the subject line under 72 characters\n6. Use imperative mood (\"add feature\" not \"added feature\")\n```\n\n### 2. Create a Shell Helper Script\n\nAdd to your `~/.bashrc` or `~/.zshrc`:\n\n```bash\n# Gemini CLI commit message helper\ngcm() {\n local diff\n diff=$(git diff --cached)\n\n if [ -z \"$diff\" ]; then\n echo \"No staged changes. Use 'git add' to stage files first.\"\n return 1\n fi\n\n echo \"Staged diff:\"\n echo '```diff'\n echo \"$diff\"\n echo '```'\n echo \"\"\n echo \"Generate a conventional commit message for the above changes.\"\n}\n```\n\nThen in Gemini CLI, you can run:\n\n```\n@run gcm\n```\n\nOr pipe the output directly:\n\n```bash\n# From your regular terminal, pipe into gemini\ngcm | gemini\n```\n\n## Alternative Approach: Inline Tool Use\n\nGemini CLI allows you to invoke shell commands inline. You can simply type within a Gemini CLI session:\n\n```\nPlease run `git diff --cached` and write me a commit message based on the output.\n```\n\nSince Gemini CLI has the ability to execute shell commands (with your approval), it will run the diff, read the output, and generate a commit message. 
This requires no configuration at all, but it is a manual process each time rather than a single slash command.\n\n## Applying the Generated Commit Message\n\nOnce Gemini generates a commit message you like, you can apply it directly:\n\n```bash\ngit commit -m \"feat(auth): add OAuth2 token refresh logic\n\nImplement automatic token refresh when access tokens expire.\nUses refresh token rotation for improved security.\"\n```\n\nOr ask Gemini CLI to run the commit for you:\n\n```\nPlease commit with that message.\n```\n\n## Directory Structure Summary\n\n```\n~/.gemini/\n GEMINI.md # Global instructions (optional)\n extensions/\n commit-message/\n gemini_extension.json # Extension manifest\n generate-commit-msg.sh # Shell script for the command\n```\n\n## Tips\n\n- **Scope control**: Placing extensions in `~/.gemini/extensions/` makes them global. For project-specific commands, place them in `.gemini/extensions/` within the project root.\n- **Customizing the prompt**: Edit the echo statements at the end of `generate-commit-msg.sh` to adjust the instructions given to the AI (e.g., require a specific commit style, add ticket number references, etc.).\n- **Large diffs**: If the staged diff is very large, consider truncating it or summarizing file-level changes to stay within context limits. You can modify the script to include `--stat` output alongside a truncated diff.\n- **Security**: The shell script only reads data (via `git diff --cached`). 
It does not modify your repository or make commits on its own.\n\n## Troubleshooting\n\n- **\"Command not found\"**: Make sure the shell script is executable (`chmod +x`) and the path in `gemini_extension.json` is correct.\n- **\"No staged changes\"**: Run `git add <files>` before invoking the command.\n- **Extension not loading**: Verify the JSON manifest is valid (`cat ~/.gemini/extensions/commit-message/gemini_extension.json | python3 -m json.tool`) and restart Gemini CLI.\n- **Path issues on Windows**: Use forward slashes or the `%USERPROFILE%` equivalent in paths. The extensions directory would be at `%USERPROFILE%\\.gemini\\extensions\\`.\n"}]}, "benchmark": {"metadata": {"skill_name": "gemini-cli", "skill_path": "src/skills/gemini-cli/", "executor_model": "claude-opus-4-6", "analyzer_model": "claude-opus-4-6", "timestamp": "2026-03-11T18:15:00Z", "evals_run": [1, 2, 3], "runs_per_configuration": 3}, "runs": [{"eval_id": 1, "eval_name": "cicd-setup", "configuration": "with_skill", "run_number": 1, "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6, "time_seconds": 53.6, "tokens": 64216}, "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": true, "evidence": "Line 53: gemini -p with --output-format json"}, {"text": "Includes --output-format json flag for structured output", "passed": true, "evidence": "Line 62: --output-format json flag used correctly"}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Line 42: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Complete workflow YAML with checkout, node setup, install, review, and comment steps"}, {"text": "Shows how to parse JSON response (e.g., with jq .response)", "passed": true, "evidence": "Line 65: jq -r '.response' and line 68: token stats extraction"}, {"text": "Does not invent non-existent CLI flags or options", 
"passed": true, "evidence": "Only uses verified flags: -p and --output-format json. Exit codes match corrected values."}], "notes": []}, {"eval_id": 1, "eval_name": "cicd-setup", "configuration": "without_skill", "run_number": 1, "result": {"pass_rate": 0.6667, "passed": 4, "failed": 2, "total": 6, "time_seconds": 68.0, "tokens": 59322}, "expectations": [{"text": "Uses -p or --prompt flag for headless/non-interactive mode", "passed": true, "evidence": "Line 96: gemini -p"}, {"text": "Includes --output-format json flag for structured output", "passed": false, "evidence": "Does not use --output-format json. Line 95 comment mentions '--json flag if available' which is not a real flag."}, {"text": "Shows how to set GEMINI_API_KEY from GitHub secrets", "passed": true, "evidence": "Line 62: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}"}, {"text": "Provides a GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Complete workflow YAML provided"}, {"text": "Shows how to parse JSON response (e.g., with jq .response)", "passed": true, "evidence": "Uses jq for JSON validation and parsing"}, {"text": "Does not invent non-existent CLI flags or options", "passed": false, "evidence": "Line 95: references '--json flag if available' which is not a real Gemini CLI flag"}], "notes": []}, {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "with_skill", "run_number": 1, "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6, "time_seconds": 47.3, "tokens": 66522}, "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "Opens with ~/.gemini/settings.json (global) or <project>/.gemini/settings.json (project-scoped)"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Full JSON config with mcpServers, command, args, env, timeout, trust, includeTools"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": 
true, "evidence": "Shows \"API_KEY\": \"$MY_API_KEY\" pattern"}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": true, "evidence": "Dedicated section for both includeTools and excludeTools with examples"}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": true, "evidence": "Explains trust field, warns never to hardcode secrets, mentions env var redaction"}, {"text": "Uses correct Python command (python/python3) in the command field", "passed": true, "evidence": "Uses \"command\": \"python\" consistently"}], "notes": []}, {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "without_skill", "run_number": 1, "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6, "time_seconds": 86.7, "tokens": 61417}, "expectations": [{"text": "Points to ~/.gemini/settings.json or project-level settings.json", "passed": true, "evidence": "States ~/.gemini/settings.json (global) and .gemini/settings.json (project-level)"}, {"text": "Shows mcpServers config with command, args fields", "passed": true, "evidence": "Full JSON config with mcpServers, command, args, env"}, {"text": "Shows $VAR pattern for environment variable references in env field", "passed": true, "evidence": "Shows both $MY_API_KEY and ${ANOTHER_ENV_VAR} syntax"}, {"text": "Explains includeTools and/or excludeTools for restricting exposed tools", "passed": true, "evidence": "Dedicated sections for includeTools and excludeTools with examples"}, {"text": "Mentions security considerations (trust field, not hardcoding keys, etc.)", "passed": true, "evidence": "Mentions trust field, automatic redaction, env variable security"}, {"text": "Uses correct Python command (python/python3) in the command field", "passed": true, "evidence": "Uses \"command\": \"python\" consistently"}], "notes": []}, {"eval_id": 3, "eval_name": "custom-slash-command", "configuration": "with_skill", "run_number": 1, "result": 
{"pass_rate": 1.0, "passed": 5, "failed": 0, "total": 5, "time_seconds": 43.7, "tokens": 63047}, "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": true, "evidence": "States ~/.gemini/commands/commit.toml for global commands"}, {"text": "Shows .toml file format with prompt field", "passed": true, "evidence": "Complete .toml example with description and prompt fields"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": true, "evidence": "Uses !{git diff --cached} and explains the syntax"}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": true, "evidence": "States 'Custom slash commands work in interactive mode only. They are not available in headless mode (-p flag).'"}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": true, "evidence": "Complete commit.toml ready to copy-paste"}], "notes": []}, {"eval_id": 3, "eval_name": "custom-slash-command", "configuration": "without_skill", "run_number": 1, "result": {"pass_rate": 0.8, "passed": 4, "failed": 1, "total": 5, "time_seconds": 73.6, "tokens": 59908}, "expectations": [{"text": "Points to ~/.gemini/commands/ for global user-scoped commands", "passed": true, "evidence": "States ~/.gemini/commands/ for global commands"}, {"text": "Shows .toml file format with prompt field", "passed": true, "evidence": "Complete .toml example with description and prompt fields"}, {"text": "Uses !{git diff --cached} shell execution syntax in the prompt", "passed": true, "evidence": "Uses !{git diff --cached} in the prompt field"}, {"text": "Warns that custom slash commands don't work in headless/non-interactive mode", "passed": false, "evidence": "No mention of headless mode limitations anywhere in the response"}, {"text": "Provides a complete, copy-pasteable .toml file example", "passed": true, "evidence": "Complete commit.toml file provided"}], "notes": []}], "run_summary": 
{"with_skill": {"pass_rate": {"mean": 1.0, "stddev": 0.0, "min": 1.0, "max": 1.0}, "time_seconds": {"mean": 48.2, "stddev": 5.0, "min": 43.7, "max": 53.6}, "tokens": {"mean": 64595, "stddev": 1741, "min": 63047, "max": 66522}}, "without_skill": {"pass_rate": {"mean": 0.822, "stddev": 0.167, "min": 0.667, "max": 1.0}, "time_seconds": {"mean": 76.1, "stddev": 9.7, "min": 68.0, "max": 86.7}, "tokens": {"mean": 60216, "stddev": 1372, "min": 59322, "max": 61417}}, "delta": {"pass_rate": "+0.18", "time_seconds": "-27.9", "tokens": "+4379"}}, "notes": ["With-skill achieves 100% pass rate across all 3 evals (17/17 assertions) vs 82.2% baseline (14/17)", "Eval 2 (MCP server config) is non-discriminating \u2014 both configs pass all 6 assertions. This eval could be made harder.", "With-skill is ~28s faster on average despite using ~4k more tokens, suggesting more focused responses", "Baseline failures are accuracy-related: hallucinated --json flag (eval 1) and missing headless limitation warning (eval 3)", "The skill's value is strongest on accuracy assertions \u2014 it prevents hallucination of non-existent flags and ensures important caveats are mentioned"]}};
650
-
651
- // ---- State ----
652
// Index (into EMBEDDED_DATA.runs) of the run currently on screen.
let currentIndex = 0;
// Indices of the runs that have been shown at least once.
let visitedRuns = new Set();
// Reviewer feedback text keyed by run id.
let feedbackMap = {};
655
-
656
- // ---- Init ----
657
/**
 * Bootstrap the review page.
 *
 * Optionally restores saved feedback from the server, fills in the skill
 * name, shows the first run and wires up debounced auto-save on the
 * feedback textarea.
 */
async function init() {
  // Saved feedback is only restored when this is not a fresh iteration.
  // If previous feedback/outputs are embedded, the feedback.json on disk is
  // stale (left over from the prior iteration) and must not pre-fill the
  // textarea.
  const previousFeedback = EMBEDDED_DATA.previous_feedback || {};
  const isFreshIteration =
    Object.keys(previousFeedback).length > 0 ||
    Object.keys(EMBEDDED_DATA.previous_outputs || {}).length > 0;

  if (!isFreshIteration) {
    try {
      const response = await fetch("/api/feedback");
      const saved = await response.json();
      if (saved.reviews) {
        for (const review of saved.reviews) {
          feedbackMap[review.run_id] = review.feedback;
        }
      }
    } catch {
      /* first run, no feedback yet */
    }
  }

  document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
  showRun(0);

  // Auto-save: every keystroke clears the status text and restarts an
  // 800 ms timer, so a save only fires once typing pauses.
  const feedbackBox = document.getElementById("feedback");
  let pendingSave = null;
  const onFeedbackInput = () => {
    clearTimeout(pendingSave);
    document.getElementById("feedback-status").textContent = "";
    pendingSave = setTimeout(() => {
      saveCurrentFeedback();
    }, 800);
  };
  feedbackBox.addEventListener("input", onFeedbackInput);
}
686
-
687
- // ---- Navigation ----
688
/**
 * Move `delta` runs away from the current one.
 *
 * Does nothing when the target index falls outside EMBEDDED_DATA.runs;
 * otherwise saves the current feedback before showing the target run.
 *
 * @param {number} delta - Offset from the current index (e.g. -1 or 1).
 */
function navigate(delta) {
  const target = currentIndex + delta;
  const inRange = target >= 0 && target < EMBEDDED_DATA.runs.length;
  if (!inRange) {
    return;
  }
  saveCurrentFeedback();
  showRun(target);
}
695
-
696
/**
 * Disable the "previous" button on the first run and the "next" button on
 * the last run; enable them otherwise.
 */
function updateNavButtons() {
  const onFirstRun = currentIndex === 0;
  document.getElementById("prev-btn").disabled = onFirstRun;

  const nextButton = document.getElementById("next-btn");
  const lastIndex = EMBEDDED_DATA.runs.length - 1;
  nextButton.disabled = currentIndex === lastIndex;
}
701
-
702
- // ---- Show a run ----
703
/**
 * Display the run at `index` and make it the current run.
 *
 * Updates the progress text, prompt, config badge, outputs, previous
 * outputs, grades, previous feedback and the feedback textarea, refreshes
 * the navigation buttons, records the run as visited and scrolls the main
 * panel back to the top.
 *
 * @param {number} index - Position of the run in EMBEDDED_DATA.runs.
 */
function showRun(index) {
  const byId = (id) => document.getElementById(id);

  currentIndex = index;
  const run = EMBEDDED_DATA.runs[index];

  // Progress ("n of total") and prompt text
  byId("progress").textContent = `${index + 1} of ${EMBEDDED_DATA.runs.length}`;
  byId("prompt-text").textContent = run.prompt;

  // Config badge: derived from the configuration name embedded in the run id
  const badge = byId("config-badge");
  const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
  if (!configMatch) {
    badge.style.display = "none";
  } else {
    const [, config] = configMatch;
    // "without_skill" / "old_skill" are the baseline side of a comparison.
    const variantClass =
      config === "without_skill" || config === "old_skill"
        ? "config-baseline"
        : "config-primary";
    badge.textContent = config.replace(/_/g, " ");
    badge.className = "config-badge " + variantClass;
    badge.style.display = "inline-block";
  }

  // Outputs, previous outputs, grades
  renderOutputs(run);
  renderPrevOutputs(run);
  renderGrades(run);

  // Previous feedback: only shown when there is some for this run
  const previousFeedback = (EMBEDDED_DATA.previous_feedback || {})[run.id];
  const previousPanel = byId("prev-feedback");
  if (previousFeedback) {
    byId("prev-feedback-text").textContent = previousFeedback;
  }
  previousPanel.style.display = previousFeedback ? "block" : "none";

  // Feedback textarea and its status line
  byId("feedback").value = feedbackMap[run.id] || "";
  byId("feedback-status").textContent = "";

  updateNavButtons();

  // Once every run has been visited, mark the done button as ready.
  visitedRuns.add(index);
  const doneButton = byId("done-btn");
  if (visitedRuns.size >= EMBEDDED_DATA.runs.length) {
    doneButton.classList.add("ready");
  }

  // Scroll main content to top
  document.querySelector(".main").scrollTop = 0;
}
762
-
763
- // ---- Render outputs ----
764
/**
 * Render the output files of `run` into the "outputs-body" element.
 *
 * Each file gets a header (name plus download link) and a content area
 * whose rendering depends on `file.type`: "text", "image", "pdf", "xlsx",
 * "binary" or "error". Any other type leaves the content area empty.
 * When the run has no outputs an empty-state placeholder is shown instead.
 *
 * @param {object} run - Run record; `run.outputs` is the list of files.
 */
function renderOutputs(run) {
  const container = document.getElementById("outputs-body");
  container.innerHTML = "";

  const outputs = run.outputs || [];
  if (outputs.length === 0) {
    container.innerHTML = '<div class="empty-state">No output files</div>';
    return;
  }

  // Create an element and assign the given properties in order.
  const el = (tag, props) => Object.assign(document.createElement(tag), props);

  for (const file of outputs) {
    const fileDiv = el("div", { className: "output-file" });

    // Header is always present: file name followed by a download link.
    const header = el("div", { className: "output-file-header" });
    header.appendChild(el("span", { textContent: file.name }));
    header.appendChild(
      el("a", {
        className: "dl-btn",
        textContent: "Download",
        download: file.name,
        href: getDownloadUri(file),
      })
    );
    fileDiv.appendChild(header);

    const body = el("div", { className: "output-file-content" });

    switch (file.type) {
      case "text":
        body.appendChild(el("pre", { textContent: file.content }));
        break;
      case "image":
        body.appendChild(el("img", { src: file.data_uri, alt: file.name }));
        break;
      case "pdf":
        body.appendChild(el("iframe", { src: file.data_uri }));
        break;
      case "xlsx":
        renderXlsx(body, file.data_b64);
        break;
      case "binary":
        body.appendChild(
          el("a", {
            className: "download-link",
            href: file.data_uri,
            download: file.name,
            textContent: "Download " + file.name,
          })
        );
        break;
      case "error": {
        // Error text is shown like plain text, but in red.
        const errorPre = el("pre", { textContent: file.content });
        errorPre.style.color = "var(--red)";
        body.appendChild(errorPre);
        break;
      }
      default:
        break;
    }

    fileDiv.appendChild(body);
    container.appendChild(fileDiv);
  }
}
828
-
829
- // ---- XLSX rendering via SheetJS ----
830
/**
 * Render a base64-encoded spreadsheet into `container` using the global
 * XLSX (SheetJS) object.
 *
 * Every sheet is converted to HTML and appended in its own wrapper div;
 * when the workbook has more than one sheet, each is preceded by a
 * "Sheet: <name>" label. Any error replaces the container text with an
 * error message.
 *
 * @param {Element} container - Element that receives the rendered sheets.
 * @param {string} b64Data - Base64-encoded workbook bytes.
 */
function renderXlsx(container, b64Data) {
  try {
    // Decode base64 into a byte array for XLSX.read.
    const bytes = Uint8Array.from(atob(b64Data), (ch) => ch.charCodeAt(0));
    const workbook = XLSX.read(bytes, { type: "array" });

    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];

      // Labels are only useful when there is more than one sheet.
      if (workbook.SheetNames.length > 1) {
        const label = document.createElement("div");
        label.style.cssText =
          "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
        label.textContent = "Sheet: " + sheetName;
        container.appendChild(label);
      }

      const sheetHtml = XLSX.utils.sheet_to_html(sheet, { editable: false });
      const sheetWrapper = document.createElement("div");
      sheetWrapper.innerHTML = sheetHtml;
      container.appendChild(sheetWrapper);
    }
  } catch (err) {
    container.textContent = "Error rendering spreadsheet: " + err.message;
  }
}
856
-
857
- // ---- Grades ----
858
/**
 * Render the grading summary and assertion list for `run` into the
 * (collapsed) grades section, or hide the section when the run has no
 * grading data.
 *
 * All values taken from the grading data are HTML-escaped before being
 * placed into innerHTML. A missing pass rate is shown as "?" with a
 * neutral badge rather than being styled as a failure.
 *
 * @param {object} run - Run record; `run.grading` may hold `summary`
 *   ({pass_rate, passed, failed, total}) and `expectations`
 *   ([{text, passed, evidence}]).
 */
function renderGrades(run) {
  const section = document.getElementById("grades-section");
  const content = document.getElementById("grades-content");

  if (!run.grading) {
    section.style.display = "none";
    return;
  }

  const grading = run.grading;
  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("grades-arrow").classList.remove("open");

  const summary = grading.summary || {};
  const expectations = grading.expectations || [];

  let html = '<div style="padding: 1rem;">';

  // Summary line
  const hasPassRate = summary.pass_rate != null;
  const passRate = hasPassRate
    ? Math.round(summary.pass_rate * 100) + "%"
    : "?";
  // >= 80% is a pass, 50-80% is neutral, anything else is a fail. An
  // unknown pass rate stays neutral: "?" is not evidence of failure.
  let badgeClass = "";
  if (hasPassRate) {
    if (summary.pass_rate >= 0.8) {
      badgeClass = "grade-pass";
    } else if (!(summary.pass_rate >= 0.5)) {
      badgeClass = "grade-fail";
    }
  }
  // Counts come from the grading data, so escape them like every other
  // data-derived value inserted below.
  const passedCount = escapeHtml(String(summary.passed || 0));
  const failedCount = escapeHtml(String(summary.failed || 0));
  const totalCount = escapeHtml(String(summary.total || 0));
  html += '<div class="grades-summary">';
  html += '<span class="grade-badge ' + badgeClass + '">' + escapeHtml(passRate) + '</span>';
  html += '<span>' + passedCount + ' passed, ' + failedCount + ' failed of ' + totalCount + '</span>';
  html += '</div>';

  // Assertions list
  html += '<ul class="assertion-list">';
  for (const exp of expectations) {
    const statusClass = exp.passed ? "pass" : "fail";
    const statusIcon = exp.passed ? "\u2713" : "\u2717";
    html += '<li class="assertion-item">';
    html += '<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>';
    html += '<span>' + escapeHtml(exp.text) + '</span>';
    if (exp.evidence) {
      html += '<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>';
    }
    html += '</li>';
  }
  html += '</ul>';

  html += '</div>';
  content.innerHTML = html;
}
906
-
907
/**
 * Expand or collapse the grades panel by toggling the "open" class on the
 * grades content element and on its arrow indicator.
 */
function toggleGrades() {
  for (const id of ["grades-content", "grades-arrow"]) {
    document.getElementById(id).classList.toggle("open");
  }
}
913
-
914
- // ---- Previous outputs (collapsible) ----
915
/**
 * Render the previous iteration's output files for `run` into the
 * collapsible "previous outputs" section, or hide the section when there
 * are none.
 *
 * Files are rendered the same way as current outputs, including "error"
 * files, which are shown as red preformatted text instead of an empty
 * content area.
 *
 * @param {object} run - Run record; `run.id` is the key into
 *   EMBEDDED_DATA.previous_outputs.
 */
function renderPrevOutputs(run) {
  const section = document.getElementById("prev-outputs-section");
  const content = document.getElementById("prev-outputs-content");
  const prevOutputs = (EMBEDDED_DATA.previous_outputs || {})[run.id];

  if (!prevOutputs || prevOutputs.length === 0) {
    section.style.display = "none";
    return;
  }

  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("prev-outputs-arrow").classList.remove("open");

  // Render the files into the content area
  content.innerHTML = "";
  const wrapper = document.createElement("div");
  wrapper.style.padding = "1rem";

  for (const file of prevOutputs) {
    const fileDiv = document.createElement("div");
    fileDiv.className = "output-file";

    // Header: file name followed by a download link
    const header = document.createElement("div");
    header.className = "output-file-header";
    const nameSpan = document.createElement("span");
    nameSpan.textContent = file.name;
    header.appendChild(nameSpan);
    const dlBtn = document.createElement("a");
    dlBtn.className = "dl-btn";
    dlBtn.textContent = "Download";
    dlBtn.download = file.name;
    dlBtn.href = getDownloadUri(file);
    header.appendChild(dlBtn);
    fileDiv.appendChild(header);

    const fc = document.createElement("div");
    fc.className = "output-file-content";

    if (file.type === "text") {
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      fc.appendChild(pre);
    } else if (file.type === "image") {
      const img = document.createElement("img");
      img.src = file.data_uri;
      img.alt = file.name;
      fc.appendChild(img);
    } else if (file.type === "pdf") {
      const iframe = document.createElement("iframe");
      iframe.src = file.data_uri;
      fc.appendChild(iframe);
    } else if (file.type === "xlsx") {
      renderXlsx(fc, file.data_b64);
    } else if (file.type === "binary") {
      const a = document.createElement("a");
      a.className = "download-link";
      a.href = file.data_uri;
      a.download = file.name;
      a.textContent = "Download " + file.name;
      fc.appendChild(a);
    } else if (file.type === "error") {
      // Show error text in red rather than leaving the content area blank.
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      pre.style.color = "var(--red)";
      fc.appendChild(pre);
    }

    fileDiv.appendChild(fc);
    wrapper.appendChild(fileDiv);
  }

  content.appendChild(wrapper);
}
985
-
986
/**
 * Expand or collapse the previous-outputs panel by toggling the "open"
 * class on its content element and on its arrow indicator.
 */
function togglePrevOutputs() {
  for (const id of ["prev-outputs-content", "prev-outputs-arrow"]) {
    document.getElementById(id).classList.toggle("open");
  }
}
992
-
993
- // ---- Feedback (saved to server -> feedback.json) ----
994
/**
 * Store the textarea contents as feedback for the current run and POST all
 * non-empty feedback to /api/feedback with status "in_progress".
 *
 * The status line shows "Saved" only when the server answered with a
 * success status. fetch() also resolves for HTTP error responses, so the
 * response is checked explicitly and "Save failed" is shown in that case.
 * When the request itself fails (static mode / no server) the status reads
 * "Will download on submit".
 */
function saveCurrentFeedback() {
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;

  // Whitespace-only feedback counts as "no feedback" for this run.
  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // Build reviews array from map
  const reviews = [];
  for (const [run_id, feedback] of Object.entries(feedbackMap)) {
    if (feedback.trim()) {
      reviews.push({ run_id, feedback, timestamp: new Date().toISOString() });
    }
  }

  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ reviews, status: "in_progress" }),
  }).then((response) => {
    // A resolved fetch only means a response arrived; 4xx/5xx still land here.
    document.getElementById("feedback-status").textContent =
      response.ok ? "Saved" : "Save failed";
  }).catch(() => {
    // Static mode or server unavailable — no-op on auto-save,
    // feedback will be downloaded on final submit
    document.getElementById("feedback-status").textContent = "Will download on submit";
  });
}
1024
-
1025
- // ---- Done ----
1026
/**
 * Finish the review: record the current textarea contents, then submit
 * feedback for ALL runs with status "complete" and show the done overlay.
 *
 * If the POST cannot be delivered — the request fails, or the server
 * answers with a non-success HTTP status — the payload is downloaded as
 * feedback.json instead, so the feedback is not lost.
 */
function showDoneDialog() {
  // Save current textarea to feedbackMap (but don't POST yet)
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;
  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // POST once with status: complete — include ALL runs so the model
  // can distinguish "no feedback" (looks good) from "not reviewed"
  const reviews = [];
  const ts = new Date().toISOString();
  for (const r of EMBEDDED_DATA.runs) {
    reviews.push({ run_id: r.id, feedback: feedbackMap[r.id] || "", timestamp: ts });
  }
  const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);
  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  }).then((response) => {
    // fetch() resolves for 4xx/5xx responses too; treat those as a failed
    // submit so the download fallback below still runs.
    if (!response.ok) {
      throw new Error("Feedback submit failed with HTTP " + response.status);
    }
    document.getElementById("done-overlay").classList.add("visible");
  }).catch(() => {
    // Server not available (static mode) or it rejected the request —
    // download as file
    const blob = new Blob([payload], { type: "application/json" });
    const url = URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.href = url;
    a.download = "feedback.json";
    a.click();
    URL.revokeObjectURL(url);
    document.getElementById("done-overlay").classList.add("visible");
  });
}
1062
-
1063
/**
 * Dismiss the done overlay. Feedback is re-saved first, which posts it
 * with status "in_progress" again.
 */
function closeDoneDialog() {
  // Reset status back to in_progress
  saveCurrentFeedback();

  const overlay = document.getElementById("done-overlay");
  overlay.classList.remove("visible");
}
1068
-
1069
- // ---- Toast ----
1070
/**
 * Show `message` in the toast element and hide it again after 2 seconds.
 *
 * @param {string} message - Text to display.
 */
function showToast(message) {
  const toastEl = document.getElementById("toast");
  const hideToast = () => toastEl.classList.remove("visible");

  toastEl.textContent = message;
  toastEl.classList.add("visible");
  setTimeout(hideToast, 2000);
}
1076
-
1077
- // ---- Keyboard nav ----
1078
// Arrow keys move between runs: Left/Up go back one run, Right/Down go
// forward one. Keys pressed inside a textarea are left alone so typing
// feedback is not hijacked.
document.addEventListener("keydown", (event) => {
  if (event.target.tagName === "TEXTAREA") return;

  let step;
  switch (event.key) {
    case "ArrowLeft":
    case "ArrowUp":
      step = -1;
      break;
    case "ArrowRight":
    case "ArrowDown":
      step = 1;
      break;
    default:
      // Any other key keeps its default browser behaviour.
      return;
  }

  event.preventDefault();
  navigate(step);
});
1090
-
1091
- // ---- Util ----
1092
/**
 * Pick the URI to use for a file's download link.
 *
 * Preference order: an existing data URI, a data URI built from base64
 * content, a percent-encoded text data URI for "text" files, and "#" when
 * none of those is available.
 *
 * @param {object} file - File record (data_uri, data_b64, type, content).
 * @returns {string} URI for the download link's href.
 */
function getDownloadUri(file) {
  if (file.data_uri) {
    return file.data_uri;
  }
  if (file.data_b64) {
    return "data:application/octet-stream;base64," + file.data_b64;
  }
  return file.type === "text"
    ? "data:text/plain;charset=utf-8," + encodeURIComponent(file.content)
    : "#";
}
1098
-
1099
/**
 * Escape `text` for safe insertion into HTML by assigning it as the text
 * content of a throwaway element and reading back the element's markup.
 *
 * @param {string} text - Raw text.
 * @returns {string} The element's innerHTML after the assignment.
 */
function escapeHtml(text) {
  const scratch = Object.assign(document.createElement("div"), { textContent: text });
  return scratch.innerHTML;
}
1104
-
1105
- // ---- View switching ----
1106
/**
 * Activate the tab and panel for `view`, deactivating all others.
 *
 * The tab is located through its inline onclick attribute and the panel
 * through the id "panel-<view>".
 *
 * @param {string} view - View name as used in the tab's onclick handler.
 */
function switchView(view) {
  const deactivate = (node) => node.classList.remove("active");
  document.querySelectorAll(".view-tab").forEach(deactivate);
  document.querySelectorAll(".view-panel").forEach(deactivate);

  const tab = document.querySelector(`[onclick="switchView('${view}')"]`);
  tab.classList.add("active");
  const panel = document.getElementById("panel-" + view);
  panel.classList.add("active");
}
1112
-
1113
- // ---- Benchmark rendering ----
1114
/**
 * Render the benchmark view from EMBEDDED_DATA.benchmark into the
 * #benchmark-content element.
 *
 * Does nothing when no benchmark data is embedded. Otherwise it reveals the
 * #view-tabs bar and builds, as a single HTML string:
 *   1. a header line from `metadata`,
 *   2. a summary table comparing the first two configurations found in
 *      `run_summary` (every key except "delta"),
 *   3. a per-eval breakdown with one row per run plus an average row per
 *      configuration, followed by a per-assertion pass/fail matrix,
 *   4. the list of analysis notes.
 *
 * Every value taken from the embedded data is HTML-escaped before it is
 * concatenated, because the result is assigned to innerHTML.
 */
function renderBenchmark() {
  const data = EMBEDDED_DATA.benchmark;
  if (!data) return;

  // Show the tabs
  document.getElementById("view-tabs").style.display = "flex";

  const container = document.getElementById("benchmark-content");
  const summary = data.run_summary || {};
  const metadata = data.metadata || {};
  const notes = data.notes || [];

  // Stringify first so numbers/undefined render exactly as plain concatenation would.
  const esc = value => escapeHtml(String(value));
  // Extra pass for double-quoted attribute values; a no-op if escapeHtml already escapes quotes.
  const quoteSafe = escaped => escaped.replace(/"/g, "&quot;");
  // "with_skill" -> "With Skill"
  const toLabel = name => String(name).replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());
  // Green at >= 80%, red below 50%, neutral otherwise (and for missing rates).
  const rateClass = rate =>
    rate >= 0.8 ? "benchmark-delta-positive" : rate < 0.5 ? "benchmark-delta-negative" : "";

  // Format a {mean, stddev} pair as "m ± s"; pct scales to whole percentages.
  function fmtStat(stat, pct) {
    if (!stat) return "—";
    // Incomplete stats render as a dash instead of throwing on .toFixed.
    if (typeof stat.mean !== "number" || typeof stat.stddev !== "number") return "—";
    const suffix = pct ? "%" : "";
    const m = pct ? (stat.mean * 100).toFixed(0) : stat.mean.toFixed(1);
    const s = pct ? (stat.stddev * 100).toFixed(0) : stat.stddev.toFixed(1);
    return m + suffix + " ± " + s + suffix;
  }

  // CSS class for a delta value based on the sign of its numeric prefix.
  function deltaClass(val) {
    if (!val) return "";
    const n = parseFloat(val);
    if (n > 0) return "benchmark-delta-positive";
    if (n < 0) return "benchmark-delta-negative";
    return "";
  }

  let html = "";

  // Header
  html += "<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>";
  html += "<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>";
  if (metadata.skill_name) html += "<strong>" + escapeHtml(metadata.skill_name) + "</strong> &mdash; ";
  if (metadata.timestamp) html += esc(metadata.timestamp) + " &mdash; ";
  if (metadata.evals_run) html += "Evals: " + esc(metadata.evals_run.join(", ")) + " &mdash; ";
  html += esc(metadata.runs_per_configuration || "?") + " runs per configuration";
  html += "</p>";

  // Summary table
  html += '<table class="benchmark-table">';

  // Discover config names dynamically (everything except "delta")
  const configs = Object.keys(summary).filter(k => k !== "delta");
  const configA = configs[0] || "config_a";
  const configB = configs[1] || "config_b";
  const labelA = toLabel(configA);
  const labelB = toLabel(configB);
  const a = summary[configA] || {};
  const b = summary[configB] || {};
  const delta = summary.delta || {};

  html += "<thead><tr><th>Metric</th><th>" + escapeHtml(labelA) + "</th><th>" + escapeHtml(labelB) + "</th><th>Delta</th></tr></thead>";
  html += "<tbody>";

  html += "<tr><td><strong>Pass Rate</strong></td>";
  html += "<td>" + fmtStat(a.pass_rate, true) + "</td>";
  html += "<td>" + fmtStat(b.pass_rate, true) + "</td>";
  html += '<td class="' + deltaClass(delta.pass_rate) + '">' + esc(delta.pass_rate || "—") + "</td></tr>";

  // Time (only show row if data exists)
  if (a.time_seconds || b.time_seconds) {
    html += "<tr><td><strong>Time (s)</strong></td>";
    html += "<td>" + fmtStat(a.time_seconds, false) + "</td>";
    html += "<td>" + fmtStat(b.time_seconds, false) + "</td>";
    html += '<td class="' + deltaClass(delta.time_seconds) + '">' + esc(delta.time_seconds ? delta.time_seconds + "s" : "—") + "</td></tr>";
  }

  // Tokens (only show row if data exists)
  if (a.tokens || b.tokens) {
    html += "<tr><td><strong>Tokens</strong></td>";
    html += "<td>" + fmtStat(a.tokens, false) + "</td>";
    html += "<td>" + fmtStat(b.tokens, false) + "</td>";
    html += '<td class="' + deltaClass(delta.tokens) + '">' + esc(delta.tokens || "—") + "</td></tr>";
  }

  html += "</tbody></table>";

  // Per-eval breakdown (if runs data available)
  const runs = data.runs || [];
  if (runs.length > 0) {
    const evalIds = [...new Set(runs.map(r => r.eval_id))].sort((x, y) => x - y);

    html += "<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>";

    // Optional columns appear only when at least one run carries the data.
    const hasTime = runs.some(r => r.result && r.result.time_seconds != null);
    const hasErrors = runs.some(r => r.result && r.result.errors > 0);

    for (const evalId of evalIds) {
      const evalRuns = runs.filter(r => r.eval_id === evalId);
      const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;

      html += "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" + escapeHtml(evalName) + "</h4>";
      html += '<table class="benchmark-table">';
      html += "<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>";
      if (hasTime) html += "<th>Time (s)</th>";
      if (hasErrors) html += "<th>Crashes During Execution</th>";
      html += "</tr></thead>";
      html += "<tbody>";

      // Group by config and render with average rows
      const configGroups = [...new Set(evalRuns.map(r => r.configuration))];
      for (let ci = 0; ci < configGroups.length; ci++) {
        const config = configGroups[ci];
        const configRuns = evalRuns.filter(r => r.configuration === config);
        if (configRuns.length === 0) continue;

        // The first configuration seen gets the "with" row styling; all others get "without".
        const rowClass = ci === 0 ? "benchmark-row-with" : "benchmark-row-without";
        const configLabel = esc(toLabel(config));

        for (const run of configRuns) {
          const r = run.result || {};
          html += '<tr class="' + rowClass + '">';
          html += "<td>" + configLabel + "</td>";
          html += "<td>" + esc(run.run_number) + "</td>";
          html += '<td class="' + rateClass(r.pass_rate) + '">' + ((r.pass_rate || 0) * 100).toFixed(0) + "% (" + esc(r.passed || 0) + "/" + esc(r.total || 0) + ")</td>";
          if (hasTime) html += "<td>" + (r.time_seconds != null ? Number(r.time_seconds).toFixed(1) : "—") + "</td>";
          if (hasErrors) html += "<td>" + esc(r.errors || 0) + "</td>";
          html += "</tr>";
        }

        // Average row
        const rates = configRuns.map(r => (r.result || {}).pass_rate || 0);
        const avgRate = rates.reduce((sum, rate) => sum + rate, 0) / rates.length;
        html += '<tr class="benchmark-row-avg ' + rowClass + '">';
        html += "<td>" + configLabel + "</td>";
        html += "<td>Avg</td>";
        html += '<td class="' + rateClass(avgRate) + '">' + (avgRate * 100).toFixed(0) + "%</td>";
        if (hasTime) {
          const times = configRuns
            .map(r => (r.result || {}).time_seconds)
            .filter(t => t != null)
            .map(Number);
          html += "<td>" + (times.length ? (times.reduce((sum, t) => sum + t, 0) / times.length).toFixed(1) : "—") + "</td>";
        }
        if (hasErrors) html += "<td></td>";
        html += "</tr>";
      }
      html += "</tbody></table>";

      // Per-assertion detail for this eval
      const runsWithExpectations = {};
      for (const config of configGroups) {
        runsWithExpectations[config] = evalRuns.filter(r => r.configuration === config && r.expectations && r.expectations.length > 0);
      }
      const hasAnyExpectations = Object.values(runsWithExpectations).some(group => group.length > 0);
      if (hasAnyExpectations) {
        // Collect all unique assertion texts across all configs, in first-seen order
        const allAssertions = [];
        const seen = new Set();
        for (const config of configGroups) {
          for (const run of runsWithExpectations[config]) {
            for (const exp of (run.expectations || [])) {
              if (!seen.has(exp.text)) {
                seen.add(exp.text);
                allAssertions.push(exp.text);
              }
            }
          }
        }

        html += '<table class="benchmark-table" style="margin-top: 0.5rem;">';
        html += "<thead><tr><th>Assertion</th>";
        for (const config of configGroups) {
          html += "<th>" + escapeHtml(toLabel(config)) + "</th>";
        }
        html += "</tr></thead><tbody>";

        for (const assertionText of allAssertions) {
          html += "<tr><td>" + escapeHtml(assertionText) + "</td>";

          for (const config of configGroups) {
            html += "<td>";
            // One mark per run: check, cross, or a dash when the run lacks this assertion.
            for (const run of runsWithExpectations[config]) {
              const exp = (run.expectations || []).find(e => e.text === assertionText);
              if (exp) {
                const cls = exp.passed ? "benchmark-delta-positive" : "benchmark-delta-negative";
                const icon = exp.passed ? "\u2713" : "\u2717";
                const title = "Run " + quoteSafe(esc(run.run_number)) + ": " + quoteSafe(escapeHtml(exp.evidence || ""));
                html += '<span class="' + cls + '" title="' + title + '">' + icon + "</span> ";
              } else {
                html += "— ";
              }
            }
            html += "</td>";
          }
          html += "</tr>";
        }
        html += "</tbody></table>";
      }
    }
  }

  // Notes
  if (notes.length > 0) {
    html += '<div class="benchmark-notes">';
    html += "<h3>Analysis Notes</h3>";
    html += "<ul>";
    for (const note of notes) {
      html += "<li>" + escapeHtml(note) + "</li>";
    }
    html += "</ul></div>";
  }

  container.innerHTML = html;
}
1319
-
1320
- // ---- Start ----
1321
- init();
1322
- renderBenchmark();
1323
- </script>
1324
- </body>
1325
- </html>