npm - @biggora/claude-plugins - Versions diffs - 1.2.0 → 1.3.0 - Mend

@biggora/claude-plugins 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (265) hide show

package/codex-cli-workspace/iteration-1/review.html DELETED Viewed

@@ -1,1325 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-  <meta charset="UTF-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-  <title>Eval Review</title>
-  <link rel="preconnect" href="https://fonts.googleapis.com">
-  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
-  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
-  <script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>
-  <style>
-    :root {
-      --bg: #faf9f5;
-      --surface: #ffffff;
-      --border: #e8e6dc;
-      --text: #141413;
-      --text-muted: #b0aea5;
-      --accent: #d97757;
-      --accent-hover: #c4613f;
-      --green: #788c5d;
-      --green-bg: #eef2e8;
-      --red: #c44;
-      --red-bg: #fceaea;
-      --header-bg: #141413;
-      --header-text: #faf9f5;
-      --radius: 6px;
-    }
-    * { box-sizing: border-box; margin: 0; padding: 0; }
-    body {
-      font-family: 'Lora', Georgia, serif;
-      background: var(--bg);
-      color: var(--text);
-      height: 100vh;
-      display: flex;
-      flex-direction: column;
-    }
-    /* ---- Header ---- */
-    .header {
-      background: var(--header-bg);
-      color: var(--header-text);
-      padding: 1rem 2rem;
-      display: flex;
-      justify-content: space-between;
-      align-items: center;
-      flex-shrink: 0;
-    }
-    .header h1 {
-      font-family: 'Poppins', sans-serif;
-      font-size: 1.25rem;
-      font-weight: 600;
-    }
-    .header .instructions {
-      font-size: 0.8rem;
-      opacity: 0.7;
-      margin-top: 0.25rem;
-    }
-    .header .progress {
-      font-size: 0.875rem;
-      opacity: 0.8;
-      text-align: right;
-    }
-    /* ---- Main content ---- */
-    .main {
-      flex: 1;
-      overflow-y: auto;
-      padding: 1.5rem 2rem;
-      display: flex;
-      flex-direction: column;
-      gap: 1.25rem;
-    }
-    /* ---- Sections ---- */
-    .section {
-      background: var(--surface);
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      flex-shrink: 0;
-    }
-    .section-header {
-      font-family: 'Poppins', sans-serif;
-      padding: 0.75rem 1rem;
-      font-size: 0.75rem;
-      font-weight: 500;
-      text-transform: uppercase;
-      letter-spacing: 0.05em;
-      color: var(--text-muted);
-      border-bottom: 1px solid var(--border);
-      background: var(--bg);
-    }
-    .section-body {
-      padding: 1rem;
-    }
-    /* ---- Config badge ---- */
-    .config-badge {
-      display: inline-block;
-      padding: 0.2rem 0.625rem;
-      border-radius: 9999px;
-      font-family: 'Poppins', sans-serif;
-      font-size: 0.6875rem;
-      font-weight: 600;
-      text-transform: uppercase;
-      letter-spacing: 0.03em;
-      margin-left: 0.75rem;
-      vertical-align: middle;
-    }
-    .config-badge.config-primary {
-      background: rgba(33, 150, 243, 0.12);
-      color: #1976d2;
-    }
-    .config-badge.config-baseline {
-      background: rgba(255, 193, 7, 0.15);
-      color: #f57f17;
-    }
-    /* ---- Prompt ---- */
-    .prompt-text {
-      white-space: pre-wrap;
-      font-size: 0.9375rem;
-      line-height: 1.6;
-    }
-    /* ---- Outputs ---- */
-    .output-file {
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      overflow: hidden;
-    }
-    .output-file + .output-file {
-      margin-top: 1rem;
-    }
-    .output-file-header {
-      padding: 0.5rem 0.75rem;
-      font-size: 0.8rem;
-      font-weight: 600;
-      color: var(--text-muted);
-      background: var(--bg);
-      border-bottom: 1px solid var(--border);
-      font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
-      display: flex;
-      justify-content: space-between;
-      align-items: center;
-    }
-    .output-file-header .dl-btn {
-      font-size: 0.7rem;
-      color: var(--accent);
-      text-decoration: none;
-      cursor: pointer;
-      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
-      font-weight: 500;
-      opacity: 0.8;
-    }
-    .output-file-header .dl-btn:hover {
-      opacity: 1;
-      text-decoration: underline;
-    }
-    .output-file-content {
-      padding: 0.75rem;
-      overflow-x: auto;
-    }
-    .output-file-content pre {
-      font-size: 0.8125rem;
-      line-height: 1.5;
-      white-space: pre-wrap;
-      word-break: break-word;
-      font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
-    }
-    .output-file-content img {
-      max-width: 100%;
-      height: auto;
-      border-radius: 4px;
-    }
-    .output-file-content iframe {
-      width: 100%;
-      height: 600px;
-      border: none;
-    }
-    .output-file-content table {
-      border-collapse: collapse;
-      font-size: 0.8125rem;
-      width: 100%;
-    }
-    .output-file-content table td,
-    .output-file-content table th {
-      border: 1px solid var(--border);
-      padding: 0.375rem 0.5rem;
-      text-align: left;
-    }
-    .output-file-content table th {
-      background: var(--bg);
-      font-weight: 600;
-    }
-    .output-file-content .download-link {
-      display: inline-flex;
-      align-items: center;
-      gap: 0.5rem;
-      padding: 0.5rem 1rem;
-      background: var(--bg);
-      border: 1px solid var(--border);
-      border-radius: 4px;
-      color: var(--accent);
-      text-decoration: none;
-      font-size: 0.875rem;
-      cursor: pointer;
-    }
-    .output-file-content .download-link:hover {
-      background: var(--border);
-    }
-    .empty-state {
-      color: var(--text-muted);
-      font-style: italic;
-      padding: 2rem;
-      text-align: center;
-    }
-    /* ---- Feedback ---- */
-    .prev-feedback {
-      background: var(--bg);
-      border: 1px solid var(--border);
-      border-radius: 4px;
-      padding: 0.625rem 0.75rem;
-      margin-top: 0.75rem;
-      font-size: 0.8125rem;
-      color: var(--text-muted);
-      line-height: 1.5;
-    }
-    .prev-feedback-label {
-      font-size: 0.7rem;
-      font-weight: 600;
-      text-transform: uppercase;
-      letter-spacing: 0.04em;
-      margin-bottom: 0.25rem;
-      color: var(--text-muted);
-    }
-    .feedback-textarea {
-      width: 100%;
-      min-height: 100px;
-      padding: 0.75rem;
-      border: 1px solid var(--border);
-      border-radius: 4px;
-      font-family: inherit;
-      font-size: 0.9375rem;
-      line-height: 1.5;
-      resize: vertical;
-      color: var(--text);
-    }
-    .feedback-textarea:focus {
-      outline: none;
-      border-color: var(--accent);
-      box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
-    }
-    .feedback-status {
-      font-size: 0.75rem;
-      color: var(--text-muted);
-      margin-top: 0.5rem;
-      min-height: 1.1em;
-    }
-    /* ---- Grades (collapsible) ---- */
-    .grades-toggle {
-      display: flex;
-      align-items: center;
-      cursor: pointer;
-      user-select: none;
-    }
-    .grades-toggle:hover {
-      color: var(--accent);
-    }
-    .grades-toggle .arrow {
-      margin-right: 0.5rem;
-      transition: transform 0.15s;
-      font-size: 0.75rem;
-    }
-    .grades-toggle .arrow.open {
-      transform: rotate(90deg);
-    }
-    .grades-content {
-      display: none;
-      margin-top: 0.75rem;
-    }
-    .grades-content.open {
-      display: block;
-    }
-    .grades-summary {
-      font-size: 0.875rem;
-      margin-bottom: 0.75rem;
-      display: flex;
-      align-items: center;
-      gap: 0.5rem;
-    }
-    .grade-badge {
-      display: inline-block;
-      padding: 0.125rem 0.5rem;
-      border-radius: 9999px;
-      font-size: 0.75rem;
-      font-weight: 600;
-    }
-    .grade-pass { background: var(--green-bg); color: var(--green); }
-    .grade-fail { background: var(--red-bg); color: var(--red); }
-    .assertion-list {
-      list-style: none;
-    }
-    .assertion-item {
-      padding: 0.625rem 0;
-      border-bottom: 1px solid var(--border);
-      font-size: 0.8125rem;
-    }
-    .assertion-item:last-child { border-bottom: none; }
-    .assertion-status {
-      font-weight: 600;
-      margin-right: 0.5rem;
-    }
-    .assertion-status.pass { color: var(--green); }
-    .assertion-status.fail { color: var(--red); }
-    .assertion-evidence {
-      color: var(--text-muted);
-      font-size: 0.75rem;
-      margin-top: 0.25rem;
-      padding-left: 1.5rem;
-    }
-    /* ---- View tabs ---- */
-    .view-tabs {
-      display: flex;
-      gap: 0;
-      padding: 0 2rem;
-      background: var(--bg);
-      border-bottom: 1px solid var(--border);
-      flex-shrink: 0;
-    }
-    .view-tab {
-      font-family: 'Poppins', sans-serif;
-      padding: 0.625rem 1.25rem;
-      font-size: 0.8125rem;
-      font-weight: 500;
-      cursor: pointer;
-      border: none;
-      background: none;
-      color: var(--text-muted);
-      border-bottom: 2px solid transparent;
-      transition: all 0.15s;
-    }
-    .view-tab:hover { color: var(--text); }
-    .view-tab.active {
-      color: var(--accent);
-      border-bottom-color: var(--accent);
-    }
-    .view-panel { display: none; }
-    .view-panel.active { display: flex; flex-direction: column; flex: 1; overflow: hidden; }
-    /* ---- Benchmark view ---- */
-    .benchmark-view {
-      padding: 1.5rem 2rem;
-      overflow-y: auto;
-      flex: 1;
-    }
-    .benchmark-table {
-      border-collapse: collapse;
-      background: var(--surface);
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      font-size: 0.8125rem;
-      width: 100%;
-      margin-bottom: 1.5rem;
-    }
-    .benchmark-table th, .benchmark-table td {
-      padding: 0.625rem 0.75rem;
-      text-align: left;
-      border: 1px solid var(--border);
-    }
-    .benchmark-table th {
-      font-family: 'Poppins', sans-serif;
-      background: var(--header-bg);
-      color: var(--header-text);
-      font-weight: 500;
-      font-size: 0.75rem;
-      text-transform: uppercase;
-      letter-spacing: 0.04em;
-    }
-    .benchmark-table tr:hover { background: var(--bg); }
-    .benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }
-    .benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }
-    .benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }
-    .benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }
-    .benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }
-    .benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }
-    .benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }
-    .benchmark-delta-positive { color: var(--green); font-weight: 600; }
-    .benchmark-delta-negative { color: var(--red); font-weight: 600; }
-    .benchmark-notes {
-      background: var(--surface);
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      padding: 1rem;
-    }
-    .benchmark-notes h3 {
-      font-family: 'Poppins', sans-serif;
-      font-size: 0.875rem;
-      margin-bottom: 0.75rem;
-    }
-    .benchmark-notes ul {
-      list-style: disc;
-      padding-left: 1.25rem;
-    }
-    .benchmark-notes li {
-      font-size: 0.8125rem;
-      line-height: 1.6;
-      margin-bottom: 0.375rem;
-    }
-    .benchmark-empty {
-      color: var(--text-muted);
-      font-style: italic;
-      text-align: center;
-      padding: 3rem;
-    }
-    /* ---- Navigation ---- */
-    .nav {
-      display: flex;
-      justify-content: space-between;
-      align-items: center;
-      padding: 1rem 2rem;
-      border-top: 1px solid var(--border);
-      background: var(--surface);
-      flex-shrink: 0;
-    }
-    .nav-btn {
-      font-family: 'Poppins', sans-serif;
-      padding: 0.5rem 1.25rem;
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      background: var(--surface);
-      cursor: pointer;
-      font-size: 0.875rem;
-      font-weight: 500;
-      color: var(--text);
-      transition: all 0.15s;
-    }
-    .nav-btn:hover:not(:disabled) {
-      background: var(--bg);
-      border-color: var(--text-muted);
-    }
-    .nav-btn:disabled {
-      opacity: 0.4;
-      cursor: not-allowed;
-    }
-    .done-btn {
-      font-family: 'Poppins', sans-serif;
-      padding: 0.5rem 1.5rem;
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      background: var(--surface);
-      color: var(--text);
-      cursor: pointer;
-      font-size: 0.875rem;
-      font-weight: 500;
-      transition: all 0.15s;
-    }
-    .done-btn:hover {
-      background: var(--bg);
-      border-color: var(--text-muted);
-    }
-    .done-btn.ready {
-      border: none;
-      background: var(--accent);
-      color: white;
-      font-weight: 600;
-    }
-    .done-btn.ready:hover {
-      background: var(--accent-hover);
-    }
-    /* ---- Done overlay ---- */
-    .done-overlay {
-      display: none;
-      position: fixed;
-      inset: 0;
-      background: rgba(0, 0, 0, 0.5);
-      z-index: 100;
-      justify-content: center;
-      align-items: center;
-    }
-    .done-overlay.visible {
-      display: flex;
-    }
-    .done-card {
-      background: var(--surface);
-      border-radius: 12px;
-      padding: 2rem 3rem;
-      text-align: center;
-      box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
-      max-width: 500px;
-    }
-    .done-card h2 {
-      font-size: 1.5rem;
-      margin-bottom: 0.5rem;
-    }
-    .done-card p {
-      color: var(--text-muted);
-      margin-bottom: 1.5rem;
-      line-height: 1.5;
-    }
-    .done-card .btn-row {
-      display: flex;
-      gap: 0.5rem;
-      justify-content: center;
-    }
-    .done-card button {
-      padding: 0.5rem 1.25rem;
-      border: 1px solid var(--border);
-      border-radius: var(--radius);
-      background: var(--surface);
-      cursor: pointer;
-      font-size: 0.875rem;
-    }
-    .done-card button:hover {
-      background: var(--bg);
-    }
-    /* ---- Toast ---- */
-    .toast {
-      position: fixed;
-      bottom: 5rem;
-      left: 50%;
-      transform: translateX(-50%);
-      background: var(--header-bg);
-      color: var(--header-text);
-      padding: 0.625rem 1.25rem;
-      border-radius: var(--radius);
-      font-size: 0.875rem;
-      opacity: 0;
-      transition: opacity 0.3s;
-      pointer-events: none;
-      z-index: 200;
-    }
-    .toast.visible {
-      opacity: 1;
-    }
-  </style>
-</head>
-<body>
-  <div id="app" style="height:100vh; display:flex; flex-direction:column;">
-    <div class="header">
-      <div>
-        <h1>Eval Review: <span id="skill-name"></span></h1>
-        <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
-      </div>
-      <div class="progress" id="progress"></div>
-    </div>
-    <!-- View tabs (only shown when benchmark data exists) -->
-    <div class="view-tabs" id="view-tabs" style="display:none;">
-      <button class="view-tab active" onclick="switchView('outputs')">Outputs</button>
-      <button class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
-    </div>
-    <!-- Outputs panel (qualitative review) -->
-    <div class="view-panel active" id="panel-outputs">
-    <div class="main">
-      <!-- Prompt -->
-      <div class="section">
-        <div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
-        <div class="section-body">
-          <div class="prompt-text" id="prompt-text"></div>
-        </div>
-      </div>
-      <!-- Outputs -->
-      <div class="section">
-        <div class="section-header">Output</div>
-        <div class="section-body" id="outputs-body">
-          <div class="empty-state">No output files found</div>
-        </div>
-      </div>
-      <!-- Previous Output (collapsible) -->
-      <div class="section" id="prev-outputs-section" style="display:none;">
-        <div class="section-header">
-          <div class="grades-toggle" onclick="togglePrevOutputs()">
-            <span class="arrow" id="prev-outputs-arrow">&#9654;</span>
-            Previous Output
-          </div>
-        </div>
-        <div class="grades-content" id="prev-outputs-content"></div>
-      </div>
-      <!-- Grades (collapsible) -->
-      <div class="section" id="grades-section" style="display:none;">
-        <div class="section-header">
-          <div class="grades-toggle" onclick="toggleGrades()">
-            <span class="arrow" id="grades-arrow">&#9654;</span>
-            Formal Grades
-          </div>
-        </div>
-        <div class="grades-content" id="grades-content"></div>
-      </div>
-      <!-- Feedback -->
-      <div class="section">
-        <div class="section-header">Your Feedback</div>
-        <div class="section-body">
-          <textarea
-            class="feedback-textarea"
-            id="feedback"
-            placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
-          ></textarea>
-          <div class="feedback-status" id="feedback-status"></div>
-          <div class="prev-feedback" id="prev-feedback" style="display:none;">
-            <div class="prev-feedback-label">Previous feedback</div>
-            <div id="prev-feedback-text"></div>
-          </div>
-        </div>
-      </div>
-    </div>
-    <div class="nav" id="outputs-nav">
-      <button class="nav-btn" id="prev-btn" onclick="navigate(-1)">&#8592; Previous</button>
-      <button class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
-      <button class="nav-btn" id="next-btn" onclick="navigate(1)">Next &#8594;</button>
-    </div>
-    </div><!-- end panel-outputs -->
-    <!-- Benchmark panel (quantitative stats) -->
-    <div class="view-panel" id="panel-benchmark">
-      <div class="benchmark-view" id="benchmark-content">
-        <div class="benchmark-empty">No benchmark data available. Run a benchmark to see quantitative results here.</div>
-      </div>
-    </div>
-  </div>
-  <!-- Done overlay -->
-  <div class="done-overlay" id="done-overlay">
-    <div class="done-card">
-      <h2>Review Complete</h2>
-      <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
-      <div class="btn-row">
-        <button onclick="closeDoneDialog()">OK</button>
-      </div>
-    </div>
-  </div>
-  <!-- Toast -->
-  <div class="toast" id="toast"></div>
-  <script>
-    // ---- Embedded data (injected by generate_review.py) ----
-    const EMBEDDED_DATA = {"skill_name": "codex-cli", "runs": [{"id": "eval-1-ci-integration-with_skill", "prompt": "I want to set up OpenAI Codex CLI in my GitHub Actions pipeline to auto-fix lint errors on PRs. How do I configure it with full-auto mode and workspace-write sandbox? My OPENAI_API_KEY is stored in GitHub secrets.", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up OpenAI Codex CLI in GitHub Actions to Auto-Fix Lint Errors\n\n## Overview\n\nYou can use `codex exec` (the non-interactive mode of Codex CLI) inside a GitHub Actions workflow to automatically fix lint errors on pull requests. The key pieces are:\n\n- **`--full-auto` flag** -- shortcut for `-a on-request -s workspace-write`, which allows Codex to write files within the working directory without requiring interactive approval.\n- **`OPENAI_API_KEY`** -- passed from GitHub Secrets via the `env` block.\n\n## Workflow File\n\nCreate `.github/workflows/codex-lint-fix.yml` in your repository:\n\n```yaml\nname: Auto-fix lint errors with Codex CLI\n\non:\n  pull_request:\n    types: [opened, synchronize]\n\npermissions:\n  contents: write\n\njobs:\n  lint-fix:\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout PR branch\n        uses: actions/checkout@v4\n        with:\n          ref: ${{ github.head_ref }}\n          # Needed so the push step can write back to the PR branch\n          token: ${{ secrets.GITHUB_TOKEN }}\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version: '20'\n\n      - name: Install dependencies\n        run: npm ci\n\n      - name: Install Codex CLI\n        run: npm install -g @openai/codex\n\n      - name: Run linter and capture errors\n        id: lint\n        run: |\n          npm run lint 2>&1 | tee lint-output.txt || true\n\n      - name: Auto-fix lint errors with Codex\n        run: |\n          codex exec \\\n            --full-auto \\\n            --model 
gpt-5-codex \\\n            \"Read lint-output.txt for the lint errors in this project. Fix all the lint errors in the source files. Do not modify any test files unless the lint error is in a test file.\"\n        env:\n          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n\n      - name: Commit and push fixes\n        run: |\n          git config user.name \"codex-bot\"\n          git config user.email \"codex-bot@users.noreply.github.com\"\n          git add -A\n          git diff --cached --quiet || git commit -m \"fix: auto-fix lint errors via Codex CLI\"\n          git push\n```\n\n## How It Works\n\n1. **Checkout** -- The PR branch is checked out with write access so fixes can be pushed back.\n2. **Lint** -- Your existing lint command runs and its output is captured to a file. The `|| true` prevents the step from failing the workflow (lint errors are expected).\n3. **Codex exec** -- Runs in non-interactive mode with `--full-auto`, which sets:\n   - **Approval policy**: `on-request` (Codex proceeds without interactive prompts for standard file edits)\n   - **Sandbox**: `workspace-write` (Codex can only write files within the checked-out repository directory)\n4. 
**Commit and push** -- If Codex made any changes, they are committed and pushed to the PR branch.\n\n## Understanding `--full-auto`\n\nThe `--full-auto` flag is a convenience shortcut equivalent to:\n\n```bash\ncodex exec -a on-request -s workspace-write \"task\"\n```\n\n| Setting | Value | Meaning |\n|---------|-------|---------|\n| Approval policy (`-a`) | `on-request` | Codex runs autonomously for standard operations; only pauses on sensitive actions |\n| Sandbox (`-s`) | `workspace-write` | Codex can read/write files in the working directory but cannot access the broader filesystem |\n\nThis is the recommended combination for CI pipelines because it gives Codex enough freedom to edit source files while preventing it from modifying anything outside the project.\n\n### If You Want Fully Unattended Execution\n\nIf you want Codex to never pause for any reason (no approval prompts at all), use explicit flags instead of `--full-auto`:\n\n```bash\ncodex exec -a never -s workspace-write \"Fix all lint errors\"\n```\n\nThe `-a never` policy means Codex will never ask for approval, even for potentially sensitive actions. Use this when you are confident in the prompt and the sandbox constraint is sufficient protection.\n\n## Configuring the API Key\n\nYour `OPENAI_API_KEY` stored in GitHub Secrets is passed to Codex through the `env` block on the step:\n\n```yaml\nenv:\n  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n```\n\nTo add the secret:\n1. Go to your repository on GitHub.\n2. Navigate to **Settings > Secrets and variables > Actions**.\n3. Click **New repository secret**.\n4. Name: `OPENAI_API_KEY`, Value: your OpenAI API key.\n\n## Optional: Enable Network Access\n\nBy default, the `workspace-write` sandbox disables network access. 
If your lint fix workflow requires installing packages or fetching resources, enable network with a config override:\n\n```yaml\n- name: Auto-fix lint errors with Codex\n  run: |\n    codex exec \\\n      --full-auto \\\n      -c 'sandbox_workspace_write.network_access=true' \\\n      \"Fix all lint errors found in lint-output.txt\"\n  env:\n    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n```\n\n## Optional: Use a Config Profile\n\nInstead of passing all flags on the command line, you can create a `.codex/config.toml` in your repo with a CI profile:\n\n```toml\n[profiles.ci]\nmodel = \"gpt-5-codex\"\napproval_policy = \"never\"\n\n[profiles.ci.sandbox_workspace_write]\nnetwork_access = false\n```\n\nThen reference it in the workflow:\n\n```yaml\n- name: Auto-fix lint errors with Codex\n  run: |\n    codex exec --profile ci \"Fix all lint errors found in lint-output.txt\"\n  env:\n    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n```\n\n## Troubleshooting\n\n| Problem | Solution |\n|---------|----------|\n| Landlock sandbox error on Ubuntu runner | Update the runner image or, in isolated CI only, use `--dangerously-bypass-approvals-and-sandbox` |\n| Network access denied during fix | Add `-c 'sandbox_workspace_write.network_access=true'` |\n| Codex keeps prompting for approval | Make sure you are using `codex exec` (not bare `codex`), and pass `--full-auto` or `-a never` explicitly |\n| Auth loop / browser auth attempt | Ensure `OPENAI_API_KEY` is set in the `env` block -- Codex uses it automatically and skips browser auth |\n"}], "grading": {"expectations": [{"text": "Shows npm install -g @openai/codex for global installation", "passed": true, "evidence": "Line 44: 'run: npm install -g @openai/codex'"}, {"text": "Uses 'codex exec' for non-interactive CI mode (not bare 'codex')", "passed": true, "evidence": "Line 53: 'codex exec \\' used in the workflow step"}, {"text": "Uses --full-auto flag and correctly explains it equals -a on-request -s workspace-write", 
"passed": true, "evidence": "Line 54: '--full-auto' and line 80-83: 'The --full-auto flag is a convenience shortcut equivalent to: codex exec -a on-request -s workspace-write'"}, {"text": "Shows OPENAI_API_KEY set from ${{ secrets.OPENAI_API_KEY }} in env", "passed": true, "evidence": "Line 58: 'OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}'"}, {"text": "Provides a complete GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Lines 14-67: Complete workflow with checkout, node setup, install, lint, codex exec, and commit/push steps"}, {"text": "Does not invent non-existent CLI flags or options", "passed": true, "evidence": "All flags used (--full-auto, -a never, -s workspace-write, -c, --model, --profile, --dangerously-bypass-approvals-and-sandbox) match the skill documentation"}], "summary": {"passed": 6, "failed": 0, "total": 6, "pass_rate": 1.0}, "claims": [{"claim": "--full-auto equals -a on-request -s workspace-write", "type": "factual", "verified": true, "evidence": "Matches skill doc line 64: '# Equivalent to: -a on-request -s workspace-write'"}], "eval_feedback": {"suggestions": [], "overall": "No suggestions, evals look solid"}}}, {"id": "eval-1-ci-integration-without_skill", "prompt": "I want to set up OpenAI Codex CLI in my GitHub Actions pipeline to auto-fix lint errors on PRs. How do I configure it with full-auto mode and workspace-write sandbox? My OPENAI_API_KEY is stored in GitHub secrets.", "eval_id": 1, "outputs": [{"name": "response.md", "type": "text", "content": "# Setting Up OpenAI Codex CLI in GitHub Actions for Auto-Fixing Lint Errors\n\n## Overview\n\nOpenAI Codex CLI is a terminal-based AI coding agent. 
You can integrate it into a GitHub Actions workflow to automatically fix lint errors on pull requests using its `full-auto` approval mode and `write-only` sandbox policy.\n\n## Key Concepts\n\n- **`full-auto` mode**: Codex CLI automatically approves all actions (file reads, writes, and command execution) without human confirmation. Set via `--approval-mode full-auto`.\n- **`write-only` sandbox** (also called `workspace-write`): Restricts file writes to the current working directory (the repo workspace). Network access and writes outside the workspace are blocked. Set via `--sandbox write-only`.\n\n## GitHub Actions Workflow\n\nCreate `.github/workflows/codex-lint-fix.yml`:\n\n```yaml\nname: Codex Auto-Fix Lint Errors\n\non:\n  pull_request:\n    types: [opened, synchronize]\n\npermissions:\n  contents: write\n  pull-requests: write\n\njobs:\n  lint-fix:\n    runs-on: ubuntu-latest\n\n    steps:\n      - name: Checkout PR branch\n        uses: actions/checkout@v4\n        with:\n          ref: ${{ github.head_ref }}\n          fetch-depth: 0\n          token: ${{ secrets.GITHUB_TOKEN }}\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version: '20'\n\n      - name: Install dependencies\n        run: npm ci\n\n      - name: Install OpenAI Codex CLI\n        run: npm install -g @openai/codex\n\n      - name: Run linter and capture errors\n        id: lint\n        continue-on-error: true\n        run: |\n          npx eslint . 
--format json > lint-results.json 2>&1 || true\n          if [ -s lint-results.json ]; then\n            echo \"has_errors=true\" >> \"$GITHUB_OUTPUT\"\n          else\n            echo \"has_errors=false\" >> \"$GITHUB_OUTPUT\"\n          fi\n\n      - name: Auto-fix lint errors with Codex CLI\n        if: steps.lint.outputs.has_errors == 'true'\n        env:\n          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n        run: |\n          codex \\\n            --approval-mode full-auto \\\n            --sandbox write-only \\\n            \"Read the lint errors in lint-results.json and fix all the reported ESLint errors and warnings in the source files. Do not change any logic, only fix lint issues.\"\n\n      - name: Commit and push fixes\n        run: |\n          git config user.name \"github-actions[bot]\"\n          git config user.email \"github-actions[bot]@users.noreply.github.com\"\n          git add -A\n          git diff --cached --quiet && echo \"No changes to commit\" && exit 0\n          git commit -m \"fix: auto-fix lint errors via Codex CLI\"\n          git push\n```\n\n## Configuration Breakdown\n\n### Authentication\n\nThe `OPENAI_API_KEY` secret must be configured in your repository:\n\n1. Go to your repo on GitHub.\n2. Navigate to **Settings > Secrets and variables > Actions**.\n3. Click **New repository secret**.\n4. Name: `OPENAI_API_KEY`, Value: your OpenAI API key.\n\nThe workflow references it as `${{ secrets.OPENAI_API_KEY }}` and passes it to the Codex CLI step via the `env` block.\n\n### Approval Mode\n\n`--approval-mode full-auto` tells Codex CLI to execute all suggested changes without waiting for human approval. 
This is required in CI where there is no interactive terminal.\n\nThe three approval modes are:\n- `suggest` (default) -- requires approval for everything\n- `auto-edit` -- auto-approves file edits but asks before running commands\n- `full-auto` -- auto-approves all actions including command execution\n\n### Sandbox Policy\n\n`--sandbox write-only` (workspace-write) restricts Codex to:\n- Reading any file in the workspace\n- Writing only to files within the current working directory\n- No network access from spawned commands\n- No writes outside the repo checkout directory\n\nThis provides a safety boundary so Codex cannot modify system files or exfiltrate data, even in `full-auto` mode.\n\n### Alternative: Using a Codex Config File\n\nInstead of passing flags on the command line, you can create a `codex.json` or use the `~/.codex/config.json` approach. In CI, you can write it inline:\n\n```yaml\n      - name: Configure Codex CLI\n        run: |\n          mkdir -p ~/.codex\n          cat > ~/.codex/config.json << 'EOF'\n          {\n            \"approvalMode\": \"full-auto\",\n            \"sandbox\": \"write-only\",\n            \"model\": \"o4-mini\"\n          }\n          EOF\n```\n\n### Model Selection\n\nBy default, Codex CLI uses `o4-mini`. You can specify a different model:\n\n```bash\ncodex --model o3 --approval-mode full-auto --sandbox write-only \"Fix lint errors...\"\n```\n\n## Security Considerations\n\n1. **Never hardcode your API key** in the workflow file. Always use GitHub Secrets.\n2. **`write-only` sandbox** is recommended for CI to prevent unintended network calls or file access outside the workspace.\n3. **Review the generated commits**. Even though Codex runs automatically, the changes are pushed to the PR branch where reviewers can inspect them.\n4. **Limit permissions**. The workflow uses `contents: write` and `pull-requests: write` which are the minimum needed to push commits and interact with the PR.\n5. **Cost control**. 
Each Codex invocation calls the OpenAI API. Consider adding conditions to skip the step if there are no lint errors, as shown in the workflow above.\n\n## Troubleshooting\n\n- **\"codex: command not found\"**: Make sure the install step uses `npm install -g @openai/codex` and that the Node.js version is 20 or higher.\n- **Authentication errors**: Verify the `OPENAI_API_KEY` secret is set correctly and the key is valid.\n- **Push failures**: Ensure the checkout step uses `token: ${{ secrets.GITHUB_TOKEN }}` and the workflow has `contents: write` permission.\n- **No changes committed**: If Codex cannot fix certain lint errors, the `git diff --cached --quiet` check will skip the commit gracefully.\n"}], "grading": {"expectations": [{"text": "Shows npm install -g @openai/codex for global installation", "passed": true, "evidence": "Line 49: 'run: npm install -g @openai/codex'"}, {"text": "Uses 'codex exec' for non-interactive CI mode (not bare 'codex')", "passed": false, "evidence": "Line 66: Uses bare 'codex \\' instead of 'codex exec'. The exec subcommand is required for non-interactive CI mode."}, {"text": "Uses --full-auto flag and correctly explains it equals -a on-request -s workspace-write", "passed": false, "evidence": "Uses hallucinated '--approval-mode full-auto' flag (correct is --full-auto or --ask-for-approval/-a) and '--sandbox write-only' (correct is -s workspace-write). 
Also incorrectly describes approval modes as 'suggest', 'auto-edit', 'full-auto' instead of the real ones: 'untrusted', 'on-request', 'never', 'reject'."}, {"text": "Shows OPENAI_API_KEY set from ${{ secrets.OPENAI_API_KEY }} in env", "passed": true, "evidence": "Line 64: 'OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}'"}, {"text": "Provides a complete GitHub Actions workflow YAML snippet", "passed": true, "evidence": "Lines 16-79: Complete workflow YAML with checkout, node, install, lint, codex, and commit steps"}, {"text": "Does not invent non-existent CLI flags or options", "passed": false, "evidence": "Hallucinated flags: '--approval-mode full-auto' (not a real flag), '--sandbox write-only' (not a real flag), '--model o3'. Hallucinated modes: 'suggest', 'auto-edit'. Hallucinated config format: 'codex.json'/'config.json' instead of config.toml."}], "summary": {"passed": 3, "failed": 3, "total": 6, "pass_rate": 0.5}, "claims": [{"claim": "Approval modes are suggest, auto-edit, full-auto", "type": "factual", "verified": false, "evidence": "Real approval modes are: untrusted, on-request, never, reject"}, {"claim": "Config file is codex.json or config.json", "type": "factual", "verified": false, "evidence": "Real config file is ~/.codex/config.toml (TOML format, not JSON)"}], "eval_feedback": {"suggestions": [], "overall": "Baseline shows significant hallucination of CLI flags and config format without the skill"}}}, {"id": "eval-2-mcp-server-config-with_skill", "prompt": "How do I add a custom MCP server to codex CLI? I have a Python-based MCP server that needs a GITHUB_TOKEN env var. 
Also want to run codex itself as an MCP server for other agents to consume.", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Adding a Custom MCP Server to Codex CLI and Running Codex as an MCP Server\n\n## Adding Your Python-based MCP Server with GITHUB_TOKEN\n\nAdd the following to your Codex configuration file at `~/.codex/config.toml`:\n\n```toml\n[mcp_servers.my-python-server]\ncommand = \"python\"\nargs = [\"/path/to/your/mcp_server.py\"]\nenv = { GITHUB_TOKEN = \"$GITHUB_TOKEN\" }\n```\n\nKey details:\n\n- **`command`** -- the executable to launch your server (e.g., `python`, `python3`, or an absolute path to your Python interpreter).\n- **`args`** -- an array of arguments passed to the command. Point this at your MCP server script or module entry point.\n- **`env`** -- a table of environment variables forwarded to the server process. Using `\"$GITHUB_TOKEN\"` tells Codex to pull the value from your shell environment at runtime, so you do not hardcode the token in the config file. Make sure `GITHUB_TOKEN` is exported in your shell (e.g., via `export GITHUB_TOKEN=\"ghp_...\"` in your `.bashrc`/`.zshrc` or a `.env` loader).\n\nIf your Python server is packaged as a module you run with `python -m`, the config would look like:\n\n```toml\n[mcp_servers.my-python-server]\ncommand = \"python\"\nargs = [\"-m\", \"my_mcp_server\"]\nenv = { GITHUB_TOKEN = \"$GITHUB_TOKEN\" }\n```\n\nIf your MCP server exposes an HTTP endpoint instead of running as a stdio subprocess, use the `url` form:\n\n```toml\n[mcp_servers.my-python-server]\nurl = \"https://localhost:8080/mcp\"\n```\n\nIn that case you would manage the `GITHUB_TOKEN` env var in whatever process starts the HTTP server, not in the Codex config.\n\n### Verifying it works\n\nAfter adding the config, launch Codex and use the `/mcp` slash command in the TUI to list all available MCP tools. 
Your server's tools should appear in the list.\n\n### Per-project config\n\nYou can also place MCP server config in a project-level `.codex/config.toml` (in your project root) if the server is only relevant to a specific project. Project config is only loaded for trusted projects.\n\n---\n\n## Running Codex Itself as an MCP Server\n\nTo expose Codex CLI as an MCP server that other agents can consume, run:\n\n```bash\ncodex mcp\n```\n\nThis starts Codex in MCP server mode, making its capabilities (code reading, editing, execution) available as MCP tools to any MCP-compatible client or agent. Other agents can then connect to Codex the same way Codex connects to third-party MCP servers -- either via stdio or by pointing at the running process.\n\n---\n\n## Full Config Example Combining Both\n\nHere is a complete `~/.codex/config.toml` that sets up your Python MCP server, a GitHub MCP server (for reference), and general Codex settings:\n\n```toml\nmodel = \"gpt-5-codex\"\napproval_policy = \"on-request\"\n\n[sandbox_workspace_write]\nnetwork_access = false\n\n[mcp_servers.github]\ncommand = \"npx\"\nargs = [\"-y\", \"@modelcontextprotocol/server-github\"]\nenv = { GITHUB_TOKEN = \"$GITHUB_TOKEN\" }\n\n[mcp_servers.my-python-server]\ncommand = \"python\"\nargs = [\"/path/to/your/mcp_server.py\"]\nenv = { GITHUB_TOKEN = \"$GITHUB_TOKEN\" }\n```\n\nIf you need network access for your MCP server to make API calls, set `network_access = true` under `[sandbox_workspace_write]`, or pass it as a one-off flag:\n\n```bash\ncodex -c 'sandbox_workspace_write.network_access=true' \"Use my-python-server to fetch data\"\n```\n"}], "grading": {"expectations": [{"text": "Points to ~/.codex/config.toml for MCP server configuration", "passed": true, "evidence": "Line 5: 'Add the following to your Codex configuration file at ~/.codex/config.toml'"}, {"text": "Shows [mcp_servers.name] TOML section with command, args fields", "passed": true, "evidence": "Line 8: 
'[mcp_servers.my-python-server]' with command and args fields"}, {"text": "Shows $GITHUB_TOKEN or $ENV_VAR pattern for environment variable references", "passed": true, "evidence": "Line 11: 'env = { GITHUB_TOKEN = \"$GITHUB_TOKEN\" }'"}, {"text": "Shows 'codex mcp' command for running codex as an MCP server", "passed": true, "evidence": "Line 53: 'codex mcp'"}, {"text": "Uses correct Python command (python/python3) in the MCP server command field", "passed": true, "evidence": "Line 9: 'command = \"python\"'"}, {"text": "Does not invent non-existent config keys or MCP server fields", "passed": true, "evidence": "All config keys (command, args, env, url, model, approval_policy, network_access) match the skill documentation"}], "summary": {"passed": 6, "failed": 0, "total": 6, "pass_rate": 1.0}, "claims": [{"claim": "MCP servers can use url form for HTTP endpoints", "type": "factual", "verified": true, "evidence": "Matches skill doc: '[mcp_servers.my-api] url = \"https://my-server.example.com/mcp\"'"}], "eval_feedback": {"suggestions": [], "overall": "No suggestions, evals look solid"}}}, {"id": "eval-2-mcp-server-config-without_skill", "prompt": "How do I add a custom MCP server to codex CLI? I have a Python-based MCP server that needs a GITHUB_TOKEN env var. Also want to run codex itself as an MCP server for other agents to consume.", "eval_id": 2, "outputs": [{"name": "response.md", "type": "text", "content": "# Adding a Custom MCP Server to Codex CLI & Running Codex as an MCP Server\n\n## Part 1: Adding a Custom Python MCP Server with GITHUB_TOKEN\n\nCodex CLI supports MCP (Model Context Protocol) servers that extend its capabilities with custom tools. You configure them in your Codex CLI configuration file.\n\n### Configuration File Location\n\nCodex CLI reads its configuration from a JSON file. 
The typical location is:\n\n- **Linux/macOS**: `~/.codex/config.json` (or potentially `~/.config/codex/config.json`)\n- You can also use a project-local configuration\n\n### Adding Your Python MCP Server\n\nIn your Codex CLI configuration file, add an `mcpServers` section. For a Python-based MCP server that needs the `GITHUB_TOKEN` environment variable:\n\n```json\n{\n  \"mcpServers\": {\n    \"my-github-server\": {\n      \"command\": \"python\",\n      \"args\": [\"/path/to/your/mcp_server.py\"],\n      \"env\": {\n        \"GITHUB_TOKEN\": \"your-github-token-here\"\n      }\n    }\n  }\n}\n```\n\n### Configuration Fields\n\n| Field | Description |\n|-------|-------------|\n| `command` | The executable to run (e.g., `python`, `python3`, `node`) |\n| `args` | Array of arguments passed to the command |\n| `env` | Object of environment variables to set for the server process |\n\n### Using a Virtual Environment\n\nIf your Python MCP server has dependencies managed via a virtual environment, point to the Python binary inside the venv:\n\n```json\n{\n  \"mcpServers\": {\n    \"my-github-server\": {\n      \"command\": \"/path/to/venv/bin/python\",\n      \"args\": [\"/path/to/your/mcp_server.py\"],\n      \"env\": {\n        \"GITHUB_TOKEN\": \"your-github-token-here\"\n      }\n    }\n  }\n}\n```\n\n### Using uvx or pipx\n\nIf your MCP server is distributed as a Python package, you can use `uvx` (from the `uv` tool) or `pipx`:\n\n```json\n{\n  \"mcpServers\": {\n    \"my-github-server\": {\n      \"command\": \"uvx\",\n      \"args\": [\"my-github-mcp-server\"],\n      \"env\": {\n        \"GITHUB_TOKEN\": \"your-github-token-here\"\n      }\n    }\n  }\n}\n```\n\n### Security Tip for the Token\n\nRather than hardcoding the token in the config file, you can reference an environment variable that is already set in your shell session. However, the `env` field in the MCP config explicitly sets environment variables for the child process. 
To avoid storing secrets in config files, consider:\n\n1. Setting `GITHUB_TOKEN` in your shell profile (`.bashrc`, `.zshrc`) so it is inherited by child processes automatically. In that case you may not need the `env` field at all.\n2. Using a secrets manager or `.env` file that your MCP server reads directly.\n\n### Verifying the Server is Connected\n\nAfter configuring the MCP server, start Codex CLI. It should automatically launch and connect to your MCP server via stdio. The tools provided by your server will be available in the Codex session. You can verify by asking Codex to list available tools or by using one of the tools your server exposes.\n\n---\n\n## Part 2: Running Codex CLI as an MCP Server\n\nCodex CLI can itself be run as an MCP server, allowing other agents or tools that support MCP to consume it as a tool provider.\n\n### Starting Codex as an MCP Server\n\nUse the `--mcp-server` flag (or similar) to start Codex in MCP server mode:\n\n```bash\ncodex --mcp-server\n```\n\nThis starts Codex CLI listening on **stdio** (stdin/stdout) using the MCP protocol, making it available as a tool for other MCP-compatible clients.\n\n### Consuming Codex from Another Agent\n\nIn the consuming agent's MCP configuration, add Codex as a server:\n\n```json\n{\n  \"mcpServers\": {\n    \"codex\": {\n      \"command\": \"codex\",\n      \"args\": [\"--mcp-server\"],\n      \"env\": {\n        \"OPENAI_API_KEY\": \"your-openai-api-key\"\n      }\n    }\n  }\n}\n```\n\n### Example: Adding Codex as an MCP Server in Claude Desktop\n\nIf you are using Claude Desktop (which also supports MCP), you can add Codex to the Claude Desktop MCP config (typically `claude_desktop_config.json`):\n\n```json\n{\n  \"mcpServers\": {\n    \"codex\": {\n      \"command\": \"npx\",\n      \"args\": [\"@openai/codex\", \"--mcp-server\"],\n      \"env\": {\n        \"OPENAI_API_KEY\": \"your-openai-api-key\"\n      }\n    }\n  }\n}\n```\n\n### What Codex Exposes as an MCP Server\n\nWhen running 
as an MCP server, Codex typically exposes tools that allow the consuming agent to:\n\n- Execute coding tasks via Codex's AI capabilities\n- Run shell commands in a sandboxed environment\n- Perform file operations (read, write, edit)\n- Leverage Codex's code generation and reasoning abilities\n\n---\n\n## Summary\n\n| Goal | Configuration |\n|------|---------------|\n| Add custom MCP server to Codex | Add entry to `mcpServers` in Codex config with `command`, `args`, and `env` |\n| Pass `GITHUB_TOKEN` to MCP server | Use the `env` field in the server config |\n| Run Codex as an MCP server | Start with `codex --mcp-server` flag |\n| Consume Codex from another agent | Add Codex as an MCP server entry in the consuming agent's config |\n\n### Key Points\n\n- MCP servers communicate over **stdio** by default (stdin/stdout JSON-RPC messages).\n- The `env` field lets you pass environment variables like `GITHUB_TOKEN` to the server process.\n- Codex CLI can both **consume** MCP servers (as a client) and **be consumed** as an MCP server (acting as a server for other agents).\n- Make sure your Python MCP server implements the MCP protocol correctly (handles `initialize`, `tools/list`, `tools/call` methods over stdio).\n"}], "grading": {"expectations": [{"text": "Points to ~/.codex/config.toml for MCP server configuration", "passed": false, "evidence": "Uses '~/.codex/config.json' (line 11) \u00e2\u20ac\u201d wrong format. Codex CLI uses TOML (config.toml), not JSON."}, {"text": "Shows [mcp_servers.name] TOML section with command, args fields", "passed": false, "evidence": "Uses JSON 'mcpServers' key (line 20) instead of TOML [mcp_servers.name] sections. 
Entire config is in JSON format."}, {"text": "Shows $GITHUB_TOKEN or $ENV_VAR pattern for environment variable references", "passed": false, "evidence": "Hardcodes 'your-github-token-here' (line 25) instead of using the $GITHUB_TOKEN reference pattern that Codex supports for pulling from shell environment."}, {"text": "Shows 'codex mcp' command for running codex as an MCP server", "passed": false, "evidence": "Uses hallucinated 'codex --mcp-server' flag (line 98). The correct command is 'codex mcp' (a subcommand, not a flag)."}, {"text": "Uses correct Python command (python/python3) in the MCP server command field", "passed": true, "evidence": "Line 22: '\"command\": \"python\"'"}, {"text": "Does not invent non-existent config keys or MCP server fields", "passed": false, "evidence": "Hallucinated: JSON config format (mcpServers), --mcp-server flag, config.json file. Real format is TOML with [mcp_servers.name] sections."}], "summary": {"passed": 1, "failed": 5, "total": 6, "pass_rate": 0.167}, "claims": [{"claim": "Codex uses config.json for configuration", "type": "factual", "verified": false, "evidence": "Codex CLI uses ~/.codex/config.toml (TOML format)"}, {"claim": "codex --mcp-server starts MCP server mode", "type": "factual", "verified": false, "evidence": "Correct command is 'codex mcp' (subcommand)"}], "eval_feedback": {"suggestions": [], "overall": "Baseline heavily hallucinated \u00e2\u20ac\u201d wrong config format, wrong MCP command, hardcoded secrets"}}}, {"id": "eval-3-profiles-troubleshooting-with_skill", "prompt": "I'm getting Landlock errors on WSL2 when running codex. Also, I need to set up two profiles in config.toml \u00e2\u20ac\u201d one for safe daily dev and one for CI with no approvals. 
How do I fix the Landlock issue and configure the profiles?", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Fixing Landlock Errors on WSL2 and Setting Up Codex Profiles\n\n## Fixing the Landlock Error\n\nLandlock is a Linux kernel security feature that Codex uses for filesystem sandboxing. WSL2 kernels often ship with an older version that does not fully support Landlock, which causes Codex to fail when it tries to enforce sandbox restrictions.\n\n### Options to fix it\n\n**Option 1: Update your WSL2 kernel** (recommended long-term fix)\n\nUpdate WSL2 to get a kernel version that supports Landlock (5.13+, ideally 6.1+):\n\n```bash\n# From a Windows PowerShell (admin):\nwsl --update\n\n# Then restart WSL:\nwsl --shutdown\n```\n\nAfter restarting, verify the kernel version inside WSL:\n\n```bash\nuname -r\n```\n\nIf the version is 5.13 or higher, Landlock should work and the error should be resolved.\n\n**Option 2: Bypass the sandbox** (for isolated/trusted environments only)\n\nIf updating the kernel is not an option, you can bypass the sandbox entirely using the `--dangerously-bypass-approvals-and-sandbox` flag (aliased as `--yolo`):\n\n```bash\ncodex --dangerously-bypass-approvals-and-sandbox \"your task here\"\n```\n\nOnly use this in isolated environments (containers, throwaway VMs, CI runners) since it removes all filesystem and approval restrictions.\n\n**Option 3: Use `danger-full-access` sandbox mode**\n\nA slightly less drastic alternative is to use the unrestricted sandbox mode while keeping approval prompts active:\n\n```bash\ncodex -s danger-full-access \"your task here\"\n```\n\nThis avoids the Landlock syscall entirely while still requiring approval for sensitive actions.\n\n---\n\n## Setting Up Two Profiles in config.toml\n\nEdit (or create) `~/.codex/config.toml` and add the following:\n\n```toml\n# Default settings (used when no profile is specified)\nmodel = \"gpt-5-codex\"\napproval_policy = 
\"on-request\"\n\n[sandbox_workspace_write]\nnetwork_access = false\n\n# ----------------------------------------------------------\n# Profile: safe daily development\n# ----------------------------------------------------------\n[profiles.dev]\nmodel = \"gpt-5-codex\"\napproval_policy = \"on-request\"\n# This is equivalent to --full-auto: it will ask before\n# sensitive actions but auto-approve reads and workspace writes.\n\n# ----------------------------------------------------------\n# Profile: CI with no approvals\n# ----------------------------------------------------------\n[profiles.ci]\nmodel = \"gpt-5-codex\"\napproval_policy = \"never\"\n# \"never\" means Codex will not pause for any approval prompts,\n# making it suitable for fully automated CI pipelines.\n```\n\n### Using the profiles\n\n```bash\n# Daily development (safe, asks before sensitive actions)\ncodex --profile dev \"Refactor the auth module\"\n\n# CI pipeline (no approvals, fully automated)\ncodex --profile ci \"Run tests and fix failures\"\n\n# Short form\ncodex -p dev \"task\"\ncodex -p ci \"task\"\n```\n\n### Setting a default profile\n\nIf you want `dev` to be your default without typing `--profile` every time, add this at the top level of your config:\n\n```toml\nprofile = \"dev\"\n```\n\nThen you only need to explicitly pass `--profile ci` when running in CI.\n\n### GitHub Actions example using the CI profile\n\n```yaml\n- name: Run Codex\n  run: |\n    npm i -g @openai/codex\n    codex exec --profile ci \"Update CHANGELOG for this release\"\n  env:\n    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n```\n\n### Config precedence reminder\n\nCLI flags always override profile settings, which override user config defaults:\n\n1. CLI flags (`--model`, `-a`, `-c key=value`)\n2. Project config (`.codex/config.toml` in project root)\n3. User config (`~/.codex/config.toml`, including profiles)\n4. 
Built-in defaults\n\nSo even with a profile active, you can override individual settings on the command line, for example:\n\n```bash\ncodex --profile ci --model gpt-5-pro \"task\"\n```\n"}], "grading": {"expectations": [{"text": "Mentions updating WSL2 kernel or using --dangerously-bypass-approvals-and-sandbox (--yolo) as workarounds", "passed": true, "evidence": "Option 1: 'wsl --update' to update kernel. Option 2: '--dangerously-bypass-approvals-and-sandbox' with '(aliased as --yolo)'"}, {"text": "Shows correct [profiles.name] TOML syntax for defining profiles", "passed": true, "evidence": "Lines 66-79: '[profiles.dev]' and '[profiles.ci]' sections in config.toml"}, {"text": "CI profile uses approval_policy = 'never' for fully automated mode", "passed": true, "evidence": "Line 77: 'approval_policy = \"never\"'"}, {"text": "Dev profile uses a safe approval_policy like 'on-request' or 'untrusted'", "passed": true, "evidence": "Line 68: 'approval_policy = \"on-request\"'"}, {"text": "Shows --profile flag for selecting a profile at runtime", "passed": true, "evidence": "Lines 85-93: 'codex --profile dev' and 'codex -p dev'"}, {"text": "Does not invent non-existent troubleshooting steps or config options", "passed": true, "evidence": "All flags (--dangerously-bypass-approvals-and-sandbox, --yolo, -s danger-full-access, --profile, -p) and config keys (approval_policy, model, profile) match the skill documentation"}], "summary": {"passed": 6, "failed": 0, "total": 6, "pass_rate": 1.0}, "claims": [{"claim": "danger-full-access sandbox mode avoids Landlock syscall", "type": "factual", "verified": true, "evidence": "Reasonable inference \u00e2\u20ac\u201d danger-full-access provides unrestricted access per the skill doc"}], "eval_feedback": {"suggestions": [], "overall": "No suggestions, evals look solid"}}}, {"id": "eval-3-profiles-troubleshooting-without_skill", "prompt": "I'm getting Landlock errors on WSL2 when running codex. 
Also, I need to set up two profiles in config.toml \u00e2\u20ac\u201d one for safe daily dev and one for CI with no approvals. How do I fix the Landlock issue and configure the profiles?", "eval_id": 3, "outputs": [{"name": "response.md", "type": "text", "content": "# Fixing Landlock Errors on WSL2 and Configuring Codex Profiles\n\n## Part 1: Fixing Landlock Errors on WSL2\n\n### What is the problem?\n\nLandlock is a Linux kernel security module (available since kernel 5.13) that Codex CLI uses for sandboxing \u00e2\u20ac\u201d it restricts which files and directories a process can access. On WSL2, Landlock support depends on your WSL kernel version, and many WSL2 distributions ship with kernels that either do not support Landlock or have incomplete support, leading to errors when Codex tries to enable its sandbox.\n\nTypical error messages look like:\n\n```\nError: Landlock is not supported by the running kernel\n```\n\nor\n\n```\nLandlock sandbox initialization failed\n```\n\n### Solutions\n\n#### Option 1: Disable the sandbox (quickest fix)\n\nYou can disable Codex's sandboxing entirely by passing the `--no-sandbox` flag or by setting the `sandbox` option to `false` in your configuration:\n\n**Via CLI flag:**\n```bash\ncodex --no-sandbox \"your prompt here\"\n```\n\n**Via environment variable:**\n```bash\nexport CODEX_SANDBOX_TYPE=none\ncodex \"your prompt here\"\n```\n\n**Via config.toml** (see Part 2 below for file location):\n```toml\nsandbox = \"none\"\n```\n\n#### Option 2: Update your WSL2 kernel\n\nIf you want to keep sandboxing enabled, update your WSL kernel to a version that supports Landlock (5.13+, ideally 6.1+):\n\n1. Open PowerShell as Administrator on the Windows host:\n   ```powershell\n   wsl --update\n   ```\n\n2. Restart WSL:\n   ```powershell\n   wsl --shutdown\n   ```\n\n3. Verify the kernel version inside WSL:\n   ```bash\n   uname -r\n   ```\n   You need at least 5.13. WSL2 kernels based on 5.15 or 6.x should have Landlock support.\n\n4. 
Check Landlock availability:\n   ```bash\n   # Check if Landlock is enabled in the kernel\n   cat /sys/kernel/security/lsm\n   ```\n   The output should include `landlock` in the comma-separated list. If it does not, your kernel was compiled without Landlock support.\n\n#### Option 3: Use a custom WSL kernel with Landlock enabled\n\nIf `wsl --update` does not give you a Landlock-capable kernel, you can compile a custom WSL kernel with Landlock enabled:\n\n1. Clone the WSL2 kernel source from Microsoft's GitHub repository.\n2. Enable `CONFIG_SECURITY_LANDLOCK=y` in the kernel config.\n3. Build and point WSL to the custom kernel via `.wslconfig`:\n\n   In `%USERPROFILE%\\.wslconfig`:\n   ```ini\n   [wsl2]\n   kernel=C:\\\\path\\\\to\\\\custom\\\\bzImage\n   ```\n\n4. Restart WSL with `wsl --shutdown`.\n\nThis is an advanced approach and usually unnecessary if `wsl --update` brings you to a recent kernel.\n\n---\n\n## Part 2: Configuring Two Profiles in config.toml\n\nCodex CLI uses a TOML configuration file located at:\n\n```\n~/.codex/config.toml\n```\n\n(On some setups this may be at `~/.config/codex/config.toml` depending on the version.)\n\n### Profile structure\n\nCodex supports a default configuration section and named profiles. The default section applies when no profile is specified. 
Named profiles are defined under `[profile.<name>]` sections and override the defaults.\n\n### Configuration file with two profiles\n\n```toml\n# =============================================================\n# ~/.codex/config.toml\n# =============================================================\n\n# Default model used across all profiles\nmodel = \"o4-mini\"\n\n# Default approval mode\napproval_mode = \"suggest\"\n\n# =============================================================\n# Profile 1: \"dev\" \u00e2\u20ac\u201d Safe daily development\n# =============================================================\n[profile.dev]\nmodel = \"o4-mini\"\napproval_mode = \"suggest\"\nsandbox = \"permissive\"\n\n# In \"suggest\" mode, Codex will:\n#   - Automatically approve read-only operations (file reads, searches)\n#   - Prompt you for approval before any write operation (file edits,\n#     shell commands that modify the filesystem)\n#   - This is the safest interactive mode for daily development\n\n# \"permissive\" sandbox allows reads anywhere but restricts writes\n# to the current working directory and common temp directories.\n\n# =============================================================\n# Profile 2: \"ci\" \u00e2\u20ac\u201d CI/CD with no approvals needed\n# =============================================================\n[profile.ci]\nmodel = \"o4-mini\"\napproval_mode = \"full-auto\"\nsandbox = \"none\"\n\n# In \"full-auto\" mode, Codex will:\n#   - Automatically approve ALL operations without prompting\n#   - Execute read and write operations, shell commands, etc.\n#   - Suitable for CI/CD pipelines where no human is present\n#\n# sandbox = \"none\" disables Landlock sandboxing entirely,\n# which avoids Landlock issues in CI environments (Docker,\n# WSL, older kernels) and removes filesystem restrictions.\n```\n\n### Using profiles\n\nTo run Codex with a specific profile, use the `--profile` flag:\n\n```bash\n# Daily development (safe mode with approvals)\ncodex 
--profile dev \"refactor the auth module\"\n\n# CI pipeline (no approvals, no sandbox)\ncodex --profile ci \"run the test suite and fix any failures\"\n```\n\n### Approval mode reference\n\n| Mode | Behavior |\n|------|----------|\n| `suggest` | Auto-approves reads; prompts for writes and commands |\n| `auto-edit` | Auto-approves reads and file edits; prompts for shell commands |\n| `full-auto` | Auto-approves everything \u00e2\u20ac\u201d no prompts at all |\n\n### Sandbox mode reference\n\n| Mode | Behavior |\n|------|----------|\n| `permissive` | Allows reads everywhere; restricts writes to cwd and temp dirs |\n| `none` | Disables sandboxing entirely (needed when Landlock is unavailable) |\n\n### CI/CD pipeline example\n\nIn a GitHub Actions workflow or similar CI environment:\n\n```yaml\n- name: Run Codex\n  env:\n    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n  run: |\n    codex --profile ci \"run tests and report results\"\n```\n\nOr without profiles, using flags directly:\n\n```yaml\n- name: Run Codex\n  env:\n    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n  run: |\n    codex --approval-mode full-auto --sandbox none \"run tests and report results\"\n```\n\n---\n\n## Summary\n\n| Problem | Solution |\n|---------|----------|\n| Landlock errors on WSL2 | Set `sandbox = \"none\"` in config or use `--no-sandbox`, or update the WSL kernel via `wsl --update` |\n| Safe daily dev profile | Use `approval_mode = \"suggest\"` with `sandbox = \"permissive\"` |\n| CI profile with no approvals | Use `approval_mode = \"full-auto\"` with `sandbox = \"none\"` |\n"}], "grading": {"expectations": [{"text": "Mentions updating WSL2 kernel or using --dangerously-bypass-approvals-and-sandbox (--yolo) as workarounds", "passed": false, "evidence": "Mentions 'wsl --update' (correct) but does NOT mention --dangerously-bypass-approvals-and-sandbox or --yolo. 
Instead invents '--no-sandbox' flag and 'CODEX_SANDBOX_TYPE=none' env var, neither of which exist."}, {"text": "Shows correct [profiles.name] TOML syntax for defining profiles", "passed": false, "evidence": "Uses '[profile.dev]' and '[profile.ci]' (singular 'profile') instead of the correct '[profiles.dev]' and '[profiles.ci]' (plural 'profiles')."}, {"text": "CI profile uses approval_policy = 'never' for fully automated mode", "passed": false, "evidence": "Uses 'approval_mode = \"full-auto\"' \u00e2\u20ac\u201d wrong key name (should be approval_policy) and wrong value (should be 'never'). 'full-auto' is a CLI shortcut flag, not an approval_policy value."}, {"text": "Dev profile uses a safe approval_policy like 'on-request' or 'untrusted'", "passed": false, "evidence": "Uses 'approval_mode = \"suggest\"' \u00e2\u20ac\u201d wrong key name (should be approval_policy) and 'suggest' is not a valid approval policy value. Valid values are: untrusted, on-request, never, reject."}, {"text": "Shows --profile flag for selecting a profile at runtime", "passed": true, "evidence": "Line 157: 'codex --profile dev' and 'codex --profile ci'"}, {"text": "Does not invent non-existent troubleshooting steps or config options", "passed": false, "evidence": "Hallucinated: '--no-sandbox' flag, 'CODEX_SANDBOX_TYPE=none' env var, 'sandbox = none/permissive' config keys, 'approval_mode' key (should be approval_policy), 'suggest'/'auto-edit'/'full-auto' as config values, '[profile.name]' syntax (should be [profiles.name])."}], "summary": {"passed": 1, "failed": 5, "total": 6, "pass_rate": 0.167}, "claims": [{"claim": "Approval modes are suggest, auto-edit, full-auto", "type": "factual", "verified": false, "evidence": "Real approval_policy values: untrusted, on-request, never, reject"}, {"claim": "--no-sandbox disables sandboxing", "type": "factual", "verified": false, "evidence": "No --no-sandbox flag exists. 
The bypass flag is --dangerously-bypass-approvals-and-sandbox (--yolo)"}, {"claim": "Profile sections use [profile.name] syntax", "type": "factual", "verified": false, "evidence": "Correct syntax is [profiles.name] (plural)"}], "eval_feedback": {"suggestions": [], "overall": "Baseline massively hallucinated \u00e2\u20ac\u201d wrong config keys, wrong values, invented flags and env vars"}}}], "previous_feedback": {}, "previous_outputs": {}, "benchmark": {"metadata": {"skill_name": "codex-cli", "skill_path": "src/skills/codex-cli/SKILL.md", "executor_model": "claude-opus-4-6", "timestamp": "2026-03-11T00:00:00Z", "evals_run": 3, "runs_per_configuration": 1}, "runs": [{"eval_id": 1, "eval_name": "ci-integration", "configuration": "with_skill", "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6}, "timing": {"total_tokens": 68449, "duration_ms": 60126}}, {"eval_id": 1, "eval_name": "ci-integration", "configuration": "without_skill", "result": {"pass_rate": 0.5, "passed": 3, "failed": 3, "total": 6}, "timing": {"total_tokens": 64960, "duration_ms": 44585}}, {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "with_skill", "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6}, "timing": {"total_tokens": 67753, "duration_ms": 38647}}, {"eval_id": 2, "eval_name": "mcp-server-config", "configuration": "without_skill", "result": {"pass_rate": 0.167, "passed": 1, "failed": 5, "total": 6}, "timing": {"total_tokens": 64950, "duration_ms": 41699}}, {"eval_id": 3, "eval_name": "profiles-troubleshooting", "configuration": "with_skill", "result": {"pass_rate": 1.0, "passed": 6, "failed": 0, "total": 6}, "timing": {"total_tokens": 67804, "duration_ms": 38914}}, {"eval_id": 3, "eval_name": "profiles-troubleshooting", "configuration": "without_skill", "result": {"pass_rate": 0.167, "passed": 1, "failed": 5, "total": 6}, "timing": {"total_tokens": 65148, "duration_ms": 51576}}], "run_summary": {"with_skill": {"mean_pass_rate": 1.0, 
"stddev_pass_rate": 0.0, "mean_tokens": 68002, "mean_duration_ms": 45896}, "without_skill": {"mean_pass_rate": 0.278, "stddev_pass_rate": 0.192, "mean_tokens": 65019, "mean_duration_ms": 45953}, "delta": {"pass_rate": "+72.2%", "tokens": "+4.6%", "duration": "-0.1%"}}, "notes": "The skill achieves 100% pass rate across all 3 evals vs 27.8% baseline. Without the skill, Claude heavily hallucinated CLI flags (--approval-mode, --sandbox write-only, --no-sandbox, --mcp-server), config format (JSON instead of TOML), config keys (approval_mode instead of approval_policy), and approval mode values (suggest/auto-edit/full-auto instead of untrusted/on-request/never/reject). The skill completely eliminates these hallucinations at negligible additional token cost."}};
-    // ---- State ----
-    let feedbackMap = {};  // run_id -> feedback text
-    let currentIndex = 0;  // index into EMBEDDED_DATA.runs of the run currently shown (set by showRun)
-    let visitedRuns = new Set();  // run indices that showRun has displayed at least once
-    // ---- Init ----
-    async function init() {
-      // Load saved feedback from server — but only if this isn't a fresh
-      // iteration (indicated by previous_feedback being present). When
-      // previous feedback exists, the feedback.json on disk is stale from
-      // the prior iteration and should not pre-fill the textareas.
-      const hasPrevious = Object.keys(EMBEDDED_DATA.previous_feedback || {}).length > 0
-        || Object.keys(EMBEDDED_DATA.previous_outputs || {}).length > 0;
-      if (!hasPrevious) {
-        try {
-          const resp = await fetch("/api/feedback");
-          const data = await resp.json();
-          if (data.reviews) {
-            for (const r of data.reviews) feedbackMap[r.run_id] = r.feedback;
-          }
-        } catch { /* first run, no feedback yet */ }
-      }
-      document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
-      showRun(0);
-      // Wire up feedback auto-save
-      const textarea = document.getElementById("feedback");
-      let saveTimeout = null;
-      textarea.addEventListener("input", () => {
-        clearTimeout(saveTimeout);
-        document.getElementById("feedback-status").textContent = "";
-        saveTimeout = setTimeout(() => saveCurrentFeedback(), 800);
-      });
-    }
-    // ---- Navigation ----
-    function navigate(delta) {
-      const newIndex = currentIndex + delta;
-      if (newIndex >= 0 && newIndex < EMBEDDED_DATA.runs.length) {
-        saveCurrentFeedback();
-        showRun(newIndex);
-      }
-    }
-    function updateNavButtons() {
-      document.getElementById("prev-btn").disabled = currentIndex === 0;
-      document.getElementById("next-btn").disabled =
-        currentIndex === EMBEDDED_DATA.runs.length - 1;
-    }
-    // ---- Show a run ----
-    function showRun(index) {
-      currentIndex = index;
-      const run = EMBEDDED_DATA.runs[index];
-      // Progress
-      document.getElementById("progress").textContent =
-        `${index + 1} of ${EMBEDDED_DATA.runs.length}`;
-      // Prompt
-      document.getElementById("prompt-text").textContent = run.prompt;
-      // Config badge
-      const badge = document.getElementById("config-badge");
-      const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
-      if (configMatch) {
-        const config = configMatch[1];
-        const isBaseline = config === "without_skill" || config === "old_skill";
-        badge.textContent = config.replace(/_/g, " ");
-        badge.className = "config-badge " + (isBaseline ? "config-baseline" : "config-primary");
-        badge.style.display = "inline-block";
-      } else {
-        badge.style.display = "none";
-      }
-      // Outputs
-      renderOutputs(run);
-      // Previous outputs
-      renderPrevOutputs(run);
-      // Grades
-      renderGrades(run);
-      // Previous feedback
-      const prevFb = (EMBEDDED_DATA.previous_feedback || {})[run.id];
-      const prevEl = document.getElementById("prev-feedback");
-      if (prevFb) {
-        document.getElementById("prev-feedback-text").textContent = prevFb;
-        prevEl.style.display = "block";
-      } else {
-        prevEl.style.display = "none";
-      }
-      // Feedback
-      document.getElementById("feedback").value = feedbackMap[run.id] || "";
-      document.getElementById("feedback-status").textContent = "";
-      updateNavButtons();
-      // Track visited runs and promote done button when all visited
-      visitedRuns.add(index);
-      const doneBtn = document.getElementById("done-btn");
-      if (visitedRuns.size >= EMBEDDED_DATA.runs.length) {
-        doneBtn.classList.add("ready");
-      }
-      // Scroll main content to top
-      document.querySelector(".main").scrollTop = 0;
-    }
-    // ---- Render outputs ----
-    function renderOutputs(run) {
-      const container = document.getElementById("outputs-body");
-      container.innerHTML = "";
-      const outputs = run.outputs || [];
-      if (outputs.length === 0) {
-        container.innerHTML = '<div class="empty-state">No output files</div>';
-        return;
-      }
-      for (const file of outputs) {
-        const fileDiv = document.createElement("div");
-        fileDiv.className = "output-file";
-        // Always show file header with download link
-        const header = document.createElement("div");
-        header.className = "output-file-header";
-        const nameSpan = document.createElement("span");
-        nameSpan.textContent = file.name;
-        header.appendChild(nameSpan);
-        const dlBtn = document.createElement("a");
-        dlBtn.className = "dl-btn";
-        dlBtn.textContent = "Download";
-        dlBtn.download = file.name;
-        dlBtn.href = getDownloadUri(file);
-        header.appendChild(dlBtn);
-        fileDiv.appendChild(header);
-        const content = document.createElement("div");
-        content.className = "output-file-content";
-        if (file.type === "text") {
-          const pre = document.createElement("pre");
-          pre.textContent = file.content;
-          content.appendChild(pre);
-        } else if (file.type === "image") {
-          const img = document.createElement("img");
-          img.src = file.data_uri;
-          img.alt = file.name;
-          content.appendChild(img);
-        } else if (file.type === "pdf") {
-          const iframe = document.createElement("iframe");
-          iframe.src = file.data_uri;
-          content.appendChild(iframe);
-        } else if (file.type === "xlsx") {
-          renderXlsx(content, file.data_b64);
-        } else if (file.type === "binary") {
-          const a = document.createElement("a");
-          a.className = "download-link";
-          a.href = file.data_uri;
-          a.download = file.name;
-          a.textContent = "Download " + file.name;
-          content.appendChild(a);
-        } else if (file.type === "error") {
-          const pre = document.createElement("pre");
-          pre.textContent = file.content;
-          pre.style.color = "var(--red)";
-          content.appendChild(pre);
-        }
-        fileDiv.appendChild(content);
-        container.appendChild(fileDiv);
-      }
-    }
-    // ---- XLSX rendering via SheetJS ----
-    function renderXlsx(container, b64Data) {
-      try {
-        const raw = Uint8Array.from(atob(b64Data), c => c.charCodeAt(0));
-        const wb = XLSX.read(raw, { type: "array" });
-        for (let i = 0; i < wb.SheetNames.length; i++) {
-          const sheetName = wb.SheetNames[i];
-          const ws = wb.Sheets[sheetName];
-          if (wb.SheetNames.length > 1) {
-            const sheetLabel = document.createElement("div");
-            sheetLabel.style.cssText =
-              "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
-            sheetLabel.textContent = "Sheet: " + sheetName;
-            container.appendChild(sheetLabel);
-          }
-          const htmlStr = XLSX.utils.sheet_to_html(ws, { editable: false });
-          const wrapper = document.createElement("div");
-          wrapper.innerHTML = htmlStr;
-          container.appendChild(wrapper);
-        }
-      } catch (err) {
-        container.textContent = "Error rendering spreadsheet: " + err.message;
-      }
-    }
-    // ---- Grades ----
-    function renderGrades(run) {
-      const section = document.getElementById("grades-section");
-      const content = document.getElementById("grades-content");
-      if (!run.grading) {
-        section.style.display = "none";
-        return;
-      }
-      const grading = run.grading;
-      section.style.display = "block";
-      // Reset to collapsed
-      content.classList.remove("open");
-      document.getElementById("grades-arrow").classList.remove("open");
-      const summary = grading.summary || {};
-      const expectations = grading.expectations || [];
-      let html = '<div style="padding: 1rem;">';
-      // Summary line
-      const passRate = summary.pass_rate != null
-        ? Math.round(summary.pass_rate * 100) + "%"
-        : "?";
-      const badgeClass = summary.pass_rate >= 0.8 ? "grade-pass" : summary.pass_rate >= 0.5 ? "" : "grade-fail";
-      html += '<div class="grades-summary">';
-      html += '<span class="grade-badge ' + badgeClass + '">' + passRate + '</span>';
-      html += '<span>' + (summary.passed || 0) + ' passed, ' + (summary.failed || 0) + ' failed of ' + (summary.total || 0) + '</span>';
-      html += '</div>';
-      // Assertions list
-      html += '<ul class="assertion-list">';
-      for (const exp of expectations) {
-        const statusClass = exp.passed ? "pass" : "fail";
-        const statusIcon = exp.passed ? "\u2713" : "\u2717";
-        html += '<li class="assertion-item">';
-        html += '<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>';
-        html += '<span>' + escapeHtml(exp.text) + '</span>';
-        if (exp.evidence) {
-          html += '<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>';
-        }
-        html += '</li>';
-      }
-      html += '</ul>';
-      html += '</div>';
-      content.innerHTML = html;
-    }
-    function toggleGrades() {
-      const content = document.getElementById("grades-content");
-      const arrow = document.getElementById("grades-arrow");
-      content.classList.toggle("open");
-      arrow.classList.toggle("open");
-    }
-    // ---- Previous outputs (collapsible) ----
-    function renderPrevOutputs(run) {
-      const section = document.getElementById("prev-outputs-section");
-      const content = document.getElementById("prev-outputs-content");
-      const prevOutputs = (EMBEDDED_DATA.previous_outputs || {})[run.id];
-      if (!prevOutputs || prevOutputs.length === 0) {
-        section.style.display = "none";
-        return;
-      }
-      section.style.display = "block";
-      // Reset to collapsed
-      content.classList.remove("open");
-      document.getElementById("prev-outputs-arrow").classList.remove("open");
-      // Render the files into the content area
-      content.innerHTML = "";
-      const wrapper = document.createElement("div");
-      wrapper.style.padding = "1rem";
-      for (const file of prevOutputs) {
-        const fileDiv = document.createElement("div");
-        fileDiv.className = "output-file";
-        const header = document.createElement("div");
-        header.className = "output-file-header";
-        const nameSpan = document.createElement("span");
-        nameSpan.textContent = file.name;
-        header.appendChild(nameSpan);
-        const dlBtn = document.createElement("a");
-        dlBtn.className = "dl-btn";
-        dlBtn.textContent = "Download";
-        dlBtn.download = file.name;
-        dlBtn.href = getDownloadUri(file);
-        header.appendChild(dlBtn);
-        fileDiv.appendChild(header);
-        const fc = document.createElement("div");
-        fc.className = "output-file-content";
-        if (file.type === "text") {
-          const pre = document.createElement("pre");
-          pre.textContent = file.content;
-          fc.appendChild(pre);
-        } else if (file.type === "image") {
-          const img = document.createElement("img");
-          img.src = file.data_uri;
-          img.alt = file.name;
-          fc.appendChild(img);
-        } else if (file.type === "pdf") {
-          const iframe = document.createElement("iframe");
-          iframe.src = file.data_uri;
-          fc.appendChild(iframe);
-        } else if (file.type === "xlsx") {
-          renderXlsx(fc, file.data_b64);
-        } else if (file.type === "binary") {
-          const a = document.createElement("a");
-          a.className = "download-link";
-          a.href = file.data_uri;
-          a.download = file.name;
-          a.textContent = "Download " + file.name;
-          fc.appendChild(a);
-        }
-        fileDiv.appendChild(fc);
-        wrapper.appendChild(fileDiv);
-      }
-      content.appendChild(wrapper);
-    }
-    function togglePrevOutputs() {
-      const content = document.getElementById("prev-outputs-content");
-      const arrow = document.getElementById("prev-outputs-arrow");
-      content.classList.toggle("open");
-      arrow.classList.toggle("open");
-    }
-    // ---- Feedback (saved to server -> feedback.json) ----
-    function saveCurrentFeedback() {
-      const run = EMBEDDED_DATA.runs[currentIndex];
-      const text = document.getElementById("feedback").value;
-      if (text.trim() === "") {
-        delete feedbackMap[run.id];
-      } else {
-        feedbackMap[run.id] = text;
-      }
-      // Build reviews array from map
-      const reviews = [];
-      for (const [run_id, feedback] of Object.entries(feedbackMap)) {
-        if (feedback.trim()) {
-          reviews.push({ run_id, feedback, timestamp: new Date().toISOString() });
-        }
-      }
-      fetch("/api/feedback", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ reviews, status: "in_progress" }),
-      }).then(() => {
-        document.getElementById("feedback-status").textContent = "Saved";
-      }).catch(() => {
-        // Static mode or server unavailable — no-op on auto-save,
-        // feedback will be downloaded on final submit
-        document.getElementById("feedback-status").textContent = "Will download on submit";
-      });
-    }
-    // ---- Done ----
-    function showDoneDialog() {
-      // Save current textarea to feedbackMap (but don't POST yet)
-      const run = EMBEDDED_DATA.runs[currentIndex];
-      const text = document.getElementById("feedback").value;
-      if (text.trim() === "") {
-        delete feedbackMap[run.id];
-      } else {
-        feedbackMap[run.id] = text;
-      }
-      // POST once with status: complete — include ALL runs so the model
-      // can distinguish "no feedback" (looks good) from "not reviewed"
-      const reviews = [];
-      const ts = new Date().toISOString();
-      for (const r of EMBEDDED_DATA.runs) {
-        reviews.push({ run_id: r.id, feedback: feedbackMap[r.id] || "", timestamp: ts });
-      }
-      const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);
-      fetch("/api/feedback", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: payload,
-      }).then(() => {
-        document.getElementById("done-overlay").classList.add("visible");
-      }).catch(() => {
-        // Server not available (static mode) — download as file
-        const blob = new Blob([payload], { type: "application/json" });
-        const url = URL.createObjectURL(blob);
-        const a = document.createElement("a");
-        a.href = url;
-        a.download = "feedback.json";
-        a.click();
-        URL.revokeObjectURL(url);
-        document.getElementById("done-overlay").classList.add("visible");
-      });
-    }
-    function closeDoneDialog() {
-      // Reset status back to in_progress
-      saveCurrentFeedback();
-      document.getElementById("done-overlay").classList.remove("visible");
-    }
-    // ---- Toast ----
-    function showToast(message) {
-      const toast = document.getElementById("toast");
-      toast.textContent = message;
-      toast.classList.add("visible");
-      setTimeout(() => toast.classList.remove("visible"), 2000);
-    }
-    // ---- Keyboard nav ----
-    document.addEventListener("keydown", (e) => {
-      // Don't capture when typing in textarea
-      if (e.target.tagName === "TEXTAREA") return;
-      if (e.key === "ArrowLeft" || e.key === "ArrowUp") {
-        e.preventDefault();
-        navigate(-1);
-      } else if (e.key === "ArrowRight" || e.key === "ArrowDown") {
-        e.preventDefault();
-        navigate(1);
-      }
-    });
-    // ---- Util ----
-    function getDownloadUri(file) {
-      if (file.data_uri) return file.data_uri;
-      if (file.data_b64) return "data:application/octet-stream;base64," + file.data_b64;
-      if (file.type === "text") return "data:text/plain;charset=utf-8," + encodeURIComponent(file.content);
-      return "#";
-    }
-    function escapeHtml(text) {
-      const div = document.createElement("div");
-      div.textContent = text;
-      return div.innerHTML;
-    }
-    // ---- View switching ----
-    function switchView(view) {
-      document.querySelectorAll(".view-tab").forEach(t => t.classList.remove("active"));
-      document.querySelectorAll(".view-panel").forEach(p => p.classList.remove("active"));
-      document.querySelector(`[onclick="switchView('${view}')"]`).classList.add("active");
-      document.getElementById("panel-" + view).classList.add("active");
-    }
-    // ---- Benchmark rendering ----
-    function renderBenchmark() {
-      const data = EMBEDDED_DATA.benchmark;
-      if (!data) return;
-      // Show the tabs
-      document.getElementById("view-tabs").style.display = "flex";
-      const container = document.getElementById("benchmark-content");
-      const summary = data.run_summary || {};
-      const metadata = data.metadata || {};
-      const notes = data.notes || [];
-      let html = "";
-      // Header
-      html += "<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>";
-      html += "<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>";
-      if (metadata.skill_name) html += "<strong>" + escapeHtml(metadata.skill_name) + "</strong> &mdash; ";
-      if (metadata.timestamp) html += metadata.timestamp + " &mdash; ";
-      if (metadata.evals_run) html += "Evals: " + metadata.evals_run.join(", ") + " &mdash; ";
-      html += (metadata.runs_per_configuration || "?") + " runs per configuration";
-      html += "</p>";
-      // Summary table
-      html += '<table class="benchmark-table">';
-      function fmtStat(stat, pct) {
-        if (!stat) return "—";
-        const suffix = pct ? "%" : "";
-        const m = pct ? (stat.mean * 100).toFixed(0) : stat.mean.toFixed(1);
-        const s = pct ? (stat.stddev * 100).toFixed(0) : stat.stddev.toFixed(1);
-        return m + suffix + " ± " + s + suffix;
-      }
-      function deltaClass(val) {
-        if (!val) return "";
-        const n = parseFloat(val);
-        if (n > 0) return "benchmark-delta-positive";
-        if (n < 0) return "benchmark-delta-negative";
-        return "";
-      }
-      // Discover config names dynamically (everything except "delta")
-      const configs = Object.keys(summary).filter(k => k !== "delta");
-      const configA = configs[0] || "config_a";
-      const configB = configs[1] || "config_b";
-      const labelA = configA.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());
-      const labelB = configB.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());
-      const a = summary[configA] || {};
-      const b = summary[configB] || {};
-      const delta = summary.delta || {};
-      html += "<thead><tr><th>Metric</th><th>" + escapeHtml(labelA) + "</th><th>" + escapeHtml(labelB) + "</th><th>Delta</th></tr></thead>";
-      html += "<tbody>";
-      html += "<tr><td><strong>Pass Rate</strong></td>";
-      html += "<td>" + fmtStat(a.pass_rate, true) + "</td>";
-      html += "<td>" + fmtStat(b.pass_rate, true) + "</td>";
-      html += '<td class="' + deltaClass(delta.pass_rate) + '">' + (delta.pass_rate || "—") + "</td></tr>";
-      // Time (only show row if data exists)
-      if (a.time_seconds || b.time_seconds) {
-        html += "<tr><td><strong>Time (s)</strong></td>";
-        html += "<td>" + fmtStat(a.time_seconds, false) + "</td>";
-        html += "<td>" + fmtStat(b.time_seconds, false) + "</td>";
-        html += '<td class="' + deltaClass(delta.time_seconds) + '">' + (delta.time_seconds ? delta.time_seconds + "s" : "—") + "</td></tr>";
-      }
-      // Tokens (only show row if data exists)
-      if (a.tokens || b.tokens) {
-        html += "<tr><td><strong>Tokens</strong></td>";
-        html += "<td>" + fmtStat(a.tokens, false) + "</td>";
-        html += "<td>" + fmtStat(b.tokens, false) + "</td>";
-        html += '<td class="' + deltaClass(delta.tokens) + '">' + (delta.tokens || "—") + "</td></tr>";
-      }
-      html += "</tbody></table>";
-      // Per-eval breakdown (if runs data available)
-      const runs = data.runs || [];
-      if (runs.length > 0) {
-        const evalIds = [...new Set(runs.map(r => r.eval_id))].sort((a, b) => a - b);
-        html += "<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>";
-        const hasTime = runs.some(r => r.result && r.result.time_seconds != null);
-        const hasErrors = runs.some(r => r.result && r.result.errors > 0);
-        for (const evalId of evalIds) {
-          const evalRuns = runs.filter(r => r.eval_id === evalId);
-          const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;
-          html += "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" + escapeHtml(evalName) + "</h4>";
-          html += '<table class="benchmark-table">';
-          html += "<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>";
-          if (hasTime) html += "<th>Time (s)</th>";
-          if (hasErrors) html += "<th>Crashes During Execution</th>";
-          html += "</tr></thead>";
-          html += "<tbody>";
-          // Group by config and render with average rows
-          const configGroups = [...new Set(evalRuns.map(r => r.configuration))];
-          for (let ci = 0; ci < configGroups.length; ci++) {
-            const config = configGroups[ci];
-            const configRuns = evalRuns.filter(r => r.configuration === config);
-            if (configRuns.length === 0) continue;
-            const rowClass = ci === 0 ? "benchmark-row-with" : "benchmark-row-without";
-            const configLabel = config.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());
-            for (const run of configRuns) {
-              const r = run.result || {};
-              const prClass = r.pass_rate >= 0.8 ? "benchmark-delta-positive" : r.pass_rate < 0.5 ? "benchmark-delta-negative" : "";
-              html += '<tr class="' + rowClass + '">';
-              html += "<td>" + configLabel + "</td>";
-              html += "<td>" + run.run_number + "</td>";
-              html += '<td class="' + prClass + '">' + ((r.pass_rate || 0) * 100).toFixed(0) + "% (" + (r.passed || 0) + "/" + (r.total || 0) + ")</td>";
-              if (hasTime) html += "<td>" + (r.time_seconds != null ? r.time_seconds.toFixed(1) : "—") + "</td>";
-              if (hasErrors) html += "<td>" + (r.errors || 0) + "</td>";
-              html += "</tr>";
-            }
-            // Average row
-            const rates = configRuns.map(r => (r.result || {}).pass_rate || 0);
-            const avgRate = rates.reduce((a, b) => a + b, 0) / rates.length;
-            const avgPrClass = avgRate >= 0.8 ? "benchmark-delta-positive" : avgRate < 0.5 ? "benchmark-delta-negative" : "";
-            html += '<tr class="benchmark-row-avg ' + rowClass + '">';
-            html += "<td>" + configLabel + "</td>";
-            html += "<td>Avg</td>";
-            html += '<td class="' + avgPrClass + '">' + (avgRate * 100).toFixed(0) + "%</td>";
-            if (hasTime) {
-              const times = configRuns.map(r => (r.result || {}).time_seconds).filter(t => t != null);
-              html += "<td>" + (times.length ? (times.reduce((a, b) => a + b, 0) / times.length).toFixed(1) : "—") + "</td>";
-            }
-            if (hasErrors) html += "<td></td>";
-            html += "</tr>";
-          }
-          html += "</tbody></table>";
-          // Per-assertion detail for this eval
-          const runsWithExpectations = {};
-          for (const config of configGroups) {
-            runsWithExpectations[config] = evalRuns.filter(r => r.configuration === config && r.expectations && r.expectations.length > 0);
-          }
-          const hasAnyExpectations = Object.values(runsWithExpectations).some(runs => runs.length > 0);
-          if (hasAnyExpectations) {
-            // Collect all unique assertion texts across all configs
-            const allAssertions = [];
-            const seen = new Set();
-            for (const config of configGroups) {
-              for (const run of runsWithExpectations[config]) {
-                for (const exp of (run.expectations || [])) {
-                  if (!seen.has(exp.text)) {
-                    seen.add(exp.text);
-                    allAssertions.push(exp.text);
-                  }
-                }
-              }
-            }
-            html += '<table class="benchmark-table" style="margin-top: 0.5rem;">';
-            html += "<thead><tr><th>Assertion</th>";
-            for (const config of configGroups) {
-              const label = config.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());
-              html += "<th>" + escapeHtml(label) + "</th>";
-            }
-            html += "</tr></thead><tbody>";
-            for (const assertionText of allAssertions) {
-              html += "<tr><td>" + escapeHtml(assertionText) + "</td>";
-              for (const config of configGroups) {
-                html += "<td>";
-                for (const run of runsWithExpectations[config]) {
-                  const exp = (run.expectations || []).find(e => e.text === assertionText);
-                  if (exp) {
-                    const cls = exp.passed ? "benchmark-delta-positive" : "benchmark-delta-negative";
-                    const icon = exp.passed ? "\u2713" : "\u2717";
-                    html += '<span class="' + cls + '" title="Run ' + run.run_number + ': ' + escapeHtml(exp.evidence || "") + '">' + icon + "</span> ";
-                  } else {
-                    html += "— ";
-                  }
-                }
-                html += "</td>";
-              }
-              html += "</tr>";
-            }
-            html += "</tbody></table>";
-          }
-        }
-      }
-      // Notes
-      if (notes.length > 0) {
-        html += '<div class="benchmark-notes">';
-        html += "<h3>Analysis Notes</h3>";
-        html += "<ul>";
-        for (const note of notes) {
-          html += "<li>" + escapeHtml(note) + "</li>";
-        }
-        html += "</ul></div>";
-      }
-      container.innerHTML = html;
-    }
-    // ---- Start ----
-    init();  // async function; its returned promise is not awaited here
-    renderBenchmark();  // returns early when EMBEDDED_DATA.benchmark is absent
-  </script>
-</body>
-</html>