npm - aiforcecli-chat - Versions diffs - 0.1.0 - Mend

aiforcecli-chat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/License.MD +49 -0
package/README.md +642 -0
package/aiforcecli.config.example.json +66 -0
package/assets/README.md +14 -0
package/dist/cli.js +2 -0
package/dist/index.js +2 -0
package/package.json +62 -0
package/tools/scorecard/README.md +92 -0
package/tools/scorecard/config.json +134 -0
package/tools/scorecard/fetch.mjs +335 -0
package/tools/scorecard/generate.mjs +289 -0
package/tools/scorecard/generated/example/invalid-rows.json +1 -0
package/tools/scorecard/generated/example/scorecard-report.md +147 -0
package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
package/tools/scorecard/generated/example/scorecard.json +1492 -0
package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0

package/tools/scorecard/generated/scorecard/scorecard.json ADDED Viewed

@@ -0,0 +1,1181 @@
+{
+  "version": "manual.2026.06.16",
+  "generatedAt": "2026-06-16T21:10:57.926Z",
+  "taskTypes": [
+    "bugfix",
+    "feature",
+    "refactor",
+    "test",
+    "docs",
+    "security",
+    "perf",
+    "general"
+  ],
+  "notes": [
+    "Generated scorecard artifact. It is not used by the application unless explicitly wired in later.",
+    "Scores are normalized public benchmark priors, not private repo outcomes."
+  ],
+  "scores": {
+    "antigravity:gemini-3.1-pro": {
+      "bugfix": {
+        "score": 0.7331,
+        "confidence": 0.2143,
+        "evidenceWeight": 0.5455,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.1574,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.1556,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0805,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0805,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0717,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "feature": {
+        "score": 0.7424,
+        "confidence": 0.1985,
+        "evidenceWeight": 0.4954,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.1224,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.121,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0872,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0872,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0777,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "refactor": {
+        "score": 0.7574,
+        "confidence": 0.1948,
+        "evidenceWeight": 0.484,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.1073,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.1073,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0956,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.0874,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.0864,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          }
+        ]
+      },
+      "test": {
+        "score": 0.7364,
+        "confidence": 0.1607,
+        "evidenceWeight": 0.3831,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.1049,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.1037,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0603,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0603,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0538,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "docs": {
+        "score": 0.7981,
+        "confidence": 0.0462,
+        "evidenceWeight": 0.0969,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0335,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0335,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0299,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "security": {
+        "score": 0.7208,
+        "confidence": 0.1325,
+        "evidenceWeight": 0.3055,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.1049,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.1037,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0335,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0335,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0299,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "perf": {
+        "score": 0.7254,
+        "confidence": 0.1593,
+        "evidenceWeight": 0.3791,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.1224,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.121,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0469,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0469,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0418,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      },
+      "general": {
+        "score": 0.7231,
+        "confidence": 0.2397,
+        "evidenceWeight": 0.6304,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7066,
+            "weight": 0.2098,
+            "date": "2026-05-05",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3.1 Pro"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6629,
+            "weight": 0.2074,
+            "date": "2026-05-02",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Gemini CLI Gemini 3 Pro"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.831,
+            "weight": 0.0738,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.791,
+            "weight": 0.0738,
+            "sampleSize": 225,
+            "date": "2025-06-06",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.769,
+            "weight": 0.0657,
+            "sampleSize": 225,
+            "date": "2025-05-07",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "Gemini 2.5 Pro Preview 05-06"
+          }
+        ]
+      }
+    },
+    "claude-code:opus": {
+      "bugfix": {
+        "score": 0.7454,
+        "confidence": 0.1407,
+        "evidenceWeight": 0.3276,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.1726,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.155,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "feature": {
+        "score": 0.7454,
+        "confidence": 0.113,
+        "evidenceWeight": 0.2548,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.1342,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.1205,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "refactor": {
+        "score": 0.7454,
+        "confidence": 0.0834,
+        "evidenceWeight": 0.182,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.0959,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.0861,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "test": {
+        "score": 0.7454,
+        "confidence": 0.0984,
+        "evidenceWeight": 0.2184,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.1151,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.1033,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "security": {
+        "score": 0.7454,
+        "confidence": 0.0984,
+        "evidenceWeight": 0.2184,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.1151,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.1033,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "perf": {
+        "score": 0.7454,
+        "confidence": 0.113,
+        "evidenceWeight": 0.2548,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.1342,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.1205,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      },
+      "general": {
+        "score": 0.7454,
+        "confidence": 0.1792,
+        "evidenceWeight": 0.4368,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.7888,
+            "weight": 0.2301,
+            "date": "2026-05-29",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.8"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.6972,
+            "weight": 0.2066,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Claude Code Claude Opus 4.7"
+          }
+        ]
+      }
+    },
+    "codex:gpt-5.4": {
+      "bugfix": {
+        "score": 0.867,
+        "confidence": 0.0519,
+        "evidenceWeight": 0.1095,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.1095,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "feature": {
+        "score": 0.867,
+        "confidence": 0.056,
+        "evidenceWeight": 0.1186,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.1186,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "refactor": {
+        "score": 0.867,
+        "confidence": 0.068,
+        "evidenceWeight": 0.146,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.146,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "test": {
+        "score": 0.867,
+        "confidence": 0.0394,
+        "evidenceWeight": 0.0821,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.0821,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "docs": {
+        "score": 0.867,
+        "confidence": 0.0223,
+        "evidenceWeight": 0.0456,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.0456,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "security": {
+        "score": 0.867,
+        "confidence": 0.0223,
+        "evidenceWeight": 0.0456,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.0456,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "perf": {
+        "score": 0.867,
+        "confidence": 0.0309,
+        "evidenceWeight": 0.0639,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.0639,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      },
+      "general": {
+        "score": 0.867,
+        "confidence": 0.0478,
+        "evidenceWeight": 0.1004,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.867,
+            "weight": 0.1004,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 medium"
+          }
+        ]
+      }
+    },
+    "codex:gpt-5.4-mini": {
+      "bugfix": {
+        "score": 0.813,
+        "confidence": 0.0519,
+        "evidenceWeight": 0.1095,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.1095,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "feature": {
+        "score": 0.813,
+        "confidence": 0.056,
+        "evidenceWeight": 0.1186,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.1186,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "refactor": {
+        "score": 0.813,
+        "confidence": 0.068,
+        "evidenceWeight": 0.146,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.146,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "test": {
+        "score": 0.813,
+        "confidence": 0.0394,
+        "evidenceWeight": 0.0821,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.0821,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "docs": {
+        "score": 0.813,
+        "confidence": 0.0223,
+        "evidenceWeight": 0.0456,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.0456,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "security": {
+        "score": 0.813,
+        "confidence": 0.0223,
+        "evidenceWeight": 0.0456,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.0456,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "perf": {
+        "score": 0.813,
+        "confidence": 0.0309,
+        "evidenceWeight": 0.0639,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.0639,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      },
+      "general": {
+        "score": 0.813,
+        "confidence": 0.0478,
+        "evidenceWeight": 0.1004,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.813,
+            "weight": 0.1004,
+            "sampleSize": 225,
+            "date": "2025-08-25",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 low"
+          }
+        ]
+      }
+    },
+    "codex:gpt-5.5": {
+      "bugfix": {
+        "score": 0.8528,
+        "confidence": 0.1165,
+        "evidenceWeight": 0.2636,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.155,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.1086,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "feature": {
+        "score": 0.8566,
+        "confidence": 0.1064,
+        "evidenceWeight": 0.2382,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.1205,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.1177,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "refactor": {
+        "score": 0.8627,
+        "confidence": 0.1035,
+        "evidenceWeight": 0.2309,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.1449,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          },
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.0861,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          }
+        ]
+      },
+      "test": {
+        "score": 0.8541,
+        "confidence": 0.0846,
+        "evidenceWeight": 0.1848,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.1033,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.0815,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "docs": {
+        "score": 0.88,
+        "confidence": 0.0221,
+        "evidenceWeight": 0.0453,
+        "sources": [
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.0453,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "security": {
+        "score": 0.8478,
+        "confidence": 0.0692,
+        "evidenceWeight": 0.1486,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.1033,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.0453,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "perf": {
+        "score": 0.8497,
+        "confidence": 0.0842,
+        "evidenceWeight": 0.1839,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.1205,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.0634,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      },
+      "general": {
+        "score": 0.8488,
+        "confidence": 0.1328,
+        "evidenceWeight": 0.3062,
+        "sources": [
+          {
+            "source": "terminal_bench",
+            "benchmark": "terminal_bench",
+            "metric": "accuracy",
+            "score": 0.8337,
+            "weight": 0.2066,
+            "date": "2026-05-01",
+            "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
+            "modelRaw": "Codex CLI GPT-5.5"
+          },
+          {
+            "source": "aider_polyglot",
+            "benchmark": "aider_polyglot",
+            "metric": "pass_rate_2",
+            "score": 0.88,
+            "weight": 0.0996,
+            "sampleSize": 225,
+            "date": "2025-08-23",
+            "url": "https://aider.chat/docs/leaderboards/",
+            "modelRaw": "gpt-5 high"
+          }
+        ]
+      }
+    }
+  }
+}