thumbgate 1.27.12 → 1.27.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/llms.txt +2 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +2 -4
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/adapters/policy-engine/ethicore-guardian-client.js +68 -0
- package/adapters/policy-engine/thumbgate-policy-engine-adapter.js +260 -0
- package/bin/cli.js +78 -259
- package/config/gate-templates.json +0 -228
- package/config/gates/claim-verification.json +0 -18
- package/package.json +35 -25
- package/public/assets/brand/thumbgate-logo-transparent.svg +22 -0
- package/public/assets/brand/thumbgate-mark-inline-v3.svg +19 -0
- package/public/assets/brand/thumbgate-mark.svg +11 -5
- package/public/blog.html +0 -30
- package/public/brand/thumbgate-mark.svg +9 -5
- package/public/chatgpt-app.html +2 -2
- package/public/compare.html +2 -1
- package/public/dashboard.html +1 -1
- package/public/federal.html +1 -1
- package/public/index.html +95 -216
- package/public/learn.html +59 -35
- package/public/lessons.html +1 -1
- package/public/numbers.html +2 -2
- package/public/pro.html +7 -7
- package/scripts/agent-readiness.js +142 -0
- package/scripts/aws-blocks-guardrails.js +228 -0
- package/scripts/cli-schema.js +22 -10
- package/scripts/dashboard-chat.js +2 -1
- package/scripts/document-intake.js +1 -49
- package/scripts/durability/step.js +3 -3
- package/scripts/gate-stats.js +5 -11
- package/scripts/gates-engine.js +0 -49
- package/scripts/gemini-embedding-policy.js +2 -1
- package/scripts/hook-stop-anti-claim.js +116 -184
- package/scripts/hosted-config.js +0 -12
- package/scripts/lesson-search.js +1 -15
- package/scripts/llm-client.js +187 -5
- package/scripts/plausible-domain-config.js +3 -1
- package/scripts/seo-gsd.js +240 -1
- package/scripts/tool-registry.js +2 -2
- package/scripts/vector-store.js +44 -0
- package/scripts/workspace-evolver.js +62 -2
- package/src/api/server.js +340 -131
- package/public/assets/brand/thumbgate-mark-inline.svg +0 -15
- package/public/compare/adopt-ai.html +0 -219
- package/public/compare/agentix-labs.html +0 -197
- package/public/compare/ai-experience-orchestration.html +0 -216
- package/public/compare/anthropic-claude-for-legal.html +0 -260
- package/public/compare/anthropic-containment.html +0 -280
- package/public/compare/arcade.html +0 -175
- package/public/compare/arcjet.html +0 -239
- package/public/compare/bumblebee.html +0 -307
- package/public/compare/claude-code-hooks.html +0 -294
- package/public/compare/databricks-unity-ai-gateway.html +0 -215
- package/public/compare/fallow.html +0 -351
- package/public/compare/heidi.html +0 -233
- package/public/compare/mem0.html +0 -342
- package/public/compare/oak-and-sparrow-gatekeeper.html +0 -289
- package/public/compare/rein.html +0 -236
- package/public/compare/sigmashake.html +0 -256
- package/public/compare/speclock.html +0 -342
- package/public/guides/agent-harness-optimization.html +0 -342
- package/public/guides/agentic-web-governance.html +0 -406
- package/public/guides/ai-agent-governance-sprint.html +0 -415
- package/public/guides/ai-agent-pre-action-approval-gates.html +0 -401
- package/public/guides/ai-agent-workflow-migration-checklist.html +0 -392
- package/public/guides/ai-deployment-readiness.html +0 -415
- package/public/guides/ai-mode-ads-agent-governance.html +0 -401
- package/public/guides/ai-search-topical-presence.html +0 -342
- package/public/guides/autoresearch-agent-safety.html +0 -342
- package/public/guides/background-agent-governance.html +0 -358
- package/public/guides/best-tools-stop-ai-agents-breaking-production.html +0 -363
- package/public/guides/browser-automation-safety.html +0 -342
- package/public/guides/chatgpt-ads-trust.html +0 -353
- package/public/guides/claude-code-feedback.html +0 -339
- package/public/guides/claude-code-prevent-repeated-mistakes.html +0 -161
- package/public/guides/claude-code-skills-guardrails.html +0 -343
- package/public/guides/claude-desktop.html +0 -356
- package/public/guides/code-knowledge-graph-guardrails.html +0 -365
- package/public/guides/codex-cli-guardrails.html +0 -339
- package/public/guides/cursor-agent-guardrails.html +0 -339
- package/public/guides/cursor-prevent-repeated-mistakes.html +0 -161
- package/public/guides/database-agent-safety.html +0 -406
- package/public/guides/deepseek-v4-runtime-guardrails.html +0 -346
- package/public/guides/developer-machine-supply-chain-guardrails.html +0 -358
- package/public/guides/gcp-mcp-guardrails.html +0 -147
- package/public/guides/gemini-cli-feedback-memory.html +0 -339
- package/public/guides/gpt-5-5-model-evaluation.html +0 -358
- package/public/guides/internal-ai-engineering-stack-guardrails.html +0 -348
- package/public/guides/long-running-agent-context-management.html +0 -346
- package/public/guides/mcp-tool-governance.html +0 -401
- package/public/guides/multica-thumbgate-setup.html +0 -134
- package/public/guides/native-messaging-host-security.html +0 -342
- package/public/guides/policy-engine-pre-action-gates.html +0 -346
- package/public/guides/pre-action-checks.html +0 -342
- package/public/guides/pretooluse-hooks-vs-advisory-prompt-rules.html +0 -342
- package/public/guides/prompt-tricks-to-workflow-rules.html +0 -365
- package/public/guides/proxy-pointer-rag-guardrails.html +0 -352
- package/public/guides/rag-precision-tuning-guardrails.html +0 -352
- package/public/guides/reasoning-compression-guardrails.html +0 -346
- package/public/guides/relational-knowledge-ai-recommendations.html +0 -342
- package/public/guides/roo-code-alternative-cline.html +0 -339
- package/public/guides/semantic-programmatic-seo-guardrails.html +0 -352
- package/public/guides/seo-agent-skills-guardrails.html +0 -344
- package/public/guides/stop-repeated-ai-agent-mistakes.html +0 -342
- package/public/learn/ac-dc-runtime-enforcement.html +0 -277
- package/public/learn/agent-harness-pattern.html +0 -181
- package/public/learn/agent-identity-connector-governance.html +0 -146
- package/public/learn/agent-swarms-shared-gates.html +0 -173
- package/public/learn/agentic-enterprise-context-brain.html +0 -117
- package/public/learn/agentic-os-team-governance.html +0 -146
- package/public/learn/ai-agent-governance.html +0 -158
- package/public/learn/ai-agent-persistent-memory.html +0 -211
- package/public/learn/anthropomorphic-claim-gates.html +0 -180
- package/public/learn/background-agent-control-layer.html +0 -184
- package/public/learn/claude-code-goal-with-rubrics.html +0 -205
- package/public/learn/codex-role-plugins-need-governance.html +0 -125
- package/public/learn/cost-aware-agent-gate-routing.html +0 -173
- package/public/learn/databricks-unity-ai-gateway-runtime-governance.html +0 -157
- package/public/learn/deterministic-agent-workflows.html +0 -185
- package/public/learn/feedback-loop-vs-decision-layer.html +0 -283
- package/public/learn/from-prototype-to-production.html +0 -223
- package/public/learn/learn.css +0 -51
- package/public/learn/mcp-pre-action-checks-explained.html +0 -172
- package/public/learn/pretix-stripe-connect-marketplaces.html +0 -161
- package/public/learn/regulated-agent-execution-boundary.html +0 -196
- package/public/learn/spec-driven-development.html +0 -168
- package/public/learn/stop-ai-agent-force-push.html +0 -134
- package/public/learn/vibe-coding-safety-net.html +0 -142
- package/scripts/reddit-browser-notification-watch.js +0 -230
|
@@ -1,358 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="en">
|
|
3
|
-
<head>
|
|
4
|
-
<meta charset="UTF-8" />
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
6
|
-
<title>GPT-5.5 Model Evaluation | Benchmark Before Routing Expensive Agent Work</title>
|
|
7
|
-
<meta name="description" content="Frontier-model upgrades can improve coding, dataset analysis, and dashboards, but the ROI comes from measured routing. ThumbGate adds a model-candidate workl..." />
|
|
8
|
-
<meta property="og:title" content="GPT-5.5 Model Evaluation | Benchmark Before Routing Expensive Agent Work" />
|
|
9
|
-
<meta property="og:description" content="Frontier-model upgrades can improve coding, dataset analysis, and dashboards, but the ROI comes from measured routing. ThumbGate adds a model-candidate workl..." />
|
|
10
|
-
<meta property="og:type" content="article" />
|
|
11
|
-
<meta property="og:url" content="https://thumbgate.ai/guides/gpt-5-5-model-evaluation" />
|
|
12
|
-
<link rel="canonical" href="https://thumbgate.ai/guides/gpt-5-5-model-evaluation" />
|
|
13
|
-
<link rel="llm-context" href="/llm-context.md" type="text/markdown" />
|
|
14
|
-
<link rel="icon" type="image/svg+xml" href="/thumbgate-icon.png" />
|
|
15
|
-
<link rel="apple-touch-icon" href="/assets/brand/thumbgate-mark.svg" />
|
|
16
|
-
<meta property="og:image" content="/og.png" />
|
|
17
|
-
<style>
|
|
18
|
-
:root {
|
|
19
|
-
--bg: #0a0a0b;
|
|
20
|
-
--bg-raised: #111113;
|
|
21
|
-
--bg-card: #161618;
|
|
22
|
-
--line: #222225;
|
|
23
|
-
--text: #e8e8ec;
|
|
24
|
-
--muted: #8b8b96;
|
|
25
|
-
--cyan: #22d3ee;
|
|
26
|
-
--green: #4ade80;
|
|
27
|
-
--red: #f87171;
|
|
28
|
-
}
|
|
29
|
-
* { box-sizing: border-box; }
|
|
30
|
-
body {
|
|
31
|
-
margin: 0;
|
|
32
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
|
33
|
-
background: var(--bg);
|
|
34
|
-
color: var(--text);
|
|
35
|
-
line-height: 1.65;
|
|
36
|
-
}
|
|
37
|
-
a { color: var(--cyan); text-decoration: none; }
|
|
38
|
-
a:hover { text-decoration: underline; }
|
|
39
|
-
.container { max-width: 980px; margin: 0 auto; padding: 0 24px; }
|
|
40
|
-
.topbar {
|
|
41
|
-
position: sticky;
|
|
42
|
-
top: 0;
|
|
43
|
-
z-index: 20;
|
|
44
|
-
backdrop-filter: blur(12px);
|
|
45
|
-
background: rgba(10, 10, 11, 0.88);
|
|
46
|
-
border-bottom: 1px solid var(--line);
|
|
47
|
-
}
|
|
48
|
-
.topbar .container {
|
|
49
|
-
display: flex;
|
|
50
|
-
justify-content: space-between;
|
|
51
|
-
align-items: center;
|
|
52
|
-
padding-top: 14px;
|
|
53
|
-
padding-bottom: 14px;
|
|
54
|
-
}
|
|
55
|
-
.brand {
|
|
56
|
-
font-weight: 700;
|
|
57
|
-
color: var(--text);
|
|
58
|
-
display: inline-flex;
|
|
59
|
-
align-items: center;
|
|
60
|
-
gap: 8px;
|
|
61
|
-
text-decoration: none;
|
|
62
|
-
}
|
|
63
|
-
.brand .logo-mark { width: 28px; height: 28px; display: block; }
|
|
64
|
-
.hero { padding: 72px 0 32px; }
|
|
65
|
-
.eyebrow {
|
|
66
|
-
display: inline-flex;
|
|
67
|
-
align-items: center;
|
|
68
|
-
gap: 8px;
|
|
69
|
-
padding: 6px 12px;
|
|
70
|
-
border-radius: 999px;
|
|
71
|
-
border: 1px solid rgba(34, 211, 238, 0.22);
|
|
72
|
-
background: rgba(34, 211, 238, 0.1);
|
|
73
|
-
color: var(--cyan);
|
|
74
|
-
text-transform: uppercase;
|
|
75
|
-
letter-spacing: 0.08em;
|
|
76
|
-
font-size: 12px;
|
|
77
|
-
font-weight: 700;
|
|
78
|
-
}
|
|
79
|
-
h1 {
|
|
80
|
-
font-size: clamp(34px, 5vw, 56px);
|
|
81
|
-
line-height: 1.06;
|
|
82
|
-
letter-spacing: -0.04em;
|
|
83
|
-
margin: 16px 0;
|
|
84
|
-
max-width: 760px;
|
|
85
|
-
}
|
|
86
|
-
.hero p {
|
|
87
|
-
max-width: 720px;
|
|
88
|
-
color: var(--muted);
|
|
89
|
-
font-size: 18px;
|
|
90
|
-
}
|
|
91
|
-
.signal-row {
|
|
92
|
-
display: flex;
|
|
93
|
-
flex-wrap: wrap;
|
|
94
|
-
gap: 12px;
|
|
95
|
-
margin: 28px 0 0;
|
|
96
|
-
}
|
|
97
|
-
.signal-pill {
|
|
98
|
-
display: inline-flex;
|
|
99
|
-
align-items: center;
|
|
100
|
-
gap: 8px;
|
|
101
|
-
padding: 10px 14px;
|
|
102
|
-
border-radius: 999px;
|
|
103
|
-
border: 1px solid var(--line);
|
|
104
|
-
background: var(--bg-raised);
|
|
105
|
-
font-weight: 600;
|
|
106
|
-
font-size: 14px;
|
|
107
|
-
}
|
|
108
|
-
.signal-pill.up {
|
|
109
|
-
border-color: rgba(74, 222, 128, 0.28);
|
|
110
|
-
color: #b8f7c8;
|
|
111
|
-
background: rgba(74, 222, 128, 0.1);
|
|
112
|
-
}
|
|
113
|
-
.signal-pill.down {
|
|
114
|
-
border-color: rgba(248, 113, 113, 0.28);
|
|
115
|
-
color: #ffc0c0;
|
|
116
|
-
background: rgba(248, 113, 113, 0.1);
|
|
117
|
-
}
|
|
118
|
-
.grid {
|
|
119
|
-
display: grid;
|
|
120
|
-
grid-template-columns: minmax(0, 2fr) minmax(280px, 1fr);
|
|
121
|
-
gap: 24px;
|
|
122
|
-
padding-bottom: 72px;
|
|
123
|
-
}
|
|
124
|
-
.card, .detail-section, .sidebar-card {
|
|
125
|
-
background: var(--bg-card);
|
|
126
|
-
border: 1px solid var(--line);
|
|
127
|
-
border-radius: 16px;
|
|
128
|
-
}
|
|
129
|
-
.card { padding: 24px; }
|
|
130
|
-
.detail-section { padding: 24px; margin-bottom: 18px; }
|
|
131
|
-
.detail-section h2 { margin: 0 0 12px; font-size: 24px; letter-spacing: -0.03em; }
|
|
132
|
-
.detail-section p { color: var(--muted); }
|
|
133
|
-
.detail-section ul, .card ul { padding-left: 18px; color: var(--muted); }
|
|
134
|
-
.card h2 { margin-top: 0; }
|
|
135
|
-
.sidebar {
|
|
136
|
-
display: flex;
|
|
137
|
-
flex-direction: column;
|
|
138
|
-
gap: 18px;
|
|
139
|
-
}
|
|
140
|
-
.sidebar-card {
|
|
141
|
-
padding: 20px;
|
|
142
|
-
}
|
|
143
|
-
/* Only the first sidebar card sticks. Stacking multiple stickies at the
|
|
144
|
-
same top offset makes them overlap each other on scroll. The related-
|
|
145
|
-
pages card flows normally below. */
|
|
146
|
-
.sidebar-card:first-child {
|
|
147
|
-
position: sticky;
|
|
148
|
-
top: 84px;
|
|
149
|
-
max-height: calc(100vh - 104px);
|
|
150
|
-
overflow-y: auto;
|
|
151
|
-
-webkit-overflow-scrolling: touch;
|
|
152
|
-
}
|
|
153
|
-
.proof-links {
|
|
154
|
-
display: flex;
|
|
155
|
-
flex-wrap: wrap;
|
|
156
|
-
gap: 12px;
|
|
157
|
-
margin-top: 16px;
|
|
158
|
-
}
|
|
159
|
-
.cta-button {
|
|
160
|
-
display: inline-flex;
|
|
161
|
-
align-items: center;
|
|
162
|
-
justify-content: center;
|
|
163
|
-
margin-top: 18px;
|
|
164
|
-
padding: 12px 16px;
|
|
165
|
-
border-radius: 10px;
|
|
166
|
-
background: var(--cyan);
|
|
167
|
-
color: #071116;
|
|
168
|
-
font-weight: 700;
|
|
169
|
-
text-decoration: none;
|
|
170
|
-
}
|
|
171
|
-
.faq-item {
|
|
172
|
-
border-top: 1px solid var(--line);
|
|
173
|
-
padding: 14px 0;
|
|
174
|
-
}
|
|
175
|
-
.faq-item summary {
|
|
176
|
-
cursor: pointer;
|
|
177
|
-
font-weight: 600;
|
|
178
|
-
}
|
|
179
|
-
.faq-item p {
|
|
180
|
-
color: var(--muted);
|
|
181
|
-
}
|
|
182
|
-
.related-card {
|
|
183
|
-
display: block;
|
|
184
|
-
padding: 14px;
|
|
185
|
-
border-radius: 12px;
|
|
186
|
-
border: 1px solid var(--line);
|
|
187
|
-
background: var(--bg-raised);
|
|
188
|
-
margin-top: 12px;
|
|
189
|
-
color: var(--text);
|
|
190
|
-
}
|
|
191
|
-
.related-label {
|
|
192
|
-
display: block;
|
|
193
|
-
color: var(--muted);
|
|
194
|
-
font-size: 12px;
|
|
195
|
-
text-transform: uppercase;
|
|
196
|
-
letter-spacing: 0.08em;
|
|
197
|
-
margin-bottom: 4px;
|
|
198
|
-
}
|
|
199
|
-
@media (max-width: 860px) {
|
|
200
|
-
.grid {
|
|
201
|
-
grid-template-columns: 1fr;
|
|
202
|
-
}
|
|
203
|
-
.sidebar-card:first-child {
|
|
204
|
-
position: static;
|
|
205
|
-
max-height: none;
|
|
206
|
-
overflow: visible;
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
</style>
|
|
210
|
-
<script type="application/ld+json">
|
|
211
|
-
{
|
|
212
|
-
"@context": "https://schema.org",
|
|
213
|
-
"@type": "TechArticle",
|
|
214
|
-
"headline": "Evaluate GPT-5.5 Before You Route Production Agent Work",
|
|
215
|
-
"description": "Frontier-model upgrades can improve coding, dataset analysis, and dashboards, but the ROI comes from measured routing. ThumbGate adds a model-candidate workl...",
|
|
216
|
-
"about": [
|
|
217
|
-
"claude code masterclass guardrails",
|
|
218
|
-
"cursor prevent repeated mistakes",
|
|
219
|
-
"claude code prevent repeated mistakes",
|
|
220
|
-
"codex cli guardrails"
|
|
221
|
-
],
|
|
222
|
-
"url": "https://thumbgate.ai/guides/gpt-5-5-model-evaluation",
|
|
223
|
-
"publisher": {
|
|
224
|
-
"@type": "Organization",
|
|
225
|
-
"name": "ThumbGate",
|
|
226
|
-
"url": "https://thumbgate.ai"
|
|
227
|
-
},
|
|
228
|
-
"mainEntityOfPage": "https://thumbgate.ai/guides/gpt-5-5-model-evaluation"
|
|
229
|
-
}
|
|
230
|
-
</script>
|
|
231
|
-
<script type="application/ld+json">
|
|
232
|
-
{
|
|
233
|
-
"@context": "https://schema.org",
|
|
234
|
-
"@type": "FAQPage",
|
|
235
|
-
"mainEntity": [
|
|
236
|
-
{
|
|
237
|
-
"@type": "Question",
|
|
238
|
-
"name": "Should every ThumbGate task use GPT-5.5?",
|
|
239
|
-
"acceptedAnswer": {
|
|
240
|
-
"@type": "Answer",
|
|
241
|
-
"text": "No. Cheap gates, classification, extraction, and simple triage should stay on lower-cost tiers when they pass evals. GPT-5.5 is best evaluated for complex reasoning, long-context, coding, dataset, and dashboard work."
|
|
242
|
-
}
|
|
243
|
-
},
|
|
244
|
-
{
|
|
245
|
-
"@type": "Question",
|
|
246
|
-
"name": "How do I benchmark GPT-5.5 in ThumbGate?",
|
|
247
|
-
"acceptedAnswer": {
|
|
248
|
-
"@type": "Answer",
|
|
249
|
-
"text": "Run npx thumbgate model-candidates --workload=dashboard-analysis --provider=openai --json, then use the emitted benchmark commands and metrics to compare insight quality, chart validity, latency, and cost before changing routing defaults."
|
|
250
|
-
}
|
|
251
|
-
},
|
|
252
|
-
{
|
|
253
|
-
"@type": "Question",
|
|
254
|
-
"name": "Does this automatically call the OpenAI API?",
|
|
255
|
-
"acceptedAnswer": {
|
|
256
|
-
"@type": "Answer",
|
|
257
|
-
"text": "No. The catalog is an evaluation and routing surface. It records candidate models and benchmark plans without assuming provider credentials or silently changing runtime behavior."
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
]
|
|
261
|
-
}
|
|
262
|
-
</script>
|
|
263
|
-
</head>
|
|
264
|
-
<body>
|
|
265
|
-
<div class="topbar">
|
|
266
|
-
<div class="container">
|
|
267
|
-
<a class="brand" href="/"><img src="/assets/brand/thumbgate-mark-inline.svg" alt="ThumbGate" class="logo-mark" width="28" height="28"><span class="logo-text">ThumbGate</span></a>
|
|
268
|
-
<a href="https://github.com/IgorGanapolsky/ThumbGate/blob/main/docs/VERIFICATION_EVIDENCE.md" target="_blank" rel="noopener">Verification evidence</a>
|
|
269
|
-
</div>
|
|
270
|
-
</div>
|
|
271
|
-
|
|
272
|
-
<main class="container">
|
|
273
|
-
<section class="hero">
|
|
274
|
-
<div class="eyebrow">guide | gpt-5.5 model evaluation</div>
|
|
275
|
-
<h1>Evaluate GPT-5.5 Before You Route Production Agent Work</h1>
|
|
276
|
-
<p>Frontier-model upgrades can improve coding, dataset analysis, and dashboards, but the ROI comes from measured routing. ThumbGate adds a model-candidate workload so teams can benchmark GPT-5.5 against real feedback, gate evals, and dashboard-analysis criteria before changing defaults.</p>
|
|
277
|
-
<div class="signal-row">
|
|
278
|
-
<div class="signal-pill up">👍 Thumbs up reinforces good behavior</div>
|
|
279
|
-
<div class="signal-pill down">👎 Thumbs down blocks repeated mistakes</div>
|
|
280
|
-
</div>
|
|
281
|
-
</section>
|
|
282
|
-
|
|
283
|
-
<section class="grid">
|
|
284
|
-
<div>
|
|
285
|
-
<div class="card">
|
|
286
|
-
<h2>Why this page exists</h2>
|
|
287
|
-
<ul><li>GPT-5.5 should be treated as a frontier candidate for complex work, not a blanket replacement for every cheap gate.</li><li>The highest ROI path is benchmark-first routing: keep cheap tiers for simple checks and escalate dataset, dashboard, and long-context work when evidence supports it.</li><li>ThumbGate now exposes a dashboard-analysis workload through npx thumbgate model-candidates --workload=dashboard-analysis --provider=openai --json.</li></ul>
|
|
288
|
-
</div>
|
|
289
|
-
|
|
290
|
-
<section class="detail-section">
|
|
291
|
-
<h2>What changed</h2>
|
|
292
|
-
<p>OpenAI positions GPT-5.5 for complex reasoning, coding, data analysis, and tool-using work. Julius framed the same model around dataset analysis, charts, dashboards, insight quality, and code generation.</p><p>For ThumbGate, the useful product move is not to rewrite every default. It is to make model adoption measurable: define the workload, pick candidate models, run existing evals, and route only the work that earns the frontier spend.</p>
|
|
293
|
-
|
|
294
|
-
</section>
|
|
295
|
-
<section class="detail-section">
|
|
296
|
-
<h2>What ThumbGate adds</h2>
|
|
297
|
-
|
|
298
|
-
<ul><li>A GPT-5.5 model candidate in config/model-candidates.json with long-context, data-analysis, dashboard-creation, charting, tool-use, and reliability strengths.</li><li>A dashboard-analysis workload with metrics for insight accuracy, chart-spec validity, dashboard completeness, long-context reliability, latency, and cost per analysis.</li><li>A CLI path: npx thumbgate model-candidates --workload=dashboard-analysis --provider=openai --json.</li><li>A tier-router config that pins the frontier tier to gpt-5.5 while keeping explicit cheaper tiers for fast, low-cost work.</li></ul>
|
|
299
|
-
</section>
|
|
300
|
-
<section class="detail-section">
|
|
301
|
-
<h2>Where this creates ROI</h2>
|
|
302
|
-
<p>This gives platform teams a defensible answer to "should we move to GPT-5.5?" Run the candidate report, attach gate and benchmark evidence, and only then route high-value analytical or long-context agent tasks to the frontier tier.</p><p>The commercial wedge is a Workflow Hardening Sprint focused on model routing: define which workflows deserve frontier spend, which stay on cheap tiers, and which require pre-action checks before a model can touch live systems.</p>
|
|
303
|
-
|
|
304
|
-
</section>
|
|
305
|
-
<div class="detail-section">
|
|
306
|
-
<h2>FAQ</h2>
|
|
307
|
-
|
|
308
|
-
<details class="faq-item">
|
|
309
|
-
<summary>Should every ThumbGate task use GPT-5.5?</summary>
|
|
310
|
-
<p>No. Cheap gates, classification, extraction, and simple triage should stay on lower-cost tiers when they pass evals. GPT-5.5 is best evaluated for complex reasoning, long-context, coding, dataset, and dashboard work.</p>
|
|
311
|
-
</details>
|
|
312
|
-
<details class="faq-item">
|
|
313
|
-
<summary>How do I benchmark GPT-5.5 in ThumbGate?</summary>
|
|
314
|
-
<p>Run npx thumbgate model-candidates --workload=dashboard-analysis --provider=openai --json, then use the emitted benchmark commands and metrics to compare insight quality, chart validity, latency, and cost before changing routing defaults.</p>
|
|
315
|
-
</details>
|
|
316
|
-
<details class="faq-item">
|
|
317
|
-
<summary>Does this automatically call the OpenAI API?</summary>
|
|
318
|
-
<p>No. The catalog is an evaluation and routing surface. It records candidate models and benchmark plans without assuming provider credentials or silently changing runtime behavior.</p>
|
|
319
|
-
</details>
|
|
320
|
-
</div>
|
|
321
|
-
</div>
|
|
322
|
-
|
|
323
|
-
<aside class="sidebar">
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
<div class="sidebar-card">
|
|
329
|
-
<h2>GSD execution brief</h2>
|
|
330
|
-
<p>This page was prioritized because it captures high-intent demand around gpt-5.5 model evaluation and feeds directly into ThumbGate's proof-led conversion path.</p>
|
|
331
|
-
<p><strong>Opportunity score:</strong> 72</p>
|
|
332
|
-
<p><strong>Primary persona:</strong> ai-engineer</p>
|
|
333
|
-
<p><strong>Keyword cluster:</strong> claude code masterclass guardrails, cursor prevent repeated mistakes, claude code prevent repeated mistakes, codex cli guardrails</p>
|
|
334
|
-
<p><strong>Pricing:</strong> Pro $19/mo or $149/yr. Team $49/seat/mo.</p>
|
|
335
|
-
<div class="proof-links"><a href="https://github.com/IgorGanapolsky/ThumbGate/blob/main/docs/VERIFICATION_EVIDENCE.md" target="_blank" rel="noopener">Verification evidence</a><a href="https://github.com/IgorGanapolsky/ThumbGate/blob/main/proof/automation/report.json" target="_blank" rel="noopener">Automation proof</a><a href="https://github.com/IgorGanapolsky/ThumbGate" target="_blank" rel="noopener">GitHub repository</a></div>
|
|
336
|
-
<a class="cta-button" href="/checkout/pro?utm_source=website&utm_medium=seo_page&utm_campaign=guides_gpt-5-5-model-evaluation&cta_placement=seo_brief&plan_id=pro" target="_blank" rel="noopener">Go Pro — $19/mo</a>
|
|
337
|
-
</div>
|
|
338
|
-
<div class="sidebar-card">
|
|
339
|
-
<h2>Related pages</h2>
|
|
340
|
-
|
|
341
|
-
<a class="related-card" href="/guides/agent-harness-optimization">
|
|
342
|
-
<span class="related-label">Related page</span>
|
|
343
|
-
<strong>AI Agent Harness Optimization That Blocks Repeat Failures</strong>
|
|
344
|
-
</a>
|
|
345
|
-
<a class="related-card" href="/guides/background-agent-governance">
|
|
346
|
-
<span class="related-label">Related page</span>
|
|
347
|
-
<strong>Background Agent Governance for Agent PRs</strong>
|
|
348
|
-
</a>
|
|
349
|
-
<a class="related-card" href="/guides/pre-action-checks">
|
|
350
|
-
<span class="related-label">Related page</span>
|
|
351
|
-
<strong>What Are Pre-Action Checks?</strong>
|
|
352
|
-
</a>
|
|
353
|
-
</div>
|
|
354
|
-
</aside>
|
|
355
|
-
</section>
|
|
356
|
-
</main>
|
|
357
|
-
</body>
|
|
358
|
-
</html>
|