thumbgate 1.16.12 → 1.16.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.well-known/mcp/server-card.json +1 -1
  4. package/README.md +3 -1
  5. package/adapters/claude/.mcp.json +2 -2
  6. package/adapters/mcp/server-stdio.js +26 -1
  7. package/adapters/opencode/opencode.json +1 -1
  8. package/bin/cli.js +420 -1
  9. package/config/gate-templates.json +372 -0
  10. package/config/mcp-allowlists.json +25 -0
  11. package/config/model-candidates.json +59 -2
  12. package/config/model-tiers.json +4 -1
  13. package/package.json +79 -22
  14. package/public/compare.html +6 -0
  15. package/public/index.html +144 -11
  16. package/public/numbers.html +11 -11
  17. package/public/pro.html +22 -24
  18. package/scripts/agent-design-governance.js +211 -0
  19. package/scripts/agent-reasoning-traces.js +683 -0
  20. package/scripts/agent-reward-model.js +438 -0
  21. package/scripts/agent-stack-survival-audit.js +231 -0
  22. package/scripts/ai-engineering-stack-guardrails.js +256 -0
  23. package/scripts/billing.js +16 -4
  24. package/scripts/chatgpt-ads-readiness-pack.js +195 -0
  25. package/scripts/cli-schema.js +277 -0
  26. package/scripts/code-graph-guardrails.js +176 -0
  27. package/scripts/deepseek-v4-runtime-guardrails.js +253 -0
  28. package/scripts/gemini-embedding-policy.js +198 -0
  29. package/scripts/inference-cache-policy.js +39 -0
  30. package/scripts/judge-reward-function.js +396 -0
  31. package/scripts/llm-behavior-monitor.js +251 -0
  32. package/scripts/long-running-agent-context-guardrails.js +176 -0
  33. package/scripts/multimodal-retrieval-plan.js +31 -11
  34. package/scripts/oss-pr-opportunity-scout.js +240 -0
  35. package/scripts/proactive-agent-eval-guardrails.js +230 -0
  36. package/scripts/profile-router.js +5 -4
  37. package/scripts/prompting-operating-system.js +273 -0
  38. package/scripts/proxy-pointer-rag-guardrails.js +189 -0
  39. package/scripts/rag-precision-guardrails.js +202 -0
  40. package/scripts/rate-limiter.js +1 -1
  41. package/scripts/reasoning-efficiency-guardrails.js +176 -0
  42. package/scripts/reward-hacking-guardrails.js +251 -0
  43. package/scripts/seo-gsd.js +1201 -11
  44. package/scripts/single-use-credential-gate.js +182 -0
  45. package/scripts/structured-prompt-driven.js +226 -0
  46. package/scripts/telemetry-analytics.js +31 -6
  47. package/scripts/tool-registry.js +92 -0
  48. package/scripts/upstream-contribution-engine.js +379 -0
  49. package/scripts/vector-store.js +119 -4
  50. package/src/api/server.js +333 -100
  51. package/scripts/agents-sdk-sandbox-plan.js +0 -57
  52. package/scripts/ai-org-governance.js +0 -98
  53. package/scripts/artifact-agent-plan.js +0 -81
  54. package/scripts/enterprise-agent-rollout.js +0 -34
  55. package/scripts/experience-replay-governance.js +0 -69
  56. package/scripts/inference-economics.js +0 -53
  57. package/scripts/knowledge-layer-plan.js +0 -108
  58. package/scripts/memory-store-governance.js +0 -60
  59. package/scripts/post-training-governance.js +0 -34
  60. package/scripts/production-agent-readiness.js +0 -40
  61. package/scripts/scaling-law-claims.js +0 -60
  62. package/scripts/student-consistent-training.js +0 -73
@@ -72,6 +72,378 @@
72
72
  "problem": "Turns repeated thumbs-up patterns into recommended workflow defaults and starter rules.",
73
73
  "roi": "Helps new agents and new teammates start from what already works.",
74
74
  "rollout": "Use with shared lessons to accelerate onboarding."
75
+ },
76
+ {
77
+ "id": "require-diff-impact-before-central-edit",
78
+ "name": "Require diff impact before central edits",
79
+ "category": "Knowledge Graph Safety",
80
+ "signal": "👎",
81
+ "defaultAction": "block",
82
+ "severity": "high",
83
+ "pattern": "centrality:(high|critical).*tool:(edit|write|patch)",
84
+ "problem": "Blocks edits to high-centrality files unless the agent has inspected the dependency impact first.",
85
+ "roi": "Turns code-graph context into a concrete guardrail for the changes most likely to break many downstream paths.",
86
+ "rollout": "Enable after generating a code knowledge graph and tagging critical files or modules."
87
+ },
88
+ {
89
+ "id": "checkpoint-cross-layer-refactor",
90
+ "name": "Checkpoint cross-layer refactors",
91
+ "category": "Knowledge Graph Safety",
92
+ "signal": "👎",
93
+ "defaultAction": "warn",
94
+ "severity": "high",
95
+ "pattern": "layers_touched:(api|service|data|ui).*(api|service|data|ui)",
96
+ "problem": "Requires an explicit checkpoint before an agent changes multiple architectural layers in one run.",
97
+ "roi": "Prevents broad refactors from becoming invisible blast-radius expansion.",
98
+ "rollout": "Use with graph layer labels from tools such as Understand Anything, code-graph MCPs, or internal architecture maps."
99
+ },
100
+ {
101
+ "id": "protect-graph-generated-artifacts",
102
+ "name": "Protect generated graph artifacts",
103
+ "category": "Knowledge Graph Safety",
104
+ "signal": "👎",
105
+ "defaultAction": "block",
106
+ "severity": "medium",
107
+ "pattern": "\\.(understand-anything|codegraph|knowledge-graph)\\/.*",
108
+ "problem": "Stops agents from editing generated knowledge-graph artifacts as if they were source of truth.",
109
+ "roi": "Keeps graph context reproducible and avoids corrupting the evidence layer agents rely on for impact analysis.",
110
+ "rollout": "Enable for repos that commit or cache graph outputs locally."
111
+ },
112
+ {
113
+ "id": "block-package-lifecycle-secret-harvest",
114
+ "name": "Block package lifecycle secret harvest",
115
+ "category": "Supply Chain Safety",
116
+ "signal": "👎",
117
+ "defaultAction": "block",
118
+ "severity": "critical",
119
+ "pattern": "(preinstall|install|postinstall|prepare).*(env|token|secret|credential|npmrc|pypirc|docker\\/config|ssh)",
120
+ "problem": "Stops package lifecycle scripts from reading local credentials during npm, PyPI, Docker, or CLI compromise scenarios.",
121
+ "roi": "Turns developer-machine supply-chain awareness into a runtime stop before secrets are harvested at scale.",
122
+ "rollout": "Enable on every repo where agents can install packages, run package scripts, or edit dependency automation."
123
+ },
124
+ {
125
+ "id": "review-untrusted-cli-before-execution",
126
+ "name": "Review untrusted CLI before execution",
127
+ "category": "Supply Chain Safety",
128
+ "signal": "👎",
129
+ "defaultAction": "block",
130
+ "severity": "critical",
131
+ "pattern": "(curl|wget).*(bash|sh)|npx\\s+[^@\\s]+|uvx\\s+|pipx\\s+run",
132
+ "problem": "Blocks one-shot CLI execution paths that can turn a copied command into developer-machine compromise.",
133
+ "roi": "Prevents AI assistants from amplifying malicious copy-paste install flows across repos and machines.",
134
+ "rollout": "Allowlist trusted internal CLIs and require source review for every new external executable."
135
+ },
136
+ {
137
+ "id": "checkpoint-dependency-bot-autofix",
138
+ "name": "Checkpoint dependency bot autofix",
139
+ "category": "Supply Chain Safety",
140
+ "signal": "👎",
141
+ "defaultAction": "warn",
142
+ "severity": "high",
143
+ "pattern": "(dependabot|renovate|npm audit fix|pnpm audit|pip install -U|docker pull|cargo update)",
144
+ "problem": "Requires a human-readable checkpoint before automated dependency updates expand the trusted code surface.",
145
+ "roi": "Keeps dependency bots and package managers from silently widening blast radius during a supply-chain incident.",
146
+ "rollout": "Start as warn mode for dependency PRs; promote to block for production services or privileged developer machines."
147
+ },
148
+ {
149
+ "id": "require-credential-exposure-assessment",
150
+ "name": "Require credential exposure assessment",
151
+ "category": "Supply Chain Safety",
152
+ "signal": "👎",
153
+ "defaultAction": "block",
154
+ "severity": "high",
155
+ "pattern": "(secret|token|credential|api[_-]?key|ssh|npmrc|pypirc|docker\\/config).*(rotate|exposed|leak|incident|compromise)",
156
+ "problem": "Forces an exposure assessment before an agent claims a supply-chain incident is resolved.",
157
+ "roi": "Connects prevention with remediation: what credential lived where, who touched it, and whether rotation is required.",
158
+ "rollout": "Use with incident-response runbooks and secrets scanner output from GitGuardian or internal tooling."
159
+ },
160
+ {
161
+ "id": "require-section-tree-before-multimodal-answer",
162
+ "name": "Require section tree before multimodal answers",
163
+ "category": "Document RAG Safety",
164
+ "signal": "👎",
165
+ "defaultAction": "block",
166
+ "severity": "high",
167
+ "pattern": "(multimodal|image|chart|figure|table).*answer.*(missing|no).*section[_ -]?tree",
168
+ "problem": "Blocks visual document answers when the agent has not preserved document hierarchy, section IDs, and source paths.",
169
+ "roi": "Prevents multimodal RAG demos from becoming ungrounded image guessing while keeping costs lower than full multimodal embeddings.",
170
+ "rollout": "Enable for PDF, report, support-doc, research-paper, and compliance workflows before agents answer with images."
171
+ },
172
+ {
173
+ "id": "require-image-pointer-grounding",
174
+ "name": "Require image pointer grounding",
175
+ "category": "Document RAG Safety",
176
+ "signal": "👎",
177
+ "defaultAction": "block",
178
+ "severity": "high",
179
+ "pattern": "(image|figure|chart|diagram).*answer.*(missing|no).*pointer",
180
+ "problem": "Requires every cited visual to carry a source document, parent section, and file path pointer.",
181
+ "roi": "Turns proxy-pointer RAG structure into an auditable answer boundary instead of trusting visual similarity.",
182
+ "rollout": "Start on any workflow that returns charts, figures, screenshots, or PDF images to users."
183
+ },
184
+ {
185
+ "id": "block-cross-document-image-leakage",
186
+ "name": "Block cross-document image leakage",
187
+ "category": "Document RAG Safety",
188
+ "signal": "👎",
189
+ "defaultAction": "block",
190
+ "severity": "critical",
191
+ "pattern": "image_doc_id\\s*!=\\s*answer_doc_id|cross[_ -]?document.*image",
192
+ "problem": "Stops agents from attaching a plausible visual from the wrong source document.",
193
+ "roi": "Protects trust in buyer-facing document answers where one wrong chart or figure can invalidate the whole system.",
194
+ "rollout": "Promote to block as soon as the ingestion pipeline records document IDs for image pointers."
195
+ },
196
+ {
197
+ "id": "checkpoint-vision-filter-for-visual-claims",
198
+ "name": "Checkpoint vision filter for visual claims",
199
+ "category": "Document RAG Safety",
200
+ "signal": "👎",
201
+ "defaultAction": "warn",
202
+ "severity": "medium",
203
+ "pattern": "(visual|image|chart|figure).*claim.*(without|no).*vision[_ -]?filter",
204
+ "problem": "Requires an optional vision-model sanity check before high-impact answers rely on visual content.",
205
+ "roi": "Keeps the cheap text-pointer path fast while adding review only when the answer makes visual claims.",
206
+ "rollout": "Use warn mode for low-risk docs and block mode for legal, financial, medical, or customer-facing visual answers."
207
+ },
208
+ {
209
+ "id": "require-rag-baseline-before-precision-tuning",
210
+ "name": "Require RAG baseline before precision tuning",
211
+ "category": "Document RAG Safety",
212
+ "signal": "👎",
213
+ "defaultAction": "block",
214
+ "severity": "critical",
215
+ "pattern": "(rag|retrieval|embedding).*(fine[- ]?tune|threshold|precision).*(missing|no).*(baseline|recall)",
216
+ "problem": "Blocks embedding, threshold, or precision tuning when the agent has not preserved a retrieval baseline and recall check.",
217
+ "roi": "Prevents a local precision improvement from silently degrading general retrieval quality across the agentic pipeline.",
218
+ "rollout": "Require baseline recall@k, precision@k, and answer-with-evidence metrics before changing retrieval thresholds or embedding models."
219
+ },
220
+ {
221
+ "id": "require-two-stage-rag-verifier-for-structural-near-misses",
222
+ "name": "Require two-stage RAG verifier for structural near misses",
223
+ "category": "Document RAG Safety",
224
+ "signal": "👎",
225
+ "defaultAction": "block",
226
+ "severity": "high",
227
+ "pattern": "(negation|role reversal|structural near[- ]?miss|compositional).*(without|no).*(verifier|rerank|second stage)",
228
+ "problem": "Requires a token-level verifier, reranker, or second-stage check when retrieval must distinguish structurally similar but meaningfully different evidence.",
229
+ "roi": "Catches the role-reversal and negation failures that vector similarity and keyword search can miss before downstream agents act.",
230
+ "rollout": "Enable for legal, finance, compliance, support, and autonomous-agent workflows where wrong retrieval can trigger real actions."
231
+ },
232
+ {
233
+ "id": "checkpoint-rag-latency-precision-tradeoff",
234
+ "name": "Checkpoint RAG latency and precision tradeoff",
235
+ "category": "Document RAG Safety",
236
+ "signal": "👎",
237
+ "defaultAction": "warn",
238
+ "severity": "medium",
239
+ "pattern": "(rerank|verifier|cross[- ]?encoder|two[- ]?stage).*(latency|sla|budget)",
240
+ "problem": "Requires an explicit latency budget before adding verifier or reranker stages to precision-sensitive retrieval.",
241
+ "roi": "Keeps high-risk RAG workflows accurate without accidentally making production agents too slow or expensive to run.",
242
+ "rollout": "Start in warn mode; promote to block when latency regressions exceed the workflow SLA."
243
+ },
244
+ {
245
+ "id": "require-director-journal-for-long-running-agent",
246
+ "name": "Require director journal for long-running agent",
247
+ "category": "Long-Running Agent Context",
248
+ "signal": "👎",
249
+ "defaultAction": "block",
250
+ "severity": "high",
251
+ "pattern": "(long[- ]?running|multi[- ]?agent|background agent).*(missing|no).*(director journal|working memory|structured memory)",
252
+ "problem": "Blocks long-running agent work when the system relies on raw chat history instead of a structured working-memory journal.",
253
+ "roi": "Prevents context-window bloat, compaction drift, and incoherent handoffs across hundreds of agent requests.",
254
+ "rollout": "Enable for background agents, multi-agent investigations, revenue loops, and any workflow expected to span more than one session."
255
+ },
256
+ {
257
+ "id": "require-critic-review-for-agent-findings",
258
+ "name": "Require critic review for agent findings",
259
+ "category": "Long-Running Agent Context",
260
+ "signal": "👎",
261
+ "defaultAction": "block",
262
+ "severity": "high",
263
+ "pattern": "(finding|claim|expert report|agent summary).*(missing|no).*(critic|review|credibility|evidence score)",
264
+ "problem": "Requires a critic pass with evidence inspection and credibility scoring before agent findings become shared truth.",
265
+ "roi": "Stops invented or misinterpreted findings from becoming the memory that future agents build on.",
266
+ "rollout": "Start with warn mode for summaries; promote to block for security, revenue, incident, and customer-facing findings."
267
+ },
268
+ {
269
+ "id": "checkpoint-critic-timeline-conflict-resolution",
270
+ "name": "Checkpoint critic timeline conflict resolution",
271
+ "category": "Long-Running Agent Context",
272
+ "signal": "👎",
273
+ "defaultAction": "warn",
274
+ "severity": "medium",
275
+ "pattern": "(timeline|memory|journal).*(conflict|duplicate|contradict|stale).*(without|no).*(resolution|credibility)",
276
+ "problem": "Requires duplicate removal and source-strength conflict resolution before a long-running agent updates its timeline.",
277
+ "roi": "Keeps long-lived agent memory coherent while preserving only the strongest evidence across rounds.",
278
+ "rollout": "Use in warn mode for internal loops and block mode when the timeline feeds irreversible actions or external replies."
279
+ },
280
+ {
281
+ "id": "require-verifier-before-reasoning-compression",
282
+ "name": "Require verifier before reasoning compression",
283
+ "category": "Reasoning Efficiency Safety",
284
+ "signal": "👎",
285
+ "defaultAction": "block",
286
+ "severity": "high",
287
+ "pattern": "(reasoning|chain|trace).*(compress|shorten|prune).*(without|no).*(verifier|accuracy|pass@1)",
288
+ "problem": "Blocks reasoning-length compression when the workflow has not preserved verifier, accuracy, and rollback evidence.",
289
+ "roi": "Saves tokens only when compressed reasoning still passes quality checks instead of trading correctness for shorter traces.",
290
+ "rollout": "Enable for model-routing, benchmark, prompt-eval, and expensive agent workflows before shortening reasoning traces."
291
+ },
292
+ {
293
+ "id": "checkpoint-low-confidence-reasoning-steps",
294
+ "name": "Checkpoint low-confidence reasoning steps",
295
+ "category": "Reasoning Efficiency Safety",
296
+ "signal": "👎",
297
+ "defaultAction": "warn",
298
+ "severity": "medium",
299
+ "pattern": "(low[- ]?confidence|uncertain).*(step|reasoning).*(correct rollout|accepted answer)",
300
+ "problem": "Requires inspection before low-confidence steps in otherwise successful reasoning become reinforced training or routing signal.",
301
+ "roi": "Reduces brittle step-level learning where a correct final answer hides unstable intermediate reasoning.",
302
+ "rollout": "Start as warn mode for prompt and model evals; promote to block when those traces update routing or fine-tuning data."
303
+ },
304
+ {
305
+ "id": "checkpoint-high-confidence-failed-rollout",
306
+ "name": "Checkpoint high-confidence failed rollout",
307
+ "category": "Reasoning Efficiency Safety",
308
+ "signal": "👎",
309
+ "defaultAction": "warn",
310
+ "severity": "medium",
311
+ "pattern": "(high[- ]?confidence|confident).*(failed|verifier failed|truncated).*(rollout|trace)",
312
+ "problem": "Requires a verifier-error or truncation check before penalizing confident reasoning from failed rollouts.",
313
+ "roi": "Prevents training or routing updates from punishing correct reasoning when the failure came from truncation or verifier noise.",
314
+ "rollout": "Use for reasoning-compression experiments, evaluation harnesses, and DPO/RLHF export review."
315
+ },
316
+ {
317
+ "id": "require-hybrid-prefix-cache-coherence-eval",
318
+ "name": "Require hybrid prefix cache coherence eval",
319
+ "category": "Sparse Attention Runtime Safety",
320
+ "signal": "👎",
321
+ "defaultAction": "block",
322
+ "severity": "critical",
323
+ "pattern": "(deepseek|sparse attention|hybrid attention|prefix cache).*(missing|no).*(coherence|rollback|cache eval)",
324
+ "problem": "Blocks hybrid sparse-attention serving changes when prefix cache lifetime, compressed KV reuse, and rollback behavior have not been verified.",
325
+ "roi": "Prevents expensive long-context inference rollouts from reusing stale cache state or corrupting speculative decode paths.",
326
+ "rollout": "Enable before raising context windows, switching cache implementations, or deploying ShadowRadix-style prefix caching."
327
+ },
328
+ {
329
+ "id": "checkpoint-speculative-decoding-acceptance",
330
+ "name": "Checkpoint speculative decoding acceptance",
331
+ "category": "Sparse Attention Runtime Safety",
332
+ "signal": "👎",
333
+ "defaultAction": "warn",
334
+ "severity": "high",
335
+ "pattern": "(speculative|mtp|eagle).*(accept|rollback|draft).*(low|missing|unstable)",
336
+ "problem": "Requires acceptance-rate, rollback, and correctness evidence before speculative decoding is treated as a production speedup.",
337
+ "roi": "Avoids routing traffic to a faster-looking path that collapses acceptance length or hides draft-token correctness failures.",
338
+ "rollout": "Start in warn mode for lab benchmarks; promote to block when speculation is enabled for customer traffic."
339
+ },
340
+ {
341
+ "id": "require-long-context-kv-offload-capacity-plan",
342
+ "name": "Require long-context KV offload capacity plan",
343
+ "category": "Sparse Attention Runtime Safety",
344
+ "signal": "👎",
345
+ "defaultAction": "block",
346
+ "severity": "high",
347
+ "pattern": "(long context|1m token|kv cache|hisparse|cpu offload).*(missing|no).*(capacity|offload|memory budget)",
348
+ "problem": "Blocks long-context serving rollouts when KV cache capacity, CPU offload, or memory budgets are not documented and benchmarked.",
349
+ "roi": "Prevents GPU memory cliffs and surprise throughput regressions when agents send very large traces or document contexts.",
350
+ "rollout": "Require for 128k+ context targets and any hosted inference path where batch size or context length can grow automatically."
351
+ },
352
+ {
353
+ "id": "require-rollout-routing-and-indexer-replay",
354
+ "name": "Require rollout routing and indexer replay",
355
+ "category": "Sparse Attention Runtime Safety",
356
+ "signal": "👎",
357
+ "defaultAction": "block",
358
+ "severity": "critical",
359
+ "pattern": "(rl|training|fine[- ]?tune|reward).*(missing|no).*(routing replay|indexer replay|train[- ]?inference drift)",
360
+ "problem": "Blocks RL or fine-tuning updates when rollout routing, sparse indexer choices, and train-inference drift are not captured and replayed.",
361
+ "roi": "Stops verified-RL experiments from optimizing against a different execution path than the one served during rollout.",
362
+ "rollout": "Enable for any self-hosted model training, reward optimization, DPO export review, or policy update based on sparse-attention rollouts."
363
+ },
364
+ {
365
+ "id": "checkpoint-mixed-precision-determinism",
366
+ "name": "Checkpoint mixed-precision determinism",
367
+ "category": "Sparse Attention Runtime Safety",
368
+ "signal": "👎",
369
+ "defaultAction": "warn",
370
+ "severity": "high",
371
+ "pattern": "(fp4|fp8|mixed precision|quant).*(nondeterministic|spike|drift|missing deterministic)",
372
+ "problem": "Requires deterministic settings and sensitive FP32 path checks before mixed-precision inference or training results become routing evidence.",
373
+ "roi": "Keeps FP4/FP8 savings from producing silent numerical drift, noisy KL spikes, or false benchmark confidence.",
374
+ "rollout": "Start in warn mode for benchmarking and promote to block when mixed-precision results update routing, training, or customer-facing model choices."
375
+ },
376
+ {
377
+ "id": "checkpoint-long-context-throughput-regression",
378
+ "name": "Checkpoint long-context throughput regression",
379
+ "category": "Sparse Attention Runtime Safety",
380
+ "signal": "👎",
381
+ "defaultAction": "warn",
382
+ "severity": "medium",
383
+ "pattern": "(long context|decode throughput|tpot|latency).*(regression|drop|slowdown)",
384
+ "problem": "Requires a measured throughput comparison before long-context runtime changes are treated as performance improvements.",
385
+ "roi": "Prevents teams from adopting impressive-looking runtime changes that only work on narrow prompts or short contexts.",
386
+ "rollout": "Use in CI benchmark jobs and model-routing reviews whenever context length, cache policy, or speculation settings change."
387
+ },
388
+ {
389
+ "id": "require-ai-gateway-control-plane",
390
+ "name": "Require AI gateway control plane",
391
+ "category": "AI Engineering Stack Safety",
392
+ "signal": "👎",
393
+ "defaultAction": "block",
394
+ "severity": "critical",
395
+ "pattern": "(model|llm|provider).*(direct key|api key|no gateway|bypass gateway)",
396
+ "problem": "Blocks model-provider rollout when requests bypass a central gateway or proxy with cost, identity, provider, and retention controls.",
397
+ "roi": "Prevents leaked keys, fragmented spend, and invisible model usage before agent adoption scales across the company.",
398
+ "rollout": "Enable before adding new AI coding clients, model providers, or bring-your-own-key routes."
399
+ },
400
+ {
401
+ "id": "require-progressive-mcp-tool-discovery",
402
+ "name": "Require progressive MCP tool discovery",
403
+ "category": "AI Engineering Stack Safety",
404
+ "signal": "👎",
405
+ "defaultAction": "warn",
406
+ "severity": "high",
407
+ "pattern": "(mcp|tool schema|tool count).*(too many|context bloat|no code mode|no progressive discovery)",
408
+ "problem": "Requires large MCP surfaces to expose a small search/execute or progressive-discovery interface instead of loading every tool schema into every prompt.",
409
+ "roi": "Cuts token burn and context-window clutter while letting teams keep adding useful tools behind the portal.",
410
+ "rollout": "Start warning above 20 tools; promote to block for high-volume clients or expensive frontier-model workflows."
411
+ },
412
+ {
413
+ "id": "require-agent-context-freshness",
414
+ "name": "Require AGENTS.md and LLM wiki freshness",
415
+ "category": "AI Engineering Stack Safety",
416
+ "signal": "👎",
417
+ "defaultAction": "block",
418
+ "severity": "high",
419
+ "pattern": "(AGENTS\\.md|llm wiki|agent context).*(stale|missing|outdated|no owner|no test command)",
420
+ "problem": "Blocks agent work when repository instructions, ownership, dependencies, test commands, or LLM wiki pages are missing or stale.",
421
+ "roi": "Turns knowledge-base enthusiasm into reliable agent context instead of letting stale docs make plausible wrong changes.",
422
+ "rollout": "Regenerate on repo topology changes and require review from the owning team before the context becomes canonical."
423
+ },
424
+ {
425
+ "id": "require-risk-tiered-ai-review",
426
+ "name": "Require risk-tiered AI review",
427
+ "category": "AI Engineering Stack Safety",
428
+ "signal": "👎",
429
+ "defaultAction": "block",
430
+ "severity": "high",
431
+ "pattern": "(merge|pull request|review).*(no ai reviewer|no severity|no rule id|no standards)",
432
+ "problem": "Requires automated review to classify risk, cite standards-as-skills, and separate security, code quality, performance, docs, and release-impact findings.",
433
+ "roi": "Makes AI review actionable and auditable, reducing merge churn while converting standards into reusable agent feedback.",
434
+ "rollout": "Use lite review for trivial changes and full review for money, production, security, data, or customer-facing work."
435
+ },
436
+ {
437
+ "id": "require-sandboxed-background-agent-runtime",
438
+ "name": "Require sandboxed background agent runtime",
439
+ "category": "AI Engineering Stack Safety",
440
+ "signal": "👎",
441
+ "defaultAction": "block",
442
+ "severity": "critical",
443
+ "pattern": "(background agent|long running agent|autonomous agent).*(no sandbox|host machine|no durable session|no build log)",
444
+ "problem": "Blocks background agents from cloning, building, testing, or publishing unless they run in an isolated durable environment with logs.",
445
+ "roi": "Lets the team pursue unattended revenue and engineering workflows without turning local developer machines into the execution boundary.",
446
+ "rollout": "Enable for every agent that can run tests, push branches, deploy, publish content, change billing, or touch customer data."
75
447
  }
76
448
  ]
77
449
  }
@@ -14,6 +14,11 @@
14
14
  "search_thumbgate",
15
15
  "plan_multimodal_retrieval",
16
16
  "plan_context_footprint",
17
+ "plan_agent_design_governance",
18
+ "plan_proactive_agent_eval_guardrails",
19
+ "plan_reward_hacking_guardrails",
20
+ "plan_oss_pr_opportunity_scout",
21
+ "plan_chatgpt_ads_readiness",
17
22
  "reflect_on_feedback",
18
23
  "feedback_stats",
19
24
  "diagnose_failure",
@@ -77,6 +82,11 @@
77
82
  "search_thumbgate",
78
83
  "plan_multimodal_retrieval",
79
84
  "plan_context_footprint",
85
+ "plan_agent_design_governance",
86
+ "plan_proactive_agent_eval_guardrails",
87
+ "plan_reward_hacking_guardrails",
88
+ "plan_oss_pr_opportunity_scout",
89
+ "plan_chatgpt_ads_readiness",
80
90
  "reflect_on_feedback",
81
91
  "prevention_rules",
82
92
  "set_task_scope",
@@ -124,6 +134,11 @@
124
134
  "search_thumbgate",
125
135
  "plan_multimodal_retrieval",
126
136
  "plan_context_footprint",
137
+ "plan_agent_design_governance",
138
+ "plan_proactive_agent_eval_guardrails",
139
+ "plan_reward_hacking_guardrails",
140
+ "plan_oss_pr_opportunity_scout",
141
+ "plan_chatgpt_ads_readiness",
127
142
  "feedback_stats",
128
143
  "diagnose_failure",
129
144
  "list_harnesses",
@@ -160,6 +175,11 @@
160
175
  "search_thumbgate",
161
176
  "plan_multimodal_retrieval",
162
177
  "plan_context_footprint",
178
+ "plan_agent_design_governance",
179
+ "plan_proactive_agent_eval_guardrails",
180
+ "plan_reward_hacking_guardrails",
181
+ "plan_oss_pr_opportunity_scout",
182
+ "plan_chatgpt_ads_readiness",
163
183
  "feedback_stats",
164
184
  "diagnose_failure",
165
185
  "list_harnesses",
@@ -192,6 +212,11 @@
192
212
  "retrieve_lessons",
193
213
  "search_thumbgate",
194
214
  "plan_context_footprint",
215
+ "plan_agent_design_governance",
216
+ "plan_proactive_agent_eval_guardrails",
217
+ "plan_reward_hacking_guardrails",
218
+ "plan_oss_pr_opportunity_scout",
219
+ "plan_chatgpt_ads_readiness",
195
220
  "diagnose_failure",
196
221
  "list_intents",
197
222
  "plan_intent",
@@ -23,17 +23,20 @@
23
23
  "long-trace-review": {
24
24
  "label": "Long trace review",
25
25
  "summary": "Review long agent traces, multi-step failures, and large-context coding sessions without dropping important detail.",
26
- "desiredStrengths": ["long-horizon-coding", "multi-agent", "reliability"],
26
+ "desiredStrengths": ["long-horizon-coding", "multi-agent", "reliability", "long-context"],
27
27
  "targetContextWindow": 128000,
28
28
  "benchmarkCommands": [
29
29
  "npx thumbgate eval --from-feedback --json --min-score=0",
30
30
  "node scripts/gate-eval.js run",
31
- "npx thumbgate bench --json --min-score=90"
31
+ "npx thumbgate bench --json --min-score=90",
32
+ "npx thumbgate deepseek-v4-runtime-guardrails --context-tokens=900000 --hybrid-attention --speculative-decoding --json"
32
33
  ],
33
34
  "metrics": [
34
35
  "passRate",
35
36
  "longContextReliability",
36
37
  "traceCompressionLoss",
38
+ "cacheCoherencePassRate",
39
+ "speculativeAcceptLength",
37
40
  "medianLatencyMs",
38
41
  "costPerTraceUsd"
39
42
  ]
@@ -54,9 +57,63 @@
54
57
  "costPer1kActionsUsd",
55
58
  "escalationRate"
56
59
  ]
60
+ },
61
+ "dashboard-analysis": {
62
+ "label": "Dashboard and dataset analysis",
63
+ "summary": "Evaluate frontier models for dataset analysis, chart generation, dashboard planning, and proof-backed insight quality before routing expensive analytical work.",
64
+ "desiredStrengths": ["data-analysis", "dashboard-creation", "charting", "long-context", "reliability"],
65
+ "targetContextWindow": 200000,
66
+ "benchmarkCommands": [
67
+ "npx thumbgate eval --from-feedback --json --min-score=0",
68
+ "node scripts/gate-eval.js run",
69
+ "npx thumbgate bench --json --min-score=90"
70
+ ],
71
+ "metrics": [
72
+ "insightAccuracy",
73
+ "chartSpecValidity",
74
+ "dashboardCompleteness",
75
+ "longContextReliability",
76
+ "medianLatencyMs",
77
+ "costPerAnalysisUsd"
78
+ ]
57
79
  }
58
80
  },
59
81
  "candidates": [
82
+ {
83
+ "id": "self-hosted/deepseek-v4-flash-sglang",
84
+ "vendor": "DeepSeek",
85
+ "family": "deepseek",
86
+ "provider": "self-hosted",
87
+ "gateway": "sglang",
88
+ "model": "deepseek-v4-flash",
89
+ "contextWindow": 1000000,
90
+ "costClass": "medium",
91
+ "strengths": ["long-context", "fast-inference", "reliability", "long-horizon-coding"],
92
+ "notes": "Self-hosted long-context candidate for teams that can operate SGLang-class sparse-attention serving. Requires ThumbGate runtime guardrails before routing production traces."
93
+ },
94
+ {
95
+ "id": "self-hosted/deepseek-v4-pro-sglang",
96
+ "vendor": "DeepSeek",
97
+ "family": "deepseek",
98
+ "provider": "self-hosted",
99
+ "gateway": "sglang",
100
+ "model": "deepseek-v4-pro",
101
+ "contextWindow": 1000000,
102
+ "costClass": "high",
103
+ "strengths": ["long-context", "reliability", "long-horizon-coding", "multi-agent"],
104
+ "notes": "High-capacity self-hosted candidate for long-trace review and verified-RL experiments. Benchmark cache coherence, speculative decoding, KV offload, and train-inference drift before use."
105
+ },
106
+ {
107
+ "id": "openai/gpt-5.5",
108
+ "vendor": "OpenAI",
109
+ "family": "gpt",
110
+ "provider": "openai",
111
+ "model": "gpt-5.5",
112
+ "contextWindow": 1000000,
113
+ "costClass": "high",
114
+ "strengths": ["agentic-coding", "tool-use", "reliability", "long-context", "data-analysis", "dashboard-creation", "charting"],
115
+ "notes": "Frontier candidate for complex reasoning, coding, dataset analysis, and dashboard workflows. Benchmark before routing high-volume or cost-sensitive work."
116
+ },
60
117
  {
61
118
  "id": "anthropic/claude-haiku-4-5",
62
119
  "vendor": "Anthropic",
@@ -3,18 +3,21 @@
3
3
  "tiers": {
4
4
  "nano": {
5
5
  "label": "GPT-5.4 nano",
6
+ "modelId": "gpt-5.4-nano",
6
7
  "taskTypes": ["classification", "extraction", "ranking", "labeling", "summarization"],
7
8
  "maxContextTokens": 32000,
8
9
  "costMultiplier": 0.1
9
10
  },
10
11
  "mini": {
11
12
  "label": "GPT-5.4 mini",
13
+ "modelId": "gpt-5.4-mini",
12
14
  "taskTypes": ["code-edit", "test-generation", "review", "tool-use", "debugging"],
13
15
  "maxContextTokens": 200000,
14
16
  "costMultiplier": 0.4
15
17
  },
16
18
  "frontier": {
17
- "label": "GPT-5.4",
19
+ "label": "GPT-5.5",
20
+ "modelId": "gpt-5.5",
18
21
  "taskTypes": ["architecture", "cross-file", "complex-debugging", "large-context"],
19
22
  "maxContextTokens": 1000000,
20
23
  "costMultiplier": 1.0,