thumbgate 1.17.0 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/package.json +6 -4
- package/public/index.html +17 -16
- package/public/numbers.html +2 -2
- package/scripts/auto-promote-gates.js +4 -1
- package/scripts/feedback-to-rules.js +11 -1
- package/scripts/feedback_quality_eval.py +725 -0
- package/src/api/server.js +22 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "thumbgate-marketplace",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.18.0",
|
|
4
4
|
"owner": {
|
|
5
5
|
"name": "Igor Ganapolsky",
|
|
6
6
|
"email": "ig5973700@gmail.com"
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
"source": "npm",
|
|
14
14
|
"package": "thumbgate"
|
|
15
15
|
},
|
|
16
|
-
"version": "1.
|
|
16
|
+
"version": "1.18.0",
|
|
17
17
|
"author": {
|
|
18
18
|
"name": "Igor Ganapolsky"
|
|
19
19
|
},
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "thumbgate",
|
|
3
3
|
"description": "Type ๐ or ๐ on any agent action. ThumbGate captures it, distills a lesson, and blocks the pattern from repeating. One thumbs-down = the agent physically cannot make that mistake again. 33 pre-action checks, budget enforcement, self-protection, and NIST/SOC2 compliance tags.",
|
|
4
|
-
"version": "1.
|
|
4
|
+
"version": "1.18.0",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Igor Ganapolsky"
|
|
7
7
|
},
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "thumbgate",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.18.0",
|
|
4
4
|
"description": "ThumbGate โ ๐๐ feedback that teaches your AI agent. Thumbs down a mistake, it never happens again.",
|
|
5
5
|
"homepage": "https://thumbgate-production.up.railway.app",
|
|
6
6
|
"transport": "stdio",
|
|
@@ -2,13 +2,13 @@
|
|
|
2
2
|
"mcpServers": {
|
|
3
3
|
"thumbgate": {
|
|
4
4
|
"command": "npx",
|
|
5
|
-
"args": ["--yes", "--package", "thumbgate@1.
|
|
5
|
+
"args": ["--yes", "--package", "thumbgate@1.18.0", "thumbgate", "serve"]
|
|
6
6
|
}
|
|
7
7
|
},
|
|
8
8
|
"hooks": {
|
|
9
9
|
"preToolUse": {
|
|
10
10
|
"command": "npx",
|
|
11
|
-
"args": ["--yes", "--package", "thumbgate@1.
|
|
11
|
+
"args": ["--yes", "--package", "thumbgate@1.18.0", "thumbgate", "gate-check"]
|
|
12
12
|
}
|
|
13
13
|
}
|
|
14
14
|
}
|
|
@@ -216,7 +216,7 @@ const {
|
|
|
216
216
|
finalizeSession: finalizeFeedbackSession,
|
|
217
217
|
} = require('../../scripts/feedback-session');
|
|
218
218
|
|
|
219
|
-
const SERVER_INFO = { name: 'thumbgate-mcp', version: '1.
|
|
219
|
+
const SERVER_INFO = { name: 'thumbgate-mcp', version: '1.18.0' };
|
|
220
220
|
const COMMERCE_CATEGORIES = [
|
|
221
221
|
'product_recommendation',
|
|
222
222
|
'brand_compliance',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "thumbgate",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.18.0",
|
|
4
4
|
"description": "ThumbGate self-improving agent governance: thumbs-up/down turns every mistake into a prevention rule and blocks repeat patterns. 33 pre-action checks, budget enforcement, and self-protection for Claude Code, Cursor, Codex, Gemini CLI, and Amp.",
|
|
5
5
|
"homepage": "https://thumbgate-production.up.railway.app",
|
|
6
6
|
"repository": {
|
|
@@ -110,6 +110,7 @@
|
|
|
110
110
|
"scripts/feedback-loop.js",
|
|
111
111
|
"scripts/feedback-paths.js",
|
|
112
112
|
"scripts/feedback-quality.js",
|
|
113
|
+
"scripts/feedback_quality_eval.py",
|
|
113
114
|
"scripts/feedback-schema.js",
|
|
114
115
|
"scripts/feedback-session.js",
|
|
115
116
|
"scripts/feedback-to-rules.js",
|
|
@@ -346,6 +347,7 @@
|
|
|
346
347
|
"test:telemetry-tracked-link-slug": "node --test tests/telemetry-tracked-link-slug.test.js",
|
|
347
348
|
"test:prompt-eval": "node --test tests/prompt-eval.test.js",
|
|
348
349
|
"eval:feedback": "node scripts/prompt-eval.js --from-feedback",
|
|
350
|
+
"eval:feedback-quality": "python3 scripts/feedback_quality_eval.py",
|
|
349
351
|
"test:decision-trace": "node --test tests/decision-trace.test.js",
|
|
350
352
|
"test:feedback-fallback": "node --test tests/feedback-fallback.test.js",
|
|
351
353
|
"test:metaclaw": "node --test tests/metaclaw-features.test.js",
|
|
@@ -406,7 +408,7 @@
|
|
|
406
408
|
"test:e2e": "node --test tests/e2e-pipeline.test.js tests/e2e-product-flows.test.js tests/e2e-coverage-contract.test.js",
|
|
407
409
|
"test:rlaif": "node --test tests/rlaif-self-audit.test.js tests/dpo-optimizer.test.js tests/meta-policy.test.js tests/agent-reward-model.test.js",
|
|
408
410
|
"test:attribution": "node --test tests/feedback-attribution.test.js tests/hybrid-feedback-context.test.js",
|
|
409
|
-
"test:quality": "node --test tests/validate-feedback.test.js",
|
|
411
|
+
"test:quality": "node --test tests/validate-feedback.test.js tests/feedback-quality-eval-python.test.js",
|
|
410
412
|
"test:intelligence": "node --test tests/intelligence.test.js",
|
|
411
413
|
"test:training-export": "node --test tests/training-export.test.js tests/databricks-export.test.js",
|
|
412
414
|
"test:deployment": "node --test tests/deployment.test.js tests/deploy-policy.test.js tests/publish-decision.test.js tests/changeset-check.test.js tests/release-notes.test.js tests/sonarcloud-workflow.test.js tests/package-boundary.test.js tests/public-package-boundary.test.js tests/revenue-observability-workflow.test.js",
|
|
@@ -676,7 +678,7 @@
|
|
|
676
678
|
"dependencies": {
|
|
677
679
|
"@anthropic-ai/sdk": "0.92.0",
|
|
678
680
|
"@google/genai": "1.49.0",
|
|
679
|
-
"@huggingface/transformers": "^4.
|
|
681
|
+
"@huggingface/transformers": "^4.2.0",
|
|
680
682
|
"@lancedb/lancedb": "^0.27.2",
|
|
681
683
|
"apache-arrow": "^18.1.0",
|
|
682
684
|
"better-sqlite3": "^12.9.0",
|
|
@@ -692,7 +694,7 @@
|
|
|
692
694
|
},
|
|
693
695
|
"mcpName": "io.github.IgorGanapolsky/thumbgate",
|
|
694
696
|
"devDependencies": {
|
|
695
|
-
"@changesets/changelog-github": "^0.
|
|
697
|
+
"@changesets/changelog-github": "^0.7.0",
|
|
696
698
|
"@changesets/cli": "^2.31.0",
|
|
697
699
|
"c8": "^11.0.0",
|
|
698
700
|
"undici": "^8.2.0"
|
package/public/index.html
CHANGED
|
@@ -19,7 +19,7 @@ __GOOGLE_SITE_VERIFICATION_META__
|
|
|
19
19
|
<meta property="og:image" content="https://thumbgate-production.up.railway.app/og.png">
|
|
20
20
|
<meta name="twitter:card" content="summary_large_image">
|
|
21
21
|
<meta name="twitter:image" content="https://thumbgate-production.up.railway.app/og.png">
|
|
22
|
-
<meta name="thumbgate-version" content="1.
|
|
22
|
+
<meta name="thumbgate-version" content="1.18.0">
|
|
23
23
|
<meta name="keywords" content="ThumbGate, thumbgate, AI agent orchestration, AI experience orchestration, agent enforcement layer, save LLM tokens, reduce Claude API cost, reduce OpenAI cost, AI agent token savings, prevent LLM retries, prevent hallucination retries, stop AI token waste, pre-action checks, agent governance, Claude Code, Cursor, Codex, Gemini, Amp, Cline, OpenCode, workflow hardening, context engineering, AI authenticity, brand authenticity AI">
|
|
24
24
|
<link rel="apple-touch-icon" href="/apple-touch-icon.png">
|
|
25
25
|
|
|
@@ -1231,26 +1231,26 @@ __GA_BOOTSTRAP__
|
|
|
1231
1231
|
<a href="https://www.npmjs.com/package/thumbgate" target="_blank" rel="noopener" class="btn-free">Install Free</a>
|
|
1232
1232
|
</div>
|
|
1233
1233
|
<div class="price-card pro" data-price-dollars="19">
|
|
1234
|
-
<div class="tier">
|
|
1234
|
+
<div class="tier">Pro</div>
|
|
1235
1235
|
<div class="price">$19<span style="font-size:16px;color:var(--text-dim)">/mo</span></div>
|
|
1236
|
-
<div class="price-sub">
|
|
1236
|
+
<div class="price-sub">Stop paying tokens to re-correct the same agent mistake across sessions.</div>
|
|
1237
1237
|
<ul>
|
|
1238
|
-
<li>
|
|
1239
|
-
<li>
|
|
1240
|
-
<li><a href="/dashboard#insights" style="color:var(--cyan);text-decoration:underline;">Visual check debugger</a> for every blocked action and the check that fired</li>
|
|
1241
|
-
<li>Auto-connect so supported agents appear automatically after setup</li>
|
|
1242
|
-
<li><a href="/dashboard#export" style="color:var(--cyan);text-decoration:underline;">DPO training data export</a> with ready-to-use preference pairs for fine-tuning</li>
|
|
1243
|
-
<li>Personal local dashboard for the individual operator</li>
|
|
1244
|
-
<li>Model Hardening Advisor plus HuggingFace dataset export</li>
|
|
1245
|
-
<li>Review-ready workflow support and proof-ready lesson bundles</li>
|
|
1246
|
-
<li>Team lesson export/import for handoff or migration</li>
|
|
1238
|
+
<li><strong>Block every repeat mistake</strong> โ unlimited feedback captures and prevention rules (Free caps at 5 active rules)</li>
|
|
1239
|
+
<li><strong>Never re-explain a correction</strong> โ lesson recall and search across sessions on every agent surface</li>
|
|
1240
|
+
<li><strong>See exactly which rule fired</strong> โ <a href="/dashboard#insights" style="color:var(--cyan);text-decoration:underline;">Visual check debugger</a> for every blocked action and the check that fired</li>
|
|
1241
|
+
<li><strong>One install, every agent</strong> โ Auto-connect so supported agents appear automatically after setup (Claude Code, Cursor, Codex, Gemini, Amp, Cline, OpenCode)</li>
|
|
1242
|
+
<li><strong>Fine-tune your local model</strong> on what your team actually wants โ <a href="/dashboard#export" style="color:var(--cyan);text-decoration:underline;">DPO training data export</a> with ready-to-use preference pairs for fine-tuning</li>
|
|
1243
|
+
<li><strong>Audit-ready enforcement proof</strong> โ Personal local dashboard for the individual operator with auditable block history</li>
|
|
1244
|
+
<li><strong>Ship hardened agents to production</strong> โ Model Hardening Advisor plus HuggingFace dataset export</li>
|
|
1245
|
+
<li><strong>Hand a PR with proof</strong> โ Review-ready workflow support and proof-ready lesson bundles a reviewer can verify in 30 seconds</li>
|
|
1246
|
+
<li><strong>Hand off without re-onboarding</strong> โ Team lesson export/import for handoff or migration</li>
|
|
1247
1247
|
</ul>
|
|
1248
1248
|
<div style="margin:12px 0 16px;padding:12px;border:1px solid rgba(34,211,238,0.25);border-radius:8px;background:rgba(34,211,238,0.06);">
|
|
1249
1249
|
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px;">What your Pro dashboard looks like</div>
|
|
1250
1250
|
<div style="font-family:var(--mono);font-size:12px;color:var(--cyan);line-height:1.6;">checks: 36 active<br>feedback: unlimited<br>exports: DPO + lessons</div>
|
|
1251
1251
|
</div>
|
|
1252
1252
|
<div style="font-size:11px;letter-spacing:0.08em;text-transform:uppercase;color:var(--cyan);font-weight:800;margin-bottom:8px;">PAY-NOW PRO</div>
|
|
1253
|
-
<a href="/checkout/pro?
|
|
1253
|
+
<a href="/checkout/pro?utm_source=pricing&utm_medium=cta&utm_campaign=v2_landing&utm_content=pricing_pro&plan_id=pro" id="pro-checkout-link" class="btn-pro" onclick="handleProCheckout();return false;" style="display:block;width:100%;text-align:center;padding:12px;font-size:15px;">Upgrade to Pro โ $19/mo</a>
|
|
1254
1254
|
<p style="font-size:11px;color:var(--text-muted);margin-top:8px;text-align:center;">Billed today ยท cancel anytime.</p>
|
|
1255
1255
|
<input type="email" id="pro-email" data-buyer-email placeholder="you@company.com" style="margin-top:10px;width:100%;padding:10px 12px;border:1px solid var(--border);border-radius:8px;background:var(--bg-raised);color:var(--text);font-size:14px;">
|
|
1256
1256
|
</div>
|
|
@@ -1267,8 +1267,9 @@ __GA_BOOTSTRAP__
|
|
|
1267
1267
|
<li>Workflow hardening sprint intake for approval boundaries and rollback safety</li>
|
|
1268
1268
|
<li>Email support during pilot rollout</li>
|
|
1269
1269
|
</ul>
|
|
1270
|
-
<a href="
|
|
1271
|
-
<
|
|
1270
|
+
<a href="/checkout/pro?plan_id=team&seat_count=3&confirm=1&utm_source=website&utm_medium=pricing&utm_campaign=team_self_serve&cta_id=pricing_team_self_serve&cta_placement=pricing&landing_path=%2F" onclick="try{posthog.capture('team_self_serve_click',{cta:'team_self_serve',seats:3,price:147})}catch(_){};sendFirstPartyTelemetry('team_self_serve_checkout_started',{ctaId:'pricing_team_self_serve',ctaPlacement:'pricing',planId:'team',seatCount:3,price:147});sendGa4Event('begin_checkout',{currency:'USD',value:147,items:[{item_id:'team_monthly',item_name:'ThumbGate Team (3 seats)',quantity:3}]});" class="btn-team" style="display:block;text-align:center;">Start 3-seat Team โ $147/mo</a>
|
|
1271
|
+
<a href="#workflow-sprint-intake" style="display:block;text-align:center;margin-top:8px;font-size:13px;color:var(--text-dim);text-decoration:none;">Or qualify first via Workflow Hardening Sprint intake โ</a>
|
|
1272
|
+
<p style="font-size:11px;color:var(--text-muted);margin-top:8px;text-align:center;">Team is $49/seat/mo with a 3-seat minimum. Self-serve checkout starts you at $147/mo for 3 seats; add more seats from the dashboard. Prefer to qualify the workflow before paying? Start with the intake.</p>
|
|
1272
1273
|
</div>
|
|
1273
1274
|
</div>
|
|
1274
1275
|
<p style="text-align:center;color:var(--text-muted);font-size:13px;margin:40px 0;">Need a custom diagnostic, sprint, or setup? <a href="#workflow-sprint-intake" style="color:var(--cyan);text-decoration:none;">Send workflow first โ</a></p>
|
|
@@ -1464,7 +1465,7 @@ __GA_BOOTSTRAP__
|
|
|
1464
1465
|
<a href="https://www.linkedin.com/in/igorganapolsky" target="_blank" rel="noopener">LinkedIn</a>
|
|
1465
1466
|
<a href="/blog">Blog</a>
|
|
1466
1467
|
</div>
|
|
1467
|
-
<span class="footer-copy">ยฉ 2026 ThumbGate ยท MIT License ยท npm v1.
|
|
1468
|
+
<span class="footer-copy">ยฉ 2026 ThumbGate ยท MIT License ยท npm v1.18.0</span>
|
|
1468
1469
|
</div>
|
|
1469
1470
|
</footer>
|
|
1470
1471
|
|
package/public/numbers.html
CHANGED
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
"alternateName": "thumbgate",
|
|
26
26
|
"applicationCategory": "DeveloperApplication",
|
|
27
27
|
"operatingSystem": "Cross-platform, Node.js >=18.18.0",
|
|
28
|
-
"softwareVersion": "1.
|
|
28
|
+
"softwareVersion": "1.18.0",
|
|
29
29
|
"url": "https://thumbgate-production.up.railway.app/numbers",
|
|
30
30
|
"dateModified": "2026-05-07",
|
|
31
31
|
"creator": {
|
|
@@ -202,7 +202,7 @@
|
|
|
202
202
|
<main class="container">
|
|
203
203
|
<h1>The Numbers</h1>
|
|
204
204
|
<p class="subtitle">Generated first-party operational snapshot from the ThumbGate runtime. This is not customer traction, install volume, revenue, or proof that a configured gate has fired.</p>
|
|
205
|
-
<div class="freshness">Updated: 2026-05-07 ยท Version 1.
|
|
205
|
+
<div class="freshness">Updated: 2026-05-07 ยท Version 1.18.0</div>
|
|
206
206
|
<div class="truth-note"><strong>Read this first:</strong> configured checks are inventory. Recorded blocks and warnings are usage evidence. This snapshot currently reports 0 recorded hard-block event(s) and 0 recorded warning event(s).</div>
|
|
207
207
|
|
|
208
208
|
<h2>Gate enforcement</h2>
|
|
@@ -6,7 +6,10 @@ const path = require('path');
|
|
|
6
6
|
const { resolveFeedbackDir } = require('./feedback-paths');
|
|
7
7
|
|
|
8
8
|
const MAX_AUTO_GATES = 10;
|
|
9
|
-
|
|
9
|
+
// 1+ failure auto-promotes to a warning gate. Cold buyers expect "one 👎 → blocked next time"
|
|
10
|
+
// — a 2-capture threshold made first-capture invisible and broke the activation loop. Block
|
|
11
|
+
// escalation still requires 3 captures (BLOCK_THRESHOLD) so noise doesn't auto-hard-block.
|
|
12
|
+
const WARN_THRESHOLD = 1;
|
|
10
13
|
const BLOCK_THRESHOLD = 3; // 3+ repeated failures hard-block the action
|
|
11
14
|
const WINDOW_DAYS = 30;
|
|
12
15
|
|
|
@@ -32,7 +32,17 @@ function normalize(ctx) {
|
|
|
32
32
|
return (ctx || '').replace(/\/Users\/[^\s/]+/g, '~').replace(/:[0-9]+/g, '').toLowerCase().trim();
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
-
|
|
35
|
+
// HIGH_RISK_TAGS triggers single-capture promotion (count >= 1 && hasHighRisk).
|
|
36
|
+
// Tags here MUST overlap with what inferSemanticTags() actually emits (see scripts/feedback-loop.js)
|
|
37
|
+
// — otherwise cold buyers' first 👎 stays a lesson and never becomes a gate.
|
|
38
|
+
const HIGH_RISK_TAGS = new Set([
|
|
39
|
+
// Original semantic-category labels
|
|
40
|
+
'git-workflow', 'scope-control', 'trust-breach', 'execution-gap', 'regression', 'security',
|
|
41
|
+
// Tags inferSemanticTags() emits for destructive / irreversible operations
|
|
42
|
+
'destructive', 'force-push', 'delete', 'drop', 'force-overwrite',
|
|
43
|
+
'production', 'database', 'payment', 'credentials', 'secrets',
|
|
44
|
+
'rm-rf', 'reset-hard', 'truncate', 'data-loss',
|
|
45
|
+
]);
|
|
36
46
|
function analyze(entries) {
|
|
37
47
|
let positiveCount = 0, negativeCount = 0;
|
|
38
48
|
const categories = {};
|
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Offline feedback quality evaluation for ThumbGate.
|
|
4
|
+
|
|
5
|
+
This is intentionally stdlib-only. It turns feedback-log.jsonl into a small
|
|
6
|
+
quality report that answers: where are repeated failures clustering, how stable
|
|
7
|
+
is the signal, and do we have enough labeled gate decisions to compute true
|
|
8
|
+
precision/recall yet?
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import math
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sqlite3
|
|
17
|
+
from collections import Counter, defaultdict
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
PROJECT_ROOT = Path(__file__).parent.parent
|
|
24
|
+
|
|
25
|
+
# Keyword and tool-name vocabularies used to sort feedback entries into coarse
# workflow categories. Each category maps to:
#   keywords - phrases searched for in the entry's free-text fields
#   tools    - tool names searched for in the entry's tool field
# Insertion order is preserved and determines the order in which matching
# categories are reported by classify_entry().
DEFAULT_CATEGORIES = dict(
    code_edit=dict(
        keywords=["edit", "write", "implement", "refactor", "fix", "update", "create file"],
        tools=["edit", "write", "multiedit"],
    ),
    git=dict(
        keywords=["commit", "push", "branch", "merge", "pr", "pull request", "rebase", "cherry-pick"],
        tools=["bash", "git"],
    ),
    testing=dict(
        keywords=["test", "jest", "coverage", "verify", "verification", "spec", "mock", "assert"],
        tools=[],
    ),
    review=dict(
        keywords=["review", "pr comment", "resolve", "thread", "feedback"],
        tools=[],
    ),
    search=dict(
        keywords=["search", "find", "grep", "glob", "explore", "where is", "look for", "rg"],
        tools=["grep", "glob", "read", "rg"],
    ),
    security=dict(
        keywords=["security", "secret", "credential", "token", "auth", "injection", "xss"],
        tools=[],
    ),
    debugging=dict(
        keywords=["debug", "error", "crash", "stack trace", "log", "diagnose", "investigate"],
        tools=[],
    ),
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def resolve_feedback_dir() -> Path:
    """Choose the directory that holds ThumbGate feedback data.

    Resolution order:
      1. The THUMBGATE_FEEDBACK_DIR environment variable, when set and non-empty.
      2. PROJECT_ROOT/.thumbgate, if it exists.
      3. PROJECT_ROOT/.claude/memory/feedback, if it exists.
      4. ~/.thumbgate/projects/<PROJECT_ROOT.name> (returned without an
         existence check).
    """
    override = os.environ.get("THUMBGATE_FEEDBACK_DIR")
    if override:
        return Path(override)

    # Project-local locations, newest layout first.
    project_candidates = (
        PROJECT_ROOT / ".thumbgate",
        PROJECT_ROOT / ".claude" / "memory" / "feedback",
    )
    for candidate in project_candidates:
        if candidate.exists():
            return candidate

    return Path.home() / ".thumbgate" / "projects" / PROJECT_ROOT.name
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def read_jsonl(path: Path) -> Tuple[List[Dict[str, Any]], int]:
    """Load a JSON Lines file, keeping only object rows.

    Args:
        path: File to read as UTF-8. A missing file yields an empty result.

    Returns:
        A ``(rows, invalid)`` pair: ``rows`` holds every line that parsed to a
        JSON object, ``invalid`` counts lines that failed to parse or parsed to
        something other than an object. Blank lines are ignored and not counted.
    """
    records: List[Dict[str, Any]] = []
    rejected = 0
    if not path.exists():
        return records, rejected

    with path.open("r", encoding="utf-8") as stream:
        for text in (chunk.strip() for chunk in stream):
            if text == "":
                continue
            try:
                candidate = json.loads(text)
            except json.JSONDecodeError:
                rejected += 1
            else:
                # Arrays, numbers, strings etc. are valid JSON but not usable rows.
                if isinstance(candidate, dict):
                    records.append(candidate)
                else:
                    rejected += 1
    return records, rejected
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _lessons_unavailable(path: Optional[Path], error: Optional[str]) -> Dict[str, Any]:
    """Build the empty summary returned whenever lessons cannot be loaded."""
    return {
        "available": False,
        "path": None if path is None else str(path),
        "totalLessons": 0,
        "bySignal": {},
        "byDomain": {},
        "sourceFeedbackIds": [],
        "error": error,
    }


def load_sqlite_lessons(db_path: Optional[Path]) -> Dict[str, Any]:
    """Summarize the un-pruned rows of the ``lessons`` table in a SQLite DB.

    The database is opened read-only. Any failure (no path given, file missing,
    table missing, SQLite error) is reported through the returned dict rather
    than raised.

    Args:
        db_path: Location of the SQLite file, or ``None`` when no DB is configured.

    Returns:
        A dict with keys ``available``, ``path``, ``totalLessons``, ``bySignal``,
        ``byDomain``, ``sourceFeedbackIds`` (sorted, de-duplicated, stringified)
        and ``error`` (``None`` on success or when no path was given).
    """
    if not db_path:
        return _lessons_unavailable(None, None)
    if not db_path.exists():
        return _lessons_unavailable(db_path, "SQLite lesson DB does not exist.")

    # Build the URI via as_uri() so characters that are special in URIs
    # ('#', '?', '%', spaces) are percent-encoded instead of being parsed as
    # a fragment/query and pointing SQLite at the wrong file.
    read_only_uri = f"{db_path.resolve().as_uri()}?mode=ro"

    try:
        connection = sqlite3.connect(read_only_uri, uri=True)
        connection.row_factory = sqlite3.Row
        try:
            table_exists = connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='lessons'"
            ).fetchone()
            if not table_exists:
                return _lessons_unavailable(db_path, "SQLite DB does not contain a lessons table.")

            rows = connection.execute(
                "SELECT id, signal, domain, sourceFeedbackId FROM lessons WHERE pruned = 0"
            ).fetchall()
        finally:
            connection.close()
    except sqlite3.Error as exc:
        return _lessons_unavailable(db_path, str(exc))

    # NULL/empty signal or domain values are grouped under "unknown".
    by_signal = Counter(str(row["signal"] or "unknown") for row in rows)
    by_domain = Counter(str(row["domain"] or "unknown") for row in rows)
    source_ids = sorted({
        str(row["sourceFeedbackId"])
        for row in rows
        if row["sourceFeedbackId"]
    })
    return {
        "available": True,
        "path": str(db_path),
        "totalLessons": len(rows),
        "bySignal": dict(sorted(by_signal.items())),
        "byDomain": dict(sorted(by_domain.items())),
        "sourceFeedbackIds": source_ids,
        "error": None,
    }
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def normalize_signal(entry: Dict[str, Any]) -> Optional[str]:
    """Map a feedback entry onto ``"positive"``, ``"negative"`` or ``None``.

    The textual ``signal`` field (falling back to ``feedback``) is checked first,
    case-insensitively and ignoring surrounding whitespace. When it is not a
    recognized label, the sign of a numeric ``reward`` decides. A zero or
    non-numeric reward yields ``None``.

    Args:
        entry: Feedback record (a dict) to inspect.

    Returns:
        ``"positive"``, ``"negative"``, or ``None`` when no signal can be derived.
    """
    raw = str(entry.get("signal") or entry.get("feedback") or "").strip().lower()
    # Emoji are written as escapes so the thumbs-up / thumbs-down literals stay
    # distinct regardless of how this file is transcoded.
    if raw in {"positive", "up", "thumbsup", "thumbs_up", "\U0001F44D"}:
        return "positive"
    if raw in {"negative", "down", "thumbsdown", "thumbs_down", "\U0001F44E"}:
        return "negative"

    reward = entry.get("reward")
    if isinstance(reward, (int, float)):
        if reward > 0:
            return "positive"
        if reward < 0:
            return "negative"
    return None
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def normalize_text(*values: Any) -> str:
    """Flatten arbitrary values into one lowercase, space-separated string.

    ``None`` values are dropped, lists contribute each element via ``str()``,
    dicts contribute their key-sorted JSON encoding, and anything else
    contributes ``str(value)``.
    """
    fragments: List[str] = []
    for item in values:
        if item is None:
            continue
        if isinstance(item, dict):
            # sort_keys keeps the rendering stable across key orderings.
            fragments.append(json.dumps(item, sort_keys=True))
        elif isinstance(item, list):
            fragments += [str(element) for element in item]
        else:
            fragments.append(str(item))
    return " ".join(fragments).lower()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def contains_keyword(text: str, keyword: str) -> bool:
    """Report whether ``keyword`` occurs in ``text``.

    The keyword is lowercased and stripped. A blank keyword never matches.
    Keywords of at most three characters, or made only of ``[a-z0-9_+-]``,
    must appear as a standalone token (not flanked by those characters).
    Any other keyword (e.g. a multi-word phrase) is matched as a plain substring.
    ``text`` is used as given.
    """
    needle = keyword.lower().strip()
    if not needle:
        return False

    token_like = re.fullmatch(r"[a-z0-9_+-]+", needle) is not None
    if len(needle) > 3 and not token_like:
        return needle in text

    # Boundary-aware match so short tokens like "pr" do not hit "improve".
    bounded = rf"(?<![a-z0-9_+-]){re.escape(needle)}(?![a-z0-9_+-])"
    return re.search(bounded, text) is not None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def classify_entry(entry: Dict[str, Any]) -> List[str]:
    """Assign a feedback entry to one or more categories.

    Every category in DEFAULT_CATEGORIES whose keywords appear in the entry's
    free-text fields (context, whatWentWrong, whatToChange, whatWorked,
    actionReason, failureType, tags) or whose tool names appear in the entry's
    tool field is returned, in DEFAULT_CATEGORIES order. With no match, a
    non-empty string ``richContext["domain"]`` is used. Otherwise the result
    is ``["uncategorized"]``.
    """
    raw_tags = entry.get("tags")
    tags = raw_tags if isinstance(raw_tags, list) else []
    tool = entry.get("toolName") or entry.get("tool_name") or entry.get("last_tool")

    haystack = normalize_text(
        entry.get("context"),
        entry.get("whatWentWrong"),
        entry.get("whatToChange"),
        entry.get("whatWorked"),
        entry.get("actionReason"),
        entry.get("failureType"),
        tags,
    )
    tool_haystack = normalize_text(tool)

    def category_hits(config: Dict[str, Any]) -> bool:
        if any(contains_keyword(haystack, word) for word in config["keywords"]):
            return True
        return any(contains_keyword(tool_haystack, tool_name) for tool_name in config["tools"])

    categories = [label for label, config in DEFAULT_CATEGORIES.items() if category_hits(config)]
    if categories:
        return categories

    # Fall back to the domain recorded in richContext, when there is one.
    rich_context = entry.get("richContext")
    domain = rich_context.get("domain") if isinstance(rich_context, dict) else None
    if isinstance(domain, str) and domain:
        return [domain]
    return ["uncategorized"]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def parse_timestamp(value: Any) -> Optional[datetime]:
    """Parse an ISO-8601 string into a timezone-aware ``datetime``.

    ``"Z"`` is rewritten to ``"+00:00"`` before parsing, and a timestamp without
    an offset is assigned UTC. Non-string, empty, or unparseable input
    returns ``None``.
    """
    if not (isinstance(value, str) and value):
        return None
    try:
        moment = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
    return moment if moment.tzinfo is not None else moment.replace(tzinfo=timezone.utc)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def rate(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator`` rounded to 4 places, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return round(numerator / denominator, 4)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def wilson_lower_bound(positive: int, total: int, z: float = 1.96) -> float:
    """Lower bound of the Wilson score interval for ``positive`` out of ``total``.

    Args:
        positive: Number of positive observations.
        total: Number of observations; values <= 0 yield 0.0.
        z: Normal quantile for the desired confidence (default 1.96).

    Returns:
        The lower bound, rounded to 4 decimal places.
    """
    if total <= 0:
        return 0.0
    p_hat = positive / total
    z_sq = z * z
    denominator = 1 + z_sq / total
    midpoint = p_hat + z_sq / (2 * total)
    margin = z * math.sqrt((p_hat * (1 - p_hat) + z_sq / (4 * total)) / total)
    return round((midpoint - margin) / denominator, 4)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def summarize_bucket(name: str, values: Iterable[str], signals: List[str], min_support: int) -> List[Dict[str, Any]]:
    """Aggregate positive/negative signal counts per bucket value.

    Args:
        name: Key under which each row stores its bucket value.
        values: Bucket value for each observation, paired with ``signals`` by
            position (pairing stops at the shorter of the two).
        signals: Signal label for each observation. Only ``"positive"`` and
            ``"negative"`` count toward a bucket's support.
        min_support: Buckets with fewer positive+negative observations are dropped.

    Returns:
        One row per surviving bucket with support, counts, rates and the Wilson
        lower bound of the positive rate. Rows are ordered by highest negative
        rate, then highest support, then bucket value.
    """
    tallies: Dict[str, Counter] = defaultdict(Counter)
    for key, label in zip(values, signals):
        tallies[key][label] += 1

    def build_row(key: str, tally: Counter) -> Dict[str, Any]:
        ups = tally["positive"]
        downs = tally["negative"]
        support = ups + downs
        return {
            name: key,
            "support": support,
            "positive": ups,
            "negative": downs,
            "positiveRate": rate(ups, support),
            "negativeRate": rate(downs, support),
            "wilsonPositiveLower": wilson_lower_bound(ups, support),
        }

    summary = [
        build_row(key, tally)
        for key, tally in tallies.items()
        if tally["positive"] + tally["negative"] >= min_support
    ]
    summary.sort(key=lambda row: (-row["negativeRate"], -row["support"], row[name]))
    return summary
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def explicit_gate_label(entry: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Derive (expected, actual) gate labels from a feedback entry.

    expected: "harmful" for a negative feedback signal, "safe" otherwise.
        ``(None, None)`` is returned when the entry has no usable signal.
    actual: "blocked" or "allowed", taken from the first recognized value among
        the gateDecision/decision/outcome/status fields, then the boolean
        ``allowed`` / ``blocked`` flags, then ``actionType == "no-action"``
        (treated as blocked). ``None`` when none of these is present.
    """
    signal = normalize_signal(entry)
    if not signal:
        return None, None

    expected = "safe" if signal != "negative" else "harmful"

    blocked_words = {"block", "blocked", "deny", "denied", "rejected"}
    allowed_words = {"allow", "allowed", "pass", "passed", "accepted"}
    for field in ("gateDecision", "decision", "outcome", "status"):
        verdict = str(entry.get(field) or "").lower()
        if verdict in blocked_words:
            return expected, "blocked"
        if verdict in allowed_words:
            return expected, "allowed"

    # Boolean flags: (field, label when True, label when False).
    flag_rules = (
        ("allowed", "allowed", "blocked"),
        ("blocked", "blocked", "allowed"),
    )
    for field, when_true, when_false in flag_rules:
        flag = entry.get(field)
        if isinstance(flag, bool):
            return expected, (when_true if flag else when_false)

    if entry.get("actionType") == "no-action":
        return expected, "blocked"

    return expected, None
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def compute_sqlite_metrics(entries: List[Dict[str, Any]], sqlite_lessons: Dict[str, Any]) -> Dict[str, Any]:
    """Measure how much of the feedback log is backed by SQLite lessons.

    Args:
        entries: Feedback records; only those with a truthy ``id`` participate.
        sqlite_lessons: Lesson summary dict (``available``, ``path``,
            ``totalLessons``, ``bySignal``, ``byDomain``, ``sourceFeedbackIds``,
            ``error``).

    Returns:
        A metrics dict. ``feedbackLessonCoverage`` is the share of feedback ids
        that appear among the lessons' source feedback ids, and
        ``negativeLessonCoverage`` is the same share restricted to negative
        feedback. When the lesson summary is unavailable, zeroed metrics carrying
        its ``path`` and ``error`` are returned.
    """
    if not sqlite_lessons.get("available"):
        return {
            "available": False,
            "path": sqlite_lessons.get("path"),
            "totalLessons": 0,
            "feedbackLessonCoverage": 0.0,
            "negativeLessonCoverage": 0.0,
            "bySignal": {},
            "byDomain": {},
            "error": sqlite_lessons.get("error"),
        }

    all_ids = set()
    negative_ids = set()
    for entry in entries:
        entry_id = entry.get("id")
        if not entry_id:
            continue
        key = str(entry_id)
        all_ids.add(key)
        if normalize_signal(entry) == "negative":
            negative_ids.add(key)

    linked_ids = set(sqlite_lessons.get("sourceFeedbackIds") or [])
    covered_all = len(all_ids & linked_ids)
    covered_negative = len(negative_ids & linked_ids)

    return {
        "available": True,
        "path": sqlite_lessons.get("path"),
        "totalLessons": sqlite_lessons.get("totalLessons", 0),
        "feedbackLessonCoverage": rate(covered_all, len(all_ids)),
        "negativeLessonCoverage": rate(covered_negative, len(negative_ids)),
        "bySignal": sqlite_lessons.get("bySignal") or {},
        "byDomain": sqlite_lessons.get("byDomain") or {},
        "error": None,
    }
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def retrieval_score(row: Dict[str, Any]) -> Optional[float]:
    """Extract the first usable numeric score from a retrieval row.

    The keys ``score``, ``similarity``, ``distanceScore`` and ``topSimilarity``
    are tried in that order. A value is usable when it converts to a finite
    float. Missing, non-numeric, NaN and infinite values are skipped so they
    cannot poison downstream max/average computations.

    Args:
        row: Retrieval record (a dict) to inspect.

    Returns:
        The score as a float, or ``None`` when no key holds a usable value.
    """
    for key in ("score", "similarity", "distanceScore", "topSimilarity"):
        value = row.get(key)
        try:
            score = float(value)
        except (TypeError, ValueError, OverflowError):
            # None, unparseable strings, and ints too large for a float.
            continue
        if math.isfinite(score):
            return score
    return None
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def feedback_id_for_retrieval(row: Dict[str, Any]) -> Optional[str]:
    """Return the first truthy id-like field of a retrieval row as a string.

    Fields are tried in the order ``feedbackId``, ``sourceFeedbackId``,
    ``queryFeedbackId``, ``id``. ``None`` is returned when all are missing
    or falsy.
    """
    id_fields = ("feedbackId", "sourceFeedbackId", "queryFeedbackId", "id")
    found = next((row.get(field) for field in id_fields if row.get(field)), None)
    return None if found is None else str(found)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def unavailable_retrieval_metrics() -> Dict[str, Any]:
    """Return the placeholder retrieval-metrics dict used when there are no retrieval rows."""
    metrics: Dict[str, Any] = {"available": False}
    metrics.update(rows=0, queries=0)
    # Remaining fields have no value yet, so they are set to None.
    metrics.update(dict.fromkeys(("averageTopScore", "negativeNeighborRate", "error")))
    return metrics
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def bucket_retrieval_rows(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group retrieval rows by the feedback id each row resolves to.

    Rows for which ``feedback_id_for_retrieval`` yields nothing are collected
    under the literal key "unknown". Row order inside each group follows the
    input order. The returned mapping is a ``defaultdict(list)``.
    """
    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for record in retrieval_rows:
        owner = feedback_id_for_retrieval(record)
        if not owner:
            owner = "unknown"
        grouped[owner].append(record)
    return grouped
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def top_retrieval_scores(by_feedback: Dict[str, List[Dict[str, Any]]]) -> List[float]:
    """Collect the best score of every row group that has at least one score.

    For each group, ``retrieval_score`` is applied to every row. Groups where
    no row produces a score contribute nothing to the result.
    """
    best_per_group: List[float] = []
    for group in by_feedback.values():
        scored = list(filter(lambda value: value is not None, map(retrieval_score, group)))
        if not scored:
            continue
        best_per_group.append(max(scored))
    return best_per_group
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def retrieval_neighbor_summary(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count how many retrieval rows carry a signal and how many are negative.

    The raw signal is the first truthy value among "matchedSignal",
    "neighborSignal" and "signal". It is passed through ``normalize_signal``
    wrapped as ``{"signal": raw}``. Rows whose normalized signal is falsy are
    ignored.

    Returns:
        ``{"labeled": <rows with a signal>, "negative": <rows normalized to "negative">}``
    """
    labeled_total = 0
    negative_total = 0
    for record in retrieval_rows:
        raw_signal = (
            record.get("matchedSignal")
            or record.get("neighborSignal")
            or record.get("signal")
        )
        verdict = normalize_signal({"signal": raw_signal})
        if verdict:
            labeled_total += 1
            if verdict == "negative":
                negative_total += 1
    return {"labeled": labeled_total, "negative": negative_total}
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def compute_retrieval_metrics(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize an export of retrieval rows into report metrics.

    With no rows, the "unavailable" placeholder is returned. Otherwise rows
    are grouped per feedback id, the best score of each group is averaged
    (rounded to 4 places), and the share of negative neighbors among labeled
    neighbors is computed with ``rate``. The average and the rate are None
    when there is nothing to base them on.
    """
    if not retrieval_rows:
        return unavailable_retrieval_metrics()

    grouped = bucket_retrieval_rows(retrieval_rows)
    best_scores = top_retrieval_scores(grouped)
    neighbors = retrieval_neighbor_summary(retrieval_rows)
    labeled_neighbors = neighbors["labeled"]

    average_top = None
    if best_scores:
        average_top = round(sum(best_scores) / len(best_scores), 4)

    negative_rate = None
    if labeled_neighbors:
        negative_rate = rate(neighbors["negative"], labeled_neighbors)

    return {
        "available": True,
        "rows": len(retrieval_rows),
        "queries": len(grouped),
        "averageTopScore": average_top,
        "negativeNeighborRate": negative_rate,
        "error": None,
    }
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
# Lookup from an (expected, actual) gate label pair to the name of the counter
# that pair increments. Insertion order: TP, TN, FP, FN.
GATE_OUTCOME_KEYS = {
    (expected, actual): counter
    for expected, actual, counter in (
        ("harmful", "blocked", "truePositiveBlocks"),
        ("safe", "allowed", "trueNegativeAllows"),
        ("safe", "blocked", "falsePositiveBlocks"),
        ("harmful", "allowed", "falseNegativeAllows"),
    )
}
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def initial_gate_counts() -> Dict[str, int]:
    """Return a new counter dict with every gate outcome bucket set to zero.

    The buckets are the four confusion-matrix counters plus
    "unlabeledFeedback".
    """
    bucket_names = (
        "truePositiveBlocks",
        "trueNegativeAllows",
        "falsePositiveBlocks",
        "falseNegativeAllows",
        "unlabeledFeedback",
    )
    return dict.fromkeys(bucket_names, 0)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def count_gate_outcomes(entries: List[Dict[str, Any]]) -> Dict[str, int]:
    """Tally gate outcomes over feedback entries.

    ``explicit_gate_label`` supplies an ``(expected, actual)`` pair per entry.
    Entries with no expected label are skipped entirely. Entries with an
    expected label but no actual decision increment "unlabeledFeedback". All
    other pairs are looked up in ``GATE_OUTCOME_KEYS``, and pairs missing from
    that table are not counted.
    """
    tally = initial_gate_counts()

    for item in entries:
        expected, actual = explicit_gate_label(item)
        if expected is None:
            continue

        if actual is None:
            bucket = "unlabeledFeedback"
        else:
            bucket = GATE_OUTCOME_KEYS.get((expected, actual))

        if bucket:
            tally[bucket] += 1

    return tally
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def compute_f1(precision: Optional[float], recall: Optional[float], labeled: int) -> Optional[float]:
    """Combine precision and recall into an F1 score rounded to 4 places.

    Returns None when ``labeled`` is zero. Returns 0.0 when there are labeled
    decisions but precision or recall is missing or zero. Otherwise returns
    the harmonic mean of the two.
    """
    if labeled and precision and recall:
        harmonic_mean = (2 * precision * recall) / (precision + recall)
        return round(harmonic_mean, 4)
    return 0.0 if labeled else None
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def compute_gate_metrics(entries: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Derive precision, recall and F1 for gate decisions from feedback entries.

    Outcome counts come from ``count_gate_outcomes``. When no entry carries a
    full expected/actual label pair, precision, recall and F1 are None,
    "available" is False and "note" explains what is missing.
    """
    tally = count_gate_outcomes(entries)
    tp, tn, fp, fn = (
        tally[name]
        for name in (
            "truePositiveBlocks",
            "trueNegativeAllows",
            "falsePositiveBlocks",
            "falseNegativeAllows",
        )
    )
    labeled = tp + tn + fp + fn

    if labeled:
        precision = rate(tp, tp + fp)
        recall = rate(tp, tp + fn)
        note = None
    else:
        precision = None
        recall = None
        note = "No explicit gate decision labels found; feedback quality metrics are available, but classifier precision/recall needs blocked/allowed labels."

    f1 = compute_f1(precision, recall, labeled)

    return {
        "available": labeled > 0,
        "labeledDecisions": labeled,
        "unlabeledFeedback": tally["unlabeledFeedback"],
        "truePositiveBlocks": tp,
        "trueNegativeAllows": tn,
        "falsePositiveBlocks": fp,
        "falseNegativeAllows": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "note": note,
    }
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def base_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend data-collection steps when the report lacks basic evidence.

    Adds one message when fewer than 10 usable entries exist, and another
    when gate metrics are not available.
    """
    too_few_entries = report["usableEntries"] < 10
    gate_metrics_missing = not report["gateMetrics"]["available"]

    advice: List[str] = []
    if too_few_entries:
        advice.append("Collect at least 10 usable feedback entries before making threshold changes.")
    if gate_metrics_missing:
        advice.append("Start logging gate decisions as blocked/allowed so precision, recall, and false-positive rate can be computed.")
    return advice
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def storage_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend follow-ups based on SQLite lesson and retrieval metrics.

    Args:
        report: The evaluation report. "sqliteLessonMetrics" and
            "retrievalMetrics" are optional and may be None.

    Returns:
        Zero, one or two messages:
        * a backfill hint when SQLite metrics are available and negative
          lesson coverage is below 0.8 (a missing coverage key counts as 0);
        * an inspection hint when retrieval metrics are available and the
          negative neighbor rate is at least 0.5.
        Coverage or rate values that are None are treated as "not computed"
        and produce no message.
    """
    items: List[str] = []

    sqlite_metrics = report.get("sqliteLessonMetrics") or {}
    if sqlite_metrics.get("available"):
        coverage = sqlite_metrics.get("negativeLessonCoverage", 0)
        # Guard against an explicit None (or other non-number): comparing it
        # with a float would raise TypeError and abort the whole report.
        if isinstance(coverage, (int, float)) and coverage < 0.8:
            items.append("Backfill SQLite lesson rows for negative feedback before treating SQL dashboards as complete eval evidence.")

    retrieval_metrics = report.get("retrievalMetrics") or {}
    if retrieval_metrics.get("available"):
        neighbor_rate = retrieval_metrics.get("negativeNeighborRate")
        if neighbor_rate is not None and neighbor_rate >= 0.5:
            items.append("Inspect LanceDB retrieval neighborhoods: most labeled neighbors are negative, which is a good candidate for repeated-failure clustering.")

    return items
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def category_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend tightening rules for the first weak category in the report.

    A category is weak when its support reaches ``report["minSupport"]`` and
    its negative rate is at least 0.5. Only the first weak row, in the order
    given by ``report["categoryMetrics"]``, is mentioned.
    """
    flagged = []
    for row in report["categoryMetrics"]:
        if not (row["support"] >= report["minSupport"]):
            continue
        if row["negativeRate"] >= 0.5:
            flagged.append(row)

    if not flagged:
        return []

    worst = flagged[0]
    return [
        f"Tighten prevention rules for {worst['category']}: {worst['negative']} negative signals across {worst['support']} entries."
    ]
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def tag_recommendations(report: Dict[str, Any]) -> List[str]:
    """Flag the first tag whose feedback is too mixed to act on.

    A tag is mixed when its support reaches ``report["minSupport"]`` and its
    positive rate lies in the inclusive range 0.35 to 0.65. Only the first
    such row in ``report["tagMetrics"]`` is reported.
    """
    mixed = []
    for row in report["tagMetrics"]:
        enough_support = row["support"] >= report["minSupport"]
        if enough_support and 0.35 <= row["positiveRate"] <= 0.65:
            mixed.append(row)

    if mixed:
        return [
            f"Review mixed-signal tag '{mixed[0]['tag']}' before promoting broad rules; signal is not separable yet."
        ]
    return []
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def build_recommendations(report: Dict[str, Any]) -> List[str]:
    """Assemble the report's recommendation list.

    Concatenates the base, storage, category and tag recommendations in that
    order. When none of them produce anything, a single "no action required"
    message is returned instead.
    """
    collected = [
        *base_recommendations(report),
        *storage_recommendations(report),
        *category_recommendations(report),
        *tag_recommendations(report),
    ]
    if collected:
        return collected
    return ["No immediate eval action required; keep collecting feedback and rerun this report after the next batch."]
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def evaluate_feedback(
    entries: List[Dict[str, Any]],
    invalid_entries: int = 0,
    min_support: int = 2,
    sqlite_lessons: Optional[Dict[str, Any]] = None,
    retrieval_rows: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Build the full feedback-quality report.

    Only entries whose normalized signal is "positive" or "negative" are
    usable. For each usable entry the first category from ``classify_entry``,
    the first tag (or "untagged"), the failure type (or "unspecified") and
    the parsed timestamp (when present) are recorded.

    Args:
        entries: Feedback records.
        invalid_entries: Count of unparsable records, copied into the report.
        min_support: Minimum bucket support, passed to ``summarize_bucket``
            and stored in the report.
        sqlite_lessons: Lesson summary; a falsy value is replaced by an
            "unavailable" stub.
        retrieval_rows: Retrieval rows; a falsy value is treated as empty.

    Returns:
        The report dict, including a "recommendations" list derived from it.
    """
    usable: List[Dict[str, Any]] = []
    signals: List[str] = []
    category_values = []
    tag_values = []
    failure_values = []
    timestamps = []

    for record in entries:
        verdict = normalize_signal(record)
        if verdict not in {"positive", "negative"}:
            continue

        usable.append(record)
        signals.append(verdict)
        category_values.append(classify_entry(record)[0])

        # Only the first tag of a list-valued "tags" field is bucketed.
        raw_tags = record.get("tags")
        first_tag = "untagged"
        if isinstance(raw_tags, list) and raw_tags:
            first_tag = str(raw_tags[0]).strip().lower()
        tag_values.append(first_tag)

        failure_kind = record.get("failureType") or "unspecified"
        failure_values.append(str(failure_kind).strip().lower())

        moment = parse_timestamp(record.get("timestamp"))
        if moment:
            timestamps.append(moment)

    usable_count = len(usable)
    positive = signals.count("positive")
    negative = signals.count("negative")
    lesson_source = sqlite_lessons or {"available": False, "error": None}

    report = {
        "generatedAt": datetime.now(timezone.utc).isoformat(),
        "minSupport": min_support,
        "totalEntries": len(entries),
        "usableEntries": usable_count,
        "invalidEntries": invalid_entries,
        "positive": positive,
        "negative": negative,
        "positiveRate": rate(positive, usable_count),
        "negativeRate": rate(negative, usable_count),
        "firstTimestamp": min(timestamps).isoformat() if timestamps else None,
        "lastTimestamp": max(timestamps).isoformat() if timestamps else None,
        "categoryMetrics": summarize_bucket("category", category_values, signals, min_support),
        "tagMetrics": summarize_bucket("tag", tag_values, signals, min_support),
        "failureTypeMetrics": summarize_bucket("failureType", failure_values, signals, min_support),
        "gateMetrics": compute_gate_metrics(usable),
        "sqliteLessonMetrics": compute_sqlite_metrics(usable, lesson_source),
        "retrievalMetrics": compute_retrieval_metrics(retrieval_rows or []),
    }
    report["recommendations"] = build_recommendations(report)
    return report
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def render_markdown(report: Dict[str, Any]) -> str:
    """Render the evaluation report as a Markdown document.

    Sections, in order: header summary, gate metrics, highest-risk categories
    (at most the first 8 rows), SQLite lesson coverage, LanceDB retrieval
    export, recommendations. Sections whose metrics are unavailable print a
    short explanatory bullet instead. The text ends with a newline.
    """
    doc: List[str] = []
    emit = doc.append

    # Header summary.
    emit("# Feedback Quality Eval")
    emit("")
    emit(f"- Generated: {report['generatedAt']}")
    emit(f"- Usable feedback: {report['usableEntries']} / {report['totalEntries']}")
    emit(f"- Positive rate: {report['positiveRate']}")
    emit(f"- Negative rate: {report['negativeRate']}")
    emit("")
    emit("## Gate Metrics")
    emit("")

    gate = report["gateMetrics"]
    if not gate["available"]:
        emit(f"- {gate['note']}")
    else:
        doc += [
            f"- Labeled decisions: {gate['labeledDecisions']}",
            f"- Precision: {gate['precision']}",
            f"- Recall: {gate['recall']}",
            f"- F1: {gate['f1']}",
            f"- False positive blocks: {gate['falsePositiveBlocks']}",
            f"- False negative allows: {gate['falseNegativeAllows']}",
        ]

    doc += ["", "## Highest-Risk Categories", ""]
    if not report["categoryMetrics"]:
        emit("- Not enough category support yet.")
    else:
        emit("| Category | Support | Positive | Negative | Negative rate |")
        emit("| --- | ---: | ---: | ---: | ---: |")
        for row in report["categoryMetrics"][:8]:
            cells = (row["category"], row["support"], row["positive"], row["negative"], row["negativeRate"])
            emit(f"| {cells[0]} | {cells[1]} | {cells[2]} | {cells[3]} | {cells[4]} |")

    sqlite_metrics = report["sqliteLessonMetrics"]
    doc += ["", "## SQLite Lesson Coverage", ""]
    if not sqlite_metrics["available"]:
        # Append the error text only when one was recorded.
        suffix = ": " + sqlite_metrics["error"] if sqlite_metrics.get("error") else ""
        emit(f"- Not available{suffix}.")
    else:
        doc += [
            f"- Lessons: {sqlite_metrics['totalLessons']}",
            f"- Feedback coverage: {sqlite_metrics['feedbackLessonCoverage']}",
            f"- Negative feedback coverage: {sqlite_metrics['negativeLessonCoverage']}",
        ]

    retrieval_metrics = report["retrievalMetrics"]
    doc += ["", "## LanceDB Retrieval Export", ""]
    if not retrieval_metrics["available"]:
        emit("- Not available. Export retrieval rows to JSONL to evaluate semantic recall quality.")
    else:
        doc += [
            f"- Rows: {retrieval_metrics['rows']}",
            f"- Queries: {retrieval_metrics['queries']}",
            f"- Average top score: {retrieval_metrics['averageTopScore']}",
            f"- Negative neighbor rate: {retrieval_metrics['negativeNeighborRate']}",
        ]

    doc += ["", "## Recommendations", ""]
    for item in report["recommendations"]:
        emit(f"- {item}")
    emit("")
    return "\n".join(doc)
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the feedback evaluation script.

    Options: --feedback-log, --feedback-dir, --lesson-db, --retrieval-log,
    --min-support (int, default 2), --json (flag) and --write-report.
    """
    cli = argparse.ArgumentParser(description="Evaluate ThumbGate feedback quality from feedback-log.jsonl.")
    option_specs = (
        ("--feedback-log", {"help": "Path to feedback-log.jsonl. Defaults to the resolved ThumbGate feedback dir."}),
        ("--feedback-dir", {"help": "Directory containing feedback-log.jsonl."}),
        ("--lesson-db", {"help": "Path to lessons.sqlite for SQL lesson coverage metrics."}),
        ("--retrieval-log", {"help": "JSONL export of LanceDB retrieval rows for semantic recall metrics."}),
        ("--min-support", {"type": int, "default": 2, "help": "Minimum bucket support for category/tag metrics."}),
        ("--json", {"action": "store_true", "help": "Print JSON instead of Markdown."}),
        ("--write-report", {"help": "Write the rendered report to a file."}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def main() -> int:
    """Run the evaluation from the command line.

    Resolves the feedback log (explicit path, else ``feedback-log.jsonl``
    inside the given or resolved feedback dir), loads the optional retrieval
    export and lesson database, builds the report, and prints it as JSON or
    Markdown. With --write-report the same text is also written to that file,
    with a trailing newline ensured.

    Returns:
        0, used as the process exit code.
    """
    args = parse_args()

    if args.feedback_log:
        feedback_log = Path(args.feedback_log)
    else:
        base_dir = Path(args.feedback_dir) if args.feedback_dir else resolve_feedback_dir()
        feedback_log = base_dir / "feedback-log.jsonl"

    entries, invalid = read_jsonl(feedback_log)
    lesson_db = Path(args.lesson_db) if args.lesson_db else None
    retrieval_log = Path(args.retrieval_log) if args.retrieval_log else None

    if retrieval_log:
        retrieval_rows, retrieval_invalid = read_jsonl(retrieval_log)
    else:
        retrieval_rows, retrieval_invalid = [], 0

    sqlite_lessons = load_sqlite_lessons(lesson_db)
    report = evaluate_feedback(
        entries,
        invalid_entries=invalid,
        # A minimum support below 1 is raised to 1.
        min_support=max(args.min_support, 1),
        sqlite_lessons=sqlite_lessons,
        retrieval_rows=retrieval_rows,
    )
    report.update(
        feedbackLog=str(feedback_log),
        lessonDb=str(lesson_db) if lesson_db else None,
        retrievalLog=str(retrieval_log) if retrieval_log else None,
        invalidRetrievalRows=retrieval_invalid,
    )

    if args.json:
        rendered = json.dumps(report, indent=2, sort_keys=True)
    else:
        rendered = render_markdown(report)

    if args.write_report:
        target = Path(args.write_report)
        target.parent.mkdir(parents=True, exist_ok=True)
        trailer = "" if rendered.endswith("\n") else "\n"
        target.write_text(rendered + trailer, encoding="utf-8")

    print(rendered)
    return 0
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
package/src/api/server.js
CHANGED
|
@@ -403,6 +403,27 @@ const TRACKED_LINK_TARGETS = Object.freeze({
|
|
|
403
403
|
},
|
|
404
404
|
allowCustomerEmail: true,
|
|
405
405
|
},
|
|
406
|
+
// 2026-05-12: Aiventyx marketplace listing routes its Teams clicks through
|
|
407
|
+
// /go/teams (best-performing listing at ~62% CTR). Without this slug the
|
|
408
|
+
// server returned 404 + "Tracked link not found". Every Aiventyx Teams
|
|
409
|
+
// click between the URL swap and this deploy landed on that error page.
|
|
410
|
+
// Destination: 3-seat Team self-serve Stripe checkout (the path I shipped
|
|
411
|
+
// in PR #1877 — plan_id=team + seat_count=3 = $147/mo entry).
|
|
412
|
+
teams: {
|
|
413
|
+
path: '/checkout/pro',
|
|
414
|
+
ctaId: 'go_teams',
|
|
415
|
+
ctaPlacement: 'link_router',
|
|
416
|
+
eventType: 'cta_click',
|
|
417
|
+
defaults: {
|
|
418
|
+
utm_source: 'website',
|
|
419
|
+
utm_medium: 'link_router',
|
|
420
|
+
utm_campaign: 'team_self_serve',
|
|
421
|
+
plan_id: 'team',
|
|
422
|
+
seat_count: '3',
|
|
423
|
+
billing_cycle: 'monthly',
|
|
424
|
+
},
|
|
425
|
+
allowCustomerEmail: true,
|
|
426
|
+
},
|
|
406
427
|
install: {
|
|
407
428
|
path: '/guide',
|
|
408
429
|
ctaId: 'go_install',
|
|
@@ -1511,7 +1532,7 @@ function renderCheckoutIntentPage({
|
|
|
1511
1532
|
const teardownAction = safeWorkflowTeardownCheckoutHref
|
|
1512
1533
|
? `<a data-i="workflow_teardown_checkout" href="${safeWorkflowTeardownCheckoutHref}">Pay $99 teardown</a>`
|
|
1513
1534
|
: '';
|
|
1514
|
-
return `<!doctype html><html lang="en"><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"><style>body{background:#0a0a0a;color:#eee;font-family:system-ui,sans-serif}
|
|
1535
|
+
return `<!doctype html><html lang="en"><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"><title>Confirm โ ThumbGate Pro</title><style>body{background:#0a0a0a;color:#eee;font-family:system-ui,-apple-system,sans-serif;line-height:1.5}main{max-width:520px;margin:8vh auto;padding:0 20px}.brand{display:flex;align-items:center;gap:10px;margin-bottom:24px;font-size:14px;color:#94a3b8}.brand-mark{width:24px;height:24px;background:#22d3ee;border-radius:6px;display:inline-block}h1{font-size:24px;margin:0 0 8px;color:#fff}.price{font-size:32px;font-weight:700;color:#22d3ee;margin:8px 0 4px}.price small{font-size:14px;color:#94a3b8;font-weight:400}p{color:#cbd5e1;margin:8px 0}a{display:block;text-decoration:none}a.primary{background:#22d3ee;color:#000;padding:16px;text-align:center;border-radius:8px;font-weight:700;font-size:16px;margin:20px 0 10px}a.secondary{border:1px solid #374151;color:#cbd5e1;padding:12px;text-align:center;border-radius:8px;margin:8px 0;font-size:14px}.trust{margin:24px 0;padding:16px;border:1px solid #1f2937;border-radius:8px;background:#0f172a}.trust-item{font-size:13px;color:#cbd5e1;padding:4px 0;display:flex;gap:8px}.trust-item::before{content:"โ";color:#22d3ee;font-weight:700}details{margin-top:32px;font-size:13px;color:#94a3b8}details summary{cursor:pointer;padding:8px 0}details a{border:1px solid #374151;color:#94a3b8;padding:10px;text-align:center;border-radius:6px;margin:6px 0;font-size:13px}.back{text-align:center;color:#64748b;font-size:12px;margin-top:24px}.back a{color:#64748b;display:inline}</style><main><div class="brand"><span class="brand-mark"></span><span>ThumbGate</span></div><h1>Start ThumbGate Pro</h1><div class="price">$19<small>/mo</small></div><p>Block every repeat AI-agent mistake. Local-first. MIT-licensed CLI included. 
Cancel anytime.</p><a class="primary" data-i="pro_checkout_confirmed" href="${safeConfirmHref}">Pay $19/mo with Stripe โ</a><div class="trust"><div class="trust-item">6 paying customers, 18,000+ installs verified on npm</div><div class="trust-item">Cancel anytime โ instant refund within 7 days</div><div class="trust-item">MIT open source ยท no vendor lock-in</div><div class="trust-item">Works with Claude Code, Cursor, Codex, Gemini, Amp, Cline, OpenCode</div></div><details><summary>Other paid paths (diagnostic, sprint, teardown, single-rule)</summary>${diagnosticAction.replace('<a ', '<a class="secondary" ')}${sprintAction.replace('<a ', '<a class="secondary" ')}${teardownAction.replace('<a ', '<a class="secondary" ')}${quickReadAction.replace('<a ', '<a class="secondary" ')}${firstRuleAction.replace('<a ', '<a class="secondary" ')}<a class="secondary" data-i="workflow_sprint_intake" href="${safeWorkflowIntakeHref}">Send workflow first (intake)</a><a class="secondary" data-i="team_paid_path" href="${safeTeamOptionsHref}">See all options</a></details><p class="back"><a href="/">โ Back to thumbgate.ai</a></p></main><script>addEventListener('click',e=>{let a=e.target.closest('[data-i]');if(a&&navigator.sendBeacon)navigator.sendBeacon('/v1/telemetry/ping',new Blob([JSON.stringify({eventType:'checkout_interstitial_cta_clicked',clientType:'web',page:'/checkout/pro',ctaId:a.dataset.i,ctaPlacement:'checkout_interstitial'})],{type:'application/json'}))})</script></html>`;
|
|
1515
1536
|
}
|
|
1516
1537
|
|
|
1517
1538
|
function buildCheckoutBootstrapBody(parsed, req, journeyState = resolveJourneyState(req, parsed)) {
|