thumbgate 1.16.22 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/llms.txt +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +11 -5
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/config/github-about.json +1 -1
- package/package.json +10 -5
- package/public/blog.html +18 -19
- package/public/compare.html +2 -2
- package/public/guide.html +1 -1
- package/public/index.html +166 -419
- package/public/numbers.html +2 -2
- package/scripts/auto-promote-gates.js +4 -1
- package/scripts/billing.js +62 -3
- package/scripts/feedback-to-rules.js +11 -1
- package/scripts/feedback_quality_eval.py +725 -0
- package/scripts/rate-limiter.js +15 -15
- package/src/api/server.js +91 -19
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Offline feedback quality evaluation for ThumbGate.
|
|
4
|
+
|
|
5
|
+
This is intentionally stdlib-only. It turns feedback-log.jsonl into a small
|
|
6
|
+
quality report that answers: where are repeated failures clustering, how stable
|
|
7
|
+
is the signal, and do we have enough labeled gate decisions to compute true
|
|
8
|
+
precision/recall yet?
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import math
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sqlite3
|
|
17
|
+
from collections import Counter, defaultdict
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Directory two levels above this file.
PROJECT_ROOT = Path(__file__).parent.parent

# Category name -> hints used to classify a feedback entry.
#   keywords: matched against the entry's free-text fields and tags
#   tools:    matched against the entry's tool name
DEFAULT_CATEGORIES = {
    "code_edit": dict(
        keywords=["edit", "write", "implement", "refactor", "fix", "update", "create file"],
        tools=["edit", "write", "multiedit"],
    ),
    "git": dict(
        keywords=["commit", "push", "branch", "merge", "pr", "pull request", "rebase", "cherry-pick"],
        tools=["bash", "git"],
    ),
    "testing": dict(
        keywords=["test", "jest", "coverage", "verify", "verification", "spec", "mock", "assert"],
        tools=[],
    ),
    "review": dict(
        keywords=["review", "pr comment", "resolve", "thread", "feedback"],
        tools=[],
    ),
    "search": dict(
        keywords=["search", "find", "grep", "glob", "explore", "where is", "look for", "rg"],
        tools=["grep", "glob", "read", "rg"],
    ),
    "security": dict(
        keywords=["security", "secret", "credential", "token", "auth", "injection", "xss"],
        tools=[],
    ),
    "debugging": dict(
        keywords=["debug", "error", "crash", "stack trace", "log", "diagnose", "investigate"],
        tools=[],
    ),
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def resolve_feedback_dir() -> Path:
    """Pick the directory expected to hold the feedback log.

    Precedence: a non-empty ``THUMBGATE_FEEDBACK_DIR`` environment variable,
    then ``<project root>/.thumbgate``, then
    ``<project root>/.claude/memory/feedback``, and finally
    ``~/.thumbgate/projects/<project root name>``. Only the two project-local
    candidates are checked for existence.
    """
    override = os.environ.get("THUMBGATE_FEEDBACK_DIR")
    if override:
        return Path(override)

    project_local = (
        PROJECT_ROOT / ".thumbgate",
        PROJECT_ROOT / ".claude" / "memory" / "feedback",
    )
    for candidate in project_local:
        if candidate.exists():
            return candidate

    return Path.home() / ".thumbgate" / "projects" / PROJECT_ROOT.name
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def read_jsonl(path: Path) -> Tuple[List[Dict[str, Any]], int]:
    """Load the JSON objects stored one per line in *path*.

    Returns ``(rows, invalid)``. ``rows`` holds every line that parsed to a
    JSON object; ``invalid`` counts the non-blank lines that failed to parse
    or parsed to something other than an object. A path that does not exist
    yields ``([], 0)``.
    """
    records: List[Dict[str, Any]] = []
    rejected = 0
    if not path.exists():
        return records, rejected

    with path.open("r", encoding="utf-8") as stream:
        for text in (raw.strip() for raw in stream):
            if not text:
                continue
            try:
                candidate = json.loads(text)
            except json.JSONDecodeError:
                # Treated like any other non-object line below.
                candidate = None
            if isinstance(candidate, dict):
                records.append(candidate)
            else:
                rejected += 1
    return records, rejected
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _lessons_unavailable(path: Optional[str], error: Optional[str]) -> Dict[str, Any]:
    """Build the payload returned when lesson rows could not be loaded."""
    return {
        "available": False,
        "path": path,
        "totalLessons": 0,
        "bySignal": {},
        "byDomain": {},
        "sourceFeedbackIds": [],
        "error": error,
    }


def load_sqlite_lessons(db_path: Optional[Path]) -> Dict[str, Any]:
    """Summarize the un-pruned rows of the ``lessons`` table in *db_path*.

    The database is opened read-only. The result always carries the keys
    ``available``, ``path``, ``totalLessons``, ``bySignal``, ``byDomain``,
    ``sourceFeedbackIds`` and ``error``. ``available`` is ``False`` (with
    zeroed metrics) when no path was given, the file does not exist, the
    ``lessons`` table is missing, or SQLite raised an error; ``error`` then
    holds the reason (``None`` when no path was given).

    Args:
        db_path: Location of the SQLite lesson database, or ``None``.

    Returns:
        The summary dict described above. ``bySignal`` / ``byDomain`` map each
        value (``"unknown"`` for empty ones) to its row count, sorted by key;
        ``sourceFeedbackIds`` is the sorted set of non-empty source ids.
    """
    if not db_path:
        return _lessons_unavailable(None, None)
    if not db_path.exists():
        return _lessons_unavailable(str(db_path), "SQLite lesson DB does not exist.")

    try:
        # Path.as_uri() percent-encodes characters such as '?', '#', '%' and
        # spaces; interpolating the raw path would let them be parsed as URI
        # query/fragment delimiters and open (or create) a different file.
        read_only_uri = f"{db_path.resolve().as_uri()}?mode=ro"
        connection = sqlite3.connect(read_only_uri, uri=True)
        connection.row_factory = sqlite3.Row
        try:
            table_exists = connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='lessons'"
            ).fetchone()
            if not table_exists:
                return _lessons_unavailable(
                    str(db_path), "SQLite DB does not contain a lessons table."
                )

            rows = connection.execute(
                "SELECT id, signal, domain, sourceFeedbackId FROM lessons WHERE pruned = 0"
            ).fetchall()
        finally:
            connection.close()
    except sqlite3.Error as exc:
        return _lessons_unavailable(str(db_path), str(exc))

    by_signal = Counter(str(row["signal"] or "unknown") for row in rows)
    by_domain = Counter(str(row["domain"] or "unknown") for row in rows)
    source_ids = sorted({
        str(row["sourceFeedbackId"])
        for row in rows
        if row["sourceFeedbackId"]
    })
    return {
        "available": True,
        "path": str(db_path),
        "totalLessons": len(rows),
        "bySignal": dict(sorted(by_signal.items())),
        "byDomain": dict(sorted(by_domain.items())),
        "sourceFeedbackIds": source_ids,
        "error": None,
    }
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def normalize_signal(entry: Dict[str, Any]) -> Optional[str]:
    """Map an entry onto ``"positive"``, ``"negative"`` or ``None``.

    The ``signal`` field (falling back to ``feedback``) is compared
    case-insensitively, after trimming, against the known spellings. When it
    matches none of them, the sign of a numeric ``reward`` decides; a zero,
    missing or non-numeric reward yields ``None``.
    """
    label = str(entry.get("signal") or entry.get("feedback") or "").strip().lower()
    spellings = {
        "positive": {"positive", "up", "thumbsup", "thumbs_up", "👍"},
        "negative": {"negative", "down", "thumbsdown", "thumbs_down", "👎"},
    }
    for canonical, accepted in spellings.items():
        if label in accepted:
            return canonical

    reward = entry.get("reward")
    if not isinstance(reward, (int, float)):
        return None
    if reward > 0:
        return "positive"
    return "negative" if reward < 0 else None
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def normalize_text(*values: Any) -> str:
    """Flatten *values* into one lower-cased, space-separated string.

    ``None`` values are dropped, list members are stringified individually,
    dicts are serialized as key-sorted JSON, and anything else goes through
    ``str``.
    """
    fragments: List[str] = []
    for item in values:
        if item is None:
            continue
        if isinstance(item, list):
            fragments += [str(member) for member in item]
        elif isinstance(item, dict):
            fragments.append(json.dumps(item, sort_keys=True))
        else:
            fragments.append(str(item))
    return " ".join(fragments).lower()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def contains_keyword(text: str, keyword: str) -> bool:
    """Report whether *keyword* occurs in *text*.

    The keyword is lower-cased and trimmed first; an empty keyword never
    matches. Keywords of at most three characters, or made only of
    ``[a-z0-9_+-]``, must not be adjacent to another character of that class.
    Any other keyword is matched as a plain substring. *text* is used as
    given.
    """
    needle = keyword.lower().strip()
    if not needle:
        return False

    token_like = len(needle) <= 3 or re.fullmatch(r"[a-z0-9_+-]+", needle)
    if not token_like:
        return needle in text

    bounded = rf"(?<![a-z0-9_+-]){re.escape(needle)}(?![a-z0-9_+-])"
    return re.search(bounded, text) is not None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def classify_entry(entry: Dict[str, Any]) -> List[str]:
    """Return the categories of ``DEFAULT_CATEGORIES`` that *entry* matches.

    A category matches when one of its keywords occurs in the entry's text
    fields or tags, or one of its tools occurs in the entry's tool name. If
    nothing matches, a non-empty string at ``richContext.domain`` is used;
    failing that the result is ``["uncategorized"]``.
    """
    raw_tags = entry.get("tags")
    tags = raw_tags if isinstance(raw_tags, list) else []
    tool = entry.get("toolName") or entry.get("tool_name") or entry.get("last_tool")

    text_fields = (
        "context",
        "whatWentWrong",
        "whatToChange",
        "whatWorked",
        "actionReason",
        "failureType",
    )
    free_text = normalize_text(*(entry.get(field) for field in text_fields), tags)
    tool_text = normalize_text(tool)

    categories = [
        name
        for name, hints in DEFAULT_CATEGORIES.items()
        if any(contains_keyword(free_text, word) for word in hints["keywords"])
        or any(contains_keyword(tool_text, tool_name) for tool_name in hints["tools"])
    ]
    if categories:
        return categories

    rich_context = entry.get("richContext")
    domain = rich_context.get("domain") if isinstance(rich_context, dict) else None
    if isinstance(domain, str) and domain:
        return [domain]
    return ["uncategorized"]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def parse_timestamp(value: Any) -> Optional[datetime]:
    """Parse an ISO-8601 string into a timezone-aware ``datetime``.

    Every ``Z`` is rewritten to ``+00:00`` before parsing, and a result
    without an offset is tagged as UTC. Non-strings, empty strings and
    unparseable text yield ``None``.
    """
    if not (isinstance(value, str) and value):
        return None
    try:
        moment = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
    if moment.tzinfo is not None:
        return moment
    return moment.replace(tzinfo=timezone.utc)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def rate(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator`` rounded to 4 places, or 0.0 when the denominator is falsy."""
    if not denominator:
        return 0.0
    return round(numerator / denominator, 4)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def wilson_lower_bound(positive: int, total: int, z: float = 1.96) -> float:
    """Return the lower end of the Wilson score interval, rounded to 4 places.

    Args:
        positive: Number of positive observations.
        total: Number of observations; a value <= 0 yields 0.0.
        z: Normal quantile for the interval (default 1.96).
    """
    if total <= 0:
        return 0.0

    proportion = positive / total
    scale = 1 + z * z / total
    midpoint = proportion + z * z / (2 * total)
    margin = z * math.sqrt(
        (proportion * (1 - proportion) + z * z / (4 * total)) / total
    )
    return round((midpoint - margin) / scale, 4)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def summarize_bucket(name: str, values: Iterable[str], signals: List[str], min_support: int) -> List[Dict[str, Any]]:
    """Aggregate positive/negative counts per bucket value.

    *values* and *signals* are consumed pairwise. Buckets whose
    positive + negative count is below *min_support* are dropped. Each
    remaining row stores the bucket value under the key *name* alongside
    support, counts, rates and the Wilson lower bound of the positive rate.
    Rows are ordered by descending negative rate, then descending support,
    then bucket value.
    """
    tallies: Dict[str, Counter] = defaultdict(Counter)
    for key, signal in zip(values, signals):
        tallies[key][signal] += 1

    summaries = []
    for key, tally in tallies.items():
        ups, downs = tally["positive"], tally["negative"]
        support = ups + downs
        if support >= min_support:
            summaries.append({
                name: key,
                "support": support,
                "positive": ups,
                "negative": downs,
                "positiveRate": rate(ups, support),
                "negativeRate": rate(downs, support),
                "wilsonPositiveLower": wilson_lower_bound(ups, support),
            })

    summaries.sort(key=lambda row: (-row["negativeRate"], -row["support"], row[name]))
    return summaries
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def explicit_gate_label(entry: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Return ``(expected, actual)`` labels for an entry.

    expected: ``"harmful"`` for a negative feedback signal, ``"safe"`` for a
        positive one; ``(None, None)`` is returned when there is no signal.
    actual: ``"blocked"`` / ``"allowed"`` read from the gate decision fields,
        or ``None`` when the entry carries no recognizable decision.
    """
    signal = normalize_signal(entry)
    if not signal:
        return None, None
    expected = "harmful" if signal == "negative" else "safe"

    blocked_words = {"block", "blocked", "deny", "denied", "rejected"}
    allowed_words = {"allow", "allowed", "pass", "passed", "accepted"}
    # The first textual field with a recognized verdict wins.
    for field in ("gateDecision", "decision", "outcome", "status"):
        verdict = str(entry.get(field) or "").lower()
        if verdict in blocked_words:
            return expected, "blocked"
        if verdict in allowed_words:
            return expected, "allowed"

    # Boolean flags come next: (field, label when True, label when False).
    flag_fields = (
        ("allowed", "allowed", "blocked"),
        ("blocked", "blocked", "allowed"),
    )
    for field, when_true, when_false in flag_fields:
        flag = entry.get(field)
        if isinstance(flag, bool):
            return expected, when_true if flag else when_false

    if entry.get("actionType") == "no-action":
        return expected, "blocked"
    return expected, None
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def compute_sqlite_metrics(entries: List[Dict[str, Any]], sqlite_lessons: Dict[str, Any]) -> Dict[str, Any]:
    """Measure how much of the feedback is referenced by SQLite lesson rows.

    Coverage is the share of feedback ids (all entries, and negative entries
    only) found among the lessons' ``sourceFeedbackIds``. Entries without a
    truthy ``id`` are ignored. When *sqlite_lessons* is not marked available,
    zeroed metrics are returned together with its ``path`` and ``error``.
    """
    lessons_path = sqlite_lessons.get("path")
    if not sqlite_lessons.get("available"):
        return {
            "available": False,
            "path": lessons_path,
            "totalLessons": 0,
            "feedbackLessonCoverage": 0.0,
            "negativeLessonCoverage": 0.0,
            "bySignal": {},
            "byDomain": {},
            "error": sqlite_lessons.get("error"),
        }

    all_ids = set()
    negative_ids = set()
    for entry in entries:
        entry_id = entry.get("id")
        if not entry_id:
            continue
        all_ids.add(str(entry_id))
        if normalize_signal(entry) == "negative":
            negative_ids.add(str(entry_id))
    covered_ids = set(sqlite_lessons.get("sourceFeedbackIds") or [])

    return {
        "available": True,
        "path": lessons_path,
        "totalLessons": sqlite_lessons.get("totalLessons", 0),
        "feedbackLessonCoverage": rate(len(all_ids & covered_ids), len(all_ids)),
        "negativeLessonCoverage": rate(len(negative_ids & covered_ids), len(negative_ids)),
        "bySignal": sqlite_lessons.get("bySignal") or {},
        "byDomain": sqlite_lessons.get("byDomain") or {},
        "error": None,
    }
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def retrieval_score(row: Dict[str, Any]) -> Optional[float]:
    """Return the first usable score found on a retrieval row.

    The keys ``score``, ``similarity``, ``distanceScore`` and
    ``topSimilarity`` are tried in that order. A value is usable when it
    converts to a finite float; numeric strings are accepted.

    Args:
        row: One retrieval record.

    Returns:
        The score as a float, or ``None`` when no key holds a usable value.
    """
    for key in ("score", "similarity", "distanceScore", "topSimilarity"):
        try:
            score = float(row.get(key))
        except (TypeError, ValueError, OverflowError):
            # Missing, non-numeric, or an int too large for a float.
            continue
        # Check finiteness after conversion so NaN/inf (including the strings
        # "nan"/"inf", and the NaN/Infinity literals json.loads accepts) never
        # reach the caller's max()/average.
        if math.isfinite(score):
            return score
    return None
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def feedback_id_for_retrieval(row: Dict[str, Any]) -> Optional[str]:
    """Return the first truthy id on *row* as a string, or ``None``.

    Keys are tried in the order ``feedbackId``, ``sourceFeedbackId``,
    ``queryFeedbackId``, ``id``.
    """
    id_keys = ("feedbackId", "sourceFeedbackId", "queryFeedbackId", "id")
    candidates = (row.get(key) for key in id_keys)
    return next((str(found) for found in candidates if found), None)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def unavailable_retrieval_metrics() -> Dict[str, Any]:
    """Return the retrieval-metrics payload used when there are no rows to evaluate."""
    payload: Dict[str, Any] = {"available": False, "rows": 0, "queries": 0}
    payload.update(averageTopScore=None, negativeNeighborRate=None, error=None)
    return payload
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def bucket_retrieval_rows(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group retrieval rows by feedback id; rows without an id share the ``"unknown"`` bucket."""
    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for record in retrieval_rows:
        bucket = feedback_id_for_retrieval(record)
        grouped[bucket if bucket else "unknown"].append(record)
    return grouped
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def top_retrieval_scores(by_feedback: Dict[str, List[Dict[str, Any]]]) -> List[float]:
    """Return the highest score of each group, skipping groups with no scored row."""
    best: List[float] = []
    for group in by_feedback.values():
        scored = [value for value in map(retrieval_score, group) if value is not None]
        if scored:
            best.append(max(scored))
    return best
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def retrieval_neighbor_summary(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count rows whose neighbor signal is recognized, and how many of those are negative.

    The neighbor signal is read from ``matchedSignal``, then
    ``neighborSignal``, then ``signal``.
    """
    labeled = 0
    negative = 0
    for record in retrieval_rows:
        raw_signal = (
            record.get("matchedSignal")
            or record.get("neighborSignal")
            or record.get("signal")
        )
        verdict = normalize_signal({"signal": raw_signal})
        if not verdict:
            continue
        labeled += 1
        if verdict == "negative":
            negative += 1
    return {"labeled": labeled, "negative": negative}
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def compute_retrieval_metrics(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize exported retrieval rows.

    Reports the row count, the number of feedback-id groups, the mean of the
    per-group top scores and the share of labeled neighbors that are
    negative. The last two are ``None`` when there is nothing to average or
    no labeled neighbor. An empty input yields the "unavailable" payload.
    """
    if not retrieval_rows:
        return unavailable_retrieval_metrics()

    grouped = bucket_retrieval_rows(retrieval_rows)
    best_scores = top_retrieval_scores(grouped)
    neighbors = retrieval_neighbor_summary(retrieval_rows)

    average_top = None
    if best_scores:
        average_top = round(sum(best_scores) / len(best_scores), 4)

    negative_rate = None
    if neighbors["labeled"]:
        negative_rate = rate(neighbors["negative"], neighbors["labeled"])

    return {
        "available": True,
        "rows": len(retrieval_rows),
        "queries": len(grouped),
        "averageTopScore": average_top,
        "negativeNeighborRate": negative_rate,
        "error": None,
    }
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
# (expected label, actual gate decision) -> name of the confusion-matrix counter.
GATE_OUTCOME_KEYS = {
    ("harmful", "blocked"): "truePositiveBlocks",  # harmful action was stopped
    ("safe", "allowed"): "trueNegativeAllows",  # safe action went through
    ("safe", "blocked"): "falsePositiveBlocks",  # safe action was stopped
    ("harmful", "allowed"): "falseNegativeAllows",  # harmful action went through
}
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def initial_gate_counts() -> Dict[str, int]:
    """Return a fresh counter dict with every gate-outcome bucket set to zero."""
    buckets = (
        "truePositiveBlocks",
        "trueNegativeAllows",
        "falsePositiveBlocks",
        "falseNegativeAllows",
        "unlabeledFeedback",
    )
    return dict.fromkeys(buckets, 0)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def count_gate_outcomes(entries: List[Dict[str, Any]]) -> Dict[str, int]:
    """Tally entries into confusion-matrix buckets.

    Entries without a feedback signal are ignored. Entries with a signal but
    no gate decision are counted as ``unlabeledFeedback``.
    """
    tally = initial_gate_counts()
    for expected, actual in map(explicit_gate_label, entries):
        if expected is None:
            continue
        if actual is None:
            bucket = "unlabeledFeedback"
        else:
            bucket = GATE_OUTCOME_KEYS.get((expected, actual))
        if bucket:
            tally[bucket] += 1
    return tally
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def compute_f1(precision: Optional[float], recall: Optional[float], labeled: int) -> Optional[float]:
    """Return the F1 score rounded to 4 places.

    ``None`` when there are no labeled decisions; 0.0 when precision or
    recall is zero or missing.
    """
    if not labeled:
        return None
    if precision and recall:
        return round((2 * precision * recall) / (precision + recall), 4)
    return 0.0
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def compute_gate_metrics(entries: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Compute precision, recall and F1 of gate decisions from labeled entries.

    "Positive" means the gate blocked. When no entry carries an explicit
    decision, ``precision``, ``recall`` and ``f1`` are ``None``, ``available``
    is ``False`` and ``note`` explains what is missing.
    """
    counts = count_gate_outcomes(entries)
    tp, tn, fp, fn = (
        counts[bucket]
        for bucket in (
            "truePositiveBlocks",
            "trueNegativeAllows",
            "falsePositiveBlocks",
            "falseNegativeAllows",
        )
    )
    labeled = tp + tn + fp + fn

    if labeled:
        precision = rate(tp, tp + fp)
        recall = rate(tp, tp + fn)
        note = None
    else:
        precision = recall = None
        note = "No explicit gate decision labels found; feedback quality metrics are available, but classifier precision/recall needs blocked/allowed labels."

    return {
        "available": labeled > 0,
        "labeledDecisions": labeled,
        "unlabeledFeedback": counts["unlabeledFeedback"],
        "truePositiveBlocks": tp,
        "trueNegativeAllows": tn,
        "falsePositiveBlocks": fp,
        "falseNegativeAllows": fn,
        "precision": precision,
        "recall": recall,
        "f1": compute_f1(precision, recall, labeled),
        "note": note,
    }
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def base_recommendations(report: Dict[str, Any]) -> List[str]:
    """Return data-volume recommendations: too few usable entries, or no gate labels."""
    checks = (
        (
            report["usableEntries"] < 10,
            "Collect at least 10 usable feedback entries before making threshold changes.",
        ),
        (
            not report["gateMetrics"]["available"],
            "Start logging gate decisions as blocked/allowed so precision, recall, and false-positive rate can be computed.",
        ),
    )
    return [message for applies, message in checks if applies]
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def storage_recommendations(report: Dict[str, Any]) -> List[str]:
    """Suggest follow-ups for the SQLite lesson store and the retrieval export.

    Args:
        report: Report dict. Reads ``sqliteLessonMetrics``,
            ``retrievalMetrics`` and, when present, the ``negative`` count.

    Returns:
        Up to two messages: a backfill hint when negative-feedback lesson
        coverage is below 0.8, and an inspection hint when at least half of
        the labeled retrieval neighbors are negative.
    """
    items = []

    sqlite_metrics = report.get("sqliteLessonMetrics") or {}
    # A report with zero negative feedback has nothing to backfill. Without
    # this guard a coverage of 0.0 that only reflects an empty denominator
    # would trigger the hint. Reports lacking a "negative" count keep the
    # previous behaviour.
    negative_total = report.get("negative")
    nothing_to_cover = isinstance(negative_total, (int, float)) and negative_total <= 0
    if (
        sqlite_metrics.get("available")
        and not nothing_to_cover
        and sqlite_metrics.get("negativeLessonCoverage", 0) < 0.8
    ):
        items.append("Backfill SQLite lesson rows for negative feedback before treating SQL dashboards as complete eval evidence.")

    retrieval_metrics = report.get("retrievalMetrics") or {}
    negative_neighbor_rate = retrieval_metrics.get("negativeNeighborRate")
    if (
        retrieval_metrics.get("available")
        and negative_neighbor_rate is not None
        and negative_neighbor_rate >= 0.5
    ):
        items.append("Inspect LanceDB retrieval neighborhoods: most labeled neighbors are negative, which is a good candidate for repeated-failure clustering.")
    return items
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def category_recommendations(report: Dict[str, Any]) -> List[str]:
    """Flag the first category with enough support and a negative rate of at least 0.5."""
    threshold = report["minSupport"]
    worst = next(
        (
            row
            for row in report["categoryMetrics"]
            if row["support"] >= threshold and row["negativeRate"] >= 0.5
        ),
        None,
    )
    if not worst:
        return []
    return [
        f"Tighten prevention rules for {worst['category']}: {worst['negative']} negative signals across {worst['support']} entries."
    ]
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def tag_recommendations(report: Dict[str, Any]) -> List[str]:
    """Flag the first tag with enough support whose positive rate lies in [0.35, 0.65]."""
    threshold = report["minSupport"]
    for row in report["tagMetrics"]:
        if row["support"] >= threshold and 0.35 <= row["positiveRate"] <= 0.65:
            return [
                f"Review mixed-signal tag '{row['tag']}' before promoting broad rules; signal is not separable yet."
            ]
    return []
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def build_recommendations(report: Dict[str, Any]) -> List[str]:
    """Collect every recommendation for *report*, with a default message when none applies."""
    builders = (
        base_recommendations,
        storage_recommendations,
        category_recommendations,
        tag_recommendations,
    )
    collected: List[str] = []
    for builder in builders:
        collected += builder(report)
    if collected:
        return collected
    return ["No immediate eval action required; keep collecting feedback and rerun this report after the next batch."]
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def evaluate_feedback(
    entries: List[Dict[str, Any]],
    invalid_entries: int = 0,
    min_support: int = 2,
    sqlite_lessons: Optional[Dict[str, Any]] = None,
    retrieval_rows: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Build the feedback quality report.

    Only entries with a recognizable positive/negative signal are "usable".
    Each usable entry contributes its first matched category, its first tag
    (``"untagged"`` when it has none) and its failure type
    (``"unspecified"`` when empty) to the per-bucket metrics.

    Args:
        entries: Parsed feedback-log rows.
        invalid_entries: Count of unparseable log lines, copied into the report.
        min_support: Minimum bucket size for category/tag/failure-type rows.
        sqlite_lessons: Lesson summary; treated as unavailable when omitted.
        retrieval_rows: Exported retrieval rows; treated as empty when omitted.

    Returns:
        The report dict, including a ``recommendations`` list.
    """
    usable: List[Dict[str, Any]] = []
    signals: List[str] = []
    category_values: List[str] = []
    tag_values: List[str] = []
    failure_values: List[str] = []
    timestamps = []

    for entry in entries:
        signal = normalize_signal(entry)
        if signal not in ("positive", "negative"):
            continue
        usable.append(entry)
        signals.append(signal)

        category_values.append(classify_entry(entry)[0])

        raw_tags = entry.get("tags")
        tags = raw_tags if isinstance(raw_tags, list) else []
        tag_values.append(str(tags[0]).strip().lower() if tags else "untagged")

        failure_values.append(str(entry.get("failureType") or "unspecified").strip().lower())

        moment = parse_timestamp(entry.get("timestamp"))
        if moment:
            timestamps.append(moment)

    usable_total = len(usable)
    positive = signals.count("positive")
    negative = signals.count("negative")
    lessons = sqlite_lessons or {"available": False, "error": None}

    report = {
        "generatedAt": datetime.now(timezone.utc).isoformat(),
        "minSupport": min_support,
        "totalEntries": len(entries),
        "usableEntries": usable_total,
        "invalidEntries": invalid_entries,
        "positive": positive,
        "negative": negative,
        "positiveRate": rate(positive, usable_total),
        "negativeRate": rate(negative, usable_total),
        "firstTimestamp": min(timestamps).isoformat() if timestamps else None,
        "lastTimestamp": max(timestamps).isoformat() if timestamps else None,
        "categoryMetrics": summarize_bucket("category", category_values, signals, min_support),
        "tagMetrics": summarize_bucket("tag", tag_values, signals, min_support),
        "failureTypeMetrics": summarize_bucket("failureType", failure_values, signals, min_support),
        "gateMetrics": compute_gate_metrics(usable),
        "sqliteLessonMetrics": compute_sqlite_metrics(usable, lessons),
        "retrievalMetrics": compute_retrieval_metrics(retrieval_rows or []),
    }
    report["recommendations"] = build_recommendations(report)
    return report
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def render_markdown(report: Dict[str, Any]) -> str:
    """Render *report* as a Markdown document.

    Sections, in order: headline figures, gate metrics, up to eight category
    rows, SQLite lesson coverage, retrieval export, recommendations. The text
    ends with a newline.
    """
    out: List[str] = ["# Feedback Quality Eval", ""]
    out += [
        f"- Generated: {report['generatedAt']}",
        f"- Usable feedback: {report['usableEntries']} / {report['totalEntries']}",
        f"- Positive rate: {report['positiveRate']}",
        f"- Negative rate: {report['negativeRate']}",
    ]

    out += ["", "## Gate Metrics", ""]
    gate = report["gateMetrics"]
    if not gate["available"]:
        out.append(f"- {gate['note']}")
    else:
        gate_fields = (
            ("Labeled decisions", "labeledDecisions"),
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1", "f1"),
            ("False positive blocks", "falsePositiveBlocks"),
            ("False negative allows", "falseNegativeAllows"),
        )
        out += [f"- {label}: {gate[key]}" for label, key in gate_fields]

    out += ["", "## Highest-Risk Categories", ""]
    if not report["categoryMetrics"]:
        out.append("- Not enough category support yet.")
    else:
        out.append("| Category | Support | Positive | Negative | Negative rate |")
        out.append("| --- | ---: | ---: | ---: | ---: |")
        columns = ("category", "support", "positive", "negative", "negativeRate")
        for row in report["categoryMetrics"][:8]:
            cells = " | ".join(format(row[column]) for column in columns)
            out.append(f"| {cells} |")

    sqlite_metrics = report["sqliteLessonMetrics"]
    out += ["", "## SQLite Lesson Coverage", ""]
    if not sqlite_metrics["available"]:
        error = sqlite_metrics.get("error")
        suffix = ": " + error if error else ""
        out.append(f"- Not available{suffix}.")
    else:
        out += [
            f"- Lessons: {sqlite_metrics['totalLessons']}",
            f"- Feedback coverage: {sqlite_metrics['feedbackLessonCoverage']}",
            f"- Negative feedback coverage: {sqlite_metrics['negativeLessonCoverage']}",
        ]

    retrieval_metrics = report["retrievalMetrics"]
    out += ["", "## LanceDB Retrieval Export", ""]
    if not retrieval_metrics["available"]:
        out.append("- Not available. Export retrieval rows to JSONL to evaluate semantic recall quality.")
    else:
        out += [
            f"- Rows: {retrieval_metrics['rows']}",
            f"- Queries: {retrieval_metrics['queries']}",
            f"- Average top score: {retrieval_metrics['averageTopScore']}",
            f"- Negative neighbor rate: {retrieval_metrics['negativeNeighborRate']}",
        ]

    out += ["", "## Recommendations", ""]
    out += [f"- {item}" for item in report["recommendations"]]
    out.append("")
    return "\n".join(out)
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the feedback quality evaluator."""
    parser = argparse.ArgumentParser(description="Evaluate ThumbGate feedback quality from feedback-log.jsonl.")

    # Plain string options, registered in the order they appear in --help.
    path_options = (
        ("--feedback-log", "Path to feedback-log.jsonl. Defaults to the resolved ThumbGate feedback dir."),
        ("--feedback-dir", "Directory containing feedback-log.jsonl."),
        ("--lesson-db", "Path to lessons.sqlite for SQL lesson coverage metrics."),
        ("--retrieval-log", "JSONL export of LanceDB retrieval rows for semantic recall metrics."),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, help=help_text)

    parser.add_argument(
        "--min-support",
        type=int,
        default=2,
        help="Minimum bucket support for category/tag metrics.",
    )
    parser.add_argument("--json", action="store_true", help="Print JSON instead of Markdown.")
    parser.add_argument("--write-report", help="Write the rendered report to a file.")
    return parser.parse_args()
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def main() -> int:
    """Run the evaluator from the command line and return the exit status.

    Resolves the feedback log (``--feedback-log``, else ``--feedback-dir``,
    else the resolved default directory), loads the optional lesson database
    and retrieval export, builds the report and prints it as Markdown or
    JSON. With ``--write-report`` the same text is also written to that file,
    terminated by a newline. Always returns 0.
    """
    args = parse_args()

    if args.feedback_log:
        feedback_log = Path(args.feedback_log)
    else:
        base_dir = Path(args.feedback_dir) if args.feedback_dir else resolve_feedback_dir()
        feedback_log = base_dir / "feedback-log.jsonl"
    entries, invalid = read_jsonl(feedback_log)

    lesson_db = Path(args.lesson_db) if args.lesson_db else None
    retrieval_log = Path(args.retrieval_log) if args.retrieval_log else None
    if retrieval_log:
        retrieval_rows, retrieval_invalid = read_jsonl(retrieval_log)
    else:
        retrieval_rows, retrieval_invalid = [], 0
    sqlite_lessons = load_sqlite_lessons(lesson_db)

    report = evaluate_feedback(
        entries,
        invalid_entries=invalid,
        min_support=max(args.min_support, 1),  # never below 1
        sqlite_lessons=sqlite_lessons,
        retrieval_rows=retrieval_rows,
    )
    report.update(
        feedbackLog=str(feedback_log),
        lessonDb=str(lesson_db) if lesson_db else None,
        retrievalLog=str(retrieval_log) if retrieval_log else None,
        invalidRetrievalRows=retrieval_invalid,
    )

    if args.json:
        rendered = json.dumps(report, indent=2, sort_keys=True)
    else:
        rendered = render_markdown(report)

    if args.write_report:
        destination = Path(args.write_report)
        destination.parent.mkdir(parents=True, exist_ok=True)
        trailing = "" if rendered.endswith("\n") else "\n"
        destination.write_text(rendered + trailing, encoding="utf-8")

    print(rendered)
    return 0
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|