thumbgate 1.16.22 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,725 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Offline feedback quality evaluation for ThumbGate.
4
+
5
+ This is intentionally stdlib-only. It turns feedback-log.jsonl into a small
6
+ quality report that answers: where are repeated failures clustering, how stable
7
+ is the signal, and do we have enough labeled gate decisions to compute true
8
+ precision/recall yet?
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+ import math
14
+ import os
15
+ import re
16
+ import sqlite3
17
+ from collections import Counter, defaultdict
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
21
+
22
+
23
# Parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent.parent

# Vocabulary used to bucket feedback entries into coarse work categories.
# Each category lists free-text keywords and tool names that select it.
# Insertion order is significant: classify_entry() walks the mapping in order
# and evaluate_feedback() keeps only the first category that matched.
DEFAULT_CATEGORIES = {
    "code_edit": dict(
        keywords=["edit", "write", "implement", "refactor", "fix", "update", "create file"],
        tools=["edit", "write", "multiedit"],
    ),
    "git": dict(
        keywords=["commit", "push", "branch", "merge", "pr", "pull request", "rebase", "cherry-pick"],
        tools=["bash", "git"],
    ),
    "testing": dict(
        keywords=["test", "jest", "coverage", "verify", "verification", "spec", "mock", "assert"],
        tools=[],
    ),
    "review": dict(
        keywords=["review", "pr comment", "resolve", "thread", "feedback"],
        tools=[],
    ),
    "search": dict(
        keywords=["search", "find", "grep", "glob", "explore", "where is", "look for", "rg"],
        tools=["grep", "glob", "read", "rg"],
    ),
    "security": dict(
        keywords=["security", "secret", "credential", "token", "auth", "injection", "xss"],
        tools=[],
    ),
    "debugging": dict(
        keywords=["debug", "error", "crash", "stack trace", "log", "diagnose", "investigate"],
        tools=[],
    ),
}
55
+
56
+
57
def resolve_feedback_dir() -> Path:
    """Pick the directory that should hold the feedback log.

    Resolution order:
      1. the ``THUMBGATE_FEEDBACK_DIR`` environment variable, when non-empty;
      2. ``<PROJECT_ROOT>/.thumbgate`` if it exists;
      3. ``<PROJECT_ROOT>/.claude/memory/feedback`` if it exists;
      4. ``~/.thumbgate/projects/<PROJECT_ROOT.name>`` (returned whether or
         not it exists).
    """
    override = os.environ.get("THUMBGATE_FEEDBACK_DIR")
    if override:
        return Path(override)

    # Project-local locations, checked in priority order.
    project_local = (
        PROJECT_ROOT / ".thumbgate",
        PROJECT_ROOT / ".claude" / "memory" / "feedback",
    )
    for candidate in project_local:
        if candidate.exists():
            return candidate

    return Path.home() / ".thumbgate" / "projects" / PROJECT_ROOT.name
71
+
72
+
73
def read_jsonl(path: Path) -> Tuple[List[Dict[str, Any]], int]:
    """Read a JSON Lines file, keeping only lines that decode to JSON objects.

    Args:
        path: Location of the JSONL file.

    Returns:
        ``(rows, invalid)``: ``rows`` holds every line that parsed to a
        ``dict``, in file order; ``invalid`` counts the non-blank lines that
        were not valid JSON or were valid JSON but not an object.  A path that
        is not an existing regular file yields ``([], 0)``.
    """
    rows: List[Dict[str, Any]] = []
    invalid = 0
    # is_file() also rejects directories, which exists() would let through to
    # open() and crash on.
    if not path.is_file():
        return rows, invalid

    # "utf-8-sig" drops a leading byte-order mark if one is present and is
    # otherwise identical to "utf-8".  Without it the BOM stays glued to the
    # first line (str.strip() does not remove U+FEFF) and that record is
    # miscounted as invalid.
    with path.open("r", encoding="utf-8-sig") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                invalid += 1
                continue
            if isinstance(parsed, dict):
                rows.append(parsed)
            else:
                invalid += 1
    return rows, invalid
94
+
95
+
96
def load_sqlite_lessons(db_path: Optional[Path]) -> Dict[str, Any]:
    """Summarize the un-pruned rows of the ``lessons`` table in a SQLite DB.

    The database is opened read-only.  Failures are reported in the returned
    dict rather than raised.

    Args:
        db_path: Path to the SQLite file, or ``None`` when no DB was given.

    Returns:
        A dict with keys ``available``, ``path``, ``totalLessons``,
        ``bySignal``, ``byDomain``, ``sourceFeedbackIds`` and ``error``.
        ``available`` is ``False`` (with zeroed counts) when no path was
        given, the file does not exist, the ``lessons`` table is missing, or
        SQLite raised an error; ``error`` then carries the reason (``None``
        when no path was given).
    """

    def unavailable(error: Optional[str]) -> Dict[str, Any]:
        # Single source for the "nothing loaded" shape so every failure path
        # returns exactly the same keys.
        return {
            "available": False,
            "path": str(db_path) if db_path else None,
            "totalLessons": 0,
            "bySignal": {},
            "byDomain": {},
            "sourceFeedbackIds": [],
            "error": error,
        }

    if not db_path:
        return unavailable(None)
    if not db_path.exists():
        return unavailable("SQLite lesson DB does not exist.")

    # Path.as_uri() percent-encodes characters such as '#', '?' and '%'.
    # Interpolating the raw path into "file:...?mode=ro" lets those characters
    # be parsed as URI syntax, so SQLite would open a different file (and drop
    # the read-only flag when a '#' precedes the query string).
    uri = f"{db_path.resolve().as_uri()}?mode=ro"

    try:
        connection = sqlite3.connect(uri, uri=True)
        connection.row_factory = sqlite3.Row
        try:
            table_exists = connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='lessons'"
            ).fetchone()
            if not table_exists:
                return unavailable("SQLite DB does not contain a lessons table.")

            rows = connection.execute(
                "SELECT id, signal, domain, sourceFeedbackId FROM lessons WHERE pruned = 0"
            ).fetchall()
        finally:
            connection.close()
    except sqlite3.Error as exc:
        return unavailable(str(exc))

    # NULL or empty signal/domain values are grouped under "unknown".
    by_signal = Counter(str(row["signal"] or "unknown") for row in rows)
    by_domain = Counter(str(row["domain"] or "unknown") for row in rows)
    source_ids = sorted({
        str(row["sourceFeedbackId"])
        for row in rows
        if row["sourceFeedbackId"]
    })
    return {
        "available": True,
        "path": str(db_path),
        "totalLessons": len(rows),
        "bySignal": dict(sorted(by_signal.items())),
        "byDomain": dict(sorted(by_domain.items())),
        "sourceFeedbackIds": source_ids,
        "error": None,
    }
168
+
169
+
170
def normalize_signal(entry: Dict[str, Any]) -> Optional[str]:
    """Map an entry's feedback fields onto ``"positive"`` / ``"negative"``.

    The textual ``signal`` field (falling back to ``feedback``) is tried
    first, compared case-insensitively after trimming.  If it is not a known
    alias, the sign of a numeric ``reward`` decides.  Returns ``None`` when
    neither source yields a label.
    """
    label = str(entry.get("signal") or entry.get("feedback") or "").strip().lower()
    aliases = (
        ("positive", {"positive", "up", "thumbsup", "thumbs_up", "👍"}),
        ("negative", {"negative", "down", "thumbsdown", "thumbs_down", "👎"}),
    )
    for canonical, spellings in aliases:
        if label in spellings:
            return canonical

    reward = entry.get("reward")
    if not isinstance(reward, (int, float)):
        return None
    if reward > 0:
        return "positive"
    # A reward that is neither above nor below zero (0, NaN) carries no label.
    return "negative" if reward < 0 else None
184
+
185
+
186
def normalize_text(*values: Any) -> str:
    """Flatten heterogeneous values into one lower-cased, space-joined string.

    ``None`` values are dropped, lists contribute each item via ``str()``,
    dicts are serialized as JSON with sorted keys, and anything else is
    passed through ``str()``.
    """

    def fragments():
        for value in values:
            if value is None:
                continue
            if isinstance(value, list):
                yield from map(str, value)
            elif isinstance(value, dict):
                yield json.dumps(value, sort_keys=True)
            else:
                yield str(value)

    return " ".join(fragments()).lower()
198
+
199
+
200
def contains_keyword(text: str, keyword: str) -> bool:
    """Report whether ``keyword`` occurs in ``text``.

    The keyword is lower-cased and trimmed; ``text`` is used as given.
    Keywords of three characters or fewer, or made up solely of
    ``[a-z0-9_+-]``, must appear as a standalone token (not adjacent to
    another character of that class).  Longer keywords containing other
    characters, such as spaces, match as plain substrings.  A blank keyword
    never matches.
    """
    needle = keyword.lower().strip()
    if not needle:
        return False

    token_like = re.fullmatch(r"[a-z0-9_+-]+", needle) is not None
    if len(needle) > 3 and not token_like:
        return needle in text

    pattern = rf"(?<![a-z0-9_+-]){re.escape(needle)}(?![a-z0-9_+-])"
    return re.search(pattern, text) is not None
207
+
208
+
209
def classify_entry(entry: Dict[str, Any]) -> List[str]:
    """Return the categories from ``DEFAULT_CATEGORIES`` that match an entry.

    A category matches when one of its keywords occurs in the entry's
    free-text fields (including tags) or one of its tool names occurs in the
    entry's tool field.  Matches are returned in ``DEFAULT_CATEGORIES``
    order.  With no match, a non-empty string at ``richContext.domain`` is
    used as the only category; failing that, ``["uncategorized"]``.
    """
    raw_tags = entry.get("tags")
    tags = raw_tags if isinstance(raw_tags, list) else []
    tool = entry.get("toolName") or entry.get("tool_name") or entry.get("last_tool")

    text_fields = (
        "context",
        "whatWentWrong",
        "whatToChange",
        "whatWorked",
        "actionReason",
        "failureType",
    )
    free_text = normalize_text(*(entry.get(field) for field in text_fields), tags)
    tool_text = normalize_text(tool)

    def mentions(haystack: str, needles: List[str]) -> bool:
        return any(contains_keyword(haystack, needle) for needle in needles)

    categories = [
        category
        for category, config in DEFAULT_CATEGORIES.items()
        if mentions(free_text, config["keywords"]) or mentions(tool_text, config["tools"])
    ]
    if categories:
        return categories

    # Fall back to a caller-supplied domain before giving up.
    rich_context = entry.get("richContext")
    domain = rich_context.get("domain") if isinstance(rich_context, dict) else None
    if isinstance(domain, str) and domain:
        return [domain]
    return ["uncategorized"]
236
+
237
+
238
def parse_timestamp(value: Any) -> Optional[datetime]:
    """Parse an ISO-8601 string into a timezone-aware ``datetime``.

    ``"Z"`` is rewritten to ``"+00:00"`` before parsing, and a result with no
    offset is treated as UTC.  Returns ``None`` for non-strings, empty
    strings, and strings ``datetime.fromisoformat`` rejects.
    """
    if not (isinstance(value, str) and value):
        return None
    try:
        moment = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
    if moment.tzinfo is not None:
        return moment
    return moment.replace(tzinfo=timezone.utc)
249
+
250
+
251
def rate(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator`` rounded to 4 places, or 0.0 when the denominator is falsy."""
    if not denominator:
        return 0.0
    return round(numerator / denominator, 4)
253
+
254
+
255
def wilson_lower_bound(positive: int, total: int, z: float = 1.96) -> float:
    """Lower end of the Wilson score interval for ``positive`` out of ``total``.

    Args:
        positive: Number of positive observations.
        total: Number of observations; values <= 0 yield 0.0.
        z: Normal quantile for the interval (default 1.96).

    Returns:
        The lower bound rounded to 4 decimal places.
    """
    if total <= 0:
        return 0.0

    proportion = positive / total
    z_squared = z * z
    denominator = 1 + z_squared / total
    centre = proportion + z_squared / (2 * total)
    variance_term = proportion * (1 - proportion) + z_squared / (4 * total)
    margin = z * math.sqrt(variance_term / total)
    return round((centre - margin) / denominator, 4)
263
+
264
+
265
def summarize_bucket(name: str, values: Iterable[str], signals: List[str], min_support: int) -> List[Dict[str, Any]]:
    """Aggregate positive/negative counts per bucket value.

    Args:
        name: Key under which each row stores its bucket value.
        values: Bucket value for each entry, paired positionally with ``signals``.
        signals: Signal label for each entry.
        min_support: Buckets with fewer positive+negative entries are dropped.

    Returns:
        One row per surviving bucket, sorted by highest negative rate, then
        highest support, then bucket value.
    """
    tallies: Dict[str, Counter] = defaultdict(Counter)
    for bucket_value, signal in zip(values, signals):
        tallies[bucket_value][signal] += 1

    def build_row(bucket_value: str, tally: Counter) -> Dict[str, Any]:
        ups, downs = tally["positive"], tally["negative"]
        support = ups + downs
        return {
            name: bucket_value,
            "support": support,
            "positive": ups,
            "negative": downs,
            "positiveRate": rate(ups, support),
            "negativeRate": rate(downs, support),
            "wilsonPositiveLower": wilson_lower_bound(ups, support),
        }

    rows = [
        build_row(bucket_value, tally)
        for bucket_value, tally in tallies.items()
        if tally["positive"] + tally["negative"] >= min_support
    ]
    rows.sort(key=lambda row: (-row["negativeRate"], -row["support"], row[name]))
    return rows
288
+
289
+
290
def explicit_gate_label(entry: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Derive ``(expected, actual)`` gate labels from a feedback entry.

    expected: ``"harmful"`` for negative feedback, ``"safe"`` for positive;
        ``None`` (with ``actual`` also ``None``) when the entry has no signal.
    actual: ``"blocked"`` / ``"allowed"`` taken from, in priority order, the
        string fields ``gateDecision``, ``decision``, ``outcome``, ``status``;
        the boolean ``allowed``; the boolean ``blocked``; or
        ``actionType == "no-action"`` (counted as blocked).  ``None`` when
        none of these is present.
    """
    signal = normalize_signal(entry)
    if not signal:
        return None, None

    expected = "harmful" if signal == "negative" else "safe"

    blocked_words = {"block", "blocked", "deny", "denied", "rejected"}
    allowed_words = {"allow", "allowed", "pass", "passed", "accepted"}
    for field in ("gateDecision", "decision", "outcome", "status"):
        verdict = str(entry.get(field) or "").lower()
        if verdict in blocked_words:
            return expected, "blocked"
        if verdict in allowed_words:
            return expected, "allowed"

    actual = None
    allowed_flag = entry.get("allowed")
    blocked_flag = entry.get("blocked")
    if isinstance(allowed_flag, bool):
        actual = "allowed" if allowed_flag else "blocked"
    elif isinstance(blocked_flag, bool):
        actual = "blocked" if blocked_flag else "allowed"
    elif entry.get("actionType") == "no-action":
        actual = "blocked"
    return expected, actual
317
+
318
+
319
def compute_sqlite_metrics(entries: List[Dict[str, Any]], sqlite_lessons: Dict[str, Any]) -> Dict[str, Any]:
    """Measure how much of the feedback log is backed by SQLite lesson rows.

    Args:
        entries: Feedback entries; only those with a truthy ``id`` count.
        sqlite_lessons: Summary dict in the shape ``load_sqlite_lessons`` returns.

    Returns:
        A dict with lesson totals plus ``feedbackLessonCoverage`` (share of
        feedback ids referenced by a lesson) and ``negativeLessonCoverage``
        (the same, restricted to negative feedback).  When the lesson summary
        is unavailable all figures are zero and ``error`` is forwarded.
    """
    metrics: Dict[str, Any] = {
        "available": False,
        "path": sqlite_lessons.get("path"),
        "totalLessons": 0,
        "feedbackLessonCoverage": 0.0,
        "negativeLessonCoverage": 0.0,
        "bySignal": {},
        "byDomain": {},
        "error": sqlite_lessons.get("error"),
    }
    if not sqlite_lessons.get("available"):
        return metrics

    all_ids = set()
    negative_ids = set()
    for entry in entries:
        entry_id = entry.get("id")
        if not entry_id:
            continue
        all_ids.add(str(entry_id))
        if normalize_signal(entry) == "negative":
            negative_ids.add(str(entry_id))

    covered = set(sqlite_lessons.get("sourceFeedbackIds") or [])

    metrics.update({
        "available": True,
        "totalLessons": sqlite_lessons.get("totalLessons", 0),
        "feedbackLessonCoverage": rate(len(all_ids & covered), len(all_ids)),
        "negativeLessonCoverage": rate(len(negative_ids & covered), len(negative_ids)),
        "bySignal": sqlite_lessons.get("bySignal") or {},
        "byDomain": sqlite_lessons.get("byDomain") or {},
        "error": None,
    })
    return metrics
350
+
351
+
352
def retrieval_score(row: Dict[str, Any]) -> Optional[float]:
    """Return the first usable similarity score found on a retrieval row.

    The keys ``score``, ``similarity``, ``distanceScore`` and
    ``topSimilarity`` are tried in that order.  A value is usable when
    ``float()`` accepts it and the result is finite.  Missing, non-numeric,
    NaN and infinite values are skipped so they cannot poison downstream
    averages.

    Returns:
        The score as a ``float``, or ``None`` when no key holds a usable value.
    """
    for key in ("score", "similarity", "distanceScore", "topSimilarity"):
        try:
            score = float(row.get(key))
        except (TypeError, ValueError, OverflowError):
            # None / unparsable text / ints too large for a float.
            continue
        # Check finiteness after conversion, so that NaN and infinity are
        # rejected whether they arrived as floats or as strings like "inf".
        if math.isfinite(score):
            return score
    return None
362
+
363
+
364
def feedback_id_for_retrieval(row: Dict[str, Any]) -> Optional[str]:
    """Return the first truthy id on a retrieval row as a string.

    Keys are tried in the order ``feedbackId``, ``sourceFeedbackId``,
    ``queryFeedbackId``, ``id``.  Returns ``None`` when none is set.
    """
    id_fields = ("feedbackId", "sourceFeedbackId", "queryFeedbackId", "id")
    found = next((value for value in map(row.get, id_fields) if value), None)
    return None if found is None else str(found)
370
+
371
+
372
def unavailable_retrieval_metrics() -> Dict[str, Any]:
    """Return the retrieval-metrics placeholder used when there are no rows."""
    metrics: Dict[str, Any] = {"available": False, "rows": 0, "queries": 0}
    # The remaining fields are all "unknown", i.e. None.
    metrics.update(dict.fromkeys(("averageTopScore", "negativeNeighborRate", "error")))
    return metrics
381
+
382
+
383
def bucket_retrieval_rows(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group retrieval rows by feedback id; rows without one go under ``"unknown"``."""
    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for row in retrieval_rows:
        feedback_id = feedback_id_for_retrieval(row)
        grouped[feedback_id if feedback_id else "unknown"].append(row)
    return grouped
389
+
390
+
391
+ def top_retrieval_scores(by_feedback: Dict[str, List[Dict[str, Any]]]) -> List[float]:
392
+ top_scores = []
393
+ for rows in by_feedback.values():
394
+ scores = [score for score in (retrieval_score(row) for row in rows) if score is not None]
395
+ if scores:
396
+ top_scores.append(max(scores))
397
+ return top_scores
398
+
399
+
400
def retrieval_neighbor_summary(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count retrieval rows whose neighbor carries a signal, and how many are negative.

    The neighbor's signal is read from ``matchedSignal``, then
    ``neighborSignal``, then ``signal``.  Rows whose signal cannot be
    normalized are ignored.
    """
    labeled = 0
    negative = 0
    for row in retrieval_rows:
        raw_signal = row.get("matchedSignal") or row.get("neighborSignal") or row.get("signal")
        label = normalize_signal({"signal": raw_signal})
        if not label:
            continue
        labeled += 1
        if label == "negative":
            negative += 1
    return {"labeled": labeled, "negative": negative}
412
+
413
+
414
def compute_retrieval_metrics(retrieval_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize exported retrieval rows.

    Returns the "unavailable" placeholder for an empty input.  Otherwise
    reports the row count, the number of distinct feedback-id groups, the
    mean of each group's best score (``None`` if no row has a score), and the
    share of labeled neighbors that are negative (``None`` if no neighbor is
    labeled).
    """
    if not retrieval_rows:
        return unavailable_retrieval_metrics()

    grouped = bucket_retrieval_rows(retrieval_rows)
    best_scores = top_retrieval_scores(grouped)
    neighbors = retrieval_neighbor_summary(retrieval_rows)

    average_top = None
    if best_scores:
        average_top = round(sum(best_scores) / len(best_scores), 4)

    negative_rate = None
    if neighbors["labeled"]:
        negative_rate = rate(neighbors["negative"], neighbors["labeled"])

    return {
        "available": True,
        "rows": len(retrieval_rows),
        "queries": len(grouped),
        "averageTopScore": average_top,
        "negativeNeighborRate": negative_rate,
        "error": None,
    }
431
+
432
+
433
# Confusion-matrix lookup: (expected label, actual gate decision) -> counter name.
# "Positive" here means the gate blocked the action.
GATE_OUTCOME_KEYS = {
    (expected, actual): counter
    for expected, actual, counter in (
        ("harmful", "blocked", "truePositiveBlocks"),
        ("safe", "allowed", "trueNegativeAllows"),
        ("safe", "blocked", "falsePositiveBlocks"),
        ("harmful", "allowed", "falseNegativeAllows"),
    )
}
439
+
440
+
441
def initial_gate_counts() -> Dict[str, int]:
    """Return a fresh, zeroed counter dict for gate outcome tallies."""
    counter_names = (
        "truePositiveBlocks",
        "trueNegativeAllows",
        "falsePositiveBlocks",
        "falseNegativeAllows",
        "unlabeledFeedback",
    )
    return dict.fromkeys(counter_names, 0)
449
+
450
+
451
def count_gate_outcomes(entries: List[Dict[str, Any]]) -> Dict[str, int]:
    """Tally confusion-matrix outcomes across feedback entries.

    Entries without a signal are ignored.  Entries with a signal but no gate
    decision are counted under ``unlabeledFeedback``.  All others increment
    the counter that ``GATE_OUTCOME_KEYS`` maps their (expected, actual) pair to.
    """
    tally = initial_gate_counts()

    for expected, actual in map(explicit_gate_label, entries):
        if expected is None:
            continue
        if actual is None:
            tally["unlabeledFeedback"] += 1
            continue

        counter_name = GATE_OUTCOME_KEYS.get((expected, actual))
        if counter_name:
            tally[counter_name] += 1

    return tally
467
+
468
+
469
def compute_f1(precision: Optional[float], recall: Optional[float], labeled: int) -> Optional[float]:
    """Harmonic mean of precision and recall, rounded to 4 places.

    Returns ``None`` when there are no labeled decisions, and ``0.0`` when
    either precision or recall is zero or missing.
    """
    if not labeled:
        return None
    if precision and recall:
        return round((2 * precision * recall) / (precision + recall), 4)
    return 0.0
475
+
476
+
477
def compute_gate_metrics(entries: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Compute precision/recall/F1 of gate decisions against feedback signals.

    A "positive" is a block: precision is TP / (TP + FP) and recall is
    TP / (TP + FN).  When no entry carries an explicit gate decision the
    ratios are ``None``, ``available`` is ``False`` and ``note`` explains why.
    """
    counts = count_gate_outcomes(entries)
    tp, tn, fp, fn = (
        counts[name]
        for name in (
            "truePositiveBlocks",
            "trueNegativeAllows",
            "falsePositiveBlocks",
            "falseNegativeAllows",
        )
    )

    labeled = tp + tn + fp + fn
    if labeled:
        precision = rate(tp, tp + fp)
        recall = rate(tp, tp + fn)
        note = None
    else:
        precision = None
        recall = None
        note = "No explicit gate decision labels found; feedback quality metrics are available, but classifier precision/recall needs blocked/allowed labels."

    return {
        "available": labeled > 0,
        "labeledDecisions": labeled,
        "unlabeledFeedback": counts["unlabeledFeedback"],
        "truePositiveBlocks": tp,
        "trueNegativeAllows": tn,
        "falsePositiveBlocks": fp,
        "falseNegativeAllows": fn,
        "precision": precision,
        "recall": recall,
        "f1": compute_f1(precision, recall, labeled),
        "note": note,
    }
502
+
503
+
504
def base_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend data-collection steps when the report lacks volume or gate labels."""
    checks = (
        (
            report["usableEntries"] < 10,
            "Collect at least 10 usable feedback entries before making threshold changes.",
        ),
        (
            not report["gateMetrics"]["available"],
            "Start logging gate decisions as blocked/allowed so precision, recall, and false-positive rate can be computed.",
        ),
    )
    return [message for triggered, message in checks if triggered]
511
+
512
+
513
def storage_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend follow-ups based on SQLite lesson coverage and retrieval neighbors.

    Flags negative-feedback lesson coverage below 0.8 (when SQLite metrics
    are available) and a labeled-neighbor negative rate of 0.5 or more (when
    retrieval metrics are available and the rate is known).
    """
    advice = []

    lessons = report.get("sqliteLessonMetrics") or {}
    if lessons.get("available"):
        if lessons.get("negativeLessonCoverage", 0) < 0.8:
            advice.append(
                "Backfill SQLite lesson rows for negative feedback before treating SQL dashboards as complete eval evidence."
            )

    retrieval = report.get("retrievalMetrics") or {}
    neighbor_rate = retrieval.get("negativeNeighborRate")
    if retrieval.get("available") and neighbor_rate is not None and neighbor_rate >= 0.5:
        advice.append(
            "Inspect LanceDB retrieval neighborhoods: most labeled neighbors are negative, which is a good candidate for repeated-failure clustering."
        )

    return advice
523
+
524
+
525
def category_recommendations(report: Dict[str, Any]) -> List[str]:
    """Recommend tightening rules for the first category that is mostly negative.

    A category qualifies when its support reaches ``report["minSupport"]``
    and its negative rate is at least 0.5.  Only the first qualifying row, in
    the order of ``report["categoryMetrics"]``, is reported.
    """
    struggling = [
        row
        for row in report["categoryMetrics"]
        if row["support"] >= report["minSupport"] and row["negativeRate"] >= 0.5
    ]
    if not struggling:
        return []

    worst = struggling[0]
    return [
        f"Tighten prevention rules for {worst['category']}: {worst['negative']} negative signals across {worst['support']} entries."
    ]
537
+
538
+
539
def tag_recommendations(report: Dict[str, Any]) -> List[str]:
    """Flag the first tag whose feedback is roughly evenly split.

    A tag qualifies when its support reaches ``report["minSupport"]`` and its
    positive rate lies within [0.35, 0.65].
    """
    mixed = [
        row
        for row in report["tagMetrics"]
        if row["support"] >= report["minSupport"] and 0.35 <= row["positiveRate"] <= 0.65
    ]
    advice = []
    if mixed:
        advice.append(
            f"Review mixed-signal tag '{mixed[0]['tag']}' before promoting broad rules; signal is not separable yet."
        )
    return advice
549
+
550
+
551
def build_recommendations(report: Dict[str, Any]) -> List[str]:
    """Collect recommendations from every advisor, with a default when none fire.

    Advisors run in the order base, storage, category, tag.
    """
    advisors = (
        base_recommendations,
        storage_recommendations,
        category_recommendations,
        tag_recommendations,
    )
    collected: List[str] = []
    for advise in advisors:
        collected += advise(report)

    if collected:
        return collected
    return ["No immediate eval action required; keep collecting feedback and rerun this report after the next batch."]
560
+
561
+
562
def evaluate_feedback(
    entries: List[Dict[str, Any]],
    invalid_entries: int = 0,
    min_support: int = 2,
    sqlite_lessons: Optional[Dict[str, Any]] = None,
    retrieval_rows: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Build the full feedback quality report.

    Args:
        entries: Parsed feedback-log rows.
        invalid_entries: Number of log lines that failed to parse (reported as-is).
        min_support: Minimum bucket size for category/tag/failure-type metrics.
        sqlite_lessons: Lesson summary in the shape ``load_sqlite_lessons``
            returns; treated as unavailable when omitted.
        retrieval_rows: Exported retrieval rows; treated as empty when omitted.

    Returns:
        The report dict, including a ``recommendations`` list.  Only entries
        with a recognizable positive/negative signal are "usable".  Each
        usable entry is bucketed by its first matched category, its first tag
        and its failure type.
    """

    def first_tag(entry: Dict[str, Any]) -> str:
        tags = entry.get("tags")
        if isinstance(tags, list) and tags:
            return str(tags[0]).strip().lower()
        return "untagged"

    usable = []
    signals = []
    for entry in entries:
        signal = normalize_signal(entry)
        if signal in {"positive", "negative"}:
            usable.append(entry)
            signals.append(signal)

    category_values = [classify_entry(entry)[0] for entry in usable]
    tag_values = [first_tag(entry) for entry in usable]
    failure_values = [
        str(entry.get("failureType") or "unspecified").strip().lower()
        for entry in usable
    ]
    parsed_stamps = (parse_timestamp(entry.get("timestamp")) for entry in usable)
    timestamps = [stamp for stamp in parsed_stamps if stamp]

    usable_count = len(usable)
    positive = signals.count("positive")
    negative = signals.count("negative")
    lessons = sqlite_lessons or {"available": False, "error": None}

    report = {
        "generatedAt": datetime.now(timezone.utc).isoformat(),
        "minSupport": min_support,
        "totalEntries": len(entries),
        "usableEntries": usable_count,
        "invalidEntries": invalid_entries,
        "positive": positive,
        "negative": negative,
        "positiveRate": rate(positive, usable_count),
        "negativeRate": rate(negative, usable_count),
        "firstTimestamp": min(timestamps).isoformat() if timestamps else None,
        "lastTimestamp": max(timestamps).isoformat() if timestamps else None,
        "categoryMetrics": summarize_bucket("category", category_values, signals, min_support),
        "tagMetrics": summarize_bucket("tag", tag_values, signals, min_support),
        "failureTypeMetrics": summarize_bucket("failureType", failure_values, signals, min_support),
        "gateMetrics": compute_gate_metrics(usable),
        "sqliteLessonMetrics": compute_sqlite_metrics(usable, lessons),
        "retrievalMetrics": compute_retrieval_metrics(retrieval_rows or []),
    }
    # Recommendations read the metrics above, so they are attached last.
    report["recommendations"] = build_recommendations(report)
    return report
614
+
615
+
616
def render_markdown(report: Dict[str, Any]) -> str:
    """Render a report dict as Markdown.

    Sections, in order: summary header, gate metrics, the top eight category
    rows as a table, SQLite lesson coverage, retrieval export metrics and
    recommendations.  The result ends with a single trailing newline.
    """
    out: List[str] = []

    def heading(title: str) -> None:
        out.extend(["", title, ""])

    out += [
        "# Feedback Quality Eval",
        "",
        f"- Generated: {report['generatedAt']}",
        f"- Usable feedback: {report['usableEntries']} / {report['totalEntries']}",
        f"- Positive rate: {report['positiveRate']}",
        f"- Negative rate: {report['negativeRate']}",
    ]

    heading("## Gate Metrics")
    gate = report["gateMetrics"]
    if not gate["available"]:
        out.append(f"- {gate['note']}")
    else:
        out += [
            f"- Labeled decisions: {gate['labeledDecisions']}",
            f"- Precision: {gate['precision']}",
            f"- Recall: {gate['recall']}",
            f"- F1: {gate['f1']}",
            f"- False positive blocks: {gate['falsePositiveBlocks']}",
            f"- False negative allows: {gate['falseNegativeAllows']}",
        ]

    heading("## Highest-Risk Categories")
    if not report["categoryMetrics"]:
        out.append("- Not enough category support yet.")
    else:
        out.append("| Category | Support | Positive | Negative | Negative rate |")
        out.append("| --- | ---: | ---: | ---: | ---: |")
        out += [
            f"| {row['category']} | {row['support']} | {row['positive']} | {row['negative']} | {row['negativeRate']} |"
            for row in report["categoryMetrics"][:8]
        ]

    sqlite_metrics = report["sqliteLessonMetrics"]
    heading("## SQLite Lesson Coverage")
    if not sqlite_metrics["available"]:
        reason = ": " + sqlite_metrics["error"] if sqlite_metrics.get("error") else ""
        out.append(f"- Not available{reason}.")
    else:
        out += [
            f"- Lessons: {sqlite_metrics['totalLessons']}",
            f"- Feedback coverage: {sqlite_metrics['feedbackLessonCoverage']}",
            f"- Negative feedback coverage: {sqlite_metrics['negativeLessonCoverage']}",
        ]

    retrieval_metrics = report["retrievalMetrics"]
    heading("## LanceDB Retrieval Export")
    if not retrieval_metrics["available"]:
        out.append("- Not available. Export retrieval rows to JSONL to evaluate semantic recall quality.")
    else:
        out += [
            f"- Rows: {retrieval_metrics['rows']}",
            f"- Queries: {retrieval_metrics['queries']}",
            f"- Average top score: {retrieval_metrics['averageTopScore']}",
            f"- Negative neighbor rate: {retrieval_metrics['negativeNeighborRate']}",
        ]

    heading("## Recommendations")
    out += [f"- {item}" for item in report["recommendations"]]
    # The empty final element yields the trailing newline after join().
    out.append("")
    return "\n".join(out)
677
+
678
+
679
def parse_args() -> argparse.Namespace:
    """Parse this script's command-line options from ``sys.argv``."""
    parser = argparse.ArgumentParser(description="Evaluate ThumbGate feedback quality from feedback-log.jsonl.")

    # Plain string-valued options that all default to None.
    path_options = (
        ("--feedback-log", "Path to feedback-log.jsonl. Defaults to the resolved ThumbGate feedback dir."),
        ("--feedback-dir", "Directory containing feedback-log.jsonl."),
        ("--lesson-db", "Path to lessons.sqlite for SQL lesson coverage metrics."),
        ("--retrieval-log", "JSONL export of LanceDB retrieval rows for semantic recall metrics."),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, help=help_text)

    parser.add_argument("--min-support", type=int, default=2, help="Minimum bucket support for category/tag metrics.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of Markdown.")
    parser.add_argument("--write-report", help="Write the rendered report to a file.")
    return parser.parse_args()
689
+
690
+
691
def main() -> int:
    """Run the evaluation from the command line and print the report.

    Reads the feedback log (plus the optional lesson DB and retrieval
    export), builds the report, prints it as Markdown or JSON, and writes it
    to ``--write-report`` when that option is given.  Always returns 0.
    """
    args = parse_args()

    if args.feedback_log:
        feedback_log = Path(args.feedback_log)
    else:
        feedback_dir = Path(args.feedback_dir) if args.feedback_dir else resolve_feedback_dir()
        feedback_log = feedback_dir / "feedback-log.jsonl"

    entries, invalid = read_jsonl(feedback_log)
    lesson_db = Path(args.lesson_db) if args.lesson_db else None
    retrieval_log = Path(args.retrieval_log) if args.retrieval_log else None
    if retrieval_log:
        retrieval_rows, retrieval_invalid = read_jsonl(retrieval_log)
    else:
        retrieval_rows, retrieval_invalid = [], 0
    sqlite_lessons = load_sqlite_lessons(lesson_db)

    report = evaluate_feedback(
        entries,
        invalid_entries=invalid,
        # Support below 1 would not filter anything, so clamp it.
        min_support=max(args.min_support, 1),
        sqlite_lessons=sqlite_lessons,
        retrieval_rows=retrieval_rows,
    )
    report["feedbackLog"] = str(feedback_log)
    report["lessonDb"] = str(lesson_db) if lesson_db else None
    report["retrievalLog"] = str(retrieval_log) if retrieval_log else None
    report["invalidRetrievalRows"] = retrieval_invalid

    if args.json:
        output = json.dumps(report, indent=2, sort_keys=True)
    else:
        output = render_markdown(report)

    if args.write_report:
        out_path = Path(args.write_report)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # Ensure the written file ends with exactly one added newline if needed.
        file_text = output if output.endswith("\n") else output + "\n"
        out_path.write_text(file_text, encoding="utf-8")
    print(output)
    return 0
722
+
723
+
724
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)