superlocalmemory 2.6.0 → 2.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +122 -1806
- package/README.md +142 -410
- package/docs/ACCESSIBILITY.md +291 -0
- package/docs/FRAMEWORK-INTEGRATIONS.md +300 -0
- package/package.json +1 -1
- package/src/learning/__init__.py +201 -0
- package/src/learning/adaptive_ranker.py +826 -0
- package/src/learning/cross_project_aggregator.py +866 -0
- package/src/learning/engagement_tracker.py +638 -0
- package/src/learning/feature_extractor.py +461 -0
- package/src/learning/feedback_collector.py +690 -0
- package/src/learning/learning_db.py +842 -0
- package/src/learning/project_context_manager.py +582 -0
- package/src/learning/source_quality_scorer.py +685 -0
- package/src/learning/workflow_pattern_miner.py +665 -0
- package/ui/index.html +346 -13
- package/ui/js/clusters.js +90 -1
- package/ui/js/graph-core.js +445 -0
- package/ui/js/graph-cytoscape-monolithic-backup.js +1168 -0
- package/ui/js/graph-cytoscape.js +1168 -0
- package/ui/js/graph-d3-backup.js +32 -0
- package/ui/js/graph-filters.js +220 -0
- package/ui/js/graph-interactions.js +354 -0
- package/ui/js/graph-ui.js +214 -0
- package/ui/js/memories.js +52 -0
- package/ui/js/modal.js +104 -1
#!/usr/bin/env python3
"""
SuperLocalMemory V2 - Source Quality Scorer (v2.7)
Copyright (c) 2026 Varun Pratap Bhardwaj
Licensed under MIT License

Repository: https://github.com/varun369/SuperLocalMemoryV2
Author: Varun Pratap Bhardwaj (Solution Architect)

NOTICE: This software is protected by MIT License.
Attribution must be preserved in all copies or derivatives.

SourceQualityScorer — Per-source quality learning.

Learns which memory sources (tools/agents) produce memories that users
actually find useful. If memories from 'mcp:claude-desktop' get positive
feedback (via memory_used) 3x more often than memories from 'cli:terminal',
then Claude Desktop memories receive a quality boost in the adaptive ranker.

Data Sources:
    - memory.db `created_by` column (set by ProvenanceTracker in v2.5)
      Values like: 'mcp:claude-desktop', 'mcp:cursor', 'cli:terminal',
      'rest:api', 'user', etc.
    - learning.db `ranking_feedback` table (positive signals from FeedbackCollector)
      Signal types: 'mcp_used', 'cli_useful', 'dashboard_click'

Scoring Algorithm (Beta-Binomial Smoothing):
    quality_score = (alpha + positive_signals) / (alpha + beta + total_memories)

    With alpha=1, beta=1 (Laplace smoothing / uniform prior):
    - Unknown source with 0 feedback: 1/(2+0) = 0.50 (neutral)
    - Source with 5 positives out of 10 total: 6/12 = 0.50 (average)
    - Source with 8 positives out of 10 total: 9/12 = 0.75 (good)
    - Source with 1 positive out of 10 total: 2/12 = 0.17 (poor)

    This naturally handles:
    - Cold start: new sources get 0.5 (neutral) until evidence accumulates
    - Low sample: smoothing prevents extreme scores from few observations
    - Convergence: scores stabilize as evidence grows

Storage:
    Results stored in learning.db `source_quality` table via LearningDB.
    The adaptive ranker reads source_quality at query time to boost/penalize
    memories based on their source.

Thread Safety:
    - All writes protected by LearningDB's internal write lock
    - Reads to memory.db use per-call connections (safe with WAL mode)
    - compute_source_scores() is idempotent — safe to call concurrently

Graceful Degradation:
    - If memory.db lacks `created_by` column: all memories grouped as 'unknown'
    - If learning.db unavailable: scores computed but not persisted
    - If ranking_feedback is empty: all sources get 0.5 (neutral)

Research Backing:
    - Beta-Binomial smoothing: Standard Bayesian approach (matches trust_scorer.py)
    - Source reliability learning: ADPMF (IPM 2024) privacy-preserving feedback
    - FCS LREC 2024: cold-start handling via smoothing priors
"""

# NOTE(review): `json` and `datetime` are not referenced anywhere in this
# module — confirm against the rest of the package before removing.
import json
import logging
import sqlite3
import threading
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any

logger = logging.getLogger("superlocalmemory.learning.source_quality")

# ---------------------------------------------------------------------------
# Import LearningDB (sibling module in src/learning/)
# ---------------------------------------------------------------------------
try:
    from .learning_db import LearningDB
except ImportError:
    try:
        from learning_db import LearningDB
    except ImportError:
        LearningDB = None
        logger.warning(
            "LearningDB not available — source quality scores will not persist."
        )

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

MEMORY_DIR = Path.home() / ".claude-memory"
DEFAULT_MEMORY_DB = MEMORY_DIR / "memory.db"

# Beta-Binomial prior parameters (Laplace smoothing)
ALPHA = 1.0  # Prior successes
BETA = 1.0   # Prior failures

# Default score for unknown sources (= alpha / (alpha + beta))
DEFAULT_QUALITY_SCORE = ALPHA / (ALPHA + BETA)

# Minimum total memories from a source before we trust its score.
# Below this, the score is meant to be blended toward the default.
# NOTE(review): no code in this module currently applies this blending —
# the constant is unused here; confirm whether the ranker consumes it.
MIN_EVIDENCE_THRESHOLD = 5

# Positive feedback signal types from ranking_feedback table
POSITIVE_SIGNAL_TYPES = ("mcp_used", "cli_useful", "dashboard_click")
class SourceQualityScorer:
    """
    Learns which memory sources produce higher-quality memories.

    Computes a quality score per source using Beta-Binomial smoothing
    over positive feedback signals. Stores results in learning.db for
    use by the adaptive ranker.

    Usage:
        scorer = SourceQualityScorer()
        scores = scorer.compute_source_scores()
        # scores = {'mcp:claude-desktop': 0.72, 'cli:terminal': 0.45, ...}

        boost = scorer.get_source_boost(memory_dict)
        # boost = 0.72 (for a memory from claude-desktop)
    """

    def __init__(
        self,
        memory_db_path: Optional[Path] = None,
        learning_db: Optional[Any] = None,
    ):
        """
        Initialize the source quality scorer.

        Args:
            memory_db_path: Path to memory.db (READ-ONLY). Defaults to
                            ~/.claude-memory/memory.db.
            learning_db: A LearningDB instance for reading feedback and
                         storing scores. If None, one is created.
        """
        self.memory_db_path = Path(memory_db_path) if memory_db_path else DEFAULT_MEMORY_DB
        self._lock = threading.Lock()

        # In-memory cache of source scores (refreshed by compute_source_scores)
        self._cached_scores: Dict[str, float] = {}

        # Initialize LearningDB: explicit instance wins; otherwise fall back
        # to the module singleton; degrade to None if neither is available.
        if learning_db is not None:
            self._learning_db = learning_db
        elif LearningDB is not None:
            try:
                self._learning_db = LearningDB.get_instance()
            except Exception as e:
                logger.error("Failed to initialize LearningDB: %s", e)
                self._learning_db = None
        else:
            self._learning_db = None

        # Pre-load cached scores from learning.db if available, so
        # get_source_boost() works before the first compute pass.
        self._load_cached_scores()

        logger.info(
            "SourceQualityScorer initialized: memory_db=%s, learning_db=%s, "
            "cached_sources=%d",
            self.memory_db_path,
            "available" if self._learning_db else "unavailable",
            len(self._cached_scores),
        )

    # ======================================================================
    # Core Scoring
    # ======================================================================

    def compute_source_scores(self) -> Dict[str, float]:
        """
        Compute quality scores for all memory sources.

        Workflow:
        1. Get total memories per source from memory.db (created_by column)
        2. Get positive feedback count per source by joining
           learning.db ranking_feedback with memory.db memories
        3. Compute Beta-Binomial smoothed score per source
        4. Store results in learning.db source_quality table
        5. Update in-memory cache

        Returns:
            Dict mapping source_id -> quality_score (0.0 to 1.0)
        """
        # Step 1: Count total memories per source from memory.db
        source_totals = self._get_memory_counts_by_source()

        if not source_totals:
            logger.info("No source data found in memory.db.")
            return {}

        # Step 2: Count positive signals per source
        source_positives = self._get_positive_signal_counts(
            set(source_totals.keys())
        )

        # Step 3: Compute Beta-Binomial scores
        scores = {}
        for source_id, total in source_totals.items():
            positives = source_positives.get(source_id, 0)
            score = self._beta_binomial_score(positives, total)
            scores[source_id] = round(score, 4)

        # Step 4: Store in learning.db
        self._store_scores(scores, source_totals, source_positives)

        # Step 5: Update cache (replace wholesale under the lock so readers
        # never observe a half-updated dict)
        with self._lock:
            self._cached_scores = dict(scores)

        logger.info(
            "Source quality scores computed for %d sources: %s",
            len(scores),
            ", ".join(
                "%s=%.3f" % (s, sc) for s, sc in sorted(
                    scores.items(), key=lambda x: -x[1]
                )[:5]
            ) + ("..." if len(scores) > 5 else ""),
        )

        return scores

    def get_source_boost(
        self,
        memory: dict,
        source_scores: Optional[Dict[str, float]] = None,
    ) -> float:
        """
        Get the ranking boost for a memory based on its source quality.

        This is called by the adaptive ranker at query time for each
        candidate memory. The boost is a float in [0.0, 1.0] that
        represents how trustworthy/useful this source tends to be.

        Args:
            memory: A memory dict. Must have 'created_by' key, or will
                    fall back to DEFAULT_QUALITY_SCORE.
            source_scores: Optional pre-computed scores dict. If None,
                           uses the internal cache. Pass this to avoid
                           repeated cache reads in a tight loop.

        Returns:
            Quality score (0.0 to 1.0). 0.5 for unknown sources.
        """
        if source_scores is not None:
            scores = source_scores
        else:
            # Snapshot the cache reference under the lock; the cache is
            # always replaced wholesale (never mutated in place), so the
            # reference alone is a consistent view.
            with self._lock:
                scores = self._cached_scores

        # Extract source identifier from the memory
        source_id = self._extract_source_id(memory)

        if not source_id or source_id not in scores:
            return DEFAULT_QUALITY_SCORE

        return scores[source_id]

    def refresh(self):
        """
        Recompute all source scores.

        Convenience wrapper for compute_source_scores(). Called periodically
        by the engagement tracker or on explicit user request.
        """
        return self.compute_source_scores()

    # ======================================================================
    # Data Extraction (memory.db — READ-ONLY)
    # ======================================================================

    def _get_memory_counts_by_source(self) -> Dict[str, int]:
        """
        Count total memories per source from memory.db's `created_by` column.

        Handles the case where the `created_by` column does not exist
        (older databases pre-v2.5). In that case, all memories are
        grouped under 'unknown'.

        Returns:
            Dict mapping source_id -> total memory count.
        """
        counts: Dict[str, int] = {}

        try:
            conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
            try:
                conn.execute("PRAGMA busy_timeout=5000")
                cursor = conn.cursor()

                # Check if created_by column exists
                cursor.execute("PRAGMA table_info(memories)")
                columns = {row[1] for row in cursor.fetchall()}

                if "created_by" in columns:
                    cursor.execute("""
                        SELECT
                            COALESCE(created_by, 'unknown') AS source,
                            COUNT(*) AS cnt
                        FROM memories
                        GROUP BY source
                        ORDER BY cnt DESC
                    """)
                    for row in cursor.fetchall():
                        source_id = row[0] if row[0] else "unknown"
                        counts[source_id] = row[1]
                else:
                    # Column doesn't exist — count all as 'unknown'
                    cursor.execute("SELECT COUNT(*) FROM memories")
                    total = cursor.fetchone()[0]
                    if total > 0:
                        counts["unknown"] = total
                        logger.debug(
                            "created_by column not in memory.db — "
                            "all %d memories grouped as 'unknown'.",
                            total,
                        )
            finally:
                # Close even when a query raises (previously leaked on error)
                conn.close()

        except sqlite3.OperationalError as e:
            logger.warning("Error reading memory counts by source: %s", e)
        except Exception as e:
            logger.error("Unexpected error reading memory.db: %s", e)

        return counts

    def _get_positive_signal_counts(
        self,
        known_sources: set,
    ) -> Dict[str, int]:
        """
        Count positive feedback signals per source.

        Joins learning.db's ranking_feedback (positive signals) with
        memory.db's memories (to get created_by) on memory_id.

        This requires reading from BOTH databases. We do a two-step approach:
        1. Get all memory_ids with positive feedback from learning.db
        2. Look up their created_by from memory.db

        This avoids ATTACH DATABASE which can have locking issues.

        Returns:
            Dict mapping source_id -> positive signal count.
        """
        positives: Dict[str, int] = {}

        if self._learning_db is None:
            return positives

        # Step 1: Get memory_ids with positive feedback from learning.db
        feedback_memory_ids: Dict[int, int] = {}  # memory_id -> count

        try:
            feedback_rows = self._learning_db.get_feedback_for_training(limit=50000)
            for row in feedback_rows:
                signal_type = row.get("signal_type", "")
                if signal_type in POSITIVE_SIGNAL_TYPES:
                    mem_id = row.get("memory_id")
                    if mem_id is not None:
                        feedback_memory_ids[mem_id] = (
                            feedback_memory_ids.get(mem_id, 0) + 1
                        )
        except Exception as e:
            logger.warning("Could not read feedback from learning.db: %s", e)
            return positives

        if not feedback_memory_ids:
            return positives

        # Step 2: Look up created_by for each feedback memory_id in memory.db
        try:
            conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
            try:
                conn.execute("PRAGMA busy_timeout=5000")
                cursor = conn.cursor()

                # Check if created_by column exists
                cursor.execute("PRAGMA table_info(memories)")
                columns = {row[1] for row in cursor.fetchall()}

                if "created_by" not in columns:
                    # All positives go to 'unknown'
                    total_positives = sum(feedback_memory_ids.values())
                    if total_positives > 0:
                        positives["unknown"] = total_positives
                    return positives

                # Batch lookup in chunks to avoid SQLite variable limit
                mem_ids = list(feedback_memory_ids.keys())
                chunk_size = 500  # SQLite max variables is 999

                for i in range(0, len(mem_ids), chunk_size):
                    chunk = mem_ids[i:i + chunk_size]
                    placeholders = ",".join("?" * len(chunk))
                    cursor.execute(
                        "SELECT id, COALESCE(created_by, 'unknown') "
                        "FROM memories WHERE id IN (%s)" % placeholders,
                        chunk,
                    )
                    for row in cursor.fetchall():
                        mem_id = row[0]
                        source_id = row[1] if row[1] else "unknown"
                        count = feedback_memory_ids.get(mem_id, 0)
                        positives[source_id] = positives.get(source_id, 0) + count
            finally:
                # Guarantees close on every path, including the early return
                # and error paths (previously leaked on error).
                conn.close()

        except sqlite3.OperationalError as e:
            logger.warning("Error looking up memory sources: %s", e)
        except Exception as e:
            logger.error("Unexpected error in positive signal lookup: %s", e)

        return positives

    # ======================================================================
    # Scoring Math
    # ======================================================================

    @staticmethod
    def _beta_binomial_score(positive_count: int, total_count: int) -> float:
        """
        Compute Beta-Binomial smoothed quality score.

        Formula: (alpha + positive) / (alpha + beta + total)

        With alpha=1, beta=1 (uniform prior / Laplace smoothing):
        - 0 positives, 0 total = 0.50 (neutral)
        - 5 positives, 10 total = 0.50
        - 8 positives, 10 total = 0.75
        - 1 positive, 10 total = 0.17
        - 50 positives, 100 total = 0.50

        This converges to the true rate as evidence grows, while being
        conservative (pulled toward 0.5) with limited data.

        Args:
            positive_count: Number of positive feedback signals.
            total_count: Total number of memories from this source.

        Returns:
            Quality score in [0.0, 1.0].
        """
        score = (ALPHA + positive_count) / (ALPHA + BETA + total_count)
        return max(0.0, min(1.0, score))

    # ======================================================================
    # Storage (learning.db)
    # ======================================================================

    def _store_scores(
        self,
        scores: Dict[str, float],
        totals: Dict[str, int],
        positives: Dict[str, int],
    ):
        """
        Store computed scores in learning.db's source_quality table.

        Uses LearningDB.update_source_quality() which handles UPSERT
        internally with its own write lock.
        """
        if self._learning_db is None:
            logger.debug(
                "LearningDB unavailable — scores computed but not stored."
            )
            return

        stored = 0
        for source_id, score in scores.items():
            try:
                self._learning_db.update_source_quality(
                    source_id=source_id,
                    positive_signals=positives.get(source_id, 0),
                    total_memories=totals.get(source_id, 0),
                )
                stored += 1
            except Exception as e:
                # One bad source must not abort persistence of the rest.
                logger.error(
                    "Failed to store score for source '%s': %s",
                    source_id, e,
                )

        logger.debug("Stored %d/%d source quality scores.", stored, len(scores))

    def _load_cached_scores(self):
        """
        Load source quality scores from learning.db into the in-memory cache.

        Called on initialization so that get_source_boost() works immediately
        without requiring a compute_source_scores() call first.
        """
        if self._learning_db is None:
            return

        try:
            db_scores = self._learning_db.get_source_scores()
            with self._lock:
                self._cached_scores = dict(db_scores)
            if db_scores:
                logger.debug(
                    "Loaded %d cached source scores from learning.db.",
                    len(db_scores),
                )
        except Exception as e:
            logger.debug("Could not load cached source scores: %s", e)

    # ======================================================================
    # Utility Methods
    # ======================================================================

    @staticmethod
    def _extract_source_id(memory: dict) -> Optional[str]:
        """
        Extract the source identifier from a memory dict.

        Checks 'created_by' first (set by ProvenanceTracker), then
        falls back to 'source_protocol' if available. A generic 'user'
        created_by is deliberately deferred until after the protocol
        fallback, so a more specific protocol wins when both exist.

        Args:
            memory: A memory dict (from search results or direct DB query).

        Returns:
            Source identifier string, or None if not available.
        """
        # Primary: created_by (e.g., 'mcp:claude-desktop', 'cli:terminal')
        source = memory.get("created_by")
        if source and source != "user":
            return source

        # Fallback: source_protocol (e.g., 'mcp', 'cli', 'rest')
        protocol = memory.get("source_protocol")
        if protocol:
            return protocol

        # Last resort: the 'user' default from provenance_tracker
        if source == "user":
            return "user"

        return None

    def get_all_scores(self) -> Dict[str, dict]:
        """
        Get detailed quality information for all tracked sources.

        Returns full details including positive signals, total memories,
        and computed score for diagnostic/dashboard display.

        Returns:
            Dict mapping source_id -> {quality_score, positive_signals,
            total_memories, last_updated}
        """
        if self._learning_db is None:
            # Return from cache with minimal info
            with self._lock:
                return {
                    source_id: {
                        "quality_score": score,
                        "positive_signals": None,
                        "total_memories": None,
                        "last_updated": None,
                    }
                    for source_id, score in self._cached_scores.items()
                }

        try:
            # NOTE(review): reaches into LearningDB's private connection
            # accessor; assumes it returns connections with a Row factory
            # (rows indexed by column name) — confirm against learning_db.
            conn = self._learning_db._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT source_id, quality_score, positive_signals,
                           total_memories, last_updated
                    FROM source_quality
                    ORDER BY quality_score DESC
                """)
                results = {}
                for row in cursor.fetchall():
                    results[row["source_id"]] = {
                        "quality_score": row["quality_score"],
                        "positive_signals": row["positive_signals"],
                        "total_memories": row["total_memories"],
                        "last_updated": row["last_updated"],
                    }
                return results
            finally:
                # Close even when the query raises (previously leaked on error)
                conn.close()
        except Exception as e:
            logger.error("Failed to read detailed source scores: %s", e)
            return {}

    def get_source_summary(self) -> str:
        """
        Get a human-readable summary of source quality scores.

        Returns:
            Formatted multi-line string for diagnostics or dashboard.
        """
        all_scores = self.get_all_scores()

        if not all_scores:
            return "No source quality data available. Run refresh() first."

        lines = ["Source Quality Scores:", ""]
        lines.append(
            "  %-30s %8s %8s %8s"
            % ("Source", "Score", "Positive", "Total")
        )
        lines.append("  " + "-" * 62)

        for source_id, data in sorted(
            all_scores.items(), key=lambda x: -x[1]["quality_score"]
        ):
            pos = data["positive_signals"]
            tot = data["total_memories"]
            lines.append(
                "  %-30s %8.3f %8s %8s"
                % (
                    source_id,
                    data["quality_score"],
                    str(pos) if pos is not None else "?",
                    str(tot) if tot is not None else "?",
                )
            )

        return "\n".join(lines)
# ===========================================================================
# CLI Interface
# ===========================================================================

if __name__ == "__main__":
    import sys as _sys

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    if len(_sys.argv) < 2:
        print("SourceQualityScorer — Per-Source Quality Learning")
        print()
        print("Usage:")
        print("  python source_quality_scorer.py compute   # Compute all source scores")
        print("  python source_quality_scorer.py show      # Show current scores")
        print("  python source_quality_scorer.py summary   # Human-readable summary")
        _sys.exit(0)

    command = _sys.argv[1]

    # Construct the scorer only after the usage/exit path, so a bare
    # invocation does not touch memory.db / learning.db.
    scorer = SourceQualityScorer()

    if command == "compute":
        scores = scorer.compute_source_scores()
        if scores:
            print("\nComputed quality scores for %d sources:" % len(scores))
            for source_id, score in sorted(scores.items(), key=lambda x: -x[1]):
                # 20-char ASCII bar proportional to the score
                bar = "#" * int(score * 20)
                print("  %-30s %.3f [%-20s]" % (source_id, score, bar))
        else:
            print("No sources found. Add memories with provenance tracking first.")

    elif command == "show":
        all_scores = scorer.get_all_scores()
        if all_scores:
            print("\nStored source quality scores:")
            for source_id, data in sorted(
                all_scores.items(), key=lambda x: -x[1]["quality_score"]
            ):
                print(
                    "  %-30s score=%.3f positives=%s total=%s"
                    % (
                        source_id,
                        data["quality_score"],
                        data["positive_signals"],
                        data["total_memories"],
                    )
                )
        else:
            print("No scores stored. Run 'compute' first.")

    elif command == "summary":
        print(scorer.get_source_summary())

    else:
        print("Unknown command: %s" % command)
        _sys.exit(1)