superlocalmemory 2.6.0 → 2.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +122 -1806
- package/README.md +142 -410
- package/docs/ACCESSIBILITY.md +291 -0
- package/docs/FRAMEWORK-INTEGRATIONS.md +300 -0
- package/package.json +1 -1
- package/src/learning/__init__.py +201 -0
- package/src/learning/adaptive_ranker.py +826 -0
- package/src/learning/cross_project_aggregator.py +866 -0
- package/src/learning/engagement_tracker.py +638 -0
- package/src/learning/feature_extractor.py +461 -0
- package/src/learning/feedback_collector.py +690 -0
- package/src/learning/learning_db.py +842 -0
- package/src/learning/project_context_manager.py +582 -0
- package/src/learning/source_quality_scorer.py +685 -0
- package/src/learning/workflow_pattern_miner.py +665 -0
- package/ui/index.html +346 -13
- package/ui/js/clusters.js +90 -1
- package/ui/js/graph-core.js +445 -0
- package/ui/js/graph-cytoscape-monolithic-backup.js +1168 -0
- package/ui/js/graph-cytoscape.js +1168 -0
- package/ui/js/graph-d3-backup.js +32 -0
- package/ui/js/graph-filters.js +220 -0
- package/ui/js/graph-interactions.js +354 -0
- package/ui/js/graph-ui.js +214 -0
- package/ui/js/memories.js +52 -0
- package/ui/js/modal.js +104 -1
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
SuperLocalMemory V2 - Feedback Collector (v2.7)
Copyright (c) 2026 Varun Pratap Bhardwaj
Licensed under MIT License

Repository: https://github.com/varun369/SuperLocalMemoryV2
Author: Varun Pratap Bhardwaj (Solution Architect)

NOTICE: This software is protected by MIT License.
Attribution must be preserved in all copies or derivatives.

FeedbackCollector -- Multi-channel feedback collection for the LightGBM re-ranker.

Collects implicit and explicit relevance signals from three channels:

1. MCP -- ``memory_used`` tool with usefulness level (high/medium/low).
2. CLI -- ``slm useful <id> [<id>...]`` marks memories as helpful.
3. Dashboard -- click events with optional dwell-time tracking.

Additionally tracks *passive decay*: memories that are repeatedly returned
by recall but never receive a positive signal are assigned a 0.0 (negative)
feedback entry, teaching the re-ranker to demote them.

Privacy:
- Full query text is NEVER stored.
- Queries are hashed to SHA-256[:16] for grouping.
- Top 3 keywords are extracted for loose thematic grouping only.

All data is written to the ``ranking_feedback`` table in learning.db via
the shared LearningDB instance.

Research backing:
- ADPMF (IPM 2024): privacy-preserving feedback for recommendation.
- FCS LREC 2024: cold-start feedback bootstrapping.
"""
# NOTE: the file previously held TWO adjacent triple-quoted strings; only the
# first became the module docstring and the second was a dead string
# statement.  They are merged above so help() shows the full description.

import hashlib
import logging
import re
import threading
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List, Optional

logger = logging.getLogger("superlocalmemory.learning.feedback")
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
# Stopwords for keyword extraction (small, curated list -- no NLTK needed).
# Consumed by FeedbackCollector._extract_keywords to drop low-signal tokens.
# ---------------------------------------------------------------------------
_STOPWORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "shall",
    "should", "may", "might", "must", "can", "could",
    "i", "me", "my", "we", "our", "you", "your", "he", "she", "it",
    "they", "them", "their", "its", "this", "that", "these", "those",
    "what", "which", "who", "whom", "how", "when", "where", "why",
    "not", "no", "nor", "but", "or", "and", "if", "then", "so",
    "of", "in", "on", "at", "to", "for", "with", "from", "by",
    "about", "into", "through", "during", "before", "after",
    "above", "below", "between", "out", "off", "up", "down",
    "all", "each", "every", "both", "few", "more", "most", "some", "any",
    "such", "only", "same", "than", "too", "very",
    "just", "also", "now", "here", "there",
})

# Regex to split on non-alphanumeric (keeps words and numbers)
_WORD_SPLIT = re.compile(r"[^a-zA-Z0-9]+")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class FeedbackCollector:
    """
    Collects multi-channel relevance feedback for the adaptive re-ranker.

    Each signal maps to a numeric value used as a training label:

        mcp_used_high   = 1.0 (strong positive)
        mcp_used_medium = 0.7
        mcp_used_low    = 0.4
        cli_useful      = 0.9
        dashboard_click = 0.8
        passive_decay   = 0.0 (negative signal)

    Usage:
        from learning.learning_db import LearningDB
        collector = FeedbackCollector(learning_db=LearningDB())

        # MCP channel
        collector.record_memory_used(42, "how to deploy FastAPI", usefulness="high")

        # CLI channel
        collector.record_cli_useful([42, 87], "deploy fastapi")

        # Dashboard channel
        collector.record_dashboard_click(42, "deploy fastapi", dwell_time=12.5)

        # Passive decay (call periodically)
        collector.record_recall_results("deploy fastapi", [42, 87, 91])
        collector.compute_passive_decay()
    """

    # Signal type -> numeric label for re-ranker training.
    # These values become the regression targets the re-ranker learns from.
    SIGNAL_VALUES: Dict[str, float] = {
        "mcp_used_high": 1.0,
        "mcp_used_medium": 0.7,
        "mcp_used_low": 0.4,
        "cli_useful": 0.9,
        "dashboard_click": 0.8,
        "passive_decay": 0.0,
    }

    # Usefulness string (as accepted by record_memory_used) -> signal type.
    _USEFULNESS_MAP: Dict[str, str] = {
        "high": "mcp_used_high",
        "medium": "mcp_used_medium",
        "low": "mcp_used_low",
    }
|
|
120
|
+
|
|
121
|
+
def __init__(self, learning_db: Optional[Any] = None):
|
|
122
|
+
"""
|
|
123
|
+
Args:
|
|
124
|
+
learning_db: LearningDB instance for persisting feedback.
|
|
125
|
+
If None, feedback is logged but not stored.
|
|
126
|
+
"""
|
|
127
|
+
self.learning_db = learning_db
|
|
128
|
+
|
|
129
|
+
# In-memory buffer for passive decay tracking.
|
|
130
|
+
# Structure: {query_hash: {memory_id: times_returned_count}}
|
|
131
|
+
# Protected by a lock since MCP/CLI/API may call concurrently.
|
|
132
|
+
self._recall_buffer: Dict[str, Dict[int, int]] = {}
|
|
133
|
+
self._recall_buffer_lock = threading.Lock()
|
|
134
|
+
|
|
135
|
+
# Counter: total recall operations tracked (for decay threshold)
|
|
136
|
+
self._recall_count: int = 0
|
|
137
|
+
|
|
138
|
+
# ======================================================================
|
|
139
|
+
# Channel 1: MCP -- memory_used tool
|
|
140
|
+
# ======================================================================
|
|
141
|
+
|
|
142
|
+
def record_memory_used(
|
|
143
|
+
self,
|
|
144
|
+
memory_id: int,
|
|
145
|
+
query: str,
|
|
146
|
+
usefulness: str = "high",
|
|
147
|
+
source_tool: Optional[str] = None,
|
|
148
|
+
rank_position: Optional[int] = None,
|
|
149
|
+
) -> Optional[int]:
|
|
150
|
+
"""
|
|
151
|
+
Record that a memory was explicitly used after an MCP recall.
|
|
152
|
+
|
|
153
|
+
Called by the ``memory_used`` MCP tool. This is the highest-quality
|
|
154
|
+
feedback signal because the AI agent explicitly indicates it found
|
|
155
|
+
the memory useful.
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
memory_id: ID of the used memory in memory.db.
|
|
159
|
+
query: The original recall query (hashed, not stored raw).
|
|
160
|
+
usefulness: "high", "medium", or "low".
|
|
161
|
+
source_tool: Which tool originated the query (e.g. 'claude-desktop').
|
|
162
|
+
rank_position: Position of the memory in the recall results (1-based).
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
Row ID of the feedback record, or None on error.
|
|
166
|
+
"""
|
|
167
|
+
if not query:
|
|
168
|
+
logger.warning("record_memory_used called with empty query")
|
|
169
|
+
return None
|
|
170
|
+
|
|
171
|
+
# Validate usefulness level
|
|
172
|
+
usefulness = usefulness.lower().strip()
|
|
173
|
+
if usefulness not in self._USEFULNESS_MAP:
|
|
174
|
+
logger.warning(
|
|
175
|
+
"Invalid usefulness level '%s', defaulting to 'high'",
|
|
176
|
+
usefulness,
|
|
177
|
+
)
|
|
178
|
+
usefulness = "high"
|
|
179
|
+
|
|
180
|
+
signal_type = self._USEFULNESS_MAP[usefulness]
|
|
181
|
+
signal_value = self.SIGNAL_VALUES[signal_type]
|
|
182
|
+
query_hash = self._hash_query(query)
|
|
183
|
+
keywords = self._extract_keywords(query)
|
|
184
|
+
|
|
185
|
+
return self._store_feedback(
|
|
186
|
+
query_hash=query_hash,
|
|
187
|
+
query_keywords=keywords,
|
|
188
|
+
memory_id=memory_id,
|
|
189
|
+
signal_type=signal_type,
|
|
190
|
+
signal_value=signal_value,
|
|
191
|
+
channel="mcp",
|
|
192
|
+
source_tool=source_tool,
|
|
193
|
+
rank_position=rank_position,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# ======================================================================
|
|
197
|
+
# Channel 2: CLI -- slm useful <id> [<id>...]
|
|
198
|
+
# ======================================================================
|
|
199
|
+
|
|
200
|
+
def record_cli_useful(
|
|
201
|
+
self,
|
|
202
|
+
memory_ids: List[int],
|
|
203
|
+
query: str,
|
|
204
|
+
) -> List[Optional[int]]:
|
|
205
|
+
"""
|
|
206
|
+
Record positive feedback from the CLI ``slm useful`` command.
|
|
207
|
+
|
|
208
|
+
Stores a positive signal for each memory_id. The CLI typically
|
|
209
|
+
captures the most recent recall query, so all IDs share the same
|
|
210
|
+
query hash.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
memory_ids: List of memory IDs the user marked as useful.
|
|
214
|
+
query: The recall query that surfaced these memories.
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
List of row IDs (one per memory_id), or None entries on error.
|
|
218
|
+
"""
|
|
219
|
+
if not query:
|
|
220
|
+
logger.warning("record_cli_useful called with empty query")
|
|
221
|
+
return [None] * len(memory_ids)
|
|
222
|
+
|
|
223
|
+
query_hash = self._hash_query(query)
|
|
224
|
+
keywords = self._extract_keywords(query)
|
|
225
|
+
signal_value = self.SIGNAL_VALUES["cli_useful"]
|
|
226
|
+
row_ids: List[Optional[int]] = []
|
|
227
|
+
|
|
228
|
+
for mid in memory_ids:
|
|
229
|
+
row_id = self._store_feedback(
|
|
230
|
+
query_hash=query_hash,
|
|
231
|
+
query_keywords=keywords,
|
|
232
|
+
memory_id=mid,
|
|
233
|
+
signal_type="cli_useful",
|
|
234
|
+
signal_value=signal_value,
|
|
235
|
+
channel="cli",
|
|
236
|
+
)
|
|
237
|
+
row_ids.append(row_id)
|
|
238
|
+
|
|
239
|
+
logger.info(
|
|
240
|
+
"CLI useful: %d memories marked for query_hash=%s",
|
|
241
|
+
len(memory_ids),
|
|
242
|
+
query_hash,
|
|
243
|
+
)
|
|
244
|
+
return row_ids
|
|
245
|
+
|
|
246
|
+
# ======================================================================
|
|
247
|
+
# Channel 3: Dashboard -- click events
|
|
248
|
+
# ======================================================================
|
|
249
|
+
|
|
250
|
+
def record_dashboard_click(
|
|
251
|
+
self,
|
|
252
|
+
memory_id: int,
|
|
253
|
+
query: str,
|
|
254
|
+
dwell_time: Optional[float] = None,
|
|
255
|
+
) -> Optional[int]:
|
|
256
|
+
"""
|
|
257
|
+
Record a dashboard click on a memory card in search results.
|
|
258
|
+
|
|
259
|
+
Optionally includes dwell time (seconds the user spent viewing
|
|
260
|
+
the expanded memory). Longer dwell times indicate higher relevance
|
|
261
|
+
but this is captured as metadata, not reflected in signal_value
|
|
262
|
+
(the re-ranker can learn from dwell_time as a feature).
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
memory_id: ID of the clicked memory.
|
|
266
|
+
query: The search query active when the click happened.
|
|
267
|
+
dwell_time: Seconds spent viewing the memory (optional).
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
Row ID of the feedback record, or None on error.
|
|
271
|
+
"""
|
|
272
|
+
if not query:
|
|
273
|
+
logger.warning("record_dashboard_click called with empty query")
|
|
274
|
+
return None
|
|
275
|
+
|
|
276
|
+
query_hash = self._hash_query(query)
|
|
277
|
+
keywords = self._extract_keywords(query)
|
|
278
|
+
signal_value = self.SIGNAL_VALUES["dashboard_click"]
|
|
279
|
+
|
|
280
|
+
return self._store_feedback(
|
|
281
|
+
query_hash=query_hash,
|
|
282
|
+
query_keywords=keywords,
|
|
283
|
+
memory_id=memory_id,
|
|
284
|
+
signal_type="dashboard_click",
|
|
285
|
+
signal_value=signal_value,
|
|
286
|
+
channel="dashboard",
|
|
287
|
+
dwell_time=dwell_time,
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
# ======================================================================
|
|
291
|
+
# Passive Decay Tracking
|
|
292
|
+
# ======================================================================
|
|
293
|
+
|
|
294
|
+
def record_recall_results(
|
|
295
|
+
self,
|
|
296
|
+
query: str,
|
|
297
|
+
returned_ids: List[int],
|
|
298
|
+
) -> None:
|
|
299
|
+
"""
|
|
300
|
+
Track which memories were returned in a recall operation.
|
|
301
|
+
|
|
302
|
+
This does NOT create feedback records immediately. Instead, it
|
|
303
|
+
populates an in-memory buffer. When ``compute_passive_decay()``
|
|
304
|
+
is called (periodically), memories that were returned in 5+
|
|
305
|
+
distinct queries but never received a positive signal get a
|
|
306
|
+
passive_decay (0.0) feedback entry.
|
|
307
|
+
|
|
308
|
+
Args:
|
|
309
|
+
query: The recall query (hashed for grouping).
|
|
310
|
+
returned_ids: Memory IDs returned by the recall.
|
|
311
|
+
"""
|
|
312
|
+
if not query or not returned_ids:
|
|
313
|
+
return
|
|
314
|
+
|
|
315
|
+
query_hash = self._hash_query(query)
|
|
316
|
+
|
|
317
|
+
with self._recall_buffer_lock:
|
|
318
|
+
if query_hash not in self._recall_buffer:
|
|
319
|
+
self._recall_buffer[query_hash] = {}
|
|
320
|
+
|
|
321
|
+
for mid in returned_ids:
|
|
322
|
+
self._recall_buffer[query_hash][mid] = (
|
|
323
|
+
self._recall_buffer[query_hash].get(mid, 0) + 1
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
self._recall_count += 1
|
|
327
|
+
|
|
328
|
+
def compute_passive_decay(self, threshold: int = 10) -> int:
|
|
329
|
+
"""
|
|
330
|
+
Emit passive decay signals for memories that appear in results
|
|
331
|
+
but are never explicitly marked as useful.
|
|
332
|
+
|
|
333
|
+
Algorithm:
|
|
334
|
+
1. Only runs after *threshold* recall operations are tracked.
|
|
335
|
+
2. For each memory that appeared in 5+ distinct query hashes:
|
|
336
|
+
a. Check if it has ANY positive feedback in ranking_feedback.
|
|
337
|
+
b. If not, insert a passive_decay signal (value=0.0).
|
|
338
|
+
3. Clear the recall buffer after processing.
|
|
339
|
+
|
|
340
|
+
This teaches the re-ranker: "this memory keeps showing up but
|
|
341
|
+
nobody ever finds it useful -- demote it."
|
|
342
|
+
|
|
343
|
+
Args:
|
|
344
|
+
threshold: Minimum number of tracked recalls before running.
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
Number of passive decay signals emitted.
|
|
348
|
+
"""
|
|
349
|
+
with self._recall_buffer_lock:
|
|
350
|
+
if self._recall_count < threshold:
|
|
351
|
+
logger.debug(
|
|
352
|
+
"Passive decay skipped: %d/%d recalls tracked",
|
|
353
|
+
self._recall_count,
|
|
354
|
+
threshold,
|
|
355
|
+
)
|
|
356
|
+
return 0
|
|
357
|
+
|
|
358
|
+
# Build a map: memory_id -> set of distinct query_hashes it appeared in
|
|
359
|
+
memory_query_counts: Dict[int, int] = {}
|
|
360
|
+
for query_hash, mem_counts in self._recall_buffer.items():
|
|
361
|
+
for mid in mem_counts:
|
|
362
|
+
memory_query_counts[mid] = memory_query_counts.get(mid, 0) + 1
|
|
363
|
+
|
|
364
|
+
# Find candidates: appeared in 5+ distinct queries
|
|
365
|
+
candidates = [
|
|
366
|
+
mid for mid, qcount in memory_query_counts.items()
|
|
367
|
+
if qcount >= 5
|
|
368
|
+
]
|
|
369
|
+
|
|
370
|
+
# Snapshot and clear buffer
|
|
371
|
+
buffer_snapshot = dict(self._recall_buffer)
|
|
372
|
+
self._recall_buffer.clear()
|
|
373
|
+
self._recall_count = 0
|
|
374
|
+
|
|
375
|
+
if not candidates:
|
|
376
|
+
logger.debug("No passive decay candidates found")
|
|
377
|
+
return 0
|
|
378
|
+
|
|
379
|
+
# Check which candidates have positive feedback already
|
|
380
|
+
decay_count = 0
|
|
381
|
+
for mid in candidates:
|
|
382
|
+
if self._has_positive_feedback(mid):
|
|
383
|
+
continue
|
|
384
|
+
|
|
385
|
+
# Emit passive decay signal. Use a synthetic query hash
|
|
386
|
+
# derived from the memory_id to group decay signals.
|
|
387
|
+
decay_hash = self._hash_query(f"__passive_decay__:{mid}")
|
|
388
|
+
self._store_feedback(
|
|
389
|
+
query_hash=decay_hash,
|
|
390
|
+
query_keywords=None,
|
|
391
|
+
memory_id=mid,
|
|
392
|
+
signal_type="passive_decay",
|
|
393
|
+
signal_value=self.SIGNAL_VALUES["passive_decay"],
|
|
394
|
+
channel="system",
|
|
395
|
+
)
|
|
396
|
+
decay_count += 1
|
|
397
|
+
|
|
398
|
+
if decay_count > 0:
|
|
399
|
+
logger.info(
|
|
400
|
+
"Passive decay: emitted %d signals for %d candidates",
|
|
401
|
+
decay_count,
|
|
402
|
+
len(candidates),
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
return decay_count
|
|
406
|
+
|
|
407
|
+
# ======================================================================
|
|
408
|
+
# Summary & Diagnostics
|
|
409
|
+
# ======================================================================
|
|
410
|
+
|
|
411
|
+
def get_feedback_summary(self) -> dict:
|
|
412
|
+
"""
|
|
413
|
+
Return summary statistics for display in CLI or dashboard.
|
|
414
|
+
|
|
415
|
+
Returns:
|
|
416
|
+
{
|
|
417
|
+
'total_signals': 142,
|
|
418
|
+
'unique_queries': 38,
|
|
419
|
+
'by_channel': {'mcp': 80, 'cli': 35, 'dashboard': 20, 'system': 7},
|
|
420
|
+
'by_type': {'mcp_used_high': 50, 'cli_useful': 35, ...},
|
|
421
|
+
'passive_decay_pending': 12,
|
|
422
|
+
'recall_buffer_size': 45,
|
|
423
|
+
}
|
|
424
|
+
"""
|
|
425
|
+
summary: Dict[str, Any] = {
|
|
426
|
+
"total_signals": 0,
|
|
427
|
+
"unique_queries": 0,
|
|
428
|
+
"by_channel": {},
|
|
429
|
+
"by_type": {},
|
|
430
|
+
"passive_decay_pending": 0,
|
|
431
|
+
"recall_buffer_size": 0,
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
# Buffer stats (always available, even without DB)
|
|
435
|
+
with self._recall_buffer_lock:
|
|
436
|
+
summary["recall_buffer_size"] = self._recall_count
|
|
437
|
+
# Count memories that would be decay candidates
|
|
438
|
+
memory_query_counts: Dict[int, int] = {}
|
|
439
|
+
for _qh, mem_counts in self._recall_buffer.items():
|
|
440
|
+
for mid in mem_counts:
|
|
441
|
+
memory_query_counts[mid] = memory_query_counts.get(mid, 0) + 1
|
|
442
|
+
summary["passive_decay_pending"] = sum(
|
|
443
|
+
1 for qcount in memory_query_counts.values() if qcount >= 5
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
if self.learning_db is None:
|
|
447
|
+
summary["error"] = "No learning database connected"
|
|
448
|
+
return summary
|
|
449
|
+
|
|
450
|
+
try:
|
|
451
|
+
conn = self.learning_db._get_connection()
|
|
452
|
+
try:
|
|
453
|
+
cursor = conn.cursor()
|
|
454
|
+
|
|
455
|
+
# Total signals
|
|
456
|
+
cursor.execute("SELECT COUNT(*) FROM ranking_feedback")
|
|
457
|
+
summary["total_signals"] = cursor.fetchone()[0]
|
|
458
|
+
|
|
459
|
+
# Unique queries
|
|
460
|
+
cursor.execute(
|
|
461
|
+
"SELECT COUNT(DISTINCT query_hash) FROM ranking_feedback"
|
|
462
|
+
)
|
|
463
|
+
summary["unique_queries"] = cursor.fetchone()[0]
|
|
464
|
+
|
|
465
|
+
# By channel
|
|
466
|
+
cursor.execute(
|
|
467
|
+
"SELECT channel, COUNT(*) as cnt "
|
|
468
|
+
"FROM ranking_feedback GROUP BY channel"
|
|
469
|
+
)
|
|
470
|
+
summary["by_channel"] = {
|
|
471
|
+
row["channel"]: row["cnt"] for row in cursor.fetchall()
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
# By signal type
|
|
475
|
+
cursor.execute(
|
|
476
|
+
"SELECT signal_type, COUNT(*) as cnt "
|
|
477
|
+
"FROM ranking_feedback GROUP BY signal_type"
|
|
478
|
+
)
|
|
479
|
+
summary["by_type"] = {
|
|
480
|
+
row["signal_type"]: row["cnt"] for row in cursor.fetchall()
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
finally:
|
|
484
|
+
conn.close()
|
|
485
|
+
|
|
486
|
+
except Exception as e:
|
|
487
|
+
logger.error("Failed to get feedback summary: %s", e)
|
|
488
|
+
summary["error"] = str(e)
|
|
489
|
+
|
|
490
|
+
return summary
|
|
491
|
+
|
|
492
|
+
# ======================================================================
|
|
493
|
+
# Internal helpers
|
|
494
|
+
# ======================================================================
|
|
495
|
+
|
|
496
|
+
def _hash_query(self, query: str) -> str:
|
|
497
|
+
"""
|
|
498
|
+
Privacy-preserving query hash.
|
|
499
|
+
|
|
500
|
+
Returns the first 16 hex characters of the SHA-256 digest.
|
|
501
|
+
This is sufficient for grouping without being reversible.
|
|
502
|
+
"""
|
|
503
|
+
return hashlib.sha256(query.encode("utf-8")).hexdigest()[:16]
|
|
504
|
+
|
|
505
|
+
def _extract_keywords(self, query: str, top_n: int = 3) -> str:
|
|
506
|
+
"""
|
|
507
|
+
Extract the top N meaningful words from a query string.
|
|
508
|
+
|
|
509
|
+
Removes stopwords and short tokens (< 2 chars), then returns
|
|
510
|
+
the most frequent remaining words as a comma-separated string.
|
|
511
|
+
|
|
512
|
+
Args:
|
|
513
|
+
query: Raw query text.
|
|
514
|
+
top_n: Number of keywords to extract.
|
|
515
|
+
|
|
516
|
+
Returns:
|
|
517
|
+
Comma-separated keyword string (e.g. "deploy,fastapi,docker").
|
|
518
|
+
Empty string if no keywords extracted.
|
|
519
|
+
"""
|
|
520
|
+
if not query:
|
|
521
|
+
return ""
|
|
522
|
+
|
|
523
|
+
words = _WORD_SPLIT.split(query.lower())
|
|
524
|
+
# Filter stopwords and short tokens
|
|
525
|
+
meaningful = [w for w in words if w and len(w) >= 2 and w not in _STOPWORDS]
|
|
526
|
+
|
|
527
|
+
if not meaningful:
|
|
528
|
+
return ""
|
|
529
|
+
|
|
530
|
+
# Most common N words (preserves order of first occurrence for ties)
|
|
531
|
+
counts = Counter(meaningful)
|
|
532
|
+
top_words = [word for word, _count in counts.most_common(top_n)]
|
|
533
|
+
return ",".join(top_words)
|
|
534
|
+
|
|
535
|
+
def _store_feedback(
|
|
536
|
+
self,
|
|
537
|
+
query_hash: str,
|
|
538
|
+
query_keywords: Optional[str],
|
|
539
|
+
memory_id: int,
|
|
540
|
+
signal_type: str,
|
|
541
|
+
signal_value: float,
|
|
542
|
+
channel: str,
|
|
543
|
+
source_tool: Optional[str] = None,
|
|
544
|
+
rank_position: Optional[int] = None,
|
|
545
|
+
dwell_time: Optional[float] = None,
|
|
546
|
+
) -> Optional[int]:
|
|
547
|
+
"""
|
|
548
|
+
Persist a single feedback record via LearningDB.
|
|
549
|
+
|
|
550
|
+
Also updates the daily engagement metric counter.
|
|
551
|
+
|
|
552
|
+
Returns:
|
|
553
|
+
Row ID on success, None on failure or if no DB is available.
|
|
554
|
+
"""
|
|
555
|
+
if self.learning_db is None:
|
|
556
|
+
logger.debug(
|
|
557
|
+
"Feedback not stored (no DB): memory=%d, type=%s",
|
|
558
|
+
memory_id,
|
|
559
|
+
signal_type,
|
|
560
|
+
)
|
|
561
|
+
return None
|
|
562
|
+
|
|
563
|
+
try:
|
|
564
|
+
row_id = self.learning_db.store_feedback(
|
|
565
|
+
query_hash=query_hash,
|
|
566
|
+
memory_id=memory_id,
|
|
567
|
+
signal_type=signal_type,
|
|
568
|
+
signal_value=signal_value,
|
|
569
|
+
channel=channel,
|
|
570
|
+
query_keywords=query_keywords,
|
|
571
|
+
rank_position=rank_position,
|
|
572
|
+
source_tool=source_tool,
|
|
573
|
+
dwell_time=dwell_time,
|
|
574
|
+
)
|
|
575
|
+
|
|
576
|
+
# Update daily engagement counter (best-effort)
|
|
577
|
+
try:
|
|
578
|
+
self.learning_db.increment_engagement(
|
|
579
|
+
"feedback_signals",
|
|
580
|
+
count=1,
|
|
581
|
+
source=source_tool,
|
|
582
|
+
)
|
|
583
|
+
except Exception:
|
|
584
|
+
pass
|
|
585
|
+
|
|
586
|
+
return row_id
|
|
587
|
+
|
|
588
|
+
except Exception as e:
|
|
589
|
+
logger.error(
|
|
590
|
+
"Failed to store feedback for memory %d: %s",
|
|
591
|
+
memory_id,
|
|
592
|
+
e,
|
|
593
|
+
)
|
|
594
|
+
return None
|
|
595
|
+
|
|
596
|
+
def _has_positive_feedback(self, memory_id: int) -> bool:
|
|
597
|
+
"""
|
|
598
|
+
Check if a memory has ANY positive feedback in learning.db.
|
|
599
|
+
|
|
600
|
+
Positive = signal_value > 0.0 (anything above passive_decay).
|
|
601
|
+
Used by passive decay to avoid penalising memories that were
|
|
602
|
+
actually found useful at some point.
|
|
603
|
+
|
|
604
|
+
Args:
|
|
605
|
+
memory_id: Memory ID to check.
|
|
606
|
+
|
|
607
|
+
Returns:
|
|
608
|
+
True if at least one positive feedback record exists.
|
|
609
|
+
"""
|
|
610
|
+
if self.learning_db is None:
|
|
611
|
+
return False
|
|
612
|
+
|
|
613
|
+
try:
|
|
614
|
+
conn = self.learning_db._get_connection()
|
|
615
|
+
try:
|
|
616
|
+
cursor = conn.cursor()
|
|
617
|
+
cursor.execute(
|
|
618
|
+
"""
|
|
619
|
+
SELECT COUNT(*) FROM ranking_feedback
|
|
620
|
+
WHERE memory_id = ? AND signal_value > 0.0
|
|
621
|
+
""",
|
|
622
|
+
(memory_id,),
|
|
623
|
+
)
|
|
624
|
+
count = cursor.fetchone()[0]
|
|
625
|
+
return count > 0
|
|
626
|
+
finally:
|
|
627
|
+
conn.close()
|
|
628
|
+
except Exception as e:
|
|
629
|
+
logger.error(
|
|
630
|
+
"Failed to check positive feedback for memory %d: %s",
|
|
631
|
+
memory_id,
|
|
632
|
+
e,
|
|
633
|
+
)
|
|
634
|
+
# If we can't check, assume positive to avoid false penalisation
|
|
635
|
+
return True
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
# ======================================================================
|
|
639
|
+
# Standalone execution (for diagnostics: python3 feedback_collector.py)
|
|
640
|
+
# ======================================================================
|
|
641
|
+
|
|
642
|
+
def main():
    """Print a feedback summary to stdout (diagnostic entry point).

    Run as ``python3 feedback_collector.py``.  Falls back to log-only
    mode (no stored stats) when LearningDB cannot be imported.
    """
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    )

    # learning_db.py lives next to this file; make it importable when the
    # script is run directly rather than as part of the package.
    learning_db = None
    try:
        sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
        from learning_db import LearningDB
        learning_db = LearningDB()
    except ImportError:
        logger.warning("LearningDB not available")

    collector = FeedbackCollector(learning_db=learning_db)
    summary = collector.get_feedback_summary()

    # NOTE: f-prefixes on literals without placeholders were removed;
    # the printed output is byte-identical.
    print(f"\n{'='*60}")
    print(" Feedback Summary")
    print(f"{'='*60}")
    print(f" Total signals: {summary.get('total_signals', 0)}")
    print(f" Unique queries: {summary.get('unique_queries', 0)}")
    print(f" Recall buffer size: {summary.get('recall_buffer_size', 0)}")
    print(f" Decay pending: {summary.get('passive_decay_pending', 0)}")

    by_channel = summary.get("by_channel", {})
    if by_channel:
        print("\n By Channel:")
        for ch, cnt in sorted(by_channel.items()):
            print(f" {ch:>12s}: {cnt}")

    by_type = summary.get("by_type", {})
    if by_type:
        print("\n By Signal Type:")
        for st, cnt in sorted(by_type.items()):
            print(f" {st:>18s}: {cnt}")

    if "error" in summary:
        print(f"\n Error: {summary['error']}")

    print()


if __name__ == "__main__":
    main()
|