superlocalmemory 3.3.11 → 3.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pyproject.toml +2 -3
- package/src/superlocalmemory/core/config.py +9 -6
- package/src/superlocalmemory/core/embedding_worker.py +5 -1
- package/src/superlocalmemory/core/embeddings.py +3 -1
- package/src/superlocalmemory/core/engine.py +14 -0
- package/src/superlocalmemory/core/engine_wiring.py +16 -1
- package/src/superlocalmemory/core/maintenance_scheduler.py +94 -0
- package/src/superlocalmemory/core/recall_pipeline.py +24 -0
- package/src/superlocalmemory/core/recall_worker.py +22 -4
- package/src/superlocalmemory/core/reranker_worker.py +246 -0
- package/src/superlocalmemory/core/store_pipeline.py +12 -2
- package/src/superlocalmemory/encoding/fact_extractor.py +16 -8
- package/src/superlocalmemory/encoding/graph_builder.py +21 -1
- package/src/superlocalmemory/learning/adaptive.py +2 -2
- package/src/superlocalmemory/math/fisher_quantized.py +8 -4
- package/src/superlocalmemory/math/langevin.py +15 -2
- package/src/superlocalmemory/mcp/resources.py +2 -2
- package/src/superlocalmemory/mcp/shared.py +27 -0
- package/src/superlocalmemory/mcp/tools_active.py +31 -1
- package/src/superlocalmemory/mcp/tools_core.py +15 -9
- package/src/superlocalmemory/mcp/tools_v28.py +2 -2
- package/src/superlocalmemory/mcp/tools_v3.py +3 -0
- package/src/superlocalmemory/mcp/tools_v33.py +68 -7
- package/src/superlocalmemory/retrieval/agentic.py +1 -1
- package/src/superlocalmemory/retrieval/bm25_channel.py +21 -1
- package/src/superlocalmemory/retrieval/engine.py +44 -9
- package/src/superlocalmemory/retrieval/entity_channel.py +6 -0
- package/src/superlocalmemory/retrieval/fusion.py +2 -2
- package/src/superlocalmemory/retrieval/hopfield_channel.py +2 -2
- package/src/superlocalmemory/retrieval/reranker.py +24 -7
- package/src/superlocalmemory/retrieval/semantic_channel.py +2 -2
- package/src/superlocalmemory/retrieval/temporal_channel.py +14 -1
- package/src/superlocalmemory/storage/schema.py +2 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.
|
|
3
|
+
"version": "3.3.13",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "superlocalmemory"
|
|
3
|
-
version = "3.3.11"
|
|
3
|
+
version = "3.3.13"
|
|
4
4
|
description = "Information-geometric agent memory with mathematical guarantees"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = {text = "MIT"}
|
|
@@ -48,8 +48,7 @@ dependencies = [
|
|
|
48
48
|
|
|
49
49
|
[project.optional-dependencies]
|
|
50
50
|
search = [
|
|
51
|
-
"sentence-transformers>=
|
|
52
|
-
"sentence-transformers[onnx]>=4.0.0",
|
|
51
|
+
"sentence-transformers[onnx]>=5.0.0",
|
|
53
52
|
"einops>=0.8.2",
|
|
54
53
|
"torch>=2.2.0",
|
|
55
54
|
"scikit-learn>=1.3.0,<2.0.0",
|
|
@@ -86,10 +86,10 @@ class LLMConfig:
|
|
|
86
86
|
class ChannelWeights:
|
|
87
87
|
"""Retrieval channel weights — 5 channels, query-adaptive."""
|
|
88
88
|
|
|
89
|
-
#
|
|
90
|
-
semantic: float = 1.
|
|
89
|
+
# Semantic should dominate for conversational retrieval (paraphrase matters most).
|
|
90
|
+
semantic: float = 1.5
|
|
91
91
|
bm25: float = 1.0
|
|
92
|
-
entity_graph: float = 1.
|
|
92
|
+
entity_graph: float = 1.0
|
|
93
93
|
temporal: float = 1.0
|
|
94
94
|
spreading_activation: float = 1.0 # Phase 3: 5th channel (BC-08: default value)
|
|
95
95
|
hopfield: float = 0.8 # Phase G: 6th channel (Hopfield associative memory)
|
|
@@ -143,7 +143,7 @@ class RetrievalConfig:
|
|
|
143
143
|
"""Configuration for the retrieval (recall) pipeline."""
|
|
144
144
|
|
|
145
145
|
# Fusion
|
|
146
|
-
rrf_k: int =
|
|
146
|
+
rrf_k: int = 15 # RRF smoothing constant (k=15 for candidate pools of 50-200)
|
|
147
147
|
top_k: int = 20 # Final results to return
|
|
148
148
|
|
|
149
149
|
# Per-channel
|
|
@@ -154,7 +154,7 @@ class RetrievalConfig:
|
|
|
154
154
|
|
|
155
155
|
# Reranking (V3.3.2: ONNX backend enabled for all modes)
|
|
156
156
|
use_cross_encoder: bool = True
|
|
157
|
-
cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-
|
|
157
|
+
cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
|
158
158
|
cross_encoder_backend: str = "onnx" # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
|
|
159
159
|
|
|
160
160
|
# Agentic (Mode C only)
|
|
@@ -618,7 +618,7 @@ class SLMConfig:
|
|
|
618
618
|
# but NEVER override an explicit use_cross_encoder setting.
|
|
619
619
|
# The user's explicit choice always wins.
|
|
620
620
|
if "cross_encoder_backend" not in rt:
|
|
621
|
-
rt.setdefault("cross_encoder_model", "cross-encoder/ms-marco-MiniLM-L-
|
|
621
|
+
rt.setdefault("cross_encoder_model", "cross-encoder/ms-marco-MiniLM-L-12-v2")
|
|
622
622
|
rt["cross_encoder_backend"] = "onnx"
|
|
623
623
|
# Only auto-enable if user didn't explicitly set the field
|
|
624
624
|
rt.setdefault("use_cross_encoder", True)
|
|
@@ -740,6 +740,9 @@ class SLMConfig:
|
|
|
740
740
|
retrieval=RetrievalConfig(
|
|
741
741
|
# V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
|
|
742
742
|
use_cross_encoder=True,
|
|
743
|
+
# Mode A is zero-LLM: disable agentic retrieval (it replaces
|
|
744
|
+
# precision-tuned fusion with crude heuristic expansions)
|
|
745
|
+
agentic_max_rounds=0,
|
|
743
746
|
),
|
|
744
747
|
math=MathConfig(
|
|
745
748
|
sheaf_contradiction_threshold=0.45, # 768d threshold
|
|
@@ -50,7 +50,9 @@ class DimensionMismatchError(RuntimeError):
|
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
_IDLE_TIMEOUT_SECONDS = 120 # 2 minutes — kill worker after idle
|
|
53
|
-
|
|
53
|
+
# V3.3.12: Configurable via SLM_EMBED_IDLE_TIMEOUT env var (seconds)
|
|
54
|
+
_IDLE_TIMEOUT_SECONDS = int(os.environ.get("SLM_EMBED_IDLE_TIMEOUT", _IDLE_TIMEOUT_SECONDS))
|
|
55
|
+
_SUBPROCESS_RESPONSE_TIMEOUT = 180 # V3.3.12: 180s (was 120s) — respawns on stressed systems need more time
|
|
54
56
|
_WORKER_RECYCLE_AFTER = 1000 # Recycle worker after N requests (C++ fragmentation prevention)
|
|
55
57
|
|
|
56
58
|
|
|
@@ -79,6 +79,7 @@ class MemoryEngine:
|
|
|
79
79
|
self._auto_linker = None
|
|
80
80
|
self._graph_analyzer = None
|
|
81
81
|
self._consolidation_engine = None
|
|
82
|
+
self._maintenance_scheduler = None
|
|
82
83
|
self._hooks = HookRegistry()
|
|
83
84
|
|
|
84
85
|
# -- Public properties (Phase 2+ access) --------------------------------
|
|
@@ -194,6 +195,17 @@ class MemoryEngine:
|
|
|
194
195
|
# V3.3: Check for embedding model migration on mode switch
|
|
195
196
|
self._check_embedding_migration()
|
|
196
197
|
|
|
198
|
+
# V3.3.13: Background maintenance scheduler (Langevin/Ebbinghaus/Sheaf)
|
|
199
|
+
if self._config.forgetting.enabled:
|
|
200
|
+
try:
|
|
201
|
+
from superlocalmemory.core.maintenance_scheduler import MaintenanceScheduler
|
|
202
|
+
self._maintenance_scheduler = MaintenanceScheduler(
|
|
203
|
+
self._db, self._config, self._profile_id,
|
|
204
|
+
)
|
|
205
|
+
self._maintenance_scheduler.start()
|
|
206
|
+
except Exception as exc:
|
|
207
|
+
logger.debug("Maintenance scheduler init failed: %s", exc)
|
|
208
|
+
|
|
197
209
|
self._initialized = True
|
|
198
210
|
logger.info(
|
|
199
211
|
"MemoryEngine initialized: mode=%s profile=%s",
|
|
@@ -306,6 +318,8 @@ class MemoryEngine:
|
|
|
306
318
|
# -- Lifecycle ----------------------------------------------------------
|
|
307
319
|
|
|
308
320
|
def close(self) -> None:
|
|
321
|
+
if self._maintenance_scheduler is not None:
|
|
322
|
+
self._maintenance_scheduler.stop()
|
|
309
323
|
self._initialized = False
|
|
310
324
|
|
|
311
325
|
@property
|
|
@@ -339,7 +339,7 @@ def _init_spreading_activation(
|
|
|
339
339
|
SpreadingActivation,
|
|
340
340
|
SpreadingActivationConfig,
|
|
341
341
|
)
|
|
342
|
-
sa_config = SpreadingActivationConfig(enabled=
|
|
342
|
+
sa_config = SpreadingActivationConfig(enabled=True)
|
|
343
343
|
return SpreadingActivation(
|
|
344
344
|
db=db, vector_store=vector_store, config=sa_config,
|
|
345
345
|
)
|
|
@@ -454,6 +454,21 @@ def init_retrieval(
|
|
|
454
454
|
trust_scorer=trust_scorer,
|
|
455
455
|
)
|
|
456
456
|
|
|
457
|
+
# V3.3.13: Ensure reranker warmup is in progress.
|
|
458
|
+
# The CrossEncoderReranker constructor starts background warmup, but
|
|
459
|
+
# callers can also call warmup_sync() to block until ready.
|
|
460
|
+
# Here we just log warmup status — benchmark scripts call warmup_sync() explicitly.
|
|
461
|
+
if reranker is not None:
|
|
462
|
+
import threading
|
|
463
|
+
def _log_warmup_status() -> None:
|
|
464
|
+
ready = reranker.warmup_sync(timeout=180)
|
|
465
|
+
if ready:
|
|
466
|
+
logger.info("Cross-encoder reranker warm and ready")
|
|
467
|
+
else:
|
|
468
|
+
logger.warning("Cross-encoder reranker warmup failed — recalls will use fallback scoring")
|
|
469
|
+
t = threading.Thread(target=_log_warmup_status, daemon=True, name="ce-init-warmup")
|
|
470
|
+
t.start()
|
|
471
|
+
|
|
457
472
|
# Phase A: Register forgetting filter into the channel registry
|
|
458
473
|
try:
|
|
459
474
|
from superlocalmemory.retrieval.forgetting_filter import register_forgetting_filter
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""SuperLocalMemory V3 — Background Maintenance Scheduler.
|
|
6
|
+
|
|
7
|
+
V3.3.13: Periodically triggers Langevin/Ebbinghaus/Sheaf maintenance
|
|
8
|
+
so users don't need to call run_maintenance manually.
|
|
9
|
+
|
|
10
|
+
Configurable interval via ForgettingConfig.scheduler_interval_minutes.
|
|
11
|
+
Defaults to 30 min. Disabled during benchmarks (no config.forgetting.enabled).
|
|
12
|
+
|
|
13
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
14
|
+
License: MIT
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import threading
|
|
21
|
+
from typing import TYPE_CHECKING
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from superlocalmemory.core.config import SLMConfig
|
|
25
|
+
from superlocalmemory.storage.database import DatabaseManager
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)


class MaintenanceScheduler:
    """Run math maintenance periodically on a background timer.

    Every ``config.forgetting.scheduler_interval_minutes`` minutes the
    scheduler calls ``run_maintenance(db, config, profile_id)`` from a daemon
    ``threading.Timer`` thread and then re-arms itself.

    ``start()`` and ``stop()`` may be called from any thread.  All state
    changes happen under one lock, so once ``stop()`` has returned the timer
    thread can no longer arm a new timer.

    Call ``stop()`` explicitly to shut the scheduler down.  A pending timer
    keeps a reference to this object (through the bound ``_run`` method), so
    ``__del__`` is only a last-resort safety net.  Timers are daemon threads
    and therefore never block interpreter exit.
    """

    def __init__(
        self,
        db: DatabaseManager,
        config: SLMConfig,
        profile_id: str = "default",
    ) -> None:
        """Store collaborators and derive the timer period.

        Args:
            db: Database handle, passed through to ``run_maintenance``.
            config: Configuration object;
                ``config.forgetting.scheduler_interval_minutes`` sets the period.
            profile_id: Profile identifier, passed through to ``run_maintenance``.
        """
        # Created first so stop()/__del__ still work if a later line raises.
        # Re-entrant because start() holds the lock while calling
        # _schedule_next(), which takes it again.
        self._lock = threading.RLock()
        self._timer: threading.Timer | None = None
        self._running = False
        self._db = db
        self._config = config
        self._profile_id = profile_id
        self._interval = config.forgetting.scheduler_interval_minutes * 60.0

    def start(self) -> None:
        """Start the periodic scheduler. Idempotent.

        Does nothing (beyond a warning) when the configured interval is not
        positive: a zero or negative delay would make the timer fire
        back-to-back and run maintenance in a tight loop.
        """
        if self._interval <= 0:
            logger.warning(
                "Maintenance scheduler not started: invalid interval %s min",
                self._config.forgetting.scheduler_interval_minutes,
            )
            return
        with self._lock:
            if self._running:
                return
            self._running = True
            self._schedule_next()
        logger.info(
            "Maintenance scheduler started (interval=%dm)",
            self._config.forgetting.scheduler_interval_minutes,
        )

    def stop(self) -> None:
        """Stop the scheduler. Idempotent.

        Cancels the pending timer, if any.  A maintenance run that is already
        executing is not interrupted, but it will not re-arm the timer.
        """
        with self._lock:
            was_running = self._running
            self._running = False
            timer, self._timer = self._timer, None
        if timer is not None:
            timer.cancel()
        # Only announce a real transition; repeated stop() calls and the
        # __del__ safety net stay silent.
        if was_running:
            logger.info("Maintenance scheduler stopped")

    def _schedule_next(self) -> None:
        """Arm a daemon timer for the next maintenance run (no-op if stopped)."""
        with self._lock:
            if not self._running:
                return
            timer = threading.Timer(self._interval, self._run)
            timer.daemon = True
            self._timer = timer
            timer.start()

    def _run(self) -> None:
        """Execute one maintenance pass, then schedule the next one."""
        if not self._running:
            return
        try:
            # Imported lazily, on the timer thread, at first use.
            from superlocalmemory.core.maintenance import run_maintenance
            counts = run_maintenance(self._db, self._config, self._profile_id)
            logger.info("Scheduled maintenance complete: %s", counts)
        except Exception as exc:
            # Best effort: a failed pass must not kill the schedule.
            logger.warning("Scheduled maintenance failed: %s", exc)
        finally:
            self._schedule_next()

    def __del__(self) -> None:
        try:
            self.stop()
        except Exception:
            # Finalizers must never raise (e.g. during interpreter shutdown).
            pass
|
|
@@ -192,6 +192,30 @@ def run_recall(
|
|
|
192
192
|
except Exception as exc:
|
|
193
193
|
logger.debug("Access log batch store failed: %s", exc)
|
|
194
194
|
|
|
195
|
+
# V3.3.12: Wire BehavioralTracker.record_query() into live recall pipeline
|
|
196
|
+
try:
|
|
197
|
+
from superlocalmemory.learning.behavioral import BehavioralTracker
|
|
198
|
+
_tracker = BehavioralTracker(db)
|
|
199
|
+
_tracker.record_query(
|
|
200
|
+
profile_id=profile_id, query=query,
|
|
201
|
+
query_type=response.query_type,
|
|
202
|
+
result_count=len(response.results),
|
|
203
|
+
)
|
|
204
|
+
except Exception as exc:
|
|
205
|
+
logger.debug("Behavioral tracking: %s", exc)
|
|
206
|
+
|
|
207
|
+
# V3.3.12: Spaced repetition update on recall (Ebbinghaus on_access_event)
|
|
208
|
+
if response.results:
|
|
209
|
+
try:
|
|
210
|
+
from superlocalmemory.learning.forgetting_scheduler import ForgettingScheduler
|
|
211
|
+
from superlocalmemory.math.ebbinghaus import EbbinghausCurve
|
|
212
|
+
_ebbinghaus = EbbinghausCurve(config.forgetting)
|
|
213
|
+
_fsched = ForgettingScheduler(db, _ebbinghaus, config.forgetting)
|
|
214
|
+
for r in response.results[:10]:
|
|
215
|
+
_fsched.on_access_event(r.fact.fact_id, profile_id)
|
|
216
|
+
except Exception as exc:
|
|
217
|
+
logger.debug("Spaced repetition update: %s", exc)
|
|
218
|
+
|
|
195
219
|
# Phase 3: Hebbian strengthening for co-accessed facts
|
|
196
220
|
if auto_linker and response.results:
|
|
197
221
|
try:
|
|
@@ -187,10 +187,28 @@ def _handle_update_memory(fact_id: str, content: str, agent_id: str = "system")
|
|
|
187
187
|
if not rows:
|
|
188
188
|
return {"ok": False, "error": f"Memory {fact_id} not found"}
|
|
189
189
|
old_content = dict(rows[0]).get("content", "")[:80]
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
190
|
+
# V3.3.12: Re-embed updated content so semantic search + BM25 stay consistent.
|
|
191
|
+
# Previously only the text column was updated, leaving stale embeddings.
|
|
192
|
+
updates: dict = {"content": content}
|
|
193
|
+
if engine._embedder:
|
|
194
|
+
try:
|
|
195
|
+
new_emb = engine._embedder.embed(content)
|
|
196
|
+
if new_emb:
|
|
197
|
+
updates["embedding"] = new_emb
|
|
198
|
+
fm, fv = engine._embedder.compute_fisher_params(new_emb)
|
|
199
|
+
updates["fisher_mean"] = fm
|
|
200
|
+
updates["fisher_variance"] = fv
|
|
201
|
+
except Exception:
|
|
202
|
+
pass
|
|
203
|
+
engine._db.update_fact(fact_id, updates)
|
|
204
|
+
# Update BM25 index for the new content
|
|
205
|
+
if hasattr(engine, '_retrieval_engine') and engine._retrieval_engine:
|
|
206
|
+
bm25 = getattr(engine._retrieval_engine, '_bm25', None)
|
|
207
|
+
if bm25:
|
|
208
|
+
try:
|
|
209
|
+
bm25.add(fact_id, content, pid)
|
|
210
|
+
except Exception:
|
|
211
|
+
pass
|
|
194
212
|
import logging as _logging
|
|
195
213
|
_logging.getLogger("superlocalmemory.audit").info(
|
|
196
214
|
"UPDATE fact_id=%s by agent=%s old=%s new=%s",
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""Subprocess reranker worker — isolates PyTorch/ONNX from main process.
|
|
6
|
+
|
|
7
|
+
Same pattern as embedding_worker.py. The main process stays at ~60 MB.
|
|
8
|
+
All cross-encoder model memory lives in this worker subprocess.
|
|
9
|
+
|
|
10
|
+
Protocol (JSON over stdin/stdout, one object per line):
    Request:  {"cmd": "load", "model_name": "...", "backend": "onnx"}
    Response: {"ok": true, "backend": "onnx", "model": "..."}

    Request:  {"cmd": "rerank", "query": "...", "documents": ["...", ...]}
    Response: {"ok": true, "scores": [0.95, 0.32, ...]}

    Request:  {"cmd": "score", "query": "...", "document": "..."}
    Response: {"ok": true, "score": 0.87}

    Request:  {"cmd": "ping"}
    Response: {"ok": true, "loaded": true, "backend": "onnx", "model": "..."}

    Request:  {"cmd": "quit"}
    (worker exits)

    Any failure: {"ok": false, "error": "..."}

"rerank" and "score" load the model on first use if no "load" was sent;
they accept the same optional "model_name" and "backend" keys as "load".
|
|
22
|
+
|
|
23
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import platform
|
|
31
|
+
import signal
|
|
32
|
+
import struct
|
|
33
|
+
import sys
|
|
34
|
+
import threading
|
|
35
|
+
|
|
36
|
+
# Force CPU BEFORE any torch import.  These variables are set at import time
# of this module, ahead of the lazy sentence-transformers/torch imports that
# happen later inside the request loop.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TORCH_DEVICE"] = "cpu"

# SIGTERM bridge for Docker/systemd: turn SIGTERM into a normal SystemExit so
# the worker exits with status 0.  Not installed on Windows.
if sys.platform != "win32":
    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _start_parent_watchdog() -> None:
|
|
50
|
+
"""Monitor parent process — self-terminate if parent dies.
|
|
51
|
+
|
|
52
|
+
Prevents orphaned workers that consume 1+ GB each when the parent
|
|
53
|
+
process crashes, is killed, or exits without cleanup.
|
|
54
|
+
|
|
55
|
+
V3.3.7: Added after incident where ~30 orphaned workers consumed 33 GB.
|
|
56
|
+
"""
|
|
57
|
+
parent_pid = os.getppid()
|
|
58
|
+
|
|
59
|
+
def _watch() -> None:
|
|
60
|
+
import time
|
|
61
|
+
while True:
|
|
62
|
+
time.sleep(5)
|
|
63
|
+
try:
|
|
64
|
+
os.kill(parent_pid, 0) # Check if parent is alive (signal 0)
|
|
65
|
+
except OSError:
|
|
66
|
+
# Parent is dead — self-terminate
|
|
67
|
+
os._exit(0)
|
|
68
|
+
|
|
69
|
+
t = threading.Thread(target=_watch, daemon=True, name="parent-watchdog")
|
|
70
|
+
t.start()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _detect_onnx_variant() -> str:
|
|
74
|
+
"""Auto-detect the best ONNX model variant for the current platform."""
|
|
75
|
+
arch = platform.machine().lower()
|
|
76
|
+
is_64bit = struct.calcsize("P") * 8 == 64
|
|
77
|
+
|
|
78
|
+
if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
|
|
79
|
+
return "onnx/model_qint8_arm64.onnx"
|
|
80
|
+
if arch in ("x86_64", "amd64") and is_64bit:
|
|
81
|
+
return "onnx/model_quint8_avx2.onnx"
|
|
82
|
+
return "onnx/model.onnx"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _worker_main() -> None:
    """Main loop: read JSON requests from stdin, write responses to stdout.

    One JSON object per input line, one JSON reply per output line.
    Supported ``cmd`` values are ``ping``, ``load``, ``rerank``, ``score``
    and ``quit``.  ``rerank`` and ``score`` load the model on first use when
    no ``load`` was sent.  Every failure is answered with
    ``{"ok": false, "error": ...}`` so a bad request never ends the loop;
    the loop ends on ``quit`` or when stdin reaches EOF.
    """
    _start_parent_watchdog()  # V3.3.7: self-terminate if parent dies

    model = None
    active_backend = ""
    model_name = ""

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except json.JSONDecodeError:
            _respond({"ok": False, "error": "Invalid JSON"})
            continue
        if not isinstance(req, dict):
            # Valid JSON that is not an object (e.g. `[]` or `"ping"`) has no
            # "cmd" to dispatch on; req.get() would raise and kill the worker.
            _respond({"ok": False, "error": "Request must be a JSON object"})
            continue

        cmd = req.get("cmd", "")

        if cmd == "quit":
            break

        if cmd == "ping":
            _respond({
                "ok": True,
                "loaded": model is not None,
                "backend": active_backend,
                "model": model_name,
            })
            continue

        if cmd == "load":
            model, active_backend, model_name = _load_requested_model(req)
            _respond({
                "ok": model is not None,
                "backend": active_backend,
                "model": model_name,
            })
            continue

        if cmd == "rerank":
            query = req.get("query", "")
            documents = req.get("documents", [])
            if not query or not documents:
                _respond({"ok": False, "error": "Missing query or documents"})
                continue
            if model is None:
                # Auto-load with defaults
                model, active_backend, model_name = _load_requested_model(req)
                if model is None:
                    _respond({"ok": False, "error": "Model load failed"})
                    continue
            try:
                scores = _predict_scores(
                    model, [(query, doc) for doc in documents],
                )
                _respond({
                    "ok": True,
                    "scores": [float(s) for s in scores],
                })
            except Exception as exc:
                _respond({"ok": False, "error": str(exc)})
            continue

        if cmd == "score":
            query = req.get("query", "")
            document = req.get("document", "")
            if not query or not document:
                _respond({"ok": False, "error": "Missing query or document"})
                continue
            if model is None:
                model, active_backend, model_name = _load_requested_model(req)
                if model is None:
                    _respond({"ok": False, "error": "Model load failed"})
                    continue
            try:
                scores = _predict_scores(model, [(query, document)])
                _respond({"ok": True, "score": float(scores[0])})
            except Exception as exc:
                _respond({"ok": False, "error": str(exc)})
            continue

        _respond({"ok": False, "error": f"Unknown command: {cmd}"})


# Model used when a request carries no "model_name".
_DEFAULT_CROSS_ENCODER = "cross-encoder/ms-marco-MiniLM-L-12-v2"


def _load_requested_model(req: dict) -> tuple:
    """Load the model named by *req*, falling back to the default model/backend."""
    name = req.get("model_name", _DEFAULT_CROSS_ENCODER)
    backend = req.get("backend", "onnx")
    return _load_model(name, backend)


def _predict_scores(model, pairs: list):
    """Score *pairs* with *model*, under ``torch.inference_mode()`` when torch exists."""
    # Only the import is guarded: an ImportError raised while predicting must
    # reach the caller instead of silently triggering a second prediction.
    try:
        import torch
    except ImportError:
        return model.predict(pairs)
    with torch.inference_mode():
        return model.predict(pairs)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _load_model(
|
|
187
|
+
name: str, backend: str,
|
|
188
|
+
) -> tuple:
|
|
189
|
+
"""Load cross-encoder model. Returns (model, backend_name, model_name).
|
|
190
|
+
|
|
191
|
+
V3.3.13: sentence-transformers 5.x+ supports backend='onnx' for
|
|
192
|
+
CrossEncoder. We use a 3-tier fallback chain:
|
|
193
|
+
|
|
194
|
+
1. ONNX + platform-quantized model (fastest, ~200MB, 2.4ms/pair)
|
|
195
|
+
2. ONNX + generic model (fast, auto-exported on first use)
|
|
196
|
+
3. PyTorch (always works, ~500MB, 6ms/pair)
|
|
197
|
+
|
|
198
|
+
Cross-platform:
|
|
199
|
+
Mac ARM64 → model_qint8_arm64.onnx
|
|
200
|
+
x86_64 → model_quint8_avx2.onnx
|
|
201
|
+
Fallback → model.onnx (generic)
|
|
202
|
+
"""
|
|
203
|
+
try:
|
|
204
|
+
from sentence_transformers import CrossEncoder
|
|
205
|
+
|
|
206
|
+
if backend == "onnx":
|
|
207
|
+
# Tier 1: Platform-specific quantized ONNX (fastest)
|
|
208
|
+
try:
|
|
209
|
+
onnx_file = _detect_onnx_variant()
|
|
210
|
+
m = CrossEncoder(
|
|
211
|
+
name, backend="onnx",
|
|
212
|
+
model_kwargs={"file_name": onnx_file},
|
|
213
|
+
)
|
|
214
|
+
return m, f"onnx-quantized({onnx_file})", name
|
|
215
|
+
except Exception:
|
|
216
|
+
pass
|
|
217
|
+
|
|
218
|
+
# Tier 2: Generic ONNX (auto-exported by optimum)
|
|
219
|
+
try:
|
|
220
|
+
m = CrossEncoder(name, backend="onnx")
|
|
221
|
+
return m, "onnx", name
|
|
222
|
+
except Exception:
|
|
223
|
+
pass
|
|
224
|
+
|
|
225
|
+
# Tier 3: PyTorch (always works, no ONNX dependency needed)
|
|
226
|
+
m = CrossEncoder(name)
|
|
227
|
+
return m, "pytorch", name
|
|
228
|
+
except ImportError:
|
|
229
|
+
return None, "", ""
|
|
230
|
+
except Exception:
|
|
231
|
+
return None, "", ""
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _respond(data: dict) -> None:
|
|
235
|
+
"""Write JSON response to stdout, flush immediately."""
|
|
236
|
+
sys.stdout.write(json.dumps(data) + "\n")
|
|
237
|
+
sys.stdout.flush()
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# Script entry point: run the request loop until "quit" or stdin EOF.
if __name__ == "__main__":
    try:
        _worker_main()
    except KeyboardInterrupt:
        # V3.3.13: Windows CI sends KeyboardInterrupt on test completion.
        # Exit cleanly instead of printing a traceback that fails CI.
        sys.exit(0)
|
|
@@ -170,13 +170,23 @@ def run_store(
|
|
|
170
170
|
# V3.3.11: Also store raw content as a verbatim fact to preserve details
|
|
171
171
|
# that fact extraction may abstract away (dates, names, specifics).
|
|
172
172
|
# This ensures BM25 and semantic search can always find the original text.
|
|
173
|
+
# V3.3.12: Extract entities from verbatim content so entity channel + temporal
|
|
174
|
+
# channel can find it (was entities=[] which made 4/6 channels blind).
|
|
173
175
|
if content.strip() and len(content.strip()) >= 20:
|
|
174
176
|
import uuid
|
|
177
|
+
import re as _re
|
|
178
|
+
_verbatim_text = content.strip()
|
|
179
|
+
# Extract entities using the same regex as fact_extractor
|
|
180
|
+
_ent_re = _re.compile(r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){0,3})\b")
|
|
181
|
+
_entity_set = {m.group(1) for m in _ent_re.finditer(_verbatim_text)}
|
|
182
|
+
# Also extract all-caps abbreviations (NYU, MIT, etc.) — dedup with first set
|
|
183
|
+
_entity_set |= {m.group(1) for m in _re.finditer(r'\b([A-Z]{2,})\b', _verbatim_text)}
|
|
184
|
+
_verbatim_entities = sorted(_entity_set)
|
|
175
185
|
verbatim = AtomicFact(
|
|
176
186
|
fact_id=uuid.uuid4().hex[:16],
|
|
177
|
-
content=
|
|
187
|
+
content=_verbatim_text,
|
|
178
188
|
fact_type=FactType.EPISODIC,
|
|
179
|
-
entities=
|
|
189
|
+
entities=_verbatim_entities,
|
|
180
190
|
session_id=session_id,
|
|
181
191
|
observation_date=parsed_date,
|
|
182
192
|
confidence=0.9,
|
|
@@ -84,7 +84,8 @@ _INTERVAL_RE = re.compile(
|
|
|
84
84
|
)
|
|
85
85
|
|
|
86
86
|
_ENTITY_RE = re.compile(
|
|
87
|
-
r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){0,3})\b"
|
|
87
|
+
r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){0,3})\b" # Capitalized word sequences
|
|
88
|
+
r"|\b([A-Z]{2,})\b" # ALL-CAPS abbreviations (NYU, MIT)
|
|
88
89
|
)
|
|
89
90
|
|
|
90
91
|
_QUOTED_RE = re.compile(r'"([^"]+)"') # Quoted strings as entities
|
|
@@ -243,7 +244,7 @@ def _extract_entities(text: str) -> list[str]:
|
|
|
243
244
|
|
|
244
245
|
# Capitalized word sequences (proper nouns)
|
|
245
246
|
for match in _ENTITY_RE.finditer(text):
|
|
246
|
-
candidate = match.group(1).strip()
|
|
247
|
+
candidate = (match.group(1) or match.group(2) or "").strip()
|
|
247
248
|
# Filter common English words that start sentences
|
|
248
249
|
# Check first word of multi-word candidates against stop list
|
|
249
250
|
_first_word = candidate.split()[0].lower() if candidate else ""
|
|
@@ -495,10 +496,17 @@ class FactExtractor:
|
|
|
495
496
|
) -> list[AtomicFact]:
|
|
496
497
|
"""Rule-based extraction: regex entities, keyword classification, scoring."""
|
|
497
498
|
combined = "\n".join(turns)
|
|
498
|
-
|
|
499
|
-
if not
|
|
500
|
-
|
|
501
|
-
|
|
499
|
+
raw_sentences = _split_sentences(combined)
|
|
500
|
+
if not raw_sentences:
|
|
501
|
+
raw_sentences = [t.strip() for t in turns if len(t.strip()) >= 8]
|
|
502
|
+
|
|
503
|
+
# V3.3.12: Sliding window of 2 sentences to preserve cross-sentence context.
|
|
504
|
+
# "She enrolled at NYU. Starting January 2024." → becomes one combined fact.
|
|
505
|
+
sentences = list(raw_sentences) # Keep originals
|
|
506
|
+
for i in range(len(raw_sentences) - 1):
|
|
507
|
+
pair = raw_sentences[i].rstrip() + " " + raw_sentences[i + 1].lstrip()
|
|
508
|
+
if len(pair) <= 300: # Only combine if not too long
|
|
509
|
+
sentences.append(pair)
|
|
502
510
|
|
|
503
511
|
# Build entity frequency map for importance scoring
|
|
504
512
|
entity_freq: dict[str, int] = {}
|
|
@@ -549,8 +557,8 @@ class FactExtractor:
|
|
|
549
557
|
if importance < self._config.min_fact_confidence:
|
|
550
558
|
continue
|
|
551
559
|
|
|
552
|
-
#
|
|
553
|
-
speaker
|
|
560
|
+
# V3.3.12: Speaker inference removed — result was never stored in AtomicFact.
|
|
561
|
+
# The speaker info is preserved in verbatim facts via [Speaker]: prefix.
|
|
554
562
|
|
|
555
563
|
facts.append(AtomicFact(
|
|
556
564
|
fact_id=_new_id(),
|