superlocalmemory 3.3.2 → 3.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/package.json +1 -1
- package/pyproject.toml +5 -1
- package/src/superlocalmemory/cli/commands.py +21 -4
- package/src/superlocalmemory/cli/main.py +6 -0
- package/src/superlocalmemory/core/config.py +12 -9
- package/src/superlocalmemory/core/maintenance.py +111 -5
- package/src/superlocalmemory/core/store_pipeline.py +24 -0
- package/src/superlocalmemory/core/worker_pool.py +9 -2
- package/src/superlocalmemory/encoding/cognitive_consolidator.py +19 -1
- package/src/superlocalmemory/encoding/emotional.py +5 -2
- package/src/superlocalmemory/encoding/entity_resolver.py +1 -1
- package/src/superlocalmemory/math/polar_quant.py +3 -1
- package/src/superlocalmemory/retrieval/engine.py +36 -8
- package/src/superlocalmemory/retrieval/reranker.py +240 -163
- package/src/superlocalmemory/storage/embedding_migrator.py +4 -3
package/CHANGELOG.md
CHANGED
|
@@ -16,6 +16,18 @@ SuperLocalMemory V3 - Intelligent local memory system for AI coding assistants.
|
|
|
16
16
|
|
|
17
17
|
---
|
|
18
18
|
|
|
19
|
+
## [3.3.3] - 2026-04-01 — Langevin Awakening
|
|
20
|
+
|
|
21
|
+
### Fixed
|
|
22
|
+
- **Langevin dynamics now active** — positions were never initialized at store time, causing the entire Langevin lifecycle system to be inert (0 positioned facts). New facts now receive near-origin positions (Strategy A).
|
|
23
|
+
- **Backfill for existing facts** — maintenance now initializes unpositioned facts using metadata-aware equilibrium seeding (Strategy B) followed by 50-step burn-in (Strategy C). Old, rarely-accessed facts land in their correct lifecycle zones immediately.
|
|
24
|
+
|
|
25
|
+
### Improved
|
|
26
|
+
- Maintenance returns `langevin_backfilled` count for observability
|
|
27
|
+
- Health check now reports positioned facts accurately after backfill
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
19
31
|
## [3.3.0] - 2026-03-31 — The Living Brain
|
|
20
32
|
|
|
21
33
|
### New Features
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.2",
|
|
3
|
+
"version": "3.3.4",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "superlocalmemory"
|
|
3
|
-
version = "3.3.2"
|
|
3
|
+
version = "3.3.4"
|
|
4
4
|
description = "Information-geometric agent memory with mathematical guarantees"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = {text = "MIT"}
|
|
@@ -98,6 +98,10 @@ testpaths = ["tests"]
|
|
|
98
98
|
pythonpath = ["src"]
|
|
99
99
|
markers = [
|
|
100
100
|
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
101
|
+
"ollama: marks tests that require a running Ollama instance",
|
|
102
|
+
]
|
|
103
|
+
filterwarnings = [
|
|
104
|
+
"ignore::DeprecationWarning:vaderSentiment",
|
|
101
105
|
]
|
|
102
106
|
|
|
103
107
|
[tool.coverage.run]
|
|
@@ -113,6 +113,10 @@ def cmd_mode(args: Namespace) -> None:
|
|
|
113
113
|
if (config.embedding.provider != updated.embedding.provider
|
|
114
114
|
or config.embedding.model_name != updated.embedding.model_name):
|
|
115
115
|
print(" ⚠ Embedding model changed. Re-indexing will run on next recall.")
|
|
116
|
+
|
|
117
|
+
# V3.3.4: Warn if Mode C lacks cloud API key
|
|
118
|
+
if args.value == "c" and not updated.llm.api_key:
|
|
119
|
+
print(" ⚠ Mode C requires a cloud API key. Run: slm provider set")
|
|
116
120
|
else:
|
|
117
121
|
print(f"Current mode: {config.mode.value.upper()}")
|
|
118
122
|
|
|
@@ -356,12 +360,20 @@ def cmd_forget(args: Namespace) -> None:
|
|
|
356
360
|
sys.exit(1)
|
|
357
361
|
raise
|
|
358
362
|
|
|
363
|
+
dry_run = getattr(args, 'dry_run', False)
|
|
364
|
+
|
|
359
365
|
if use_json:
|
|
360
366
|
from superlocalmemory.cli.json_output import json_print
|
|
361
367
|
if not matches:
|
|
362
368
|
json_print("forget", data={"matched_count": 0, "deleted_count": 0, "matches": []})
|
|
363
369
|
return
|
|
364
370
|
match_items = [{"fact_id": f.fact_id, "content": f.content[:120]} for f in matches[:20]]
|
|
371
|
+
if dry_run:
|
|
372
|
+
json_print("forget", data={
|
|
373
|
+
"matched_count": len(matches), "deleted_count": 0,
|
|
374
|
+
"dry_run": True, "matches": match_items,
|
|
375
|
+
})
|
|
376
|
+
return
|
|
365
377
|
if getattr(args, 'yes', False):
|
|
366
378
|
for f in matches:
|
|
367
379
|
engine._db.delete_fact(f.fact_id)
|
|
@@ -387,6 +399,9 @@ def cmd_forget(args: Namespace) -> None:
|
|
|
387
399
|
print(f"Found {len(matches)} matching memories:")
|
|
388
400
|
for f in matches[:10]:
|
|
389
401
|
print(f" - {f.fact_id[:8]}... {f.content[:80]}")
|
|
402
|
+
if dry_run:
|
|
403
|
+
print(f"(dry run — {len(matches)} would be deleted)")
|
|
404
|
+
return
|
|
390
405
|
if getattr(args, 'yes', False):
|
|
391
406
|
for f in matches:
|
|
392
407
|
engine._db.delete_fact(f.fact_id)
|
|
@@ -861,7 +876,8 @@ def cmd_trace(args: Namespace) -> None:
|
|
|
861
876
|
try:
|
|
862
877
|
config = SLMConfig.load()
|
|
863
878
|
engine = MemoryEngine(config)
|
|
864
|
-
|
|
879
|
+
limit = getattr(args, 'limit', 10)
|
|
880
|
+
response = engine.recall(args.query, limit=limit)
|
|
865
881
|
except Exception as exc:
|
|
866
882
|
if use_json:
|
|
867
883
|
from superlocalmemory.cli.json_output import json_print
|
|
@@ -1435,6 +1451,7 @@ def cmd_consolidate(args: Namespace) -> None:
|
|
|
1435
1451
|
|
|
1436
1452
|
use_json = getattr(args, "json", False)
|
|
1437
1453
|
cognitive = getattr(args, "cognitive", False)
|
|
1454
|
+
dry_run = getattr(args, "dry_run", False)
|
|
1438
1455
|
profile = getattr(args, "profile", "")
|
|
1439
1456
|
|
|
1440
1457
|
if not cognitive:
|
|
@@ -1460,7 +1477,7 @@ def cmd_consolidate(args: Namespace) -> None:
|
|
|
1460
1477
|
)
|
|
1461
1478
|
|
|
1462
1479
|
consolidator = CognitiveConsolidator(db=engine._db)
|
|
1463
|
-
result = consolidator.run_pipeline(pid)
|
|
1480
|
+
result = consolidator.run_pipeline(pid, dry_run=dry_run)
|
|
1464
1481
|
except Exception as exc:
|
|
1465
1482
|
if use_json:
|
|
1466
1483
|
from superlocalmemory.cli.json_output import json_print
|
|
@@ -1473,7 +1490,7 @@ def cmd_consolidate(args: Namespace) -> None:
|
|
|
1473
1490
|
if use_json:
|
|
1474
1491
|
from superlocalmemory.cli.json_output import json_print
|
|
1475
1492
|
json_print("consolidate", data={
|
|
1476
|
-
"
|
|
1493
|
+
"clusters_processed": result.clusters_processed,
|
|
1477
1494
|
"blocks_created": result.blocks_created,
|
|
1478
1495
|
"facts_archived": result.facts_archived,
|
|
1479
1496
|
"compression_ratio": round(result.compression_ratio, 3),
|
|
@@ -1484,7 +1501,7 @@ def cmd_consolidate(args: Namespace) -> None:
|
|
|
1484
1501
|
return
|
|
1485
1502
|
|
|
1486
1503
|
print("CCQ Cognitive Consolidation")
|
|
1487
|
-
print(f" Clusters
|
|
1504
|
+
print(f" Clusters processed: {result.clusters_processed}")
|
|
1488
1505
|
print(f" Blocks created: {result.blocks_created}")
|
|
1489
1506
|
print(f" Facts archived: {result.facts_archived}")
|
|
1490
1507
|
print(f" Compression ratio: {result.compression_ratio:.3f}")
|
|
@@ -123,6 +123,7 @@ def main() -> None:
|
|
|
123
123
|
|
|
124
124
|
forget_p = sub.add_parser("forget", help="Delete memories matching a query (fuzzy)")
|
|
125
125
|
forget_p.add_argument("query", help="Query to match for deletion")
|
|
126
|
+
forget_p.add_argument("--dry-run", action="store_true", default=False, help="Preview matches without deleting")
|
|
126
127
|
forget_p.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompt")
|
|
127
128
|
forget_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
|
|
128
129
|
|
|
@@ -151,6 +152,7 @@ def main() -> None:
|
|
|
151
152
|
|
|
152
153
|
trace_p = sub.add_parser("trace", help="Recall with per-channel score breakdown")
|
|
153
154
|
trace_p.add_argument("query", help="Search query")
|
|
155
|
+
trace_p.add_argument("--limit", type=int, default=10, help="Max results (default 10)")
|
|
154
156
|
trace_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
|
|
155
157
|
|
|
156
158
|
# -- Diagnostics (continued) ----------------------------------------
|
|
@@ -217,6 +219,10 @@ def main() -> None:
|
|
|
217
219
|
"--cognitive", action="store_true",
|
|
218
220
|
help="Run CCQ cognitive consolidation",
|
|
219
221
|
)
|
|
222
|
+
consolidate_p.add_argument(
|
|
223
|
+
"--dry-run", action="store_true", default=False,
|
|
224
|
+
help="Preview without applying",
|
|
225
|
+
)
|
|
220
226
|
consolidate_p.add_argument("--profile", default="", help="Target profile")
|
|
221
227
|
consolidate_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
|
|
222
228
|
|
|
@@ -612,15 +612,15 @@ class SLMConfig:
|
|
|
612
612
|
|
|
613
613
|
rt = data.get("retrieval", {})
|
|
614
614
|
if rt:
|
|
615
|
-
# V3.3.2 migration:
|
|
616
|
-
# Pre-3.3.2 configs
|
|
617
|
-
#
|
|
618
|
-
#
|
|
619
|
-
# by the absence of cross_encoder_backend field.
|
|
615
|
+
# V3.3.2 migration: add ONNX cross-encoder backend field.
|
|
616
|
+
# Pre-3.3.2 configs lacked cross_encoder_backend. Add it,
|
|
617
|
+
# but NEVER override an explicit use_cross_encoder setting.
|
|
618
|
+
# The user's explicit choice always wins.
|
|
620
619
|
if "cross_encoder_backend" not in rt:
|
|
621
|
-
rt
|
|
622
|
-
rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
620
|
+
rt.setdefault("cross_encoder_model", "cross-encoder/ms-marco-MiniLM-L-6-v2")
|
|
623
621
|
rt["cross_encoder_backend"] = "onnx"
|
|
622
|
+
# Only auto-enable if user didn't explicitly set the field
|
|
623
|
+
rt.setdefault("use_cross_encoder", True)
|
|
624
624
|
config.retrieval = RetrievalConfig(**{
|
|
625
625
|
k: v for k, v in rt.items()
|
|
626
626
|
if k in RetrievalConfig.__dataclass_fields__
|
|
@@ -768,6 +768,9 @@ class SLMConfig:
|
|
|
768
768
|
)
|
|
769
769
|
|
|
770
770
|
# Mode C — FULL POWER, UNRESTRICTED
|
|
771
|
+
# Don't carry over local-only providers (ollama) to cloud mode
|
|
772
|
+
c_provider = llm_provider if llm_provider not in ("ollama", "") else "openrouter"
|
|
773
|
+
c_model = llm_model if llm_provider not in ("ollama", "") else "anthropic/claude-sonnet-4"
|
|
771
774
|
return cls(
|
|
772
775
|
mode=mode,
|
|
773
776
|
base_dir=_base,
|
|
@@ -779,8 +782,8 @@ class SLMConfig:
|
|
|
779
782
|
deployment_name=embedding_deployment,
|
|
780
783
|
),
|
|
781
784
|
llm=LLMConfig(
|
|
782
|
-
provider=
|
|
783
|
-
model=
|
|
785
|
+
provider=c_provider,
|
|
786
|
+
model=c_model,
|
|
784
787
|
api_key=llm_api_key,
|
|
785
788
|
api_base=llm_api_base,
|
|
786
789
|
),
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
Periodic batch processing for mathematical layers:
|
|
8
8
|
1. Langevin batch_step on all active facts (self-organization)
|
|
9
|
+
1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
|
|
9
10
|
2. Sheaf batch consistency check on recent facts
|
|
10
11
|
3. Fisher adaptive temperature recalculation
|
|
11
12
|
|
|
@@ -18,15 +19,72 @@ License: MIT
|
|
|
18
19
|
from __future__ import annotations
|
|
19
20
|
|
|
20
21
|
import logging
|
|
22
|
+
import math as _math
|
|
21
23
|
from datetime import UTC, datetime, timedelta
|
|
22
24
|
from typing import TYPE_CHECKING
|
|
23
25
|
|
|
26
|
+
import numpy as np
|
|
27
|
+
|
|
24
28
|
if TYPE_CHECKING:
|
|
25
29
|
from superlocalmemory.core.config import SLMConfig
|
|
26
30
|
from superlocalmemory.storage.database import DatabaseManager
|
|
27
31
|
|
|
28
32
|
logger = logging.getLogger(__name__)
|
|
29
33
|
|
|
34
|
+
# Backfill constants
|
|
35
|
+
_BACKFILL_BURN_IN_STEPS = 50
|
|
36
|
+
_LANGEVIN_DIM = 8
|
|
37
|
+
_MAX_NORM = 0.99
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _compute_equilibrium_radius(
|
|
41
|
+
access_count: int,
|
|
42
|
+
age_days: float,
|
|
43
|
+
importance: float,
|
|
44
|
+
temperature: float = 0.3,
|
|
45
|
+
dim: int = 8,
|
|
46
|
+
) -> float:
|
|
47
|
+
"""Compute metadata-aware equilibrium radius (Strategy B).
|
|
48
|
+
|
|
49
|
+
Uses the Langevin potential coefficients to estimate where a fact
|
|
50
|
+
would settle if it had been in the dynamics from the start.
|
|
51
|
+
|
|
52
|
+
r_eq ≈ sqrt(T * dim / (2 * effective_alpha))
|
|
53
|
+
"""
|
|
54
|
+
alpha, beta, gamma, delta = 3.0, 0.8, 0.005, 0.5
|
|
55
|
+
effective_alpha = (
|
|
56
|
+
alpha
|
|
57
|
+
+ beta * _math.log(access_count + 1) / 10.0
|
|
58
|
+
- gamma * min(age_days, 365.0) / 365.0
|
|
59
|
+
+ delta * importance
|
|
60
|
+
)
|
|
61
|
+
effective_alpha = max(0.1, effective_alpha)
|
|
62
|
+
r_eq = _math.sqrt(temperature * dim / (2.0 * effective_alpha))
|
|
63
|
+
return min(r_eq, _MAX_NORM * 0.95)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _seed_langevin_position(
|
|
67
|
+
access_count: int,
|
|
68
|
+
age_days: float,
|
|
69
|
+
importance: float,
|
|
70
|
+
temperature: float = 0.3,
|
|
71
|
+
dim: int = 8,
|
|
72
|
+
) -> list[float]:
|
|
73
|
+
"""Create a metadata-aware initial position (Strategy B).
|
|
74
|
+
|
|
75
|
+
Places the fact at the equilibrium radius with a random direction.
|
|
76
|
+
"""
|
|
77
|
+
r_eq = _compute_equilibrium_radius(
|
|
78
|
+
access_count, age_days, importance, temperature, dim,
|
|
79
|
+
)
|
|
80
|
+
rng = np.random.default_rng()
|
|
81
|
+
direction = rng.standard_normal(dim)
|
|
82
|
+
norm = float(np.linalg.norm(direction))
|
|
83
|
+
if norm < 1e-8:
|
|
84
|
+
direction = np.ones(dim)
|
|
85
|
+
norm = float(np.linalg.norm(direction))
|
|
86
|
+
return (direction / norm * r_eq).tolist()
|
|
87
|
+
|
|
30
88
|
|
|
31
89
|
def run_maintenance(
|
|
32
90
|
db: DatabaseManager,
|
|
@@ -44,6 +102,7 @@ def run_maintenance(
|
|
|
44
102
|
Dict of counts: langevin_updated, sheaf_checked, etc.
|
|
45
103
|
"""
|
|
46
104
|
counts: dict[str, int] = {
|
|
105
|
+
"langevin_backfilled": 0,
|
|
47
106
|
"langevin_updated": 0,
|
|
48
107
|
"fisher_coupled": 0,
|
|
49
108
|
"sheaf_checked": 0,
|
|
@@ -53,13 +112,60 @@ def run_maintenance(
|
|
|
53
112
|
if not facts:
|
|
54
113
|
return counts
|
|
55
114
|
|
|
56
|
-
#
|
|
115
|
+
# 1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
|
|
116
|
+
if config.math.langevin_persist_positions:
|
|
117
|
+
try:
|
|
118
|
+
from superlocalmemory.math.langevin import LangevinDynamics
|
|
119
|
+
|
|
120
|
+
ld = LangevinDynamics(
|
|
121
|
+
dim=_LANGEVIN_DIM,
|
|
122
|
+
dt=config.math.langevin_dt,
|
|
123
|
+
temperature=config.math.langevin_temperature,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
backfilled = 0
|
|
127
|
+
for f in facts:
|
|
128
|
+
if f.langevin_position is not None:
|
|
129
|
+
continue
|
|
130
|
+
created = datetime.fromisoformat(
|
|
131
|
+
f.created_at.replace("Z", "+00:00")
|
|
132
|
+
) if f.created_at else datetime.now(UTC)
|
|
133
|
+
age_days = max(
|
|
134
|
+
0.0,
|
|
135
|
+
(datetime.now(UTC) - created).total_seconds() / 86400.0,
|
|
136
|
+
)
|
|
137
|
+
# Strategy B: metadata-aware seed position
|
|
138
|
+
position = _seed_langevin_position(
|
|
139
|
+
f.access_count, age_days, f.importance,
|
|
140
|
+
config.math.langevin_temperature, _LANGEVIN_DIM,
|
|
141
|
+
)
|
|
142
|
+
# Strategy C: burn-in from the seeded position
|
|
143
|
+
for step_i in range(_BACKFILL_BURN_IN_STEPS):
|
|
144
|
+
position, _ = ld.step(
|
|
145
|
+
position, f.access_count, age_days, f.importance,
|
|
146
|
+
)
|
|
147
|
+
weight = ld.compute_lifecycle_weight(position)
|
|
148
|
+
lifecycle = ld.get_lifecycle_state(weight).value
|
|
149
|
+
db.update_fact(f.fact_id, {
|
|
150
|
+
"langevin_position": position,
|
|
151
|
+
"lifecycle": lifecycle,
|
|
152
|
+
})
|
|
153
|
+
f.langevin_position = position # update in-memory for step 1b
|
|
154
|
+
backfilled += 1
|
|
155
|
+
|
|
156
|
+
counts["langevin_backfilled"] = backfilled
|
|
157
|
+
if backfilled:
|
|
158
|
+
logger.info("Langevin backfill: %d facts initialized", backfilled)
|
|
159
|
+
except Exception as exc:
|
|
160
|
+
logger.warning("Langevin backfill failed: %s", exc)
|
|
161
|
+
|
|
162
|
+
# 1b. Langevin batch step on all positioned facts
|
|
57
163
|
if config.math.langevin_persist_positions:
|
|
58
164
|
try:
|
|
59
165
|
from superlocalmemory.math.langevin import LangevinDynamics
|
|
60
166
|
|
|
61
167
|
ld = LangevinDynamics(
|
|
62
|
-
dim=
|
|
168
|
+
dim=_LANGEVIN_DIM,
|
|
63
169
|
dt=config.math.langevin_dt,
|
|
64
170
|
temperature=config.math.langevin_temperature,
|
|
65
171
|
)
|
|
@@ -165,8 +271,8 @@ def run_maintenance(
|
|
|
165
271
|
logger.warning("Sheaf maintenance failed: %s", exc)
|
|
166
272
|
|
|
167
273
|
logger.info(
|
|
168
|
-
"Maintenance complete: %d Langevin, %d Fisher-coupled, %d Sheaf",
|
|
169
|
-
counts["langevin_updated"], counts["fisher_coupled"],
|
|
170
|
-
counts["sheaf_checked"],
|
|
274
|
+
"Maintenance complete: %d backfilled, %d Langevin, %d Fisher-coupled, %d Sheaf",
|
|
275
|
+
counts["langevin_backfilled"], counts["langevin_updated"],
|
|
276
|
+
counts["fisher_coupled"], counts["sheaf_checked"],
|
|
171
277
|
)
|
|
172
278
|
return counts
|
|
@@ -25,6 +25,25 @@ from superlocalmemory.storage.models import (
|
|
|
25
25
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
27
27
|
|
|
28
|
+
# Langevin initialization radius for new facts (ACTIVE zone < 0.3)
|
|
29
|
+
_INIT_LANGEVIN_RADIUS = 0.05
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _init_langevin_position(dim: int = 8) -> list[float]:
|
|
33
|
+
"""Initialize Langevin position near origin for a new fact.
|
|
34
|
+
|
|
35
|
+
Small random perturbation ensures each fact gets a unique position
|
|
36
|
+
while staying deep in the ACTIVE zone (radius < 0.3).
|
|
37
|
+
"""
|
|
38
|
+
import numpy as np
|
|
39
|
+
rng = np.random.default_rng()
|
|
40
|
+
direction = rng.standard_normal(dim)
|
|
41
|
+
norm = float(np.linalg.norm(direction))
|
|
42
|
+
if norm < 1e-8:
|
|
43
|
+
direction = np.ones(dim)
|
|
44
|
+
norm = float(np.linalg.norm(direction))
|
|
45
|
+
return (direction / norm * _INIT_LANGEVIN_RADIUS).tolist()
|
|
46
|
+
|
|
28
47
|
|
|
29
48
|
# ---------------------------------------------------------------------------
|
|
30
49
|
# enrich_fact (was MemoryEngine._enrich_fact)
|
|
@@ -59,6 +78,10 @@ def enrich_fact(
|
|
|
59
78
|
emotion = tag_emotion(fact.content)
|
|
60
79
|
signal = infer_signal(fact.content)
|
|
61
80
|
|
|
81
|
+
# Strategy A: initialize Langevin position near origin (ACTIVE zone).
|
|
82
|
+
# New facts start as ACTIVE; dynamics will evolve them based on access patterns.
|
|
83
|
+
langevin_pos = _init_langevin_position(dim=8)
|
|
84
|
+
|
|
62
85
|
return AtomicFact(
|
|
63
86
|
fact_id=fact.fact_id, memory_id=record.memory_id,
|
|
64
87
|
profile_id=profile_id, content=fact.content,
|
|
@@ -73,6 +96,7 @@ def enrich_fact(
|
|
|
73
96
|
evidence_count=fact.evidence_count,
|
|
74
97
|
source_turn_ids=fact.source_turn_ids, session_id=record.session_id,
|
|
75
98
|
embedding=embedding, fisher_mean=fisher_mean, fisher_variance=fisher_variance,
|
|
99
|
+
langevin_position=langevin_pos,
|
|
76
100
|
emotional_valence=emotion.valence, emotional_arousal=emotion.arousal,
|
|
77
101
|
signal_type=signal, created_at=fact.created_at,
|
|
78
102
|
)
|
|
@@ -142,8 +142,15 @@ class WorkerPool:
|
|
|
142
142
|
# ------------------------------------------------------------------
|
|
143
143
|
|
|
144
144
|
def _send(self, request: dict) -> dict:
|
|
145
|
-
"""Send request to worker and get response. Thread-safe.
|
|
146
|
-
|
|
145
|
+
"""Send request to worker and get response. Thread-safe.
|
|
146
|
+
|
|
147
|
+
Auto-retries once on worker death (idle timeout, crash).
|
|
148
|
+
"""
|
|
149
|
+
resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
|
|
150
|
+
if not resp.get("ok") and "Worker" in resp.get("error", ""):
|
|
151
|
+
logger.info("Auto-restarting worker after failure, retrying request")
|
|
152
|
+
resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
|
|
153
|
+
return resp
|
|
147
154
|
|
|
148
155
|
def _send_with_timeout(self, request: dict, timeout: float) -> dict:
|
|
149
156
|
"""Send request with configurable timeout. Thread-safe."""
|
|
@@ -214,11 +214,17 @@ class CognitiveConsolidator:
|
|
|
214
214
|
# Public API
|
|
215
215
|
# ------------------------------------------------------------------
|
|
216
216
|
|
|
217
|
-
def run_pipeline(
|
|
217
|
+
def run_pipeline(
|
|
218
|
+
self, profile_id: str, dry_run: bool = False,
|
|
219
|
+
) -> CCQPipelineResult:
|
|
218
220
|
"""Execute the full 6-step CCQ pipeline.
|
|
219
221
|
|
|
220
222
|
Per-cluster error isolation: one cluster failure does NOT
|
|
221
223
|
abort the pipeline (HR-07).
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
profile_id: Target profile.
|
|
227
|
+
dry_run: If True, identify clusters but don't apply changes.
|
|
222
228
|
"""
|
|
223
229
|
# Step 1: Identify candidates
|
|
224
230
|
candidates = self._step1_identify(profile_id)
|
|
@@ -230,6 +236,18 @@ class CognitiveConsolidator:
|
|
|
230
236
|
if not clusters:
|
|
231
237
|
return self._empty_result()
|
|
232
238
|
|
|
239
|
+
if dry_run:
|
|
240
|
+
return CCQPipelineResult(
|
|
241
|
+
clusters_processed=len(clusters),
|
|
242
|
+
blocks_created=0,
|
|
243
|
+
facts_archived=len(candidates),
|
|
244
|
+
total_bytes_before=0,
|
|
245
|
+
total_bytes_after=0,
|
|
246
|
+
compression_ratio=0.0,
|
|
247
|
+
audit_entries=(),
|
|
248
|
+
errors=(),
|
|
249
|
+
)
|
|
250
|
+
|
|
233
251
|
# Process each cluster
|
|
234
252
|
blocks_created = 0
|
|
235
253
|
facts_archived = 0
|
|
@@ -30,8 +30,11 @@ def _get_vader():
|
|
|
30
30
|
if _vader_analyzer is not None:
|
|
31
31
|
return _vader_analyzer
|
|
32
32
|
try:
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
import warnings
|
|
34
|
+
with warnings.catch_warnings():
|
|
35
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning, module="vaderSentiment")
|
|
36
|
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
37
|
+
_vader_analyzer = SentimentIntensityAnalyzer()
|
|
35
38
|
except ImportError:
|
|
36
39
|
logger.warning("vaderSentiment not installed — emotional tagging disabled")
|
|
37
40
|
_vader_analyzer = None
|
|
@@ -103,7 +103,9 @@ class PolarQuantEncoder:
|
|
|
103
103
|
"""
|
|
104
104
|
path_str = self._config.rotation_matrix_path
|
|
105
105
|
if not path_str:
|
|
106
|
-
path_str = str(
|
|
106
|
+
path_str = str(
|
|
107
|
+
Path.home() / ".superlocalmemory" / f"polar_rotation_{self._d}.npy",
|
|
108
|
+
)
|
|
107
109
|
|
|
108
110
|
path = Path(path_str)
|
|
109
111
|
|
|
@@ -83,6 +83,10 @@ class RetrievalEngine:
|
|
|
83
83
|
self._bridge = bridge_discovery
|
|
84
84
|
self._trust_scorer = trust_scorer
|
|
85
85
|
|
|
86
|
+
# V3.3.4: LRU cache for query embeddings (avoids redundant Ollama API calls)
|
|
87
|
+
self._query_embedding_cache: dict[str, list[float]] = {}
|
|
88
|
+
self._cache_max_size = 64
|
|
89
|
+
|
|
86
90
|
# V3.2: ChannelRegistry for self-registration (Phase 0.5)
|
|
87
91
|
from superlocalmemory.retrieval.channel_registry import ChannelRegistry
|
|
88
92
|
self._registry = ChannelRegistry()
|
|
@@ -189,6 +193,21 @@ class RetrievalEngine:
|
|
|
189
193
|
|
|
190
194
|
# -- Channel execution --------------------------------------------------
|
|
191
195
|
|
|
196
|
+
def _embed_query(self, query: str) -> list[float] | None:
|
|
197
|
+
"""Embed query with LRU cache. Avoids redundant Ollama/API calls."""
|
|
198
|
+
if self._embedder is None:
|
|
199
|
+
return None
|
|
200
|
+
cached = self._query_embedding_cache.get(query)
|
|
201
|
+
if cached is not None:
|
|
202
|
+
return cached
|
|
203
|
+
emb = self._embedder.embed(query)
|
|
204
|
+
# Evict oldest if cache full
|
|
205
|
+
if len(self._query_embedding_cache) >= self._cache_max_size:
|
|
206
|
+
oldest = next(iter(self._query_embedding_cache))
|
|
207
|
+
del self._query_embedding_cache[oldest]
|
|
208
|
+
self._query_embedding_cache[query] = emb
|
|
209
|
+
return emb
|
|
210
|
+
|
|
192
211
|
def _run_channels(
|
|
193
212
|
self, query: str, profile_id: str, strat: QueryStrategy,
|
|
194
213
|
) -> dict[str, list[tuple[str, float]]]:
|
|
@@ -197,9 +216,20 @@ class RetrievalEngine:
|
|
|
197
216
|
# Skip channels listed in disabled_channels (ablation support)
|
|
198
217
|
disabled = set(self._config.disabled_channels)
|
|
199
218
|
|
|
200
|
-
|
|
219
|
+
# V3.3.4: Embed query ONCE, reuse for semantic + hopfield channels
|
|
220
|
+
q_emb: list[float] | None = None
|
|
221
|
+
needs_embedding = (
|
|
222
|
+
(self._semantic is not None and "semantic" not in disabled)
|
|
223
|
+
or (self._hopfield is not None and "hopfield" not in disabled)
|
|
224
|
+
)
|
|
225
|
+
if needs_embedding:
|
|
226
|
+
try:
|
|
227
|
+
q_emb = self._embed_query(query)
|
|
228
|
+
except Exception as exc:
|
|
229
|
+
logger.warning("Query embedding failed: %s", exc)
|
|
230
|
+
|
|
231
|
+
if self._semantic is not None and q_emb is not None and "semantic" not in disabled:
|
|
201
232
|
try:
|
|
202
|
-
q_emb = self._embedder.embed(query)
|
|
203
233
|
r = self._semantic.search(q_emb, profile_id, self._config.semantic_top_k)
|
|
204
234
|
if r:
|
|
205
235
|
out["semantic"] = r
|
|
@@ -231,13 +261,11 @@ class RetrievalEngine:
|
|
|
231
261
|
logger.warning("Temporal channel: %s", exc)
|
|
232
262
|
|
|
233
263
|
# Phase G: Hopfield channel (6th) — energy-based pattern completion
|
|
234
|
-
if self._hopfield is not None and "hopfield" not in disabled:
|
|
264
|
+
if self._hopfield is not None and q_emb is not None and "hopfield" not in disabled:
|
|
235
265
|
try:
|
|
236
|
-
|
|
237
|
-
if
|
|
238
|
-
|
|
239
|
-
if r:
|
|
240
|
-
out["hopfield"] = r
|
|
266
|
+
r = self._hopfield.search(q_emb, profile_id, self._config.hopfield_top_k)
|
|
267
|
+
if r:
|
|
268
|
+
out["hopfield"] = r
|
|
241
269
|
except Exception as exc:
|
|
242
270
|
logger.warning("Hopfield channel: %s", exc)
|
|
243
271
|
|
|
@@ -2,10 +2,13 @@
|
|
|
2
2
|
# Licensed under the MIT License - see LICENSE file
|
|
3
3
|
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
4
|
|
|
5
|
-
"""SuperLocalMemory V3 — Cross-Encoder Reranker.
|
|
5
|
+
"""SuperLocalMemory V3 — Cross-Encoder Reranker (Subprocess-Isolated).
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
V3.3.3: All PyTorch/ONNX model work runs in a SEPARATE subprocess.
|
|
8
|
+
The main process (dashboard, MCP, CLI) NEVER imports torch and stays
|
|
9
|
+
at ~60 MB. Same isolation pattern as EmbeddingService.
|
|
10
|
+
|
|
11
|
+
The worker subprocess auto-kills after 2 minutes idle.
|
|
9
12
|
|
|
10
13
|
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
11
14
|
License: MIT
|
|
@@ -13,49 +16,33 @@ License: MIT
|
|
|
13
16
|
|
|
14
17
|
from __future__ import annotations
|
|
15
18
|
|
|
19
|
+
import json
|
|
16
20
|
import logging
|
|
17
|
-
import platform
|
|
18
|
-
import struct
|
|
21
|
+
import os
|
|
22
|
+
import subprocess
|
|
19
23
|
import sys
|
|
20
24
|
import threading
|
|
25
|
+
import time
|
|
21
26
|
from typing import Any
|
|
22
27
|
|
|
23
28
|
from superlocalmemory.storage.models import AtomicFact
|
|
24
29
|
|
|
25
30
|
logger = logging.getLogger(__name__)
|
|
26
31
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
Returns the file_name parameter for CrossEncoder model_kwargs.
|
|
32
|
-
Platform detection:
|
|
33
|
-
- macOS ARM64 (Apple Silicon): qint8_arm64
|
|
34
|
-
- x86_64 with AVX2: quint8_avx2
|
|
35
|
-
- Everything else: default model.onnx (float32, works everywhere)
|
|
36
|
-
"""
|
|
37
|
-
arch = platform.machine().lower()
|
|
38
|
-
is_64bit = struct.calcsize("P") * 8 == 64
|
|
39
|
-
|
|
40
|
-
if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
|
|
41
|
-
return "onnx/model_qint8_arm64.onnx"
|
|
42
|
-
|
|
43
|
-
if arch in ("x86_64", "amd64") and is_64bit:
|
|
44
|
-
return "onnx/model_quint8_avx2.onnx"
|
|
45
|
-
|
|
46
|
-
return "onnx/model.onnx"
|
|
32
|
+
_IDLE_TIMEOUT_SECONDS = 120 # 2 min → kill worker
|
|
33
|
+
_SUBPROCESS_RESPONSE_TIMEOUT = 120 # 120s for ONNX cold start
|
|
34
|
+
_WORKER_RECYCLE_AFTER = 500 # Recycle after N requests
|
|
47
35
|
|
|
48
36
|
|
|
49
37
|
class CrossEncoderReranker:
|
|
50
38
|
"""Rerank candidate facts using a local cross-encoder model.
|
|
51
39
|
|
|
52
|
-
V3.3.
|
|
53
|
-
|
|
54
|
-
|
|
40
|
+
V3.3.3: SUBPROCESS-ISOLATED. The main process never imports
|
|
41
|
+
sentence_transformers or torch. All model work runs in a child
|
|
42
|
+
process via JSON over stdin/stdout.
|
|
55
43
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
original score order — never crashes.
|
|
44
|
+
Non-blocking first-use: triggers background worker spawn, returns
|
|
45
|
+
fallback scores until worker is ready.
|
|
59
46
|
|
|
60
47
|
Args:
|
|
61
48
|
model_name: HuggingFace cross-encoder model identifier.
|
|
@@ -70,106 +57,207 @@ class CrossEncoderReranker:
|
|
|
70
57
|
) -> None:
|
|
71
58
|
self._model_name = model_name
|
|
72
59
|
self._backend = backend
|
|
73
|
-
self.
|
|
74
|
-
self.
|
|
75
|
-
self.
|
|
76
|
-
self._active_backend: str = ""
|
|
60
|
+
self._worker_proc: subprocess.Popen | None = None
|
|
61
|
+
self._model_loaded = False # True once worker confirms model is ready
|
|
62
|
+
self._worker_loading = False # True while background warmup in progress
|
|
77
63
|
self._lock = threading.Lock()
|
|
64
|
+
self._idle_timer: threading.Timer | None = None
|
|
65
|
+
self._request_count: int = 0
|
|
66
|
+
|
|
67
|
+
# Start background warmup immediately — worker loads model
|
|
68
|
+
# while the rest of init continues. First recall gets instant
|
|
69
|
+
# fallback; second recall uses the warm model.
|
|
70
|
+
self._start_background_warmup()
|
|
78
71
|
|
|
79
72
|
# ------------------------------------------------------------------
|
|
80
|
-
#
|
|
73
|
+
# Background warmup (non-blocking model load)
|
|
81
74
|
# ------------------------------------------------------------------
|
|
82
75
|
|
|
83
|
-
def
|
|
84
|
-
"""
|
|
85
|
-
|
|
86
|
-
On first call, starts loading in a background thread and returns
|
|
87
|
-
immediately. The model becomes available for subsequent calls
|
|
88
|
-
once loading completes. This prevents the 30s ONNX cold start
|
|
89
|
-
from blocking the first recall request.
|
|
76
|
+
def _start_background_warmup(self) -> None:
    """Start the worker and load the model on a background thread.

    Returns immediately. Does nothing when a warmup is already in
    progress or the model is already marked as loaded.

    The background thread spawns the worker if needed, sends a ``load``
    command carrying the configured model name and backend, and waits up
    to ``_SUBPROCESS_RESPONSE_TIMEOUT`` seconds for the reply. When the
    reply has a truthy ``ok`` field, ``_model_loaded`` is set and the idle
    timer is armed. Any failure is logged at debug level and leaves
    ``_model_loaded`` unset.
    """
    if self._worker_loading or self._model_loaded:
        return
    self._worker_loading = True

    def _warmup() -> None:
        try:
            # Hold the same lock _send_request uses: the worker speaks a
            # strict one-request/one-reply line protocol over a single
            # pipe pair, so the load exchange must not interleave with a
            # concurrent score/ping request.
            with self._lock:
                self._ensure_worker()
                proc = self._worker_proc
                if proc is None:
                    return
                # Send load command and wait for response
                req = json.dumps({
                    "cmd": "load",
                    "model_name": self._model_name,
                    "backend": self._backend,
                }) + "\n"
                proc.stdin.write(req)
                proc.stdin.flush()
                resp_line = self._readline_with_timeout(
                    proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
                )
                if not resp_line:
                    # Timeout or EOF. After a timeout a reader thread may
                    # still be blocked on stdout and would swallow the
                    # next reply, so discard the worker exactly as
                    # _send_request does in the same situation.
                    logger.debug(
                        "Background reranker warmup failed: %s",
                        "no response from worker",
                    )
                    self._kill_worker()
                    return
                resp = json.loads(resp_line)
                if resp.get("ok"):
                    self._model_loaded = True
                    logger.info(
                        "Reranker worker warm (backend=%s)",
                        resp.get("backend", "?"),
                    )
                    self._reset_idle_timer()
        except Exception as exc:
            # Warmup is best-effort: rerank() falls back to the existing
            # scores while the model is not loaded.
            logger.debug("Background reranker warmup failed: %s", exc)
        finally:
            self._worker_loading = False

    t = threading.Thread(target=_warmup, daemon=True, name="ce-warmup")
    t.start()
|
|
98
118
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
self._loading = True
|
|
119
|
+
# ------------------------------------------------------------------
|
|
120
|
+
# Worker management (mirrors EmbeddingService pattern)
|
|
121
|
+
# ------------------------------------------------------------------
|
|
103
122
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
123
|
+
def _ensure_worker(self) -> None:
    """Make sure a live worker subprocess exists, spawning one if needed.

    A worker whose ``poll()`` still returns ``None`` is reused. Otherwise
    the stale handle is dropped and ``superlocalmemory.core.reranker_worker``
    is launched with the current interpreter, talking over line-buffered
    text pipes. Spawn failures are logged as warnings and leave
    ``_worker_proc`` set to ``None``; nothing is raised.
    """
    existing = self._worker_proc
    if existing is not None and existing.poll() is None:
        return

    # No usable worker: forget the old handle before trying to respawn.
    self._worker_proc = None
    self._worker_ready = False

    worker_module = "superlocalmemory.core.reranker_worker"
    try:
        # Inherit the parent's environment, then apply overrides.
        # NOTE(review): judging by their names these steer the worker to
        # CPU-only execution and quieter tokenizers; confirm against the
        # worker module.
        child_env = dict(os.environ)
        child_env.update({
            "CUDA_VISIBLE_DEVICES": "",
            "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
            "PYTORCH_MPS_MEM_LIMIT": "0",
            "PYTORCH_ENABLE_MPS_FALLBACK": "1",
            "TOKENIZERS_PARALLELISM": "false",
            "TORCH_DEVICE": "cpu",
        })
        command = [sys.executable, "-m", worker_module]
        self._worker_proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,  # worker stderr is discarded
            text=True,
            bufsize=1,  # line-buffered in text mode
            env=child_env,
            start_new_session=True,  # child gets its own session (POSIX)
        )
        logger.info(
            "Reranker worker spawned (PID %d)", self._worker_proc.pid,
        )
        self._worker_ready = True
    except Exception as exc:
        logger.warning("Failed to spawn reranker worker: %s", exc)
        self._worker_proc = None
|
|
160
158
|
|
|
161
|
-
def
|
|
162
|
-
"""
|
|
159
|
+
def _send_request(self, req: dict, timeout: float | None = None) -> dict | None:
    """Send one JSON request to the worker and return its decoded reply.

    Requests are serialized with ``self._lock``, so this is safe to call
    from several threads.

    Args:
        req: JSON-serializable request; written as a single line.
        timeout: Seconds to wait for the reply line. ``None`` (or any
            falsy value such as ``0``) selects
            ``_SUBPROCESS_RESPONSE_TIMEOUT``.

    Returns:
        The decoded reply, or ``None`` when the worker could not be
        spawned, did not answer in time, or the exchange failed. In the
        latter two cases the worker is killed and ``_model_loaded`` is
        cleared.

    After ``_WORKER_RECYCLE_AFTER`` answered requests the worker is
    killed and a fresh one is spawned before this request is sent. A
    background warmup is then restarted once the lock is released.
    """
    effective_timeout = timeout or _SUBPROCESS_RESPONSE_TIMEOUT
    recycled = False

    try:
        with self._lock:
            if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
                logger.info("Recycling reranker worker after %d requests", self._request_count)
                self._kill_worker()
                self._model_loaded = False
                self._request_count = 0
                recycled = True

            # Ensure worker is alive (re-spawn if crashed)
            if self._worker_proc is None or self._worker_proc.poll() is not None:
                self._ensure_worker()
            if self._worker_proc is None:
                return None

            try:
                msg = json.dumps(req) + "\n"
                self._worker_proc.stdin.write(msg)
                self._worker_proc.stdin.flush()

                resp_line = self._readline_with_timeout(
                    self._worker_proc.stdout,
                    effective_timeout,
                )
                if not resp_line:
                    # Empty string means timeout (or EOF): the pipe state
                    # is no longer trustworthy, so drop the worker.
                    logger.warning("Reranker worker timed out after %ds", effective_timeout)
                    self._kill_worker()
                    self._model_loaded = False
                    return None

                resp = json.loads(resp_line)
                self._reset_idle_timer()
                self._request_count += 1
                return resp
            except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                logger.warning("Reranker worker communication failed: %s", exc)
                self._kill_worker()
                self._model_loaded = False
                return None
    finally:
        # Recycling cleared _model_loaded, and only the warmup sets it
        # again. Without re-warming, rerank() would keep returning the
        # unreranked fallback forever after the first recycle. Runs after
        # the lock is released so the warmup thread can use the pipes.
        if recycled:
            self._start_background_warmup()
|
|
205
|
+
|
|
206
|
+
@staticmethod
|
|
207
|
+
def _readline_with_timeout(stream: Any, timeout_seconds: float) -> str:
|
|
208
|
+
"""Read a line from stream with timeout. Returns '' on timeout."""
|
|
209
|
+
result_container: list[str] = []
|
|
210
|
+
error_container: list[Exception] = []
|
|
211
|
+
|
|
212
|
+
def _read() -> None:
|
|
213
|
+
try:
|
|
214
|
+
result_container.append(stream.readline())
|
|
215
|
+
except Exception as exc:
|
|
216
|
+
error_container.append(exc)
|
|
217
|
+
|
|
218
|
+
reader = threading.Thread(target=_read, daemon=True)
|
|
219
|
+
reader.start()
|
|
220
|
+
reader.join(timeout=timeout_seconds)
|
|
221
|
+
|
|
222
|
+
if reader.is_alive():
|
|
223
|
+
return ""
|
|
224
|
+
if error_container:
|
|
225
|
+
raise error_container[0]
|
|
226
|
+
return result_container[0] if result_container else ""
|
|
227
|
+
|
|
228
|
+
def _kill_worker(self) -> None:
|
|
229
|
+
"""Terminate worker subprocess."""
|
|
230
|
+
if self._idle_timer is not None:
|
|
231
|
+
self._idle_timer.cancel()
|
|
232
|
+
self._idle_timer = None
|
|
233
|
+
if self._worker_proc is not None:
|
|
234
|
+
try:
|
|
235
|
+
self._worker_proc.stdin.write('{"cmd":"quit"}\n')
|
|
236
|
+
self._worker_proc.stdin.flush()
|
|
237
|
+
self._worker_proc.wait(timeout=3)
|
|
238
|
+
except Exception:
|
|
239
|
+
try:
|
|
240
|
+
self._worker_proc.kill()
|
|
241
|
+
except Exception:
|
|
242
|
+
pass
|
|
243
|
+
self._worker_proc = None
|
|
244
|
+
self._worker_ready = False
|
|
245
|
+
|
|
246
|
+
def _reset_idle_timer(self) -> None:
    """Re-arm the idle timer.

    Cancels the pending timer, if any, and starts a new daemon timer that
    calls ``unload`` after ``_IDLE_TIMEOUT_SECONDS`` seconds.
    """
    pending = self._idle_timer
    if pending is not None:
        pending.cancel()

    fresh = threading.Timer(_IDLE_TIMEOUT_SECONDS, self.unload)
    self._idle_timer = fresh
    fresh.daemon = True
    fresh.start()
|
|
255
|
+
|
|
256
|
+
def unload(self) -> None:
    """Terminate the worker subprocess so its memory is released.

    Takes ``self._lock`` so the shutdown cannot overlap with a request
    in ``_send_request``. This is also the callback fired by the idle
    timer.
    """
    with self._lock:
        self._kill_worker()
    logger.info("CrossEncoderReranker: worker killed (idle timeout)")
|
|
173
261
|
|
|
174
262
|
# ------------------------------------------------------------------
|
|
175
263
|
# Public API
|
|
@@ -183,73 +271,62 @@ class CrossEncoderReranker:
|
|
|
183
271
|
) -> list[tuple[AtomicFact, float]]:
|
|
184
272
|
"""Rerank candidates by cross-encoder relevance.
|
|
185
273
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
Args:
|
|
193
|
-
query: User query text.
|
|
194
|
-
candidates: List of (AtomicFact, score) tuples from the
|
|
195
|
-
fusion stage.
|
|
196
|
-
top_k: Maximum results to return.
|
|
197
|
-
|
|
198
|
-
Returns:
|
|
199
|
-
Top-k (AtomicFact, cross_encoder_score) tuples, sorted
|
|
200
|
-
descending by cross-encoder score.
|
|
274
|
+
NON-BLOCKING: If the worker is still loading the model
|
|
275
|
+
(background warmup), returns candidates by existing score
|
|
276
|
+
immediately. Once the worker is warm, subsequent calls use
|
|
277
|
+
the cross-encoder. This means CLI first-call gets instant
|
|
278
|
+
results (without reranking), and MCP gets reranked results
|
|
279
|
+
(worker stays warm between calls).
|
|
201
280
|
"""
|
|
202
281
|
if not candidates:
|
|
203
282
|
return []
|
|
204
283
|
|
|
205
|
-
# Non-blocking:
|
|
206
|
-
self.
|
|
207
|
-
|
|
208
|
-
if self._model is None:
|
|
209
|
-
# Model not loaded yet (still loading in background or failed).
|
|
210
|
-
# Graceful fallback: return candidates sorted by existing score.
|
|
211
|
-
# Next recall will use the model once it's ready.
|
|
212
|
-
sorted_cands = sorted(
|
|
213
|
-
candidates, key=lambda x: x[1], reverse=True
|
|
214
|
-
)
|
|
284
|
+
# Non-blocking: if model isn't loaded yet, return fallback
|
|
285
|
+
if not self._model_loaded:
|
|
286
|
+
sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
|
|
215
287
|
return sorted_cands[:top_k]
|
|
216
288
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
289
|
+
documents = [fact.content for fact, _ in candidates]
|
|
290
|
+
|
|
291
|
+
# Short timeout (10s) — model should already be loaded by warmup.
|
|
292
|
+
# If worker crashed or is still loading, fallback immediately.
|
|
293
|
+
resp = self._send_request({
|
|
294
|
+
"cmd": "rerank",
|
|
295
|
+
"query": query,
|
|
296
|
+
"documents": documents,
|
|
297
|
+
}, timeout=10.0)
|
|
221
298
|
|
|
222
|
-
|
|
299
|
+
if resp is None or not resp.get("ok"):
|
|
300
|
+
# Fallback: return by existing score
|
|
301
|
+
sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
|
|
302
|
+
return sorted_cands[:top_k]
|
|
223
303
|
|
|
304
|
+
scores = resp["scores"]
|
|
224
305
|
scored: list[tuple[AtomicFact, float]] = [
|
|
225
306
|
(fact, float(score))
|
|
226
307
|
for (fact, _), score in zip(candidates, scores)
|
|
227
308
|
]
|
|
228
|
-
|
|
229
309
|
scored.sort(key=lambda x: x[1], reverse=True)
|
|
230
310
|
return scored[:top_k]
|
|
231
311
|
|
|
232
312
|
def score_pair(self, query: str, document: str) -> float:
    """Return the worker's score for one (query, document) pair.

    Sends a ``score`` command that also carries the configured model
    name and backend. Returns ``0.0`` when no reply is received, the
    reply is not ``ok``, or it carries no ``score`` field.
    """
    payload = {
        "cmd": "score",
        "query": query,
        "document": document,
        "model_name": self._model_name,
        "backend": self._backend,
    }
    reply = self._send_request(payload)

    if reply is not None and reply.get("ok"):
        return float(reply.get("score", 0.0))
    return 0.0
|
|
250
325
|
|
|
251
326
|
@property
|
|
252
327
|
def is_available(self) -> bool:
|
|
253
|
-
"""Whether the cross-encoder
|
|
254
|
-
self.
|
|
255
|
-
|
|
328
|
+
"""Whether the cross-encoder worker can be spawned."""
|
|
329
|
+
resp = self._send_request({"cmd": "ping"})
|
|
330
|
+
if resp is None:
|
|
331
|
+
return False
|
|
332
|
+
return resp.get("ok", False)
|
|
@@ -36,11 +36,12 @@ _REINDEX_BATCH_SIZE = 50
|
|
|
36
36
|
def _model_signature(config: SLMConfig) -> str:
|
|
37
37
|
"""Derive a deterministic signature from the active embedding config.
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
V3.3.4: Only model_name + dimension matter. Provider (sentence-transformers
|
|
40
|
+
vs ollama) doesn't change the embedding space when the model is the same.
|
|
41
|
+
This prevents spurious re-indexing when switching Mode A ↔ B.
|
|
41
42
|
"""
|
|
42
43
|
emb = config.embedding
|
|
43
|
-
return f"{emb.
|
|
44
|
+
return f"{emb.model_name}::{emb.dimension}"
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
def _read_stored_signature(config_dir: Path) -> str:
|