superlocalmemory 3.0.16 → 3.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/slm-npm +8 -0
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/cli/commands.py +29 -0
- package/src/superlocalmemory/cli/main.py +94 -30
- package/src/superlocalmemory/core/embedding_worker.py +120 -0
- package/src/superlocalmemory/core/embeddings.py +156 -240
- package/src/superlocalmemory/core/recall_worker.py +193 -0
- package/src/superlocalmemory/core/summarizer.py +182 -0
- package/src/superlocalmemory/core/worker_pool.py +209 -0
- package/src/superlocalmemory/mcp/server.py +9 -0
- package/src/superlocalmemory/mcp/tools_core.py +21 -8
- package/src/superlocalmemory/mcp/tools_v3.py +21 -0
- package/src/superlocalmemory/server/routes/helpers.py +21 -0
- package/src/superlocalmemory/server/routes/memories.py +100 -42
- package/src/superlocalmemory/server/routes/stats.py +11 -0
- package/src/superlocalmemory/server/routes/v3_api.py +195 -43
- package/src/superlocalmemory/server/ui.py +15 -14
- package/src/superlocalmemory/storage/database.py +23 -0
- package/src/superlocalmemory.egg-info/PKG-INFO +1 -1
- package/src/superlocalmemory.egg-info/SOURCES.txt +4 -0
- package/ui/index.html +113 -29
- package/ui/js/auto-settings.js +330 -1
- package/ui/js/clusters.js +138 -101
- package/ui/js/graph-core.js +3 -1
- package/ui/js/graph-interactions.js +2 -5
- package/ui/js/memories.js +65 -2
- package/ui/js/modal.js +79 -42
- package/ui/js/recall-lab.js +206 -60
|
@@ -2,35 +2,29 @@
|
|
|
2
2
|
# Licensed under the MIT License - see LICENSE file
|
|
3
3
|
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
4
|
|
|
5
|
-
"""SuperLocalMemory V3 — Embedding Service.
|
|
5
|
+
"""SuperLocalMemory V3 — Embedding Service (Subprocess-Isolated).
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
on dimension mismatch — NEVER silently falls back to a different dimension.
|
|
7
|
+
All PyTorch/model work runs in a SEPARATE subprocess. The main process
|
|
8
|
+
(dashboard, MCP, CLI) never imports torch and stays at ~60 MB.
|
|
10
9
|
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
The worker subprocess auto-kills after 2 minutes idle, returning all
|
|
11
|
+
memory to the OS. It respawns on next embed call (~3 sec cold start).
|
|
13
12
|
|
|
14
13
|
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
15
14
|
"""
|
|
16
15
|
|
|
17
16
|
from __future__ import annotations
|
|
18
17
|
|
|
18
|
+
import json
|
|
19
19
|
import logging
|
|
20
20
|
import os
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
21
23
|
import threading
|
|
22
24
|
import time
|
|
25
|
+
from pathlib import Path
|
|
23
26
|
from typing import TYPE_CHECKING
|
|
24
27
|
|
|
25
|
-
# Force CPU before any torch/sentence-transformers import.
|
|
26
|
-
# On Apple Silicon, PyTorch defaults to Metal (MPS) which allocates 4-6 GB
|
|
27
|
-
# of GPU shader buffers that grow over time and never get released.
|
|
28
|
-
# On Windows/Linux with CUDA, similar GPU memory issues occur.
|
|
29
|
-
# CPU-only keeps footprint under 1 GB (vs 6+ GB with GPU).
|
|
30
|
-
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
|
|
31
|
-
os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
|
|
32
|
-
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
|
|
33
|
-
|
|
34
28
|
import numpy as np
|
|
35
29
|
|
|
36
30
|
if TYPE_CHECKING:
|
|
@@ -40,276 +34,215 @@ from superlocalmemory.core.config import EmbeddingConfig
|
|
|
40
34
|
|
|
41
35
|
logger = logging.getLogger(__name__)
|
|
42
36
|
|
|
43
|
-
# ---------------------------------------------------------------------------
|
|
44
37
|
# Fisher variance constants
|
|
45
|
-
# ---------------------------------------------------------------------------
|
|
46
38
|
_FISHER_VAR_MIN = 0.05
|
|
47
39
|
_FISHER_VAR_MAX = 2.0
|
|
48
|
-
_FISHER_VAR_RANGE = _FISHER_VAR_MAX - _FISHER_VAR_MIN
|
|
40
|
+
_FISHER_VAR_RANGE = _FISHER_VAR_MAX - _FISHER_VAR_MIN
|
|
49
41
|
|
|
50
42
|
|
|
51
43
|
class DimensionMismatchError(RuntimeError):
|
|
52
|
-
"""Raised when the actual embedding dimension differs from config.
|
|
53
|
-
|
|
54
|
-
This is a HARD failure — V1 silently fell back to local embeddings
|
|
55
|
-
when Azure failed, changing dimension from 3072 to 768 mid-run.
|
|
56
|
-
We crash loudly instead.
|
|
57
|
-
"""
|
|
44
|
+
"""Raised when the actual embedding dimension differs from config."""
|
|
58
45
|
|
|
59
46
|
|
|
60
|
-
_IDLE_TIMEOUT_SECONDS =
|
|
47
|
+
_IDLE_TIMEOUT_SECONDS = 120 # 2 minutes — kill worker after idle
|
|
61
48
|
|
|
62
49
|
|
|
63
50
|
class EmbeddingService:
|
|
64
|
-
"""
|
|
51
|
+
"""Subprocess-isolated embedding service.
|
|
65
52
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
53
|
+
All model inference runs in a child process. The main process never
|
|
54
|
+
imports torch/sentence-transformers, keeping its memory at ~60 MB.
|
|
55
|
+
|
|
56
|
+
The worker auto-kills after 2 min idle. First embed after idle takes
|
|
57
|
+
~3 sec (model reload). Subsequent embeds are instant (<100ms).
|
|
70
58
|
"""
|
|
71
59
|
|
|
72
60
|
def __init__(self, config: EmbeddingConfig) -> None:
|
|
73
61
|
self._config = config
|
|
74
|
-
self._model: object | None = None
|
|
75
62
|
self._lock = threading.Lock()
|
|
76
|
-
self.
|
|
77
|
-
self._available = True
|
|
63
|
+
self._worker_proc: subprocess.Popen | None = None
|
|
64
|
+
self._available = True
|
|
78
65
|
self._last_used: float = 0.0
|
|
79
66
|
self._idle_timer: threading.Timer | None = None
|
|
67
|
+
self._worker_ready = False
|
|
80
68
|
|
|
81
69
|
@property
|
|
82
70
|
def is_available(self) -> bool:
|
|
83
|
-
"""Check if embedding service
|
|
84
|
-
if
|
|
85
|
-
self.
|
|
86
|
-
return self._available
|
|
71
|
+
"""Check if embedding service can produce embeddings."""
|
|
72
|
+
if self._config.is_cloud:
|
|
73
|
+
return bool(self._config.api_endpoint and self._config.api_key)
|
|
74
|
+
return self._available
|
|
87
75
|
|
|
88
|
-
|
|
89
|
-
|
|
76
|
+
@property
|
|
77
|
+
def dimension(self) -> int:
|
|
78
|
+
return self._config.dimension
|
|
90
79
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
"""
|
|
80
|
+
def unload(self) -> None:
|
|
81
|
+
"""Kill the worker subprocess to free all memory."""
|
|
94
82
|
with self._lock:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
self._model = None
|
|
98
|
-
self._loaded = False
|
|
99
|
-
import gc
|
|
100
|
-
gc.collect()
|
|
101
|
-
logger.info("EmbeddingService: model unloaded (idle timeout)")
|
|
102
|
-
|
|
103
|
-
def _reset_idle_timer(self) -> None:
|
|
104
|
-
"""Reset the idle unload timer after each use."""
|
|
105
|
-
if self._idle_timer is not None:
|
|
106
|
-
self._idle_timer.cancel()
|
|
107
|
-
self._idle_timer = threading.Timer(
|
|
108
|
-
_IDLE_TIMEOUT_SECONDS, self.unload,
|
|
109
|
-
)
|
|
110
|
-
self._idle_timer.daemon = True
|
|
111
|
-
self._idle_timer.start()
|
|
112
|
-
self._last_used = time.time()
|
|
83
|
+
self._kill_worker()
|
|
84
|
+
logger.info("EmbeddingService: worker killed (idle timeout)")
|
|
113
85
|
|
|
114
86
|
# ------------------------------------------------------------------
|
|
115
87
|
# Public API
|
|
116
88
|
# ------------------------------------------------------------------
|
|
117
89
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
"""Expected embedding dimension (from config)."""
|
|
121
|
-
return self._config.dimension
|
|
122
|
-
|
|
123
|
-
def embed(self, text: str) -> list[float]:
|
|
124
|
-
"""Embed a single text string.
|
|
125
|
-
|
|
126
|
-
Returns:
|
|
127
|
-
L2-normalized embedding of exactly ``self.dimension`` floats.
|
|
128
|
-
|
|
129
|
-
Raises:
|
|
130
|
-
ValueError: If text is empty.
|
|
131
|
-
DimensionMismatchError: If output dimension != config.
|
|
132
|
-
"""
|
|
90
|
+
def embed(self, text: str) -> list[float] | None:
|
|
91
|
+
"""Embed a single text string. Returns list of floats or None."""
|
|
133
92
|
if not text or not text.strip():
|
|
134
93
|
raise ValueError("Cannot embed empty text")
|
|
135
|
-
self.
|
|
136
|
-
|
|
94
|
+
if self._config.is_cloud:
|
|
95
|
+
return self._cloud_embed_single(text)
|
|
96
|
+
result = self._subprocess_embed([text])
|
|
97
|
+
if result is None:
|
|
137
98
|
return None
|
|
138
|
-
vec =
|
|
139
|
-
self._validate_dimension(vec)
|
|
140
|
-
|
|
141
|
-
return vec.tolist()
|
|
142
|
-
|
|
143
|
-
def embed_batch(self, texts: list[str]) -> list[list[float]]:
|
|
144
|
-
"""Embed a batch of texts.
|
|
99
|
+
vec = result[0]
|
|
100
|
+
self._validate_dimension(np.asarray(vec))
|
|
101
|
+
return vec
|
|
145
102
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
Raises:
|
|
150
|
-
ValueError: If any text is empty or list is empty.
|
|
151
|
-
DimensionMismatchError: If any output dimension != config.
|
|
152
|
-
"""
|
|
103
|
+
def embed_batch(self, texts: list[str]) -> list[list[float] | None]:
|
|
104
|
+
"""Embed a batch of texts."""
|
|
153
105
|
if not texts:
|
|
154
106
|
raise ValueError("Cannot embed empty batch")
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
self._ensure_loaded()
|
|
160
|
-
if self._model is None:
|
|
107
|
+
if self._config.is_cloud:
|
|
108
|
+
return self._cloud_embed_batch(texts)
|
|
109
|
+
result = self._subprocess_embed(texts)
|
|
110
|
+
if result is None:
|
|
161
111
|
return [None] * len(texts)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
return [v.tolist() for v in vectors]
|
|
112
|
+
for vec in result:
|
|
113
|
+
if vec is not None:
|
|
114
|
+
self._validate_dimension(np.asarray(vec))
|
|
115
|
+
return result
|
|
167
116
|
|
|
168
117
|
def compute_fisher_params(
|
|
169
|
-
self,
|
|
170
|
-
embedding: list[float],
|
|
118
|
+
self, embedding: list[float],
|
|
171
119
|
) -> tuple[list[float], list[float]]:
|
|
172
|
-
"""Compute Fisher-Rao parameters from a raw embedding.
|
|
173
|
-
|
|
174
|
-
Variance is content-derived (NOT uniform). Dimensions with strong
|
|
175
|
-
signal (high absolute value) get LOW variance (high confidence).
|
|
176
|
-
Weak-signal dimensions get HIGH variance (uncertainty).
|
|
177
|
-
|
|
178
|
-
This heterogeneous variance is what gives Fisher-Rao metric
|
|
179
|
-
discriminative power beyond simple cosine similarity.
|
|
180
|
-
|
|
181
|
-
Args:
|
|
182
|
-
embedding: Raw embedding vector (already L2-normalized).
|
|
183
|
-
|
|
184
|
-
Returns:
|
|
185
|
-
(mean, variance) — both lists of ``self.dimension`` floats.
|
|
186
|
-
Variance values are clamped to [0.3, 2.0].
|
|
187
|
-
"""
|
|
120
|
+
"""Compute Fisher-Rao parameters from a raw embedding."""
|
|
188
121
|
arr = np.asarray(embedding, dtype=np.float64)
|
|
189
122
|
norm = float(np.linalg.norm(arr))
|
|
190
|
-
|
|
191
123
|
if norm < 1e-10:
|
|
192
124
|
mean = np.zeros(len(arr), dtype=np.float64)
|
|
193
125
|
variance = np.full(len(arr), _FISHER_VAR_MAX, dtype=np.float64)
|
|
194
126
|
return mean.tolist(), variance.tolist()
|
|
195
|
-
|
|
196
127
|
mean = arr / norm
|
|
197
|
-
|
|
198
|
-
# Content-derived heterogeneous variance
|
|
199
128
|
abs_mean = np.abs(mean)
|
|
200
129
|
max_val = float(np.max(abs_mean)) + 1e-10
|
|
201
|
-
signal_strength = abs_mean / max_val
|
|
202
|
-
|
|
203
|
-
# Inverse: strong signal -> low variance, weak -> high
|
|
130
|
+
signal_strength = abs_mean / max_val
|
|
204
131
|
variance = _FISHER_VAR_MAX - _FISHER_VAR_RANGE * signal_strength
|
|
205
132
|
variance = np.clip(variance, _FISHER_VAR_MIN, _FISHER_VAR_MAX)
|
|
206
|
-
|
|
207
133
|
return mean.tolist(), variance.tolist()
|
|
208
134
|
|
|
209
135
|
# ------------------------------------------------------------------
|
|
210
|
-
#
|
|
136
|
+
# Subprocess worker management
|
|
211
137
|
# ------------------------------------------------------------------
|
|
212
138
|
|
|
213
|
-
def
|
|
214
|
-
"""
|
|
215
|
-
if self._loaded:
|
|
216
|
-
return
|
|
139
|
+
def _subprocess_embed(self, texts: list[str]) -> list[list[float]] | None:
|
|
140
|
+
"""Send texts to worker subprocess, get embeddings back."""
|
|
217
141
|
with self._lock:
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
)
|
|
231
|
-
|
|
232
|
-
self.
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
142
|
+
self._ensure_worker()
|
|
143
|
+
if self._worker_proc is None:
|
|
144
|
+
return None
|
|
145
|
+
|
|
146
|
+
req = json.dumps({
|
|
147
|
+
"cmd": "embed",
|
|
148
|
+
"texts": texts,
|
|
149
|
+
"model_name": self._config.model_name,
|
|
150
|
+
"dimension": self._config.dimension,
|
|
151
|
+
}) + "\n"
|
|
152
|
+
|
|
153
|
+
try:
|
|
154
|
+
self._worker_proc.stdin.write(req)
|
|
155
|
+
self._worker_proc.stdin.flush()
|
|
156
|
+
resp_line = self._worker_proc.stdout.readline()
|
|
157
|
+
if not resp_line:
|
|
158
|
+
logger.warning("Worker returned empty response, restarting")
|
|
159
|
+
self._kill_worker()
|
|
160
|
+
return None
|
|
161
|
+
resp = json.loads(resp_line)
|
|
162
|
+
if not resp.get("ok"):
|
|
163
|
+
logger.warning("Worker error: %s", resp.get("error"))
|
|
164
|
+
return None
|
|
165
|
+
self._reset_idle_timer()
|
|
166
|
+
return resp["vectors"]
|
|
167
|
+
except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
|
|
168
|
+
logger.warning("Worker communication failed: %s", exc)
|
|
169
|
+
self._kill_worker()
|
|
170
|
+
return None
|
|
171
|
+
|
|
172
|
+
def _ensure_worker(self) -> None:
|
|
173
|
+
"""Spawn worker subprocess if not running."""
|
|
174
|
+
if self._worker_proc is not None and self._worker_proc.poll() is None:
|
|
175
|
+
return
|
|
176
|
+
self._worker_proc = None
|
|
177
|
+
worker_module = "superlocalmemory.core.embedding_worker"
|
|
243
178
|
try:
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
"
|
|
248
|
-
"
|
|
179
|
+
env = {
|
|
180
|
+
**os.environ,
|
|
181
|
+
"CUDA_VISIBLE_DEVICES": "",
|
|
182
|
+
"PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
|
|
183
|
+
"PYTORCH_MPS_MEM_LIMIT": "0",
|
|
184
|
+
"PYTORCH_ENABLE_MPS_FALLBACK": "1",
|
|
185
|
+
"TOKENIZERS_PARALLELISM": "false",
|
|
186
|
+
"TORCH_DEVICE": "cpu",
|
|
187
|
+
}
|
|
188
|
+
self._worker_proc = subprocess.Popen(
|
|
189
|
+
[sys.executable, "-m", worker_module],
|
|
190
|
+
stdin=subprocess.PIPE,
|
|
191
|
+
stdout=subprocess.PIPE,
|
|
192
|
+
stderr=subprocess.DEVNULL,
|
|
193
|
+
text=True,
|
|
194
|
+
bufsize=1,
|
|
195
|
+
env=env,
|
|
249
196
|
)
|
|
250
|
-
|
|
251
|
-
self.
|
|
197
|
+
logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
|
|
198
|
+
self._worker_ready = True
|
|
199
|
+
except Exception as exc:
|
|
200
|
+
logger.warning("Failed to spawn embedding worker: %s", exc)
|
|
252
201
|
self._available = False
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
202
|
+
self._worker_proc = None
|
|
203
|
+
|
|
204
|
+
def _kill_worker(self) -> None:
|
|
205
|
+
"""Terminate worker subprocess."""
|
|
206
|
+
if self._idle_timer is not None:
|
|
207
|
+
self._idle_timer.cancel()
|
|
208
|
+
self._idle_timer = None
|
|
209
|
+
if self._worker_proc is not None:
|
|
210
|
+
try:
|
|
211
|
+
self._worker_proc.stdin.write('{"cmd":"quit"}\n')
|
|
212
|
+
self._worker_proc.stdin.flush()
|
|
213
|
+
self._worker_proc.wait(timeout=3)
|
|
214
|
+
except Exception:
|
|
215
|
+
try:
|
|
216
|
+
self._worker_proc.kill()
|
|
217
|
+
except Exception:
|
|
218
|
+
pass
|
|
219
|
+
self._worker_proc = None
|
|
220
|
+
self._worker_ready = False
|
|
221
|
+
|
|
222
|
+
def _reset_idle_timer(self) -> None:
|
|
223
|
+
"""Reset idle timer — kills worker after 2 min inactivity."""
|
|
224
|
+
if self._idle_timer is not None:
|
|
225
|
+
self._idle_timer.cancel()
|
|
226
|
+
self._idle_timer = threading.Timer(
|
|
227
|
+
_IDLE_TIMEOUT_SECONDS, self.unload,
|
|
269
228
|
)
|
|
229
|
+
self._idle_timer.daemon = True
|
|
230
|
+
self._idle_timer.start()
|
|
231
|
+
self._last_used = time.time()
|
|
270
232
|
|
|
271
233
|
# ------------------------------------------------------------------
|
|
272
|
-
#
|
|
234
|
+
# Cloud embedding (no subprocess needed — just HTTP)
|
|
273
235
|
# ------------------------------------------------------------------
|
|
274
236
|
|
|
275
|
-
def
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
if self._config.is_cloud:
|
|
279
|
-
return self._cloud_embed([text])[0]
|
|
280
|
-
return self._local_embed_batch([text])[0]
|
|
237
|
+
def _cloud_embed_single(self, text: str) -> list[float]:
|
|
238
|
+
vecs = self._cloud_embed_batch([text])
|
|
239
|
+
return vecs[0]
|
|
281
240
|
|
|
282
|
-
def
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
return self._cloud_embed(texts)
|
|
287
|
-
return self._local_embed_batch(texts)
|
|
288
|
-
|
|
289
|
-
def _local_embed_batch(
|
|
290
|
-
self,
|
|
291
|
-
texts: list[str],
|
|
292
|
-
) -> list[NDArray[np.float32]]:
|
|
293
|
-
"""Encode via local sentence-transformers (L2-normalized)."""
|
|
294
|
-
if self._model is None:
|
|
295
|
-
raise RuntimeError("Local model not loaded")
|
|
296
|
-
vecs = self._model.encode(texts, normalize_embeddings=True)
|
|
297
|
-
if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
|
|
298
|
-
return [vecs[i] for i in range(vecs.shape[0])]
|
|
299
|
-
return [np.asarray(v, dtype=np.float32) for v in vecs]
|
|
300
|
-
|
|
301
|
-
def _cloud_embed(
|
|
302
|
-
self,
|
|
303
|
-
texts: list[str],
|
|
304
|
-
*,
|
|
305
|
-
max_retries: int = 3,
|
|
306
|
-
) -> list[NDArray[np.float32]]:
|
|
307
|
-
"""Encode via Azure OpenAI embedding API with retry logic.
|
|
308
|
-
|
|
309
|
-
Raises on failure — NEVER falls back to local model.
|
|
310
|
-
"""
|
|
241
|
+
def _cloud_embed_batch(
|
|
242
|
+
self, texts: list[str], *, max_retries: int = 3,
|
|
243
|
+
) -> list[list[float]]:
|
|
244
|
+
"""Encode via Azure OpenAI embedding API with retry."""
|
|
311
245
|
import httpx
|
|
312
|
-
|
|
313
246
|
url = (
|
|
314
247
|
f"{self._config.api_endpoint.rstrip('/')}/openai/deployments/"
|
|
315
248
|
f"{self._config.deployment_name}/embeddings"
|
|
@@ -320,7 +253,6 @@ class EmbeddingService:
|
|
|
320
253
|
"api-key": self._config.api_key,
|
|
321
254
|
}
|
|
322
255
|
body = {"input": texts, "model": self._config.deployment_name}
|
|
323
|
-
|
|
324
256
|
last_error: Exception | None = None
|
|
325
257
|
for attempt in range(max_retries):
|
|
326
258
|
try:
|
|
@@ -328,39 +260,23 @@ class EmbeddingService:
|
|
|
328
260
|
resp = client.post(url, headers=headers, json=body)
|
|
329
261
|
resp.raise_for_status()
|
|
330
262
|
data = resp.json()
|
|
331
|
-
results
|
|
263
|
+
results = []
|
|
332
264
|
for item in sorted(data["data"], key=lambda d: d["index"]):
|
|
333
|
-
|
|
334
|
-
results.append(vec)
|
|
265
|
+
results.append(item["embedding"])
|
|
335
266
|
return results
|
|
336
267
|
except Exception as exc:
|
|
337
268
|
last_error = exc
|
|
338
|
-
wait = 2 ** attempt # 1s, 2s, 4s
|
|
339
|
-
logger.warning(
|
|
340
|
-
"Cloud embed attempt %d/%d failed: %s (retry in %ds)",
|
|
341
|
-
attempt + 1,
|
|
342
|
-
max_retries,
|
|
343
|
-
exc,
|
|
344
|
-
wait,
|
|
345
|
-
)
|
|
346
269
|
if attempt < max_retries - 1:
|
|
347
|
-
time.sleep(
|
|
348
|
-
|
|
349
|
-
raise RuntimeError(
|
|
350
|
-
f"Cloud embedding failed after {max_retries} attempts: "
|
|
351
|
-
f"{last_error}"
|
|
352
|
-
)
|
|
270
|
+
time.sleep(2 ** attempt)
|
|
271
|
+
raise RuntimeError(f"Cloud embedding failed: {last_error}")
|
|
353
272
|
|
|
354
273
|
# ------------------------------------------------------------------
|
|
355
274
|
# Validation
|
|
356
275
|
# ------------------------------------------------------------------
|
|
357
276
|
|
|
358
277
|
def _validate_dimension(self, vec: NDArray) -> None:
|
|
359
|
-
"""Hard validation — crash on mismatch, never silently fall back."""
|
|
360
278
|
actual = len(vec)
|
|
361
279
|
if actual != self._config.dimension:
|
|
362
280
|
raise DimensionMismatchError(
|
|
363
|
-
f"Embedding dimension {actual} != "
|
|
364
|
-
f"expected {self._config.dimension}. "
|
|
365
|
-
f"This is a HARD failure — check your model/API config."
|
|
281
|
+
f"Embedding dimension {actual} != expected {self._config.dimension}"
|
|
366
282
|
)
|