spatial-memory-mcp 1.9.1 (spatial_memory_mcp-1.9.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatial_memory/__init__.py +97 -0
- spatial_memory/__main__.py +271 -0
- spatial_memory/adapters/__init__.py +7 -0
- spatial_memory/adapters/lancedb_repository.py +880 -0
- spatial_memory/config.py +769 -0
- spatial_memory/core/__init__.py +118 -0
- spatial_memory/core/cache.py +317 -0
- spatial_memory/core/circuit_breaker.py +297 -0
- spatial_memory/core/connection_pool.py +220 -0
- spatial_memory/core/consolidation_strategies.py +401 -0
- spatial_memory/core/database.py +3072 -0
- spatial_memory/core/db_idempotency.py +242 -0
- spatial_memory/core/db_indexes.py +576 -0
- spatial_memory/core/db_migrations.py +588 -0
- spatial_memory/core/db_search.py +512 -0
- spatial_memory/core/db_versioning.py +178 -0
- spatial_memory/core/embeddings.py +558 -0
- spatial_memory/core/errors.py +317 -0
- spatial_memory/core/file_security.py +701 -0
- spatial_memory/core/filesystem.py +178 -0
- spatial_memory/core/health.py +289 -0
- spatial_memory/core/helpers.py +79 -0
- spatial_memory/core/import_security.py +433 -0
- spatial_memory/core/lifecycle_ops.py +1067 -0
- spatial_memory/core/logging.py +194 -0
- spatial_memory/core/metrics.py +192 -0
- spatial_memory/core/models.py +660 -0
- spatial_memory/core/rate_limiter.py +326 -0
- spatial_memory/core/response_types.py +500 -0
- spatial_memory/core/security.py +588 -0
- spatial_memory/core/spatial_ops.py +430 -0
- spatial_memory/core/tracing.py +300 -0
- spatial_memory/core/utils.py +110 -0
- spatial_memory/core/validation.py +406 -0
- spatial_memory/factory.py +444 -0
- spatial_memory/migrations/__init__.py +40 -0
- spatial_memory/ports/__init__.py +11 -0
- spatial_memory/ports/repositories.py +630 -0
- spatial_memory/py.typed +0 -0
- spatial_memory/server.py +1214 -0
- spatial_memory/services/__init__.py +70 -0
- spatial_memory/services/decay_manager.py +411 -0
- spatial_memory/services/export_import.py +1031 -0
- spatial_memory/services/lifecycle.py +1139 -0
- spatial_memory/services/memory.py +412 -0
- spatial_memory/services/spatial.py +1152 -0
- spatial_memory/services/utility.py +429 -0
- spatial_memory/tools/__init__.py +5 -0
- spatial_memory/tools/definitions.py +695 -0
- spatial_memory/verify.py +140 -0
- spatial_memory_mcp-1.9.1.dist-info/METADATA +509 -0
- spatial_memory_mcp-1.9.1.dist-info/RECORD +55 -0
- spatial_memory_mcp-1.9.1.dist-info/WHEEL +4 -0
- spatial_memory_mcp-1.9.1.dist-info/entry_points.txt +2 -0
- spatial_memory_mcp-1.9.1.dist-info/licenses/LICENSE +21 -0
spatial_memory/services/export_import.py (new file)
@@ -0,0 +1,1031 @@

"""Export/Import service for memory data portability.

This service provides the application layer for memory export/import operations:
- export_memories: Export memories to Parquet/JSON/CSV formats
- import_memories: Import memories with validation and deduplication

The service uses dependency injection for repository and embedding services,
following Clean Architecture principles. File I/O and format conversion are
handled at this service layer, while the repository handles only data access.

Security is enforced through PathValidator for all file operations.
"""

from __future__ import annotations

import csv
import json
import logging
import time
from collections.abc import Iterator, Sequence
from datetime import datetime
from io import TextIOWrapper
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO

import numpy as np

from spatial_memory.core.errors import (
    ExportError,
    FileSizeLimitError,
    ImportRecordLimitError,
    MemoryImportError,
    PathSecurityError,
    ValidationError,
)
from spatial_memory.core.file_security import PathValidator
from spatial_memory.core.models import (
    ExportImportConfig,
    ExportResult,
    ImportedMemory,
    ImportResult,
    ImportValidationError,
)

logger = logging.getLogger(__name__)

# Explicit exports for mypy
__all__ = [
    "ExportImportConfig",
    "ExportImportService",
]

if TYPE_CHECKING:
    import pyarrow as pa

    from spatial_memory.ports.repositories import (
        EmbeddingServiceProtocol,
        MemoryRepositoryProtocol,
    )


# =============================================================================
# Constants
# =============================================================================

SUPPORTED_FORMATS = frozenset({"parquet", "json", "csv"})

EXTENSION_TO_FORMAT: dict[str, str] = {
    ".parquet": "parquet",
    ".json": "json",
    ".jsonl": "json",
    ".csv": "csv",
}

# Required fields for import validation
REQUIRED_IMPORT_FIELDS = frozenset({"content"})

# Default import size limit (100 MB)
DEFAULT_MAX_IMPORT_SIZE_BYTES = 100 * 1024 * 1024


# =============================================================================
# Service Implementation
# =============================================================================

class ExportImportService:
    """Service for memory export and import operations.

    Uses Clean Architecture - depends on protocol interfaces, not implementations.
    Handles file I/O and format conversion at the service layer while delegating
    data access to the repository.

    Security Features:
    - Path validation to prevent traversal attacks
    - File size limits for imports
    - Symlink detection (optional)
    - Extension validation

    Example:
        service = ExportImportService(
            repository=repo,
            embeddings=emb,
            allowed_export_paths=[Path("./exports")],
            allowed_import_paths=[Path("./imports")],
        )

        # Export memories
        result = service.export_memories(
            output_path="./exports/backup.parquet",
            namespace="work",
        )

        # Import memories
        result = service.import_memories(
            source_path="./imports/restore.json",
            dry_run=False,
        )
    """

    def __init__(
        self,
        repository: MemoryRepositoryProtocol,
        embeddings: EmbeddingServiceProtocol,
        config: ExportImportConfig | None = None,
        allowed_export_paths: Sequence[str | Path] | None = None,
        allowed_import_paths: Sequence[str | Path] | None = None,
        allow_symlinks: bool = False,
        max_import_size_bytes: int | None = None,
    ) -> None:
        """Initialize the export/import service.

        Args:
            repository: Repository for memory storage.
            embeddings: Service for generating embeddings.
            config: Optional configuration (uses defaults if not provided).
            allowed_export_paths: Directories where exports are permitted.
            allowed_import_paths: Directories where imports are permitted.
            allow_symlinks: Whether to allow following symlinks (default False).
            max_import_size_bytes: Maximum import file size in bytes.
        """
        self._repo = repository
        self._embeddings = embeddings
        self._config = config or ExportImportConfig()

        # Set up path validator
        export_paths = allowed_export_paths or [Path("./exports"), Path("./backups")]
        import_paths = allowed_import_paths or [Path("./imports"), Path("./backups")]

        self._path_validator = PathValidator(
            allowed_export_paths=export_paths,
            allowed_import_paths=import_paths,
            allow_symlinks=allow_symlinks,
        )

        self._max_import_size_bytes = (
            max_import_size_bytes or DEFAULT_MAX_IMPORT_SIZE_BYTES
        )

    def export_memories(
        self,
        output_path: str,
        format: str | None = None,
        namespace: str | None = None,
        include_vectors: bool = True,
    ) -> ExportResult:
        """Export memories to a file.

        Streams data from repository and writes to the specified format.
        Supports Parquet (recommended for full fidelity), JSON, and CSV.

        Args:
            output_path: Path for output file. Extension determines format
                if format parameter is not specified.
            format: Export format (parquet, json, csv). Auto-detected from
                extension if not specified.
            namespace: Export only this namespace (all if not specified).
            include_vectors: Include embedding vectors in export (default True).
                Note: CSV exports may set this to False for readability.

        Returns:
            ExportResult with export statistics.

        Raises:
            ExportError: If export operation fails.
            PathSecurityError: If path validation fails.
            ValidationError: If input validation fails.
        """
        start_time = time.monotonic()

        # Validate and resolve path
        try:
            canonical_path = self._path_validator.validate_export_path(output_path)
        except (PathSecurityError, ValueError) as e:
            raise PathSecurityError(
                path=output_path,
                violation_type="export_path_validation_failed",
                message=str(e),
            ) from e

        # Detect or validate format
        detected_format = format or self._detect_format(output_path)
        if detected_format is None:
            detected_format = self._config.default_export_format

        if detected_format not in SUPPORTED_FORMATS:
            raise ValidationError(
                f"Unsupported export format: {detected_format}. "
                f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
            )

        try:
            # Check export record limit before starting
            if self._config.max_export_records > 0:
                memory_count = self._repo.count(namespace=namespace)
                if memory_count > self._config.max_export_records:
                    raise ExportError(
                        f"Export would contain {memory_count} records, "
                        f"exceeding limit of {self._config.max_export_records}. "
                        "Consider filtering by namespace or increasing max_export_records."
                    )

            # Ensure parent directory exists
            canonical_path.parent.mkdir(parents=True, exist_ok=True)

            # Stream data from repository
            batches = self._repo.get_all_for_export(
                namespace=namespace,
                batch_size=self._config.export_batch_size,
            )

            # Get namespaces for result
            if namespace:
                namespaces_included = [namespace]
            else:
                namespaces_included = self._repo.get_namespaces()

            # Export based on format
            if detected_format == "parquet":
                memories_exported = self._export_parquet(
                    canonical_path, batches, include_vectors
                )
            elif detected_format == "json":
                memories_exported = self._export_json(
                    canonical_path, batches, include_vectors
                )
            elif detected_format == "csv":
                memories_exported = self._export_csv(
                    canonical_path, batches, include_vectors
                )
            else:
                raise ExportError(f"Unsupported format: {detected_format}")

            # Calculate file size
            if canonical_path.exists():
                file_size_bytes = canonical_path.stat().st_size
            else:
                file_size_bytes = 0

            duration_seconds = time.monotonic() - start_time

            return ExportResult(
                format=detected_format,
                output_path=str(canonical_path),
                memories_exported=memories_exported,
                file_size_bytes=file_size_bytes,
                file_size_mb=file_size_bytes / (1024 * 1024),
                namespaces_included=namespaces_included,
                duration_seconds=duration_seconds,
                compression="zstd" if detected_format == "parquet" else None,
            )

        except (ExportError, PathSecurityError, ValidationError):
            raise
        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise ExportError(f"Export operation failed: {e}") from e

    def import_memories(
        self,
        source_path: str,
        format: str | None = None,
        namespace_override: str | None = None,
        deduplicate: bool = False,
        dedup_threshold: float = 0.95,
        validate: bool = True,
        regenerate_embeddings: bool = False,
        dry_run: bool = True,
    ) -> ImportResult:
        """Import memories from a file.

        Parses the file, validates records, optionally deduplicates against
        existing memories, and imports to the repository.

        Args:
            source_path: Path to source file.
            format: Import format (parquet, json, csv). Auto-detected from
                extension if not specified.
            namespace_override: Override namespace for all imported memories.
            deduplicate: Skip records similar to existing memories (default False).
            dedup_threshold: Similarity threshold for deduplication (0.7-0.99).
            validate: Validate records before import (default True).
            regenerate_embeddings: Generate new embeddings for imported memories.
                Required if source lacks vectors or dimensions don't match.
            dry_run: Validate without importing (default True). Set to False
                to actually import the memories.

        Returns:
            ImportResult with import statistics and validation errors.

        Raises:
            MemoryImportError: If import operation fails.
            PathSecurityError: If path validation fails.
            FileSizeLimitError: If file exceeds size limit.
            ValidationError: If input validation fails.
        """
        start_time = time.monotonic()

        # Detect or validate format BEFORE opening file
        detected_format = format or self._detect_format(source_path)
        if detected_format is None:
            raise ValidationError(
                f"Cannot detect format from path: {source_path}. "
                "Please specify format explicitly."
            )

        if detected_format not in SUPPORTED_FORMATS:
            raise ValidationError(
                f"Unsupported import format: {detected_format}. "
                f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
            )

        # Validate dedup threshold
        if deduplicate and not 0.7 <= dedup_threshold <= 0.99:
            raise ValidationError(
                "dedup_threshold must be between 0.7 and 0.99"
            )

        # ATOMIC: Validate and open file in one step (prevents TOCTOU)
        # The file handle MUST be used for reading, not re-opened by path
        try:
            canonical_path, file_handle = self._path_validator.validate_and_open_import_file(
                source_path,
                max_size_bytes=self._max_import_size_bytes,
            )
        except PathSecurityError as e:
            raise e
        except FileSizeLimitError as e:
            raise e
        except ValueError as e:
            raise PathSecurityError(
                path=source_path,
                violation_type="import_path_validation_failed",
                message=str(e),
            ) from e

        try:
            # Parse file using the ALREADY OPEN file handle (TOCTOU safe)
            if detected_format == "parquet":
                records_iter = self._parse_parquet_from_handle(file_handle, canonical_path)
            elif detected_format == "json":
                records_iter = self._parse_json_from_handle(file_handle)
            elif detected_format == "csv":
                records_iter = self._parse_csv_from_handle(file_handle)
            else:
                raise MemoryImportError(f"Unsupported format: {detected_format}")

            # Stream records with early termination to prevent memory exhaustion.
            # Check limit during iteration, not after loading all records.
            max_records = self._config.max_import_records
            records: list[dict[str, Any]] = []

            for record in records_iter:
                records.append(record)
                # Fail fast if limit exceeded - prevents memory exhaustion from large files
                if max_records > 0 and len(records) > max_records:
                    raise ImportRecordLimitError(
                        actual_count=len(records),
                        max_count=max_records,
                    )
        finally:
            # Ensure file is closed even if parsing fails
            file_handle.close()

        try:

            # Process records
            total_records = 0
            valid_records: list[dict[str, Any]] = []
            validation_errors: list[ImportValidationError] = []
            skipped_count = 0
            failed_count = 0
            imported_memories: list[ImportedMemory] = []

            for idx, record in enumerate(records):
                total_records += 1

                # Validate record if requested
                if validate:
                    expected_dims = (
                        self._embeddings.dimensions
                        if not regenerate_embeddings
                        else None
                    )
                    errors = self._validate_record(record, idx, expected_dims)
                    if errors:
                        validation_errors.extend(errors)
                        failed_count += 1
                        continue

                # Apply namespace override
                if namespace_override:
                    record["namespace"] = namespace_override

                # Handle embeddings
                if regenerate_embeddings or "vector" not in record:
                    if not dry_run:
                        vector = self._embeddings.embed(record["content"])
                        record["vector"] = vector.tolist()
                    else:
                        # In dry run, just mark that we would regenerate
                        record["_needs_embedding"] = True

                # Deduplicate if requested
                if deduplicate and not dry_run:
                    is_duplicate = self._check_duplicate(
                        record, dedup_threshold
                    )
                    if is_duplicate is True:
                        skipped_count += 1
                        continue
                    # If is_duplicate is None (check failed), proceed with import
                    # This is a conservative policy - import on failure

                valid_records.append(record)

            # Import if not dry run
            memories_imported = 0
            imported_ids: list[str] = []

            if not dry_run and valid_records:
                # Filter out internal fields
                import_records = [
                    {k: v for k, v in r.items() if not k.startswith("_")}
                    for r in valid_records
                ]

                memories_imported, imported_ids = self._repo.bulk_import(
                    iter(import_records),
                    batch_size=self._config.import_batch_size,
                    namespace_override=namespace_override,
                )

                # Build imported memories list
                for record, new_id in zip(valid_records, imported_ids):
                    content = record.get("content", "")
                    preview = content[:100] + "..." if len(content) > 100 else content
                    imported_memories.append(
                        ImportedMemory(
                            id=new_id,
                            content_preview=preview,
                            namespace=record.get("namespace", "default"),
                            was_deduplicated=False,
                            original_id=record.get("id"),
                        )
                    )
            elif dry_run:
                # In dry run, count valid records as "would be imported"
                memories_imported = len(valid_records)

            duration_seconds = time.monotonic() - start_time

            return ImportResult(
                source_path=str(canonical_path),
                format=detected_format,
                total_records_in_file=total_records,
                memories_imported=memories_imported,
                memories_skipped=skipped_count,
                memories_failed=failed_count,
                validation_errors=validation_errors,
                duration_seconds=duration_seconds,
                namespace_override=namespace_override,
                imported_memories=imported_memories if not dry_run else None,
            )

        except (MemoryImportError, PathSecurityError, ValidationError, FileSizeLimitError):
            raise
        except json.JSONDecodeError as e:
            raise MemoryImportError(f"Invalid JSON in import file: {e}") from e
        except Exception as e:
            logger.error(f"Import failed: {e}")
            raise MemoryImportError(f"Import operation failed: {e}") from e

    # =========================================================================
    # Format Detection
    # =========================================================================

    def _detect_format(self, path: str) -> str | None:
        """Detect format from file extension.

        Args:
            path: File path.

        Returns:
            Format string or None if unknown.
        """
        path_obj = Path(path)
        ext = path_obj.suffix.lower()
        return EXTENSION_TO_FORMAT.get(ext)

    # =========================================================================
    # Export Format Handlers
    # =========================================================================

    def _create_parquet_schema(self, include_vectors: bool) -> pa.Schema:
        """Create PyArrow schema for Parquet export.

        Args:
            include_vectors: Whether to include embedding vector field.

        Returns:
            PyArrow schema for memory records.
        """
        import pyarrow as pa

        fields = [
            ("id", pa.string()),
            ("content", pa.string()),
            ("namespace", pa.string()),
            ("importance", pa.float32()),
            ("tags", pa.list_(pa.string())),
            ("source", pa.string()),
            ("metadata", pa.string()),
            ("created_at", pa.timestamp("us", tz="UTC")),
            ("updated_at", pa.timestamp("us", tz="UTC")),
            ("last_accessed", pa.timestamp("us", tz="UTC")),
            ("access_count", pa.int32()),
        ]
        if include_vectors:
            fields.append(("vector", pa.list_(pa.float32())))
        return pa.schema(fields)

    def _export_parquet(
        self,
        path: Path,
        batches: Iterator[list[dict[str, Any]]],
        include_vectors: bool,
    ) -> int:
        """Export to Parquet format using streaming writes.

        Uses ParquetWriter to write batches incrementally, avoiding
        accumulation of all records in memory.

        Args:
            path: Output file path.
            batches: Iterator of record batches.
            include_vectors: Whether to include embedding vectors.

        Returns:
            Number of records exported.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as e:
            raise ExportError(
                "pyarrow is required for Parquet export. "
                "Install with: pip install pyarrow"
            ) from e

        schema = self._create_parquet_schema(include_vectors)
        total_records = 0
        writer: pq.ParquetWriter | None = None

        try:
            for batch in batches:
                if not batch:
                    continue

                # Process records for this batch
                processed_records: list[dict[str, Any]] = []
                for record in batch:
                    processed = self._prepare_record_for_export(record, include_vectors)
                    # Parquet needs metadata as string to avoid empty struct issues
                    if "metadata" in processed:
                        if isinstance(processed["metadata"], dict):
                            processed["metadata"] = json.dumps(processed["metadata"])
                    processed_records.append(processed)

                if not processed_records:
                    continue

                # Create table from this batch
                batch_table = pa.Table.from_pylist(processed_records, schema=schema)

                # Initialize writer on first batch with data
                if writer is None:
                    writer = pq.ParquetWriter(
                        path,
                        schema,
                        compression=self._config.parquet_compression,
                    )

                writer.write_table(batch_table)
                total_records += len(processed_records)

            # Handle empty export case - write an empty file with schema
            if writer is None:
                empty_table = pa.Table.from_pydict(
                    {f.name: [] for f in schema}, schema=schema
                )
                pq.write_table(
                    empty_table,
                    path,
                    compression=self._config.parquet_compression,
                )

        finally:
            if writer is not None:
                writer.close()

        return total_records

    def _export_json(
        self,
        path: Path,
        batches: Iterator[list[dict[str, Any]]],
        include_vectors: bool,
    ) -> int:
        """Export to JSON format using streaming to avoid memory exhaustion.

        Writes a valid JSON array by streaming records one at a time,
        without accumulating all records in memory.

        Args:
            path: Output file path.
            batches: Iterator of record batches.
            include_vectors: Whether to include embedding vectors.

        Returns:
            Number of records exported.
        """
        total_records = 0
        first_record = True

        with open(path, "w", encoding="utf-8") as f:
            f.write("[\n")

            for batch in batches:
                for record in batch:
                    processed = self._prepare_record_for_export(record, include_vectors)

                    # Add comma separator for all but first record
                    if not first_record:
                        f.write(",\n")
                    first_record = False

                    # Write the record
                    json_str = json.dumps(processed, default=self._json_serializer, indent=2)
                    # Indent each line for pretty formatting
                    indented = "\n".join(" " + line for line in json_str.split("\n"))
                    f.write(indented)

                    total_records += 1

            f.write("\n]")

        return total_records

    def _export_csv(
        self,
        path: Path,
        batches: Iterator[list[dict[str, Any]]],
        include_vectors: bool,
    ) -> int:
        """Export to CSV format using streaming to avoid memory exhaustion.

        Writes CSV rows as they are processed without accumulating
        all records in memory.

        Args:
            path: Output file path.
            batches: Iterator of record batches.
            include_vectors: Whether to include embedding vectors.

        Returns:
            Number of records exported.
        """
        # Define fieldnames upfront
        fieldnames = [
            "id", "content", "namespace", "importance", "tags",
            "source", "metadata", "created_at", "updated_at",
            "last_accessed", "access_count"
        ]
        if include_vectors:
            fieldnames.append("vector")

        total_records = 0

        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
            writer.writeheader()

            for batch in batches:
                for record in batch:
                    processed = self._prepare_record_for_export(record, include_vectors)
                    # Convert complex types to strings for CSV
                    processed["tags"] = json.dumps(processed.get("tags", []))
                    processed["metadata"] = json.dumps(processed.get("metadata", {}))
                    if include_vectors and "vector" in processed:
                        processed["vector"] = json.dumps(processed["vector"])
                    # Convert datetimes to ISO format
                    for key in ["created_at", "updated_at", "last_accessed"]:
                        if key in processed and processed[key] is not None:
                            if isinstance(processed[key], datetime):
                                processed[key] = processed[key].isoformat()

                    writer.writerow(processed)
                    total_records += 1

        return total_records

    def _prepare_record_for_export(
        self,
        record: dict[str, Any],
        include_vectors: bool,
    ) -> dict[str, Any]:
        """Prepare a record for export.

        Args:
            record: Raw record from repository.
            include_vectors: Whether to include embedding vectors.

        Returns:
            Processed record suitable for export.
        """
        processed = dict(record)

        # Handle vector
        if not include_vectors:
            processed.pop("vector", None)
        elif "vector" in processed:
            # Ensure vector is a list, not numpy array
            vec = processed["vector"]
            if isinstance(vec, np.ndarray):
                processed["vector"] = vec.tolist()

        # Handle metadata - ensure it's JSON serializable
        if "metadata" in processed:
            meta = processed["metadata"]
            if isinstance(meta, str):
                try:
                    processed["metadata"] = json.loads(meta)
                except json.JSONDecodeError:
                    processed["metadata"] = {}

        return processed

    def _json_serializer(self, obj: Any) -> Any:
        """Custom JSON serializer for complex types."""
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

    # =========================================================================
    # Import Format Handlers (TOCTOU-safe versions using file handles)
    # =========================================================================

    def _parse_parquet_from_handle(
        self, file_handle: BinaryIO, path: Path
    ) -> Iterator[dict[str, Any]]:
        """Parse Parquet from an already-open file handle (TOCTOU-safe).

        Args:
            file_handle: Open binary file handle.
            path: Original path (for error messages only).

        Yields:
            Memory records as dictionaries.
        """
        try:
            import pyarrow.parquet as pq
        except ImportError as e:
            raise MemoryImportError(
                "pyarrow is required for Parquet import. "
                "Install with: pip install pyarrow"
            ) from e

        try:
            # PyArrow can read from file-like objects
            table = pq.read_table(file_handle)
            records = table.to_pylist()

            for record in records:
                # Convert metadata from string if needed
                if "metadata" in record and isinstance(record["metadata"], str):
                    try:
                        record["metadata"] = json.loads(record["metadata"])
                    except json.JSONDecodeError:
                        record["metadata"] = {}
                yield record
        except Exception as e:
            raise MemoryImportError(f"Failed to parse Parquet file {path}: {e}") from e

    def _parse_json_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
        """Parse JSON from an already-open file handle (TOCTOU-safe).

        Args:
            file_handle: Open binary file handle.

        Yields:
            Memory records as dictionaries.
        """
        # Read and decode content
        content = file_handle.read().decode("utf-8").strip()

        # Handle both JSON array and JSON Lines formats
        if content.startswith("["):
            # JSON array
            records = json.loads(content)
            yield from records
        else:
            # JSON Lines (one object per line)
            for line in content.split("\n"):
                line = line.strip()
                if line:
                    yield json.loads(line)

    def _parse_csv_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
        """Parse CSV from an already-open file handle (TOCTOU-safe).

        Args:
            file_handle: Open binary file handle.

        Yields:
            Memory records as dictionaries.
        """
        # Wrap binary handle in text wrapper for CSV reader
        text_handle = TextIOWrapper(file_handle, encoding="utf-8", newline="")
        try:
            reader = csv.DictReader(text_handle)

            for row in reader:
                record: dict[str, Any] = dict(row)

                # Convert string fields to appropriate types
                if "importance" in record:
                    try:
                        record["importance"] = float(record["importance"])
                    except (ValueError, TypeError):
                        record["importance"] = 0.5

                if "access_count" in record:
                    try:
                        record["access_count"] = int(record["access_count"])
                    except (ValueError, TypeError):
                        record["access_count"] = 0

                # Parse JSON fields
                if "tags" in record and isinstance(record["tags"], str):
                    try:
                        record["tags"] = json.loads(record["tags"])
                    except json.JSONDecodeError:
                        record["tags"] = []

                if "metadata" in record and isinstance(record["metadata"], str):
                    try:
                        record["metadata"] = json.loads(record["metadata"])
                    except json.JSONDecodeError:
                        record["metadata"] = {}

                if "vector" in record and isinstance(record["vector"], str):
                    try:
                        record["vector"] = json.loads(record["vector"])
                    except json.JSONDecodeError:
                        # Remove invalid vector
                        del record["vector"]

                yield record
        finally:
            # Detach text wrapper to prevent it from closing the underlying handle
            text_handle.detach()

    # =========================================================================
    # Validation
    # =========================================================================

    def _validate_record(
        self,
        record: dict[str, Any],
        row_number: int,
        expected_dims: int | None = None,
    ) -> list[ImportValidationError]:
        """Validate a single import record.

        Args:
            record: Record to validate.
            row_number: Row number for error reporting.
            expected_dims: Expected vector dimensions (None to skip check).

        Returns:
            List of validation errors (empty if valid).
        """
        errors: list[ImportValidationError] = []

        # Check required fields
        for field in REQUIRED_IMPORT_FIELDS:
            if field not in record or record[field] is None:
                errors.append(
                    ImportValidationError(
                        row_number=row_number,
                        field=field,
                        error=f"Required field '{field}' is missing",
                        value=None,
                    )
                )
            elif field == "content" and not str(record[field]).strip():
                errors.append(
                    ImportValidationError(
                        row_number=row_number,
                        field=field,
                        error="Content cannot be empty",
                        value=str(record[field])[:50],
                    )
                )

        # Validate importance range
        if "importance" in record:
            importance = record["importance"]
            try:
                importance_float = float(importance)
                if not 0.0 <= importance_float <= 1.0:
                    errors.append(
                        ImportValidationError(
                            row_number=row_number,
                            field="importance",
                            error="Importance must be between 0.0 and 1.0",
                            value=str(importance),
                        )
                    )
            except (ValueError, TypeError):
                errors.append(
                    ImportValidationError(
                        row_number=row_number,
                        field="importance",
                        error="Importance must be a number",
                        value=str(importance)[:50],
                    )
                )

        # Validate vector dimensions
        if expected_dims is not None and "vector" in record:
            vector = record["vector"]
            if vector is not None:
                try:
                    if isinstance(vector, (list, np.ndarray)):
                        actual_dims = len(vector)
                        if actual_dims != expected_dims:
                            errors.append(
                                ImportValidationError(
                                    row_number=row_number,
                                    field="vector",
                                    error=f"Vector dimension mismatch: expected "
                                    f"{expected_dims}, got {actual_dims}",
                                    value=f"[{actual_dims} dimensions]",
                                )
                            )
                except (TypeError, AttributeError):
                    errors.append(
                        ImportValidationError(
                            row_number=row_number,
                            field="vector",
                            error="Vector must be an array of numbers",
                            value=str(type(vector)),
                        )
                    )

        return errors

    # =========================================================================
    # Deduplication
    # =========================================================================

    def _check_duplicate(
        self,
        record: dict[str, Any],
        threshold: float,
    ) -> bool | None:
        """Check if record is a duplicate of an existing memory.

        Args:
            record: Record to check.
            threshold: Similarity threshold for deduplication.

        Returns:
            True if record is a duplicate.
            False if no duplicate found.
            None if the check failed (let caller decide policy).
        """
        try:
            # Get vector for comparison
            if "vector" in record and record["vector"] is not None:
                vector = np.array(record["vector"], dtype=np.float32)
            else:
                # Generate embedding for comparison
                vector = self._embeddings.embed(record["content"])

            # Search for similar existing memories
            namespace = record.get("namespace")
            results = self._repo.search(vector, limit=5, namespace=namespace)

            # Check if any result exceeds threshold
            for result in results:
                if result.similarity >= threshold:
                    logger.debug(
                        f"Duplicate found: similarity {result.similarity:.3f} "
                        f">= threshold {threshold:.3f}"
                    )
                    return True

            return False

        except Exception as e:
            logger.warning(f"Duplicate check failed: {e}")
            return None
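
For orientation, a minimal usage sketch of the service above, based only on the public names in this file. The repo and emb objects are assumed to be concrete implementations of MemoryRepositoryProtocol and EmbeddingServiceProtocol obtained elsewhere in the package; the paths and namespace are illustrative, not part of the published API surface.

from pathlib import Path

from spatial_memory.services.export_import import ExportImportService

# Assumption: repo and emb already exist and satisfy the repository and
# embedding protocols this service depends on.
service = ExportImportService(
    repository=repo,
    embeddings=emb,
    allowed_export_paths=[Path("./exports")],
    allowed_import_paths=[Path("./imports")],
)

# Export one namespace to Parquet; the format is inferred from the extension.
export_result = service.export_memories(
    output_path="./exports/backup.parquet",
    namespace="work",
)
print(export_result.memories_exported, export_result.file_size_mb)

# Dry-run the import first (the default), then import for real if it is clean.
preview = service.import_memories(source_path="./imports/restore.json")
if not preview.validation_errors:
    result = service.import_memories(
        source_path="./imports/restore.json",
        dry_run=False,
        deduplicate=True,
        dedup_threshold=0.95,
    )

Note the dry_run=True default: a first call only parses and validates the file, so validation_errors can be inspected before any records are written.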