spatial-memory-mcp 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of spatial-memory-mcp might be problematic.

Files changed (54)
  1. spatial_memory/__init__.py +97 -0
  2. spatial_memory/__main__.py +270 -0
  3. spatial_memory/adapters/__init__.py +7 -0
  4. spatial_memory/adapters/lancedb_repository.py +878 -0
  5. spatial_memory/config.py +728 -0
  6. spatial_memory/core/__init__.py +118 -0
  7. spatial_memory/core/cache.py +317 -0
  8. spatial_memory/core/circuit_breaker.py +297 -0
  9. spatial_memory/core/connection_pool.py +220 -0
  10. spatial_memory/core/consolidation_strategies.py +402 -0
  11. spatial_memory/core/database.py +3069 -0
  12. spatial_memory/core/db_idempotency.py +242 -0
  13. spatial_memory/core/db_indexes.py +575 -0
  14. spatial_memory/core/db_migrations.py +584 -0
  15. spatial_memory/core/db_search.py +509 -0
  16. spatial_memory/core/db_versioning.py +177 -0
  17. spatial_memory/core/embeddings.py +557 -0
  18. spatial_memory/core/errors.py +317 -0
  19. spatial_memory/core/file_security.py +702 -0
  20. spatial_memory/core/filesystem.py +178 -0
  21. spatial_memory/core/health.py +289 -0
  22. spatial_memory/core/helpers.py +79 -0
  23. spatial_memory/core/import_security.py +432 -0
  24. spatial_memory/core/lifecycle_ops.py +1067 -0
  25. spatial_memory/core/logging.py +194 -0
  26. spatial_memory/core/metrics.py +192 -0
  27. spatial_memory/core/models.py +628 -0
  28. spatial_memory/core/rate_limiter.py +326 -0
  29. spatial_memory/core/response_types.py +497 -0
  30. spatial_memory/core/security.py +588 -0
  31. spatial_memory/core/spatial_ops.py +426 -0
  32. spatial_memory/core/tracing.py +300 -0
  33. spatial_memory/core/utils.py +110 -0
  34. spatial_memory/core/validation.py +403 -0
  35. spatial_memory/factory.py +407 -0
  36. spatial_memory/migrations/__init__.py +40 -0
  37. spatial_memory/ports/__init__.py +11 -0
  38. spatial_memory/ports/repositories.py +631 -0
  39. spatial_memory/py.typed +0 -0
  40. spatial_memory/server.py +1141 -0
  41. spatial_memory/services/__init__.py +70 -0
  42. spatial_memory/services/export_import.py +1023 -0
  43. spatial_memory/services/lifecycle.py +1120 -0
  44. spatial_memory/services/memory.py +412 -0
  45. spatial_memory/services/spatial.py +1147 -0
  46. spatial_memory/services/utility.py +409 -0
  47. spatial_memory/tools/__init__.py +5 -0
  48. spatial_memory/tools/definitions.py +695 -0
  49. spatial_memory/verify.py +140 -0
  50. spatial_memory_mcp-1.6.1.dist-info/METADATA +499 -0
  51. spatial_memory_mcp-1.6.1.dist-info/RECORD +54 -0
  52. spatial_memory_mcp-1.6.1.dist-info/WHEEL +4 -0
  53. spatial_memory_mcp-1.6.1.dist-info/entry_points.txt +2 -0
  54. spatial_memory_mcp-1.6.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1023 @@
+ """Export/Import service for memory data portability.
+
+ This service provides the application layer for memory export/import operations:
+ - export_memories: Export memories to Parquet/JSON/CSV formats
+ - import_memories: Import memories with validation and deduplication
+
+ The service uses dependency injection for repository and embedding services,
+ following Clean Architecture principles. File I/O and format conversion are
+ handled at this service layer, while the repository handles only data access.
+
+ Security is enforced through PathValidator for all file operations.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ import json
+ import logging
+ import time
+ from collections.abc import Sequence
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from io import TextIOWrapper
+ from typing import TYPE_CHECKING, Any, BinaryIO, Iterator
+
+ import numpy as np
+
+ from spatial_memory.core.errors import (
+     ExportError,
+     FileSizeLimitError,
+     ImportRecordLimitError,
+     MemoryImportError,
+     PathSecurityError,
+     ValidationError,
+ )
+ from spatial_memory.core.file_security import PathValidator
+ from spatial_memory.core.models import (
+     ExportImportConfig,
+     ExportResult,
+     ImportedMemory,
+     ImportResult,
+     ImportValidationError,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ if TYPE_CHECKING:
+     from spatial_memory.ports.repositories import (
+         EmbeddingServiceProtocol,
+         MemoryRepositoryProtocol,
+     )
+
+
+ # =============================================================================
+ # Constants
+ # =============================================================================
+
+ SUPPORTED_FORMATS = frozenset({"parquet", "json", "csv"})
+
+ EXTENSION_TO_FORMAT: dict[str, str] = {
+     ".parquet": "parquet",
+     ".json": "json",
+     ".jsonl": "json",
+     ".csv": "csv",
+ }
+
+ # Required fields for import validation
+ REQUIRED_IMPORT_FIELDS = frozenset({"content"})
+
+ # Default import size limit (100 MB)
+ DEFAULT_MAX_IMPORT_SIZE_BYTES = 100 * 1024 * 1024
+
+
+ # =============================================================================
+ # Service Implementation
+ # =============================================================================
+
+
+ class ExportImportService:
+     """Service for memory export and import operations.
+
+     Uses Clean Architecture - depends on protocol interfaces, not implementations.
+     Handles file I/O and format conversion at the service layer while delegating
+     data access to the repository.
+
+     Security Features:
+         - Path validation to prevent traversal attacks
+         - File size limits for imports
+         - Symlink detection (optional)
+         - Extension validation
+
+     Example:
+         service = ExportImportService(
+             repository=repo,
+             embeddings=emb,
+             allowed_export_paths=[Path("./exports")],
+             allowed_import_paths=[Path("./imports")],
+         )
+
+         # Export memories
+         result = service.export_memories(
+             output_path="./exports/backup.parquet",
+             namespace="work",
+         )
+
+         # Import memories
+         result = service.import_memories(
+             source_path="./imports/restore.json",
+             dry_run=False,
+         )
+     """
+
+     def __init__(
+         self,
+         repository: MemoryRepositoryProtocol,
+         embeddings: EmbeddingServiceProtocol,
+         config: ExportImportConfig | None = None,
+         allowed_export_paths: Sequence[str | Path] | None = None,
+         allowed_import_paths: Sequence[str | Path] | None = None,
+         allow_symlinks: bool = False,
+         max_import_size_bytes: int | None = None,
+     ) -> None:
+         """Initialize the export/import service.
+
+         Args:
+             repository: Repository for memory storage.
+             embeddings: Service for generating embeddings.
+             config: Optional configuration (uses defaults if not provided).
+             allowed_export_paths: Directories where exports are permitted.
+             allowed_import_paths: Directories where imports are permitted.
+             allow_symlinks: Whether to allow following symlinks (default False).
+             max_import_size_bytes: Maximum import file size in bytes.
+         """
+         self._repo = repository
+         self._embeddings = embeddings
+         self._config = config or ExportImportConfig()
+
+         # Set up path validator
+         export_paths = allowed_export_paths or [Path("./exports"), Path("./backups")]
+         import_paths = allowed_import_paths or [Path("./imports"), Path("./backups")]
+
+         self._path_validator = PathValidator(
+             allowed_export_paths=export_paths,
+             allowed_import_paths=import_paths,
+             allow_symlinks=allow_symlinks,
+         )
+
+         self._max_import_size_bytes = (
+             max_import_size_bytes or DEFAULT_MAX_IMPORT_SIZE_BYTES
+         )
+
+     def export_memories(
+         self,
+         output_path: str,
+         format: str | None = None,
+         namespace: str | None = None,
+         include_vectors: bool = True,
+     ) -> ExportResult:
+         """Export memories to a file.
+
+         Streams data from repository and writes to the specified format.
+         Supports Parquet (recommended for full fidelity), JSON, and CSV.
+
+         Args:
+             output_path: Path for output file. Extension determines format
+                 if format parameter is not specified.
+             format: Export format (parquet, json, csv). Auto-detected from
+                 extension if not specified.
+             namespace: Export only this namespace (all if not specified).
+             include_vectors: Include embedding vectors in export (default True).
+                 Note: CSV exports may set this to False for readability.
+
+         Returns:
+             ExportResult with export statistics.
+
+         Raises:
+             ExportError: If export operation fails.
+             PathSecurityError: If path validation fails.
+             ValidationError: If input validation fails.
+         """
+         start_time = time.monotonic()
+
+         # Validate and resolve path
+         try:
+             canonical_path = self._path_validator.validate_export_path(output_path)
+         except (PathSecurityError, ValueError) as e:
+             raise PathSecurityError(
+                 path=output_path,
+                 violation_type="export_path_validation_failed",
+                 message=str(e),
+             ) from e
+
+         # Detect or validate format
+         detected_format = format or self._detect_format(output_path)
+         if detected_format is None:
+             detected_format = self._config.default_export_format
+
+         if detected_format not in SUPPORTED_FORMATS:
+             raise ValidationError(
+                 f"Unsupported export format: {detected_format}. "
+                 f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
+             )
+
+         try:
+             # Check export record limit before starting
+             if self._config.max_export_records > 0:
+                 memory_count = self._repo.count(namespace=namespace)
+                 if memory_count > self._config.max_export_records:
+                     raise ExportError(
+                         f"Export would contain {memory_count} records, "
+                         f"exceeding limit of {self._config.max_export_records}. "
+                         "Consider filtering by namespace or increasing max_export_records."
+                     )
+
+             # Ensure parent directory exists
+             canonical_path.parent.mkdir(parents=True, exist_ok=True)
+
+             # Stream data from repository
+             batches = self._repo.get_all_for_export(
+                 namespace=namespace,
+                 batch_size=self._config.export_batch_size,
+             )
+
+             # Get namespaces for result
+             if namespace:
+                 namespaces_included = [namespace]
+             else:
+                 namespaces_included = self._repo.get_namespaces()
+
+             # Export based on format
+             if detected_format == "parquet":
+                 memories_exported = self._export_parquet(
+                     canonical_path, batches, include_vectors
+                 )
+             elif detected_format == "json":
+                 memories_exported = self._export_json(
+                     canonical_path, batches, include_vectors
+                 )
+             elif detected_format == "csv":
+                 memories_exported = self._export_csv(
+                     canonical_path, batches, include_vectors
+                 )
+             else:
+                 raise ExportError(f"Unsupported format: {detected_format}")
+
+             # Calculate file size
+             if canonical_path.exists():
+                 file_size_bytes = canonical_path.stat().st_size
+             else:
+                 file_size_bytes = 0
+
+             duration_seconds = time.monotonic() - start_time
+
+             return ExportResult(
+                 format=detected_format,
+                 output_path=str(canonical_path),
+                 memories_exported=memories_exported,
+                 file_size_bytes=file_size_bytes,
+                 file_size_mb=file_size_bytes / (1024 * 1024),
+                 namespaces_included=namespaces_included,
+                 duration_seconds=duration_seconds,
+                 compression="zstd" if detected_format == "parquet" else None,
+             )
+
+         except (ExportError, PathSecurityError, ValidationError):
+             raise
+         except Exception as e:
+             logger.error(f"Export failed: {e}")
+             raise ExportError(f"Export operation failed: {e}") from e
+
+     def import_memories(
+         self,
+         source_path: str,
+         format: str | None = None,
+         namespace_override: str | None = None,
+         deduplicate: bool = False,
+         dedup_threshold: float = 0.95,
+         validate: bool = True,
+         regenerate_embeddings: bool = False,
+         dry_run: bool = True,
+     ) -> ImportResult:
+         """Import memories from a file.
+
+         Parses the file, validates records, optionally deduplicates against
+         existing memories, and imports to the repository.
+
+         Args:
+             source_path: Path to source file.
+             format: Import format (parquet, json, csv). Auto-detected from
+                 extension if not specified.
+             namespace_override: Override namespace for all imported memories.
+             deduplicate: Skip records similar to existing memories (default False).
+             dedup_threshold: Similarity threshold for deduplication (0.7-0.99).
+             validate: Validate records before import (default True).
+             regenerate_embeddings: Generate new embeddings for imported memories.
+                 Required if source lacks vectors or dimensions don't match.
+             dry_run: Validate without importing (default True). Set to False
+                 to actually import the memories.
+
+         Returns:
+             ImportResult with import statistics and validation errors.
+
+         Raises:
+             MemoryImportError: If import operation fails.
+             PathSecurityError: If path validation fails.
+             FileSizeLimitError: If file exceeds size limit.
+             ValidationError: If input validation fails.
+         """
+         start_time = time.monotonic()
+
+         # Detect or validate format BEFORE opening file
+         detected_format = format or self._detect_format(source_path)
+         if detected_format is None:
+             raise ValidationError(
+                 f"Cannot detect format from path: {source_path}. "
+                 "Please specify format explicitly."
+             )
+
+         if detected_format not in SUPPORTED_FORMATS:
+             raise ValidationError(
+                 f"Unsupported import format: {detected_format}. "
+                 f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
+             )
+
+         # Validate dedup threshold
+         if deduplicate and not 0.7 <= dedup_threshold <= 0.99:
+             raise ValidationError(
+                 "dedup_threshold must be between 0.7 and 0.99"
+             )
+
+         # ATOMIC: Validate and open file in one step (prevents TOCTOU)
+         # The file handle MUST be used for reading, not re-opened by path
+         try:
+             canonical_path, file_handle = self._path_validator.validate_and_open_import_file(
+                 source_path,
+                 max_size_bytes=self._max_import_size_bytes,
+             )
+         except PathSecurityError as e:
+             raise e
+         except FileSizeLimitError as e:
+             raise e
+         except ValueError as e:
+             raise PathSecurityError(
+                 path=source_path,
+                 violation_type="import_path_validation_failed",
+                 message=str(e),
+             ) from e
+
+         try:
+             # Parse file using the ALREADY OPEN file handle (TOCTOU safe)
+             if detected_format == "parquet":
+                 records_iter = self._parse_parquet_from_handle(file_handle, canonical_path)
+             elif detected_format == "json":
+                 records_iter = self._parse_json_from_handle(file_handle)
+             elif detected_format == "csv":
+                 records_iter = self._parse_csv_from_handle(file_handle)
+             else:
+                 raise MemoryImportError(f"Unsupported format: {detected_format}")
+
+             # Stream records with early termination to prevent memory exhaustion.
+             # Check limit during iteration, not after loading all records.
+             max_records = self._config.max_import_records
+             records: list[dict[str, Any]] = []
+
+             for record in records_iter:
+                 records.append(record)
+                 # Fail fast if limit exceeded - prevents memory exhaustion from large files
+                 if max_records > 0 and len(records) > max_records:
+                     raise ImportRecordLimitError(
+                         actual_count=len(records),
+                         max_count=max_records,
+                     )
+         finally:
+             # Ensure file is closed even if parsing fails
+             file_handle.close()
+
+         try:
+
+             # Process records
+             total_records = 0
+             valid_records: list[dict[str, Any]] = []
+             validation_errors: list[ImportValidationError] = []
+             skipped_count = 0
+             failed_count = 0
+             imported_memories: list[ImportedMemory] = []
+
+             for idx, record in enumerate(records):
+                 total_records += 1
+
+                 # Validate record if requested
+                 if validate:
+                     expected_dims = (
+                         self._embeddings.dimensions
+                         if not regenerate_embeddings
+                         else None
+                     )
+                     errors = self._validate_record(record, idx, expected_dims)
+                     if errors:
+                         validation_errors.extend(errors)
+                         failed_count += 1
+                         continue
+
+                 # Apply namespace override
+                 if namespace_override:
+                     record["namespace"] = namespace_override
+
+                 # Handle embeddings
+                 if regenerate_embeddings or "vector" not in record:
+                     if not dry_run:
+                         vector = self._embeddings.embed(record["content"])
+                         record["vector"] = vector.tolist()
+                     else:
+                         # In dry run, just mark that we would regenerate
+                         record["_needs_embedding"] = True
+
+                 # Deduplicate if requested
+                 if deduplicate and not dry_run:
+                     is_duplicate = self._check_duplicate(
+                         record, dedup_threshold
+                     )
+                     if is_duplicate is True:
+                         skipped_count += 1
+                         continue
+                     # If is_duplicate is None (check failed), proceed with import
+                     # This is a conservative policy - import on failure
+
+                 valid_records.append(record)
+
+             # Import if not dry run
+             memories_imported = 0
+             imported_ids: list[str] = []
+
+             if not dry_run and valid_records:
+                 # Filter out internal fields
+                 import_records = [
+                     {k: v for k, v in r.items() if not k.startswith("_")}
+                     for r in valid_records
+                 ]
+
+                 memories_imported, imported_ids = self._repo.bulk_import(
+                     iter(import_records),
+                     batch_size=self._config.import_batch_size,
+                     namespace_override=namespace_override,
+                 )
+
+                 # Build imported memories list
+                 for record, new_id in zip(valid_records, imported_ids):
+                     content = record.get("content", "")
+                     preview = content[:100] + "..." if len(content) > 100 else content
+                     imported_memories.append(
+                         ImportedMemory(
+                             id=new_id,
+                             content_preview=preview,
+                             namespace=record.get("namespace", "default"),
+                             was_deduplicated=False,
+                             original_id=record.get("id"),
+                         )
+                     )
+             elif dry_run:
+                 # In dry run, count valid records as "would be imported"
+                 memories_imported = len(valid_records)
+
+             duration_seconds = time.monotonic() - start_time
+
+             return ImportResult(
+                 source_path=str(canonical_path),
+                 format=detected_format,
+                 total_records_in_file=total_records,
+                 memories_imported=memories_imported,
+                 memories_skipped=skipped_count,
+                 memories_failed=failed_count,
+                 validation_errors=validation_errors,
+                 duration_seconds=duration_seconds,
+                 namespace_override=namespace_override,
+                 imported_memories=imported_memories if not dry_run else None,
+             )
+
+         except (MemoryImportError, PathSecurityError, ValidationError, FileSizeLimitError):
+             raise
+         except json.JSONDecodeError as e:
+             raise MemoryImportError(f"Invalid JSON in import file: {e}") from e
+         except Exception as e:
+             logger.error(f"Import failed: {e}")
+             raise MemoryImportError(f"Import operation failed: {e}") from e
+
+     # =========================================================================
+     # Format Detection
+     # =========================================================================
+
+     def _detect_format(self, path: str) -> str | None:
+         """Detect format from file extension.
+
+         Args:
+             path: File path.
+
+         Returns:
+             Format string or None if unknown.
+         """
+         path_obj = Path(path)
+         ext = path_obj.suffix.lower()
+         return EXTENSION_TO_FORMAT.get(ext)
+
+     # =========================================================================
+     # Export Format Handlers
+     # =========================================================================
+
+     def _create_parquet_schema(self, include_vectors: bool) -> "pa.Schema":
+         """Create PyArrow schema for Parquet export.
+
+         Args:
+             include_vectors: Whether to include embedding vector field.
+
+         Returns:
+             PyArrow schema for memory records.
+         """
+         import pyarrow as pa
+
+         fields = [
+             ("id", pa.string()),
+             ("content", pa.string()),
+             ("namespace", pa.string()),
+             ("importance", pa.float32()),
+             ("tags", pa.list_(pa.string())),
+             ("source", pa.string()),
+             ("metadata", pa.string()),
+             ("created_at", pa.timestamp("us", tz="UTC")),
+             ("updated_at", pa.timestamp("us", tz="UTC")),
+             ("last_accessed", pa.timestamp("us", tz="UTC")),
+             ("access_count", pa.int32()),
+         ]
+         if include_vectors:
+             fields.append(("vector", pa.list_(pa.float32())))
+         return pa.schema(fields)
+
+     def _export_parquet(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to Parquet format using streaming writes.
+
+         Uses ParquetWriter to write batches incrementally, avoiding
+         accumulation of all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         try:
+             import pyarrow as pa
+             import pyarrow.parquet as pq
+         except ImportError as e:
+             raise ExportError(
+                 "pyarrow is required for Parquet export. "
+                 "Install with: pip install pyarrow"
+             ) from e
+
+         schema = self._create_parquet_schema(include_vectors)
+         total_records = 0
+         writer: pq.ParquetWriter | None = None
+
+         try:
+             for batch in batches:
+                 if not batch:
+                     continue
+
+                 # Process records for this batch
+                 processed_records: list[dict[str, Any]] = []
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+                     # Parquet needs metadata as string to avoid empty struct issues
+                     if "metadata" in processed:
+                         if isinstance(processed["metadata"], dict):
+                             processed["metadata"] = json.dumps(processed["metadata"])
+                     processed_records.append(processed)
+
+                 if not processed_records:
+                     continue
+
+                 # Create table from this batch
+                 batch_table = pa.Table.from_pylist(processed_records, schema=schema)
+
+                 # Initialize writer on first batch with data
+                 if writer is None:
+                     writer = pq.ParquetWriter(
+                         path,
+                         schema,
+                         compression=self._config.parquet_compression,
+                     )
+
+                 writer.write_table(batch_table)
+                 total_records += len(processed_records)
+
+             # Handle empty export case - write an empty file with schema
+             if writer is None:
+                 empty_table = pa.Table.from_pydict(
+                     {f.name: [] for f in schema}, schema=schema
+                 )
+                 pq.write_table(
+                     empty_table,
+                     path,
+                     compression=self._config.parquet_compression,
+                 )
+
+         finally:
+             if writer is not None:
+                 writer.close()
+
+         return total_records
+
+     def _export_json(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to JSON format using streaming to avoid memory exhaustion.
+
+         Writes a valid JSON array by streaming records one at a time,
+         without accumulating all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         total_records = 0
+         first_record = True
+
+         with open(path, "w", encoding="utf-8") as f:
+             f.write("[\n")
+
+             for batch in batches:
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+
+                     # Add comma separator for all but first record
+                     if not first_record:
+                         f.write(",\n")
+                     first_record = False
+
+                     # Write the record
+                     json_str = json.dumps(processed, default=self._json_serializer, indent=2)
+                     # Indent each line for pretty formatting
+                     indented = "\n".join(" " + line for line in json_str.split("\n"))
+                     f.write(indented)
+
+                     total_records += 1
+
+             f.write("\n]")
+
+         return total_records
+
+     def _export_csv(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to CSV format using streaming to avoid memory exhaustion.
+
+         Writes CSV rows as they are processed without accumulating
+         all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         # Define fieldnames upfront
+         fieldnames = [
+             "id", "content", "namespace", "importance", "tags",
+             "source", "metadata", "created_at", "updated_at",
+             "last_accessed", "access_count"
+         ]
+         if include_vectors:
+             fieldnames.append("vector")
+
+         total_records = 0
+
+         with open(path, "w", newline="", encoding="utf-8") as f:
+             writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+             writer.writeheader()
+
+             for batch in batches:
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+                     # Convert complex types to strings for CSV
+                     processed["tags"] = json.dumps(processed.get("tags", []))
+                     processed["metadata"] = json.dumps(processed.get("metadata", {}))
+                     if include_vectors and "vector" in processed:
+                         processed["vector"] = json.dumps(processed["vector"])
+                     # Convert datetimes to ISO format
+                     for key in ["created_at", "updated_at", "last_accessed"]:
+                         if key in processed and processed[key] is not None:
+                             if isinstance(processed[key], datetime):
+                                 processed[key] = processed[key].isoformat()
+
+                     writer.writerow(processed)
+                     total_records += 1
+
+         return total_records
+
+     def _prepare_record_for_export(
+         self,
+         record: dict[str, Any],
+         include_vectors: bool,
+     ) -> dict[str, Any]:
+         """Prepare a record for export.
+
+         Args:
+             record: Raw record from repository.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Processed record suitable for export.
+         """
+         processed = dict(record)
+
+         # Handle vector
+         if not include_vectors:
+             processed.pop("vector", None)
+         elif "vector" in processed:
+             # Ensure vector is a list, not numpy array
+             vec = processed["vector"]
+             if isinstance(vec, np.ndarray):
+                 processed["vector"] = vec.tolist()
+
+         # Handle metadata - ensure it's JSON serializable
+         if "metadata" in processed:
+             meta = processed["metadata"]
+             if isinstance(meta, str):
+                 try:
+                     processed["metadata"] = json.loads(meta)
+                 except json.JSONDecodeError:
+                     processed["metadata"] = {}
+
+         return processed
+
+     def _json_serializer(self, obj: Any) -> Any:
+         """Custom JSON serializer for complex types."""
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         if isinstance(obj, np.ndarray):
+             return obj.tolist()
+         if isinstance(obj, np.floating):
+             return float(obj)
+         if isinstance(obj, np.integer):
+             return int(obj)
+         raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
+     # =========================================================================
+     # Import Format Handlers (TOCTOU-safe versions using file handles)
+     # =========================================================================
+
+     def _parse_parquet_from_handle(
+         self, file_handle: BinaryIO, path: Path
+     ) -> Iterator[dict[str, Any]]:
+         """Parse Parquet from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+             path: Original path (for error messages only).
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         try:
+             import pyarrow.parquet as pq
+         except ImportError as e:
+             raise MemoryImportError(
+                 "pyarrow is required for Parquet import. "
+                 "Install with: pip install pyarrow"
+             ) from e
+
+         try:
+             # PyArrow can read from file-like objects
+             table = pq.read_table(file_handle)
+             records = table.to_pylist()
+
+             for record in records:
+                 # Convert metadata from string if needed
+                 if "metadata" in record and isinstance(record["metadata"], str):
+                     try:
+                         record["metadata"] = json.loads(record["metadata"])
+                     except json.JSONDecodeError:
+                         record["metadata"] = {}
+                 yield record
+         except Exception as e:
+             raise MemoryImportError(f"Failed to parse Parquet file {path}: {e}") from e
+
+     def _parse_json_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
+         """Parse JSON from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         # Read and decode content
+         content = file_handle.read().decode("utf-8").strip()
+
+         # Handle both JSON array and JSON Lines formats
+         if content.startswith("["):
+             # JSON array
+             records = json.loads(content)
+             for record in records:
+                 yield record
+         else:
+             # JSON Lines (one object per line)
+             for line in content.split("\n"):
+                 line = line.strip()
+                 if line:
+                     yield json.loads(line)
+
+     def _parse_csv_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
+         """Parse CSV from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         # Wrap binary handle in text wrapper for CSV reader
+         text_handle = TextIOWrapper(file_handle, encoding="utf-8", newline="")
+         try:
+             reader = csv.DictReader(text_handle)
+
+             for row in reader:
+                 record: dict[str, Any] = dict(row)
+
+                 # Convert string fields to appropriate types
+                 if "importance" in record:
+                     try:
+                         record["importance"] = float(record["importance"])
+                     except (ValueError, TypeError):
+                         record["importance"] = 0.5
+
+                 if "access_count" in record:
+                     try:
+                         record["access_count"] = int(record["access_count"])
+                     except (ValueError, TypeError):
+                         record["access_count"] = 0
+
+                 # Parse JSON fields
+                 if "tags" in record and isinstance(record["tags"], str):
+                     try:
+                         record["tags"] = json.loads(record["tags"])
+                     except json.JSONDecodeError:
+                         record["tags"] = []
+
+                 if "metadata" in record and isinstance(record["metadata"], str):
+                     try:
+                         record["metadata"] = json.loads(record["metadata"])
+                     except json.JSONDecodeError:
+                         record["metadata"] = {}
+
+                 if "vector" in record and isinstance(record["vector"], str):
+                     try:
+                         record["vector"] = json.loads(record["vector"])
+                     except json.JSONDecodeError:
+                         # Remove invalid vector
+                         del record["vector"]
+
+                 yield record
+         finally:
+             # Detach text wrapper to prevent it from closing the underlying handle
+             text_handle.detach()
+
+     # =========================================================================
+     # Validation
+     # =========================================================================
+
+     def _validate_record(
+         self,
+         record: dict[str, Any],
+         row_number: int,
+         expected_dims: int | None = None,
+     ) -> list[ImportValidationError]:
+         """Validate a single import record.
+
+         Args:
+             record: Record to validate.
+             row_number: Row number for error reporting.
+             expected_dims: Expected vector dimensions (None to skip check).
+
+         Returns:
+             List of validation errors (empty if valid).
+         """
+         errors: list[ImportValidationError] = []
+
+         # Check required fields
+         for field in REQUIRED_IMPORT_FIELDS:
+             if field not in record or record[field] is None:
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field=field,
+                         error=f"Required field '{field}' is missing",
+                         value=None,
+                     )
+                 )
+             elif field == "content" and not str(record[field]).strip():
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field=field,
+                         error="Content cannot be empty",
+                         value=str(record[field])[:50],
+                     )
+                 )
+
+         # Validate importance range
+         if "importance" in record:
+             importance = record["importance"]
+             try:
+                 importance_float = float(importance)
+                 if not 0.0 <= importance_float <= 1.0:
+                     errors.append(
+                         ImportValidationError(
+                             row_number=row_number,
+                             field="importance",
+                             error="Importance must be between 0.0 and 1.0",
+                             value=str(importance),
+                         )
+                     )
+             except (ValueError, TypeError):
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field="importance",
+                         error="Importance must be a number",
+                         value=str(importance)[:50],
+                     )
+                 )
+
+         # Validate vector dimensions
+         if expected_dims is not None and "vector" in record:
+             vector = record["vector"]
+             if vector is not None:
+                 try:
+                     if isinstance(vector, (list, np.ndarray)):
+                         actual_dims = len(vector)
+                         if actual_dims != expected_dims:
+                             errors.append(
+                                 ImportValidationError(
+                                     row_number=row_number,
+                                     field="vector",
+                                     error=f"Vector dimension mismatch: expected {expected_dims}, got {actual_dims}",
+                                     value=f"[{actual_dims} dimensions]",
+                                 )
+                             )
+                 except (TypeError, AttributeError):
+                     errors.append(
+                         ImportValidationError(
+                             row_number=row_number,
+                             field="vector",
+                             error="Vector must be an array of numbers",
+                             value=str(type(vector)),
+                         )
+                     )
+
+         return errors
+
+     # =========================================================================
+     # Deduplication
+     # =========================================================================
+
+     def _check_duplicate(
+         self,
+         record: dict[str, Any],
+         threshold: float,
+     ) -> bool | None:
+         """Check if record is a duplicate of an existing memory.
+
+         Args:
+             record: Record to check.
+             threshold: Similarity threshold for deduplication.
+
+         Returns:
+             True if record is a duplicate.
+             False if no duplicate found.
+             None if the check failed (let caller decide policy).
+         """
+         try:
+             # Get vector for comparison
+             if "vector" in record and record["vector"] is not None:
+                 vector = np.array(record["vector"], dtype=np.float32)
+             else:
+                 # Generate embedding for comparison
+                 vector = self._embeddings.embed(record["content"])
+
+             # Search for similar existing memories
+             namespace = record.get("namespace")
+             results = self._repo.search(vector, limit=5, namespace=namespace)
+
+             # Check if any result exceeds threshold
+             for result in results:
+                 if result.similarity >= threshold:
+                     logger.debug(
+                         f"Duplicate found: similarity {result.similarity:.3f} "
+                         f">= threshold {threshold:.3f}"
+                     )
+                     return True
+
+             return False
+
+         except Exception as e:
+             logger.warning(f"Duplicate check failed: {e}")
+             return None
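
For orientation, the sketch below drives the public API the way the class docstring above describes it (export to Parquet, then a dry-run import). It is illustrative only and not part of the wheel: the repo and emb objects are hypothetical stand-ins for concrete implementations of MemoryRepositoryProtocol and EmbeddingServiceProtocol, which this module only references for type checking.

from pathlib import Path

from spatial_memory.services.export_import import ExportImportService

# Hypothetical placeholders: supply real repository and embedding
# implementations that satisfy the protocols in spatial_memory.ports.repositories.
repo = ...
emb = ...

service = ExportImportService(
    repository=repo,
    embeddings=emb,
    allowed_export_paths=[Path("./exports")],
    allowed_import_paths=[Path("./imports")],
)

# Export one namespace; the format is auto-detected from the .parquet extension.
export_result = service.export_memories(
    output_path="./exports/backup.parquet",
    namespace="work",
)
print(export_result.memories_exported, export_result.file_size_mb)

# Dry-run import first (the default); rerun with dry_run=False to actually write.
import_result = service.import_memories(
    source_path="./imports/restore.json",
    namespace_override="work",
    dry_run=True,
)
print(import_result.memories_imported, len(import_result.validation_errors))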