spatial_memory_mcp-1.9.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. spatial_memory/__init__.py +97 -0
  2. spatial_memory/__main__.py +271 -0
  3. spatial_memory/adapters/__init__.py +7 -0
  4. spatial_memory/adapters/lancedb_repository.py +880 -0
  5. spatial_memory/config.py +769 -0
  6. spatial_memory/core/__init__.py +118 -0
  7. spatial_memory/core/cache.py +317 -0
  8. spatial_memory/core/circuit_breaker.py +297 -0
  9. spatial_memory/core/connection_pool.py +220 -0
  10. spatial_memory/core/consolidation_strategies.py +401 -0
  11. spatial_memory/core/database.py +3072 -0
  12. spatial_memory/core/db_idempotency.py +242 -0
  13. spatial_memory/core/db_indexes.py +576 -0
  14. spatial_memory/core/db_migrations.py +588 -0
  15. spatial_memory/core/db_search.py +512 -0
  16. spatial_memory/core/db_versioning.py +178 -0
  17. spatial_memory/core/embeddings.py +558 -0
  18. spatial_memory/core/errors.py +317 -0
  19. spatial_memory/core/file_security.py +701 -0
  20. spatial_memory/core/filesystem.py +178 -0
  21. spatial_memory/core/health.py +289 -0
  22. spatial_memory/core/helpers.py +79 -0
  23. spatial_memory/core/import_security.py +433 -0
  24. spatial_memory/core/lifecycle_ops.py +1067 -0
  25. spatial_memory/core/logging.py +194 -0
  26. spatial_memory/core/metrics.py +192 -0
  27. spatial_memory/core/models.py +660 -0
  28. spatial_memory/core/rate_limiter.py +326 -0
  29. spatial_memory/core/response_types.py +500 -0
  30. spatial_memory/core/security.py +588 -0
  31. spatial_memory/core/spatial_ops.py +430 -0
  32. spatial_memory/core/tracing.py +300 -0
  33. spatial_memory/core/utils.py +110 -0
  34. spatial_memory/core/validation.py +406 -0
  35. spatial_memory/factory.py +444 -0
  36. spatial_memory/migrations/__init__.py +40 -0
  37. spatial_memory/ports/__init__.py +11 -0
  38. spatial_memory/ports/repositories.py +630 -0
  39. spatial_memory/py.typed +0 -0
  40. spatial_memory/server.py +1214 -0
  41. spatial_memory/services/__init__.py +70 -0
  42. spatial_memory/services/decay_manager.py +411 -0
  43. spatial_memory/services/export_import.py +1031 -0
  44. spatial_memory/services/lifecycle.py +1139 -0
  45. spatial_memory/services/memory.py +412 -0
  46. spatial_memory/services/spatial.py +1152 -0
  47. spatial_memory/services/utility.py +429 -0
  48. spatial_memory/tools/__init__.py +5 -0
  49. spatial_memory/tools/definitions.py +695 -0
  50. spatial_memory/verify.py +140 -0
  51. spatial_memory_mcp-1.9.1.dist-info/METADATA +509 -0
  52. spatial_memory_mcp-1.9.1.dist-info/RECORD +55 -0
  53. spatial_memory_mcp-1.9.1.dist-info/WHEEL +4 -0
  54. spatial_memory_mcp-1.9.1.dist-info/entry_points.txt +2 -0
  55. spatial_memory_mcp-1.9.1.dist-info/licenses/LICENSE +21 -0
spatial_memory/services/export_import.py
@@ -0,0 +1,1031 @@
+ """Export/Import service for memory data portability.
+
+ This service provides the application layer for memory export/import operations:
+ - export_memories: Export memories to Parquet/JSON/CSV formats
+ - import_memories: Import memories with validation and deduplication
+
+ The service uses dependency injection for repository and embedding services,
+ following Clean Architecture principles. File I/O and format conversion are
+ handled at this service layer, while the repository handles only data access.
+
+ Security is enforced through PathValidator for all file operations.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ import json
+ import logging
+ import time
+ from collections.abc import Iterator, Sequence
+ from datetime import datetime
+ from io import TextIOWrapper
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, BinaryIO
+
+ import numpy as np
+
+ from spatial_memory.core.errors import (
+     ExportError,
+     FileSizeLimitError,
+     ImportRecordLimitError,
+     MemoryImportError,
+     PathSecurityError,
+     ValidationError,
+ )
+ from spatial_memory.core.file_security import PathValidator
+ from spatial_memory.core.models import (
+     ExportImportConfig,
+     ExportResult,
+     ImportedMemory,
+     ImportResult,
+     ImportValidationError,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Explicit exports for mypy
+ __all__ = [
+     "ExportImportConfig",
+     "ExportImportService",
+ ]
+
+ if TYPE_CHECKING:
+     import pyarrow as pa
+
+     from spatial_memory.ports.repositories import (
+         EmbeddingServiceProtocol,
+         MemoryRepositoryProtocol,
+     )
+
+
+ # =============================================================================
+ # Constants
+ # =============================================================================
+
+ SUPPORTED_FORMATS = frozenset({"parquet", "json", "csv"})
+
+ EXTENSION_TO_FORMAT: dict[str, str] = {
+     ".parquet": "parquet",
+     ".json": "json",
+     ".jsonl": "json",
+     ".csv": "csv",
+ }
+
+ # Required fields for import validation
+ REQUIRED_IMPORT_FIELDS = frozenset({"content"})
+
+ # Default import size limit (100 MB)
+ DEFAULT_MAX_IMPORT_SIZE_BYTES = 100 * 1024 * 1024
+
+
+ # =============================================================================
+ # Service Implementation
+ # =============================================================================
+
+
+ class ExportImportService:
+     """Service for memory export and import operations.
+
+     Uses Clean Architecture - depends on protocol interfaces, not implementations.
+     Handles file I/O and format conversion at the service layer while delegating
+     data access to the repository.
+
+     Security Features:
+         - Path validation to prevent traversal attacks
+         - File size limits for imports
+         - Symlink detection (optional)
+         - Extension validation
+
+     Example:
+         service = ExportImportService(
+             repository=repo,
+             embeddings=emb,
+             allowed_export_paths=[Path("./exports")],
+             allowed_import_paths=[Path("./imports")],
+         )
+
+         # Export memories
+         result = service.export_memories(
+             output_path="./exports/backup.parquet",
+             namespace="work",
+         )
+
+         # Import memories
+         result = service.import_memories(
+             source_path="./imports/restore.json",
+             dry_run=False,
+         )
+     """
+
+     def __init__(
+         self,
+         repository: MemoryRepositoryProtocol,
+         embeddings: EmbeddingServiceProtocol,
+         config: ExportImportConfig | None = None,
+         allowed_export_paths: Sequence[str | Path] | None = None,
+         allowed_import_paths: Sequence[str | Path] | None = None,
+         allow_symlinks: bool = False,
+         max_import_size_bytes: int | None = None,
+     ) -> None:
+         """Initialize the export/import service.
+
+         Args:
+             repository: Repository for memory storage.
+             embeddings: Service for generating embeddings.
+             config: Optional configuration (uses defaults if not provided).
+             allowed_export_paths: Directories where exports are permitted.
+             allowed_import_paths: Directories where imports are permitted.
+             allow_symlinks: Whether to allow following symlinks (default False).
+             max_import_size_bytes: Maximum import file size in bytes.
+         """
+         self._repo = repository
+         self._embeddings = embeddings
+         self._config = config or ExportImportConfig()
+
+         # Set up path validator
+         export_paths = allowed_export_paths or [Path("./exports"), Path("./backups")]
+         import_paths = allowed_import_paths or [Path("./imports"), Path("./backups")]
+
+         self._path_validator = PathValidator(
+             allowed_export_paths=export_paths,
+             allowed_import_paths=import_paths,
+             allow_symlinks=allow_symlinks,
+         )
+
+         self._max_import_size_bytes = (
+             max_import_size_bytes or DEFAULT_MAX_IMPORT_SIZE_BYTES
+         )
+
+     def export_memories(
+         self,
+         output_path: str,
+         format: str | None = None,
+         namespace: str | None = None,
+         include_vectors: bool = True,
+     ) -> ExportResult:
+         """Export memories to a file.
+
+         Streams data from repository and writes to the specified format.
+         Supports Parquet (recommended for full fidelity), JSON, and CSV.
+
+         Args:
+             output_path: Path for output file. Extension determines format
+                 if format parameter is not specified.
+             format: Export format (parquet, json, csv). Auto-detected from
+                 extension if not specified.
+             namespace: Export only this namespace (all if not specified).
+             include_vectors: Include embedding vectors in export (default True).
+                 Note: CSV exports may set this to False for readability.
+
+         Returns:
+             ExportResult with export statistics.
+
+         Raises:
+             ExportError: If export operation fails.
+             PathSecurityError: If path validation fails.
+             ValidationError: If input validation fails.
+         """
+         start_time = time.monotonic()
+
+         # Validate and resolve path
+         try:
+             canonical_path = self._path_validator.validate_export_path(output_path)
+         except (PathSecurityError, ValueError) as e:
+             raise PathSecurityError(
+                 path=output_path,
+                 violation_type="export_path_validation_failed",
+                 message=str(e),
+             ) from e
+
+         # Detect or validate format
+         detected_format = format or self._detect_format(output_path)
+         if detected_format is None:
+             detected_format = self._config.default_export_format
+
+         if detected_format not in SUPPORTED_FORMATS:
+             raise ValidationError(
+                 f"Unsupported export format: {detected_format}. "
+                 f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
+             )
+
+         try:
+             # Check export record limit before starting
+             if self._config.max_export_records > 0:
+                 memory_count = self._repo.count(namespace=namespace)
+                 if memory_count > self._config.max_export_records:
+                     raise ExportError(
+                         f"Export would contain {memory_count} records, "
+                         f"exceeding limit of {self._config.max_export_records}. "
+                         "Consider filtering by namespace or increasing max_export_records."
+                     )
+
+             # Ensure parent directory exists
+             canonical_path.parent.mkdir(parents=True, exist_ok=True)
+
+             # Stream data from repository
+             batches = self._repo.get_all_for_export(
+                 namespace=namespace,
+                 batch_size=self._config.export_batch_size,
+             )
+
+             # Get namespaces for result
+             if namespace:
+                 namespaces_included = [namespace]
+             else:
+                 namespaces_included = self._repo.get_namespaces()
+
+             # Export based on format
+             if detected_format == "parquet":
+                 memories_exported = self._export_parquet(
+                     canonical_path, batches, include_vectors
+                 )
+             elif detected_format == "json":
+                 memories_exported = self._export_json(
+                     canonical_path, batches, include_vectors
+                 )
+             elif detected_format == "csv":
+                 memories_exported = self._export_csv(
+                     canonical_path, batches, include_vectors
+                 )
+             else:
+                 raise ExportError(f"Unsupported format: {detected_format}")
+
+             # Calculate file size
+             if canonical_path.exists():
+                 file_size_bytes = canonical_path.stat().st_size
+             else:
+                 file_size_bytes = 0
+
+             duration_seconds = time.monotonic() - start_time
+
+             return ExportResult(
+                 format=detected_format,
+                 output_path=str(canonical_path),
+                 memories_exported=memories_exported,
+                 file_size_bytes=file_size_bytes,
+                 file_size_mb=file_size_bytes / (1024 * 1024),
+                 namespaces_included=namespaces_included,
+                 duration_seconds=duration_seconds,
+                 compression="zstd" if detected_format == "parquet" else None,
+             )
+
+         except (ExportError, PathSecurityError, ValidationError):
+             raise
+         except Exception as e:
+             logger.error(f"Export failed: {e}")
+             raise ExportError(f"Export operation failed: {e}") from e
+
+     def import_memories(
+         self,
+         source_path: str,
+         format: str | None = None,
+         namespace_override: str | None = None,
+         deduplicate: bool = False,
+         dedup_threshold: float = 0.95,
+         validate: bool = True,
+         regenerate_embeddings: bool = False,
+         dry_run: bool = True,
+     ) -> ImportResult:
+         """Import memories from a file.
+
+         Parses the file, validates records, optionally deduplicates against
+         existing memories, and imports to the repository.
+
+         Args:
+             source_path: Path to source file.
+             format: Import format (parquet, json, csv). Auto-detected from
+                 extension if not specified.
+             namespace_override: Override namespace for all imported memories.
+             deduplicate: Skip records similar to existing memories (default False).
+             dedup_threshold: Similarity threshold for deduplication (0.7-0.99).
+             validate: Validate records before import (default True).
+             regenerate_embeddings: Generate new embeddings for imported memories.
+                 Required if source lacks vectors or dimensions don't match.
+             dry_run: Validate without importing (default True). Set to False
+                 to actually import the memories.
+
+         Returns:
+             ImportResult with import statistics and validation errors.
+
+         Raises:
+             MemoryImportError: If import operation fails.
+             PathSecurityError: If path validation fails.
+             FileSizeLimitError: If file exceeds size limit.
+             ValidationError: If input validation fails.
+         """
+         start_time = time.monotonic()
+
+         # Detect or validate format BEFORE opening file
+         detected_format = format or self._detect_format(source_path)
+         if detected_format is None:
+             raise ValidationError(
+                 f"Cannot detect format from path: {source_path}. "
+                 "Please specify format explicitly."
+             )
+
+         if detected_format not in SUPPORTED_FORMATS:
+             raise ValidationError(
+                 f"Unsupported import format: {detected_format}. "
+                 f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
+             )
+
+         # Validate dedup threshold
+         if deduplicate and not 0.7 <= dedup_threshold <= 0.99:
+             raise ValidationError(
+                 "dedup_threshold must be between 0.7 and 0.99"
+             )
+
+         # ATOMIC: Validate and open file in one step (prevents TOCTOU)
+         # The file handle MUST be used for reading, not re-opened by path
+         try:
+             canonical_path, file_handle = self._path_validator.validate_and_open_import_file(
+                 source_path,
+                 max_size_bytes=self._max_import_size_bytes,
+             )
+         except PathSecurityError as e:
+             raise e
+         except FileSizeLimitError as e:
+             raise e
+         except ValueError as e:
+             raise PathSecurityError(
+                 path=source_path,
+                 violation_type="import_path_validation_failed",
+                 message=str(e),
+             ) from e
+
+         try:
+             # Parse file using the ALREADY OPEN file handle (TOCTOU safe)
+             if detected_format == "parquet":
+                 records_iter = self._parse_parquet_from_handle(file_handle, canonical_path)
+             elif detected_format == "json":
+                 records_iter = self._parse_json_from_handle(file_handle)
+             elif detected_format == "csv":
+                 records_iter = self._parse_csv_from_handle(file_handle)
+             else:
+                 raise MemoryImportError(f"Unsupported format: {detected_format}")
+
+             # Stream records with early termination to prevent memory exhaustion.
+             # Check limit during iteration, not after loading all records.
+             max_records = self._config.max_import_records
+             records: list[dict[str, Any]] = []
+
+             for record in records_iter:
+                 records.append(record)
+                 # Fail fast if limit exceeded - prevents memory exhaustion from large files
+                 if max_records > 0 and len(records) > max_records:
+                     raise ImportRecordLimitError(
+                         actual_count=len(records),
+                         max_count=max_records,
+                     )
+         finally:
+             # Ensure file is closed even if parsing fails
+             file_handle.close()
+
+         try:
+
+             # Process records
+             total_records = 0
+             valid_records: list[dict[str, Any]] = []
+             validation_errors: list[ImportValidationError] = []
+             skipped_count = 0
+             failed_count = 0
+             imported_memories: list[ImportedMemory] = []
+
+             for idx, record in enumerate(records):
+                 total_records += 1
+
+                 # Validate record if requested
+                 if validate:
+                     expected_dims = (
+                         self._embeddings.dimensions
+                         if not regenerate_embeddings
+                         else None
+                     )
+                     errors = self._validate_record(record, idx, expected_dims)
+                     if errors:
+                         validation_errors.extend(errors)
+                         failed_count += 1
+                         continue
+
+                 # Apply namespace override
+                 if namespace_override:
+                     record["namespace"] = namespace_override
+
+                 # Handle embeddings
+                 if regenerate_embeddings or "vector" not in record:
+                     if not dry_run:
+                         vector = self._embeddings.embed(record["content"])
+                         record["vector"] = vector.tolist()
+                     else:
+                         # In dry run, just mark that we would regenerate
+                         record["_needs_embedding"] = True
+
+                 # Deduplicate if requested
+                 if deduplicate and not dry_run:
+                     is_duplicate = self._check_duplicate(
+                         record, dedup_threshold
+                     )
+                     if is_duplicate is True:
+                         skipped_count += 1
+                         continue
+                     # If is_duplicate is None (check failed), proceed with import
+                     # This is a conservative policy - import on failure
+
+                 valid_records.append(record)
+
+             # Import if not dry run
+             memories_imported = 0
+             imported_ids: list[str] = []
+
+             if not dry_run and valid_records:
+                 # Filter out internal fields
+                 import_records = [
+                     {k: v for k, v in r.items() if not k.startswith("_")}
+                     for r in valid_records
+                 ]
+
+                 memories_imported, imported_ids = self._repo.bulk_import(
+                     iter(import_records),
+                     batch_size=self._config.import_batch_size,
+                     namespace_override=namespace_override,
+                 )
+
+                 # Build imported memories list
+                 for record, new_id in zip(valid_records, imported_ids):
+                     content = record.get("content", "")
+                     preview = content[:100] + "..." if len(content) > 100 else content
+                     imported_memories.append(
+                         ImportedMemory(
+                             id=new_id,
+                             content_preview=preview,
+                             namespace=record.get("namespace", "default"),
+                             was_deduplicated=False,
+                             original_id=record.get("id"),
+                         )
+                     )
+             elif dry_run:
+                 # In dry run, count valid records as "would be imported"
+                 memories_imported = len(valid_records)
+
+             duration_seconds = time.monotonic() - start_time
+
+             return ImportResult(
+                 source_path=str(canonical_path),
+                 format=detected_format,
+                 total_records_in_file=total_records,
+                 memories_imported=memories_imported,
+                 memories_skipped=skipped_count,
+                 memories_failed=failed_count,
+                 validation_errors=validation_errors,
+                 duration_seconds=duration_seconds,
+                 namespace_override=namespace_override,
+                 imported_memories=imported_memories if not dry_run else None,
+             )
+
+         except (MemoryImportError, PathSecurityError, ValidationError, FileSizeLimitError):
+             raise
+         except json.JSONDecodeError as e:
+             raise MemoryImportError(f"Invalid JSON in import file: {e}") from e
+         except Exception as e:
+             logger.error(f"Import failed: {e}")
+             raise MemoryImportError(f"Import operation failed: {e}") from e
+
+     # =========================================================================
+     # Format Detection
+     # =========================================================================
+
+     def _detect_format(self, path: str) -> str | None:
+         """Detect format from file extension.
+
+         Args:
+             path: File path.
+
+         Returns:
+             Format string or None if unknown.
+         """
+         path_obj = Path(path)
+         ext = path_obj.suffix.lower()
+         return EXTENSION_TO_FORMAT.get(ext)
+
+     # =========================================================================
+     # Export Format Handlers
+     # =========================================================================
+
+     def _create_parquet_schema(self, include_vectors: bool) -> pa.Schema:
+         """Create PyArrow schema for Parquet export.
+
+         Args:
+             include_vectors: Whether to include embedding vector field.
+
+         Returns:
+             PyArrow schema for memory records.
+         """
+         import pyarrow as pa
+
+         fields = [
+             ("id", pa.string()),
+             ("content", pa.string()),
+             ("namespace", pa.string()),
+             ("importance", pa.float32()),
+             ("tags", pa.list_(pa.string())),
+             ("source", pa.string()),
+             ("metadata", pa.string()),
+             ("created_at", pa.timestamp("us", tz="UTC")),
+             ("updated_at", pa.timestamp("us", tz="UTC")),
+             ("last_accessed", pa.timestamp("us", tz="UTC")),
+             ("access_count", pa.int32()),
+         ]
+         if include_vectors:
+             fields.append(("vector", pa.list_(pa.float32())))
+         return pa.schema(fields)
+
+     def _export_parquet(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to Parquet format using streaming writes.
+
+         Uses ParquetWriter to write batches incrementally, avoiding
+         accumulation of all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         try:
+             import pyarrow as pa
+             import pyarrow.parquet as pq
+         except ImportError as e:
+             raise ExportError(
+                 "pyarrow is required for Parquet export. "
+                 "Install with: pip install pyarrow"
+             ) from e
+
+         schema = self._create_parquet_schema(include_vectors)
+         total_records = 0
+         writer: pq.ParquetWriter | None = None
+
+         try:
+             for batch in batches:
+                 if not batch:
+                     continue
+
+                 # Process records for this batch
+                 processed_records: list[dict[str, Any]] = []
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+                     # Parquet needs metadata as string to avoid empty struct issues
+                     if "metadata" in processed:
+                         if isinstance(processed["metadata"], dict):
+                             processed["metadata"] = json.dumps(processed["metadata"])
+                     processed_records.append(processed)
+
+                 if not processed_records:
+                     continue
+
+                 # Create table from this batch
+                 batch_table = pa.Table.from_pylist(processed_records, schema=schema)
+
+                 # Initialize writer on first batch with data
+                 if writer is None:
+                     writer = pq.ParquetWriter(
+                         path,
+                         schema,
+                         compression=self._config.parquet_compression,
+                     )
+
+                 writer.write_table(batch_table)
+                 total_records += len(processed_records)
+
+             # Handle empty export case - write an empty file with schema
+             if writer is None:
+                 empty_table = pa.Table.from_pydict(
+                     {f.name: [] for f in schema}, schema=schema
+                 )
+                 pq.write_table(
+                     empty_table,
+                     path,
+                     compression=self._config.parquet_compression,
+                 )
+
+         finally:
+             if writer is not None:
+                 writer.close()
+
+         return total_records
+
+     def _export_json(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to JSON format using streaming to avoid memory exhaustion.
+
+         Writes a valid JSON array by streaming records one at a time,
+         without accumulating all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         total_records = 0
+         first_record = True
+
+         with open(path, "w", encoding="utf-8") as f:
+             f.write("[\n")
+
+             for batch in batches:
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+
+                     # Add comma separator for all but first record
+                     if not first_record:
+                         f.write(",\n")
+                     first_record = False
+
+                     # Write the record
+                     json_str = json.dumps(processed, default=self._json_serializer, indent=2)
+                     # Indent each line for pretty formatting
+                     indented = "\n".join(" " + line for line in json_str.split("\n"))
+                     f.write(indented)
+
+                     total_records += 1
+
+             f.write("\n]")
+
+         return total_records
+
+     def _export_csv(
+         self,
+         path: Path,
+         batches: Iterator[list[dict[str, Any]]],
+         include_vectors: bool,
+     ) -> int:
+         """Export to CSV format using streaming to avoid memory exhaustion.
+
+         Writes CSV rows as they are processed without accumulating
+         all records in memory.
+
+         Args:
+             path: Output file path.
+             batches: Iterator of record batches.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Number of records exported.
+         """
+         # Define fieldnames upfront
+         fieldnames = [
+             "id", "content", "namespace", "importance", "tags",
+             "source", "metadata", "created_at", "updated_at",
+             "last_accessed", "access_count"
+         ]
+         if include_vectors:
+             fieldnames.append("vector")
+
+         total_records = 0
+
+         with open(path, "w", newline="", encoding="utf-8") as f:
+             writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+             writer.writeheader()
+
+             for batch in batches:
+                 for record in batch:
+                     processed = self._prepare_record_for_export(record, include_vectors)
+                     # Convert complex types to strings for CSV
+                     processed["tags"] = json.dumps(processed.get("tags", []))
+                     processed["metadata"] = json.dumps(processed.get("metadata", {}))
+                     if include_vectors and "vector" in processed:
+                         processed["vector"] = json.dumps(processed["vector"])
+                     # Convert datetimes to ISO format
+                     for key in ["created_at", "updated_at", "last_accessed"]:
+                         if key in processed and processed[key] is not None:
+                             if isinstance(processed[key], datetime):
+                                 processed[key] = processed[key].isoformat()
+
+                     writer.writerow(processed)
+                     total_records += 1
+
+         return total_records
+
+     def _prepare_record_for_export(
+         self,
+         record: dict[str, Any],
+         include_vectors: bool,
+     ) -> dict[str, Any]:
+         """Prepare a record for export.
+
+         Args:
+             record: Raw record from repository.
+             include_vectors: Whether to include embedding vectors.
+
+         Returns:
+             Processed record suitable for export.
+         """
+         processed = dict(record)
+
+         # Handle vector
+         if not include_vectors:
+             processed.pop("vector", None)
+         elif "vector" in processed:
+             # Ensure vector is a list, not numpy array
+             vec = processed["vector"]
+             if isinstance(vec, np.ndarray):
+                 processed["vector"] = vec.tolist()
+
+         # Handle metadata - ensure it's JSON serializable
+         if "metadata" in processed:
+             meta = processed["metadata"]
+             if isinstance(meta, str):
+                 try:
+                     processed["metadata"] = json.loads(meta)
+                 except json.JSONDecodeError:
+                     processed["metadata"] = {}
+
+         return processed
+
+     def _json_serializer(self, obj: Any) -> Any:
+         """Custom JSON serializer for complex types."""
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         if isinstance(obj, np.ndarray):
+             return obj.tolist()
+         if isinstance(obj, np.floating):
+             return float(obj)
+         if isinstance(obj, np.integer):
+             return int(obj)
+         raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
+     # =========================================================================
+     # Import Format Handlers (TOCTOU-safe versions using file handles)
+     # =========================================================================
+
+     def _parse_parquet_from_handle(
+         self, file_handle: BinaryIO, path: Path
+     ) -> Iterator[dict[str, Any]]:
+         """Parse Parquet from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+             path: Original path (for error messages only).
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         try:
+             import pyarrow.parquet as pq
+         except ImportError as e:
+             raise MemoryImportError(
+                 "pyarrow is required for Parquet import. "
+                 "Install with: pip install pyarrow"
+             ) from e
+
+         try:
+             # PyArrow can read from file-like objects
+             table = pq.read_table(file_handle)
+             records = table.to_pylist()
+
+             for record in records:
+                 # Convert metadata from string if needed
+                 if "metadata" in record and isinstance(record["metadata"], str):
+                     try:
+                         record["metadata"] = json.loads(record["metadata"])
+                     except json.JSONDecodeError:
+                         record["metadata"] = {}
+                 yield record
+         except Exception as e:
+             raise MemoryImportError(f"Failed to parse Parquet file {path}: {e}") from e
+
+     def _parse_json_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
+         """Parse JSON from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         # Read and decode content
+         content = file_handle.read().decode("utf-8").strip()
+
+         # Handle both JSON array and JSON Lines formats
+         if content.startswith("["):
+             # JSON array
+             records = json.loads(content)
+             yield from records
+         else:
+             # JSON Lines (one object per line)
+             for line in content.split("\n"):
+                 line = line.strip()
+                 if line:
+                     yield json.loads(line)
+
+     def _parse_csv_from_handle(self, file_handle: BinaryIO) -> Iterator[dict[str, Any]]:
+         """Parse CSV from an already-open file handle (TOCTOU-safe).
+
+         Args:
+             file_handle: Open binary file handle.
+
+         Yields:
+             Memory records as dictionaries.
+         """
+         # Wrap binary handle in text wrapper for CSV reader
+         text_handle = TextIOWrapper(file_handle, encoding="utf-8", newline="")
+         try:
+             reader = csv.DictReader(text_handle)
+
+             for row in reader:
+                 record: dict[str, Any] = dict(row)
+
+                 # Convert string fields to appropriate types
+                 if "importance" in record:
+                     try:
+                         record["importance"] = float(record["importance"])
+                     except (ValueError, TypeError):
+                         record["importance"] = 0.5
+
+                 if "access_count" in record:
+                     try:
+                         record["access_count"] = int(record["access_count"])
+                     except (ValueError, TypeError):
+                         record["access_count"] = 0
+
+                 # Parse JSON fields
+                 if "tags" in record and isinstance(record["tags"], str):
+                     try:
+                         record["tags"] = json.loads(record["tags"])
+                     except json.JSONDecodeError:
+                         record["tags"] = []
+
+                 if "metadata" in record and isinstance(record["metadata"], str):
+                     try:
+                         record["metadata"] = json.loads(record["metadata"])
+                     except json.JSONDecodeError:
+                         record["metadata"] = {}
+
+                 if "vector" in record and isinstance(record["vector"], str):
+                     try:
+                         record["vector"] = json.loads(record["vector"])
+                     except json.JSONDecodeError:
+                         # Remove invalid vector
+                         del record["vector"]
+
+                 yield record
+         finally:
+             # Detach text wrapper to prevent it from closing the underlying handle
+             text_handle.detach()
+
+     # =========================================================================
+     # Validation
+     # =========================================================================
+
+     def _validate_record(
+         self,
+         record: dict[str, Any],
+         row_number: int,
+         expected_dims: int | None = None,
+     ) -> list[ImportValidationError]:
+         """Validate a single import record.
+
+         Args:
+             record: Record to validate.
+             row_number: Row number for error reporting.
+             expected_dims: Expected vector dimensions (None to skip check).
+
+         Returns:
+             List of validation errors (empty if valid).
+         """
+         errors: list[ImportValidationError] = []
+
+         # Check required fields
+         for field in REQUIRED_IMPORT_FIELDS:
+             if field not in record or record[field] is None:
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field=field,
+                         error=f"Required field '{field}' is missing",
+                         value=None,
+                     )
+                 )
+             elif field == "content" and not str(record[field]).strip():
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field=field,
+                         error="Content cannot be empty",
+                         value=str(record[field])[:50],
+                     )
+                 )
+
+         # Validate importance range
+         if "importance" in record:
+             importance = record["importance"]
+             try:
+                 importance_float = float(importance)
+                 if not 0.0 <= importance_float <= 1.0:
+                     errors.append(
+                         ImportValidationError(
+                             row_number=row_number,
+                             field="importance",
+                             error="Importance must be between 0.0 and 1.0",
+                             value=str(importance),
+                         )
+                     )
+             except (ValueError, TypeError):
+                 errors.append(
+                     ImportValidationError(
+                         row_number=row_number,
+                         field="importance",
+                         error="Importance must be a number",
+                         value=str(importance)[:50],
+                     )
+                 )
+
+         # Validate vector dimensions
+         if expected_dims is not None and "vector" in record:
+             vector = record["vector"]
+             if vector is not None:
+                 try:
+                     if isinstance(vector, (list, np.ndarray)):
+                         actual_dims = len(vector)
+                         if actual_dims != expected_dims:
+                             errors.append(
+                                 ImportValidationError(
+                                     row_number=row_number,
+                                     field="vector",
+                                     error=f"Vector dimension mismatch: expected "
+                                     f"{expected_dims}, got {actual_dims}",
+                                     value=f"[{actual_dims} dimensions]",
+                                 )
+                             )
+                 except (TypeError, AttributeError):
+                     errors.append(
+                         ImportValidationError(
+                             row_number=row_number,
+                             field="vector",
+                             error="Vector must be an array of numbers",
+                             value=str(type(vector)),
+                         )
+                     )
+
+         return errors
+
+     # =========================================================================
+     # Deduplication
+     # =========================================================================
+
+     def _check_duplicate(
+         self,
+         record: dict[str, Any],
+         threshold: float,
+     ) -> bool | None:
+         """Check if record is a duplicate of an existing memory.
+
+         Args:
+             record: Record to check.
+             threshold: Similarity threshold for deduplication.
+
+         Returns:
+             True if record is a duplicate.
+             False if no duplicate found.
+             None if the check failed (let caller decide policy).
+         """
+         try:
+             # Get vector for comparison
+             if "vector" in record and record["vector"] is not None:
+                 vector = np.array(record["vector"], dtype=np.float32)
+             else:
+                 # Generate embedding for comparison
+                 vector = self._embeddings.embed(record["content"])
+
+             # Search for similar existing memories
+             namespace = record.get("namespace")
+             results = self._repo.search(vector, limit=5, namespace=namespace)
+
+             # Check if any result exceeds threshold
+             for result in results:
+                 if result.similarity >= threshold:
+                     logger.debug(
+                         f"Duplicate found: similarity {result.similarity:.3f} "
+                         f">= threshold {threshold:.3f}"
+                     )
+                     return True
+
+             return False
+
+         except Exception as e:
+             logger.warning(f"Duplicate check failed: {e}")
+             return None
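
For orientation, a minimal export-then-import round trip with this service might look like the sketch below. It is built only from the class docstring and method signatures shown above: `repo` and `emb` stand in for whatever objects satisfy MemoryRepositoryProtocol and EmbeddingServiceProtocol in a given deployment, the file paths are placeholders, and the module path is taken from the file listing (spatial_memory/services/export_import.py).

    from pathlib import Path

    from spatial_memory.services.export_import import ExportImportService


    def export_then_import_example(repo, emb) -> None:
        """Hypothetical helper: export one namespace, then import a file with a dry run first."""
        service = ExportImportService(
            repository=repo,
            embeddings=emb,
            allowed_export_paths=[Path("./exports")],
            allowed_import_paths=[Path("./imports")],
        )

        # Format is inferred from the .parquet extension; Parquet keeps vectors intact.
        export_result = service.export_memories(
            output_path="./exports/backup.parquet",
            namespace="work",
        )
        print(f"exported {export_result.memories_exported} memories "
              f"({export_result.file_size_mb:.2f} MB)")

        # dry_run defaults to True, so this pass only validates and reports errors.
        preview = service.import_memories(source_path="./imports/restore.json")
        if preview.validation_errors:
            print(f"{preview.memories_failed} records failed validation")
            return

        # Real import, deduplicating against existing memories above 0.95 similarity.
        result = service.import_memories(
            source_path="./imports/restore.json",
            deduplicate=True,
            dedup_threshold=0.95,
            dry_run=False,
        )
        print(f"imported {result.memories_imported}, skipped {result.memories_skipped}")

Running the dry run first mirrors the method's own defaults: validation errors surface in ImportResult.validation_errors without touching the repository, and only the second call writes anything.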