tribalmemory-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. tribalmemory/__init__.py +3 -0
  2. tribalmemory/a21/__init__.py +38 -0
  3. tribalmemory/a21/config/__init__.py +20 -0
  4. tribalmemory/a21/config/providers.py +104 -0
  5. tribalmemory/a21/config/system.py +184 -0
  6. tribalmemory/a21/container/__init__.py +8 -0
  7. tribalmemory/a21/container/container.py +212 -0
  8. tribalmemory/a21/providers/__init__.py +32 -0
  9. tribalmemory/a21/providers/base.py +241 -0
  10. tribalmemory/a21/providers/deduplication.py +99 -0
  11. tribalmemory/a21/providers/lancedb.py +232 -0
  12. tribalmemory/a21/providers/memory.py +128 -0
  13. tribalmemory/a21/providers/mock.py +54 -0
  14. tribalmemory/a21/providers/openai.py +151 -0
  15. tribalmemory/a21/providers/timestamp.py +88 -0
  16. tribalmemory/a21/system.py +293 -0
  17. tribalmemory/cli.py +298 -0
  18. tribalmemory/interfaces.py +306 -0
  19. tribalmemory/mcp/__init__.py +9 -0
  20. tribalmemory/mcp/__main__.py +6 -0
  21. tribalmemory/mcp/server.py +484 -0
  22. tribalmemory/performance/__init__.py +1 -0
  23. tribalmemory/performance/benchmarks.py +285 -0
  24. tribalmemory/performance/corpus_generator.py +171 -0
  25. tribalmemory/portability/__init__.py +1 -0
  26. tribalmemory/portability/embedding_metadata.py +320 -0
  27. tribalmemory/server/__init__.py +9 -0
  28. tribalmemory/server/__main__.py +6 -0
  29. tribalmemory/server/app.py +187 -0
  30. tribalmemory/server/config.py +115 -0
  31. tribalmemory/server/models.py +206 -0
  32. tribalmemory/server/routes.py +378 -0
  33. tribalmemory/services/__init__.py +15 -0
  34. tribalmemory/services/deduplication.py +115 -0
  35. tribalmemory/services/embeddings.py +273 -0
  36. tribalmemory/services/import_export.py +506 -0
  37. tribalmemory/services/memory.py +275 -0
  38. tribalmemory/services/vector_store.py +360 -0
  39. tribalmemory/testing/__init__.py +22 -0
  40. tribalmemory/testing/embedding_utils.py +110 -0
  41. tribalmemory/testing/fixtures.py +123 -0
  42. tribalmemory/testing/metrics.py +256 -0
  43. tribalmemory/testing/mocks.py +560 -0
  44. tribalmemory/testing/semantic_expansions.py +91 -0
  45. tribalmemory/utils.py +23 -0
  46. tribalmemory-0.1.0.dist-info/METADATA +275 -0
  47. tribalmemory-0.1.0.dist-info/RECORD +51 -0
  48. tribalmemory-0.1.0.dist-info/WHEEL +5 -0
  49. tribalmemory-0.1.0.dist-info/entry_points.txt +3 -0
  50. tribalmemory-0.1.0.dist-info/licenses/LICENSE +190 -0
  51. tribalmemory-0.1.0.dist-info/top_level.txt +1 -0
tribalmemory/services/import_export.py
@@ -0,0 +1,506 @@
+"""Import/export service for data migration (Issue #7).
+
+Provides filtered export and conflict-aware import of memory
+entries using the portable bundle format defined in
+``tribalmemory.portability.embedding_metadata``.
+
+Export supports filtering by:
+- Tags (any-match)
+- Date range (``created_at``)
+
+Import supports conflict resolution:
+- SKIP (default): ignore entries whose ID already exists
+- OVERWRITE: replace existing entries unconditionally
+- MERGE: keep whichever entry has the newer ``updated_at``
+
+Import also supports **dry-run mode**: when ``dry_run=True`` the
+import walks every entry and reports what *would* happen without
+writing to the store. Useful for previewing changes before commit.
+
+Timezone assumption:
+    All naive ``datetime`` objects are treated as UTC. This is
+    consistent with ``MemoryEntry.created_at`` and ``updated_at``
+    which default to ``datetime.utcnow()`` (naive UTC).
+
+Recommended limits:
+    Export loads all matching entries into memory. For stores with
+    more than ~50k entries, consider exporting in batches (e.g.
+    by date range) to limit peak memory usage. The hard default
+    is ``MAX_EXPORT_ENTRIES`` (100 000).
+
+    For very large datasets, use ``export_memories_streaming()``
+    which yields one entry at a time via an async generator and
+    avoids loading the full result set into memory.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from collections.abc import AsyncIterator, Callable
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Literal, Optional
+
+from ..interfaces import IVectorStore, MemoryEntry
+from ..portability.embedding_metadata import (
+    EmbeddingMetadata,
+    EmbeddingManifest,
+    PortableBundle,
+    ReembeddingStrategy,
+    create_portable_bundle,
+    import_bundle as portability_import_bundle,
+)
+
+logger = logging.getLogger(__name__)
+
+# Hard ceiling on a single export. A warning is logged when the
+# result set reaches this limit. For larger stores, export in
+# batches (e.g. by date range) or use export_memories_streaming().
+MAX_EXPORT_ENTRIES = 100_000
+
+# Valid values for user-facing enum parameters
+VALID_CONFLICT_RESOLUTIONS = {"skip", "overwrite", "merge"}
+VALID_EMBEDDING_STRATEGIES = {"auto", "keep", "drop"}
+
+
+class ConflictResolution(Enum):
+    """How to handle ID collisions on import."""
+    SKIP = "skip"
+    OVERWRITE = "overwrite"
+    MERGE = "merge"
+
+
+@dataclass
+class ExportFilter:
+    """Filters for memory export.
+
+    Attributes:
+        tags: Include entries matching *any* of these tags.
+            ``None`` means no tag filter.
+        date_from: Include entries created on or after this.
+        date_to: Include entries created on or before this.
+    """
+    tags: Optional[list[str]] = None
+    date_from: Optional[datetime] = None
+    date_to: Optional[datetime] = None
+
+
+@dataclass
+class ImportSummary:
+    """Result summary of an import operation.
+
+    Attributes:
+        dry_run: True if this was a preview — no writes occurred.
+    """
+    total: int = 0
+    imported: int = 0
+    skipped: int = 0
+    overwritten: int = 0
+    errors: int = 0
+    needs_reembedding: bool = False
+    dry_run: bool = False
+    duration_ms: float = 0.0
+    error_details: list[str] = field(default_factory=list)
+
+
+# -------------------------------------------------------------- #
+# Export                                                          #
+# -------------------------------------------------------------- #
+
+
+async def export_memories(
+    store: IVectorStore,
+    embedding_metadata: EmbeddingMetadata,
+    filters: Optional[ExportFilter] = None,
+    schema_version: Literal["1.0"] = "1.0",
+    limit: int = MAX_EXPORT_ENTRIES,
+) -> PortableBundle:
+    """Export memories from a vector store as a portable bundle.
+
+    Args:
+        store: Vector store to export from.
+        embedding_metadata: Metadata describing the embedding
+            model used by this store.
+        filters: Optional tag / date filters.
+        schema_version: Bundle schema version.
+        limit: Max entries to fetch. Defaults to
+            ``MAX_EXPORT_ENTRIES``. A warning is logged when
+            the result set is truncated.
+
+    Returns:
+        A ``PortableBundle`` ready for serialization.
+
+    Note:
+        For stores with >50k entries, consider exporting in
+        date-range batches or using
+        ``export_memories_streaming()`` to limit peak memory.
+    """
+    t0 = time.monotonic()
+
+    tag_filter = None
+    if filters and filters.tags:
+        tag_filter = {"tags": filters.tags}
+
+    entries = await store.list(limit=limit, filters=tag_filter)
+
+    if len(entries) >= limit:
+        logger.warning(
+            "Export hit limit of %d entries — result may be "
+            "truncated. Pass a higher limit or export in "
+            "batches.",
+            limit,
+        )
+
+    if filters:
+        entries = _apply_date_filter(entries, filters)
+
+    bundle = create_portable_bundle(
+        entries=entries,
+        embedding_metadata=embedding_metadata,
+        schema_version=schema_version,
+    )
+
+    elapsed = (time.monotonic() - t0) * 1000
+    logger.info(
+        "Exported %d memories in %.0fms (filters=%s)",
+        len(entries),
+        elapsed,
+        _describe_filter(filters),
+    )
+
+    return bundle
+
+
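As a point of reference, here is an illustrative caller sketch (not part of the published module) for the date-range batching that the docstring recommends. `store`, `meta`, and `save_bundle` are assumed to exist on the caller's side; only `ExportFilter` and `export_memories` come from this file.

    from datetime import datetime, timezone

    async def export_month(store, meta, year: int, month: int):
        # One calendar month per bundle keeps each export well under
        # MAX_EXPORT_ENTRIES for most stores.
        start = datetime(year, month, 1, tzinfo=timezone.utc)
        end = (
            datetime(year + 1, 1, 1, tzinfo=timezone.utc)
            if month == 12
            else datetime(year, month + 1, 1, tzinfo=timezone.utc)
        )
        filters = ExportFilter(
            tags=["incident", "postmortem"],  # any-match, per ExportFilter docs
            date_from=start,
            date_to=end,  # date_to is inclusive; boundary overlaps by one instant
        )
        bundle = await export_memories(store, meta, filters=filters)
        save_bundle(bundle, f"memories-{year}-{month:02d}.json")  # hypothetical writer
        return bundle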
+async def export_memories_streaming(
+    store: IVectorStore,
+    embedding_metadata: EmbeddingMetadata,
+    filters: Optional[ExportFilter] = None,
+    batch_size: int = 1000,
+) -> AsyncIterator[MemoryEntry]:
+    """Stream-export memories one at a time.
+
+    Fetches in batches of ``batch_size`` from the store and
+    yields individual entries. This avoids loading the entire
+    result set into memory, making it suitable for large stores.
+
+    Usage::
+
+        async for entry in export_memories_streaming(store, meta):
+            write_jsonl_line(entry)
+
+    Date/tag filtering is applied per-batch.
+    """
+    offset = 0
+    tag_filter = None
+    if filters and filters.tags:
+        tag_filter = {"tags": filters.tags}
+
+    total_yielded = 0
+    while True:
+        batch = await store.list(
+            limit=batch_size,
+            offset=offset,
+            filters=tag_filter,
+        )
+        if not batch:
+            break
+
+        if filters:
+            batch = _apply_date_filter(batch, filters)
+
+        for entry in batch:
+            yield entry
+            total_yielded += 1
+
+        offset += batch_size
+
+    logger.info(
+        "Streaming export yielded %d entries", total_yielded,
+    )
+
+
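A sketch of how the generator above might be consumed to write JSON Lines without holding the full result set in memory. Only fields visible in this diff (`id`, `created_at`) are serialized; real `MemoryEntry` objects carry more data, and `store` / `meta` are assumed to be configured elsewhere.

    import asyncio
    import json

    async def dump_jsonl(store, meta, path: str) -> int:
        count = 0
        with open(path, "w", encoding="utf-8") as fh:
            async for entry in export_memories_streaming(store, meta):
                record = {
                    "id": entry.id,
                    "created_at": entry.created_at.isoformat(),
                }
                fh.write(json.dumps(record) + "\n")
                count += 1
        return count

    # count = asyncio.run(dump_jsonl(store, meta, "memories.jsonl"))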
+# -------------------------------------------------------------- #
+# Import                                                          #
+# -------------------------------------------------------------- #
+
+
+"""Type alias for an import progress callback.
+
+Called after each entry is processed with (current, total).
+"""
+ProgressCallback = Callable[[int, int], None]
+
+
+async def import_memories(
+    bundle: PortableBundle,
+    store: IVectorStore,
+    target_metadata: EmbeddingMetadata,
+    conflict_resolution: ConflictResolution = (
+        ConflictResolution.SKIP
+    ),
+    embedding_strategy: ReembeddingStrategy = (
+        ReembeddingStrategy.AUTO
+    ),
+    dry_run: bool = False,
+    on_progress: Optional[ProgressCallback] = None,
+) -> ImportSummary:
+    """Import a portable bundle into a vector store.
+
+    Args:
+        bundle: The bundle to import.
+        store: Target vector store.
+        target_metadata: Embedding metadata of the target system.
+        conflict_resolution: How to handle ID collisions.
+        embedding_strategy: How to handle embedding mismatches.
+        dry_run: If True, compute the summary without writing
+            anything. Useful for previewing changes.
+        on_progress: Optional callback invoked after each entry
+            with ``(current_index, total_count)``. Useful for
+            progress bars on large imports.
+
+    Returns:
+        ``ImportSummary`` with counts and error details.
+    """
+    t0 = time.monotonic()
+    total = len(bundle.entries)
+    summary = ImportSummary(
+        total=total,
+        dry_run=dry_run,
+    )
+
+    import_result = portability_import_bundle(
+        bundle=bundle,
+        target_metadata=target_metadata,
+        strategy=embedding_strategy,
+    )
+    summary.needs_reembedding = import_result.needs_embedding
+
+    for idx, entry in enumerate(import_result.entries):
+        try:
+            existing = await store.get(entry.id)
+
+            if existing is None:
+                if dry_run:
+                    summary.imported += 1
+                else:
+                    result = await store.store(entry)
+                    if result.success:
+                        summary.imported += 1
+                    else:
+                        summary.errors += 1
+                        summary.error_details.append(
+                            _safe_error(
+                                entry.id, result.error,
+                            ),
+                        )
+            else:
+                if dry_run:
+                    _resolve_conflict_dry(
+                        entry, existing,
+                        conflict_resolution, summary,
+                    )
+                else:
+                    await _resolve_conflict(
+                        entry, existing, store,
+                        conflict_resolution, summary,
+                    )
+        except Exception as exc:
+            summary.errors += 1
+            summary.error_details.append(
+                _safe_error(entry.id, str(exc)),
+            )
+
+        if on_progress is not None:
+            on_progress(idx + 1, total)
+
+    summary.duration_ms = (time.monotonic() - t0) * 1000
+
+    mode = "dry-run" if dry_run else "live"
+    logger.info(
+        "Import (%s): %d total, %d imported, %d skipped, "
+        "%d overwritten, %d errors in %.0fms",
+        mode,
+        summary.total,
+        summary.imported,
+        summary.skipped,
+        summary.overwritten,
+        summary.errors,
+        summary.duration_ms,
+    )
+
+    return summary
+
+
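To illustrate the dry-run and progress hooks described in the docstring, a hypothetical two-pass caller: preview first, then commit with the same settings. `bundle`, `store`, and `target_meta` are assumed to be built by the caller; everything else is the API shown above.

    def print_progress(current: int, total: int) -> None:
        # Matches the ProgressCallback alias: (current_index, total_count).
        if current % 100 == 0 or current == total:
            print(f"processed {current}/{total}")

    async def preview_then_import(bundle, store, target_meta):
        preview = await import_memories(
            bundle, store, target_meta,
            conflict_resolution=ConflictResolution.MERGE,
            dry_run=True,  # nothing is written; counts only
        )
        if preview.errors:
            raise RuntimeError(f"aborting import: {preview.error_details[:5]}")
        return await import_memories(
            bundle, store, target_meta,
            conflict_resolution=ConflictResolution.MERGE,
            on_progress=print_progress,
        )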
+async def _resolve_conflict(
+    incoming: MemoryEntry,
+    existing: MemoryEntry,
+    store: IVectorStore,
+    resolution: ConflictResolution,
+    summary: ImportSummary,
+) -> None:
+    """Apply conflict resolution for an ID collision."""
+    if resolution == ConflictResolution.SKIP:
+        summary.skipped += 1
+        return
+
+    if resolution == ConflictResolution.OVERWRITE:
+        await _upsert(incoming, store, summary)
+        return
+
+    if resolution == ConflictResolution.MERGE:
+        incoming_t = _ensure_tz_aware(incoming.updated_at)
+        existing_t = _ensure_tz_aware(existing.updated_at)
+        if incoming_t > existing_t:
+            await _upsert(incoming, store, summary)
+        else:
+            summary.skipped += 1
+
+
+def _resolve_conflict_dry(
+    incoming: MemoryEntry,
+    existing: MemoryEntry,
+    resolution: ConflictResolution,
+    summary: ImportSummary,
+) -> None:
+    """Dry-run conflict resolution (no writes)."""
+    if resolution == ConflictResolution.SKIP:
+        summary.skipped += 1
+        return
+
+    if resolution == ConflictResolution.OVERWRITE:
+        summary.overwritten += 1
+        return
+
+    if resolution == ConflictResolution.MERGE:
+        incoming_t = _ensure_tz_aware(incoming.updated_at)
+        existing_t = _ensure_tz_aware(existing.updated_at)
+        if incoming_t > existing_t:
+            summary.overwritten += 1
+        else:
+            summary.skipped += 1
+
+
+async def _upsert(
+    entry: MemoryEntry,
+    store: IVectorStore,
+    summary: ImportSummary,
+) -> None:
+    """Insert-or-replace via the store's public upsert API."""
+    result = await store.upsert(entry)
+    if result.success:
+        summary.overwritten += 1
+    else:
+        summary.errors += 1
+        summary.error_details.append(
+            _safe_error(entry.id, result.error),
+        )
+
+
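The MERGE branch compares `updated_at` values only after promoting naive datetimes to UTC. A small worked check of that convention, using the private `_ensure_tz_aware` helper defined later in this file:

    from datetime import datetime, timezone

    naive = datetime(2024, 6, 1, 12, 0, 0)                       # naive, treated as UTC
    aware = datetime(2024, 6, 1, 11, 0, 0, tzinfo=timezone.utc)  # explicit UTC

    # Promoting the naive value makes the comparison well-defined:
    # 12:00 UTC > 11:00 UTC, so MERGE would keep the naive (newer) entry.
    assert _ensure_tz_aware(naive) > _ensure_tz_aware(aware)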
+# -------------------------------------------------------------- #
+# Validation helpers (for MCP / HTTP layers)                      #
+# -------------------------------------------------------------- #
+
+
+def validate_conflict_resolution(value: str) -> str | None:
+    """Return an error message if *value* is not valid."""
+    if value not in VALID_CONFLICT_RESOLUTIONS:
+        return (
+            f"Invalid conflict_resolution '{value}'. "
+            f"Must be one of: "
+            f"{sorted(VALID_CONFLICT_RESOLUTIONS)}"
+        )
+    return None
+
+
+def validate_embedding_strategy(value: str) -> str | None:
+    """Return an error message if *value* is not valid."""
+    if value not in VALID_EMBEDDING_STRATEGIES:
+        return (
+            f"Invalid embedding_strategy '{value}'. "
+            f"Must be one of: "
+            f"{sorted(VALID_EMBEDDING_STRATEGIES)}"
+        )
+    return None
+
+
+def parse_iso_datetime(
+    value: str | None,
+    field_name: str,
+) -> tuple[datetime | None, str | None]:
+    """Parse an ISO 8601 string.
+
+    Returns:
+        ``(datetime, None)`` on success,
+        ``(None, error_msg)`` on failure.
+    """
+    if not value:
+        return None, None
+    try:
+        return datetime.fromisoformat(value), None
+    except (ValueError, TypeError) as exc:
+        return None, (
+            f"Invalid {field_name}: '{value}' "
+            f"is not a valid ISO 8601 datetime ({exc})"
+        )
+
+
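A sketch of how an HTTP or MCP handler might run these validators before calling `import_memories`. The `params` dict shape is an assumption made for illustration; the wheel's own server and MCP layers define their real request models separately.

    def validate_import_params(params: dict) -> list[str]:
        # Collect every validation error instead of failing on the first one.
        errors: list[str] = []

        if msg := validate_conflict_resolution(params.get("conflict_resolution", "skip")):
            errors.append(msg)
        if msg := validate_embedding_strategy(params.get("embedding_strategy", "auto")):
            errors.append(msg)

        _, msg = parse_iso_datetime(params.get("date_from"), "date_from")
        if msg:
            errors.append(msg)
        _, msg = parse_iso_datetime(params.get("date_to"), "date_to")
        if msg:
            errors.append(msg)

        return errors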
+# -------------------------------------------------------------- #
+# Internal helpers                                                #
+# -------------------------------------------------------------- #
+
+
+def _apply_date_filter(
+    entries: list[MemoryEntry],
+    filters: ExportFilter,
+) -> list[MemoryEntry]:
+    """Filter entries by date range."""
+    result = entries
+
+    if filters.date_from is not None:
+        date_from = _ensure_tz_aware(filters.date_from)
+        result = [
+            e for e in result
+            if _ensure_tz_aware(e.created_at) >= date_from
+        ]
+
+    if filters.date_to is not None:
+        date_to = _ensure_tz_aware(filters.date_to)
+        result = [
+            e for e in result
+            if _ensure_tz_aware(e.created_at) <= date_to
+        ]
+
+    return result
+
+
+def _ensure_tz_aware(dt: datetime) -> datetime:
+    """Treat naive datetimes as UTC (project convention)."""
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+
+
+def _safe_error(entry_id: str, detail: str | None) -> str:
+    """Sanitize error detail for user-facing output."""
+    if not detail:
+        return f"{entry_id}: unknown error"
+    safe = detail.split("\n")[0][:200]
+    return f"{entry_id}: {safe}"
+
+
+def _describe_filter(
+    filters: Optional[ExportFilter],
+) -> str:
+    """Human-readable description of active filters."""
+    if not filters:
+        return "none"
+    parts = []
+    if filters.tags:
+        parts.append(f"tags={filters.tags}")
+    if filters.date_from:
+        parts.append(f"from={filters.date_from.isoformat()}")
+    if filters.date_to:
+        parts.append(f"to={filters.date_to.isoformat()}")
+    return ", ".join(parts) if parts else "none"