AbstractMemory 0.0.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,544 @@
1
+ """
2
+ LanceDB Storage Backend with SQL + Vector Search via AbstractCore embeddings.
3
+ Provides powerful querying capabilities for AI memory.
4
+ """
5
+
6
+ import uuid
7
+ from typing import Optional, Dict, List, Any
8
+ from datetime import datetime
9
+ import logging
10
+
11
+ from ..core.interfaces import IStorage
12
+ from ..embeddings import create_embedding_adapter
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ try:
17
+ import lancedb
18
+ LANCEDB_AVAILABLE = True
19
+ except ImportError:
20
+ LANCEDB_AVAILABLE = False
21
+ logger.warning("LanceDB not available. Install with: pip install lancedb")
22
+
23
+
24
class LanceDBStorage(IStorage):
    """
    LanceDB storage with vector embeddings from AbstractCore.

    Tables:
        - interactions: Verbatim user-agent interactions with embeddings
        - experiential_notes: AI reflections and insights with embeddings
        - links: Bidirectional relationships between interactions and notes
        - memory_components: Versioned snapshots of memory components
        - embedding_metadata: Embedding-model info used for consistency checks
    """
34
+
35
+ def __init__(self, uri: str, embedding_provider: Optional[Any] = None):
36
+ """
37
+ Initialize LanceDB storage.
38
+
39
+ Args:
40
+ uri: LanceDB connection URI (e.g., "./lance.db")
41
+ embedding_provider: AbstractCore instance for generating embeddings
42
+
43
+ Raises:
44
+ ImportError: If LanceDB is not installed
45
+ ValueError: If no embedding provider is provided
46
+ """
47
+ if not LANCEDB_AVAILABLE:
48
+ raise ImportError("LanceDB is required but not installed. Install with: pip install lancedb")
49
+
50
+ if embedding_provider is None:
51
+ raise ValueError(
52
+ "LanceDB storage requires a real embedding provider for semantic search. "
53
+ "Please provide an AbstractCore EmbeddingManager or other embedding provider."
54
+ )
55
+
56
+ self.uri = uri
57
+ self.embedding_adapter = create_embedding_adapter(embedding_provider)
58
+ self.db = lancedb.connect(uri)
59
+
60
+ # Initialize tables and check embedding consistency
61
+ self._init_tables()
62
+ self._check_embedding_consistency()
63
+
64
+ def _init_tables(self):
65
+ """Initialize LanceDB tables with schemas"""
66
+
67
+ # Interactions table schema
68
+ interactions_schema = [
69
+ {"name": "id", "type": "string"},
70
+ {"name": "user_id", "type": "string"},
71
+ {"name": "timestamp", "type": "timestamp"},
72
+ {"name": "user_input", "type": "string"},
73
+ {"name": "agent_response", "type": "string"},
74
+ {"name": "topic", "type": "string"},
75
+ {"name": "metadata", "type": "string"}, # JSON string
76
+ {"name": "embedding", "type": "vector"} # Vector embedding
77
+ ]
78
+
79
+ # Experiential notes table schema
80
+ notes_schema = [
81
+ {"name": "id", "type": "string"},
82
+ {"name": "timestamp", "type": "timestamp"},
83
+ {"name": "reflection", "type": "string"},
84
+ {"name": "interaction_id", "type": "string"},
85
+ {"name": "note_type", "type": "string"},
86
+ {"name": "metadata", "type": "string"}, # JSON string
87
+ {"name": "embedding", "type": "vector"} # Vector embedding
88
+ ]
89
+
90
+ # Links table schema
91
+ links_schema = [
92
+ {"name": "interaction_id", "type": "string"},
93
+ {"name": "note_id", "type": "string"},
94
+ {"name": "created", "type": "timestamp"},
95
+ {"name": "link_type", "type": "string"}
96
+ ]
97
+
98
+ # Memory components table schema
99
+ components_schema = [
100
+ {"name": "component_name", "type": "string"},
101
+ {"name": "timestamp", "type": "timestamp"},
102
+ {"name": "data", "type": "string"}, # JSON string
103
+ {"name": "version", "type": "int64"}
104
+ ]
105
+
106
+ # Create tables if they don't exist
107
+ import pandas as pd
108
+
109
+ try:
110
+ self.interactions_table = self.db.open_table("interactions")
111
+ except (FileNotFoundError, ValueError):
112
+ # Create table with proper schema and sample data
113
+ import pyarrow as pa
114
+
115
+ # Get actual embedding dimension from adapter
116
+ test_embedding = self.embedding_adapter.generate_embedding("test")
117
+ embedding_dim = len(test_embedding)
118
+
119
+ # Create proper schema with fixed-size list for embeddings
120
+ schema = pa.schema([
121
+ pa.field("id", pa.string()),
122
+ pa.field("user_id", pa.string()),
123
+ pa.field("timestamp", pa.timestamp('us')),
124
+ pa.field("user_input", pa.string()),
125
+ pa.field("agent_response", pa.string()),
126
+ pa.field("topic", pa.string()),
127
+ pa.field("metadata", pa.string()),
128
+ pa.field("embedding", pa.list_(pa.float32(), embedding_dim))
129
+ ])
130
+
131
+ # Create empty table with proper schema
132
+ self.interactions_table = self.db.create_table("interactions", schema=schema)
133
+
134
+ try:
135
+ self.notes_table = self.db.open_table("experiential_notes")
136
+ except (FileNotFoundError, ValueError):
137
+ # Create notes table with proper schema
138
+ notes_schema = pa.schema([
139
+ pa.field("id", pa.string()),
140
+ pa.field("timestamp", pa.timestamp('us')),
141
+ pa.field("reflection", pa.string()),
142
+ pa.field("interaction_id", pa.string()),
143
+ pa.field("note_type", pa.string()),
144
+ pa.field("metadata", pa.string()),
145
+ pa.field("embedding", pa.list_(pa.float32(), embedding_dim))
146
+ ])
147
+ self.notes_table = self.db.create_table("experiential_notes", schema=notes_schema)
148
+
149
+ try:
150
+ self.links_table = self.db.open_table("links")
151
+ except (FileNotFoundError, ValueError):
152
+ sample_data = pd.DataFrame([{
153
+ "interaction_id": "sample_int",
154
+ "note_id": "sample_note",
155
+ "created": datetime.now(),
156
+ "link_type": "bidirectional"
157
+ }])
158
+ self.links_table = self.db.create_table("links", sample_data)
159
+ self.links_table.delete("interaction_id = 'sample_int'")
160
+
161
+ try:
162
+ self.components_table = self.db.open_table("memory_components")
163
+ except (FileNotFoundError, ValueError):
164
+ sample_data = pd.DataFrame([{
165
+ "component_name": "sample",
166
+ "timestamp": datetime.now(),
167
+ "data": "{}",
168
+ "version": 1
169
+ }])
170
+ self.components_table = self.db.create_table("memory_components", sample_data)
171
+ self.components_table.delete("component_name = 'sample'")
172
+
173
+ # Embedding metadata table for consistency tracking
174
+ try:
175
+ self.embedding_metadata_table = self.db.open_table("embedding_metadata")
176
+ except (FileNotFoundError, ValueError):
177
+ sample_data = pd.DataFrame([{
178
+ "key": "sample",
179
+ "value": "{}",
180
+ "created_at": datetime.now()
181
+ }])
182
+ self.embedding_metadata_table = self.db.create_table("embedding_metadata", sample_data)
183
+ self.embedding_metadata_table.delete("key = 'sample'")
184
+
185
+ def _check_embedding_consistency(self) -> None:
186
+ """Check for embedding model consistency with previously stored data."""
187
+ try:
188
+ # Get current embedding model info
189
+ current_info = self.embedding_adapter.get_embedding_info()
190
+
191
+ # Try to retrieve previously stored embedding info
192
+ stored_info_df = self.embedding_metadata_table.search().where("key = 'embedding_model_info'").to_pandas()
193
+
194
+ if len(stored_info_df) > 0:
195
+ # We have previously stored embedding info
196
+ import json
197
+ stored_info = json.loads(stored_info_df.iloc[0]['value'])
198
+
199
+ # Check consistency and warn if needed
200
+ self.embedding_adapter.warn_about_consistency(stored_info)
201
+ else:
202
+ # First time - store the current embedding info
203
+ self._store_embedding_info(current_info)
204
+ logger.info(f"Stored embedding model info for consistency tracking: {current_info}")
205
+
206
+ except Exception as e:
207
+ logger.warning(f"Failed to check embedding consistency: {e}")
208
+
209
+ def _store_embedding_info(self, embedding_info: dict) -> None:
210
+ """Store embedding model information for consistency tracking."""
211
+ try:
212
+ import json
213
+ import pandas as pd
214
+
215
+ # Delete any existing embedding_model_info records
216
+ try:
217
+ self.embedding_metadata_table.delete("key = 'embedding_model_info'")
218
+ except:
219
+ pass # Table might be empty
220
+
221
+ # Store new info
222
+ data = pd.DataFrame([{
223
+ "key": "embedding_model_info",
224
+ "value": json.dumps(embedding_info),
225
+ "created_at": datetime.now()
226
+ }])
227
+
228
+ self.embedding_metadata_table.add(data)
229
+ logger.debug(f"Stored embedding model info: {embedding_info}")
230
+
231
+ except Exception as e:
232
+ logger.error(f"Failed to store embedding info: {e}")
233
+
234
+ def _generate_embedding(self, text: str) -> List[float]:
235
+ """Generate embedding using embedding adapter"""
236
+ return self.embedding_adapter.generate_embedding(text)
237
+
238
+ def save_interaction(self, user_id: str, timestamp: datetime,
239
+ user_input: str, agent_response: str,
240
+ topic: str, metadata: Optional[Dict] = None) -> str:
241
+ """Save verbatim interaction with vector embedding"""
242
+
243
+ interaction_id = f"int_{uuid.uuid4().hex[:8]}"
244
+
245
+ # Generate embedding for the full interaction
246
+ interaction_text = f"{user_input} {agent_response}"
247
+ embedding = self._generate_embedding(interaction_text)
248
+
249
+ # Prepare data
250
+ import json
251
+ import pandas as pd
252
+
253
+ data = {
254
+ "id": interaction_id,
255
+ "user_id": user_id,
256
+ "timestamp": timestamp,
257
+ "user_input": user_input,
258
+ "agent_response": agent_response,
259
+ "topic": topic,
260
+ "metadata": json.dumps(metadata or {}),
261
+ "embedding": [float(x) for x in embedding] # Ensure float32 compatibility
262
+ }
263
+
264
+ # Insert into table
265
+ df = pd.DataFrame([data])
266
+
267
+ try:
268
+ self.interactions_table.add(df)
269
+ logger.debug(f"Saved interaction {interaction_id} to LanceDB")
270
+ except Exception as e:
271
+ logger.error(f"Failed to save interaction to LanceDB: {e}")
272
+ raise
273
+
274
+ return interaction_id
275
+
276
+ def save_experiential_note(self, timestamp: datetime, reflection: str,
277
+ interaction_id: str, note_type: str = "reflection",
278
+ metadata: Optional[Dict] = None) -> str:
279
+ """Save AI experiential note with vector embedding"""
280
+
281
+ note_id = f"note_{uuid.uuid4().hex[:8]}"
282
+
283
+ # Generate embedding for the reflection
284
+ embedding = self._generate_embedding(reflection)
285
+
286
+ # Prepare data
287
+ import json
288
+ import pandas as pd
289
+
290
+ data = {
291
+ "id": note_id,
292
+ "timestamp": timestamp,
293
+ "reflection": reflection,
294
+ "interaction_id": interaction_id,
295
+ "note_type": note_type,
296
+ "metadata": json.dumps(metadata or {}),
297
+ "embedding": [float(x) for x in embedding] # Ensure float32 compatibility
298
+ }
299
+
300
+ # Insert into table
301
+ df = pd.DataFrame([data])
302
+
303
+ try:
304
+ self.notes_table.add(df)
305
+ logger.debug(f"Saved experiential note {note_id} to LanceDB")
306
+ except Exception as e:
307
+ logger.error(f"Failed to save experiential note to LanceDB: {e}")
308
+ raise
309
+
310
+ return note_id
311
+
312
+ def link_interaction_to_note(self, interaction_id: str, note_id: str) -> None:
313
+ """Create bidirectional link between interaction and note"""
314
+
315
+ import pandas as pd
316
+
317
+ link_data = {
318
+ "interaction_id": interaction_id,
319
+ "note_id": note_id,
320
+ "created": datetime.now(),
321
+ "link_type": "bidirectional"
322
+ }
323
+
324
+ df = pd.DataFrame([link_data])
325
+
326
+ try:
327
+ self.links_table.add(df)
328
+ logger.debug(f"Created link between {interaction_id} and {note_id}")
329
+ except Exception as e:
330
+ logger.error(f"Failed to create link in LanceDB: {e}")
331
+
332
+ def search_interactions(self, query: str, user_id: Optional[str] = None,
333
+ start_date: Optional[datetime] = None,
334
+ end_date: Optional[datetime] = None) -> List[Dict]:
335
+ """
336
+ Search interactions using SQL filters and vector similarity.
337
+
338
+ Combines:
339
+ 1. SQL filters for user_id, date range
340
+ 2. Text search in user_input, agent_response, topic
341
+ 3. Vector similarity search if embedding provider available
342
+ """
343
+
344
+ try:
345
+ # Start with base query
346
+ query_parts = []
347
+
348
+ # Filter by user_id
349
+ if user_id:
350
+ query_parts.append(f"user_id = '{user_id}'")
351
+
352
+ # Filter by date range
353
+ if start_date:
354
+ query_parts.append(f"timestamp >= '{start_date.isoformat()}'")
355
+ if end_date:
356
+ query_parts.append(f"timestamp <= '{end_date.isoformat()}'")
357
+
358
+ # Build WHERE clause
359
+ where_clause = " AND ".join(query_parts) if query_parts else None
360
+
361
+ # Try vector search first if embedding adapter available
362
+ if self.embedding_adapter:
363
+ try:
364
+ query_embedding = self._generate_embedding(query)
365
+ if query_embedding:
366
+ # Vector similarity search
367
+ results = self.interactions_table.search(query_embedding, vector_column_name="embedding").limit(50)
368
+
369
+ # Apply additional filters
370
+ if where_clause:
371
+ results = results.where(where_clause)
372
+
373
+ df = results.to_pandas()
374
+
375
+ return self._convert_df_to_dicts(df)
376
+ except Exception as e:
377
+ logger.warning(f"Vector search failed, falling back to text search: {e}")
378
+
379
+ # Fallback to text search
380
+ search_conditions = []
381
+ query_lower = query.lower()
382
+
383
+ # Search in multiple text fields
384
+ search_conditions.extend([
385
+ f"LOWER(user_input) LIKE '%{query_lower}%'",
386
+ f"LOWER(agent_response) LIKE '%{query_lower}%'",
387
+ f"LOWER(topic) LIKE '%{query_lower}%'"
388
+ ])
389
+
390
+ text_search = "(" + " OR ".join(search_conditions) + ")"
391
+
392
+ # Combine with other filters
393
+ if where_clause:
394
+ final_where = f"({where_clause}) AND {text_search}"
395
+ else:
396
+ final_where = text_search
397
+
398
+ # Execute search
399
+ df = self.interactions_table.search().where(final_where).limit(100).to_pandas()
400
+
401
+ return self._convert_df_to_dicts(df)
402
+
403
+ except Exception as e:
404
+ logger.error(f"Search failed in LanceDB: {e}")
405
+ return []
406
+
407
+ def _convert_df_to_dicts(self, df) -> List[Dict]:
408
+ """Convert pandas DataFrame to list of dictionaries"""
409
+ import json
410
+
411
+ results = []
412
+ for _, row in df.iterrows():
413
+ try:
414
+ result = {
415
+ "id": row["id"],
416
+ "user_id": row["user_id"],
417
+ "timestamp": row["timestamp"].isoformat() if hasattr(row["timestamp"], 'isoformat') else str(row["timestamp"]),
418
+ "user_input": row["user_input"],
419
+ "agent_response": row["agent_response"],
420
+ "topic": row["topic"],
421
+ "metadata": json.loads(row["metadata"]) if row["metadata"] else {}
422
+ }
423
+ results.append(result)
424
+ except Exception as e:
425
+ logger.warning(f"Failed to convert row to dict: {e}")
426
+ continue
427
+
428
+ return results
429
+
430
+ # IStorage interface implementation
431
+ def save(self, key: str, value: Any) -> None:
432
+ """Generic save for compatibility"""
433
+ if "/" in key:
434
+ component_name = key.split("/")[-1]
435
+ self.save_memory_component(component_name, value)
436
+
437
+ def load(self, key: str) -> Any:
438
+ """Generic load for compatibility"""
439
+ if "/" in key:
440
+ component_name = key.split("/")[-1]
441
+ return self.load_memory_component(component_name)
442
+ return None
443
+
444
+ def exists(self, key: str) -> bool:
445
+ """Check if key exists"""
446
+ if "/" in key:
447
+ component_name = key.split("/")[-1]
448
+ try:
449
+ df = self.components_table.search().where(f"component_name = '{component_name}'").limit(1).to_pandas()
450
+ return len(df) > 0
451
+ except:
452
+ return False
453
+ return False
454
+
455
+ def save_memory_component(self, component_name: str, component_data: Any) -> None:
456
+ """Save memory component to LanceDB"""
457
+ import json
458
+ import pandas as pd
459
+
460
+ # Convert component to JSON
461
+ if hasattr(component_data, '__dict__'):
462
+ data = component_data.__dict__
463
+ else:
464
+ data = component_data
465
+
466
+ # Get next version number
467
+ try:
468
+ existing = self.components_table.search().where(f"component_name = '{component_name}'").to_pandas()
469
+ version = existing["version"].max() + 1 if len(existing) > 0 else 1
470
+ except:
471
+ version = 1
472
+
473
+ component_record = {
474
+ "component_name": component_name,
475
+ "timestamp": datetime.now(),
476
+ "data": json.dumps(data, default=str),
477
+ "version": version
478
+ }
479
+
480
+ df = pd.DataFrame([component_record])
481
+
482
+ try:
483
+ self.components_table.add(df)
484
+ logger.debug(f"Saved {component_name} component version {version} to LanceDB")
485
+ except Exception as e:
486
+ logger.error(f"Failed to save {component_name} component: {e}")
487
+
488
+ def load_memory_component(self, component_name: str) -> Optional[Any]:
489
+ """Load latest memory component from LanceDB"""
490
+ try:
491
+ import json
492
+
493
+ # Get latest version
494
+ df = self.components_table.search().where(f"component_name = '{component_name}'").to_pandas()
495
+
496
+ if len(df) == 0:
497
+ return None
498
+
499
+ # Get the latest version
500
+ latest = df.loc[df["version"].idxmax()]
501
+
502
+ return json.loads(latest["data"])
503
+
504
+ except Exception as e:
505
+ logger.error(f"Failed to load {component_name} component: {e}")
506
+ return None
507
+
508
+ def get_stats(self) -> Dict[str, Any]:
509
+ """Get storage statistics"""
510
+ try:
511
+ interactions_count = len(self.interactions_table.search().limit(10000).to_pandas())
512
+ notes_count = len(self.notes_table.search().limit(10000).to_pandas())
513
+ links_count = len(self.links_table.search().limit(10000).to_pandas())
514
+ components_count = len(self.components_table.search().limit(1000).to_pandas())
515
+
516
+ stats = {
517
+ "total_interactions": interactions_count,
518
+ "total_notes": notes_count,
519
+ "total_links": links_count,
520
+ "total_components": components_count,
521
+ "uri": self.uri,
522
+ "embedding_provider_available": self.embedding_adapter is not None,
523
+ "embedding_info": self.embedding_adapter.get_embedding_info() if self.embedding_adapter else None
524
+ }
525
+
526
+ # Add stored embedding model info for comparison
527
+ try:
528
+ stored_info_df = self.embedding_metadata_table.search().where("key = 'embedding_model_info'").to_pandas()
529
+ if len(stored_info_df) > 0:
530
+ import json
531
+ stats["stored_embedding_info"] = json.loads(stored_info_df.iloc[0]['value'])
532
+ stats["embedding_consistency"] = self.embedding_adapter.check_consistency_with(stats["stored_embedding_info"]) if self.embedding_adapter else False
533
+ except Exception as e:
534
+ logger.debug(f"Could not retrieve stored embedding info: {e}")
535
+
536
+ return stats
537
+ except Exception as e:
538
+ logger.error(f"Failed to get stats: {e}")
539
+ return {
540
+ "error": str(e),
541
+ "uri": self.uri,
542
+ "embedding_provider_available": self.embedding_adapter is not None,
543
+ "embedding_info": self.embedding_adapter.get_embedding_info() if self.embedding_adapter else None
544
+ }