causaliq-knowledge 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,632 @@
+ """
+ TokenCache: SQLite-backed cache with shared token dictionary.
+
+ Provides efficient storage for cache entries with:
+ - Fast indexed key lookup via SQLite
+ - In-memory mode via :memory:
+ - Concurrency support via SQLite locking
+ - Shared token dictionary for cross-entry compression
+
+ Note: This module is designed for future migration to causaliq-core.
+ """
+
+ from __future__ import annotations
+
+ import sqlite3
+ from contextlib import contextmanager
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Iterator
+
+ if TYPE_CHECKING:  # pragma: no cover
+     from causaliq_knowledge.cache.encoders.base import EntryEncoder
+
+
+ class TokenCache:
+     """SQLite-backed cache with shared token dictionary.
+
+     Attributes:
+         db_path: Path to SQLite database file, or ":memory:" for in-memory.
+         conn: SQLite connection (None until open() called or context entered).
+
+     Example:
+         >>> with TokenCache(":memory:") as cache:
+         ...     cache.put("abc123", "test", b"hello")
+         ...     data = cache.get("abc123", "test")
+     """
+
+     # SQL statements for schema creation
+     _SCHEMA_SQL = """
+     -- Token dictionary (grows dynamically, shared across encoders)
+     CREATE TABLE IF NOT EXISTS tokens (
+         id INTEGER PRIMARY KEY AUTOINCREMENT,
+         token TEXT UNIQUE NOT NULL,
+         frequency INTEGER DEFAULT 1
+     );
+
+     -- Generic cache entries
+     CREATE TABLE IF NOT EXISTS cache_entries (
+         hash TEXT NOT NULL,
+         entry_type TEXT NOT NULL,
+         data BLOB NOT NULL,
+         created_at TEXT NOT NULL,
+         metadata BLOB,
+         PRIMARY KEY (hash, entry_type)
+     );
+
+     -- Indexes for common queries
+     CREATE INDEX IF NOT EXISTS idx_entry_type
+         ON cache_entries(entry_type);
+     CREATE INDEX IF NOT EXISTS idx_created_at
+         ON cache_entries(created_at);
+     """
+
+     def __init__(self, db_path: str | Path) -> None:
+         """Initialise TokenCache.
+
+         Args:
+             db_path: Path to SQLite database file. Use ":memory:" for
+                 in-memory database (fast, non-persistent).
+         """
+         self.db_path = str(db_path)
+         self._conn: sqlite3.Connection | None = None
+         # In-memory token dictionary for fast lookup
+         self._token_to_id: dict[str, int] = {}
+         self._id_to_token: dict[int, str] = {}
+         # Registered encoders for auto-encoding (entry_type -> encoder)
+         self._encoders: dict[str, EntryEncoder] = {}
+
+     @property
+     def conn(self) -> sqlite3.Connection:
+         """Get the database connection, raising if not connected."""
+         if self._conn is None:
+             raise RuntimeError(
+                 "TokenCache not connected. Use 'with cache:' or call open()."
+             )
+         return self._conn
+
+     @property
+     def is_open(self) -> bool:
+         """Check if the cache connection is open."""
+         return self._conn is not None
+
+     @property
+     def is_memory(self) -> bool:
+         """Check if this is an in-memory database."""
+         return self.db_path == ":memory:"
+
+     def open(self) -> TokenCache:
+         """Open the database connection and initialise schema.
+
+         Returns:
+             self for method chaining.
+
+         Raises:
+             RuntimeError: If already connected.
+         """
+         if self._conn is not None:
+             raise RuntimeError("TokenCache already connected.")
+
+         self._conn = sqlite3.connect(
+             self.db_path,
+             check_same_thread=False,  # Allow multi-threaded access
+         )
+         # Enable foreign keys and WAL mode for better concurrency
+         self._conn.execute("PRAGMA foreign_keys = ON")
+         if not self.is_memory:
+             self._conn.execute("PRAGMA journal_mode = WAL")
+
+         self._init_schema()
+         return self
+
+     def close(self) -> None:
+         """Close the database connection."""
+         if self._conn is not None:
+             self._conn.close()
+             self._conn = None
+
+     def _init_schema(self) -> None:
+         """Create database tables if they don't exist."""
+         self.conn.executescript(self._SCHEMA_SQL)
+         self.conn.commit()
+         self._load_token_dict()
+
+     def _load_token_dict(self) -> None:
+         """Load token dictionary from database into memory."""
+         cursor = self.conn.execute("SELECT id, token FROM tokens")
+         self._token_to_id.clear()
+         self._id_to_token.clear()
+         for row in cursor:
+             token_id, token = row[0], row[1]
+             self._token_to_id[token] = token_id
+             self._id_to_token[token_id] = token
+
+     def __enter__(self) -> TokenCache:
+         """Context manager entry - opens connection."""
+         return self.open()
+
+     def __exit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: object,
+     ) -> None:
+         """Context manager exit - closes connection."""
+         self.close()
+
+     @contextmanager
+     def transaction(self) -> Iterator[sqlite3.Cursor]:
+         """Context manager for a database transaction.
+
+         Commits on success, rolls back on exception.
+
+         Yields:
+             SQLite cursor for executing statements.
+         """
+         cursor = self.conn.cursor()
+         try:
+             yield cursor
+             self.conn.commit()
+         except Exception:
+             self.conn.rollback()
+             raise
+         finally:
+             cursor.close()
+
+     def _utcnow_iso(self) -> str:
+         """Get current UTC time as ISO 8601 string."""
+         return datetime.now(timezone.utc).isoformat()
+
+     def table_exists(self, table_name: str) -> bool:
+         """Check if a table exists in the database.
+
+         Args:
+             table_name: Name of the table to check.
+
+         Returns:
+             True if table exists, False otherwise.
+         """
+         cursor = self.conn.execute(
+             "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
+             (table_name,),
+         )
+         return cursor.fetchone() is not None
+
+     def entry_count(self, entry_type: str | None = None) -> int:
+         """Count cache entries, optionally filtered by type.
+
+         Args:
+             entry_type: If provided, count only entries of this type.
+
+         Returns:
+             Number of matching entries.
+         """
+         if entry_type is None:
+             cursor = self.conn.execute("SELECT COUNT(*) FROM cache_entries")
+         else:
+             cursor = self.conn.execute(
+                 "SELECT COUNT(*) FROM cache_entries WHERE entry_type = ?",
+                 (entry_type,),
+             )
+         row = cursor.fetchone()
+         return int(row[0]) if row else 0
+
+     def list_entry_types(self) -> list[str]:
+         """List all distinct entry types in the cache.
+
+         Returns:
+             List of entry type names found in the cache.
+
+         Example:
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("llm", LLMEntryEncoder())
+             ...     cache.put_data("h1", "llm", {"data": "test"})
+             ...     cache.list_entry_types()
+             ['llm']
+         """
+         cursor = self.conn.execute(
+             "SELECT DISTINCT entry_type FROM cache_entries ORDER BY entry_type"
+         )
+         return [row[0] for row in cursor.fetchall()]
+
+     def token_count(self) -> int:
+         """Count tokens in the dictionary.
+
+         Returns:
+             Number of tokens.
+         """
+         cursor = self.conn.execute("SELECT COUNT(*) FROM tokens")
+         row = cursor.fetchone()
+         return int(row[0]) if row else 0
+
+     def get_or_create_token(self, token: str) -> int:
+         """Get token ID, creating a new entry if needed.
+
+         This method is used by encoders to compress strings to integer IDs.
+         The token dictionary grows dynamically as new tokens are encountered.
+
+         Args:
+             token: The string token to look up or create.
+
+         Returns:
+             Integer ID for the token (1-65535 range).
+
+         Raises:
+             ValueError: If token dictionary exceeds uint16 capacity.
+         """
+         # Fast path: check in-memory cache
+         if token in self._token_to_id:
+             return self._token_to_id[token]
+
+         # Slow path: insert into database (RETURNING needs SQLite >= 3.35)
+         cursor = self.conn.execute(
+             "INSERT INTO tokens (token) VALUES (?) RETURNING id",
+             (token,),
+         )
+         token_id: int = cursor.fetchone()[0]
+         self.conn.commit()
+
+         # Check uint16 capacity (max 65,535 tokens)
+         if token_id > 65535:  # pragma: no cover
+             raise ValueError(
+                 f"Token dictionary exceeded uint16 capacity: {token_id}"
+             )
+
+         # Update in-memory cache
+         self._token_to_id[token] = token_id
+         self._id_to_token[token_id] = token
+
+         return token_id
+
+     def get_token(self, token_id: int) -> str | None:
+         """Get token string by ID.
+
+         This method is used by decoders to expand integer IDs back to strings.
+
+         Args:
+             token_id: The integer ID to look up.
+
+         Returns:
+             The token string, or None if not found.
+         """
+         return self._id_to_token.get(token_id)
+
+     # ========================================================================
+     # Cache entry operations
+     # ========================================================================
+
+     def put(
+         self,
+         hash: str,
+         entry_type: str,
+         data: bytes,
+         metadata: bytes | None = None,
+     ) -> None:
+         """Store a cache entry.
+
+         Args:
+             hash: Unique identifier for the entry (e.g. SHA-256 truncated).
+             entry_type: Type of entry (e.g. 'llm', 'graph', 'score').
+             data: Binary data to store.
+             metadata: Optional binary metadata.
+         """
+         self.conn.execute(
+             "INSERT OR REPLACE INTO cache_entries "
+             "(hash, entry_type, data, created_at, metadata) "
+             "VALUES (?, ?, ?, ?, ?)",
+             (hash, entry_type, data, self._utcnow_iso(), metadata),
+         )
+         self.conn.commit()
+
+     def get(self, hash: str, entry_type: str) -> bytes | None:
+         """Retrieve a cache entry.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry to retrieve.
+
+         Returns:
+             Binary data if found, None otherwise.
+         """
+         cursor = self.conn.execute(
+             "SELECT data FROM cache_entries "
+             "WHERE hash = ? AND entry_type = ?",
+             (hash, entry_type),
+         )
+         row = cursor.fetchone()
+         return row[0] if row else None
+
+     def get_with_metadata(
+         self, hash: str, entry_type: str
+     ) -> tuple[bytes, bytes | None] | None:
+         """Retrieve a cache entry with its metadata.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry to retrieve.
+
+         Returns:
+             Tuple of (data, metadata) if found, None otherwise.
+         """
+         cursor = self.conn.execute(
+             "SELECT data, metadata FROM cache_entries "
+             "WHERE hash = ? AND entry_type = ?",
+             (hash, entry_type),
+         )
+         row = cursor.fetchone()
+         return (row[0], row[1]) if row else None
+
+     def exists(self, hash: str, entry_type: str) -> bool:
+         """Check if a cache entry exists.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry to check.
+
+         Returns:
+             True if entry exists, False otherwise.
+         """
+         cursor = self.conn.execute(
+             "SELECT 1 FROM cache_entries WHERE hash = ? AND entry_type = ?",
+             (hash, entry_type),
+         )
+         return cursor.fetchone() is not None
+
+     def delete(self, hash: str, entry_type: str) -> bool:
+         """Delete a cache entry.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry to delete.
+
+         Returns:
+             True if entry was deleted, False if it didn't exist.
+         """
+         cursor = self.conn.execute(
+             "DELETE FROM cache_entries WHERE hash = ? AND entry_type = ?",
+             (hash, entry_type),
+         )
+         self.conn.commit()
+         return cursor.rowcount > 0
+
+     # ========================================================================
+     # Encoder registration and auto-encoding operations
+     # ========================================================================
+
+     def register_encoder(self, entry_type: str, encoder: EntryEncoder) -> None:
+         """Register an encoder for a specific entry type.
+
+         Once registered, `put_data()` and `get_data()` will automatically
+         encode/decode entries of this type using the registered encoder.
+
+         Args:
+             entry_type: Type identifier (e.g. 'llm', 'json', 'score').
+             encoder: EntryEncoder instance for this type.
+
+         Example:
+             >>> from causaliq_knowledge.cache.encoders import JsonEncoder
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("json", JsonEncoder())
+             ...     cache.put_data("key1", "json", {"msg": "hello"})
+         """
+         self._encoders[entry_type] = encoder
+
+     def get_encoder(self, entry_type: str) -> EntryEncoder | None:
+         """Get the registered encoder for an entry type.
+
+         Args:
+             entry_type: Type identifier to look up.
+
+         Returns:
+             The registered encoder, or None if not registered.
+         """
+         return self._encoders.get(entry_type)
+
+     def has_encoder(self, entry_type: str) -> bool:
+         """Check if an encoder is registered for an entry type.
+
+         Args:
+             entry_type: Type identifier to check.
+
+         Returns:
+             True if encoder is registered, False otherwise.
+         """
+         return entry_type in self._encoders
+
+     def put_data(
+         self,
+         hash: str,
+         entry_type: str,
+         data: Any,
+         metadata: Any | None = None,
+     ) -> None:
+         """Store data using the registered encoder for the entry type.
+
+         This method automatically encodes the data using the encoder
+         registered for the given entry_type. Use `put()` for raw bytes.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry (must have registered encoder).
+             data: Data to encode and store.
+             metadata: Optional metadata to encode and store.
+
+         Raises:
+             KeyError: If no encoder is registered for entry_type.
+
+         Example:
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("json", JsonEncoder())
+             ...     cache.put_data("abc", "json", {"key": "value"})
+         """
+         encoder = self._encoders[entry_type]
+         blob = encoder.encode(data, self)
+         meta_blob = (
+             encoder.encode(metadata, self) if metadata is not None else None
+         )
+         self.put(hash, entry_type, blob, meta_blob)
+
+     def get_data(self, hash: str, entry_type: str) -> Any | None:
+         """Retrieve and decode data using the registered encoder.
+
+         This method automatically decodes the data using the encoder
+         registered for the given entry_type. Use `get()` for raw bytes.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry (must have registered encoder).
+
+         Returns:
+             Decoded data if found, None otherwise.
+
+         Raises:
+             KeyError: If no encoder is registered for entry_type.
+
+         Example:
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("json", JsonEncoder())
+             ...     cache.put_data("abc", "json", {"key": "value"})
+             ...     data = cache.get_data("abc", "json")
+         """
+         blob = self.get(hash, entry_type)
+         if blob is None:
+             return None
+         encoder = self._encoders[entry_type]
+         return encoder.decode(blob, self)
+
+     def get_data_with_metadata(
+         self, hash: str, entry_type: str
+     ) -> tuple[Any, Any | None] | None:
+         """Retrieve and decode data with metadata using registered encoder.
+
+         Args:
+             hash: Unique identifier for the entry.
+             entry_type: Type of entry (must have registered encoder).
+
+         Returns:
+             Tuple of (decoded_data, decoded_metadata) if found, None
+             otherwise. metadata may be None if not stored.
+
+         Raises:
+             KeyError: If no encoder is registered for entry_type.
+         """
+         result = self.get_with_metadata(hash, entry_type)
+         if result is None:
+             return None
+         data_blob, meta_blob = result
+         encoder = self._encoders[entry_type]
+         decoded_data = encoder.decode(data_blob, self)
+         decoded_meta = encoder.decode(meta_blob, self) if meta_blob else None
+         return (decoded_data, decoded_meta)
+
+     # ========================================================================
+     # Import/Export operations
+     # ========================================================================
+
+     def export_entries(
+         self,
+         output_dir: Path,
+         entry_type: str,
+         fmt: str | None = None,
+     ) -> int:
+         """Export cache entries to human-readable files.
+
+         Each entry is exported to a separate file named `{hash}.{ext}` where
+         ext is determined by the format or encoder's default_export_format.
+
+         Args:
+             output_dir: Directory to write exported files to. Created if
+                 it doesn't exist.
+             entry_type: Type of entries to export (must have registered
+                 encoder).
+             fmt: Export format (e.g. 'json', 'yaml'). If None, uses the
+                 encoder's default_export_format.
+
+         Returns:
+             Number of entries exported.
+
+         Raises:
+             KeyError: If no encoder is registered for entry_type.
+
+         Example:
+             >>> from pathlib import Path
+             >>> from causaliq_knowledge.cache import TokenCache
+             >>> from causaliq_knowledge.cache.encoders import JsonEncoder
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("json", JsonEncoder())
+             ...     cache.put_data("abc123", "json", {"key": "value"})
+             ...     count = cache.export_entries(Path("./export"), "json")
+             ...     # Creates ./export/abc123.json
+         """
+         encoder = self._encoders[entry_type]
+         ext = fmt or encoder.default_export_format
+
+         # Create output directory if needed
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Query all entries of this type
+         cursor = self.conn.execute(
+             "SELECT hash, data FROM cache_entries WHERE entry_type = ?",
+             (entry_type,),
+         )
+
+         count = 0
+         for hash_val, blob in cursor:
+             # Decode the blob to get original data
+             data = encoder.decode(blob, self)
+             # Export to file using encoder's export method
+             file_path = output_dir / f"{hash_val}.{ext}"
+             encoder.export(data, file_path)
+             count += 1
+
+         return count
+
+     def import_entries(
+         self,
+         input_dir: Path,
+         entry_type: str,
+     ) -> int:
+         """Import human-readable files into the cache.
+
+         Each file is imported with its stem (filename without extension)
+         used as the cache hash. The encoder's import_() method reads each
+         file, and the data is encoded before storage.
+
+         Args:
+             input_dir: Directory containing files to import.
+             entry_type: Type to assign to imported entries (must have
+                 registered encoder).
+
+         Returns:
+             Number of entries imported.
+
+         Raises:
+             KeyError: If no encoder is registered for entry_type.
+             FileNotFoundError: If input_dir doesn't exist.
+
+         Example:
+             >>> from pathlib import Path
+             >>> from causaliq_knowledge.cache import TokenCache
+             >>> from causaliq_knowledge.cache.encoders import JsonEncoder
+             >>> with TokenCache(":memory:") as cache:
+             ...     cache.register_encoder("json", JsonEncoder())
+             ...     count = cache.import_entries(Path("./import"), "json")
+             ...     # Imports all files from ./import as "json" entries
+         """
+         encoder = self._encoders[entry_type]
+
+         if not input_dir.exists():
+             raise FileNotFoundError(f"Input directory not found: {input_dir}")
+
+         count = 0
+         for file_path in input_dir.iterdir():
+             if file_path.is_file():
+                 # Use filename (without extension) as hash
+                 hash_val = file_path.stem
+                 # Import data using encoder
+                 data = encoder.import_(file_path)
+                 # Encode and store
+                 self.put_data(hash_val, entry_type, data)
+                 count += 1
+
+         return count
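
To see how these pieces fit together, here is a short usage sketch assembled from the docstring examples above. It is illustrative only: it assumes the package is installed, that JsonEncoder is importable from causaliq_knowledge.cache.encoders as the docstrings show, and the entry-type names "raw" and "json" are arbitrary labels chosen for this demo.

from pathlib import Path

from causaliq_knowledge.cache import TokenCache
from causaliq_knowledge.cache.encoders import JsonEncoder

with TokenCache(":memory:") as cache:
    # Raw bytes round-trip: put()/get() need no encoder.
    cache.put("abc123", "raw", b"hello")
    assert cache.get("abc123", "raw") == b"hello"

    # Encoded round-trip: put_data()/get_data() use the registered encoder.
    cache.register_encoder("json", JsonEncoder())
    cache.put_data("k1", "json", {"msg": "hello"}, metadata={"src": "demo"})
    assert cache.get_data("k1", "json") == {"msg": "hello"}

    # The shared token dictionary maps strings to uint16 IDs and back.
    tid = cache.get_or_create_token("hello")
    assert cache.get_token(tid) == "hello"

    # Introspection and export helpers.
    assert cache.exists("k1", "json")
    print(cache.entry_count(), cache.list_entry_types())
    cache.export_entries(Path("./export"), "json")  # writes ./export/k1.json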