cached-duckdb 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cached_duckdb/CHANGELOG.md +52 -0
- cached_duckdb/ENVIRONMENT_VARIABLES.md +179 -0
- cached_duckdb/VERSION +1 -0
- cached_duckdb/__init__.py +48 -0
- cached_duckdb/cache_manager.py +621 -0
- cached_duckdb/cache_persistence.py +391 -0
- cached_duckdb/cache_query.py +173 -0
- cached_duckdb/cache_store.py +146 -0
- cached_duckdb/cache_ttl.py +245 -0
- cached_duckdb/cached_duckdb_USER_MANUAL.md +1155 -0
- cached_duckdb/config.py +242 -0
- cached_duckdb/errors.py +33 -0
- cached_duckdb/example.py +177 -0
- cached_duckdb/setup.py +63 -0
- cached_duckdb/table_utils.py +248 -0
- cached_duckdb/test_duckdb_libraries.py +1109 -0
- cached_duckdb/test_libs_clickhouse_config.py +511 -0
- cached_duckdb-0.2.0.dist-info/METADATA +375 -0
- cached_duckdb-0.2.0.dist-info/RECORD +22 -0
- cached_duckdb-0.2.0.dist-info/WHEEL +5 -0
- cached_duckdb-0.2.0.dist-info/licenses/LICENSE +21 -0
- cached_duckdb-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.2.0] - 2026-06-01
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
- Standardized packaging to match deployment rules
|
|
12
|
+
- Added `license-files`, `classifiers`, and `cibuildwheel` config to pyproject.toml
|
|
13
|
+
- Replaced legacy setup.py with Cython-gated build pattern
|
|
14
|
+
- Added version lookup via `importlib.metadata` in `__init__.py`
|
|
15
|
+
- Added GitHub Actions publish workflow (`.github/workflows/publish.yml`)
|
|
16
|
+
- Updated MANIFEST.in to include all documentation files
|
|
17
|
+
- Bumped `requires-python` to `>=3.10`
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
- PersistenceCoordinator for external DB persistence
|
|
21
|
+
- `DuckDbCachePersistenceError` exception
|
|
22
|
+
- `cache_persistence.py` module
|
|
23
|
+
|
|
24
|
+
## [0.1.0] - 2026-05-13
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
- Initial release of cached_duckdb library
|
|
28
|
+
- Generic database/table API for in-memory DataFrame caching
|
|
29
|
+
- Two storage modes: single_db and per_table_db
|
|
30
|
+
- Atomic swap for safe concurrent writes
|
|
31
|
+
- TTL-based expiry with background cleanup thread
|
|
32
|
+
- Lazy stale flagging for active readers
|
|
33
|
+
- Scheduler-managed table support (bypass TTL)
|
|
34
|
+
- SQL query interface with WHERE clause filtering
|
|
35
|
+
- Cross-table JOIN support (single_db mode)
|
|
36
|
+
- Per-database configuration via JSON file
|
|
37
|
+
- Environment variable configuration
|
|
38
|
+
- Thread-safe operations with per-database/table locking
|
|
39
|
+
- Comprehensive error handling with custom exceptions
|
|
40
|
+
- Metadata queries (row count, columns, types, last updated)
|
|
41
|
+
- Manual cache invalidation (per table or entire database)
|
|
42
|
+
- Raw DuckDB connection access for advanced queries
|
|
43
|
+
- Graceful shutdown with connection cleanup
|
|
44
|
+
|
|
45
|
+
### Features
|
|
46
|
+
- Zero disk usage - pure in-memory storage
|
|
47
|
+
- Columnar format - 20-30% less RAM than pandas
|
|
48
|
+
- Single-pass SQL queries - filter + aggregate in one operation
|
|
49
|
+
- Safe concurrent reads during writes
|
|
50
|
+
- Configurable TTL per table or database
|
|
51
|
+
- Priority-based configuration resolution
|
|
52
|
+
- Background cleanup thread with configurable interval
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Environment Variables
|
|
2
|
+
|
|
3
|
+
This document lists all environment variables supported by cached_duckdb.
|
|
4
|
+
|
|
5
|
+
All variables use the `CACHED_DUCKDB_` prefix by default.
|
|
6
|
+
|
|
7
|
+
## Configuration Variables
|
|
8
|
+
|
|
9
|
+
### CACHED_DUCKDB_DEFAULT_MODE
|
|
10
|
+
- **Type:** String
|
|
11
|
+
- **Default:** `single_db`
|
|
12
|
+
- **Options:** `single_db` | `per_table_db`
|
|
13
|
+
- **Description:** Default storage mode for all databases
|
|
14
|
+
- `single_db`: One connection per database (enables JOINs)
|
|
15
|
+
- `per_table_db`: One connection per (database, table) pair (parallel writes)
|
|
16
|
+
|
|
17
|
+
### CACHED_DUCKDB_DEFAULT_TTL_MINUTES
|
|
18
|
+
- **Type:** Integer
|
|
19
|
+
- **Default:** `30`
|
|
20
|
+
- **Description:** Default time-to-live in minutes for cached data
|
|
21
|
+
- **Note:** Can be overridden per database/table in config file
|
|
22
|
+
|
|
23
|
+
### CACHED_DUCKDB_CLEANUP_INTERVAL_MINUTES
|
|
24
|
+
- **Type:** Integer
|
|
25
|
+
- **Default:** `5`
|
|
26
|
+
- **Description:** How often the background cleanup thread runs (in minutes)
|
|
27
|
+
|
|
28
|
+
### CACHED_DUCKDB_LOCK_TIMEOUT_SECONDS
|
|
29
|
+
- **Type:** Float
|
|
30
|
+
- **Default:** `30.0`
|
|
31
|
+
- **Description:** Timeout for acquiring write locks (in seconds)
|
|
32
|
+
- **Note:** Raises `DuckDbCacheLockError` if the timeout is exceeded
|
|
33
|
+
|
|
34
|
+
### CACHED_DUCKDB_CONFIG_FILE_PATH
|
|
35
|
+
- **Type:** String (file path)
|
|
36
|
+
- **Default:** `None`
|
|
37
|
+
- **Description:** Path to connector_config.json for per-database settings
|
|
38
|
+
- **Example:** `/path/to/connector_config.json`
|
|
39
|
+
|
|
40
|
+
### CACHED_DUCKDB_LOG_NAME
|
|
41
|
+
- **Type:** String
|
|
42
|
+
- **Default:** `cached_duckdb`
|
|
43
|
+
- **Description:** Logger name for this library
|
|
44
|
+
- **Note:** Use this to configure logging for cached_duckdb specifically
|
|
45
|
+
|
|
46
|
+
## Persistence Variables (v0.2.0+)
|
|
47
|
+
|
|
48
|
+
### CACHED_DUCKDB_PERSIST_BASE_PATH
|
|
49
|
+
- **Type:** String (directory path)
|
|
50
|
+
- **Default:** `None` (in-memory only, no persistence)
|
|
51
|
+
- **Description:** Base directory for file-based persistence. When set:
|
|
52
|
+
- **Scenario 1 (DB-level):** Databases with a `persist_path` in connector_config.json (or all databases if no per-DB override) are stored as `{path}/{db_name}.duckdb` files instead of in-memory.
|
|
53
|
+
- **Scenario 2 (Table-level):** Tables marked with `"persist": true` in connector_config.json are saved as `{path}/{db_name}/{table_name}.parquet` files after each `store()`.
|
|
54
|
+
- **Example:** `/data/cache` or `C:\data\cache`
|
|
55
|
+
|
|
56
|
+
### CACHED_DUCKDB_SERVICE_NAME
|
|
57
|
+
- **Type:** String
|
|
58
|
+
- **Default:** `default`
|
|
59
|
+
- **Description:** Service identifier used to namespace rows in the external DB snapshot table (`cached_duckdb_snapshots`). Allows multiple services to share the same external snapshot table.
|
|
60
|
+
- **Example:** `order_service`, `analytics_pipeline`
|
|
61
|
+
|
|
62
|
+
## Example Configuration
|
|
63
|
+
|
|
64
|
+
### Linux/macOS (.env file)
|
|
65
|
+
```bash
|
|
66
|
+
CACHED_DUCKDB_DEFAULT_MODE=single_db
|
|
67
|
+
CACHED_DUCKDB_DEFAULT_TTL_MINUTES=30
|
|
68
|
+
CACHED_DUCKDB_CLEANUP_INTERVAL_MINUTES=5
|
|
69
|
+
CACHED_DUCKDB_LOCK_TIMEOUT_SECONDS=30
|
|
70
|
+
CACHED_DUCKDB_CONFIG_FILE_PATH=/opt/config/connector_config.json
|
|
71
|
+
CACHED_DUCKDB_PERSIST_BASE_PATH=/data/cache
|
|
72
|
+
CACHED_DUCKDB_SERVICE_NAME=my_service
|
|
73
|
+
CACHED_DUCKDB_LOG_NAME=cached_duckdb
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Windows (PowerShell)
|
|
77
|
+
```powershell
|
|
78
|
+
$env:CACHED_DUCKDB_DEFAULT_MODE="single_db"
|
|
79
|
+
$env:CACHED_DUCKDB_DEFAULT_TTL_MINUTES="30"
|
|
80
|
+
$env:CACHED_DUCKDB_CLEANUP_INTERVAL_MINUTES="5"
|
|
81
|
+
$env:CACHED_DUCKDB_LOCK_TIMEOUT_SECONDS="30"
|
|
82
|
+
$env:CACHED_DUCKDB_CONFIG_FILE_PATH="C:\config\connector_config.json"
|
|
83
|
+
$env:CACHED_DUCKDB_PERSIST_BASE_PATH="C:\data\cache"
|
|
84
|
+
$env:CACHED_DUCKDB_SERVICE_NAME="my_service"
|
|
85
|
+
$env:CACHED_DUCKDB_LOG_NAME="cached_duckdb"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Python Code
|
|
89
|
+
```python
|
|
90
|
+
import os
|
|
91
|
+
|
|
92
|
+
os.environ['CACHED_DUCKDB_DEFAULT_MODE'] = 'per_table_db'
|
|
93
|
+
os.environ['CACHED_DUCKDB_DEFAULT_TTL_MINUTES'] = '60'
|
|
94
|
+
os.environ['CACHED_DUCKDB_CLEANUP_INTERVAL_MINUTES'] = '10'
|
|
95
|
+
os.environ['CACHED_DUCKDB_PERSIST_BASE_PATH'] = '/data/cache'
|
|
96
|
+
os.environ['CACHED_DUCKDB_SERVICE_NAME'] = 'order_service'
|
|
97
|
+
|
|
98
|
+
from cached_duckdb import DuckDbCacheConfig, DuckDbCacheManager
|
|
99
|
+
|
|
100
|
+
config = DuckDbCacheConfig.from_env()
|
|
101
|
+
cache = DuckDbCacheManager(config)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Configuration Priority
|
|
105
|
+
|
|
106
|
+
Configuration is resolved in this order (highest to lowest):
|
|
107
|
+
|
|
108
|
+
1. **Per-table config** in connector_config.json (database → table → setting)
|
|
109
|
+
2. **Per-database config** in connector_config.json (database → setting)
|
|
110
|
+
3. **Environment variables** (CACHED_DUCKDB_*)
|
|
111
|
+
4. **Hardcoded defaults** in DuckDbCacheConfig
|
|
112
|
+
|
|
113
|
+
### Example Priority Resolution
|
|
114
|
+
|
|
115
|
+
For TTL of `database="client_abc"`, `table="sales_data"`:
|
|
116
|
+
|
|
117
|
+
```jsonc
|
|
118
|
+
// connector_config.json
|
|
119
|
+
{
|
|
120
|
+
"client_abc": {
|
|
121
|
+
"default_cache_ttl_minutes": 45, // Database-level
|
|
122
|
+
"sales_data": {
|
|
123
|
+
"cache_ttl_minutes": 60 // Table-level (highest priority)
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Persistence Configuration in connector_config.json (v0.2.0+)
|
|
130
|
+
|
|
131
|
+
```jsonc
|
|
132
|
+
{
|
|
133
|
+
"client_abc": {
|
|
134
|
+
"persist_path": "/data/cache", // Scenario 1: DB saved as /data/cache/client_abc.duckdb
|
|
135
|
+
"sales_data": {
|
|
136
|
+
"persist": true // Scenario 2: table saved as {persist_base_path}/client_abc/sales_data.parquet
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Note:** If `persist_path` is set on the database AND `persist: true` is set on a table within that database, the file-based DB takes precedence — the table is already on disk inside the `.duckdb` file, so no separate Parquet file is created.
|
|
143
|
+
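For the TTL priority example above (`client_abc` / `sales_data`), assume the environment also sets a default: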
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# .env
|
|
147
|
+
CACHED_DUCKDB_DEFAULT_TTL_MINUTES=30 # Environment (lower priority)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Result:** `cache_ttl_minutes = 60` (from table-level config)
|
|
151
|
+
|
|
152
|
+
## Logging Configuration
|
|
153
|
+
|
|
154
|
+
To configure logging for cached_duckdb:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import logging
|
|
158
|
+
|
|
159
|
+
# Set log level
|
|
160
|
+
logging.getLogger('cached_duckdb').setLevel(logging.DEBUG)
|
|
161
|
+
|
|
162
|
+
# Add handler
|
|
163
|
+
handler = logging.StreamHandler()
|
|
164
|
+
handler.setFormatter(logging.Formatter(
|
|
165
|
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
166
|
+
))
|
|
167
|
+
logging.getLogger('cached_duckdb').addHandler(handler)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Alternatively, set a custom logger name through the environment and configure that logger instead:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
export CACHED_DUCKDB_LOG_NAME=my_cache_logger
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
import logging
|
|
178
|
+
logging.getLogger('my_cache_logger').setLevel(logging.INFO)
|
|
179
|
+
```
|
cached_duckdb/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.0
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""cached_duckdb - Fast in-memory DataFrame cache using DuckDB."""
|
|
2
|
+
|
|
3
|
+
# Resolve the package version from the installed distribution's metadata
# at import time.
try:
    from importlib.metadata import version
    __version__ = version("cached-duckdb")
except Exception:
    # Best-effort lookup: any failure in the import or the metadata query
    # falls back to a placeholder version instead of propagating out of the
    # package import.
    __version__ = "0.0.0"
|
|
8
|
+
|
|
9
|
+
from .config import DuckDbCacheConfig, CacheConfigResolver
|
|
10
|
+
from .cache_manager import DuckDbCacheManager
|
|
11
|
+
from .cache_persistence import PersistenceCoordinator
|
|
12
|
+
from .errors import (
|
|
13
|
+
DuckDbCacheError,
|
|
14
|
+
DuckDbCacheConfigError,
|
|
15
|
+
DuckDbCacheLockError,
|
|
16
|
+
DuckDbCacheNotFoundError,
|
|
17
|
+
DuckDbCacheStaleError,
|
|
18
|
+
DuckDbCachePersistenceError
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_cache_manager(
    config: "DuckDbCacheConfig | None" = None,
    external_connector=None,
) -> DuckDbCacheManager:
    """Return a DuckDB cache manager built from the given configuration.

    Thin convenience wrapper: both arguments are forwarded unchanged to the
    ``DuckDbCacheManager`` constructor.

    NOTE(review): earlier documentation described the result as a singleton.
    Nothing in this function caches or reuses an instance, so that guarantee
    (if it exists) has to be provided by ``DuckDbCacheManager`` itself;
    confirm against its implementation.

    Args:
        config: Optional configuration. ``None`` is passed through as-is; the
            manager is documented to load its settings from the environment
            in that case (handled outside this function; confirm).
        external_connector: Optional duck-typed connector for external DB
            persistence. Must implement execute(sql, params), fetchone(),
            fetchall().

    Returns:
        DuckDbCacheManager instance
    """
    return DuckDbCacheManager(config, external_connector=external_connector)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Names exported by ``from cached_duckdb import *``: the classes and
# exceptions imported above plus the ``get_cache_manager`` helper.
__all__ = [
    "DuckDbCacheConfig",
    "CacheConfigResolver",
    "DuckDbCacheManager",
    "PersistenceCoordinator",
    "DuckDbCacheError",
    "DuckDbCacheConfigError",
    "DuckDbCacheLockError",
    "DuckDbCacheNotFoundError",
    "DuckDbCacheStaleError",
    "DuckDbCachePersistenceError",
    "get_cache_manager",
]
|