nfscache 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nfscache-0.1.0/.gitignore +29 -0
- nfscache-0.1.0/Dockerfile +11 -0
- nfscache-0.1.0/LICENSE +21 -0
- nfscache-0.1.0/PKG-INFO +284 -0
- nfscache-0.1.0/README.md +262 -0
- nfscache-0.1.0/build_and_run.sh +80 -0
- nfscache-0.1.0/init/001_create_user_and_privs.sql +13 -0
- nfscache-0.1.0/nfscache/__init__.py +3 -0
- nfscache-0.1.0/nfscache/data/__init__.py +0 -0
- nfscache-0.1.0/nfscache/data/data_container.py +29 -0
- nfscache-0.1.0/nfscache/data/data_holder.py +23 -0
- nfscache-0.1.0/nfscache/database/__init__.py +0 -0
- nfscache-0.1.0/nfscache/database/oracle_env.py +77 -0
- nfscache-0.1.0/nfscache/database/oracle_pool.py +69 -0
- nfscache-0.1.0/nfscache/database/oracle_read.py +100 -0
- nfscache-0.1.0/nfscache/database/oracle_write.py +182 -0
- nfscache-0.1.0/nfscache/database/oracle_write_container.py +186 -0
- nfscache-0.1.0/nfscache/nfs_cache.py +836 -0
- nfscache-0.1.0/nfscache/util/__init__.py +0 -0
- nfscache-0.1.0/nfscache/util/generate_parquets.py +164 -0
- nfscache-0.1.0/nfscache/util/main.py +80 -0
- nfscache-0.1.0/nfscache/util/swarm_file.py +204 -0
- nfscache-0.1.0/nfscache/util/swarm_sql.py +328 -0
- nfscache-0.1.0/pyproject.toml +49 -0
- nfscache-0.1.0/tests/test_nfs_cache_locking.py +237 -0
- nfscache-0.1.0/tests/test_nfs_cache_metadata.py +164 -0
- nfscache-0.1.0/tests/test_nfs_cache_sql.py +215 -0
- nfscache-0.1.0/tests/test_oracle_pool.py +93 -0
- nfscache-0.1.0/uv.lock +551 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
|
|
8
|
+
# Virtual environment
|
|
9
|
+
.venv/
|
|
10
|
+
|
|
11
|
+
# uv
|
|
12
|
+
.uv/
|
|
13
|
+
|
|
14
|
+
# Testing / coverage
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
.coverage
|
|
17
|
+
htmlcov/
|
|
18
|
+
__cache__/
|
|
19
|
+
|
|
20
|
+
# Editor / OS
|
|
21
|
+
.idea/
|
|
22
|
+
.vscode/
|
|
23
|
+
.DS_Store
|
|
24
|
+
*.parquet
|
|
25
|
+
*.env
|
|
26
|
+
*.log
|
|
27
|
+
*.txt
|
|
28
|
+
.claude/
|
|
29
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Oracle Database Free (slim image): a lightweight-ish database for dev/testing.
FROM gvenzl/oracle-free:23-slim

# Everything under ./init/ is installed as an init script. Init scripts are
# executed once, on first DB initialization, in alphabetical order. SQL scripts
# run as SYS, so a script that should create objects in an app schema has to
# CONNECT as that user itself.
COPY ./init/ /container-entrypoint-initdb.d/

EXPOSE 1521

# The base image already ships healthcheck.sh, so simply call it.
HEALTHCHECK --interval=10s --timeout=5s --retries=20 \
    CMD ["healthcheck.sh"]
|
nfscache-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mannetroll
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
nfscache-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nfscache
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Concurrency-safe, NFS-friendly Parquet cache for Polars DataFrames
|
|
5
|
+
Author: Torbjörn Sjögren
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: cache,concurrency,dataframe,nfs,oracle,parquet,polars
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: POSIX
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Database
|
|
14
|
+
Classifier: Topic :: System :: Filesystems
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: <3.14,>=3.13
|
|
17
|
+
Requires-Dist: numpy<3,>=2.4.6
|
|
18
|
+
Requires-Dist: oracledb<4,>=3.4.2
|
|
19
|
+
Requires-Dist: polars<2,>=1.41.2
|
|
20
|
+
Requires-Dist: pyarrow<25,>=24.0.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# nfscache
|
|
24
|
+
|
|
25
|
+
Prototype NFS-backed cache for `DataContainer` objects whose payload is a
|
|
26
|
+
Polars `DataFrame`.
|
|
27
|
+
|
|
28
|
+
The cache stores container data as Parquet on a shared filesystem. Cold loads
|
|
29
|
+
can read from any slow source, for example Oracle, MySQL, or a local parquet
|
|
30
|
+
file. Warm loads use `polars.read_parquet`.
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install nfscache
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
Create an `NFSCache` pointed at a directory on the shared filesystem, then wrap
|
|
41
|
+
your cold-load function with a decorator. The wrapped function only runs on a
|
|
42
|
+
cache miss; warm hits are served from the Parquet cache.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from pathlib import Path
|
|
46
|
+
|
|
47
|
+
import polars as pl
|
|
48
|
+
|
|
49
|
+
from nfscache.nfs_cache import NFSCache
|
|
50
|
+
from nfscache.data.data_container import DataContainer
|
|
51
|
+
|
|
52
|
+
nfscache = NFSCache(Path("__cache__/nfs"))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# File / in-process source: cache key and version come from `filename`.
|
|
56
|
+
@nfscache.parquet
|
|
57
|
+
def load(filename: Path) -> DataContainer:
|
|
58
|
+
df = pl.read_parquet(filename)
|
|
59
|
+
return DataContainer({"headers": tuple(df.columns), "data": df})
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
container = load(Path("parquet/A_TEST.parquet")) # cold: runs the body
|
|
63
|
+
container = load(Path("parquet/A_TEST.parquet")) # warm: served from cache
|
|
64
|
+
df = container.data.rows_data_pl
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
For SQL sources, set `nfscache.connect_factory` (a `Callable[[], connection]`)
|
|
68
|
+
and use `@nfscache.sql`; the first argument is the SQL string. The cache key is
|
|
69
|
+
derived from the normalized SQL, and the source version is derived from
|
|
70
|
+
`MAX(ORA_ROWSCN)` plus the row count. See `nfscache/database/oracle_read.py`
|
|
71
|
+
for a complete Oracle wiring example.
|
|
72
|
+
|
|
73
|
+
## Current Functionality
|
|
74
|
+
|
|
75
|
+
- Decorator API: `@nfscache.parquet` and `@nfscache.sql`.
|
|
76
|
+
- Stores `DataContainer.data.rows_data_pl` as a Parquet cache file.
|
|
77
|
+
- Reads cached objects with the fast Polars parquet reader.
|
|
78
|
+
- Writes cached objects with `pyarrow.parquet.ParquetWriter`.
|
|
79
|
+
- Writes through unique `*.part` files, then atomically replaces the final file
|
|
80
|
+
with `os.replace`.
|
|
81
|
+
- Cleans up partial cache files on write failure.
|
|
82
|
+
- Uses a per-cache-key mkdir-based read/write lock: warm readers create
|
|
83
|
+
per-reader tokens and can overlap, while writers and invalidations block new
|
|
84
|
+
readers and wait for active readers to finish.
|
|
85
|
+
- Lock tokens include `lock.json` metadata with hostname, PID, UUID,
|
|
86
|
+
`created_at`, and `last_seen`; held locks heartbeat `last_seen`, and stale
|
|
87
|
+
reader/writer tokens are broken after `stale_lock_seconds`.
|
|
88
|
+
- The default stale lock timeout is 30 minutes, sized for cold Oracle reads that
|
|
89
|
+
can take around 10 minutes while still heartbeating as live work.
|
|
90
|
+
- Adds an authoritative metadata sidecar:
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
__cache__/nfs/parquet/A_TEST_1048576.parquet
|
|
94
|
+
__cache__/nfs/parquet/A_TEST_1048576.parquet.meta.json
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
- Metadata includes source key/version, parquet byte size, parquet SHA-256, row
|
|
98
|
+
count, column count, schema hash, writer version, created time, and normalized
|
|
99
|
+
`source_sql` for SQL-backed entries.
|
|
100
|
+
- Readers reject missing, stale, unsupported, or corrupt metadata and validate
|
|
101
|
+
parquet size/checksum/row count/schema hash before returning a warm hit.
|
|
102
|
+
- Invalidates stale cache entries when the source version changes.
|
|
103
|
+
- For file path arguments, the default source version is a SHA-256 content hash.
|
|
104
|
+
- SQL sources use normalized SQL for cache keys and `COUNT(*)` plus
|
|
105
|
+
`MAX(ORA_ROWSCN)` as the Oracle version token for the detected `FROM` table.
|
|
106
|
+
- Cold loads re-read the source version before and after loading and retry if
|
|
107
|
+
the source changes during the read.
|
|
108
|
+
|
|
109
|
+
## Demo
|
|
110
|
+
|
|
111
|
+
Run:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
uv run --no-cache --no-sync python -m nfscache.util.main
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`main.py` runs:
|
|
118
|
+
|
|
119
|
+
1. clear `__cache__`
|
|
120
|
+
2. generate parquet source data
|
|
121
|
+
3. cold load and write cache
|
|
122
|
+
4. warm cache load
|
|
123
|
+
5. regenerate parquet source data
|
|
124
|
+
6. reload because the source hash changed
|
|
125
|
+
7. warm cache load again
|
|
126
|
+
|
|
127
|
+
Expected shape:
|
|
128
|
+
|
|
129
|
+
```text
|
|
130
|
+
Clearing cache: __cache__
|
|
131
|
+
Generating: parquet/A_TEST_1048576.parquet...
|
|
132
|
+
Reading: parquet/A_TEST_1048576.parquet...
|
|
133
|
+
Returning cached object: parquet/A_TEST_1048576.parquet sha=<first 40 chars>...
|
|
134
|
+
Generating: parquet/A_TEST_1048576.parquet...
|
|
135
|
+
Ignoring cache entry: parquet/A_TEST_1048576.parquet: stale source version
|
|
136
|
+
Reading: parquet/A_TEST_1048576.parquet...
|
|
137
|
+
Returning cached object: parquet/A_TEST_1048576.parquet sha=<first 40 chars>...
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Swarm Test
|
|
141
|
+
|
|
142
|
+
`swarm_file.py` tests a multi-client environment with process-level concurrency.
|
|
143
|
+
It mixes cache gets with source regeneration to simulate clients reading while
|
|
144
|
+
the source data changes.
|
|
145
|
+
|
|
146
|
+
Run the default swarm:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_file
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Default behavior:
|
|
153
|
+
|
|
154
|
+
- 4 client processes
|
|
155
|
+
- 12 get waves
|
|
156
|
+
- 6 source regenerations
|
|
157
|
+
- generations are injected throughout the get waves
|
|
158
|
+
- final warm check after all waves complete
|
|
159
|
+
|
|
160
|
+
Useful smaller run:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_file \
|
|
164
|
+
--clients 3 \
|
|
165
|
+
--generators 1 \
|
|
166
|
+
--gets-per-client 6 \
|
|
167
|
+
--generations 3 \
|
|
168
|
+
--n-rows 1024 \
|
|
169
|
+
--cols 6 \
|
|
170
|
+
--n-int-cols 2 \
|
|
171
|
+
--n-str-cols 1 \
|
|
172
|
+
--data-dir /tmp/parquet-nfs-wave-swarm-parquet \
|
|
173
|
+
--cache-dir /tmp/parquet-nfs-wave-swarm-cache
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Swarm output includes:
|
|
177
|
+
|
|
178
|
+
- source generation hash
|
|
179
|
+
- cold `Reading: ...` reloads after invalidation
|
|
180
|
+
- warm `Returning cached object: ... sha=...` hits
|
|
181
|
+
- final multi-client warm check
|
|
182
|
+
|
|
183
|
+
## SQL Swarm Test
|
|
184
|
+
|
|
185
|
+
`swarm_sql.py` tests the same process-level concurrency path for Oracle-backed
|
|
186
|
+
SQL reads. It creates an Oracle table, runs client reads through `@nfscache.sql`,
|
|
187
|
+
and rewrites the table between read waves so the cache has to invalidate and
|
|
188
|
+
reload under load.
|
|
189
|
+
|
|
190
|
+
Start Oracle first, then run:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_sql
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Useful smaller run:
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_sql \
|
|
200
|
+
--clients 2 \
|
|
201
|
+
--writers 1 \
|
|
202
|
+
--gets-per-client 3 \
|
|
203
|
+
--generations 2 \
|
|
204
|
+
--n-rows 128 \
|
|
205
|
+
--batch-size 64 \
|
|
206
|
+
--table SWARM_SQL_TEST \
|
|
207
|
+
--cache-dir /tmp/parquet-nfs-swarm-sql-cache
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
SQL swarm output includes Oracle cold reads, writer SCNs, stale SQL cache
|
|
211
|
+
invalidation, warm cache hits, and a final multi-client warm check.
|
|
212
|
+
|
|
213
|
+
## Tests
|
|
214
|
+
|
|
215
|
+
Run focused unit tests:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
uv run --no-cache --no-sync python -m unittest discover -s tests
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
The tests cover authoritative metadata, corrupted metadata/parquet recovery,
|
|
222
|
+
normalized SQL metadata, overlapping warm readers, and writer-preference
|
|
223
|
+
locking. To run a syntax check on all modules:
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
uv run --no-cache --no-sync python -m compileall -q nfscache tests
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Generate Parquets
|
|
230
|
+
|
|
231
|
+
Generate or replace test parquet files:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
uv run --no-cache --no-sync python -m nfscache.util.generate_parquets
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
The generator writes to a unique `*.part` file and atomically replaces the final
|
|
238
|
+
parquet when the write is complete.
|
|
239
|
+
|
|
240
|
+
By default, content changes on every run. Use `--seed` for reproducible data:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
uv run --no-cache --no-sync python -m nfscache.util.generate_parquets --seed 123
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Oracle SQL Cache
|
|
247
|
+
|
|
248
|
+
Start the local Oracle demo container:
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
./build_and_run.sh [--wipe]
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Populate the demo table:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
uv run --no-cache --no-sync python -m nfscache.database.oracle_write_container
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Read through the SQL cache:
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
uv run --no-cache --no-sync python -m nfscache.database.oracle_read "select * from DATA_CONTAINER_DEMO"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
SQL cache keys use normalized SQL plus requested columns. Metadata stores the
|
|
267
|
+
normalized `source_sql`, and source versions use `COUNT(*)` plus
|
|
268
|
+
`MAX(ORA_ROWSCN)` for the detected `FROM` table.
|
|
269
|
+
|
|
270
|
+
## Production Notes
|
|
271
|
+
|
|
272
|
+
This is not yet production-grade enterprise software.
|
|
273
|
+
|
|
274
|
+
For Oracle on NFS with many clients, the next important pieces are:
|
|
275
|
+
|
|
276
|
+
- validate `mkdir` lock tokens, writer intent, stale-lock recovery, and
|
|
277
|
+
`os.replace` semantics on the actual NFS mount
|
|
278
|
+
- tie long Oracle reads to a documented consistent SCN/snapshot strategy
|
|
279
|
+
- add structured logs and metrics for hit/miss/reload, reader/writer lock wait,
|
|
280
|
+
cold load duration, parquet write/read duration, and corruption/retry counts
|
|
281
|
+
- broaden automated failure tests for crashed lock holders, corrupted files,
|
|
282
|
+
source changes during cold load, and multi-host NFS integration
|
|
283
|
+
- add operational controls for cache retention, quotas, old `*.part` cleanup,
|
|
284
|
+
version migration, compression, permissions, and bad-key runbooks
|
nfscache-0.1.0/README.md
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# nfscache
|
|
2
|
+
|
|
3
|
+
Prototype NFS-backed cache for `DataContainer` objects whose payload is a
|
|
4
|
+
Polars `DataFrame`.
|
|
5
|
+
|
|
6
|
+
The cache stores container data as Parquet on a shared filesystem. Cold loads
|
|
7
|
+
can read from any slow source, for example Oracle, MySQL, or a local parquet
|
|
8
|
+
file. Warm loads use `polars.read_parquet`.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install nfscache
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
Create an `NFSCache` pointed at a directory on the shared filesystem, then wrap
|
|
19
|
+
your cold-load function with a decorator. The wrapped function only runs on a
|
|
20
|
+
cache miss; warm hits are served from the Parquet cache.
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
import polars as pl
|
|
26
|
+
|
|
27
|
+
from nfscache.nfs_cache import NFSCache
|
|
28
|
+
from nfscache.data.data_container import DataContainer
|
|
29
|
+
|
|
30
|
+
nfscache = NFSCache(Path("__cache__/nfs"))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# File / in-process source: cache key and version come from `filename`.
|
|
34
|
+
@nfscache.parquet
|
|
35
|
+
def load(filename: Path) -> DataContainer:
|
|
36
|
+
df = pl.read_parquet(filename)
|
|
37
|
+
return DataContainer({"headers": tuple(df.columns), "data": df})
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
container = load(Path("parquet/A_TEST.parquet")) # cold: runs the body
|
|
41
|
+
container = load(Path("parquet/A_TEST.parquet")) # warm: served from cache
|
|
42
|
+
df = container.data.rows_data_pl
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
For SQL sources, set `nfscache.connect_factory` (a `Callable[[], connection]`)
|
|
46
|
+
and use `@nfscache.sql`; the first argument is the SQL string. The cache key is
|
|
47
|
+
derived from the normalized SQL, and the source version is derived from
|
|
48
|
+
`MAX(ORA_ROWSCN)` plus the row count. See `nfscache/database/oracle_read.py`
|
|
49
|
+
for a complete Oracle wiring example.
|
|
50
|
+
|
|
51
|
+
## Current Functionality
|
|
52
|
+
|
|
53
|
+
- Decorator API: `@nfscache.parquet` and `@nfscache.sql`.
|
|
54
|
+
- Stores `DataContainer.data.rows_data_pl` as a Parquet cache file.
|
|
55
|
+
- Reads cached objects with the fast Polars parquet reader.
|
|
56
|
+
- Writes cached objects with `pyarrow.parquet.ParquetWriter`.
|
|
57
|
+
- Writes through unique `*.part` files, then atomically replaces the final file
|
|
58
|
+
with `os.replace`.
|
|
59
|
+
- Cleans up partial cache files on write failure.
|
|
60
|
+
- Uses a per-cache-key mkdir-based read/write lock: warm readers create
|
|
61
|
+
per-reader tokens and can overlap, while writers and invalidations block new
|
|
62
|
+
readers and wait for active readers to finish.
|
|
63
|
+
- Lock tokens include `lock.json` metadata with hostname, PID, UUID,
|
|
64
|
+
`created_at`, and `last_seen`; held locks heartbeat `last_seen`, and stale
|
|
65
|
+
reader/writer tokens are broken after `stale_lock_seconds`.
|
|
66
|
+
- The default stale lock timeout is 30 minutes, sized for cold Oracle reads that
|
|
67
|
+
can take around 10 minutes while still heartbeating as live work.
|
|
68
|
+
- Adds an authoritative metadata sidecar:
|
|
69
|
+
|
|
70
|
+
```text
|
|
71
|
+
__cache__/nfs/parquet/A_TEST_1048576.parquet
|
|
72
|
+
__cache__/nfs/parquet/A_TEST_1048576.parquet.meta.json
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
- Metadata includes source key/version, parquet byte size, parquet SHA-256, row
|
|
76
|
+
count, column count, schema hash, writer version, created time, and normalized
|
|
77
|
+
`source_sql` for SQL-backed entries.
|
|
78
|
+
- Readers reject missing, stale, unsupported, or corrupt metadata and validate
|
|
79
|
+
parquet size/checksum/row count/schema hash before returning a warm hit.
|
|
80
|
+
- Invalidates stale cache entries when the source version changes.
|
|
81
|
+
- For file path arguments, the default source version is a SHA-256 content hash.
|
|
82
|
+
- SQL sources use normalized SQL for cache keys and `COUNT(*)` plus
|
|
83
|
+
`MAX(ORA_ROWSCN)` as the Oracle version token for the detected `FROM` table.
|
|
84
|
+
- Cold loads re-read the source version before and after loading and retry if
|
|
85
|
+
the source changes during the read.
|
|
86
|
+
|
|
87
|
+
## Demo
|
|
88
|
+
|
|
89
|
+
Run:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
uv run --no-cache --no-sync python -m nfscache.util.main
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
`main.py` runs:
|
|
96
|
+
|
|
97
|
+
1. clear `__cache__`
|
|
98
|
+
2. generate parquet source data
|
|
99
|
+
3. cold load and write cache
|
|
100
|
+
4. warm cache load
|
|
101
|
+
5. regenerate parquet source data
|
|
102
|
+
6. reload because the source hash changed
|
|
103
|
+
7. warm cache load again
|
|
104
|
+
|
|
105
|
+
Expected shape:
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
Clearing cache: __cache__
|
|
109
|
+
Generating: parquet/A_TEST_1048576.parquet...
|
|
110
|
+
Reading: parquet/A_TEST_1048576.parquet...
|
|
111
|
+
Returning cached object: parquet/A_TEST_1048576.parquet sha=<first 40 chars>...
|
|
112
|
+
Generating: parquet/A_TEST_1048576.parquet...
|
|
113
|
+
Ignoring cache entry: parquet/A_TEST_1048576.parquet: stale source version
|
|
114
|
+
Reading: parquet/A_TEST_1048576.parquet...
|
|
115
|
+
Returning cached object: parquet/A_TEST_1048576.parquet sha=<first 40 chars>...
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Swarm Test
|
|
119
|
+
|
|
120
|
+
`swarm_file.py` tests a multi-client environment with process-level concurrency.
|
|
121
|
+
It mixes cache gets with source regeneration to simulate clients reading while
|
|
122
|
+
the source data changes.
|
|
123
|
+
|
|
124
|
+
Run the default swarm:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_file
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Default behavior:
|
|
131
|
+
|
|
132
|
+
- 4 client processes
|
|
133
|
+
- 12 get waves
|
|
134
|
+
- 6 source regenerations
|
|
135
|
+
- generations are injected throughout the get waves
|
|
136
|
+
- final warm check after all waves complete
|
|
137
|
+
|
|
138
|
+
Useful smaller run:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_file \
|
|
142
|
+
--clients 3 \
|
|
143
|
+
--generators 1 \
|
|
144
|
+
--gets-per-client 6 \
|
|
145
|
+
--generations 3 \
|
|
146
|
+
--n-rows 1024 \
|
|
147
|
+
--cols 6 \
|
|
148
|
+
--n-int-cols 2 \
|
|
149
|
+
--n-str-cols 1 \
|
|
150
|
+
--data-dir /tmp/parquet-nfs-wave-swarm-parquet \
|
|
151
|
+
--cache-dir /tmp/parquet-nfs-wave-swarm-cache
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Swarm output includes:
|
|
155
|
+
|
|
156
|
+
- source generation hash
|
|
157
|
+
- cold `Reading: ...` reloads after invalidation
|
|
158
|
+
- warm `Returning cached object: ... sha=...` hits
|
|
159
|
+
- final multi-client warm check
|
|
160
|
+
|
|
161
|
+
## SQL Swarm Test
|
|
162
|
+
|
|
163
|
+
`swarm_sql.py` tests the same process-level concurrency path for Oracle-backed
|
|
164
|
+
SQL reads. It creates an Oracle table, runs client reads through `@nfscache.sql`,
|
|
165
|
+
and rewrites the table between read waves so the cache has to invalidate and
|
|
166
|
+
reload under load.
|
|
167
|
+
|
|
168
|
+
Start Oracle first, then run:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_sql
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Useful smaller run:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
uv run --no-cache --no-sync python -m nfscache.util.swarm_sql \
|
|
178
|
+
--clients 2 \
|
|
179
|
+
--writers 1 \
|
|
180
|
+
--gets-per-client 3 \
|
|
181
|
+
--generations 2 \
|
|
182
|
+
--n-rows 128 \
|
|
183
|
+
--batch-size 64 \
|
|
184
|
+
--table SWARM_SQL_TEST \
|
|
185
|
+
--cache-dir /tmp/parquet-nfs-swarm-sql-cache
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
SQL swarm output includes Oracle cold reads, writer SCNs, stale SQL cache
|
|
189
|
+
invalidation, warm cache hits, and a final multi-client warm check.
|
|
190
|
+
|
|
191
|
+
## Tests
|
|
192
|
+
|
|
193
|
+
Run focused unit tests:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
uv run --no-cache --no-sync python -m unittest discover -s tests
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
The tests cover authoritative metadata, corrupted metadata/parquet recovery,
|
|
200
|
+
normalized SQL metadata, overlapping warm readers, and writer-preference
|
|
201
|
+
locking. To run a syntax check on all modules:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
uv run --no-cache --no-sync python -m compileall -q nfscache tests
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Generate Parquets
|
|
208
|
+
|
|
209
|
+
Generate or replace test parquet files:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
uv run --no-cache --no-sync python -m nfscache.util.generate_parquets
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
The generator writes to a unique `*.part` file and atomically replaces the final
|
|
216
|
+
parquet when the write is complete.
|
|
217
|
+
|
|
218
|
+
By default, content changes on every run. Use `--seed` for reproducible data:
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
uv run --no-cache --no-sync python -m nfscache.util.generate_parquets --seed 123
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Oracle SQL Cache
|
|
225
|
+
|
|
226
|
+
Start the local Oracle demo container:
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
./build_and_run.sh [--wipe]
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Populate the demo table:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
uv run --no-cache --no-sync python -m nfscache.database.oracle_write_container
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Read through the SQL cache:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
uv run --no-cache --no-sync python -m nfscache.database.oracle_read "select * from DATA_CONTAINER_DEMO"
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
SQL cache keys use normalized SQL plus requested columns. Metadata stores the
|
|
245
|
+
normalized `source_sql`, and source versions use `COUNT(*)` plus
|
|
246
|
+
`MAX(ORA_ROWSCN)` for the detected `FROM` table.
|
|
247
|
+
|
|
248
|
+
## Production Notes
|
|
249
|
+
|
|
250
|
+
This is not yet production-grade enterprise software.
|
|
251
|
+
|
|
252
|
+
For Oracle on NFS with many clients, the next important pieces are:
|
|
253
|
+
|
|
254
|
+
- validate `mkdir` lock tokens, writer intent, stale-lock recovery, and
|
|
255
|
+
`os.replace` semantics on the actual NFS mount
|
|
256
|
+
- tie long Oracle reads to a documented consistent SCN/snapshot strategy
|
|
257
|
+
- add structured logs and metrics for hit/miss/reload, reader/writer lock wait,
|
|
258
|
+
cold load duration, parquet write/read duration, and corruption/retry counts
|
|
259
|
+
- broaden automated failure tests for crashed lock holders, corrupted files,
|
|
260
|
+
source changes during cold load, and multi-host NFS integration
|
|
261
|
+
- add operational controls for cache retention, quotas, old `*.part` cleanup,
|
|
262
|
+
version migration, compression, permissions, and bad-key runbooks
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#!/bin/bash
# build_and_run.sh
#
# Build the Oracle test image from the Dockerfile in the current directory and
# (re)start a container from it.
#
# Usage: ./build_and_run.sh [--wipe]
#   --wipe   Remove the existing data volume before starting Oracle.
#
# Environment:
#   ORACLE_PASSWORD  Passed to the container as ORACLE_PASSWORD
#                    (defaults to AdminPassword123).
#   ORACLE_PLATFORM  Optional Docker platform override, e.g. linux/amd64.
set -euo pipefail

IMAGE="oracle-scn-test"
CONTAINER="oracle-scn"
VOLUME="oracle_scn_data"
ORACLE_PASSWORD="${ORACLE_PASSWORD:-AdminPassword123}"
# APP_USER / APP_PASSWORD are only reported in the summary at the end; this
# script does not pass them to docker.
APP_USER="SOMEUSER"
APP_PASSWORD="cache"
WIPE=0

# Print command-line help to stdout (callers redirect to stderr on error).
usage() {
  echo "Usage: $0 [--wipe]"
  echo
  echo " --wipe Remove the existing $VOLUME Docker volume before starting Oracle."
  echo
  echo "Environment:"
  echo " ORACLE_PLATFORM=linux/amd64 Optional Docker platform override."
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --wipe)
      WIPE=1
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done

# Assemble the docker command lines once. The optional --platform flag is
# appended to both instead of duplicating each command in an if/else. The
# arrays are never empty, so expanding them is safe under `set -u` even on
# old bash versions.
build_args=(build)
run_args=(run -d --name "$CONTAINER")
if [[ -n "${ORACLE_PLATFORM:-}" ]]; then
  build_args+=(--platform="$ORACLE_PLATFORM")
  run_args+=(--platform="$ORACLE_PLATFORM")
fi
build_args+=(-t "$IMAGE" .)
run_args+=(
  -p 1521:1521
  --shm-size=1g
  -e ORACLE_PASSWORD="$ORACLE_PASSWORD"
  -v "$VOLUME":/opt/oracle/oradata
  "$IMAGE"
)

docker "${build_args[@]}"

# If the container already exists, remove it (the volume is kept unless --wipe
# was given). `docker container inspect` is used rather than
# `docker ps | grep -q`: under `pipefail`, grep -q exiting on its first match
# can kill `docker ps` with SIGPIPE, which would make the check report
# "not found" for a container that does exist.
if docker container inspect "$CONTAINER" >/dev/null 2>&1; then
  docker rm -f "$CONTAINER"
fi

if [[ "$WIPE" == "1" ]] && docker volume inspect "$VOLUME" >/dev/null 2>&1; then
  docker volume rm "$VOLUME"
fi

docker "${run_args[@]}"

echo "Oracle container started:"
echo " name: $CONTAINER"
echo " image: $IMAGE"
echo " port: 1521"
echo " service: FREEPDB1"
echo " volume: $VOLUME"
echo " platform: ${ORACLE_PLATFORM:-native}"
echo " user: $APP_USER"
echo " pass: $APP_PASSWORD"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
-- Executed as SYS (per image behavior); switch into the FREEPDB1 container
-- before creating anything.
ALTER SESSION SET CONTAINER = FREEPDB1;

-- Application account used by the SCN-based cache tests.
CREATE USER SOMEUSER
  IDENTIFIED BY cache
  DEFAULT TABLESPACE USERS
  QUOTA UNLIMITED ON USERS;

-- Login privilege plus the schema-object creation privileges.
GRANT CREATE SESSION TO SOMEUSER;
GRANT
  CREATE TABLE,
  CREATE SEQUENCE,
  CREATE PROCEDURE,
  CREATE VIEW,
  CREATE TRIGGER
TO SOMEUSER;

-- Read access to V$DATABASE, e.g. for: SELECT current_scn FROM v$database
GRANT SELECT ON V_$DATABASE TO SOMEUSER;
|