PyMkDB 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymkdb/__init__.py +6 -0
- pymkdb/cli.py +57 -0
- pymkdb-0.1.0.dist-info/METADATA +86 -0
- pymkdb-0.1.0.dist-info/RECORD +54 -0
- pymkdb-0.1.0.dist-info/WHEEL +5 -0
- pymkdb-0.1.0.dist-info/entry_points.txt +2 -0
- pymkdb-0.1.0.dist-info/top_level.txt +3 -0
- sdk/__init__.py +1 -0
- sdk/connection.py +225 -0
- sdk/delta.py +19 -0
- sdk/http_connection.py +180 -0
- sdk/mkdb_client.py +226 -0
- sdk/responses.py +154 -0
- src/__init__.py +1 -0
- src/config/db.py +227 -0
- src/config/server.py +52 -0
- src/db/__init__.py +207 -0
- src/db/cache/__init__.py +1 -0
- src/db/cache/ram_cache.py +144 -0
- src/db/cache/write_queue.py +156 -0
- src/db/maintenance/__init__.py +0 -0
- src/db/maintenance/compactor.py +118 -0
- src/db/maintenance/task_scheduler.py +73 -0
- src/db/objects/store.py +283 -0
- src/db/parity/__init__.py +0 -0
- src/db/parity/parity_manager.py +196 -0
- src/db/query/__init__.py +1 -0
- src/db/query/full_text_index.py +168 -0
- src/db/query/numeric_index.py +196 -0
- src/db/query/query_engine.py +308 -0
- src/db/query/tokenizer.py +48 -0
- src/db/query_workers/__init__.py +16 -0
- src/db/query_workers/dispatcher.py +339 -0
- src/db/query_workers/task.py +78 -0
- src/db/query_workers/worker.py +292 -0
- src/db/requesting/main.py +0 -0
- src/db/storage/__init__.py +1 -0
- src/db/storage/blob_store.py +47 -0
- src/db/storage/index_manager.py +92 -0
- src/db/storage/log_manager.py +119 -0
- src/db/storage/serializer.py +38 -0
- src/filing/__init__.py +31 -0
- src/objects/__init__.py +190 -0
- src/runtime/__init__.py +15 -0
- src/server/__init__.py +0 -0
- src/server/coms/actions.py +209 -0
- src/server/coms/http.py +46 -0
- src/server/coms/http_handlers.py +445 -0
- src/server/coms/metrics.py +231 -0
- src/server/coms/socket.py +461 -0
- src/server/coms/socket_protocol.py +54 -0
- src/server/control/api/actions.py +1001 -0
- src/server/control/server.py +404 -0
- src/server/event_log.py +58 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""
|
|
2
|
+
QueryWorker — runs inside a worker process (or thread) to resolve QueryTasks.
|
|
3
|
+
|
|
4
|
+
Each worker owns:
|
|
5
|
+
- A lightweight LRU + TTL RAM cache keyed by record_id.
|
|
6
|
+
- Read-only access to the store's disk files via the storage layer.
|
|
7
|
+
- A private invalidation queue so the dispatcher can evict stale entries.
|
|
8
|
+
|
|
9
|
+
Entry points
|
|
10
|
+
------------
|
|
11
|
+
worker_process_main — for multiprocessing.Process targets
|
|
12
|
+
worker_thread_main — for threading.Thread targets (single-worker / no-GIL mode)
|
|
13
|
+
|
|
14
|
+
Both share the same _worker_loop implementation; the only difference is how
|
|
15
|
+
sys.path is initialised (process needs it, thread already has it).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
import queue
|
|
22
|
+
import logging
|
|
23
|
+
from collections import OrderedDict
|
|
24
|
+
from typing import Any, Optional
|
|
25
|
+
|
|
26
|
+
# The sentinel object placed in the work queue to signal graceful shutdown.
|
|
27
|
+
WORKER_SENTINEL = None
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Minimal worker-local LRU + TTL cache
|
|
34
|
+
# (A full RamCache from WS-2 will replace this once implemented.)
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
class _WorkerCache:
|
|
38
|
+
"""
|
|
39
|
+
Lightweight LRU cache with per-entry TTL.
|
|
40
|
+
|
|
41
|
+
Operations are O(1) (dict + OrderedDict).
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(self, max_size: int, ttl: float):
|
|
45
|
+
self._max = max(1, max_size)
|
|
46
|
+
self._ttl = float(ttl)
|
|
47
|
+
self._data: dict[str, Any] = {}
|
|
48
|
+
self._order: OrderedDict[str, None] = OrderedDict()
|
|
49
|
+
self._ts: dict[str, float] = {}
|
|
50
|
+
|
|
51
|
+
# -- public API ----------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
def get(self, record_id: str) -> Optional[Any]:
|
|
54
|
+
if record_id not in self._data:
|
|
55
|
+
return None
|
|
56
|
+
if time.monotonic() - self._ts[record_id] > self._ttl:
|
|
57
|
+
self._evict_one(record_id)
|
|
58
|
+
return None
|
|
59
|
+
self._order.move_to_end(record_id)
|
|
60
|
+
return self._data[record_id]
|
|
61
|
+
|
|
62
|
+
def set(self, record_id: str, value: Any) -> None:
|
|
63
|
+
if record_id in self._data:
|
|
64
|
+
self._order.move_to_end(record_id)
|
|
65
|
+
else:
|
|
66
|
+
if len(self._data) >= self._max:
|
|
67
|
+
oldest, _ = self._order.popitem(last=False)
|
|
68
|
+
del self._data[oldest]
|
|
69
|
+
del self._ts[oldest]
|
|
70
|
+
self._order[record_id] = None
|
|
71
|
+
self._data[record_id] = value
|
|
72
|
+
self._ts[record_id] = time.monotonic()
|
|
73
|
+
|
|
74
|
+
def delete(self, record_id: str) -> None:
|
|
75
|
+
self._evict_one(record_id)
|
|
76
|
+
|
|
77
|
+
def clear(self) -> None:
|
|
78
|
+
self._data.clear()
|
|
79
|
+
self._order.clear()
|
|
80
|
+
self._ts.clear()
|
|
81
|
+
|
|
82
|
+
def __len__(self) -> int:
|
|
83
|
+
return len(self._data)
|
|
84
|
+
|
|
85
|
+
# -- internal ------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
def _evict_one(self, record_id: str) -> None:
|
|
88
|
+
self._data.pop(record_id, None)
|
|
89
|
+
self._order.pop(record_id, None)
|
|
90
|
+
self._ts.pop(record_id, None)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Task resolution — integration point for WS-1 and WS-3
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
def _resolve(task_dict: dict, cache: _WorkerCache, base_path: str) -> Any:
|
|
98
|
+
"""
|
|
99
|
+
Dispatch a task dict to the correct resolver.
|
|
100
|
+
|
|
101
|
+
Integration notes
|
|
102
|
+
-----------------
|
|
103
|
+
WS-1 (storage layer):
|
|
104
|
+
Replace the NotImplementedError blocks in _read and _exists with:
|
|
105
|
+
index_mgr = IndexManager(base_path, store_name)
|
|
106
|
+
log_mgr = LogManager(base_path, store_name)
|
|
107
|
+
raw_line = log_mgr.read(*index_mgr.get(record_id))
|
|
108
|
+
# parse flat line -> dict
|
|
109
|
+
|
|
110
|
+
WS-3 (query engine):
|
|
111
|
+
Replace the NotImplementedError blocks in _query and _count with:
|
|
112
|
+
engine = QueryEngine(store)
|
|
113
|
+
return engine.query(filter_dict)
|
|
114
|
+
"""
|
|
115
|
+
op = task_dict["operation"]
|
|
116
|
+
store_name = task_dict["store_name"]
|
|
117
|
+
params = task_dict["params"]
|
|
118
|
+
|
|
119
|
+
if op == "read":
|
|
120
|
+
return _read(store_name, params, cache, base_path)
|
|
121
|
+
if op == "multi_read":
|
|
122
|
+
return _multi_read(store_name, params, cache, base_path)
|
|
123
|
+
if op == "exists":
|
|
124
|
+
return _exists(store_name, params, cache, base_path)
|
|
125
|
+
if op == "query":
|
|
126
|
+
return _query(store_name, params, cache, base_path)
|
|
127
|
+
if op == "count":
|
|
128
|
+
return _count(store_name, params, cache, base_path)
|
|
129
|
+
raise ValueError(f"Unknown operation: {op!r}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _read(store_name: str, params: dict, cache: _WorkerCache, base_path: str) -> dict:
|
|
133
|
+
record_id = params.get("record_id", "")
|
|
134
|
+
if not record_id:
|
|
135
|
+
raise ValueError("record_id required for 'read'")
|
|
136
|
+
|
|
137
|
+
cached = cache.get(record_id)
|
|
138
|
+
if cached is not None:
|
|
139
|
+
return cached
|
|
140
|
+
|
|
141
|
+
# TODO(WS-1): seek by byte offset from IndexManager / LogManager
|
|
142
|
+
# from src.db.storage.index_manager import IndexManager
|
|
143
|
+
# from src.db.storage.log_manager import LogManager
|
|
144
|
+
# idx = IndexManager(base_path, store_name); idx.load()
|
|
145
|
+
# lmgr = LogManager(base_path, store_name)
|
|
146
|
+
# seg, offset, size = idx.get(record_id)
|
|
147
|
+
# raw = lmgr.read(seg, offset, size)
|
|
148
|
+
# result = _parse_flat_line(raw)
|
|
149
|
+
# cache.set(record_id, result)
|
|
150
|
+
# return result
|
|
151
|
+
raise NotImplementedError("Storage layer (WS-1) not yet implemented")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _multi_read(store_name: str, params: dict, cache: _WorkerCache, base_path: str) -> dict:
|
|
155
|
+
record_ids = params.get("record_ids", [])
|
|
156
|
+
results = {}
|
|
157
|
+
for rid in record_ids:
|
|
158
|
+
results[rid] = _read(store_name, {"record_id": rid}, cache, base_path)
|
|
159
|
+
return results
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _exists(store_name: str, params: dict, cache: _WorkerCache, base_path: str) -> bool:
|
|
163
|
+
record_id = params.get("record_id", "")
|
|
164
|
+
if not record_id:
|
|
165
|
+
raise ValueError("record_id required for 'exists'")
|
|
166
|
+
|
|
167
|
+
if cache.get(record_id) is not None:
|
|
168
|
+
return True
|
|
169
|
+
|
|
170
|
+
# TODO(WS-1): check IndexManager._map
|
|
171
|
+
raise NotImplementedError("Storage layer (WS-1) not yet implemented")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _query(store_name: str, params: dict, cache: _WorkerCache, base_path: str) -> list:
|
|
175
|
+
# TODO(WS-3): route params["filter"] through QueryEngine
|
|
176
|
+
raise NotImplementedError("Query engine (WS-3) not yet implemented")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _count(store_name: str, params: dict, cache: _WorkerCache, base_path: str) -> int:
|
|
180
|
+
# TODO(WS-3): route params["filter"] through QueryEngine, return len
|
|
181
|
+
raise NotImplementedError("Query engine (WS-3) not yet implemented")
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
# Core worker loop — shared by process and thread entry points
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
|
|
188
|
+
def _worker_loop(
    worker_id: int,
    store_name: str,
    base_path: str,
    work_queue,            # multiprocessing.Queue
    results_queue,         # multiprocessing.Queue
    invalidation_queue,    # multiprocessing.Queue (private to this worker)
    cache_max_size: int,
    cache_ttl: float,
    stop_event,            # multiprocessing.Event or threading.Event
) -> None:
    """
    Run the worker's main loop until stopped.

    Each iteration:
      1. drains ``invalidation_queue`` and evicts every listed record_id
         from the worker-local cache;
      2. waits up to 0.5 s for a task on ``work_queue``;
      3. resolves the task via ``_resolve`` and posts one dict to
         ``results_queue``: {"task_id", "status": "ok", "data"} on success
         or {"task_id", "status": "error", "error"} on failure.

    The loop ends when ``stop_event`` is set or when the shutdown sentinel
    (``WORKER_SENTINEL``) is received from ``work_queue``.

    Only ``queue.Empty`` is treated as the normal "nothing to do" signal.
    Any other failure while reading a queue is logged rather than silently
    discarded, and the loop keeps running.
    """
    log = logging.getLogger(f"QueryWorker[{store_name}#{worker_id}]")
    log.info("Worker started (pid=%s)", os.getpid())

    cache = _WorkerCache(cache_max_size, cache_ttl)

    while not stop_event.is_set():
        # 1. Drain the private invalidation queue to keep the cache
        #    consistent with writes that the dispatcher has broadcast.
        try:
            while True:
                cache.delete(invalidation_queue.get_nowait())
        except queue.Empty:
            pass  # fully drained — the expected way out of the loop above
        except Exception:
            # Best effort: keep serving, but make the failure visible.
            log.exception("Invalidation drain failed; cache may hold stale entries")

        # 2. Pull the next task (short timeout so stop_event is checked).
        try:
            task_dict = work_queue.get(timeout=0.5)
        except queue.Empty:
            continue  # timeout — loop back to check stop_event
        except Exception:
            log.exception("Reading from the work queue failed")
            # Pause before retrying so a persistently broken queue does not
            # turn into a busy loop; returns early if stop_event gets set.
            stop_event.wait(0.5)
            continue

        # Sentinel signals graceful shutdown.
        if task_dict is WORKER_SENTINEL:
            log.info("Received shutdown sentinel")
            break

        # A malformed (non-dict) payload must not take the worker down; it
        # is reported as an error result like any other failed task.
        is_mapping = isinstance(task_dict, dict)
        task_id = task_dict.get("task_id", "?") if is_mapping else "?"
        operation = task_dict.get("operation") if is_mapping else None
        log.debug("Handling task %s op=%s", task_id, operation)

        # 3. Resolve and post the result.
        try:
            if not is_mapping:
                raise TypeError(
                    f"Task payload must be a dict, got {type(task_dict).__name__}"
                )
            data = _resolve(task_dict, cache, base_path)
            results_queue.put({
                "task_id": task_id,
                "status": "ok",
                "data": data,
            })
        except Exception as exc:
            log.warning("Task %s failed: %s", task_id, exc)
            results_queue.put({
                "task_id": task_id,
                "status": "error",
                "error": str(exc),
            })

    log.info("Worker exiting (pid=%s)", os.getpid())
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ---------------------------------------------------------------------------
|
|
248
|
+
# Entry points
|
|
249
|
+
# ---------------------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
def worker_process_main(
    worker_id: int,
    project_root: str,     # added to sys.path so imports resolve
    store_name: str,
    base_path: str,
    work_queue,
    results_queue,
    invalidation_queue,
    cache_max_size: int,
    cache_ttl: float,
    stop_event,
) -> None:
    """
    Entry point for multiprocessing.Process workers.

    Puts ``project_root`` at the front of sys.path (unless already listed)
    so project imports resolve inside the child process, configures root
    logging at INFO level, then runs the shared worker loop.
    """
    already_importable = project_root in sys.path
    if not already_importable:
        sys.path.insert(0, project_root)

    logging.basicConfig(level=logging.INFO)
    _worker_loop(
        worker_id,
        store_name,
        base_path,
        work_queue,
        results_queue,
        invalidation_queue,
        cache_max_size,
        cache_ttl,
        stop_event,
    )
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def worker_thread_main(
    worker_id: int,
    store_name: str,
    base_path: str,
    work_queue,
    results_queue,
    invalidation_queue,
    cache_max_size: int,
    cache_ttl: float,
    stop_event,
) -> None:
    """
    Entry point for threading.Thread workers (parallel_enabled=False).

    Forwards every argument unchanged to the shared worker loop; unlike the
    process entry point it touches neither sys.path nor logging setup.
    """
    _worker_loop(
        worker_id,
        store_name,
        base_path,
        work_queue,
        results_queue,
        invalidation_queue,
        cache_max_size,
        cache_ttl,
        stop_event,
    )
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BlobStore — stores oversized record values as individual binary/text files.
|
|
3
|
+
|
|
4
|
+
Blob files live at: {store_path}/blobs/{record_id}.dat
|
|
5
|
+
The pseudo-segment string returned is "blobs/{record_id}.dat", which
|
|
6
|
+
LogManager.read() recognises and handles by reading the file directly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_blob(segment_str: str) -> bool:
    """Tell whether a segment string is a blob pseudo-segment ("blobs/...")."""
    blob_prefix = "blobs/"
    return segment_str.startswith(blob_prefix)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def write_blob(store_path: str, record_id: str, data_str: str) -> tuple:
    """
    Persist ``data_str`` as UTF-8 in {store_path}/blobs/{safe_id}.dat.

    ``safe_id`` is ``record_id`` with "/", "\\" and ".." each replaced by
    "_", so the file always lands directly inside the blobs directory
    (which is created on demand).

    Returns (path_str, 0, byte_size), where path_str is the pseudo-segment
    string "blobs/{safe_id}.dat" and byte_size is the encoded length.
    """
    target_dir = os.path.join(store_path, "blobs")
    os.makedirs(target_dir, exist_ok=True)

    # Security: neutralise path separators and parent-directory markers.
    safe_id = record_id
    for unsafe in ("/", "\\", ".."):
        safe_id = safe_id.replace(unsafe, "_")

    file_name = f"{safe_id}.dat"
    payload = data_str.encode("utf-8")
    with open(os.path.join(target_dir, file_name), "wb") as out:
        out.write(payload)
    return (f"blobs/{file_name}", 0, len(payload))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def read_blob(store_path: str, path_str: str) -> str:
    """
    Load a blob file and return its content decoded as UTF-8.

    ``path_str`` is the pseudo-segment string "blobs/{record_id}.dat",
    resolved relative to ``store_path``.

    Raises
    ------
    ValueError
        If the normalised path does not stay within {store_path}/blobs.
    """
    root = os.path.normpath(os.path.join(store_path, "blobs"))
    candidate = os.path.normpath(os.path.join(store_path, path_str))

    # Security: the target must be the blobs directory or live beneath it.
    inside = candidate == root or candidate.startswith(root + os.sep)
    if not inside:
        raise ValueError(f"Blob path traversal rejected: {path_str!r}")

    with open(candidate, "rb") as fh:
        raw = fh.read()
    return raw.decode("utf-8")
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
IndexManager — manages the primary index for one store.
|
|
3
|
+
|
|
4
|
+
Index file: {store_path}/{service}.idx
|
|
5
|
+
Line format: {record_id}:{segment_seq}:{byte_offset}:{byte_size}
|
|
6
|
+
Tombstone: !{record_id}:{segment_seq}:{byte_offset}:{byte_size}
|
|
7
|
+
|
|
8
|
+
Tombstoned entries are excluded from _map at load time; the physical .idx
|
|
9
|
+
file is only fully rewritten during compaction via save_full().
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class IndexManager:
    """
    Primary index for one store, backed by an append-only ``.idx`` file.

    Line format: {record_id}:{segment_seq}:{byte_offset}:{byte_size}
    Tombstone:   !{record_id}:{segment_seq}:{byte_offset}:{byte_size}

    The in-memory ``_map`` maps record_id -> (segment_seq, offset, size).
    ``set`` and ``delete`` append to the file; ``save_full`` rewrites it
    from ``_map``.
    """

    def __init__(self, store_path: str, service: str):
        """Bind to {store_path}/{service}.idx and load it if it exists."""
        self.index_path = os.path.join(store_path, f"{service}.idx")
        self._map: dict = {}   # record_id -> (segment_seq_str, offset, size)
        self._dirty: bool = False
        self.load()

    # ------------------------------------------------------------------
    # Load / save
    # ------------------------------------------------------------------

    def load(self) -> None:
        """
        Parse the .idx file into _map, skipping tombstoned entries.

        Lines are applied in file order, so a later entry or tombstone for
        a record overrides an earlier one. Blank lines, lines with fewer
        than four fields, and lines whose offset or size is not an integer
        (for example a line torn by an interrupted append) are ignored, so
        one damaged line cannot make the whole index unloadable.
        A missing index file leaves _map untouched.
        """
        try:
            fh = open(self.index_path, "r", encoding="utf-8")
        except FileNotFoundError:
            return
        with fh:
            for raw in fh:
                line = raw.strip()
                if not line:
                    continue
                if line.startswith("!"):
                    # Tombstone — ensure the record is absent from the map.
                    record_id = line[1:].split(":", 1)[0]
                    self._map.pop(record_id, None)
                    continue
                parts = line.split(":")
                if len(parts) < 4:
                    continue
                # The first field is the record id and the last two are
                # offset and size; everything in between is the segment.
                try:
                    offset = int(parts[-2])
                    size = int(parts[-1])
                except ValueError:
                    continue  # torn or corrupt line — skip like a short one
                segment_seq = ":".join(parts[1:-2])
                self._map[parts[0]] = (segment_seq, offset, size)

    def save_full(self) -> None:
        """Rewrite the entire .idx file from _map (used after compaction)."""
        # Write to a sibling temp file, then swap it in with os.replace so
        # the index is never left half-written.
        tmp_path = self.index_path + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as fh:
            fh.writelines(
                f"{record_id}:{seg}:{offset}:{size}\n"
                for record_id, (seg, offset, size) in self._map.items()
            )
        os.replace(tmp_path, self.index_path)
        self._dirty = False

    # ------------------------------------------------------------------
    # CRUD
    # ------------------------------------------------------------------

    def get(self, record_id: str):
        """Return (segment_seq, offset, size) or None."""
        return self._map.get(record_id)

    def set(self, record_id: str, segment_seq: str, offset: int, size: int) -> None:
        """Update _map and append the new entry to the .idx file."""
        self._map[record_id] = (segment_seq, offset, size)
        with open(self.index_path, "a", encoding="utf-8") as fh:
            fh.write(f"{record_id}:{segment_seq}:{offset}:{size}\n")
        self._dirty = True

    def delete(self, record_id: str) -> None:
        """Remove from _map and write a tombstone; unknown ids are a no-op."""
        entry = self._map.pop(record_id, None)
        if entry is None:
            return
        seg, offset, size = entry
        with open(self.index_path, "a", encoding="utf-8") as fh:
            fh.write(f"!{record_id}:{seg}:{offset}:{size}\n")
        self._dirty = True

    def records_in_segment(self, seq_str: str) -> list:
        """Return all record IDs whose current segment matches seq_str."""
        return [rid for rid, (seg, _, _) in self._map.items() if seg == seq_str]

    def all_record_ids(self) -> list:
        """Return all live record IDs."""
        return list(self._map)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LogManager — append-only rolling log file manager for one store.
|
|
3
|
+
|
|
4
|
+
Segment files: {store_path}/{service}_{NNN}.log (NNN = zero-padded 3-digit int)
|
|
5
|
+
On init, scans store_path for existing segments and opens the highest one
|
|
6
|
+
in append mode. When a segment exceeds segment_threshold bytes it is closed
|
|
7
|
+
and a new one is started (_rollover).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LogManager:
    """
    Append-only rolling log for one store.

    Segment files are named {store_path}/{service}_{NNN}.log, where NNN is
    the segment number zero-padded to at least three digits. The highest
    existing segment is opened for appending on construction; once the
    active segment reaches ``segment_threshold`` bytes, it is closed and
    the next one is started.
    """

    def __init__(self, store_path: str, service: str, segment_threshold: int):
        """
        Parameters
        ----------
        store_path        : absolute path to the store directory
        service           : store name (used as the filename prefix)
        segment_threshold : byte size at which a new segment is opened
        """
        self.store_path = store_path
        self.service = service
        self.segment_threshold = segment_threshold

        self.active_segment: int = self._find_highest_segment()
        seg_path = self.segment_path(self.active_segment)
        self.active_fh = open(seg_path, "ab")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _segment_pattern(self):
        """Return the compiled regex that recognises this store's segments."""
        # Three digits or more: segment_path() pads to width 3 but emits
        # wider names from segment 1000 on, and those must still be found.
        return re.compile(rf"^{re.escape(self.service)}_(\d{{3,}})\.log$")

    def _find_highest_segment(self) -> int:
        """
        Return the highest existing segment number.

        When no segment exists yet, segment 1 is created empty and 1 is
        returned.
        """
        existing = self.list_segments()
        if existing:
            return existing[-1]
        open(self.segment_path(1), "ab").close()
        return 1

    def segment_path(self, seq_int: int) -> str:
        """Return the absolute path for segment number seq_int."""
        return os.path.join(self.store_path, f"{self.service}_{seq_int:03d}.log")

    def _rollover(self) -> None:
        """Close the current segment and open a new one."""
        self.active_fh.flush()
        self.active_fh.close()
        self.active_segment += 1
        self.active_fh = open(self.segment_path(self.active_segment), "ab")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def append(self, record_id: str, flat_line_str: str) -> tuple:
        """
        Write one record line to the active segment.

        Parameters
        ----------
        record_id     : the record's ID (not used when writing the line)
        flat_line_str : already-serialized line string (no trailing newline)

        Returns
        -------
        (segment_seq_str, byte_offset, byte_size) describing where the line
        was written; byte_size includes the trailing newline.
        """
        encoded = (flat_line_str + "\n").encode("utf-8")
        offset = self.active_fh.tell()
        self.active_fh.write(encoded)
        self.active_fh.flush()
        # Capture the segment name before a possible rollover so the
        # returned location points at the segment that holds the line.
        seg_str = f"{self.active_segment:03d}"
        if self.active_fh.tell() >= self.segment_threshold:
            self._rollover()
        return (seg_str, offset, len(encoded))

    def read(self, segment_seq_str: str, offset: int, size: int) -> str:
        """
        Read one stored record.

        For a regular segment, exactly ``size`` bytes are read at
        ``offset`` (no readline, to avoid partial reads) and returned
        decoded with trailing newlines stripped.

        For a blob entry (``segment_seq_str`` starts with "blobs/"), the
        whole blob file is returned; ``offset`` and ``size`` are ignored.
        Blobs are read as bytes and decoded, so their content comes back
        exactly as written, without newline translation.

        Raises
        ------
        ValueError
            If a blob path would resolve outside {store_path}/blobs, or if
            a non-blob ``segment_seq_str`` is not an integer.
        """
        if segment_seq_str.startswith("blobs/"):
            blobs_dir = os.path.normpath(os.path.join(self.store_path, "blobs"))
            blob_path = os.path.normpath(
                os.path.join(self.store_path, segment_seq_str)
            )
            # Security: refuse anything that escapes the blobs directory.
            if not blob_path.startswith(blobs_dir + os.sep):
                raise ValueError(
                    f"Blob path traversal rejected: {segment_seq_str!r}"
                )
            with open(blob_path, "rb") as fh:
                return fh.read().decode("utf-8")

        path = self.segment_path(int(segment_seq_str))
        with open(path, "rb") as fh:
            fh.seek(offset)
            return fh.read(size).decode("utf-8").rstrip("\n")

    def list_segments(self) -> list:
        """Return all segment numbers found on disk, sorted ascending."""
        pattern = self._segment_pattern()
        matches = (pattern.match(name) for name in os.listdir(self.store_path))
        return sorted(int(m.group(1)) for m in matches if m)

    def close(self) -> None:
        """Flush and close the active file handle."""
        if self.active_fh and not self.active_fh.closed:
            self.active_fh.flush()
            self.active_fh.close()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Record serialization helpers for .log file lines.
|
|
3
|
+
|
|
4
|
+
Wire format: {record_id} {json_object}
|
|
5
|
+
One line per record: record_id (no spaces) followed by a single space,
|
|
6
|
+
then a compact JSON object. Native JSON types (str, int, float, bool, list,
|
|
7
|
+
dict, None) are preserved exactly — no custom encoding required.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def serialize_record(record_id: str, flat_dict: dict) -> str:
    """
    Build one log line (without a trailing newline) for a record.

    The line is the record id, a single space, then the JSON encoding of
    ``flat_dict`` with non-ASCII characters kept as-is.
    Example: abc123 {"name": "Gizmo", "price": 19.99, "in_stock": true}
    """
    payload = json.dumps(flat_dict, ensure_ascii=False)
    return f"{record_id} {payload}"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def deserialize_record(line: str) -> tuple:
    """
    Split a log line into (record_id, decoded_json).

    Surrounding whitespace is stripped first. The text before the first
    space is the record id; the remainder is parsed as JSON.
    Returns (None, {}) when the line is empty, contains no space, or the
    remainder is not valid JSON.
    """
    malformed = (None, {})

    record_id, separator, payload = line.strip().partition(" ")
    if not separator:
        # Covers both the empty line and a line with no space at all.
        return malformed

    try:
        return (record_id, json.loads(payload))
    except json.JSONDecodeError:
        return malformed
|
src/filing/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
def read_file(path):
    """Return the entire content of the UTF-8 text file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text
|
|
6
|
+
|
|
7
|
+
def write_file(path, content):
    """Replace the file at ``path`` with ``content``, encoded as UTF-8."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(content)
|
|
10
|
+
|
|
11
|
+
def append_file(path, content):
    """Add ``content`` (UTF-8) to the end of ``path``, creating it if needed."""
    with open(path, mode='a', encoding='utf-8') as sink:
        sink.write(content)
|
|
14
|
+
|
|
15
|
+
def delete_file(path):
    """Remove the file at ``path``; errors from os.remove propagate."""
    import os

    os.remove(path)
|
|
18
|
+
|
|
19
|
+
def file_exists(path):
    """Return True when ``path`` exists (file or directory), else False."""
    import os

    found = os.path.exists(path)
    return found
|
|
22
|
+
|
|
23
|
+
def list_files(directory):
    """Return the entry names inside ``directory``, as given by os.listdir."""
    import os

    entries = os.listdir(directory)
    return entries
|
|
26
|
+
|
|
27
|
+
def read_json(path):
    """Read the UTF-8 file at ``path`` and return its content parsed as JSON."""
    with open(path, 'r', encoding='utf-8') as source:
        text = source.read()
    return json.loads(text)
|
|
29
|
+
|
|
30
|
+
def write_json(path, data):
    """Write ``data`` to ``path`` as JSON indented by four spaces (UTF-8 file)."""
    # Serialise before opening so a failed dump leaves an existing file intact.
    text = json.dumps(data, indent=4)
    with open(path, 'w', encoding='utf-8') as sink:
        sink.write(text)
|