PyMkDB 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. pymkdb/__init__.py +6 -0
  2. pymkdb/cli.py +57 -0
  3. pymkdb-0.1.0.dist-info/METADATA +86 -0
  4. pymkdb-0.1.0.dist-info/RECORD +54 -0
  5. pymkdb-0.1.0.dist-info/WHEEL +5 -0
  6. pymkdb-0.1.0.dist-info/entry_points.txt +2 -0
  7. pymkdb-0.1.0.dist-info/top_level.txt +3 -0
  8. sdk/__init__.py +1 -0
  9. sdk/connection.py +225 -0
  10. sdk/delta.py +19 -0
  11. sdk/http_connection.py +180 -0
  12. sdk/mkdb_client.py +226 -0
  13. sdk/responses.py +154 -0
  14. src/__init__.py +1 -0
  15. src/config/db.py +227 -0
  16. src/config/server.py +52 -0
  17. src/db/__init__.py +207 -0
  18. src/db/cache/__init__.py +1 -0
  19. src/db/cache/ram_cache.py +144 -0
  20. src/db/cache/write_queue.py +156 -0
  21. src/db/maintenance/__init__.py +0 -0
  22. src/db/maintenance/compactor.py +118 -0
  23. src/db/maintenance/task_scheduler.py +73 -0
  24. src/db/objects/store.py +283 -0
  25. src/db/parity/__init__.py +0 -0
  26. src/db/parity/parity_manager.py +196 -0
  27. src/db/query/__init__.py +1 -0
  28. src/db/query/full_text_index.py +168 -0
  29. src/db/query/numeric_index.py +196 -0
  30. src/db/query/query_engine.py +308 -0
  31. src/db/query/tokenizer.py +48 -0
  32. src/db/query_workers/__init__.py +16 -0
  33. src/db/query_workers/dispatcher.py +339 -0
  34. src/db/query_workers/task.py +78 -0
  35. src/db/query_workers/worker.py +292 -0
  36. src/db/requesting/main.py +0 -0
  37. src/db/storage/__init__.py +1 -0
  38. src/db/storage/blob_store.py +47 -0
  39. src/db/storage/index_manager.py +92 -0
  40. src/db/storage/log_manager.py +119 -0
  41. src/db/storage/serializer.py +38 -0
  42. src/filing/__init__.py +31 -0
  43. src/objects/__init__.py +190 -0
  44. src/runtime/__init__.py +15 -0
  45. src/server/__init__.py +0 -0
  46. src/server/coms/actions.py +209 -0
  47. src/server/coms/http.py +46 -0
  48. src/server/coms/http_handlers.py +445 -0
  49. src/server/coms/metrics.py +231 -0
  50. src/server/coms/socket.py +461 -0
  51. src/server/coms/socket_protocol.py +54 -0
  52. src/server/control/api/actions.py +1001 -0
  53. src/server/control/server.py +404 -0
  54. src/server/event_log.py +58 -0
src/db/__init__.py ADDED
@@ -0,0 +1,207 @@
1
+ import os
2
+ import time
3
+ from typing import Union
4
+
5
+ from colorama import Fore
6
+ from src.config.db import mkdb_config
7
+ from src.server.coms.http import HTTPServer
8
+
9
+
10
class mkdb:
    """
    Top-level database object.

    Holds the validated configuration, the store instances created from
    ``config.stores`` and the optional "http", "socket" and "control"
    server instances.
    """

    # Server names, in the order they are started and stopped.
    _SERVER_NAMES = ("http", "socket", "control")

    def __init__(self, config: Union[dict, mkdb_config]):
        """
        Build the database from ``config`` and run ``setup()`` immediately.

        A plain dict (or dict subclass) is wrapped in ``mkdb_config``.
        Raises whatever ``setup()`` raises when the configuration is invalid.
        """
        # Imported only so the "store" forward reference below can be resolved
        # by type checkers; the name is not used at runtime in this method.
        from src.db.objects.store import store  # noqa: F401

        if isinstance(config, dict):
            config = mkdb_config(config)
        self.config: mkdb_config = config  # type: ignore
        self.stores: dict[str, "store"] = {}
        self._servers: dict = {}  # "http" | "socket" | "control" -> instance
        self._started_at: float = 0.0  # unix timestamp of when run() was called

        self.setup()

    @property
    def file_path(self):
        """Base directory of the database (``config.base_path``)."""
        return self.config.base_path

    def __verify__(self, config: Union[mkdb_config, None] = None):
        """
        Verify the database configuration and setup.

        Checks ``config`` (default: ``self.config``) and raises ``ValueError``
        when the name or base path is empty, or when the socket / HTTP server
        is enabled without a host or with port 0.
        """
        if config is None:
            config = self.config

        empty = (None, "")
        if config.name in empty:
            raise ValueError("Database name cannot be empty.")
        if config.base_path in empty:
            raise ValueError("Base path cannot be empty.")

        # Same two checks for both data servers; socket is checked first.
        for label, attr in (("Socket", "socket_server"), ("HTTP", "http_server")):
            server = getattr(config.servers, attr)
            if not server.enabled:
                continue
            if server.address.host in empty:
                raise ValueError(f"{label} server enabled but no host specified.")
            if server.address.port == 0:
                raise ValueError(f"{label} server enabled but no port specified.")

    def setup(self):
        """
        Validate the configuration, then create or refresh every configured store.

        Ensures ``<base_path>/stores`` exists. Stores not yet known are
        constructed; known stores get their config updated from the new
        store config. Validation errors are printed and re-raised.
        """
        try:
            self.__verify__()
        except Exception as e:
            print(f"Error verifying database configuration: {Fore.RED}{e}{Fore.RESET}")
            raise

        from src.db.objects.store import store

        os.makedirs(os.path.join(self.file_path, "stores"), exist_ok=True)
        for store_name, store_config in self.config.stores.items():
            if store_name not in self.stores:
                s = store(self, store_config)
                self.stores[store_name] = s
            else:
                s = self.stores[store_name]
                s.config.update(store_config.json)
            # NOTE(review): indentation was lost in the reviewed copy of this
            # file; this assumes setup() runs for new and existing stores
            # alike - confirm against the original source.
            s.setup()

    def run(self):
        """
        Start every enabled server and block until Ctrl-C, then shut down.

        Records the start time used by ``get_uptime()``.
        """
        print(f"{Fore.GREEN}Database '{self.config.name}' initialized at {os.getcwd()}{Fore.RESET}")

        print("Starting servers...")
        self._started_at = time.time()
        # start_server() honours each server's `enabled` flag and skips
        # servers that are already running.
        for name in self._SERVER_NAMES:
            self.start_server(name)

        print(f"MkDB {Fore.CYAN}'{self.config.name}'{Fore.RESET} is running at {Fore.GREEN}{os.getcwd()}{Fore.RESET}")

        # Keep the calling thread parked until the user interrupts.
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            print(f"{Fore.YELLOW}Shutting down MkDB '{self.config.name}'...{Fore.RESET}")
        self.shutdown()

    def get_server_status(self) -> dict:
        """Return running/stopped status for all three servers."""
        return {name: (self._servers.get(name) is not None) for name in self._SERVER_NAMES}

    def get_uptime(self) -> float:
        """Return seconds since run() was called, or 0 if not yet started."""
        return (time.time() - self._started_at) if self._started_at else 0.0

    def stop_server(self, name: str) -> None:
        """
        Stop a named server and clear its reference.

        Uses the server's ``stop()`` when it has one, otherwise its
        ``shutdown()``. Errors from ``stop()`` propagate and leave the
        reference in place; errors from ``shutdown()`` are reported and
        the reference is cleared anyway.
        """
        srv = self._servers.get(name)
        if srv is None:
            return
        if hasattr(srv, "stop"):
            srv.stop()
        elif hasattr(srv, "shutdown"):
            try:
                srv.shutdown()
            except Exception as e:
                # Best effort: report instead of failing the stop request.
                print(f"Error shutting down '{name}' server: {Fore.RED}{e}{Fore.RESET}")
        self._servers[name] = None

    def start_server(self, name: str) -> None:
        """Start a named server using current config. No-op if already running."""
        if self._servers.get(name) is not None:
            return
        cfg = self.config.servers
        if name == "http":
            if not cfg.http_server.enabled:
                return
            from src.server.coms.http import HTTPServer
            from src.server.coms.http_handlers import HTTPDataHandler
            # The handler class reaches the database through this class attribute.
            HTTPDataHandler.database = self
            self._servers["http"] = HTTPServer(
                name="Communication-HTTP",
                host=cfg.http_server.address.host,
                port=cfg.http_server.address.port,
                responder=HTTPDataHandler,
            )
        elif name == "socket":
            if not cfg.socket_server.enabled:
                return
            from src.server.coms.socket import SocketServer
            srv = SocketServer(
                host=cfg.socket_server.address.host,
                port=cfg.socket_server.address.port,
                database=self,
                heartbeat_interval=cfg.socket_server.heartbeat_interval,
                max_clients=cfg.socket_server.max_clients,
                recv_timeout=cfg.socket_server.recv_timeout,
            )
            srv.start()
            self._servers["socket"] = srv
        elif name == "control":
            if not cfg.control_server.enabled:
                return
            from src.server.control.server import start_control_server
            self._servers["control"] = start_control_server(
                host=cfg.control_server.address.host,
                port=cfg.control_server.address.port,
                database=self,
            )

    def restart_server(self, name: str) -> None:
        """Stop then start a named server."""
        self.stop_server(name)
        self.start_server(name)

    def shutdown(self) -> None:
        """
        Stop all servers and tear down all stores.

        Every step is best effort: a failure is printed and the remaining
        servers and stores are still processed.
        """
        for name in self._SERVER_NAMES:
            try:
                self.stop_server(name)
            except Exception as e:
                # One failing server must not prevent the stores from being torn down.
                print(f"Error stopping '{name}' server: {Fore.RED}{e}{Fore.RESET}")
        for store_name, s in self.stores.items():
            try:
                s.teardown()
            except Exception as e:
                print(f"Error tearing down store '{store_name}': {Fore.RED}{e}{Fore.RESET}")
        self.stores.clear()
        print(f"{Fore.YELLOW}MkDB '{self.config.name}' shut down.{Fore.RESET}")

    def update_from_config(self, new_config: Union[dict, mkdb_config]):
        """
        Replace the configuration after validating it, then re-run ``setup()``.

        An invalid configuration is reported and ignored; the current
        configuration stays in effect.
        """
        if isinstance(new_config, dict):
            new_config = mkdb_config(new_config)
        try:
            self.__verify__(new_config)  # type: ignore
        except Exception as e:
            print(f"Error updating database configuration: {Fore.RED}{e}{Fore.RESET}")
            return

        self.config = new_config  # type: ignore
        print(f"{Fore.GREEN}Configuration updated successfully.{Fore.RESET}")
        self.setup()
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,144 @@
1
+ """
2
+ RamCache — in-process record cache for one store.
3
+
4
+ Supports three eviction strategies controlled by ram_config.clean_type:
5
+ "lru" — evict least-recently-used entry
6
+ "lfu" — evict least-frequently-used entry
7
+ "ttl" — evict only entries whose TTL has expired (no count-based eviction)
8
+
9
+ All public methods are thread-safe via a single threading.Lock.
10
+ """
11
+
12
+ import threading
13
+ import time
14
+ from collections import OrderedDict
15
+
16
+
17
class RamCache:
    """
    In-process record cache with a size limit and optional idle expiry.

    ``clean_type`` selects what ``evict()`` removes when the cache is over
    ``max_size``:
        "lru" - the least-recently-used entry
        "lfu" - the least-frequently-used entry
        "ttl" - every expired entry, then the least-recently-used entry
                if the cache is still over ``max_size``
    Any other value behaves like "lru".

    Timestamps are refreshed on every access, so ``ttl`` is the number of
    seconds an entry may stay untouched (0 = never expire).
    """

    def __init__(self, max_size: int, ttl: int, clean_type: str = "lru"):
        self.max_size = max_size
        self.ttl = ttl  # seconds; 0 = never expire
        self.clean_type = clean_type

        self._store: dict = {}  # record_id -> flat dict
        self._access_log: OrderedDict = OrderedDict()  # record_id -> None (LRU order)
        self._freq: dict = {}  # record_id -> access count (LFU)
        self._timestamps: dict = {}  # record_id -> last_access float
        self._lock: threading.Lock = threading.Lock()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(self, record_id: str):
        """Return the cached flat dict, or None on miss / expired entry."""
        with self._lock:
            if record_id not in self._store:
                return None
            if self._is_expired(record_id):
                self._evict_one(record_id)
                return None
            self._touch(record_id)
            return dict(self._store[record_id])  # return a shallow copy

    def set(self, record_id: str, flat_dict: dict) -> None:
        """Insert or overwrite an entry, evicting if over max_size."""
        with self._lock:
            is_new = record_id not in self._store
            self._store[record_id] = dict(flat_dict)
            if is_new:
                self._register(record_id)
            else:
                self._touch(record_id)
            self._enforce_limit()

    def delete(self, record_id: str) -> None:
        """Remove an entry from the cache."""
        with self._lock:
            self._evict_one(record_id)

    def size(self) -> int:
        """Return the current number of cached entries."""
        with self._lock:
            return len(self._store)

    def estimated_bytes(self) -> int:
        """
        Rough estimate of memory used by cached data.

        Adds up ``sys.getsizeof`` of every key and value of every cached
        flat dict, plus the top-level mapping itself. ``getsizeof`` is
        shallow, so nested containers are under-counted; this is meant for
        dashboard display, not as a true malloc measurement.
        """
        import sys
        with self._lock:
            total = sys.getsizeof(self._store)
            for record in self._store.values():
                total += sum(
                    sys.getsizeof(k) + sys.getsizeof(v) for k, v in record.items()
                )
            return total

    def apply_delta(self, record_id: str, delta: dict) -> None:
        """
        Merge delta flat-paths into the existing cached object (upsert).
        If the record is not cached, create a new entry with just the delta.
        The size limit is enforced afterwards, exactly as in set().
        """
        with self._lock:
            if record_id in self._store:
                self._store[record_id].update(delta)
                self._touch(record_id)
            else:
                self._store[record_id] = dict(delta)
                self._register(record_id)
            self._enforce_limit()

    def evict(self) -> None:
        """
        Evict according to the configured strategy.
        Called with _lock already held.
        """
        if not self._store:
            return

        if self.clean_type == "lfu":
            lfu_id = min(self._freq, key=lambda k: self._freq[k])
            self._evict_one(lfu_id)
            return

        if self.clean_type == "ttl":
            # Drop everything that has expired first.
            now = time.time()
            expired = [rid for rid, ts in self._timestamps.items()
                       if self.ttl > 0 and (now - ts) > self.ttl]
            for rid in expired:
                self._evict_one(rid)
            if len(self._store) <= self.max_size:
                return
            # Nothing (or not enough) expired: fall through to LRU.

        # "lru", the over-capacity "ttl" fallback, and any unrecognised
        # strategy all remove the least-recently-used entry, so callers
        # looping until the cache fits always make progress.
        if self._access_log:
            self._evict_one(next(iter(self._access_log)))

    # ------------------------------------------------------------------
    # Internal helpers (all called with _lock held)
    # ------------------------------------------------------------------

    def _register(self, record_id: str) -> None:
        """Initialise the bookkeeping of a freshly inserted entry."""
        self._access_log[record_id] = None
        self._freq[record_id] = 1
        self._timestamps[record_id] = time.time()

    def _enforce_limit(self) -> None:
        """Evict until the cache fits max_size or nothing more can be removed."""
        while len(self._store) > self.max_size:
            before = len(self._store)
            self.evict()
            if len(self._store) >= before:
                # No progress (e.g. empty cache with a negative max_size);
                # stop instead of spinning forever.
                break

    def _touch(self, record_id: str) -> None:
        """Update LRU order, frequency, and timestamp for an accessed entry."""
        self._access_log.move_to_end(record_id)
        self._freq[record_id] = self._freq.get(record_id, 0) + 1
        self._timestamps[record_id] = time.time()

    def _is_expired(self, record_id: str) -> bool:
        """Return True when ttl is enabled and the entry's last access is older than ttl."""
        if self.ttl <= 0:
            return False
        ts = self._timestamps.get(record_id, 0)
        return (time.time() - ts) > self.ttl

    def _evict_one(self, record_id: str) -> None:
        """Remove a single record_id from all internal structures."""
        self._store.pop(record_id, None)
        self._access_log.pop(record_id, None)
        self._freq.pop(record_id, None)
        self._timestamps.pop(record_id, None)
@@ -0,0 +1,156 @@
1
+ """
2
+ WriteQueue — debounced async write queue for one store.
3
+
4
+ Enqueued "write" operations are buffered and coalesced per record_id for
5
+ debounce_window seconds before being flushed to the registered handler.
6
+ Operations with op == "compact" or "rebuild_index" bypass debouncing and
7
+ are pushed directly to the queue.
8
+
9
+ A dead-letter list captures tasks whose handler raised an exception, capped
10
+ at 1000 entries (oldest dropped first).
11
+ """
12
+
13
+ import logging
14
+ import queue
15
+ import threading
16
+ import time
17
+ from typing import Callable
18
+
19
logger = logging.getLogger(__name__)

# Unique marker pushed onto the queue to tell the worker thread to exit.
WORKER_SENTINEL = object()


class WriteQueue:
    """
    Debounced write queue served by one background worker thread.

    "write" tasks are merged per record_id and handed to the registered
    handler once no new write for that record arrived for
    ``debounce_window`` seconds. Every other op is queued immediately.
    Tasks whose handler raised are kept in a dead-letter list.
    """

    # Maximum number of failed tasks kept for inspection (oldest dropped first).
    _DEAD_LETTER_CAP = 1000

    def __init__(self, debounce_window: float = 5.0, max_pending: int = 10_000):
        self.debounce_window = debounce_window
        self.max_pending = max_pending

        self._queue: queue.Queue = queue.Queue()
        self._pending: dict = {}  # record_id -> accumulated delta dict
        self._timers: dict = {}  # record_id -> timestamp of the latest enqueue
        self._handlers: dict = {}  # op_str -> Callable
        self._dead_letter: list = []  # failed tasks, capped at 1000
        self._stop_event: threading.Event = threading.Event()
        self._worker_thread: threading.Thread = threading.Thread(
            target=self._worker_loop, daemon=True, name="WriteQueue-worker"
        )
        self._lock: threading.Lock = threading.Lock()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Start the background worker thread."""
        self._worker_thread.start()

    def register_handler(self, op: str, fn: Callable) -> None:
        """Map an operation string to a handler function."""
        self._handlers[op] = fn

    def enqueue(self, task: dict) -> None:
        """
        Add a task to the queue.

        For "write" ops: coalesce delta into _pending[record_id] and reset timer.
        For all other ops: push directly to _queue.

        A "write" task without record_id is logged and dropped; a missing
        or None delta counts as an empty delta.

        Raises RuntimeError if _pending is at max_pending capacity.
        """
        if task.get("op") != "write":
            self._queue.put(task)
            return

        record_id = task.get("record_id")
        if record_id is None:
            logger.warning("WriteQueue: write task missing record_id, skipping")
            return
        delta = task.get("delta") or {}

        with self._lock:
            if record_id in self._pending:
                self._pending[record_id].update(delta)
            else:
                # Only brand-new records count against the capacity limit.
                if len(self._pending) >= self.max_pending:
                    raise RuntimeError(
                        f"WriteQueue pending buffer full ({self.max_pending} entries)"
                    )
                self._pending[record_id] = dict(delta)
            self._timers[record_id] = time.time()

    def flush_all(self) -> None:
        """
        Move every pending write onto the queue immediately, ignoring the
        debounce window (call before stop). Flushed tasks carry the flush
        time as "ts".
        """
        with self._lock:
            for record_id, delta in self._pending.items():
                self._queue.put({
                    "op": "write",
                    "record_id": record_id,
                    "delta": delta,
                    "ts": time.time(),
                })
            self._pending.clear()
            self._timers.clear()

    def stop(self) -> None:
        """
        Flush pending writes, signal stop, and join the worker thread (10 s timeout).

        The worker dispatches everything queued ahead of the sentinel before
        it exits, so flushed writes are not lost. Safe to call when the
        worker was never started.
        """
        self.flush_all()
        self._stop_event.set()
        self._queue.put(WORKER_SENTINEL)
        # Joining a thread that was never started raises RuntimeError.
        if self._worker_thread.is_alive():
            self._worker_thread.join(timeout=10)

    def get_dead_letters(self) -> list:
        """Return a copy of the dead-letter list for admin inspection."""
        return list(self._dead_letter)

    # ------------------------------------------------------------------
    # Worker loop (runs in daemon thread)
    # ------------------------------------------------------------------

    def _worker_loop(self) -> None:
        """Promote debounced writes and dispatch queued tasks until told to stop."""
        while True:
            # 1. Drain expired pending entries
            self._promote_due()

            # 2. Pull and dispatch one item
            try:
                task = self._queue.get(timeout=0.1)
            except queue.Empty:
                # Honour the stop flag only once the queue is drained, so
                # tasks queued before stop() are still dispatched.
                if self._stop_event.is_set():
                    break
                continue

            if task is WORKER_SENTINEL:
                break

            self._dispatch(task)

    def _promote_due(self) -> None:
        """Queue every pending write whose debounce window has elapsed."""
        now = time.time()
        with self._lock:
            due = [
                rid for rid, ts in self._timers.items()
                if (now - ts) >= self.debounce_window
            ]
            for record_id in due:
                self._queue.put({
                    "op": "write",
                    "record_id": record_id,
                    "delta": self._pending.pop(record_id),
                    "ts": self._timers.pop(record_id),
                })

    def _dispatch(self, task: dict) -> None:
        """Run the handler registered for the task's op; park the task on failure."""
        op = task.get("op")
        handler = self._handlers.get(op)
        if handler is None:
            logger.warning("WriteQueue: no handler for op=%r", op)
            return
        try:
            handler(task)
        except Exception as exc:
            logger.error(
                "WriteQueue: handler for op=%r record_id=%r raised: %s",
                op, task.get("record_id"), exc,
                exc_info=True,
            )
            self._dead_letter.append(task)
            if len(self._dead_letter) > self._DEAD_LETTER_CAP:
                self._dead_letter.pop(0)
File without changes
@@ -0,0 +1,118 @@
1
+ """
2
+ Compactor — rewrites log segments to reclaim space from deleted/overwritten records.
3
+
4
+ For each segment:
5
+ 1. Collect all active record IDs whose current segment == this segment.
6
+ 2. Re-read each record line and write it to a .tmp file, tracking new offsets.
7
+ 3. Atomically replace the .log with the .tmp via os.replace.
8
+ 4. Batch-update IndexManager with new offsets and call save_full().
9
+ """
10
+
11
+ import logging
12
+ import os
13
+
14
logger = logging.getLogger(__name__)


class Compactor:
    """Rewrites log segments of one store so they contain only live records."""

    def __init__(self, store):
        """
        Parameters
        ----------
        store : src.db.objects.store.store
            Its ``log_manager``, ``index_manager`` and ``config`` are used.
        """
        self._store = store

    # ------------------------------------------------------------------
    # Size helpers
    # ------------------------------------------------------------------

    def compute_logical_size(self, seq_str: str) -> int:
        """Sum of all stored entry sizes in index_manager._map for this segment."""
        idx = self._store.index_manager
        if idx is None:
            return 0
        return sum(
            size
            for _rid, (seg, _off, size) in idx._map.items()
            if seg == seq_str
        )

    def needs_compaction(self, seq_str: str, dead_ratio_threshold: float = None) -> bool:
        """
        Return True if the dead-record ratio exceeds the threshold.
        dead_ratio = (physical_size - logical_size) / physical_size

        When ``dead_ratio_threshold`` is None, the store's
        ``file_config.compaction_dead_ratio`` is used (0.3 if absent).
        Returns False when the store has no log manager or the segment
        file is missing or empty.
        """
        store = self._store
        if store.log_manager is None:
            return False
        # Default threshold from config
        if dead_ratio_threshold is None:
            dead_ratio_threshold = getattr(
                store.config.file_config, "compaction_dead_ratio", 0.3
            )
        path = store.log_manager.segment_path(int(seq_str))
        if not os.path.exists(path):
            return False
        physical_size = os.path.getsize(path)
        if physical_size == 0:
            return False
        logical_size = self.compute_logical_size(seq_str)
        ratio = (physical_size - logical_size) / physical_size
        return ratio > dead_ratio_threshold

    # ------------------------------------------------------------------
    # Compact
    # ------------------------------------------------------------------

    def compact_segment(self, seq_int: int) -> None:
        """
        Compact one log segment in place.
        Skips the active (currently-written) segment to avoid corruption.

        Live records are copied to ``<segment>.tmp``, which then replaces the
        segment via os.replace; the index is updated with the new offsets and
        saved. If any live record cannot be read, compaction is aborted and
        the segment and index are left untouched, so no record is dropped
        while the index still points at it.

        Raises RuntimeError when the store's storage is not initialised.
        """
        store = self._store
        if store.log_manager is None or store.index_manager is None:
            raise RuntimeError("Store storage not initialised.")

        # Never compact the active segment (it has an open file handle)
        if seq_int == store.log_manager.active_segment:
            logger.info("compact_segment: skipping active segment %03d", seq_int)
            return

        seq_str = f"{seq_int:03d}"
        active_ids = store.index_manager.records_in_segment(seq_str)
        if not active_ids:
            logger.info("compact_segment: segment %s has no live records, skipping", seq_str)
            return

        seg_path = store.log_manager.segment_path(seq_int)
        tmp_path = seg_path + ".tmp"

        new_offsets: dict = {}  # record_id -> (seq_str, new_offset, new_size)
        swapped = False
        try:
            # Binary mode: offsets and sizes must be exact byte counts. In text
            # mode tell() is an opaque cookie and newline translation could
            # make the file disagree with the recorded sizes.
            with open(tmp_path, "wb") as out_fh:
                position = 0
                for record_id in active_ids:
                    entry = store.index_manager.get(record_id)
                    if entry is None:
                        continue
                    seg, offset, size = entry
                    try:
                        line_str = store.log_manager.read(seg, offset, size)
                    except Exception as exc:
                        logger.warning("compact_segment: could not read %s: %s", record_id, exc)
                        logger.warning(
                            "compact_segment: segment %s left uncompacted", seq_str
                        )
                        return
                    payload = (line_str + "\n").encode("utf-8")
                    out_fh.write(payload)
                    new_offsets[record_id] = (seq_str, position, len(payload))
                    position += len(payload)

            # Atomic swap
            os.replace(tmp_path, seg_path)
            swapped = True
        finally:
            if not swapped:
                # Do not leave a half-written temp file behind.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass  # temp file was never created or is already gone

        logger.info("compact_segment: segment %s compacted (%d records kept)", seq_str, len(new_offsets))

        # Update index
        store.index_manager._map.update(new_offsets)
        store.index_manager.save_full()