PyMkDB 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. pymkdb/__init__.py +6 -0
  2. pymkdb/cli.py +57 -0
  3. pymkdb-0.1.0.dist-info/METADATA +86 -0
  4. pymkdb-0.1.0.dist-info/RECORD +54 -0
  5. pymkdb-0.1.0.dist-info/WHEEL +5 -0
  6. pymkdb-0.1.0.dist-info/entry_points.txt +2 -0
  7. pymkdb-0.1.0.dist-info/top_level.txt +3 -0
  8. sdk/__init__.py +1 -0
  9. sdk/connection.py +225 -0
  10. sdk/delta.py +19 -0
  11. sdk/http_connection.py +180 -0
  12. sdk/mkdb_client.py +226 -0
  13. sdk/responses.py +154 -0
  14. src/__init__.py +1 -0
  15. src/config/db.py +227 -0
  16. src/config/server.py +52 -0
  17. src/db/__init__.py +207 -0
  18. src/db/cache/__init__.py +1 -0
  19. src/db/cache/ram_cache.py +144 -0
  20. src/db/cache/write_queue.py +156 -0
  21. src/db/maintenance/__init__.py +0 -0
  22. src/db/maintenance/compactor.py +118 -0
  23. src/db/maintenance/task_scheduler.py +73 -0
  24. src/db/objects/store.py +283 -0
  25. src/db/parity/__init__.py +0 -0
  26. src/db/parity/parity_manager.py +196 -0
  27. src/db/query/__init__.py +1 -0
  28. src/db/query/full_text_index.py +168 -0
  29. src/db/query/numeric_index.py +196 -0
  30. src/db/query/query_engine.py +308 -0
  31. src/db/query/tokenizer.py +48 -0
  32. src/db/query_workers/__init__.py +16 -0
  33. src/db/query_workers/dispatcher.py +339 -0
  34. src/db/query_workers/task.py +78 -0
  35. src/db/query_workers/worker.py +292 -0
  36. src/db/requesting/main.py +0 -0
  37. src/db/storage/__init__.py +1 -0
  38. src/db/storage/blob_store.py +47 -0
  39. src/db/storage/index_manager.py +92 -0
  40. src/db/storage/log_manager.py +119 -0
  41. src/db/storage/serializer.py +38 -0
  42. src/filing/__init__.py +31 -0
  43. src/objects/__init__.py +190 -0
  44. src/runtime/__init__.py +15 -0
  45. src/server/__init__.py +0 -0
  46. src/server/coms/actions.py +209 -0
  47. src/server/coms/http.py +46 -0
  48. src/server/coms/http_handlers.py +445 -0
  49. src/server/coms/metrics.py +231 -0
  50. src/server/coms/socket.py +461 -0
  51. src/server/coms/socket_protocol.py +54 -0
  52. src/server/control/api/actions.py +1001 -0
  53. src/server/control/server.py +404 -0
  54. src/server/event_log.py +58 -0
@@ -0,0 +1,48 @@
1
+ """
2
+ Text tokenizer for full-text indexing.
3
+
4
+ Pipeline: lowercase → strip non-alphanumeric (keep spaces) → split on whitespace
5
+ → remove stop-words → apply simple suffix-stripping stemmer.
6
+ """
7
+
8
+ import re
9
+
10
# High-frequency English words that are dropped before stemming because they
# carry no discriminating value in a full-text index.
STOP_WORDS = {
    "a", "an", "and", "are",
    "as", "at", "be", "by",
    "for", "in", "is", "it",
    "not", "of", "on", "or",
    "that", "the", "this", "to",
    "was", "were", "with",
}
15
+
16
+
17
+ def _stem(word: str) -> str:
18
+ """
19
+ Minimal suffix-stripping stemmer (no NLTK dependency).
20
+ Applies the most common English suffix rules in order.
21
+ """
22
+ if len(word) <= 3:
23
+ return word
24
+ for suffix, replacement in [
25
+ ("ational", "ate"), ("tional", "tion"), ("enci", "ence"),
26
+ ("anci", "ance"), ("izer", "ize"), ("ising", "ise"),
27
+ ("izing", "ize"), ("ness", ""), ("ment", ""), ("ful", ""),
28
+ ("less", ""), ("ings", "ing"), ("ing", ""), ("edly", ""),
29
+ ("edly", "ed"), ("edly", ""), ("ed", ""), ("er", ""),
30
+ ("ly", ""), ("ies", "i"), ("ied", "i"), ("es", "e"),
31
+ ("s", ""),
32
+ ]:
33
+ if word.endswith(suffix) and len(word) - len(suffix) >= 3:
34
+ return word[: -len(suffix)] + replacement
35
+ return word
36
+
37
+
38
def tokenize(text: str) -> list:
    """
    Convert free text into the list of stems used by the full-text index.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens that are a single character long or listed in ``STOP_WORDS`` are
    discarded; each remaining token is passed through ``_stem``.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []

    normalized = re.sub(r"[^a-z0-9\s]", " ", text.lower())

    stems = []
    for candidate in normalized.split():
        if len(candidate) <= 1 or candidate in STOP_WORDS:
            continue
        stems.append(_stem(candidate))
    return stems
@@ -0,0 +1,16 @@
1
+ """
2
+ src.db.query_workers
3
+ ====================
4
+ Query worker pool infrastructure.
5
+
6
+ Public surface
7
+ --------------
8
+ QueryDispatcher — create one per store; call .start() then .submit(op, params)
9
+ QueryTask — the picklable unit of work; mostly internal
10
+ OPERATIONS — frozenset of valid operation strings
11
+ """
12
+
13
+ from src.db.query_workers.dispatcher import QueryDispatcher
14
+ from src.db.query_workers.task import QueryTask, OPERATIONS
15
+
16
+ __all__ = ["QueryDispatcher", "QueryTask", "OPERATIONS"]
@@ -0,0 +1,339 @@
1
+ """
2
+ QueryDispatcher — the main-process facade for the query worker pool.
3
+
4
+ Responsibilities
5
+ ----------------
6
+ 1. Start / stop worker processes or threads based on config.
7
+ 2. Accept submit() calls from any thread, assign a task_id, put the task
8
+ on the shared work queue, and block until the worker posts a result.
9
+ 3. Route results from the shared results queue back to the correct waiter
10
+ via a lightweight result-router daemon thread.
11
+ 4. Broadcast cache-invalidation messages to every worker's private queue
12
+ so their caches stay consistent with writes.
13
+
14
+ Parallel vs. single-worker mode
15
+ --------------------------------
16
+ parallel_enabled=False → one threading.Thread worker in the same process.
17
+ Shares the GIL; suitable for I/O-bound reads.
18
+
19
+ parallel_enabled=True → N multiprocessing.Process workers.
20
+ Each has its own GIL and Python heap — suitable
21
+ for CPU-bound query scans (WS-3).
22
+ N defaults to os.cpu_count() when worker_count=0.
23
+
24
+ Both modes expose an identical submit() / invalidate() / stop() interface.
25
+ """
26
+
27
+ import logging
28
+ import multiprocessing
29
+ import os
30
+ import queue
31
+ import threading
32
+ import uuid
33
+ from typing import Any
34
+
35
+ from src.db.query_workers.task import QueryTask
36
+ from src.db.query_workers.worker import (
37
+ WORKER_SENTINEL,
38
+ worker_process_main,
39
+ worker_thread_main,
40
+ )
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ class QueryDispatcher:
46
+ """
47
+ Owns the work queue, results queue, and the worker pool for one store.
48
+
49
+ Parameters
50
+ ----------
51
+ store_name : str
52
+ base_path : str Absolute path to the store root on disk.
53
+ config : query_worker_config
54
+ """
55
+
56
+ def __init__(self, store_name: str, base_path: str, config) -> None:
57
+ self.store_name = store_name
58
+ self.base_path = base_path
59
+ self.config = config
60
+
61
+ # -- Shared queues ---------------------------------------------------
62
+ # work_queue: dispatcher → workers (all workers compete for items)
63
+ # results_queue: workers → dispatcher result-router thread
64
+ self._work_queue: multiprocessing.Queue = multiprocessing.Queue()
65
+ self._results_queue: multiprocessing.Queue = multiprocessing.Queue()
66
+
67
+ # Per-worker private invalidation queues (one per worker so that a
68
+ # single broadcast reaches every worker independently)
69
+ self._invalidation_queues: list[multiprocessing.Queue] = []
70
+
71
+ # -- Worker handles --------------------------------------------------
72
+ self._workers: list[multiprocessing.Process | threading.Thread] = []
73
+
74
+ # -- Stop signalling -------------------------------------------------
75
+ # multiprocessing.Event works in both process and thread contexts.
76
+ self._stop_event: multiprocessing.Event = multiprocessing.Event()
77
+
78
+ # -- Pending-result tracking (main process / main thread only) -------
79
+ self._pending_lock = threading.Lock()
80
+ self._pending: dict[str, threading.Event] = {} # task_id -> event
81
+ self._results: dict[str, dict] = {} # task_id -> result dict
82
+
83
+ # -- Result-router daemon thread -------------------------------------
84
+ self._result_router: threading.Thread | None = None
85
+
86
+ self._started = False
87
+
88
+ # -----------------------------------------------------------------------
89
+ # Lifecycle
90
+ # -----------------------------------------------------------------------
91
+
92
+ def start(self) -> None:
93
+ """Start the worker pool and the result-router thread."""
94
+ if self._started:
95
+ return
96
+ self._started = True
97
+
98
+ # Always start the result-router thread (handles both modes)
99
+ self._result_router = threading.Thread(
100
+ target=self._result_router_loop,
101
+ daemon=True,
102
+ name=f"QueryDispatcher-Router[{self.store_name}]",
103
+ )
104
+ self._result_router.start()
105
+
106
+ if self.config.parallel_enabled:
107
+ self._start_process_workers()
108
+ else:
109
+ self._start_thread_worker()
110
+
111
+ logger.info(
112
+ "QueryDispatcher started for store '%s' | parallel=%s | workers=%d",
113
+ self.store_name,
114
+ self.config.parallel_enabled,
115
+ len(self._workers),
116
+ )
117
+
118
+ def stop(self) -> None:
119
+ """Gracefully stop all workers and the result-router."""
120
+ if not self._started:
121
+ return
122
+
123
+ self._stop_event.set()
124
+
125
+ # Send a sentinel for each live worker so they exit their blocking get()
126
+ for _ in self._workers:
127
+ self._work_queue.put(WORKER_SENTINEL)
128
+
129
+ for w in self._workers:
130
+ w.join(timeout=5.0)
131
+ if hasattr(w, "is_alive") and w.is_alive():
132
+ logger.warning("Worker %s did not exit cleanly; terminating", w.name)
133
+ if isinstance(w, multiprocessing.Process):
134
+ w.terminate()
135
+
136
+ self._workers.clear()
137
+ self._invalidation_queues.clear()
138
+ self._started = False
139
+ logger.info("QueryDispatcher stopped for store '%s'", self.store_name)
140
+
141
+ # -----------------------------------------------------------------------
142
+ # Public API
143
+ # -----------------------------------------------------------------------
144
+
145
+ def submit(self, operation: str, params: dict, timeout: float | None = None) -> Any:
146
+ """
147
+ Submit a query task and block until the worker returns a result.
148
+
149
+ Parameters
150
+ ----------
151
+ operation : str One of task.OPERATIONS.
152
+ params : dict Operation-specific parameters (see QueryTask docstring).
153
+ timeout : float Seconds to wait. Defaults to config.task_timeout.
154
+
155
+ Returns
156
+ -------
157
+ The resolved result (type depends on operation).
158
+
159
+ Raises
160
+ ------
161
+ TimeoutError If no result arrives within `timeout` seconds.
162
+ RuntimeError If the worker reports an error.
163
+ """
164
+ if not self._started:
165
+ self.start()
166
+
167
+ if timeout is None:
168
+ timeout = float(self.config.task_timeout)
169
+
170
+ task = QueryTask(operation=operation, store_name=self.store_name, params=params)
171
+ event = threading.Event()
172
+
173
+ with self._pending_lock:
174
+ self._pending[task.task_id] = event
175
+
176
+ self._work_queue.put(task.to_dict())
177
+
178
+ if not event.wait(timeout=timeout):
179
+ with self._pending_lock:
180
+ self._pending.pop(task.task_id, None)
181
+ self._results.pop(task.task_id, None)
182
+ raise TimeoutError(
183
+ f"Query task {task.task_id!r} (op={operation!r}) timed out after {timeout}s"
184
+ )
185
+
186
+ with self._pending_lock:
187
+ result = self._results.pop(task.task_id)
188
+
189
+ if result["status"] == "error":
190
+ raise RuntimeError(result.get("error", "Unknown worker error"))
191
+
192
+ return result["data"]
193
+
194
+ def invalidate(self, record_id: str) -> None:
195
+ """
196
+ Broadcast a cache-invalidation message to every worker.
197
+
198
+ Called by the write queue after a record is flushed to disk so that
199
+ stale entries are purged from all worker caches before the next read.
200
+ """
201
+ for inv_q in self._invalidation_queues:
202
+ try:
203
+ inv_q.put_nowait(record_id)
204
+ except Exception:
205
+ pass # non-fatal if queue is full or closed
206
+
207
+ # -----------------------------------------------------------------------
208
+ # Observability
209
+ # -----------------------------------------------------------------------
210
+
211
+ @property
212
+ def worker_count(self) -> int:
213
+ return len(self._workers)
214
+
215
+ @property
216
+ def queue_depth(self) -> int:
217
+ """Approximate number of unprocessed tasks in the work queue."""
218
+ try:
219
+ return self._work_queue.qsize()
220
+ except NotImplementedError:
221
+ # qsize() is not supported on macOS
222
+ return -1
223
+
224
+ @property
225
+ def is_running(self) -> bool:
226
+ return self._started and not self._stop_event.is_set()
227
+
228
+ def status(self) -> dict:
229
+ return {
230
+ "store_name": self.store_name,
231
+ "parallel": self.config.parallel_enabled,
232
+ "worker_count": self.worker_count,
233
+ "queue_depth": self.queue_depth,
234
+ "pending_tasks": len(self._pending),
235
+ "running": self.is_running,
236
+ }
237
+
238
+ # -----------------------------------------------------------------------
239
+ # Internal — worker startup helpers
240
+ # -----------------------------------------------------------------------
241
+
242
+ def _start_process_workers(self) -> None:
243
+ count = self.config.worker_count or (os.cpu_count() or 2)
244
+ project_root = _find_project_root()
245
+
246
+ for i in range(count):
247
+ inv_q = multiprocessing.Queue()
248
+ self._invalidation_queues.append(inv_q)
249
+
250
+ p = multiprocessing.Process(
251
+ target=worker_process_main,
252
+ args=(
253
+ i,
254
+ project_root,
255
+ self.store_name,
256
+ self.base_path,
257
+ self._work_queue,
258
+ self._results_queue,
259
+ inv_q,
260
+ self.config.worker_cache_size,
261
+ float(self.config.worker_cache_ttl),
262
+ self._stop_event,
263
+ ),
264
+ daemon=True,
265
+ name=f"QueryWorker[{self.store_name}#{i}]",
266
+ )
267
+ p.start()
268
+ self._workers.append(p)
269
+
270
+ def _start_thread_worker(self) -> None:
271
+ inv_q = multiprocessing.Queue()
272
+ self._invalidation_queues.append(inv_q)
273
+
274
+ t = threading.Thread(
275
+ target=worker_thread_main,
276
+ args=(
277
+ 0,
278
+ self.store_name,
279
+ self.base_path,
280
+ self._work_queue,
281
+ self._results_queue,
282
+ inv_q,
283
+ self.config.worker_cache_size,
284
+ float(self.config.worker_cache_ttl),
285
+ self._stop_event,
286
+ ),
287
+ daemon=True,
288
+ name=f"QueryWorker[{self.store_name}#thread]",
289
+ )
290
+ t.start()
291
+ self._workers.append(t)
292
+
293
+ # -----------------------------------------------------------------------
294
+ # Internal — result-router loop
295
+ # -----------------------------------------------------------------------
296
+
297
+ def _result_router_loop(self) -> None:
298
+ """
299
+ Daemon thread that reads from results_queue and wakes the correct waiter.
300
+
301
+ Runs continuously until _stop_event is set and the results queue is drained.
302
+ """
303
+ while not self._stop_event.is_set():
304
+ try:
305
+ result = self._results_queue.get(timeout=0.5)
306
+ except Exception:
307
+ continue # timeout — check stop_event and loop
308
+
309
+ task_id = result.get("task_id")
310
+ if not task_id:
311
+ continue
312
+
313
+ with self._pending_lock:
314
+ self._results[task_id] = result
315
+ event = self._pending.get(task_id)
316
+
317
+ if event is not None:
318
+ event.set()
319
+
320
+
321
+ # ---------------------------------------------------------------------------
322
+ # Utility
323
+ # ---------------------------------------------------------------------------
324
+
325
+ def _find_project_root() -> str:
326
+ """
327
+ Walk up from this file's location to find the project root directory
328
+ (the first ancestor that does NOT contain an __init__.py).
329
+ Used to bootstrap sys.path in worker processes.
330
+ """
331
+ path = os.path.dirname(os.path.abspath(__file__))
332
+ while True:
333
+ parent = os.path.dirname(path)
334
+ if parent == path:
335
+ break
336
+ if not os.path.exists(os.path.join(parent, "__init__.py")):
337
+ return parent
338
+ path = parent
339
+ return path
@@ -0,0 +1,78 @@
1
+ """
2
+ QueryTask — the unit of work placed on the shared work queue.
3
+
4
+ Instances must be fully picklable so they can be sent through a
5
+ multiprocessing.Queue to worker processes.
6
+ """
7
+
8
+ import uuid
9
+ from dataclasses import dataclass, field
10
+
11
+
12
# Operations a worker can execute.
OPERATIONS = frozenset({"read", "query", "count", "exists", "multi_read"})


@dataclass
class QueryTask:
    """
    A single query request.

    Fields
    ------
    operation : str
        One of OPERATIONS.
    store_name : str
        Name of the target store.
    params : dict
        Operation-specific parameters:

        read / exists:
            {"record_id": str}

        multi_read:
            {"record_ids": list[str]}

        query:
            {"filter": dict}  — e.g. {"price": {"$gte": 10, "$lte": 50},
                                       "name": {"$text": "steel bolt"}}

        count:
            {"filter": dict}  — same filter format; returns an integer

    task_id : str
        Auto-generated UUID hex string. Used by the dispatcher to route
        results back to the correct waiter.

    Raises
    ------
    ValueError   If ``operation`` is not in OPERATIONS or ``store_name`` is empty.
    TypeError    If ``params`` is not a dict.
    """

    operation: str
    store_name: str
    params: dict
    task_id: str = field(default_factory=lambda: uuid.uuid4().hex)

    def __post_init__(self):
        if self.operation not in OPERATIONS:
            raise ValueError(
                f"Unknown operation {self.operation!r}. Must be one of {sorted(OPERATIONS)}"
            )
        if not self.store_name:
            raise ValueError("store_name must not be empty")
        if not isinstance(self.params, dict):
            raise TypeError("params must be a dict")

    def to_dict(self) -> dict:
        """
        Return the task as a plain dict suitable for a multiprocessing queue.

        ``params`` is included by reference, not copied.
        """
        return {
            "task_id": self.task_id,
            "operation": self.operation,
            "store_name": self.store_name,
            "params": self.params,
        }

    @classmethod
    def from_dict(cls, d: dict) -> "QueryTask":
        """
        Rebuild a task from the dict produced by ``to_dict()``.

        ``operation``, ``store_name`` and ``params`` are required keys
        (``KeyError`` otherwise) and are validated exactly as in the
        constructor. A missing, ``None`` or empty ``task_id`` is replaced by a
        freshly generated one, so a rebuilt task always carries a usable id.
        """
        return cls(
            operation=d["operation"],
            store_name=d["store_name"],
            params=d["params"],
            # `or` keeps the fallback lazy: no UUID is generated when the dict
            # already carries an id.
            task_id=d.get("task_id") or uuid.uuid4().hex,
        )