PyMkDB 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymkdb/__init__.py +6 -0
- pymkdb/cli.py +57 -0
- pymkdb-0.1.0.dist-info/METADATA +86 -0
- pymkdb-0.1.0.dist-info/RECORD +54 -0
- pymkdb-0.1.0.dist-info/WHEEL +5 -0
- pymkdb-0.1.0.dist-info/entry_points.txt +2 -0
- pymkdb-0.1.0.dist-info/top_level.txt +3 -0
- sdk/__init__.py +1 -0
- sdk/connection.py +225 -0
- sdk/delta.py +19 -0
- sdk/http_connection.py +180 -0
- sdk/mkdb_client.py +226 -0
- sdk/responses.py +154 -0
- src/__init__.py +1 -0
- src/config/db.py +227 -0
- src/config/server.py +52 -0
- src/db/__init__.py +207 -0
- src/db/cache/__init__.py +1 -0
- src/db/cache/ram_cache.py +144 -0
- src/db/cache/write_queue.py +156 -0
- src/db/maintenance/__init__.py +0 -0
- src/db/maintenance/compactor.py +118 -0
- src/db/maintenance/task_scheduler.py +73 -0
- src/db/objects/store.py +283 -0
- src/db/parity/__init__.py +0 -0
- src/db/parity/parity_manager.py +196 -0
- src/db/query/__init__.py +1 -0
- src/db/query/full_text_index.py +168 -0
- src/db/query/numeric_index.py +196 -0
- src/db/query/query_engine.py +308 -0
- src/db/query/tokenizer.py +48 -0
- src/db/query_workers/__init__.py +16 -0
- src/db/query_workers/dispatcher.py +339 -0
- src/db/query_workers/task.py +78 -0
- src/db/query_workers/worker.py +292 -0
- src/db/requesting/main.py +0 -0
- src/db/storage/__init__.py +1 -0
- src/db/storage/blob_store.py +47 -0
- src/db/storage/index_manager.py +92 -0
- src/db/storage/log_manager.py +119 -0
- src/db/storage/serializer.py +38 -0
- src/filing/__init__.py +31 -0
- src/objects/__init__.py +190 -0
- src/runtime/__init__.py +15 -0
- src/server/__init__.py +0 -0
- src/server/coms/actions.py +209 -0
- src/server/coms/http.py +46 -0
- src/server/coms/http_handlers.py +445 -0
- src/server/coms/metrics.py +231 -0
- src/server/coms/socket.py +461 -0
- src/server/coms/socket_protocol.py +54 -0
- src/server/control/api/actions.py +1001 -0
- src/server/control/server.py +404 -0
- src/server/event_log.py +58 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FullTextIndex — inverted index for one text field of one store.
|
|
3
|
+
|
|
4
|
+
File format: idx_{service}_{field}_words.txt
|
|
5
|
+
Each line: {stem}:{id1},{id2},{id3}
|
|
6
|
+
|
|
7
|
+
RAM-threshold mode
|
|
8
|
+
------------------
|
|
9
|
+
If ram_threshold_bytes > 0 and the index file on disk exceeds that size,
|
|
10
|
+
the in-memory map is evicted. Pending mutations are buffered and flushed
|
|
11
|
+
to disk before each query and at each explicit save() call.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from src.db.query.tokenizer import tokenize
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FullTextIndex:
    """Inverted index mapping token stems to record IDs for one text field.

    The index is persisted as a UTF-8 text file holding one
    ``{stem}:{id1},{id2}`` line per stem.

    While ``_in_ram`` is true every read and write goes through ``_map``.
    After ``load()`` and ``save()`` the size of the file on disk is compared
    with ``ram_threshold_bytes``; once it is exceeded ``_map`` is evicted and
    the index runs in disk mode: mutations are collected in ``_add_buf`` and
    ``_remove_buf`` and merged into the file before each search and on
    ``save()``.  Nothing in this class switches back to RAM mode.
    """

    def __init__(self, store_path: str, service: str, field: str,
                 ram_threshold_bytes: int = 0):
        """Open (or prepare) the index file for ``field`` and load it.

        Parameters
        ----------
        store_path : directory that holds the index file.
        service : store/service name, used in the file name.
        field : field name; ``/`` and ``\\`` are replaced by ``_`` in the
            file name.
        ram_threshold_bytes : file size above which the in-memory map is
            evicted; ``0`` (the default) disables eviction.
        """
        safe_field = field.replace("/", "_").replace("\\", "_")
        self.path = os.path.join(store_path, f"idx_{service}_{safe_field}_words.txt")
        self._map: dict = {}          # stem -> set[rid]  (RAM mode)
        self._in_ram: bool = True
        self._ram_threshold: int = ram_threshold_bytes
        # Disk-mode pending buffers (only used when _in_ram is False).
        # A (stem, rid) pair is never present in both buffers at once.
        self._add_buf: dict = {}      # stem -> set[rid] to add
        self._remove_buf: dict = {}   # stem -> set[rid] to remove
        self.load()

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def load(self) -> None:
        """Load the index from disk into ``_map``, then apply the RAM threshold.

        Does nothing when the index file does not exist.  Stems read from the
        file overwrite same-named stems already in ``_map``.
        """
        if not os.path.exists(self.path):
            return
        self._map.update(self._load_disk_map())
        self._apply_threshold()

    def save(self) -> None:
        """Write the index to disk.

        RAM mode rewrites the whole file from ``_map`` and re-checks the RAM
        threshold; disk mode merges the pending buffers into the file.
        """
        if self._in_ram:
            self._write_file(self._map)
            self._apply_threshold()
        else:
            self._flush_pending()

    def _apply_threshold(self) -> None:
        """Evict map from RAM if the disk file exceeds the configured threshold."""
        if self._ram_threshold <= 0 or not os.path.exists(self.path):
            return
        if os.path.getsize(self.path) > self._ram_threshold:
            self._map.clear()
            self._in_ram = False

    def _write_file(self, data_map: dict) -> None:
        """Write a stem->ids map to a ``.tmp`` sibling and swap it into place.

        Stems with no IDs are not written; IDs are written in sorted order.
        """
        tmp = self.path + ".tmp"
        with open(tmp, "w", encoding="utf-8") as fh:
            for stem, ids in data_map.items():
                if ids:
                    fh.write(f"{stem}:{','.join(sorted(ids))}\n")
        # os.replace swaps the finished file in, so a reader never opens a
        # partially written index.
        os.replace(tmp, self.path)

    def _load_disk_map(self) -> dict:
        """Read the disk file and return a stem->set map without touching self._map.

        Blank lines and lines without ``:`` are skipped.  Returns an empty
        dict when the file does not exist.
        """
        disk_map: dict = {}
        if not os.path.exists(self.path):
            return disk_map
        with open(self.path, "r", encoding="utf-8") as fh:
            for raw in fh:
                line = raw.strip()
                if not line or ":" not in line:
                    continue
                stem, _, ids_str = line.partition(":")
                disk_map[stem] = set(ids_str.split(",")) if ids_str else set()
        return disk_map

    def _flush_pending(self) -> None:
        """Merge add/remove buffers into the disk file and clear them."""
        if not self._add_buf and not self._remove_buf:
            return
        disk_map = self._load_disk_map()

        for stem, ids in self._add_buf.items():
            disk_map.setdefault(stem, set()).update(ids)

        for stem, ids in self._remove_buf.items():
            existing = disk_map.get(stem)
            if existing:
                existing -= ids
                if not existing:
                    del disk_map[stem]

        self._write_file(disk_map)
        self._add_buf.clear()
        self._remove_buf.clear()

    @staticmethod
    def _discard_pending(buf: dict, stem: str, record_id: str) -> None:
        """Drop ``record_id`` from ``buf[stem]`` and prune the stem if it empties."""
        ids = buf.get(stem)
        if ids is None:
            return
        ids.discard(record_id)
        if not ids:
            del buf[stem]

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add(self, record_id: str, text_value: str) -> None:
        """Tokenize text_value and add record_id to every stem's set.

        In disk mode the add is buffered.  It also cancels a pending remove of
        the same (stem, record) pair: the flush applies all adds before all
        removes, so without the cancellation a remove-then-add sequence would
        leave the record un-indexed.
        """
        if self._in_ram:
            for stem in tokenize(text_value):
                self._map.setdefault(stem, set()).add(record_id)
            return
        for stem in tokenize(text_value):
            self._discard_pending(self._remove_buf, stem, record_id)
            self._add_buf.setdefault(stem, set()).add(record_id)

    def remove(self, record_id: str, text_value: str) -> None:
        """Remove record_id from every stem's set; prune empty stems.

        In disk mode the remove is buffered and cancels a pending add of the
        same (stem, record) pair, so the most recent operation wins.
        """
        if self._in_ram:
            for stem in tokenize(text_value):
                ids = self._map.get(stem)
                if ids:
                    ids.discard(record_id)
                    if not ids:
                        del self._map[stem]
            return
        for stem in tokenize(text_value):
            self._discard_pending(self._add_buf, stem, record_id)
            self._remove_buf.setdefault(stem, set()).add(record_id)

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    def search(self, keywords: list, mode: str = "and") -> set:
        """
        Search for records matching the given keywords.

        keywords : list of raw keyword strings (will be tokenised)
        mode     : "or" unions the per-keyword matches; any other value
                   (default "and") intersects them
        Returns a new set of record IDs (empty when keywords is empty).

        A keyword that tokenises to several stems matches records containing
        any of those stems.  In disk mode pending mutations are flushed and
        the file is re-read on every call.
        """
        if not keywords:
            return set()
        if self._in_ram:
            data_map = self._map
        else:
            self._flush_pending()
            data_map = self._load_disk_map()

        candidate_sets = []
        for kw in keywords:
            # Always start from a fresh set so the stored sets are never
            # aliased into (or mutated through) the result.
            ids = set()
            for stem in tokenize(kw):
                ids |= data_map.get(stem, set())
            candidate_sets.append(ids)
        if not candidate_sets:
            return set()

        if mode == "or":
            return set().union(*candidate_sets)
        # AND: intersect across keywords
        result = candidate_sets[0]
        for other in candidate_sets[1:]:
            result = result & other
        return result
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NumericIndex — sorted numeric index for one numeric field of one store.
|
|
3
|
+
|
|
4
|
+
File format: idx_{service}_{field}_numeric.txt
|
|
5
|
+
Each line: {value}:{id1},{id2}
|
|
6
|
+
File is kept in ascending sorted order by value.
|
|
7
|
+
|
|
8
|
+
RAM-threshold mode
|
|
9
|
+
------------------
|
|
10
|
+
If ram_threshold_bytes > 0 and the index file on disk exceeds that size,
|
|
11
|
+
the in-memory map is evicted after load/save. All queries then read
|
|
12
|
+
directly from disk; pending mutations are buffered and flushed to disk
|
|
13
|
+
before each query and at each explicit save() call.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import bisect
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NumericIndex:
    """Sorted numeric index (value -> record IDs) for one numeric field.

    The index is persisted as a UTF-8 text file holding one
    ``{value}:{id1},{id2}`` line per distinct value, written in ascending
    value order.  All values are stored as ``float``.

    While ``_in_ram`` is true every read and write goes through ``_map`` and
    the sorted key list ``_values``.  After ``load()`` and ``save()`` the
    size of the file on disk is compared with ``ram_threshold_bytes``; once
    it is exceeded both structures are evicted and the index runs in disk
    mode: mutations are collected in ``_write_buf`` and ``_remove_buf`` and
    merged into the file before each query and on ``save()``.  Nothing in
    this class switches back to RAM mode.
    """

    def __init__(self, store_path: str, service: str, field: str,
                 ram_threshold_bytes: int = 0):
        """Open (or prepare) the index file for ``field`` and load it.

        Parameters
        ----------
        store_path : directory that holds the index file.
        service : store/service name, used in the file name.
        field : field name; ``/`` and ``\\`` are replaced by ``_`` in the
            file name.
        ram_threshold_bytes : file size above which the in-memory structures
            are evicted; ``0`` (the default) disables eviction.
        """
        safe_field = field.replace("/", "_").replace("\\", "_")
        self.path = os.path.join(store_path, f"idx_{service}_{safe_field}_numeric.txt")
        self._values: list = []       # sorted float values  (RAM mode)
        self._map: dict = {}          # float -> set[rid]    (RAM mode)
        self._in_ram: bool = True
        self._ram_threshold: int = ram_threshold_bytes
        # Disk-mode pending buffers (only used when _in_ram is False).
        # A (val, rid) pair is never pending in both buffers at once.
        self._write_buf: dict = {}    # val -> set[rid] to add
        self._remove_buf: set = set() # {(val, rid)} to remove
        self.load()

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def load(self) -> None:
        """Load the index from disk into ``_map``/``_values``, then apply the RAM threshold.

        Does nothing when the index file does not exist.  Values read from
        the file overwrite same-valued entries already in ``_map``.
        """
        if not os.path.exists(self.path):
            return
        disk_map, _ = self._load_disk_map()
        self._map.update(disk_map)
        self._values = sorted(self._map.keys())
        self._apply_threshold()

    def save(self) -> None:
        """Write the index to disk.

        RAM mode rewrites the whole file and re-checks the RAM threshold;
        disk mode merges the pending buffers into the file.
        """
        if self._in_ram:
            self._write_file(self._map, self._values)
            self._apply_threshold()
        else:
            self._flush_pending()

    def _apply_threshold(self) -> None:
        """Evict map from RAM if the disk file exceeds the configured threshold."""
        if self._ram_threshold <= 0 or not os.path.exists(self.path):
            return
        if os.path.getsize(self.path) > self._ram_threshold:
            self._map.clear()
            self._values.clear()
            self._in_ram = False

    def _write_file(self, data_map: dict, data_values: list) -> None:
        """Write a map to a ``.tmp`` sibling in ``data_values`` order and swap it into place.

        Values with no IDs are not written; IDs are written in sorted order.
        """
        tmp = self.path + ".tmp"
        with open(tmp, "w", encoding="utf-8") as fh:
            for val in data_values:
                ids = data_map.get(val)
                if ids:
                    fh.write(f"{val}:{','.join(sorted(ids))}\n")
        # os.replace swaps the finished file in, so a reader never opens a
        # partially written index.
        os.replace(tmp, self.path)

    def _load_disk_map(self) -> tuple:
        """Read the disk file and return (map, sorted_values) without touching self._map.

        Blank lines, lines without ``:`` and lines whose value does not parse
        as a float are skipped.  Returns ``({}, [])`` when the file does not
        exist.
        """
        disk_map: dict = {}
        if os.path.exists(self.path):
            with open(self.path, "r", encoding="utf-8") as fh:
                for raw in fh:
                    line = raw.strip()
                    if not line or ":" not in line:
                        continue
                    val_str, _, ids_str = line.partition(":")
                    try:
                        val = float(val_str)
                    except ValueError:
                        continue
                    disk_map[val] = set(ids_str.split(",")) if ids_str else set()
        return disk_map, sorted(disk_map.keys())

    def _flush_pending(self) -> None:
        """Merge write/remove buffers into the disk file and clear them."""
        if not self._write_buf and not self._remove_buf:
            return
        disk_map, disk_values = self._load_disk_map()

        for val, ids in self._write_buf.items():
            if val not in disk_map:
                bisect.insort(disk_values, val)
                disk_map[val] = set()
            disk_map[val].update(ids)

        for val, rid in self._remove_buf:
            ids = disk_map.get(val)
            if ids:
                ids.discard(rid)
                if not ids:
                    del disk_map[val]
                    pos = bisect.bisect_left(disk_values, val)
                    if pos < len(disk_values) and disk_values[pos] == val:
                        disk_values.pop(pos)

        self._write_file(disk_map, disk_values)
        self._write_buf.clear()
        self._remove_buf.clear()

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add(self, record_id: str, numeric_value) -> None:
        """Index ``record_id`` under ``float(numeric_value)``.

        Raises ValueError/TypeError when the value cannot be converted to
        float.  In disk mode the add is buffered and cancels a pending remove
        of the same (value, record) pair: the flush applies all adds before
        all removes, so without the cancellation a remove-then-add sequence
        would leave the record un-indexed.
        """
        val = float(numeric_value)
        if self._in_ram:
            if val not in self._map:
                bisect.insort(self._values, val)
                self._map[val] = set()
            self._map[val].add(record_id)
            return
        self._remove_buf.discard((val, record_id))
        self._write_buf.setdefault(val, set()).add(record_id)

    def remove(self, record_id: str, numeric_value) -> None:
        """Remove ``record_id`` from ``float(numeric_value)``; prune empty values.

        Raises ValueError/TypeError when the value cannot be converted to
        float.  In disk mode the remove is buffered and cancels a pending add
        of the same (value, record) pair, so the most recent operation wins.
        """
        val = float(numeric_value)
        if self._in_ram:
            ids = self._map.get(val)
            if not ids:
                return
            ids.discard(record_id)
            if not ids:
                del self._map[val]
                pos = bisect.bisect_left(self._values, val)
                if pos < len(self._values) and self._values[pos] == val:
                    self._values.pop(pos)
            return
        pending = self._write_buf.get(val)
        if pending is not None:
            pending.discard(record_id)
            if not pending:
                del self._write_buf[val]
        self._remove_buf.add((val, record_id))

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    def exact_query(self, value) -> set:
        """Return a new set of record IDs stored under ``float(value)``.

        In disk mode pending mutations are flushed and the file is re-read.
        """
        if self._in_ram:
            return set(self._map.get(float(value), set()))
        self._flush_pending()
        disk_map, _ = self._load_disk_map()
        return set(disk_map.get(float(value), set()))

    def range_query(self, lo=None, hi=None,
                    lo_inclusive: bool = True, hi_inclusive: bool = True) -> set:
        """
        Return all record IDs whose value falls in [lo, hi] (bounds optional).
        lo/hi may be None for open-ended ranges; ``lo_inclusive`` /
        ``hi_inclusive`` select closed or open bounds.
        In disk mode pending mutations are flushed and the file is re-read.
        """
        if self._in_ram:
            return self._range_from(self._map, self._values, lo, hi, lo_inclusive, hi_inclusive)
        self._flush_pending()
        disk_map, disk_values = self._load_disk_map()
        return self._range_from(disk_map, disk_values, lo, hi, lo_inclusive, hi_inclusive)

    @staticmethod
    def _range_from(data_map: dict, data_values: list,
                    lo, hi, lo_inclusive: bool, hi_inclusive: bool) -> set:
        """Collect the IDs of every value in ``data_values`` inside the range.

        ``data_values`` must be sorted ascending; the slice bounds are found
        by bisection.
        """
        if lo is None:
            lo_idx = 0
        else:
            lo_f = float(lo)
            # bisect_left keeps values equal to lo; bisect_right skips them.
            lo_idx = (bisect.bisect_left(data_values, lo_f) if lo_inclusive
                      else bisect.bisect_right(data_values, lo_f))

        if hi is None:
            hi_idx = len(data_values)
        else:
            hi_f = float(hi)
            # bisect_right keeps values equal to hi; bisect_left skips them.
            hi_idx = (bisect.bisect_right(data_values, hi_f) if hi_inclusive
                      else bisect.bisect_left(data_values, hi_f))

        result: set = set()
        for val in data_values[lo_idx:hi_idx]:
            result |= data_map.get(val, set())
        return result
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
QueryEngine — declarative dict query dispatcher for one store.
|
|
3
|
+
|
|
4
|
+
Query syntax (from the plan's Query Syntax Specification):
|
|
5
|
+
{"id": "x"} → fast-path primary index lookup
|
|
6
|
+
{"id": ["x", "y"]} → multi-ID fetch
|
|
7
|
+
{"field": "exact string"} → full-scan exact text match
|
|
8
|
+
{"field": ["kw1", "kw2"]} → FullTextIndex AND search
|
|
9
|
+
{"field": 2010} → NumericIndex exact_query
|
|
10
|
+
{"field": {">": 1999, "<": 2015}} → NumericIndex range_query
|
|
11
|
+
Compound dict → AND-intersect all candidate sets
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Comparison operators accepted (after alias normalisation) as keys of a range-query dict.
OPERATORS = {">=", "<=", ">", "<"}
|
|
20
|
+
|
|
21
|
+
# Friendly aliases accepted from SDK callers
|
|
22
|
+
_OP_ALIASES = {"gte": ">=", "lte": "<=", "gt": ">", "lt": "<"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class QuerySyntaxError(ValueError):
    """Raised when a query filter or index request is malformed or unsupported."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class QueryEngine:
    """Declarative dict-query dispatcher and index owner for one store.

    Holds one ``FullTextIndex`` per full-text field and one ``NumericIndex``
    per numeric field, keeps them up to date through ``on_write`` /
    ``on_delete``, and evaluates filter dicts against them in ``query``.
    """

    def __init__(self, store):
        """
        Parameters
        ----------
        store : src.db.objects.store.store
            The owning store instance (used for read() and index_manager).
        """
        self._store = store
        self._full_text_indexes: dict = {}   # field_name -> FullTextIndex
        self._numeric_indexes: dict = {}     # field_name -> NumericIndex

    # ------------------------------------------------------------------
    # Index lifecycle
    # ------------------------------------------------------------------

    def _index_ram_threshold(self) -> int:
        """Return ``ram_config.index_ram_threshold_bytes`` from the store config, or 0."""
        ram_cfg = getattr(self._store.config, "ram_config", None)
        return getattr(ram_cfg, "index_ram_threshold_bytes", 0) if ram_cfg else 0

    def build_indexes(self) -> None:
        """
        Instantiate indexes for all fields marked queryable in schema_config.

        If an index file does not exist on disk, the store is scanned and the
        index is built and saved immediately so it survives the next restart.
        A failed rebuild is reported and logged but does not stop the
        remaining fields from being rebuilt.
        Fields whose index file exceeds ram_config.index_ram_threshold_bytes are
        kept on disk only — queries read from the file; mutations are buffered
        and flushed before each query.
        """
        # Imported lazily, as in the rest of this class.
        from src.db.query.full_text_index import FullTextIndex
        from src.db.query.numeric_index import NumericIndex

        schema = getattr(self._store.config, "schema_config", None)
        if schema is None:
            return

        threshold = self._index_ram_threshold()
        store_name = self._store.config.name
        print(f"[{store_name}] build_indexes: checking {len(schema.fields)} schema field(s)...")

        needs_rebuild: list = []
        for field_name, field_schema in schema.fields.items():
            q = getattr(field_schema, "queryable", False)
            if not q:
                continue
            if q in ("full-text", True):
                index_cls, registry = FullTextIndex, self._full_text_indexes
            elif q == "numeric":
                index_cls, registry = NumericIndex, self._numeric_indexes
            else:
                # Any other truthy marker is not an index type handled here.
                continue
            idx = index_cls(
                store_path=self._store.store_path,
                service=store_name,
                field=field_name,
                ram_threshold_bytes=threshold,
            )
            registry[field_name] = idx
            if not os.path.exists(idx.path):
                needs_rebuild.append(field_name)

        if not needs_rebuild:
            print(f"[{self._store.config.name}] All index files present — loaded from disk.")
            return

        print(f"[{store_name}] Index files missing for: {needs_rebuild} — rebuilding from stored data...")
        for field_name in needs_rebuild:
            try:
                self.rebuild_index(field_name)
            except Exception as exc:
                # Best effort: one broken field must not block the others.
                import traceback
                print(f"[{store_name}] ERROR: rebuild_index('{field_name}') failed: {exc}")
                traceback.print_exc()
                logger.warning("build_indexes: rebuild_index('%s') failed: %s", field_name, exc)

    def on_write(self, record_id: str, flat_delta: dict) -> None:
        """Update all relevant indexes when a record is written.

        Only indexed fields present in ``flat_delta`` are touched.  Full-text
        values are indexed as ``str(value)``.
        """
        for field, idx in self._full_text_indexes.items():
            if field in flat_delta:
                idx.add(record_id, str(flat_delta[field]))
        for field, idx in self._numeric_indexes.items():
            if field in flat_delta:
                try:
                    idx.add(record_id, flat_delta[field])
                except (ValueError, TypeError):
                    # Value is not convertible to float: leave it un-indexed.
                    pass

    def on_delete(self, record_id: str, flat_dict: dict) -> None:
        """Remove a record from all indexes.

        ``flat_dict`` supplies the field values the record was indexed under.
        """
        for field, idx in self._full_text_indexes.items():
            if field in flat_dict:
                idx.remove(record_id, str(flat_dict[field]))
        for field, idx in self._numeric_indexes.items():
            if field in flat_dict:
                try:
                    idx.remove(record_id, flat_dict[field])
                except (ValueError, TypeError):
                    # Value was never indexable, so there is nothing to remove.
                    pass

    def save_all(self) -> None:
        """Persist all indexes to disk by calling ``save()`` on each."""
        for idx in self._full_text_indexes.values():
            idx.save()
        for idx in self._numeric_indexes.values():
            idx.save()

    def rebuild_index(self, field_name: str) -> None:
        """
        Backfill the index for a single field by reading every live record.

        The fresh index replaces the registered one for ``field_name`` before
        the scan starts and is saved to disk afterwards.  Records that cannot
        be read, decoded or indexed are skipped with a warning.

        Raises RuntimeError when the store's log/index managers are not set,
        and QuerySyntaxError when the field is unknown or not queryable.

        NOTE(review): the original docstring called this safe to run from a
        background thread; the method takes no locks — confirm with callers.
        """
        from src.db.query.full_text_index import FullTextIndex
        from src.db.query.numeric_index import NumericIndex
        from src.db.storage import serializer as _ser

        store = self._store
        if store.log_manager is None or store.index_manager is None:
            raise RuntimeError("Store storage not initialised.")

        # Determine index type from schema_config
        schema = getattr(store.config, "schema_config", None)
        if schema is None or field_name not in schema.fields:
            raise QuerySyntaxError(f"Field '{field_name}' not in schema_config.")
        q = getattr(schema.fields[field_name], "queryable", False)

        threshold = self._index_ram_threshold()

        # Build a fresh in-memory index (start empty, ignoring any stale file)
        if q in ("full-text", True):
            is_text = True
            new_idx = FullTextIndex(store.store_path, store.config.name, field_name,
                                    ram_threshold_bytes=threshold)
            new_idx._map = {}
            new_idx._in_ram = True   # force RAM mode while backfilling
            self._full_text_indexes[field_name] = new_idx
        elif q == "numeric":
            is_text = False
            new_idx = NumericIndex(store.store_path, store.config.name, field_name,
                                   ram_threshold_bytes=threshold)
            new_idx._map = {}
            new_idx._values = []
            new_idx._in_ram = True   # force RAM mode while backfilling
            self._numeric_indexes[field_name] = new_idx
        else:
            raise QuerySyntaxError(f"Field '{field_name}' is not queryable.")

        # Scan all live records
        for record_id in store.index_manager.all_record_ids():
            entry = store.index_manager.get(record_id)
            if entry is None:
                continue
            seg, offset, size = entry
            try:
                line = store.log_manager.read(seg, offset, size)
                _, flat = _ser.deserialize_record(line)
                if field_name in flat:
                    value = flat[field_name]
                    # Index text exactly as on_write/on_delete do (str(value)),
                    # so later removals match what was indexed here.
                    new_idx.add(record_id, str(value) if is_text else value)
            except Exception as exc:
                logger.warning("rebuild_index: skipping %s: %s", record_id, exc)

        # Count before saving: save() may evict _map once the file exceeds
        # the RAM threshold, which would make the report read 0 entries.
        n = len(new_idx._map)
        new_idx.save()
        print(f"[{store.config.name}] Index for '{field_name}' rebuilt: {n} entries — saved to {new_idx.path}")
        logger.info("rebuild_index: field '%s' rebuilt (%d entries)", field_name, n)

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    def query(self, filter_dict: dict) -> list:
        """
        Execute a declarative query dict and return the matching record IDs.

        * ``{}`` returns every record ID, sorted (``[]`` without an index manager).
        * A filter containing ``"id"`` returns the given ID(s) as-is, in the
          given order, without consulting the other clauses or the store.
        * Otherwise every clause is evaluated and the results are
          AND-intersected and returned sorted.

        Raises QuerySyntaxError for malformed input.
        """
        if not isinstance(filter_dict, dict):
            raise QuerySyntaxError("filter_dict must be a dict")

        # Empty filter → return all record IDs
        if not filter_dict:
            idx_mgr = self._store.index_manager
            if idx_mgr is None:
                return []
            return sorted(idx_mgr.all_record_ids())

        # ID fast path
        if "id" in filter_dict:
            ids = filter_dict["id"]
            if isinstance(ids, str):
                return [ids]
            if isinstance(ids, list):
                return list(ids)
            raise QuerySyntaxError("'id' value must be a string or list of strings")

        result = None
        for field, value in filter_dict.items():
            candidates = self._eval_field(field, value)
            if not candidates:          # short-circuit: the intersection is empty
                return []
            result = candidates if result is None else result & candidates

        return sorted(result) if result is not None else []

    def _require_numeric_index(self, field: str):
        """Return the numeric index for ``field`` or raise QuerySyntaxError."""
        idx = self._numeric_indexes.get(field)
        if idx is None:
            raise QuerySyntaxError(
                f"Field '{field}' has no numeric index. "
                "Mark it queryable='numeric' in schema_config."
            )
        return idx

    def _eval_field(self, field: str, value) -> set:
        """Evaluate one field clause and return a set of matching record IDs.

        The clause type is chosen by the Python type of ``value``:
        int/float → exact numeric match, str → exact text match by scanning
        every record, list → full-text AND search, dict → numeric range.
        Raises QuerySyntaxError for a missing index, an unknown range
        operator, or an unsupported value type.
        """
        if isinstance(value, (int, float)):
            # Exact numeric match
            return self._require_numeric_index(field).exact_query(value)

        if isinstance(value, str):
            # Exact text match — linear scan
            result = set()
            idx_mgr = self._store.index_manager
            if idx_mgr is None:
                return result
            for record_id in idx_mgr.all_record_ids():
                rec = self._store.read(record_id)
                if rec and rec.get(field) == value:
                    result.add(record_id)
            return result

        if isinstance(value, list):
            # Partial / tokenised full-text search (AND across keywords)
            idx = self._full_text_indexes.get(field)
            if idx is None:
                raise QuerySyntaxError(
                    f"Field '{field}' has no full-text index. "
                    "Mark it queryable='full-text' in schema_config."
                )
            return idx.search(value, mode="and")

        if isinstance(value, dict):
            # Numeric range query — keys are operators, values are numbers
            idx = self._require_numeric_index(field)
            lo = hi = None
            lo_inc = hi_inc = True
            for op, bound in value.items():
                op = _OP_ALIASES.get(op, op)   # normalise gte/lte/gt/lt
                if op not in OPERATORS:
                    raise QuerySyntaxError(
                        f"Unsupported operator '{op}'. Must be one of {sorted(OPERATORS)}."
                    )
                if op in (">=", ">"):
                    lo, lo_inc = bound, op == ">="
                else:
                    hi, hi_inc = bound, op == "<="
            return idx.range_query(lo=lo, hi=hi, lo_inclusive=lo_inc, hi_inclusive=hi_inc)

        raise QuerySyntaxError(
            f"Unsupported value type {type(value).__name__!r} for field '{field}'. "
            "Use int/float (exact numeric), str (exact text), "
            "list (keyword search), or dict (range query)."
        )
|