lithos 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +28 -0
- data/LICENSE.txt +21 -0
- data/README.md +101 -0
- data/ext/lithos/extconf.rb +20 -0
- data/ext/lithos/lithos.cpp +1026 -0
- data/lib/lithos/version.rb +5 -0
- data/lib/lithos.rb +170 -0
- metadata +119 -0
|
@@ -0,0 +1,1026 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* lithos — a small embedded, ordered, crash-safe key-value store for Ruby.
|
|
3
|
+
*
|
|
4
|
+
* Engine: a log-structured merge (LSM) tree.
|
|
5
|
+
* - Every put/delete is appended to a write-ahead log (WAL) with a CRC32 and,
|
|
6
|
+
* in sync mode, fsync'd before the call returns (durable-on-return).
|
|
7
|
+
* - Live data sits in an in-memory sorted memtable (std::map, memcmp order).
|
|
8
|
+
* - When the memtable grows past a threshold it is flushed to an immutable,
|
|
9
|
+
* sorted SSTable file (sparse index + bloom filter + footer/magic), and a
|
|
10
|
+
* fresh WAL is started.
|
|
11
|
+
* - Reads check the memtable, then SSTables newest->oldest (bloom-filtered).
|
|
12
|
+
* - compact() merges all SSTables into one, dropping shadowed keys/tombstones.
|
|
13
|
+
* - The live set (SSTables + active WAL + file counter) is recorded in a
|
|
14
|
+
* MANIFEST that is rewritten atomically (temp + FlushFileBuffers + rename).
|
|
15
|
+
* - On open, the MANIFEST is read and the active WAL replayed; a torn/garbage
|
|
16
|
+
* WAL tail is detected by CRC and discarded.
|
|
17
|
+
*
|
|
18
|
+
* Keys and values are arbitrary BINARY strings (embedded NULs allowed); ordering
|
|
19
|
+
* is lexicographic over unsigned bytes (memcmp). Single-writer (exclusive dir
|
|
20
|
+
* lock); a DB is not safe to share across threads.
|
|
21
|
+
*
|
|
22
|
+
* Build (mswin/MSVC): <ruby.h> before <windows.h>; $CXXFLAGS << " -EHsc".
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
#include <ruby.h>
|
|
26
|
+
#include <ruby/encoding.h>
|
|
27
|
+
|
|
28
|
+
#define WIN32_LEAN_AND_MEAN
|
|
29
|
+
#include <windows.h>
|
|
30
|
+
#include <io.h>
|
|
31
|
+
|
|
32
|
+
#include <stdint.h>
|
|
33
|
+
#include <string>
|
|
34
|
+
#include <map>
|
|
35
|
+
#include <vector>
|
|
36
|
+
#include <queue>
|
|
37
|
+
#include <algorithm>
|
|
38
|
+
#include <cstring>
|
|
39
|
+
#include <cstdio>
|
|
40
|
+
|
|
41
|
+
/* ===================== little-endian + varint + crc + hash ============== */
|
|
42
|
+
|
|
43
|
+
/* Append v to b as 4 bytes, least-significant byte first (little-endian). */
static void put_u32(std::string &b, uint32_t v) {
    for (int shift = 0; shift < 32; shift += 8) {
        b.push_back(static_cast<char>((v >> shift) & 0xFF));
    }
}
|
|
44
|
+
/* Append v to b as 8 bytes, least-significant byte first (little-endian). */
static void put_u64(std::string &b, uint64_t v) {
    for (int shift = 0; shift < 64; shift += 8) {
        b.push_back(static_cast<char>((v >> shift) & 0xFF));
    }
}
|
|
45
|
+
/* Decode a little-endian u32 from the 4 bytes at p (caller guarantees the bytes exist). */
static uint32_t get_u32(const uint8_t *p) {
    return  (uint32_t)p[0]
         | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16)
         | ((uint32_t)p[3] << 24);
}
|
|
46
|
+
/* Decode a little-endian u64 from the 8 bytes at p (caller guarantees the bytes exist). */
static uint64_t get_u64(const uint8_t *p) {
    uint64_t v = 0;
    /* Fold from the most-significant byte (p[7]) down to p[0]. */
    for (int i = 7; i >= 0; i--) {
        v = (v << 8) | (uint64_t)p[i];
    }
    return v;
}
|
|
47
|
+
|
|
48
|
+
/* Append v to b as a base-128 varint: 7 payload bits per byte, low group
   first, bit 0x80 set on every byte except the last. Zero encodes as one
   0x00 byte. */
static void put_varint(std::string &b, uint64_t v) {
    while (v >= 0x80) {
        b.push_back((char)(uint8_t)((v & 0x7F) | 0x80));
        v >>= 7;
    }
    b.push_back((char)(uint8_t)v);
}
|
|
51
|
+
/* Decode one base-128 varint from [p, end).
 *
 *   p    in/out cursor; advanced past every byte consumed, also on failure.
 *   end  one past the last readable byte.
 *   out  receives the decoded value; written only on success.
 *
 * Returns false when the input ends before a terminating byte (high bit
 * clear) is seen, or when the encoding does not fit in 64 bits. A 64-bit
 * value needs at most 10 bytes, and the 10th byte (shift 63) may only carry
 * the single remaining bit: any larger payload there would be shifted out
 * of the result silently, so it is rejected as overflow. */
static bool get_varint(const uint8_t *&p, const uint8_t *end, uint64_t *out) {
    uint64_t v = 0;
    int sh = 0;
    while (p < end) {
        const uint8_t b = *p++;
        const uint64_t bits = (uint64_t)(b & 0x7F);
        if (sh == 63 && bits > 1) return false;     /* would overflow 64 bits */
        v |= bits << sh;
        if (!(b & 0x80)) { *out = v; return true; }
        sh += 7;
        if (sh > 63) return false;                  /* an 11th byte is never valid */
    }
    return false;                                   /* ran out of input */
}
|
|
62
|
+
|
|
63
|
+
/* Lookup table for the reflected CRC-32 polynomial 0xEDB88320, filled lazily
   on first use; crc_ready records whether that has happened. */
static uint32_t crc_table[256];
static bool crc_ready = false;

/* Populate crc_table: entry i is the CRC remainder of the single byte i. */
static void crc32_init(void) {
    for (uint32_t byte = 0; byte < 256; byte++) {
        uint32_t rem = byte;
        for (int bit = 0; bit < 8; bit++) {
            const uint32_t lsb = rem & 1u;
            rem >>= 1;
            if (lsb) rem ^= 0xEDB88320u;
        }
        crc_table[byte] = rem;
    }
    crc_ready = true;
}

/* CRC-32 of the n bytes at b (initial value and final XOR both 0xFFFFFFFF).
   Builds the table on the first call. */
static uint32_t crc32_buf(const uint8_t *b, size_t n) {
    if (!crc_ready) crc32_init();
    uint32_t crc = 0xFFFFFFFFu;
    for (size_t k = 0; k < n; k++) {
        const uint32_t slot = (crc ^ b[k]) & 0xFF;
        crc = (crc >> 8) ^ crc_table[slot];
    }
    return crc ^ 0xFFFFFFFFu;
}
|
|
79
|
+
/* FNV-1a style 64-bit hash over the n bytes at p: xor each byte in, then
   multiply by the 64-bit FNV prime.
   NOTE: the seed below is NOT the canonical FNV-1a 64-bit offset basis
   (14695981039346656037). It must stay as it is: build_sstable() stores
   bloom bits derived from this hash in SSTable files and
   SSTable::bloom_maybe() recomputes them on read, so changing the seed would
   make existing filters answer "absent" for keys that are present. */
static uint64_t fnv1a64(const char *p, size_t n) {
    const uint64_t kSeed  = 1469598103934665603ull;
    const uint64_t kPrime = 1099511628211ull;
    uint64_t hash = kSeed;
    for (const char *cur = p, *stop = p + n; cur != stop; ++cur) {
        hash = (hash ^ (uint8_t)*cur) * kPrime;
    }
    return hash;
}
|
|
84
|
+
|
|
85
|
+
/* Strict-weak "less than" over byte strings: compare the common prefix with
   memcmp (unsigned bytes); if the prefixes match, the shorter string sorts
   first. Used as the ordering of the Memtable std::map. */
struct ByteCmp {
    bool operator()(const std::string &a, const std::string &b) const {
        const size_t la = a.size();
        const size_t lb = b.size();
        const size_t common = (lb < la) ? lb : la;
        if (common != 0) {
            const int c = memcmp(a.data(), b.data(), common);
            if (c < 0) return true;
            if (c > 0) return false;
        }
        return la < lb;
    }
};
|
|
94
|
+
/* Three-way form of the ByteCmp ordering: returns -1, 0 or 1 as a sorts
   before, equal to, or after b (memcmp over the common prefix, then length
   as the tiebreak). */
static int bytecmp(const std::string &a, const std::string &b) {
    const size_t la = a.size();
    const size_t lb = b.size();
    const size_t common = (lb < la) ? lb : la;
    if (common != 0) {
        const int c = memcmp(a.data(), b.data(), common);
        if (c < 0) return -1;
        if (c > 0) return 1;
    }
    if (la < lb) return -1;
    if (la > lb) return 1;
    return 0;
}
|
|
101
|
+
|
|
102
|
+
/* ===================== records / memtable =============================== */
|
|
103
|
+
|
|
104
|
+
enum { KIND_VALUE = 0, KIND_TOMB = 1 };
|
|
105
|
+
struct Record { uint8_t kind; std::string value; };
|
|
106
|
+
typedef std::map<std::string, Record, ByteCmp> Memtable;
|
|
107
|
+
|
|
108
|
+
static VALUE mLithos, cDB, eError;
|
|
109
|
+
|
|
110
|
+
/* ===================== path / file helpers ============================== */
|
|
111
|
+
|
|
112
|
+
static std::wstring utf8_to_wide(const char *s, size_t n) {
|
|
113
|
+
if (n == 0) return std::wstring();
|
|
114
|
+
int need = MultiByteToWideChar(CP_UTF8, 0, s, (int)n, NULL, 0);
|
|
115
|
+
std::wstring w((size_t)(need > 0 ? need : 0), L'\0');
|
|
116
|
+
if (need > 0) MultiByteToWideChar(CP_UTF8, 0, s, (int)n, &w[0], need);
|
|
117
|
+
return w;
|
|
118
|
+
}
|
|
119
|
+
/* Join `dir` and the NUL-terminated narrow `name` into one wide path. A
   backslash is inserted unless dir is empty or already ends in '\' or '/'.
   Each byte of name is widened as an unsigned value (no charset conversion). */
static std::wstring child_path(const std::wstring &dir, const char *name) {
    std::wstring path(dir);
    if (!path.empty()) {
        const wchar_t last = path[path.size() - 1];
        if (last != L'\\' && last != L'/') path += L'\\';
    }
    while (*name) {
        path += (wchar_t)(unsigned char)*name;
        ++name;
    }
    return path;
}
|
|
125
|
+
static std::wstring numbered(const std::wstring &dir, uint64_t num, const char *ext) {
|
|
126
|
+
char buf[64];
|
|
127
|
+
snprintf(buf, sizeof(buf), "%06llu%s", (unsigned long long)num, ext);
|
|
128
|
+
return child_path(dir, buf);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/* Write all `len` bytes of `buf` to `h`, looping over short writes and
   capping each WriteFile call at 1 GiB. Returns false if WriteFile fails or
   reports zero bytes written. */
static bool write_all(HANDLE h, const void *buf, size_t len) {
    const DWORD kMaxChunk = 0x40000000u;
    const char *cursor = (const char *)buf;
    size_t left = len;
    while (left != 0) {
        const DWORD want = (left > kMaxChunk) ? kMaxChunk : (DWORD)left;
        DWORD done = 0;
        const BOOL ok = WriteFile(h, cursor, want, &done, NULL);
        if (!ok || done == 0) return false;
        cursor += done;
        left -= done;
    }
    return true;
}
|
|
141
|
+
/* Flush the file's buffers to stable storage; true when FlushFileBuffers succeeds. */
static bool fsync_handle(HANDLE h) {
    const BOOL flushed = FlushFileBuffers(h);
    return flushed != 0;
}
|
|
142
|
+
|
|
143
|
+
/* Transient locks (antivirus / the search indexer briefly holding a freshly
|
|
144
|
+
touched file) surface as ERROR_ACCESS_DENIED / ERROR_SHARING_VIOLATION on
|
|
145
|
+
rename/delete/create. Retry those a few times before giving up. */
|
|
146
|
+
static bool transient_err(DWORD e) {
|
|
147
|
+
return e == ERROR_ACCESS_DENIED || e == ERROR_SHARING_VIOLATION;
|
|
148
|
+
}
|
|
149
|
+
/* CreateFileW with up to 60 attempts, sleeping 10 ms after each failure that
   transient_err() classifies as retryable. Any other failure ends the loop
   at once. Returns the handle, or INVALID_HANDLE_VALUE. */
static HANDLE create_retry(const std::wstring &path, DWORD access, DWORD share,
                           DWORD disp, DWORD flags) {
    int attempts_left = 60;
    while (attempts_left-- > 0) {
        HANDLE h = CreateFileW(path.c_str(), access, share, NULL, disp, flags, NULL);
        if (h != INVALID_HANDLE_VALUE) return h;
        if (!transient_err(GetLastError())) break;
        Sleep(10);
    }
    return INVALID_HANDLE_VALUE;
}
|
|
159
|
+
static bool move_replace_retry(const std::wstring &from, const std::wstring &to) {
|
|
160
|
+
for (int i = 0; i < 60; i++) {
|
|
161
|
+
if (MoveFileExW(from.c_str(), to.c_str(), MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH))
|
|
162
|
+
return true;
|
|
163
|
+
if (!transient_err(GetLastError())) return false;
|
|
164
|
+
Sleep(10);
|
|
165
|
+
}
|
|
166
|
+
return false;
|
|
167
|
+
}
|
|
168
|
+
static void delete_retry(const std::wstring &path) {
|
|
169
|
+
for (int i = 0; i < 60; i++) {
|
|
170
|
+
if (DeleteFileW(path.c_str())) return;
|
|
171
|
+
DWORD e = GetLastError();
|
|
172
|
+
if (e == ERROR_FILE_NOT_FOUND || e == ERROR_PATH_NOT_FOUND) return;
|
|
173
|
+
if (!transient_err(e)) return;
|
|
174
|
+
Sleep(10);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/* Write `data` to a temp file, fsync, then atomically rename over `final`. */
|
|
179
|
+
static bool publish_file(const std::wstring &final_path, const std::string &data) {
|
|
180
|
+
std::wstring tmp = final_path + L".tmp";
|
|
181
|
+
HANDLE h = create_retry(tmp, GENERIC_WRITE, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL);
|
|
182
|
+
if (h == INVALID_HANDLE_VALUE) return false;
|
|
183
|
+
bool ok = write_all(h, data.data(), data.size()) && fsync_handle(h);
|
|
184
|
+
CloseHandle(h);
|
|
185
|
+
if (!ok) { delete_retry(tmp); return false; }
|
|
186
|
+
if (!move_replace_retry(tmp, final_path)) { delete_retry(tmp); return false; }
|
|
187
|
+
return true;
|
|
188
|
+
}
|
|
189
|
+
/* Read an entire file into `out`. Returns false if it doesn't exist / fails. */
|
|
190
|
+
static bool read_whole(const std::wstring &path, std::string &out) {
|
|
191
|
+
HANDLE h = CreateFileW(path.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL,
|
|
192
|
+
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
193
|
+
if (h == INVALID_HANDLE_VALUE) return false;
|
|
194
|
+
LARGE_INTEGER sz;
|
|
195
|
+
if (!GetFileSizeEx(h, &sz)) { CloseHandle(h); return false; }
|
|
196
|
+
out.resize((size_t)sz.QuadPart);
|
|
197
|
+
size_t off = 0;
|
|
198
|
+
bool ok = true;
|
|
199
|
+
while (off < out.size()) {
|
|
200
|
+
DWORD want = (DWORD)((out.size() - off > 0x40000000u) ? 0x40000000u : (out.size() - off));
|
|
201
|
+
DWORD got = 0;
|
|
202
|
+
if (!ReadFile(h, &out[off], want, &got, NULL) || got == 0) { ok = false; break; }
|
|
203
|
+
off += got;
|
|
204
|
+
}
|
|
205
|
+
CloseHandle(h);
|
|
206
|
+
return ok;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
/* ===================== SSTable format ===================================
|
|
210
|
+
* data: [ entry ]* (ascending key)
|
|
211
|
+
* entry = varint klen | key | u8 kind | varint vlen | value
|
|
212
|
+
* bloom: u32 k | u32 m | bitmap[ceil(m/8)]
|
|
213
|
+
* index: [ varint klen | key | u64 dataoffset ]* (sparse, ascending)
|
|
214
|
+
* footer (40 bytes, at EOF):
|
|
215
|
+
* u64 index_off | u64 index_len | u64 bloom_off | u64 bloom_len | u64 MAGIC
|
|
216
|
+
* ====================================================================== */
|
|
217
|
+
|
|
218
|
+
static const uint64_t SST_MAGIC = 0x4C495448534F5331ull; /* "LITHSOS1" */
|
|
219
|
+
static const size_t SST_FOOTER = 40;
|
|
220
|
+
static const int SST_INDEX_STRIDE = 16; /* one sparse index entry per N data entries */
|
|
221
|
+
|
|
222
|
+
/* Build an SSTable image (as a byte string) from sorted entries. */
|
|
223
|
+
struct OutEntry { std::string key; uint8_t kind; std::string value; };
|
|
224
|
+
|
|
225
|
+
static std::string build_sstable(const std::vector<OutEntry> &entries) {
|
|
226
|
+
std::string data, index;
|
|
227
|
+
size_t n = entries.size();
|
|
228
|
+
|
|
229
|
+
/* bloom params: ~10 bits/key, k≈7 */
|
|
230
|
+
uint32_t m = (uint32_t)(n * 10 + 64);
|
|
231
|
+
uint32_t kfun = 7;
|
|
232
|
+
std::string bloom_bits((m + 7) / 8, '\0');
|
|
233
|
+
|
|
234
|
+
for (size_t i = 0; i < n; i++) {
|
|
235
|
+
const OutEntry &e = entries[i];
|
|
236
|
+
if ((i % SST_INDEX_STRIDE) == 0) {
|
|
237
|
+
put_varint(index, e.key.size());
|
|
238
|
+
index.append(e.key);
|
|
239
|
+
put_u64(index, (uint64_t)data.size());
|
|
240
|
+
}
|
|
241
|
+
put_varint(data, e.key.size());
|
|
242
|
+
data.append(e.key);
|
|
243
|
+
data.push_back((char)e.kind);
|
|
244
|
+
put_varint(data, e.value.size());
|
|
245
|
+
data.append(e.value);
|
|
246
|
+
|
|
247
|
+
uint64_t hv = fnv1a64(e.key.data(), e.key.size());
|
|
248
|
+
uint32_t h1 = (uint32_t)hv, h2 = (uint32_t)(hv >> 32);
|
|
249
|
+
for (uint32_t j = 0; j < kfun; j++) {
|
|
250
|
+
uint32_t bit = (uint32_t)((uint64_t)h1 + (uint64_t)j * h2) % m;
|
|
251
|
+
bloom_bits[bit >> 3] |= (char)(1 << (bit & 7));
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
std::string out;
|
|
256
|
+
out.append(data);
|
|
257
|
+
uint64_t bloom_off = out.size();
|
|
258
|
+
put_u32(out, kfun);
|
|
259
|
+
put_u32(out, m);
|
|
260
|
+
out.append(bloom_bits);
|
|
261
|
+
uint64_t bloom_len = out.size() - bloom_off;
|
|
262
|
+
uint64_t index_off = out.size();
|
|
263
|
+
out.append(index);
|
|
264
|
+
uint64_t index_len = out.size() - index_off;
|
|
265
|
+
put_u64(out, index_off);
|
|
266
|
+
put_u64(out, index_len);
|
|
267
|
+
put_u64(out, bloom_off);
|
|
268
|
+
put_u64(out, bloom_len);
|
|
269
|
+
put_u64(out, SST_MAGIC);
|
|
270
|
+
return out;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
/* A memory-mapped, immutable SSTable opened for reading. */
|
|
274
|
+
struct SSTable {
|
|
275
|
+
uint64_t number;
|
|
276
|
+
HANDLE hFile, hMap;
|
|
277
|
+
const uint8_t *base;
|
|
278
|
+
size_t size;
|
|
279
|
+
uint64_t data_end; /* data spans [0, data_end) */
|
|
280
|
+
const uint8_t *bloom_bits; /* points into the map */
|
|
281
|
+
uint32_t bloom_k, bloom_m;
|
|
282
|
+
std::vector<std::pair<std::string, uint64_t> > index; /* sparse, ascending */
|
|
283
|
+
|
|
284
|
+
SSTable() : number(0), hFile(INVALID_HANDLE_VALUE), hMap(NULL), base(NULL),
|
|
285
|
+
size(0), data_end(0), bloom_bits(NULL), bloom_k(0), bloom_m(0) {}
|
|
286
|
+
~SSTable() {
|
|
287
|
+
if (base) UnmapViewOfFile(base);
|
|
288
|
+
if (hMap) CloseHandle(hMap);
|
|
289
|
+
if (hFile != INVALID_HANDLE_VALUE) CloseHandle(hFile);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/* Open + parse footer/index/bloom. Returns false on any malformation. */
|
|
293
|
+
bool open(const std::wstring &path, uint64_t num) {
|
|
294
|
+
number = num;
|
|
295
|
+
hFile = CreateFileW(path.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL,
|
|
296
|
+
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
297
|
+
if (hFile == INVALID_HANDLE_VALUE) return false;
|
|
298
|
+
LARGE_INTEGER sz;
|
|
299
|
+
if (!GetFileSizeEx(hFile, &sz) || (uint64_t)sz.QuadPart < SST_FOOTER) return false;
|
|
300
|
+
size = (size_t)sz.QuadPart;
|
|
301
|
+
hMap = CreateFileMappingW(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
|
302
|
+
if (!hMap) return false;
|
|
303
|
+
base = (const uint8_t *)MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, 0);
|
|
304
|
+
if (!base) return false;
|
|
305
|
+
|
|
306
|
+
const uint8_t *foot = base + size - SST_FOOTER;
|
|
307
|
+
uint64_t index_off = get_u64(foot);
|
|
308
|
+
uint64_t index_len = get_u64(foot + 8);
|
|
309
|
+
uint64_t bloom_off = get_u64(foot + 16);
|
|
310
|
+
uint64_t bloom_len = get_u64(foot + 24);
|
|
311
|
+
uint64_t magic = get_u64(foot + 32);
|
|
312
|
+
if (magic != SST_MAGIC) return false;
|
|
313
|
+
/* Subtractive bounds checks: the additive form `off + len > size` can
|
|
314
|
+
wrap mod 2^64 for an attacker/corruption-controlled 64-bit len and
|
|
315
|
+
bypass the check. Establish `off <= size` first, then `len <= size-off`. */
|
|
316
|
+
if (bloom_off > size || bloom_len > size - bloom_off || bloom_len < 8) return false;
|
|
317
|
+
if (index_off > size || index_len > size - index_off) return false;
|
|
318
|
+
if (bloom_off > index_off) return false;
|
|
319
|
+
|
|
320
|
+
data_end = bloom_off;
|
|
321
|
+
bloom_k = get_u32(base + bloom_off);
|
|
322
|
+
bloom_m = get_u32(base + bloom_off + 4);
|
|
323
|
+
bloom_bits = base + bloom_off + 8;
|
|
324
|
+
/* (bloom_m + 7) in 32-bit would wrap for bloom_m near 2^32; widen first. */
|
|
325
|
+
uint64_t bloom_need = 8 + ((uint64_t)bloom_m + 7) / 8;
|
|
326
|
+
if (bloom_m == 0 || bloom_need > bloom_len) return false;
|
|
327
|
+
|
|
328
|
+
const uint8_t *p = base + index_off;
|
|
329
|
+
const uint8_t *iend = p + index_len;
|
|
330
|
+
while (p < iend) {
|
|
331
|
+
uint64_t klen;
|
|
332
|
+
if (!get_varint(p, iend, &klen) || klen > (uint64_t)(iend - p)) return false;
|
|
333
|
+
std::string key((const char *)p, (size_t)klen); p += klen;
|
|
334
|
+
if ((uint64_t)(iend - p) < 8) return false;
|
|
335
|
+
uint64_t off = get_u64(p); p += 8;
|
|
336
|
+
if (off > data_end) return false;
|
|
337
|
+
index.push_back(std::make_pair(key, off));
|
|
338
|
+
}
|
|
339
|
+
return true;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
bool bloom_maybe(const std::string &key) const {
|
|
343
|
+
uint64_t hv = fnv1a64(key.data(), key.size());
|
|
344
|
+
uint32_t h1 = (uint32_t)hv, h2 = (uint32_t)(hv >> 32);
|
|
345
|
+
for (uint32_t j = 0; j < bloom_k; j++) {
|
|
346
|
+
uint32_t bit = (uint32_t)((uint64_t)h1 + (uint64_t)j * h2) % bloom_m;
|
|
347
|
+
if (!(bloom_bits[bit >> 3] & (1 << (bit & 7)))) return false;
|
|
348
|
+
}
|
|
349
|
+
return true;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
/* Offset to start scanning for `key` (largest sparse-index key <= key). */
|
|
353
|
+
uint64_t start_offset(const std::string &key) const {
|
|
354
|
+
size_t lo = 0, hi = index.size();
|
|
355
|
+
while (lo < hi) { /* first index entry with key > target */
|
|
356
|
+
size_t mid = (lo + hi) / 2;
|
|
357
|
+
if (bytecmp(index[mid].first, key) <= 0) lo = mid + 1; else hi = mid;
|
|
358
|
+
}
|
|
359
|
+
return lo == 0 ? 0 : index[lo - 1].second;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
/* Parse the entry at p (within [base,data_end)); advance p. false if malformed. */
|
|
363
|
+
bool parse_entry(const uint8_t *&p, std::string &key, uint8_t &kind, std::string &value) const {
|
|
364
|
+
const uint8_t *end = base + data_end;
|
|
365
|
+
uint64_t klen;
|
|
366
|
+
if (!get_varint(p, end, &klen) || klen > (uint64_t)(end - p)) return false;
|
|
367
|
+
key.assign((const char *)p, (size_t)klen); p += klen;
|
|
368
|
+
if (p >= end) return false; /* need the kind byte */
|
|
369
|
+
kind = *p++;
|
|
370
|
+
uint64_t vlen;
|
|
371
|
+
if (!get_varint(p, end, &vlen) || vlen > (uint64_t)(end - p)) return false;
|
|
372
|
+
value.assign((const char *)p, (size_t)vlen); p += vlen;
|
|
373
|
+
return true;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
/* 0 = absent, 1 = value (sets *out), 2 = tombstone. */
|
|
377
|
+
int get(const std::string &key, std::string *out) const {
|
|
378
|
+
if (!bloom_maybe(key)) return 0;
|
|
379
|
+
const uint8_t *p = base + start_offset(key);
|
|
380
|
+
const uint8_t *end = base + data_end;
|
|
381
|
+
std::string k, v; uint8_t kind;
|
|
382
|
+
while (p < end) {
|
|
383
|
+
if (!parse_entry(p, k, kind, v)) return 0;
|
|
384
|
+
int c = bytecmp(k, key);
|
|
385
|
+
if (c == 0) { if (kind == KIND_VALUE) { *out = v; return 1; } return 2; }
|
|
386
|
+
if (c > 0) return 0;
|
|
387
|
+
}
|
|
388
|
+
return 0;
|
|
389
|
+
}
|
|
390
|
+
};
|
|
391
|
+
|
|
392
|
+
/* ===================== merge iterator =================================== */
|
|
393
|
+
|
|
394
|
+
struct Source {
|
|
395
|
+
bool is_mem;
|
|
396
|
+
uint64_t recency; /* memtable highest; SSTables by number */
|
|
397
|
+
/* memtable */
|
|
398
|
+
Memtable::const_iterator mit, mend;
|
|
399
|
+
/* sstable */
|
|
400
|
+
const SSTable *sst;
|
|
401
|
+
const uint8_t *p, *pend;
|
|
402
|
+
/* current head */
|
|
403
|
+
std::string key, value;
|
|
404
|
+
uint8_t kind;
|
|
405
|
+
bool valid;
|
|
406
|
+
/* upper bound */
|
|
407
|
+
const std::string *upper; bool upper_incl;
|
|
408
|
+
|
|
409
|
+
bool within_upper(const std::string &k) const {
|
|
410
|
+
if (!upper) return true;
|
|
411
|
+
int c = bytecmp(k, *upper);
|
|
412
|
+
return upper_incl ? (c <= 0) : (c < 0);
|
|
413
|
+
}
|
|
414
|
+
void load_current() {
|
|
415
|
+
if (is_mem) {
|
|
416
|
+
if (mit == mend) { valid = false; return; }
|
|
417
|
+
key = mit->first; kind = mit->second.kind; value = mit->second.value;
|
|
418
|
+
} /* sstable head already parsed into key/kind/value by advance */
|
|
419
|
+
valid = within_upper(key);
|
|
420
|
+
}
|
|
421
|
+
void advance() {
|
|
422
|
+
if (is_mem) {
|
|
423
|
+
++mit;
|
|
424
|
+
if (mit == mend) { valid = false; return; }
|
|
425
|
+
key = mit->first; kind = mit->second.kind; value = mit->second.value;
|
|
426
|
+
valid = within_upper(key);
|
|
427
|
+
} else {
|
|
428
|
+
if (p >= pend) { valid = false; return; }
|
|
429
|
+
if (!sst->parse_entry(p, key, kind, value)) { valid = false; return; }
|
|
430
|
+
valid = within_upper(key);
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
};
|
|
434
|
+
|
|
435
|
+
/* Pick the valid source with the smallest key; ties go to the newest (highest
|
|
436
|
+
recency). Returns its index, or -1 when all sources are exhausted. */
|
|
437
|
+
static int merge_pick(std::vector<Source> &src) {
|
|
438
|
+
int best = -1;
|
|
439
|
+
for (int i = 0; i < (int)src.size(); i++) {
|
|
440
|
+
if (!src[i].valid) continue;
|
|
441
|
+
if (best < 0) { best = i; continue; }
|
|
442
|
+
int c = bytecmp(src[i].key, src[best].key);
|
|
443
|
+
if (c < 0 || (c == 0 && src[i].recency > src[best].recency)) best = i;
|
|
444
|
+
}
|
|
445
|
+
return best;
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
/* Collecting merge for compaction (no Ruby calls — C++ temporaries are fine). */
|
|
449
|
+
static void merge_collect(std::vector<Source> &src, std::vector<OutEntry> &out, bool drop_tombstones) {
|
|
450
|
+
int best;
|
|
451
|
+
while ((best = merge_pick(src)) >= 0) {
|
|
452
|
+
std::string key = src[best].key, value = src[best].value;
|
|
453
|
+
uint8_t kind = src[best].kind;
|
|
454
|
+
src[best].advance();
|
|
455
|
+
for (int j = 0; j < (int)src.size(); j++) { /* drain older duplicates */
|
|
456
|
+
if (j == best) continue;
|
|
457
|
+
while (src[j].valid && bytecmp(src[j].key, key) == 0) src[j].advance();
|
|
458
|
+
}
|
|
459
|
+
if (kind == KIND_TOMB) {
|
|
460
|
+
if (!drop_tombstones) { OutEntry e; e.key = key; e.kind = KIND_TOMB; e.value.clear(); out.push_back(e); }
|
|
461
|
+
continue;
|
|
462
|
+
}
|
|
463
|
+
OutEntry e; e.key = key; e.kind = KIND_VALUE; e.value = value; out.push_back(e);
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
/* Build the [key,value] strings and yield — runs under rb_protect so any
|
|
468
|
+
exception/break from the user block (or a NoMemError from rb_str_new) is
|
|
469
|
+
captured locally instead of longjmp-ing through C++ frames. */
|
|
470
|
+
struct YieldArg { const std::string *k; const std::string *v; };
|
|
471
|
+
static VALUE merge_yield_one(VALUE arg) {
|
|
472
|
+
YieldArg *y = (YieldArg *)arg;
|
|
473
|
+
VALUE pair[2];
|
|
474
|
+
pair[0] = rb_str_new(y->k->data(), (long)y->k->size());
|
|
475
|
+
pair[1] = rb_str_new(y->v->data(), (long)y->v->size());
|
|
476
|
+
rb_enc_associate(pair[0], rb_ascii8bit_encoding());
|
|
477
|
+
rb_enc_associate(pair[1], rb_ascii8bit_encoding());
|
|
478
|
+
return rb_yield_values2(2, pair);
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
/* Yielding merge for #each/#scan. ek/ev are caller-owned (heap) buffers. The
|
|
482
|
+
only Ruby calls that can raise happen inside rb_protect, so NO longjmp ever
|
|
483
|
+
crosses this C++ frame. Returns 0 normally, or a non-zero Ruby tag (break /
|
|
484
|
+
exception) for the caller to re-raise with rb_jump_tag from a clean frame. */
|
|
485
|
+
static int merge_yield(std::vector<Source> &src, std::string &ek, std::string &ev) {
|
|
486
|
+
int best;
|
|
487
|
+
while ((best = merge_pick(src)) >= 0) {
|
|
488
|
+
ek = src[best].key; ev = src[best].value;
|
|
489
|
+
uint8_t kind = src[best].kind;
|
|
490
|
+
src[best].advance();
|
|
491
|
+
for (int j = 0; j < (int)src.size(); j++) {
|
|
492
|
+
if (j == best) continue;
|
|
493
|
+
while (src[j].valid && bytecmp(src[j].key, ek) == 0) src[j].advance();
|
|
494
|
+
}
|
|
495
|
+
if (kind == KIND_TOMB) continue;
|
|
496
|
+
YieldArg y; y.k = &ek; y.v = &ev;
|
|
497
|
+
int state = 0;
|
|
498
|
+
rb_protect(merge_yield_one, (VALUE)&y, &state);
|
|
499
|
+
if (state) return state;
|
|
500
|
+
}
|
|
501
|
+
return 0;
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
/* ===================== the database ===================================== */
|
|
505
|
+
|
|
506
|
+
struct DB {
|
|
507
|
+
std::wstring dir;
|
|
508
|
+
std::string dir_utf8;
|
|
509
|
+
HANDLE lock;
|
|
510
|
+
HANDLE wal;
|
|
511
|
+
uint64_t wal_number;
|
|
512
|
+
uint64_t next_number;
|
|
513
|
+
Memtable mem;
|
|
514
|
+
size_t mem_bytes;
|
|
515
|
+
std::vector<SSTable *> ssts; /* ascending by number; newest at back */
|
|
516
|
+
bool sync_writes;
|
|
517
|
+
size_t memtable_limit;
|
|
518
|
+
size_t compact_trigger;
|
|
519
|
+
bool closed;
|
|
520
|
+
char errbuf[256]; /* last engine error; raised by the thin Ruby wrappers */
|
|
521
|
+
|
|
522
|
+
DB() : lock(INVALID_HANDLE_VALUE), wal(INVALID_HANDLE_VALUE), wal_number(0),
|
|
523
|
+
next_number(1), mem_bytes(0), sync_writes(true),
|
|
524
|
+
memtable_limit(4u * 1024 * 1024), compact_trigger(8), closed(true) {
|
|
525
|
+
errbuf[0] = '\0';
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
/* Engine methods NEVER call rb_raise (a longjmp through these /EHsc C++
|
|
529
|
+
frames is UB on MSVC). They record the error here and return false; the
|
|
530
|
+
Ruby-facing wrapper raises from a frame holding no live C++ object. */
|
|
531
|
+
bool fail(const char *what) {
|
|
532
|
+
snprintf(errbuf, sizeof(errbuf), "%s (GetLastError=%lu)", what,
|
|
533
|
+
(unsigned long)GetLastError());
|
|
534
|
+
return false;
|
|
535
|
+
}
|
|
536
|
+
~DB() { release(); }
|
|
537
|
+
|
|
538
|
+
void release() {
|
|
539
|
+
if (wal != INVALID_HANDLE_VALUE) { CloseHandle(wal); wal = INVALID_HANDLE_VALUE; }
|
|
540
|
+
for (size_t i = 0; i < ssts.size(); i++) delete ssts[i];
|
|
541
|
+
ssts.clear();
|
|
542
|
+
if (lock != INVALID_HANDLE_VALUE) { CloseHandle(lock); lock = INVALID_HANDLE_VALUE; }
|
|
543
|
+
closed = true;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
/* ---- MANIFEST: framed [u32 len][u32 crc][payload]; payload =
|
|
547
|
+
u32 MAGIC | u32 ver | u64 next | u64 wal | u32 sstcount | u64 sst* ---- */
|
|
548
|
+
static const uint32_t MAN_MAGIC = 0x4C54484Du; /* "LTHM" */
|
|
549
|
+
std::wstring manifest_path() { return child_path(dir, "MANIFEST"); }
|
|
550
|
+
|
|
551
|
+
bool write_manifest() {
|
|
552
|
+
std::string pay;
|
|
553
|
+
put_u32(pay, MAN_MAGIC);
|
|
554
|
+
put_u32(pay, 1);
|
|
555
|
+
put_u64(pay, next_number);
|
|
556
|
+
put_u64(pay, wal_number);
|
|
557
|
+
put_u32(pay, (uint32_t)ssts.size());
|
|
558
|
+
for (size_t i = 0; i < ssts.size(); i++) put_u64(pay, ssts[i]->number);
|
|
559
|
+
std::string framed;
|
|
560
|
+
put_u32(framed, (uint32_t)pay.size());
|
|
561
|
+
put_u32(framed, crc32_buf((const uint8_t *)pay.data(), pay.size()));
|
|
562
|
+
framed.append(pay);
|
|
563
|
+
return publish_file(manifest_path(), framed);
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
/* Parse MANIFEST -> sst numbers + counters. false if missing/corrupt. */
|
|
567
|
+
bool read_manifest(std::vector<uint64_t> &sst_numbers) {
|
|
568
|
+
std::string buf;
|
|
569
|
+
if (!read_whole(manifest_path(), buf)) return false;
|
|
570
|
+
if (buf.size() < 8) return false;
|
|
571
|
+
const uint8_t *b = (const uint8_t *)buf.data();
|
|
572
|
+
uint32_t len = get_u32(b), crc = get_u32(b + 4);
|
|
573
|
+
if ((uint64_t)8 + len > buf.size()) return false;
|
|
574
|
+
const uint8_t *pay = b + 8;
|
|
575
|
+
if (crc32_buf(pay, len) != crc) return false;
|
|
576
|
+
const uint8_t *p = pay, *end = pay + len;
|
|
577
|
+
if (end - p < 4 + 4 + 8 + 8 + 4) return false;
|
|
578
|
+
if (get_u32(p) != MAN_MAGIC) return false; p += 4;
|
|
579
|
+
p += 4; /* version */
|
|
580
|
+
next_number = get_u64(p); p += 8;
|
|
581
|
+
wal_number = get_u64(p); p += 8;
|
|
582
|
+
uint32_t cnt = get_u32(p); p += 4;
|
|
583
|
+
for (uint32_t i = 0; i < cnt; i++) {
|
|
584
|
+
if (end - p < 8) return false;
|
|
585
|
+
sst_numbers.push_back(get_u64(p)); p += 8;
|
|
586
|
+
}
|
|
587
|
+
return true;
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
/* ---- WAL ---- */
|
|
591
|
+
bool open_wal(uint64_t num, bool create) {
|
|
592
|
+
std::wstring path = numbered(dir, num, ".wal");
|
|
593
|
+
wal = CreateFileW(path.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, NULL,
|
|
594
|
+
create ? OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
595
|
+
if (wal == INVALID_HANDLE_VALUE) return false;
|
|
596
|
+
wal_number = num;
|
|
597
|
+
return true;
|
|
598
|
+
}
|
|
599
|
+
bool wal_seek_end() {
|
|
600
|
+
LARGE_INTEGER zero; zero.QuadPart = 0; LARGE_INTEGER np;
|
|
601
|
+
return SetFilePointerEx(wal, zero, &np, FILE_END) != 0;
|
|
602
|
+
}
|
|
603
|
+
bool wal_append(uint8_t op, const std::string &key, const std::string &value) {
|
|
604
|
+
std::string pay;
|
|
605
|
+
pay.push_back((char)op);
|
|
606
|
+
put_varint(pay, key.size()); pay.append(key);
|
|
607
|
+
if (op == KIND_VALUE) { put_varint(pay, value.size()); pay.append(value); }
|
|
608
|
+
std::string rec;
|
|
609
|
+
put_u32(rec, (uint32_t)pay.size());
|
|
610
|
+
put_u32(rec, crc32_buf((const uint8_t *)pay.data(), pay.size()));
|
|
611
|
+
rec.append(pay);
|
|
612
|
+
if (!write_all(wal, rec.data(), rec.size())) return false;
|
|
613
|
+
if (sync_writes && !fsync_handle(wal)) return false;
|
|
614
|
+
return true;
|
|
615
|
+
}
|
|
616
|
+
/* Replay a WAL file (read by path, BEFORE the writable handle is opened, so
|
|
617
|
+
there is no share conflict) into the memtable. Returns the durable byte
|
|
618
|
+
length: everything before a torn/garbage tail. */
|
|
619
|
+
size_t wal_replay_file(const std::wstring &path) {
|
|
620
|
+
std::string buf;
|
|
621
|
+
if (!read_whole(path, buf)) return 0;
|
|
622
|
+
const uint8_t *b = (const uint8_t *)buf.data();
|
|
623
|
+
size_t fsize = buf.size(), pos = 0, good = 0;
|
|
624
|
+
while (pos + 8 <= fsize) {
|
|
625
|
+
uint32_t len = get_u32(b + pos), crc = get_u32(b + pos + 4);
|
|
626
|
+
if ((uint64_t)pos + 8 + len > fsize) break; /* torn tail */
|
|
627
|
+
const uint8_t *pay = b + pos + 8;
|
|
628
|
+
if (crc32_buf(pay, len) != crc) break; /* bad crc */
|
|
629
|
+
const uint8_t *p = pay, *end = pay + len;
|
|
630
|
+
if (p >= end) break;
|
|
631
|
+
uint8_t op = *p++;
|
|
632
|
+
uint64_t klen;
|
|
633
|
+
if (!get_varint(p, end, &klen) || (uint64_t)(end - p) < klen) break;
|
|
634
|
+
std::string key((const char *)p, (size_t)klen); p += klen;
|
|
635
|
+
if (op == KIND_VALUE) {
|
|
636
|
+
uint64_t vlen;
|
|
637
|
+
if (!get_varint(p, end, &vlen) || (uint64_t)(end - p) < vlen) break;
|
|
638
|
+
std::string val((const char *)p, (size_t)vlen); p += vlen;
|
|
639
|
+
Record r; r.kind = KIND_VALUE; r.value = val;
|
|
640
|
+
mem[key] = r; mem_bytes += key.size() + val.size();
|
|
641
|
+
} else {
|
|
642
|
+
Record r; r.kind = KIND_TOMB;
|
|
643
|
+
mem[key] = r; mem_bytes += key.size();
|
|
644
|
+
}
|
|
645
|
+
pos += 8 + len; good = pos;
|
|
646
|
+
}
|
|
647
|
+
return good;
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
/* ---- flush memtable -> new SSTable, rotate WAL, rewrite manifest.
|
|
651
|
+
Returns false (with errbuf set) on failure; never raises. ---- */
|
|
652
|
+
bool flush() {
|
|
653
|
+
if (mem.empty()) return true;
|
|
654
|
+
uint64_t sst_num = next_number++;
|
|
655
|
+
{
|
|
656
|
+
std::vector<OutEntry> entries;
|
|
657
|
+
entries.reserve(mem.size());
|
|
658
|
+
for (Memtable::const_iterator it = mem.begin(); it != mem.end(); ++it) {
|
|
659
|
+
OutEntry e; e.key = it->first; e.kind = it->second.kind; e.value = it->second.value;
|
|
660
|
+
entries.push_back(e);
|
|
661
|
+
}
|
|
662
|
+
std::string image = build_sstable(entries);
|
|
663
|
+
if (!publish_file(numbered(dir, sst_num, ".sst"), image)) return fail("write SSTable");
|
|
664
|
+
} /* entries + image freed before any later return */
|
|
665
|
+
|
|
666
|
+
SSTable *t = new SSTable();
|
|
667
|
+
if (!t->open(numbered(dir, sst_num, ".sst"), sst_num)) { delete t; return fail("re-open SSTable"); }
|
|
668
|
+
ssts.push_back(t);
|
|
669
|
+
|
|
670
|
+
uint64_t old_wal = wal_number; /* rotate to a fresh WAL */
|
|
671
|
+
uint64_t new_wal = next_number++;
|
|
672
|
+
HANDLE old = wal;
|
|
673
|
+
if (!open_wal(new_wal, true)) { wal = old; return fail("open new WAL"); }
|
|
674
|
+
if (old != INVALID_HANDLE_VALUE) CloseHandle(old);
|
|
675
|
+
|
|
676
|
+
if (!write_manifest()) return fail("write MANIFEST");
|
|
677
|
+
delete_retry(numbered(dir, old_wal, ".wal"));
|
|
678
|
+
|
|
679
|
+
mem.clear();
|
|
680
|
+
mem_bytes = 0;
|
|
681
|
+
return true;
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
bool maybe_flush() {
|
|
685
|
+
if (mem_bytes >= memtable_limit && !flush()) return false;
|
|
686
|
+
if (ssts.size() > compact_trigger && !compact()) return false;
|
|
687
|
+
return true;
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
/* ---- compaction: merge all SSTables into one (drops tombstones).
|
|
691
|
+
Returns false (with errbuf set) on failure; never raises. ---- */
|
|
692
|
+
bool compact() {
|
|
693
|
+
if (!flush()) return false; /* fold the memtable in first */
|
|
694
|
+
if (ssts.size() <= 1) return true;
|
|
695
|
+
|
|
696
|
+
uint64_t sst_num = next_number++;
|
|
697
|
+
{
|
|
698
|
+
std::vector<OutEntry> merged = merge_all(/*drop_tombstones=*/true);
|
|
699
|
+
std::string image = build_sstable(merged);
|
|
700
|
+
if (!publish_file(numbered(dir, sst_num, ".sst"), image)) return fail("write compacted SSTable");
|
|
701
|
+
} /* merged + image freed */
|
|
702
|
+
|
|
703
|
+
SSTable *t = new SSTable();
|
|
704
|
+
if (!t->open(numbered(dir, sst_num, ".sst"), sst_num)) { delete t; return fail("re-open compacted SSTable"); }
|
|
705
|
+
|
|
706
|
+
std::vector<SSTable *> old = ssts;
|
|
707
|
+
ssts.clear();
|
|
708
|
+
ssts.push_back(t);
|
|
709
|
+
if (!write_manifest()) { ssts = old; ssts.push_back(t); return fail("write MANIFEST (compact)"); }
|
|
710
|
+
|
|
711
|
+
for (size_t i = 0; i < old.size(); i++) {
|
|
712
|
+
uint64_t num = old[i]->number;
|
|
713
|
+
delete old[i]; /* unmap+close so the file can be deleted */
|
|
714
|
+
delete_retry(numbered(dir, num, ".sst"));
|
|
715
|
+
}
|
|
716
|
+
return true;
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
/* Acquire the lock, load SSTables from the MANIFEST, replay the WAL (or
|
|
720
|
+
create a fresh DB). Returns false (errbuf set) on failure; never raises. */
|
|
721
|
+
bool bootstrap() {
|
|
722
|
+
CreateDirectoryW(dir.c_str(), NULL); /* ok if it already exists */
|
|
723
|
+
{
|
|
724
|
+
std::wstring lockp = child_path(dir, "LOCK");
|
|
725
|
+
lock = CreateFileW(lockp.c_str(), GENERIC_READ | GENERIC_WRITE, 0 /* exclusive */,
|
|
726
|
+
NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
727
|
+
}
|
|
728
|
+
if (lock == INVALID_HANDLE_VALUE) return fail("could not lock directory (already open elsewhere?)");
|
|
729
|
+
|
|
730
|
+
std::vector<uint64_t> sst_numbers;
|
|
731
|
+
if (read_manifest(sst_numbers)) {
|
|
732
|
+
for (size_t i = 0; i < sst_numbers.size(); i++) {
|
|
733
|
+
SSTable *t = new SSTable();
|
|
734
|
+
if (!t->open(numbered(dir, sst_numbers[i], ".sst"), sst_numbers[i])) {
|
|
735
|
+
delete t;
|
|
736
|
+
snprintf(errbuf, sizeof(errbuf), "SSTable %06llu missing or corrupt",
|
|
737
|
+
(unsigned long long)sst_numbers[i]);
|
|
738
|
+
return false;
|
|
739
|
+
}
|
|
740
|
+
ssts.push_back(t);
|
|
741
|
+
}
|
|
742
|
+
/* Replay the WAL by path FIRST (no writable handle held -> no share
|
|
743
|
+
conflict), then open it for appending and truncate any torn tail. */
|
|
744
|
+
size_t good = wal_replay_file(numbered(dir, wal_number, ".wal"));
|
|
745
|
+
if (!open_wal(wal_number, true)) return fail("open WAL");
|
|
746
|
+
LARGE_INTEGER fsz;
|
|
747
|
+
if (GetFileSizeEx(wal, &fsz) && good < (size_t)fsz.QuadPart) {
|
|
748
|
+
LARGE_INTEGER gp; gp.QuadPart = (LONGLONG)good;
|
|
749
|
+
if (SetFilePointerEx(wal, gp, NULL, FILE_BEGIN)) SetEndOfFile(wal);
|
|
750
|
+
}
|
|
751
|
+
if (!wal_seek_end()) return fail("seek WAL");
|
|
752
|
+
} else {
|
|
753
|
+
next_number = 1;
|
|
754
|
+
uint64_t wal_num = next_number++;
|
|
755
|
+
if (!open_wal(wal_num, true)) return fail("create WAL");
|
|
756
|
+
if (!write_manifest()) return fail("write MANIFEST");
|
|
757
|
+
}
|
|
758
|
+
closed = false;
|
|
759
|
+
return true;
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
/* Full ordered merge into a vector (used by compaction). */
|
|
763
|
+
std::vector<OutEntry> merge_all(bool drop_tombstones) {
|
|
764
|
+
std::vector<OutEntry> out;
|
|
765
|
+
std::vector<Source> src;
|
|
766
|
+
build_sources(src, NULL, false, NULL, false);
|
|
767
|
+
merge_collect(src, out, drop_tombstones);
|
|
768
|
+
return out;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
/* Build merge sources seeked to the lower bound; upper stored per source. */
|
|
772
|
+
void build_sources(std::vector<Source> &src,
|
|
773
|
+
const std::string *lower, bool lower_incl,
|
|
774
|
+
const std::string *upper, bool upper_incl) {
|
|
775
|
+
/* memtable source (recency = max) */
|
|
776
|
+
{
|
|
777
|
+
Source s; s.is_mem = true; s.recency = (uint64_t)-1; s.sst = NULL;
|
|
778
|
+
s.upper = upper; s.upper_incl = upper_incl;
|
|
779
|
+
if (lower) {
|
|
780
|
+
s.mit = lower_incl ? mem.lower_bound(*lower) : mem.upper_bound(*lower);
|
|
781
|
+
} else s.mit = mem.begin();
|
|
782
|
+
s.mend = mem.end();
|
|
783
|
+
s.valid = false;
|
|
784
|
+
if (s.mit != s.mend) { s.key = s.mit->first; s.kind = s.mit->second.kind; s.value = s.mit->second.value; s.valid = s.within_upper(s.key); }
|
|
785
|
+
src.push_back(s);
|
|
786
|
+
}
|
|
787
|
+
for (size_t i = 0; i < ssts.size(); i++) {
|
|
788
|
+
SSTable *t = ssts[i];
|
|
789
|
+
Source s; s.is_mem = false; s.recency = t->number; s.sst = t;
|
|
790
|
+
s.upper = upper; s.upper_incl = upper_incl;
|
|
791
|
+
uint64_t off = lower ? t->start_offset(*lower) : 0;
|
|
792
|
+
s.p = t->base + off; s.pend = t->base + t->data_end;
|
|
793
|
+
s.valid = false;
|
|
794
|
+
/* parse forward to the first in-range key */
|
|
795
|
+
while (s.p < s.pend) {
|
|
796
|
+
if (!t->parse_entry(s.p, s.key, s.kind, s.value)) { s.valid = false; break; }
|
|
797
|
+
if (lower) {
|
|
798
|
+
int c = bytecmp(s.key, *lower);
|
|
799
|
+
if (c < 0 || (c == 0 && !lower_incl)) continue;
|
|
800
|
+
}
|
|
801
|
+
s.valid = s.within_upper(s.key);
|
|
802
|
+
break;
|
|
803
|
+
}
|
|
804
|
+
src.push_back(s);
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
/* (Ordered merge lives in the free functions merge_collect / merge_yield
|
|
809
|
+
above; merge_yield keeps no C++ destructor on the stack across rb_yield.) */
|
|
810
|
+
};
|
|
811
|
+
|
|
812
|
+
/* ===================== Ruby bindings ==================================== */
|
|
813
|
+
|
|
814
|
+
/* Payload stored inside each Lithos::DB Ruby object: a single pointer to the
   C++ DB, which db_alloc creates with `new` and db_free destroys. */
typedef struct { DB *db; } DBWrap;
|
|
815
|
+
|
|
816
|
+
static void db_free(void *p) {
|
|
817
|
+
DBWrap *w = (DBWrap *)p;
|
|
818
|
+
if (w) { delete w->db; w->db = NULL; xfree(w); }
|
|
819
|
+
}
|
|
820
|
+
static size_t db_memsize(const void *p) {
|
|
821
|
+
const DBWrap *w = (const DBWrap *)p;
|
|
822
|
+
size_t n = sizeof(DBWrap);
|
|
823
|
+
if (w && w->db) { n += sizeof(DB) + w->db->mem_bytes; }
|
|
824
|
+
return n;
|
|
825
|
+
}
|
|
826
|
+
/* Typed-data descriptor for Lithos::DB. Positional fields: wrap name, then
   { mark = 0 (none), free = db_free, size = db_memsize }, two zeroed fields,
   and the RUBY_TYPED_FREE_IMMEDIATELY flag. */
static const rb_data_type_t db_type = { "Lithos::DB", { 0, db_free, db_memsize }, 0, 0, RUBY_TYPED_FREE_IMMEDIATELY };
|
|
827
|
+
|
|
828
|
+
/* Allocator for Lithos::DB: creates the Ruby object with its DBWrap payload
 * and attaches a freshly constructed DB.
 *
 * `new DB()` may throw (e.g. std::bad_alloc). A C++ exception must not
 * propagate into the Ruby VM's C frames, so it is caught here and turned into
 * a Lithos::Error. The raise happens AFTER the handler has finished, so no
 * longjmp leaves a catch block. When the raise is taken w->db has not been
 * assigned yet (NOTE(review): assumes TypedData_Make_Struct zero-fills the
 * wrapper, so db_free later sees NULL -- confirm against the Ruby docs). */
static VALUE db_alloc(VALUE klass) {
    DBWrap *w;
    VALUE obj = TypedData_Make_Struct(klass, DBWrap, &db_type, w);

    DB *fresh = NULL;
    try {
        fresh = new DB();
    } catch (...) {
        fresh = NULL;
    }
    if (!fresh) rb_raise(eError, "lithos: out of memory allocating store");

    w->db = fresh;
    return obj;
}
|
|
834
|
+
/* Fetch the DB behind `self` for an operation that needs an open store.
   Raises Lithos::Error if no DB is attached or if the store is closed. */
static DB *get_db(VALUE self) {
    DBWrap *wrap;
    TypedData_Get_Struct(self, DBWrap, &db_type, wrap);

    DB *db = wrap ? wrap->db : NULL;
    if (!db) rb_raise(eError, "lithos: store is not initialized");
    if (db->closed) rb_raise(eError, "lithos: store is closed");
    return db;
}
|
|
841
|
+
/* Copy the raw bytes of a Ruby value into a std::string. StringValue() is
   applied first, so a non-String argument is converted or raises before any
   byte is copied. The explicit length keeps embedded NULs intact. */
static std::string to_bytes(VALUE v) {
    StringValue(v);
    const char *ptr = RSTRING_PTR(v);
    size_t len = (size_t)RSTRING_LEN(v);
    return std::string(ptr, len);
}
|
|
845
|
+
/* Build a Ruby String holding a copy of `s` (length-based, so embedded NULs
   survive) and tag it with the ASCII-8BIT encoding. */
static VALUE from_bytes(const std::string &s) {
    VALUE str = rb_str_new(s.data(), (long)s.size());
    rb_enc_associate(str, rb_ascii8bit_encoding());
    return str;
}
|
|
850
|
+
|
|
851
|
+
/* Lithos::DB#initialize(path, sync: ..., memtable_bytes: ...)
 *
 * Takes one required path plus optional keywords:
 *   sync            truthy/falsy -> db->sync_writes (left untouched when nil)
 *   memtable_bytes  integer      -> db->memtable_limit (left untouched when nil)
 *
 * Raises Lithos::Error if this object already holds an open store, or if
 * bootstrap() fails; in the latter case the DB's resources are released
 * first and the message is taken from db->errbuf. Returns self. */
static VALUE db_initialize(int argc, VALUE *argv, VALUE self) {
    VALUE vpath, vopts;
    rb_scan_args(argc, argv, "1:", &vpath, &vopts);
    StringValue(vpath);

    DBWrap *wrap;
    TypedData_Get_Struct(self, DBWrap, &db_type, wrap);
    DB *db = wrap->db;
    if (!db->closed) {
        rb_raise(eError, "lithos: store is already open"); /* no double-init */
    }

    /* keep the UTF-8 path for #path and derive the wide path from it */
    db->dir_utf8.assign(RSTRING_PTR(vpath), (size_t)RSTRING_LEN(vpath));
    db->dir = utf8_to_wide(db->dir_utf8.data(), db->dir_utf8.size());

    if (!NIL_P(vopts)) {
        VALUE sync_opt = rb_hash_aref(vopts, ID2SYM(rb_intern("sync")));
        if (!NIL_P(sync_opt)) {
            db->sync_writes = RTEST(sync_opt);
        }
        VALUE limit_opt = rb_hash_aref(vopts, ID2SYM(rb_intern("memtable_bytes")));
        if (!NIL_P(limit_opt)) {
            db->memtable_limit = (size_t)NUM2ULL(limit_opt);
        }
    }

    /* All filesystem work (lock, load SSTables, WAL replay) happens in
       bootstrap(), which never raises; we raise here from a clean frame. */
    if (!db->bootstrap()) {
        db->release();
        rb_raise(eError, "lithos: %s", db->errbuf);
    }
    return self;
}
|
|
876
|
+
|
|
877
|
+
/* Lithos::DB#_put(key, value): append the pair to the WAL, store it in the
 * memtable, then let maybe_flush() run.
 *
 * Both arguments go through StringValue() before any C++ object exists, so a
 * TypeError cannot skip a destructor. The byte copies live in an inner scope
 * and are gone before the error (if any) is raised as Lithos::Error with the
 * text from db->errbuf. Returns self. */
static VALUE db_put(VALUE self, VALUE key, VALUE val) {
    DB *db = get_db(self);
    StringValue(key);
    StringValue(val);   /* TypeError here: no C++ object live yet */

    bool ok = false;
    {
        std::string kbytes(RSTRING_PTR(key), (size_t)RSTRING_LEN(key));
        std::string vbytes(RSTRING_PTR(val), (size_t)RSTRING_LEN(val));

        if (db->wal_append(KIND_VALUE, kbytes, vbytes)) {
            Record rec;
            rec.kind = KIND_VALUE;
            rec.value = vbytes;
            db->mem[kbytes] = rec;
            db->mem_bytes += kbytes.size() + vbytes.size();
            ok = db->maybe_flush();
        } else {
            db->fail("WAL append");
        }
    } /* kbytes, vbytes freed before any raise */

    if (!ok) rb_raise(eError, "lithos: %s", db->errbuf);
    return self; /* chainable; `db[k]=v` still evaluates to v per Ruby semantics */
}
|
|
896
|
+
|
|
897
|
+
/* Lithos::DB#get(key): return the value as a binary String, or nil.
 *
 * The memtable is consulted first: a hit there is final, and a tombstone
 * yields nil. Otherwise the SSTables are probed from the back of `ssts` to
 * the front; SSTable::get() returning 1 means "found" and 2 means
 * "tombstone", and either ends the search. */
static VALUE db_get(VALUE self, VALUE key) {
    DB *db = get_db(self);
    std::string needle = to_bytes(key);

    Memtable::const_iterator hit = db->mem.find(needle);
    if (hit != db->mem.end()) {
        if (hit->second.kind == KIND_VALUE) return from_bytes(hit->second.value);
        return Qnil;
    }

    std::string found;
    size_t i = db->ssts.size();
    while (i > 0) {
        --i;
        int rc = db->ssts[i]->get(needle, &found);
        if (rc == 1) return from_bytes(found);
        if (rc == 2) return Qnil;   /* tombstone shadows older */
    }
    return Qnil;
}
|
|
910
|
+
|
|
911
|
+
/* Lithos::DB#key?(key): true when the key currently has a live value.
 *
 * Same lookup order as #get: the memtable decides if it holds the key (a
 * tombstone means false); otherwise SSTables are probed from the back of
 * `ssts` to the front, where get() == 1 is a value and get() == 2 is a
 * tombstone. */
static VALUE db_key_p(VALUE self, VALUE key) {
    DB *db = get_db(self);
    std::string needle = to_bytes(key);

    Memtable::const_iterator hit = db->mem.find(needle);
    if (hit != db->mem.end()) {
        return (hit->second.kind == KIND_VALUE) ? Qtrue : Qfalse;
    }

    std::string scratch;   /* get() wants somewhere to put the value */
    size_t i = db->ssts.size();
    while (i > 0) {
        --i;
        int rc = db->ssts[i]->get(needle, &scratch);
        if (rc == 1) return Qtrue;
        if (rc == 2) return Qfalse;
    }
    return Qfalse;
}
|
|
924
|
+
|
|
925
|
+
/* Lithos::DB#_delete(key): record a tombstone for `key`.
 *
 * Whether the key was live is determined up front through db_key_p(). The
 * tombstone is then appended to the WAL, stored in the memtable, and
 * maybe_flush() runs. The key copy lives in an inner scope so it is gone
 * before a failure is raised as Lithos::Error (text from db->errbuf).
 * Returns true if the key had a live value beforehand, false otherwise. */
static VALUE db_delete(VALUE self, VALUE key) {
    DB *db = get_db(self);
    bool was_live = RTEST(db_key_p(self, key));

    bool ok = false;
    {
        std::string kbytes = to_bytes(key);
        if (db->wal_append(KIND_TOMB, kbytes, std::string())) {
            Record rec;
            rec.kind = KIND_TOMB;
            db->mem[kbytes] = rec;
            db->mem_bytes += kbytes.size();
            ok = db->maybe_flush();
        } else {
            db->fail("WAL append");
        }
    } /* kbytes freed before any raise */

    if (!ok) rb_raise(eError, "lithos: %s", db->errbuf);
    return was_live ? Qtrue : Qfalse;
}
|
|
943
|
+
|
|
944
|
+
/* _each_range(lower|nil, lower_incl_bool, upper|nil, upper_incl_bool) { |k,v| } */
|
|
945
|
+
/* Heap-allocated state for one db_each_range() call: the DB being scanned,
   the merge sources built over it, copies of the lower/upper bound bytes
   (lo, hi), and two strings (ek, ev) handed to merge_yield() -- presumably
   scratch buffers for the current key/value; confirm in merge_yield. */
struct MergeRun { DB *db; std::vector<Source> src; std::string lo, hi; std::string ek, ev; };
|
|
946
|
+
|
|
947
|
+
/* Lithos::DB#_each_range(lower|nil, lower_incl, upper|nil, upper_incl) { |k,v| }
 *
 * Yields the entries between the two bounds through merge_yield(). A nil
 * bound means "unbounded" on that side; the *_incl flags are interpreted by
 * truthiness. Raises LocalJumpError without a block, TypeError for a bound
 * that is not String-like, and Lithos::Error if the scan state cannot be
 * built. Returns self.
 *
 * Ordering matters here:
 *  - Every Ruby call that can raise (StringValue on the bounds) runs BEFORE
 *    the MergeRun is allocated and OUTSIDE the try block, so a TypeError can
 *    neither leak the MergeRun nor longjmp across a C++ try/catch.
 *  - The out-of-memory error is raised after the catch handler has finished,
 *    from a frame that holds no live C++ objects.
 *  - merge_yield() returns a Ruby tag for a block break/exception; the
 *    MergeRun is deleted first and the tag is re-raised afterwards. */
static VALUE db_each_range(VALUE self, VALUE vlo, VALUE vloi, VALUE vhi, VALUE vhii) {
    DB *db = get_db(self);
    if (!rb_block_given_p())
        rb_raise(rb_eLocalJumpError, "lithos: no block given");

    /* Ruby-side conversions first: nothing is allocated yet, so raising is safe. */
    const bool have_lo = !NIL_P(vlo);
    const bool have_hi = !NIL_P(vhi);
    if (have_lo) StringValue(vlo);
    if (have_hi) StringValue(vhi);
    const bool lo_incl = RTEST(vloi);
    const bool hi_incl = RTEST(vhii);

    /* Only C++ work below (no Ruby call that can raise) until the state is built. */
    MergeRun *m = NULL;
    bool built = false;
    try {
        m = new MergeRun();
        m->db = db;
        const std::string *lo = NULL;
        const std::string *hi = NULL;
        if (have_lo) { m->lo.assign(RSTRING_PTR(vlo), (size_t)RSTRING_LEN(vlo)); lo = &m->lo; }
        if (have_hi) { m->hi.assign(RSTRING_PTR(vhi), (size_t)RSTRING_LEN(vhi)); hi = &m->hi; }
        db->build_sources(m->src, lo, lo_incl, hi, hi_incl);
        built = true;
    } catch (...) {
        built = false;
    }
    if (!built) {
        delete m;   /* NULL if the allocation itself failed */
        rb_raise(eError, "lithos: out of memory building scan");
    }

    /* Run the merge. merge_yield catches block exceptions/breaks via rb_protect
       and returns a Ruby tag; NO longjmp crosses a C++ frame. We free our state,
       then re-raise the tag from this clean C frame. (Mutation-during-iteration
       is forbidden in the Ruby layer, which keeps that guard a pure-Ruby raise.) */
    int state = merge_yield(m->src, m->ek, m->ev);
    delete m;
    if (state) rb_jump_tag(state);   /* propagate the block's break/exception cleanly */
    return self;
}
|
|
975
|
+
|
|
976
|
+
/* Lithos::DB#_flush: run DB::flush() on an open store. Raises Lithos::Error
   with the text from db->errbuf if it reports failure; returns self. */
static VALUE db_flush(VALUE self) {
    DB *db = get_db(self);
    const bool ok = db->flush();
    if (!ok) {
        rb_raise(eError, "lithos: %s", db->errbuf);
    }
    return self;
}
|
|
981
|
+
/* Lithos::DB#_compact: run DB::compact() on an open store. Raises
   Lithos::Error with the text from db->errbuf if it reports failure;
   returns self. */
static VALUE db_compact(VALUE self) {
    DB *db = get_db(self);
    const bool ok = db->compact();
    if (!ok) {
        rb_raise(eError, "lithos: %s", db->errbuf);
    }
    return self;
}
|
|
986
|
+
|
|
987
|
+
/* Lithos::DB#_close: flush and release an open store. Calling it on a store
   that is already closed (or has no DB attached) does nothing. The flush
   result is deliberately ignored. Always returns nil. */
static VALUE db_close(VALUE self) {
    DBWrap *wrap;
    TypedData_Get_Struct(self, DBWrap, &db_type, wrap);

    DB *db = wrap ? wrap->db : NULL;
    if (!db || db->closed) return Qnil;

    (void)db->flush();   /* best-effort clean shutdown; data is already WAL-durable */
    db->release();
    return Qnil;
}
|
|
996
|
+
/* Lithos::DB#closed?: false only when a DB is attached and its `closed` flag
   is clear; true in every other case. */
static VALUE db_closed_p(VALUE self) {
    DBWrap *wrap;
    TypedData_Get_Struct(self, DBWrap, &db_type, wrap);

    const bool is_open = wrap && wrap->db && !wrap->db->closed;
    return is_open ? Qfalse : Qtrue;
}
|
|
1000
|
+
/* Lithos::DB#path: the directory given to #initialize, returned as a new
   UTF-8 String (the path is text, unlike the binary keys and values). */
static VALUE db_path(VALUE self) {
    DBWrap *wrap;
    TypedData_Get_Struct(self, DBWrap, &db_type, wrap);

    const std::string &utf8 = wrap->db->dir_utf8;
    return rb_utf8_str_new(utf8.data(), (long)utf8.size());
}
|
|
1005
|
+
|
|
1006
|
+
/* Extension entry point: defines module Lithos, class Lithos::DB and
   Lithos::Error (a StandardError), installs the allocator, and registers the
   native methods on Lithos::DB. Names starting with an underscore are the
   primitives that the Ruby layer wraps. */
extern "C" void Init_lithos(void) {
/* every native method is registered on cDB with the same call shape */
#define LITHOS_DB_METHOD(rb_name, c_func, arity) \
    rb_define_method(cDB, rb_name, RUBY_METHOD_FUNC(c_func), arity)

    mLithos = rb_define_module("Lithos");
    cDB = rb_define_class_under(mLithos, "DB", rb_cObject);
    eError = rb_define_class_under(mLithos, "Error", rb_eStandardError);

    rb_define_alloc_func(cDB, db_alloc);
    LITHOS_DB_METHOD("initialize", db_initialize, -1);

    /* Read-only methods are safe during iteration -> exposed directly. */
    LITHOS_DB_METHOD("get", db_get, 1);
    LITHOS_DB_METHOD("key?", db_key_p, 1);
    LITHOS_DB_METHOD("_each_range", db_each_range, 4);
    LITHOS_DB_METHOD("closed?", db_closed_p, 0);
    LITHOS_DB_METHOD("path", db_path, 0);

    /* Mutators are primitives wrapped by the Ruby layer, which forbids calling
       them during iteration (a pure-Ruby raise — never a longjmp through C++). */
    LITHOS_DB_METHOD("_put", db_put, 2);
    LITHOS_DB_METHOD("_delete", db_delete, 1);
    LITHOS_DB_METHOD("_flush", db_flush, 0);
    LITHOS_DB_METHOD("_compact", db_compact, 0);
    LITHOS_DB_METHOD("_close", db_close, 0);

#undef LITHOS_DB_METHOD
}
|