mdbxmou 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +32 -0
- package/.github/workflows/publish.yml +27 -0
- package/.gitmodules +3 -0
- package/CMakeLists.txt +53 -0
- package/LICENSE +201 -0
- package/README.md +639 -0
- package/build.js +11 -0
- package/deps/libmdbx/.clang-format +3 -0
- package/deps/libmdbx/.cmake-format.yaml +3 -0
- package/deps/libmdbx/.le.ini +40 -0
- package/deps/libmdbx/CMakeLists.txt +1269 -0
- package/deps/libmdbx/COPYRIGHT +159 -0
- package/deps/libmdbx/ChangeLog.md +2786 -0
- package/deps/libmdbx/GNUmakefile +950 -0
- package/deps/libmdbx/LICENSE +177 -0
- package/deps/libmdbx/Makefile +16 -0
- package/deps/libmdbx/NOTICE +39 -0
- package/deps/libmdbx/README.md +863 -0
- package/deps/libmdbx/TODO.md +43 -0
- package/deps/libmdbx/cmake/compiler.cmake +1221 -0
- package/deps/libmdbx/cmake/profile.cmake +58 -0
- package/deps/libmdbx/cmake/utils.cmake +524 -0
- package/deps/libmdbx/conanfile.py +323 -0
- package/deps/libmdbx/docs/Doxyfile.in +2734 -0
- package/deps/libmdbx/docs/_preface.md +47 -0
- package/deps/libmdbx/docs/_restrictions.md +248 -0
- package/deps/libmdbx/docs/_starting.md +245 -0
- package/deps/libmdbx/docs/_toc.md +34 -0
- package/deps/libmdbx/docs/header.html +96 -0
- package/deps/libmdbx/example/CMakeLists.txt +6 -0
- package/deps/libmdbx/example/README.md +1 -0
- package/deps/libmdbx/example/example-mdbx.c +154 -0
- package/deps/libmdbx/example/sample-bdb.txt +77 -0
- package/deps/libmdbx/mdbx.h +6655 -0
- package/deps/libmdbx/mdbx.h++ +6428 -0
- package/deps/libmdbx/packages/buildroot/0001-package-libmdbx-new-package-library-database.patch +173 -0
- package/deps/libmdbx/src/alloy.c +54 -0
- package/deps/libmdbx/src/api-cold.c +543 -0
- package/deps/libmdbx/src/api-copy.c +912 -0
- package/deps/libmdbx/src/api-cursor.c +754 -0
- package/deps/libmdbx/src/api-dbi.c +315 -0
- package/deps/libmdbx/src/api-env.c +1434 -0
- package/deps/libmdbx/src/api-extra.c +165 -0
- package/deps/libmdbx/src/api-key-transform.c +197 -0
- package/deps/libmdbx/src/api-misc.c +286 -0
- package/deps/libmdbx/src/api-opts.c +575 -0
- package/deps/libmdbx/src/api-range-estimate.c +365 -0
- package/deps/libmdbx/src/api-txn-data.c +454 -0
- package/deps/libmdbx/src/api-txn.c +921 -0
- package/deps/libmdbx/src/atomics-ops.h +364 -0
- package/deps/libmdbx/src/atomics-types.h +97 -0
- package/deps/libmdbx/src/audit.c +109 -0
- package/deps/libmdbx/src/bits.md +34 -0
- package/deps/libmdbx/src/chk.c +1796 -0
- package/deps/libmdbx/src/cogs.c +309 -0
- package/deps/libmdbx/src/cogs.h +506 -0
- package/deps/libmdbx/src/coherency.c +170 -0
- package/deps/libmdbx/src/config.h.in +88 -0
- package/deps/libmdbx/src/cursor.c +2396 -0
- package/deps/libmdbx/src/cursor.h +391 -0
- package/deps/libmdbx/src/dbi.c +717 -0
- package/deps/libmdbx/src/dbi.h +142 -0
- package/deps/libmdbx/src/debug_begin.h +36 -0
- package/deps/libmdbx/src/debug_end.h +15 -0
- package/deps/libmdbx/src/dpl.c +486 -0
- package/deps/libmdbx/src/dpl.h +134 -0
- package/deps/libmdbx/src/dxb.c +1335 -0
- package/deps/libmdbx/src/env.c +607 -0
- package/deps/libmdbx/src/essentials.h +125 -0
- package/deps/libmdbx/src/gc-get.c +1345 -0
- package/deps/libmdbx/src/gc-put.c +970 -0
- package/deps/libmdbx/src/gc.h +40 -0
- package/deps/libmdbx/src/global.c +474 -0
- package/deps/libmdbx/src/internals.h +585 -0
- package/deps/libmdbx/src/layout-dxb.h +288 -0
- package/deps/libmdbx/src/layout-lck.h +289 -0
- package/deps/libmdbx/src/lck-posix.c +859 -0
- package/deps/libmdbx/src/lck-windows.c +607 -0
- package/deps/libmdbx/src/lck.c +174 -0
- package/deps/libmdbx/src/lck.h +110 -0
- package/deps/libmdbx/src/logging_and_debug.c +250 -0
- package/deps/libmdbx/src/logging_and_debug.h +159 -0
- package/deps/libmdbx/src/man1/mdbx_chk.1 +106 -0
- package/deps/libmdbx/src/man1/mdbx_copy.1 +95 -0
- package/deps/libmdbx/src/man1/mdbx_drop.1 +48 -0
- package/deps/libmdbx/src/man1/mdbx_dump.1 +101 -0
- package/deps/libmdbx/src/man1/mdbx_load.1 +105 -0
- package/deps/libmdbx/src/man1/mdbx_stat.1 +86 -0
- package/deps/libmdbx/src/mdbx.c++ +1837 -0
- package/deps/libmdbx/src/meta.c +656 -0
- package/deps/libmdbx/src/meta.h +168 -0
- package/deps/libmdbx/src/mvcc-readers.c +414 -0
- package/deps/libmdbx/src/node.c +365 -0
- package/deps/libmdbx/src/node.h +102 -0
- package/deps/libmdbx/src/ntdll.def +1246 -0
- package/deps/libmdbx/src/options.h +534 -0
- package/deps/libmdbx/src/osal.c +3485 -0
- package/deps/libmdbx/src/osal.h +587 -0
- package/deps/libmdbx/src/page-get.c +483 -0
- package/deps/libmdbx/src/page-iov.c +185 -0
- package/deps/libmdbx/src/page-iov.h +34 -0
- package/deps/libmdbx/src/page-ops.c +744 -0
- package/deps/libmdbx/src/page-ops.h +142 -0
- package/deps/libmdbx/src/pnl.c +236 -0
- package/deps/libmdbx/src/pnl.h +146 -0
- package/deps/libmdbx/src/preface.h +990 -0
- package/deps/libmdbx/src/proto.h +105 -0
- package/deps/libmdbx/src/refund.c +212 -0
- package/deps/libmdbx/src/sort.h +484 -0
- package/deps/libmdbx/src/spill.c +431 -0
- package/deps/libmdbx/src/spill.h +74 -0
- package/deps/libmdbx/src/table.c +107 -0
- package/deps/libmdbx/src/tls.c +551 -0
- package/deps/libmdbx/src/tls.h +43 -0
- package/deps/libmdbx/src/tools/chk.c +673 -0
- package/deps/libmdbx/src/tools/copy.c +166 -0
- package/deps/libmdbx/src/tools/drop.c +199 -0
- package/deps/libmdbx/src/tools/dump.c +515 -0
- package/deps/libmdbx/src/tools/load.c +831 -0
- package/deps/libmdbx/src/tools/stat.c +516 -0
- package/deps/libmdbx/src/tools/wingetopt.c +87 -0
- package/deps/libmdbx/src/tools/wingetopt.h +30 -0
- package/deps/libmdbx/src/tree-ops.c +1554 -0
- package/deps/libmdbx/src/tree-search.c +140 -0
- package/deps/libmdbx/src/txl.c +99 -0
- package/deps/libmdbx/src/txl.h +26 -0
- package/deps/libmdbx/src/txn.c +1083 -0
- package/deps/libmdbx/src/unaligned.h +205 -0
- package/deps/libmdbx/src/utils.c +32 -0
- package/deps/libmdbx/src/utils.h +76 -0
- package/deps/libmdbx/src/version.c.in +44 -0
- package/deps/libmdbx/src/walk.c +290 -0
- package/deps/libmdbx/src/walk.h +20 -0
- package/deps/libmdbx/src/windows-import.c +152 -0
- package/deps/libmdbx/src/windows-import.h +128 -0
- package/deps/libmdbx/test/CMakeLists.txt +317 -0
- package/deps/libmdbx/test/append.c++ +237 -0
- package/deps/libmdbx/test/base.h++ +92 -0
- package/deps/libmdbx/test/battery-tmux.sh +64 -0
- package/deps/libmdbx/test/cases.c++ +118 -0
- package/deps/libmdbx/test/chrono.c++ +134 -0
- package/deps/libmdbx/test/chrono.h++ +85 -0
- package/deps/libmdbx/test/config.c++ +643 -0
- package/deps/libmdbx/test/config.h++ +334 -0
- package/deps/libmdbx/test/copy.c++ +62 -0
- package/deps/libmdbx/test/dead.c++ +39 -0
- package/deps/libmdbx/test/dump-load.sh +40 -0
- package/deps/libmdbx/test/extra/crunched_delete.c++ +409 -0
- package/deps/libmdbx/test/extra/cursor_closing.c++ +410 -0
- package/deps/libmdbx/test/extra/dbi.c++ +229 -0
- package/deps/libmdbx/test/extra/doubtless_positioning.c++ +253 -0
- package/deps/libmdbx/test/extra/dupfix_addodd.c +94 -0
- package/deps/libmdbx/test/extra/dupfix_multiple.c++ +311 -0
- package/deps/libmdbx/test/extra/early_close_dbi.c++ +137 -0
- package/deps/libmdbx/test/extra/hex_base64_base58.c++ +118 -0
- package/deps/libmdbx/test/extra/maindb_ordinal.c++ +61 -0
- package/deps/libmdbx/test/extra/open.c++ +96 -0
- package/deps/libmdbx/test/extra/pcrf/README.md +2 -0
- package/deps/libmdbx/test/extra/pcrf/pcrf_test.c +380 -0
- package/deps/libmdbx/test/extra/probe.c++ +10 -0
- package/deps/libmdbx/test/extra/txn.c++ +407 -0
- package/deps/libmdbx/test/extra/upsert_alldups.c +193 -0
- package/deps/libmdbx/test/fork.c++ +263 -0
- package/deps/libmdbx/test/hill.c++ +447 -0
- package/deps/libmdbx/test/jitter.c++ +197 -0
- package/deps/libmdbx/test/keygen.c++ +393 -0
- package/deps/libmdbx/test/keygen.h++ +130 -0
- package/deps/libmdbx/test/log.c++ +358 -0
- package/deps/libmdbx/test/log.h++ +91 -0
- package/deps/libmdbx/test/main.c++ +706 -0
- package/deps/libmdbx/test/nested.c++ +318 -0
- package/deps/libmdbx/test/osal-unix.c++ +647 -0
- package/deps/libmdbx/test/osal-windows.c++ +440 -0
- package/deps/libmdbx/test/osal.h++ +41 -0
- package/deps/libmdbx/test/stochastic.sh +690 -0
- package/deps/libmdbx/test/stub/LICENSE +24 -0
- package/deps/libmdbx/test/stub/README.md +8 -0
- package/deps/libmdbx/test/stub/pthread_barrier.c +104 -0
- package/deps/libmdbx/test/stub/pthread_barrier.h +77 -0
- package/deps/libmdbx/test/test.c++ +1551 -0
- package/deps/libmdbx/test/test.h++ +298 -0
- package/deps/libmdbx/test/tmux.conf +3 -0
- package/deps/libmdbx/test/try.c++ +30 -0
- package/deps/libmdbx/test/ttl.c++ +240 -0
- package/deps/libmdbx/test/utils.c++ +203 -0
- package/deps/libmdbx/test/utils.h++ +326 -0
- package/deps/libmdbx/test/valgrind_suppress.txt +536 -0
- package/lib/mdbx_evn_async.js +211 -0
- package/lib/mdbx_worker.js +195 -0
- package/lib/nativemou.js +6 -0
- package/package.json +38 -0
- package/src/async/envmou_close.cpp +34 -0
- package/src/async/envmou_close.hpp +32 -0
- package/src/async/envmou_copy_to.cpp +29 -0
- package/src/async/envmou_copy_to.hpp +38 -0
- package/src/async/envmou_keys.cpp +201 -0
- package/src/async/envmou_keys.hpp +50 -0
- package/src/async/envmou_open.cpp +38 -0
- package/src/async/envmou_open.hpp +33 -0
- package/src/async/envmou_query.cpp +167 -0
- package/src/async/envmou_query.hpp +53 -0
- package/src/dbimou.cpp +522 -0
- package/src/dbimou.hpp +82 -0
- package/src/env_arg0.hpp +24 -0
- package/src/envmou.cpp +445 -0
- package/src/envmou.hpp +116 -0
- package/src/modulemou.cpp +113 -0
- package/src/querymou.cpp +177 -0
- package/src/querymou.hpp +93 -0
- package/src/txnmou.cpp +254 -0
- package/src/txnmou.hpp +122 -0
- package/src/typemou.hpp +239 -0
- package/src/valuemou.hpp +194 -0
- package/test/async.js +67 -0
- package/test/e3.js +38 -0
- package/test/e4.js +89 -0
- package/test/e5.js +162 -0
- package/test/test-batch-ops.js +243 -0
- package/test/test-cursor-mode.js +84 -0
- package/test/test-multi-mode.js +87 -0
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
/// \copyright SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
|
3
|
+
|
|
4
|
+
#include "internals.h"
|
|
5
|
+
|
|
6
|
+
/* Logs a page-corruption diagnostic and yields MDBX_CORRUPTED.
 *
 * mp   - the page the diagnostic is about;
 * fmt  - printf-style format of the detail message, followed by its arguments.
 *
 * Nothing is logged unless the MDBX_LOG_ERROR level is enabled. A header line
 * naming the page (type caption, pgno, mod-txnid) is emitted only when `mp`
 * differs from the page of the previous call; the detail message is emitted
 * on every call. The return value is always MDBX_CORRUPTED, so callers can
 * write `rc = bad_page(...)` or `return bad_page(...)`. */
__cold int MDBX_PRINTF_ARGS(2, 3) bad_page(const page_t *mp, const char *fmt, ...) {
  if (!(LOG_ENABLED(MDBX_LOG_ERROR)))
    return MDBX_CORRUPTED;

  /* The page most recently reported, kept across calls to suppress a repeated header. */
  static const page_t *last_reported;
  if (last_reported != mp) {
    char caption_buf[16];
    last_reported = mp;
    debug_log(MDBX_LOG_ERROR, "badpage", 0, "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n",
              pagetype_caption(page_type(mp), caption_buf), mp->pgno, mp->txnid);
  }

  va_list ap;
  va_start(ap, fmt);
  debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, ap);
  va_end(ap);
  return MDBX_CORRUPTED;
}
|
|
23
|
+
|
|
24
|
+
/* Logs a notice about a valid but suboptimal page.
 *
 * mp   - the page the notice is about;
 * fmt  - printf-style format of the detail message, followed by its arguments.
 *
 * Nothing is logged unless the MDBX_LOG_NOTICE level is enabled. A header
 * line naming the page is emitted only when `mp` differs from the page of the
 * previous call; the detail message is emitted on every call. */
__cold void MDBX_PRINTF_ARGS(2, 3) poor_page(const page_t *mp, const char *fmt, ...) {
  if (!(LOG_ENABLED(MDBX_LOG_NOTICE)))
    return;

  /* The page most recently reported, kept across calls to suppress a repeated header. */
  static const page_t *last_reported;
  if (last_reported != mp) {
    char caption_buf[16];
    last_reported = mp;
    debug_log(MDBX_LOG_NOTICE, "poorpage", 0, "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n",
              pagetype_caption(page_type(mp), caption_buf), mp->pgno, mp->txnid);
  }

  va_list ap;
  va_start(ap, fmt);
  debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, ap);
  va_end(ap);
}
|
|
40
|
+
|
|
41
|
+
MDBX_CONST_FUNCTION static clc_t value_clc(const MDBX_cursor *mc) {
|
|
42
|
+
if (likely((mc->flags & z_inner) == 0))
|
|
43
|
+
return mc->clc->v;
|
|
44
|
+
else {
|
|
45
|
+
clc_t stub = {.cmp = cmp_equal_or_wrong, .lmin = 0, .lmax = 0};
|
|
46
|
+
return stub;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
__cold int page_check(const MDBX_cursor *const mc, const page_t *const mp) {
|
|
51
|
+
DKBUF;
|
|
52
|
+
int rc = MDBX_SUCCESS;
|
|
53
|
+
if (unlikely(mp->pgno < MIN_PAGENO || mp->pgno > MAX_PAGENO))
|
|
54
|
+
rc = bad_page(mp, "invalid pgno (%u)\n", mp->pgno);
|
|
55
|
+
|
|
56
|
+
MDBX_env *const env = mc->txn->env;
|
|
57
|
+
const ptrdiff_t offset = ptr_dist(mp, env->dxb_mmap.base);
|
|
58
|
+
unsigned flags_mask = P_ILL_BITS;
|
|
59
|
+
unsigned flags_expected = 0;
|
|
60
|
+
if (offset < 0 || offset > (ptrdiff_t)(pgno2bytes(env, mc->txn->geo.first_unallocated) -
|
|
61
|
+
((mp->flags & P_SUBP) ? PAGEHDRSZ + 1 : env->ps))) {
|
|
62
|
+
/* should be dirty page without MDBX_WRITEMAP, or a subpage of. */
|
|
63
|
+
flags_mask -= P_SUBP;
|
|
64
|
+
if ((env->flags & MDBX_WRITEMAP) != 0 || (!is_shadowed(mc->txn, mp) && !(mp->flags & P_SUBP)))
|
|
65
|
+
rc = bad_page(mp, "invalid page-address %p, offset %zi\n", __Wpedantic_format_voidptr(mp), offset);
|
|
66
|
+
} else if (offset & (env->ps - 1))
|
|
67
|
+
flags_expected = P_SUBP;
|
|
68
|
+
|
|
69
|
+
if (unlikely((mp->flags & flags_mask) != flags_expected))
|
|
70
|
+
rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", mp->flags & flags_mask, flags_expected);
|
|
71
|
+
|
|
72
|
+
cASSERT(mc, (mc->checking & z_dupfix) == 0 || (mc->flags & z_inner) != 0);
|
|
73
|
+
const uint8_t type = page_type(mp);
|
|
74
|
+
switch (type) {
|
|
75
|
+
default:
|
|
76
|
+
return bad_page(mp, "invalid type (%u)\n", type);
|
|
77
|
+
case P_LARGE:
|
|
78
|
+
if (unlikely(mc->flags & z_inner))
|
|
79
|
+
rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", "nested dupsort tree", mc->tree->flags);
|
|
80
|
+
const pgno_t npages = mp->pages;
|
|
81
|
+
if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2))
|
|
82
|
+
rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages);
|
|
83
|
+
if (unlikely(mp->pgno + npages > mc->txn->geo.first_unallocated))
|
|
84
|
+
rc = bad_page(mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", mp->pgno + npages,
|
|
85
|
+
mc->txn->geo.first_unallocated);
|
|
86
|
+
return rc; //-------------------------- end of large/overflow page handling
|
|
87
|
+
case P_LEAF | P_SUBP:
|
|
88
|
+
if (unlikely(mc->tree->height != 1))
|
|
89
|
+
rc =
|
|
90
|
+
bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "leaf-sub", "nested dupsort db", mc->tree->flags);
|
|
91
|
+
/* fall through */
|
|
92
|
+
__fallthrough;
|
|
93
|
+
case P_LEAF:
|
|
94
|
+
if (unlikely((mc->checking & z_dupfix) != 0))
|
|
95
|
+
rc = bad_page(mp, "unexpected leaf-page for dupfix subtree (db-lags 0x%x)\n", mc->tree->flags);
|
|
96
|
+
break;
|
|
97
|
+
case P_LEAF | P_DUPFIX | P_SUBP:
|
|
98
|
+
if (unlikely(mc->tree->height != 1))
|
|
99
|
+
rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "leaf2-sub", "nested dupsort db",
|
|
100
|
+
mc->tree->flags);
|
|
101
|
+
/* fall through */
|
|
102
|
+
__fallthrough;
|
|
103
|
+
case P_LEAF | P_DUPFIX:
|
|
104
|
+
if (unlikely((mc->checking & z_dupfix) == 0))
|
|
105
|
+
rc = bad_page(mp, "unexpected leaf2-page for non-dupfix (sub)tree (db-flags 0x%x)\n", mc->tree->flags);
|
|
106
|
+
break;
|
|
107
|
+
case P_BRANCH:
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if (unlikely(mp->upper < mp->lower || (mp->lower & 1) || PAGEHDRSZ + mp->upper > env->ps))
|
|
112
|
+
rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->lower, mp->upper, page_space(env));
|
|
113
|
+
|
|
114
|
+
const char *const end_of_page = ptr_disp(mp, env->ps);
|
|
115
|
+
const size_t nkeys = page_numkeys(mp);
|
|
116
|
+
STATIC_ASSERT(P_BRANCH == 1);
|
|
117
|
+
if (unlikely(nkeys <= (uint8_t)(mp->flags & P_BRANCH))) {
|
|
118
|
+
if ((!(mc->flags & z_inner) || mc->tree->items) &&
|
|
119
|
+
(!(mc->checking & z_updating) || !(is_modifable(mc->txn, mp) || (mp->flags & P_SUBP))))
|
|
120
|
+
rc = bad_page(mp, "%s-page nkeys (%zu) < %u\n", is_branch(mp) ? "branch" : "leaf", nkeys, 1 + is_branch(mp));
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
const size_t ksize_max = keysize_max(env->ps, 0);
|
|
124
|
+
const size_t leaf2_ksize = mp->dupfix_ksize;
|
|
125
|
+
if (is_dupfix_leaf(mp)) {
|
|
126
|
+
if (unlikely((mc->flags & z_inner) == 0 || (mc->tree->flags & MDBX_DUPFIXED) == 0))
|
|
127
|
+
rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", mc->tree->flags);
|
|
128
|
+
else if (unlikely(leaf2_ksize != mc->tree->dupfix_size))
|
|
129
|
+
rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize);
|
|
130
|
+
else if (unlikely(((leaf2_ksize & nkeys) ^ mp->upper) & 1))
|
|
131
|
+
rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n", mp->upper, nkeys, leaf2_ksize);
|
|
132
|
+
} else {
|
|
133
|
+
if (unlikely((mp->upper & 1) || PAGEHDRSZ + mp->upper + nkeys * sizeof(node_t) + nkeys - 1 > env->ps))
|
|
134
|
+
rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", mp->upper, nkeys, page_space(env));
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
MDBX_val here, prev = {0, 0};
|
|
138
|
+
clc_t v_clc = value_clc(mc);
|
|
139
|
+
for (size_t i = 0; i < nkeys; ++i) {
|
|
140
|
+
if (is_dupfix_leaf(mp)) {
|
|
141
|
+
const char *const key = page_dupfix_ptr(mp, i, mc->tree->dupfix_size);
|
|
142
|
+
if (unlikely(end_of_page < key + leaf2_ksize)) {
|
|
143
|
+
rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n", key + leaf2_ksize - end_of_page);
|
|
144
|
+
continue;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
if (unlikely(leaf2_ksize != mc->clc->k.lmin)) {
|
|
148
|
+
if (unlikely(leaf2_ksize < mc->clc->k.lmin || leaf2_ksize > mc->clc->k.lmax))
|
|
149
|
+
rc = bad_page(mp, "leaf2-item size (%zu) <> min/max length (%zu/%zu)\n", leaf2_ksize, mc->clc->k.lmin,
|
|
150
|
+
mc->clc->k.lmax);
|
|
151
|
+
else
|
|
152
|
+
mc->clc->k.lmin = mc->clc->k.lmax = leaf2_ksize;
|
|
153
|
+
}
|
|
154
|
+
if ((mc->checking & z_ignord) == 0) {
|
|
155
|
+
here.iov_base = (void *)key;
|
|
156
|
+
here.iov_len = leaf2_ksize;
|
|
157
|
+
if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
|
|
158
|
+
rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here));
|
|
159
|
+
prev = here;
|
|
160
|
+
}
|
|
161
|
+
} else {
|
|
162
|
+
const node_t *const node = page_node(mp, i);
|
|
163
|
+
const char *const node_end = ptr_disp(node, NODESIZE);
|
|
164
|
+
if (unlikely(node_end > end_of_page)) {
|
|
165
|
+
rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, node_end - end_of_page);
|
|
166
|
+
continue;
|
|
167
|
+
}
|
|
168
|
+
const size_t ksize = node_ks(node);
|
|
169
|
+
if (unlikely(ksize > ksize_max))
|
|
170
|
+
rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize);
|
|
171
|
+
const char *const key = node_key(node);
|
|
172
|
+
if (unlikely(end_of_page < key + ksize)) {
|
|
173
|
+
rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, key + ksize - end_of_page);
|
|
174
|
+
continue;
|
|
175
|
+
}
|
|
176
|
+
if ((is_leaf(mp) || i > 0)) {
|
|
177
|
+
if (unlikely(ksize < mc->clc->k.lmin || ksize > mc->clc->k.lmax))
|
|
178
|
+
rc = bad_page(mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->clc->k.lmin,
|
|
179
|
+
mc->clc->k.lmax);
|
|
180
|
+
if ((mc->checking & z_ignord) == 0) {
|
|
181
|
+
here.iov_base = (void *)key;
|
|
182
|
+
here.iov_len = ksize;
|
|
183
|
+
if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
|
|
184
|
+
rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here));
|
|
185
|
+
prev = here;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (is_branch(mp)) {
|
|
189
|
+
if ((mc->checking & z_updating) == 0 && i == 0 && unlikely(ksize != 0))
|
|
190
|
+
rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n", i, ksize);
|
|
191
|
+
const pgno_t ref = node_pgno(node);
|
|
192
|
+
if (unlikely(ref < MIN_PAGENO) || (unlikely(ref >= mc->txn->geo.first_unallocated) &&
|
|
193
|
+
(unlikely(ref >= mc->txn->geo.now) || !(mc->checking & z_retiring))))
|
|
194
|
+
rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref);
|
|
195
|
+
if (unlikely(node_flags(node)))
|
|
196
|
+
rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i, node_flags(node));
|
|
197
|
+
continue;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
switch (node_flags(node)) {
|
|
201
|
+
default:
|
|
202
|
+
rc = bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node));
|
|
203
|
+
break;
|
|
204
|
+
case N_BIG /* data on large-page */:
|
|
205
|
+
case 0 /* usual */:
|
|
206
|
+
case N_TREE /* sub-db */:
|
|
207
|
+
case N_TREE | N_DUP /* dupsorted sub-tree */:
|
|
208
|
+
case N_DUP /* short sub-page */:
|
|
209
|
+
break;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
const size_t dsize = node_ds(node);
|
|
213
|
+
const char *const data = node_data(node);
|
|
214
|
+
if (node_flags(node) & N_BIG) {
|
|
215
|
+
if (unlikely(end_of_page < data + sizeof(pgno_t))) {
|
|
216
|
+
rc = bad_page(mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", "bigdata-pgno", i, nkeys, dsize,
|
|
217
|
+
data + dsize - end_of_page);
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
220
|
+
if (unlikely(dsize <= v_clc.lmin || dsize > v_clc.lmax))
|
|
221
|
+
rc = bad_page(mp, "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, v_clc.lmin,
|
|
222
|
+
v_clc.lmax);
|
|
223
|
+
if (unlikely(node_size_len(node_ks(node), dsize) <= mc->txn->env->leaf_nodemax) &&
|
|
224
|
+
mc->tree != &mc->txn->dbs[FREE_DBI])
|
|
225
|
+
poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
|
|
226
|
+
|
|
227
|
+
if ((mc->checking & z_retiring) == 0) {
|
|
228
|
+
const pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid);
|
|
229
|
+
if (unlikely(lp.err != MDBX_SUCCESS))
|
|
230
|
+
return lp.err;
|
|
231
|
+
cASSERT(mc, page_type(lp.page) == P_LARGE);
|
|
232
|
+
const unsigned npages = largechunk_npages(env, dsize);
|
|
233
|
+
if (unlikely(lp.page->pages != npages)) {
|
|
234
|
+
if (lp.page->pages < npages)
|
|
235
|
+
rc = bad_page(lp.page, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->pages, dsize);
|
|
236
|
+
else if (mc->tree != &mc->txn->dbs[FREE_DBI])
|
|
237
|
+
poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", lp.page->pages, dsize);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
continue;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
if (unlikely(end_of_page < data + dsize)) {
|
|
244
|
+
rc = bad_page(mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", "data", i, nkeys, dsize,
|
|
245
|
+
data + dsize - end_of_page);
|
|
246
|
+
continue;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
switch (node_flags(node)) {
|
|
250
|
+
default:
|
|
251
|
+
/* wrong, but already handled */
|
|
252
|
+
continue;
|
|
253
|
+
case 0 /* usual */:
|
|
254
|
+
if (unlikely(dsize < v_clc.lmin || dsize > v_clc.lmax)) {
|
|
255
|
+
rc = bad_page(mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, v_clc.lmin, v_clc.lmax);
|
|
256
|
+
continue;
|
|
257
|
+
}
|
|
258
|
+
break;
|
|
259
|
+
case N_TREE /* sub-db */:
|
|
260
|
+
if (unlikely(dsize != sizeof(tree_t))) {
|
|
261
|
+
rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize);
|
|
262
|
+
continue;
|
|
263
|
+
}
|
|
264
|
+
break;
|
|
265
|
+
case N_TREE | N_DUP /* dupsorted sub-tree */:
|
|
266
|
+
if (unlikely(dsize != sizeof(tree_t))) {
|
|
267
|
+
rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n", dsize, sizeof(tree_t));
|
|
268
|
+
continue;
|
|
269
|
+
}
|
|
270
|
+
break;
|
|
271
|
+
case N_DUP /* short sub-page */:
|
|
272
|
+
if (unlikely(dsize <= PAGEHDRSZ)) {
|
|
273
|
+
rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n", dsize);
|
|
274
|
+
continue;
|
|
275
|
+
} else {
|
|
276
|
+
const page_t *const sp = (page_t *)data;
|
|
277
|
+
switch (sp->flags &
|
|
278
|
+
/* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
|
|
279
|
+
case P_LEAF | P_SUBP:
|
|
280
|
+
case P_LEAF | P_DUPFIX | P_SUBP:
|
|
281
|
+
break;
|
|
282
|
+
default:
|
|
283
|
+
rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n", sp->flags);
|
|
284
|
+
continue;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
const char *const end_of_subpage = data + dsize;
|
|
288
|
+
const intptr_t nsubkeys = page_numkeys(sp);
|
|
289
|
+
if (unlikely(nsubkeys == 0) && !(mc->checking & z_updating) && mc->tree->items)
|
|
290
|
+
rc = bad_page(mp, "no keys on a %s-page\n", is_dupfix_leaf(sp) ? "leaf2-sub" : "leaf-sub");
|
|
291
|
+
|
|
292
|
+
MDBX_val sub_here, sub_prev = {0, 0};
|
|
293
|
+
for (int ii = 0; ii < nsubkeys; ii++) {
|
|
294
|
+
if (is_dupfix_leaf(sp)) {
|
|
295
|
+
/* DUPFIX pages have no entries[] or node headers */
|
|
296
|
+
const size_t sub_ksize = sp->dupfix_ksize;
|
|
297
|
+
const char *const sub_key = page_dupfix_ptr(sp, ii, mc->tree->dupfix_size);
|
|
298
|
+
if (unlikely(end_of_subpage < sub_key + sub_ksize)) {
|
|
299
|
+
rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", sub_key + sub_ksize - end_of_subpage);
|
|
300
|
+
continue;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
if (unlikely(sub_ksize != v_clc.lmin)) {
|
|
304
|
+
if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
|
|
305
|
+
rc = bad_page(mp,
|
|
306
|
+
"nested-leaf2-key size (%zu) <> min/max "
|
|
307
|
+
"value-length (%zu/%zu)\n",
|
|
308
|
+
sub_ksize, v_clc.lmin, v_clc.lmax);
|
|
309
|
+
else
|
|
310
|
+
v_clc.lmin = v_clc.lmax = sub_ksize;
|
|
311
|
+
}
|
|
312
|
+
if ((mc->checking & z_ignord) == 0) {
|
|
313
|
+
sub_here.iov_base = (void *)sub_key;
|
|
314
|
+
sub_here.iov_len = sub_ksize;
|
|
315
|
+
if (sub_prev.iov_base && unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
|
|
316
|
+
rc = bad_page(mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", ii, DKEY(&sub_prev),
|
|
317
|
+
DVAL(&sub_here));
|
|
318
|
+
sub_prev = sub_here;
|
|
319
|
+
}
|
|
320
|
+
} else {
|
|
321
|
+
const node_t *const sub_node = page_node(sp, ii);
|
|
322
|
+
const char *const sub_node_end = ptr_disp(sub_node, NODESIZE);
|
|
323
|
+
if (unlikely(sub_node_end > end_of_subpage)) {
|
|
324
|
+
rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", end_of_subpage - sub_node_end);
|
|
325
|
+
continue;
|
|
326
|
+
}
|
|
327
|
+
if (unlikely(node_flags(sub_node) != 0))
|
|
328
|
+
rc = bad_page(mp, "nested-node invalid flags (%u)\n", node_flags(sub_node));
|
|
329
|
+
|
|
330
|
+
const size_t sub_ksize = node_ks(sub_node);
|
|
331
|
+
const char *const sub_key = node_key(sub_node);
|
|
332
|
+
const size_t sub_dsize = node_ds(sub_node);
|
|
333
|
+
/* char *sub_data = node_data(sub_node); */
|
|
334
|
+
|
|
335
|
+
if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
|
|
336
|
+
rc = bad_page(mp,
|
|
337
|
+
"nested-node-key size (%zu) <> min/max "
|
|
338
|
+
"value-length (%zu/%zu)\n",
|
|
339
|
+
sub_ksize, v_clc.lmin, v_clc.lmax);
|
|
340
|
+
if ((mc->checking & z_ignord) == 0) {
|
|
341
|
+
sub_here.iov_base = (void *)sub_key;
|
|
342
|
+
sub_here.iov_len = sub_ksize;
|
|
343
|
+
if (sub_prev.iov_base && unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
|
|
344
|
+
rc = bad_page(mp, "nested-node-key #%u wrong order (%s >= %s)\n", ii, DKEY(&sub_prev),
|
|
345
|
+
DVAL(&sub_here));
|
|
346
|
+
sub_prev = sub_here;
|
|
347
|
+
}
|
|
348
|
+
if (unlikely(sub_dsize != 0))
|
|
349
|
+
rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", sub_dsize);
|
|
350
|
+
if (unlikely(end_of_subpage < sub_key + sub_ksize))
|
|
351
|
+
rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n", sub_key + sub_ksize - end_of_subpage);
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
break;
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
return rc;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
static __always_inline int check_page_header(const uint16_t ILL, const page_t *page, MDBX_txn *const txn,
|
|
363
|
+
const txnid_t front) {
|
|
364
|
+
if (unlikely(page->flags & ILL)) {
|
|
365
|
+
if (ILL == P_ILL_BITS || (page->flags & P_ILL_BITS))
|
|
366
|
+
return bad_page(page, "invalid page's flags (%u)\n", page->flags);
|
|
367
|
+
else if (ILL & P_LARGE) {
|
|
368
|
+
assert((ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0);
|
|
369
|
+
assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
|
|
370
|
+
return bad_page(page, "unexpected %s instead of %s (%u)\n", "large/overflow", "branch/leaf/leaf2", page->flags);
|
|
371
|
+
} else if (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) {
|
|
372
|
+
assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_DUPFIX));
|
|
373
|
+
assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
|
|
374
|
+
return bad_page(page, "unexpected %s instead of %s (%u)\n", "branch/leaf/leaf2", "large/overflow", page->flags);
|
|
375
|
+
} else {
|
|
376
|
+
assert(false);
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
if (unlikely(page->txnid > front) && unlikely(page->txnid > txn->front_txnid || front < txn->txnid))
|
|
381
|
+
return bad_page(page, "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", page->txnid,
|
|
382
|
+
(front == txn->front_txnid && front != txn->txnid) ? "front-txn" : "parent-page", front);
|
|
383
|
+
|
|
384
|
+
if (((ILL & P_LARGE) || !is_largepage(page)) && (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0) {
|
|
385
|
+
/* Контроль четности page->upper тут либо приводит к ложным ошибкам,
|
|
386
|
+
* либо слишком дорог по количеству операций. Заковырка в том, что upper
|
|
387
|
+
* может быть нечетным на DUPFIX-страницах, при нечетном количестве
|
|
388
|
+
* элементов нечетной длины. Поэтому четность page->upper здесь не
|
|
389
|
+
* проверяется, но соответствующие полные проверки есть в page_check(). */
|
|
390
|
+
if (unlikely(page->upper < page->lower || (page->lower & 1) || PAGEHDRSZ + page->upper > txn->env->ps))
|
|
391
|
+
return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %zu\n", page->lower, page->upper,
|
|
392
|
+
page_space(txn->env));
|
|
393
|
+
|
|
394
|
+
} else if ((ILL & P_LARGE) == 0) {
|
|
395
|
+
const pgno_t npages = page->pages;
|
|
396
|
+
if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2))
|
|
397
|
+
return bad_page(page, "invalid n-pages (%u) for large-page\n", npages);
|
|
398
|
+
if (unlikely(page->pgno + npages > txn->geo.first_unallocated))
|
|
399
|
+
return bad_page(page, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", page->pgno + npages,
|
|
400
|
+
txn->geo.first_unallocated);
|
|
401
|
+
} else {
|
|
402
|
+
assert(false);
|
|
403
|
+
}
|
|
404
|
+
return MDBX_SUCCESS;
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
/* Runs the header check and, if that passes, the full page_check() on `page`.
 *
 * The returned pgr_t carries `page` and the resulting error code. On any
 * failure the MDBX_TXN_ERROR flag is set on the cursor's transaction. */
__cold static __noinline pgr_t check_page_complete(const uint16_t ILL, page_t *page, const MDBX_cursor *const mc,
                                                   const txnid_t front) {
  pgr_t result = {.page = page, .err = check_page_header(ILL, page, mc->txn, front)};
  if (likely(result.err == MDBX_SUCCESS))
    result.err = page_check(mc, page);

  if (unlikely(result.err != MDBX_SUCCESS))
    mc->txn->flags |= MDBX_TXN_ERROR;
  return result;
}
|
|
416
|
+
|
|
417
|
+
static __always_inline pgr_t page_get_inline(const uint16_t ILL, const MDBX_cursor *const mc, const pgno_t pgno,
|
|
418
|
+
const txnid_t front) {
|
|
419
|
+
MDBX_txn *const txn = mc->txn;
|
|
420
|
+
tASSERT(txn, front <= txn->front_txnid);
|
|
421
|
+
|
|
422
|
+
pgr_t r;
|
|
423
|
+
if (unlikely(pgno >= txn->geo.first_unallocated)) {
|
|
424
|
+
ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno);
|
|
425
|
+
r.page = nullptr;
|
|
426
|
+
r.err = MDBX_PAGE_NOTFOUND;
|
|
427
|
+
bailout:
|
|
428
|
+
txn->flags |= MDBX_TXN_ERROR;
|
|
429
|
+
return r;
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
eASSERT(txn->env, ((txn->flags ^ txn->env->flags) & MDBX_WRITEMAP) == 0);
|
|
433
|
+
r.page = pgno2page(txn->env, pgno);
|
|
434
|
+
if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) {
|
|
435
|
+
const MDBX_txn *spiller = txn;
|
|
436
|
+
do {
|
|
437
|
+
/* Spilled pages were dirtied in this txn and flushed
|
|
438
|
+
* because the dirty list got full. Bring this page
|
|
439
|
+
* back in from the map (but don't unspill it here,
|
|
440
|
+
* leave that unless page_touch happens again). */
|
|
441
|
+
if (unlikely(spiller->flags & MDBX_TXN_SPILLS) && spill_search(spiller, pgno))
|
|
442
|
+
break;
|
|
443
|
+
|
|
444
|
+
const size_t i = dpl_search(spiller, pgno);
|
|
445
|
+
tASSERT(txn, (intptr_t)i > 0);
|
|
446
|
+
if (spiller->tw.dirtylist->items[i].pgno == pgno) {
|
|
447
|
+
r.page = spiller->tw.dirtylist->items[i].ptr;
|
|
448
|
+
break;
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
spiller = spiller->parent;
|
|
452
|
+
} while (unlikely(spiller));
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
if (unlikely(r.page->pgno != pgno)) {
|
|
456
|
+
r.err = bad_page(r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", r.page->pgno, pgno);
|
|
457
|
+
goto bailout;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
if (unlikely(mc->checking & z_pagecheck))
|
|
461
|
+
return check_page_complete(ILL, r.page, mc, front);
|
|
462
|
+
|
|
463
|
+
#if MDBX_DISABLE_VALIDATION
|
|
464
|
+
r.err = MDBX_SUCCESS;
|
|
465
|
+
#else
|
|
466
|
+
r.err = check_page_header(ILL, r.page, txn, front);
|
|
467
|
+
if (unlikely(r.err != MDBX_SUCCESS))
|
|
468
|
+
goto bailout;
|
|
469
|
+
#endif /* MDBX_DISABLE_VALIDATION */
|
|
470
|
+
return r;
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) {
|
|
474
|
+
return page_get_inline(P_ILL_BITS, mc, pgno, front);
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
/* Gets a page via page_get_inline() with the mask P_ILL_BITS | P_LARGE. */
__hot pgr_t page_get_three(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) {
  const uint16_t mask = P_ILL_BITS | P_LARGE;
  return page_get_inline(mask, mc, pgno, front);
}
|
|
480
|
+
|
|
481
|
+
pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) {
|
|
482
|
+
return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_DUPFIX, mc, pgno, front);
|
|
483
|
+
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/// \copyright SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
|
3
|
+
|
|
4
|
+
#include "internals.h"
|
|
5
|
+
|
|
6
|
+
/* Prepares the I/O-vector context `ctx` for writing pages of `txn` to `fd`.
 *
 * items/npages are forwarded to osal_ioring_prepare() (npages is converted by
 * pgno_align2os_bytes() first). Post-write verification is armed when
 * `check_coherence` is set or the environment's incoherence counter is
 * non-zero.
 *
 * Returns the result of osal_ioring_prepare(), which is also kept in
 * ctx->err. */
int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages, mdbx_filehandle_t fd,
             bool check_coherence) {
  MDBX_env *const env = txn->env;
  ctx->env = env;
  ctx->ior = &env->ioring;
  ctx->fd = fd;

  /* UINT64_MAX in the timestamp means "do not verify after write" */
  if (check_coherence || env->lck->pgops.incoherence.weak)
    ctx->coherency_timestamp = 0;
  else
    ctx->coherency_timestamp = UINT64_MAX;

  ctx->err = osal_ioring_prepare(ctx->ior, items, pgno_align2os_bytes(env, npages));
  if (unlikely(ctx->err != MDBX_SUCCESS))
    return ctx->err;

#if MDBX_NEED_WRITTEN_RANGE
  /* start with an empty (inverted) range, iov_page() widens it */
  ctx->flush_begin = MAX_PAGENO;
  ctx->flush_end = MIN_PAGENO;
#endif /* MDBX_NEED_WRITTEN_RANGE */
  osal_ioring_reset(ctx->ior);
  return ctx->err;
}
|
|
23
|
+
|
|
24
|
+
/* Callback passed to osal_ioring_walk() by iov_complete().
 *
 * ctx    - the I/O-vector context;
 * offset - byte offset of the item inside the database file/map;
 * data   - the buffer that was queued, i.e. one or more shadow pages;
 * bytes  - length of the buffer.
 *
 * While no error is pending it optionally compares the written buffer with
 * the memory map (see the rationale inside), then releases the shadow pages
 * regardless of the error state. */
static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes) {
  MDBX_env *const env = ctx->env;
  eASSERT(env, (env->flags & MDBX_WRITEMAP) == 0);

  page_t *shadow = (page_t *)data;
  eASSERT(env, shadow->pgno == bytes2pgno(env, offset));
  eASSERT(env, bytes2pgno(env, bytes) >= (is_largepage(shadow) ? shadow->pages : 1u));
  eASSERT(env, (shadow->flags & P_ILL_BITS) == 0);

  if (likely(ctx->err == MDBX_SUCCESS)) {
    const page_t *const mapped = ptr_disp(env->dxb_mmap.base, offset);
    VALGRIND_MAKE_MEM_DEFINED(mapped, bytes);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(mapped, bytes);
    osal_flush_incoherent_mmap(mapped, bytes, globals.sys_pagesize);
    /* Check with a timeout as the workaround
     * for https://libmdbx.dqdkfa.ru/dead-github/issues/269
     *
     * The problem shows up only with reordering: when the meta-page, which
     * is written last, "overtakes" the pages written before it, i.e. when
     * data written to the file later becomes visible through the mapping
     * earlier than data written before.
     *
     * Originally a full comparison was always performed here. That gave a
     * complete guarantee against the problem but had a cost: a slowdown of
     * up to 10-15% in some scenarios and up to 30% in synthetic tests.
     * Naturally nobody looked into the reasons and people just settled on
     * "libmdbx is not faster than LMDB", e.g. https://clck.ru/3386er
     *
     * So after a series of experiments and tests the following is done:
     *  0. The build option MDBX_FORCE_CHECK_MMAP_COHERENCY=1 enables
     *     the full comparison after writing.
     *     The remaining items are a balanced trade-off between a complete
     *     guarantee of detecting the problem and useless overhead on
     *     systems which do not have this flaw.
     *  1. At the start of a transaction the chosen meta-page is checked
     *     against the b-tree root pages. This check has proven sufficient
     *     without comparing after a write. Detected "incoherence" cases
     *     are counted, and while the counter is non-zero the full
     *     comparison is performed. Thus the full-comparison mode gets
     *     switched on if that check notices the problem at least once.
     *  2. No comparison is done when committing a transaction, because:
     *     - with the "incoherence" flaw (deferred copying or PTE update
     *       after returning from the write-syscall) a check made in this
     *       process does not guarantee that another process, which may
     *       start a transaction right after the commit, sees actual data;
     *     - comparing only the last block nearly restores performance of
     *       large transactions, but also blurs the confidence in the
     *       absence of failures, which devalues the whole effort;
     *     - the meta-page is written after the data, and its consistency
     *       with the b-tree root pages is checked at the start of
     *       transactions; only that check has proven sufficient.
     *  3. When spilling, the written pages are compared in full. It was
     *     tempting to compare only partially, e.g. the beginning and the
     *     end of each block. But pages, including large/overflow ones, may
     *     be spilled repeatedly, which creates a risk of reading an old
     *     version of a page, from before the re-write, within the current
     *     transaction. That could cause extremely rare irreproducible
     *     errors. Since spilling happens very rarely, reliability was
     *     preferred over the savings. */
#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY
#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0
#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */
    if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || ctx->coherency_timestamp != UINT64_MAX) &&
        unlikely(memcmp(shadow, mapped, bytes))) {
      ctx->coherency_timestamp = 0;
      /* saturating increment of the incoherence counter */
      env->lck->pgops.incoherence.weak =
          (env->lck->pgops.incoherence.weak >= INT32_MAX) ? INT32_MAX : env->lck->pgops.incoherence.weak + 1;
      WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", shadow->pgno,
              "(workaround for incoherent flaw of unified page/buffer cache)");
      /* keep re-checking until the map matches or coherency_timeout() gives up */
      do {
        if (coherency_timeout(&ctx->coherency_timestamp, shadow->pgno, env) != MDBX_RESULT_TRUE) {
          ctx->err = MDBX_PROBLEM;
          break;
        }
      } while (unlikely(memcmp(shadow, mapped, bytes)));
    }
  }

  if (likely(bytes == env->ps)) {
    /* the common case of a single page */
    page_shadow_release(env, shadow, 1);
    return;
  }

  /* The buffer holds several pages, release them one (possibly large) page
   * at a time. */
  do {
    eASSERT(env, shadow->pgno == bytes2pgno(env, offset));
    eASSERT(env, (shadow->flags & P_ILL_BITS) == 0);
    const size_t span = is_largepage(shadow) ? shadow->pages : 1u;
    const size_t span_bytes = pgno2bytes(env, span);
    eASSERT(env, bytes >= span_bytes);
    /* compute the successor before the current shadow is released */
    page_t *const following = ptr_disp(shadow, span_bytes);
    page_shadow_release(env, shadow, span);
    shadow = following;
    offset += span_bytes;
    bytes -= span_bytes;
  } while (bytes);
}
|
|
121
|
+
|
|
122
|
+
/* Finishes a batch: unless the environment uses MDBX_WRITEMAP, walks the ring
 * with iov_callback4dirtypages() for every queued item, then resets the
 * ring. */
static void iov_complete(iov_ctx_t *ctx) {
  const bool writemap = (ctx->env->flags & MDBX_WRITEMAP) != 0;
  if (!writemap) {
    osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages);
  }
  osal_ioring_reset(ctx->ior);
}
|
|
127
|
+
|
|
128
|
+
/* Writes out everything queued in the ring of `ctx` to ctx->fd and completes
 * the batch via iov_complete().
 *
 * Returns ctx->err as it stands after completion: the write result, unless
 * the completion callback has replaced it. */
int iov_write(iov_ctx_t *ctx) {
  eASSERT(ctx->env, !iov_empty(ctx));
  const osal_ioring_write_result_t outcome = osal_ioring_write(ctx->ior, ctx->fd);
#if MDBX_ENABLE_PGOP_STAT
  ctx->env->lck->pgops.wops.weak += outcome.wops;
#endif /* MDBX_ENABLE_PGOP_STAT */

  ctx->err = outcome.err;
  if (unlikely(ctx->err != MDBX_SUCCESS)) {
    ERROR("Write error: %s", mdbx_strerror(ctx->err));
  }

  /* the batch is completed even after a failed write */
  iov_complete(ctx);
  return ctx->err;
}
|
|
140
|
+
|
|
141
|
+
/* Queues the dirty page `dp`, spanning `npages` pages, for writing.
 *
 * txn    - the transaction which owns the page;
 * ctx    - I/O-vector context prepared by iov_init(), it must not hold
 *          a pending error;
 * dp     - the page to write, its `txnid` is set to txn->txnid when
 *          the page is shadowed;
 * npages - the number of pages occupied by `dp`.
 *
 * When osal_ioring_add() answers MDBX_RESULT_TRUE (presumably the ring has no
 * room left, to be confirmed against the osal layer) the queued items are
 * written out by iov_write() and the page is added again.
 *
 * Returns MDBX_SUCCESS or an error code. On failure the error is also kept
 * in ctx->err and the ring has been completed/reset. */
int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, size_t npages) {
  MDBX_env *const env = txn->env;
  tASSERT(txn, ctx->err == MDBX_SUCCESS);
  tASSERT(txn, dp->pgno >= MIN_PAGENO && dp->pgno < txn->geo.first_unallocated);
  tASSERT(txn, is_modifable(txn, dp));
  tASSERT(txn, !(dp->flags & ~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE)));

  if (is_shadowed(txn, dp)) {
    tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
    dp->txnid = txn->txnid;
    tASSERT(txn, is_spilled(txn, dp));
#if MDBX_AVOID_MSYNC
  doit:;
#endif /* MDBX_AVOID_MSYNC */
    int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp, pgno2bytes(env, npages));
    if (unlikely(err != MDBX_SUCCESS)) {
      ctx->err = err;
      if (unlikely(err != MDBX_RESULT_TRUE)) {
        iov_complete(ctx);
        return err;
      }
      /* flush what has been queued so far, then retry with the emptied ring */
      err = iov_write(ctx);
      tASSERT(txn, iov_empty(ctx));
      if (unlikely(err != MDBX_SUCCESS)) {
        /* The write failed: iov_write() has already completed the ring and
         * stored the error in ctx->err. Report it right here, otherwise the
         * function would return MDBX_SUCCESS for a page that was never
         * queued. */
        return err;
      }
      err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp, pgno2bytes(env, npages));
      if (unlikely(err != MDBX_SUCCESS)) {
        iov_complete(ctx);
        return ctx->err = err;
      }
      tASSERT(txn, ctx->err == MDBX_SUCCESS);
    }
  } else {
    tASSERT(txn, txn->flags & MDBX_WRITEMAP);
#if MDBX_AVOID_MSYNC
    /* with MDBX_AVOID_MSYNC non-shadowed pages are queued for writing too */
    goto doit;
#endif /* MDBX_AVOID_MSYNC */
  }

#if MDBX_NEED_WRITTEN_RANGE
  /* widen the [flush_begin, flush_end) range to cover this page */
  ctx->flush_begin = (ctx->flush_begin < dp->pgno) ? ctx->flush_begin : dp->pgno;
  ctx->flush_end = (ctx->flush_end > dp->pgno + (pgno_t)npages) ? ctx->flush_end : dp->pgno + (pgno_t)npages;
#endif /* MDBX_NEED_WRITTEN_RANGE */
  return MDBX_SUCCESS;
}
|