lmdb 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGES +4 -0
- data/README.md +4 -1
- data/ext/lmdb_ext/liblmdb/CHANGES +26 -1
- data/ext/lmdb_ext/liblmdb/lmdb.h +80 -10
- data/ext/lmdb_ext/liblmdb/mdb.c +648 -616
- data/ext/lmdb_ext/liblmdb/midl.c +8 -8
- data/ext/lmdb_ext/liblmdb/midl.h +1 -1
- data/ext/lmdb_ext/lmdb_ext.c +28 -45
- data/ext/lmdb_ext/lmdb_ext.h +4 -7
- data/lib/lmdb/version.rb +1 -1
- data/spec/lmdb_spec.rb +7 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e16a42693e5150e076829337a0d44feb46bc3f3
|
4
|
+
data.tar.gz: ce0bc77de0cf3c7e17df2522c8eab69e760e1221
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e02924de6a59386ec215b45baf321a6d098d818900f7816a3805185ab5b7441427cc5af866d2d8141eecfd1d166a659ebc2a87d5ae7626a9a574bd4d1bd4bf44
|
7
|
+
data.tar.gz: 3ee55c5879fd385e53d25eeffd694dab0de03c5b152fe07562f9b631de25ccd90f09ae886a1d30b7f9abd6ea3a8da3fedac0b1f9d69dbffc3874bfad18570dd9
|
data/.travis.yml
CHANGED
data/CHANGES
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# LMDB
|
2
2
|
|
3
|
-
|
3
|
+
[Donate weekly to this project using Gittip](https://www.gittip.com/min4d/ "Donate weekly to this project using Gittip")
|
4
|
+
[Flattr this project](https://flattr.com/submit/auto?user_id=min4d&url=https://github.com/minad/lmdb&title=LMDB&language=&tags=github&category=software)
|
5
|
+
|
6
|
+
Ruby bindings for OpenLDAP's amazing Lightning Memory-Mapped Database (LMDB)
|
4
7
|
http://symas.com/mdb/
|
5
8
|
|
6
9
|
### Installation
|
@@ -1,6 +1,31 @@
|
|
1
1
|
LMDB 0.9 Change Log
|
2
2
|
|
3
|
-
LMDB 0.9.
|
3
|
+
LMDB 0.9.10 Release (2013/11/12)
|
4
|
+
Add MDB_NOMEMINIT option
|
5
|
+
Fix mdb_page_split() again (ITS#7589)
|
6
|
+
Fix MDB_NORDAHEAD definition (ITS#7734)
|
7
|
+
Fix mdb_cursor_del() positioning (ITS#7733)
|
8
|
+
Partial fix for larger page sizes (ITS#7713)
|
9
|
+
Fix Windows64/MSVC build issues
|
10
|
+
|
11
|
+
LMDB 0.9.9 Release (2013/10/24)
|
12
|
+
Add mdb_env_get_fd()
|
13
|
+
Add MDB_NORDAHEAD option
|
14
|
+
Add MDB_NOLOCK option
|
15
|
+
Avoid wasting space in mdb_page_split() (ITS#7589)
|
16
|
+
Fix mdb_page_merge() cursor fixup (ITS#7722)
|
17
|
+
Fix mdb_cursor_del() on last delete (ITS#7718)
|
18
|
+
Fix adding WRITEMAP on existing env (ITS#7715)
|
19
|
+
Fixes for nested txns (ITS#7515)
|
20
|
+
Fix mdb_env_copy() O_DIRECT bug (ITS#7682)
|
21
|
+
Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681)
|
22
|
+
Fix mdb_rebalance() cursor fixup (ITS#7701)
|
23
|
+
Misc code cleanup
|
24
|
+
Documentation
|
25
|
+
Note that by default, readers need write access
|
26
|
+
|
27
|
+
|
28
|
+
LMDB 0.9.8 Release (2013/09/09)
|
4
29
|
Allow mdb_env_set_mapsize() on an open environment
|
5
30
|
Fix mdb_dbi_flags() (ITS#7672)
|
6
31
|
Fix mdb_page_unspill() in nested txns
|
data/ext/lmdb_ext/liblmdb/lmdb.h
CHANGED
@@ -66,6 +66,20 @@
|
|
66
66
|
* BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
|
67
67
|
* Multiple users can cause startup to fail later, as noted above.
|
68
68
|
*
|
69
|
+
* - There is normally no pure read-only mode, since readers need write
|
70
|
+
* access to locks and lock file. Exceptions: On read-only filesystems
|
71
|
+
* or with the #MDB_NOLOCK flag described under #mdb_env_open().
|
72
|
+
*
|
73
|
+
* - By default, in versions before 0.9.10, unused portions of the data
|
74
|
+
* file might receive garbage data from memory freed by other code.
|
75
|
+
* (This does not happen when using the #MDB_WRITEMAP flag.) As of
|
76
|
+
* 0.9.10 the default behavior is to initialize such memory before
|
77
|
+
* writing to the data file. Since there may be a slight performance
|
78
|
+
* cost due to this initialization, applications may disable it using
|
79
|
+
* the #MDB_NOMEMINIT flag. Applications handling sensitive data
|
80
|
+
* which must not be written should not use this flag. This flag is
|
81
|
+
* irrelevant when using #MDB_WRITEMAP.
|
82
|
+
*
|
69
83
|
* - A thread can only use one transaction at a time, plus any child
|
70
84
|
* transactions. Each transaction belongs to one thread. See below.
|
71
85
|
* The #MDB_NOTLS flag changes this for read-only transactions.
|
@@ -170,7 +184,7 @@ typedef int mdb_filehandle_t;
|
|
170
184
|
/** Library minor version */
|
171
185
|
#define MDB_VERSION_MINOR 9
|
172
186
|
/** Library patch version */
|
173
|
-
#define MDB_VERSION_PATCH
|
187
|
+
#define MDB_VERSION_PATCH 10
|
174
188
|
|
175
189
|
/** Combine args a,b,c into a single integer for easy version comparisons */
|
176
190
|
#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
|
@@ -180,7 +194,7 @@ typedef int mdb_filehandle_t;
|
|
180
194
|
MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
|
181
195
|
|
182
196
|
/** The release date of this library version */
|
183
|
-
#define MDB_VERSION_DATE "
|
197
|
+
#define MDB_VERSION_DATE "November 11, 2013"
|
184
198
|
|
185
199
|
/** A stringifier for the version info */
|
186
200
|
#define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
|
@@ -216,13 +230,13 @@ typedef struct MDB_cursor MDB_cursor;
|
|
216
230
|
/** @brief Generic structure used for passing keys and data in and out
|
217
231
|
* of the database.
|
218
232
|
*
|
219
|
-
* Key sizes must be between 1 and the liblmdb build-time constant
|
220
|
-
* #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
|
221
|
-
* same applies to data sizes in databases with the #MDB_DUPSORT flag.
|
222
|
-
* Other data items can in theory be from 0 to 0xffffffff bytes long.
|
223
|
-
*
|
224
233
|
* Values returned from the database are valid only until a subsequent
|
225
|
-
* update operation, or the end of the transaction.
|
234
|
+
* update operation, or the end of the transaction. Do not modify or
|
235
|
+
* free them, they commonly point into the database itself.
|
236
|
+
*
|
237
|
+
* Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
|
238
|
+
* The same applies to data sizes in databases with the #MDB_DUPSORT flag.
|
239
|
+
* Other data items can in theory be from 0 to 0xffffffff bytes long.
|
226
240
|
*/
|
227
241
|
typedef struct MDB_val {
|
228
242
|
size_t mv_size; /**< size of the data item */
|
@@ -265,10 +279,16 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
|
|
265
279
|
#define MDB_NOMETASYNC 0x40000
|
266
280
|
/** use writable mmap */
|
267
281
|
#define MDB_WRITEMAP 0x80000
|
268
|
-
/** use asynchronous msync when MDB_WRITEMAP is used */
|
282
|
+
/** use asynchronous msync when #MDB_WRITEMAP is used */
|
269
283
|
#define MDB_MAPASYNC 0x100000
|
270
284
|
/** tie reader locktable slots to #MDB_txn objects instead of to threads */
|
271
285
|
#define MDB_NOTLS 0x200000
|
286
|
+
/** don't do any locking, caller must manage their own locks */
|
287
|
+
#define MDB_NOLOCK 0x400000
|
288
|
+
/** don't do readahead (no effect on Windows) */
|
289
|
+
#define MDB_NORDAHEAD 0x800000
|
290
|
+
/** don't initialize malloc'd memory before writing to datafile */
|
291
|
+
#define MDB_NOMEMINIT 0x1000000
|
272
292
|
/** @} */
|
273
293
|
|
274
294
|
/** @defgroup mdb_dbi_open Database Flags
|
@@ -486,6 +506,8 @@ int mdb_env_create(MDB_env **env);
|
|
486
506
|
* and uses fewer mallocs, but loses protection from application bugs
|
487
507
|
* like wild pointer writes and other bad updates into the database.
|
488
508
|
* Incompatible with nested transactions.
|
509
|
+
* Processes with and without MDB_WRITEMAP on the same environment do
|
510
|
+
* not cooperate well.
|
489
511
|
* <li>#MDB_NOMETASYNC
|
490
512
|
* Flush system buffers to disk only once per transaction, omit the
|
491
513
|
* metadata flush. Defer that until the system flushes files to disk,
|
@@ -523,6 +545,38 @@ int mdb_env_create(MDB_env **env);
|
|
523
545
|
* user threads over individual OS threads need this option. Such an
|
524
546
|
* application must also serialize the write transactions in an OS
|
525
547
|
* thread, since MDB's write locking is unaware of the user threads.
|
548
|
+
* <li>#MDB_NOLOCK
|
549
|
+
* Don't do any locking. If concurrent access is anticipated, the
|
550
|
+
* caller must manage all concurrency itself. For proper operation
|
551
|
+
* the caller must enforce single-writer semantics, and must ensure
|
552
|
+
* that no readers are using old transactions while a writer is
|
553
|
+
* active. The simplest approach is to use an exclusive lock so that
|
554
|
+
* no readers may be active at all when a writer begins.
|
555
|
+
* <li>#MDB_NORDAHEAD
|
556
|
+
* Turn off readahead. Most operating systems perform readahead on
|
557
|
+
* read requests by default. This option turns it off if the OS
|
558
|
+
* supports it. Turning it off may help random read performance
|
559
|
+
* when the DB is larger than RAM and system RAM is full.
|
560
|
+
* The option is not implemented on Windows.
|
561
|
+
* <li>#MDB_NOMEMINIT
|
562
|
+
* Don't initialize malloc'd memory before writing to unused spaces
|
563
|
+
* in the data file. By default, memory for pages written to the data
|
564
|
+
* file is obtained using malloc. While these pages may be reused in
|
565
|
+
* subsequent transactions, freshly malloc'd pages will be initialized
|
566
|
+
* to zeroes before use. This avoids persisting leftover data from other
|
567
|
+
* code (that used the heap and subsequently freed the memory) into the
|
568
|
+
* data file. Note that many other system libraries may allocate
|
569
|
+
* and free memory from the heap for arbitrary uses. E.g., stdio may
|
570
|
+
* use the heap for file I/O buffers. This initialization step has a
|
571
|
+
* modest performance cost so some applications may want to disable
|
572
|
+
* it using this flag. This option can be a problem for applications
|
573
|
+
* which handle sensitive data like passwords, and it makes memory
|
574
|
+
* checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
|
575
|
+
* which writes directly to the mmap instead of using malloc for pages. The
|
576
|
+
* initialization is also skipped if #MDB_RESERVE is used; the
|
577
|
+
* caller is expected to overwrite all of the memory that was
|
578
|
+
* reserved in that case.
|
579
|
+
* This flag may be changed at any time using #mdb_env_set_flags().
|
526
580
|
* </ul>
|
527
581
|
* @param[in] mode The UNIX permissions to set on created files. This parameter
|
528
582
|
* is ignored on Windows.
|
@@ -656,6 +710,18 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags);
|
|
656
710
|
*/
|
657
711
|
int mdb_env_get_path(MDB_env *env, const char **path);
|
658
712
|
|
713
|
+
/** @brief Return the file descriptor for the given environment.
|
714
|
+
*
|
715
|
+
* @param[in] env An environment handle returned by #mdb_env_create()
|
716
|
+
* @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
|
717
|
+
* @return A non-zero error value on failure and 0 on success. Some possible
|
718
|
+
* errors are:
|
719
|
+
* <ul>
|
720
|
+
* <li>EINVAL - an invalid parameter was specified.
|
721
|
+
* </ul>
|
722
|
+
*/
|
723
|
+
int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
|
724
|
+
|
659
725
|
/** @brief Set the size of the memory map to use for this environment.
|
660
726
|
*
|
661
727
|
* The size should be a multiple of the OS page size. The default is
|
@@ -733,8 +799,10 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
|
|
733
799
|
|
734
800
|
/** @brief Get the maximum size of a key for the environment.
|
735
801
|
*
|
802
|
+
* This is the compile-time constant #MDB_MAXKEYSIZE, default 511.
|
803
|
+
* See @ref MDB_val.
|
736
804
|
* @param[in] env An environment handle returned by #mdb_env_create()
|
737
|
-
* @return The maximum size of a key
|
805
|
+
* @return The maximum size of a key
|
738
806
|
*/
|
739
807
|
int mdb_env_get_maxkeysize(MDB_env *env);
|
740
808
|
|
@@ -1094,6 +1162,8 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
|
|
1094
1162
|
* reserved space, which the caller can fill in later - before
|
1095
1163
|
* the next update operation or the transaction ends. This saves
|
1096
1164
|
* an extra memcpy if the data is being generated later.
|
1165
|
+
* MDB does nothing else with this memory, the caller is expected
|
1166
|
+
* to modify all of the space requested.
|
1097
1167
|
* <li>#MDB_APPEND - append the given key/data pair to the end of the
|
1098
1168
|
* database. No key comparisons are performed. This option allows
|
1099
1169
|
* fast bulk loading when keys are already known to be in the
|
data/ext/lmdb_ext/liblmdb/mdb.c
CHANGED
@@ -37,10 +37,26 @@
|
|
37
37
|
#endif
|
38
38
|
#include <sys/types.h>
|
39
39
|
#include <sys/stat.h>
|
40
|
-
#include <sys/param.h>
|
41
40
|
#ifdef _WIN32
|
42
41
|
#include <windows.h>
|
42
|
+
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
|
43
|
+
* as int64 which is wrong. MSVC doesn't define it at all, so just
|
44
|
+
* don't use it.
|
45
|
+
*/
|
46
|
+
#define MDB_PID_T int
|
47
|
+
#ifdef __GNUC__
|
48
|
+
# include <sys/param.h>
|
43
49
|
#else
|
50
|
+
# define LITTLE_ENDIAN 1234
|
51
|
+
# define BIG_ENDIAN 4321
|
52
|
+
# define BYTE_ORDER LITTLE_ENDIAN
|
53
|
+
# ifndef SSIZE_MAX
|
54
|
+
# define SSIZE_MAX INT_MAX
|
55
|
+
# endif
|
56
|
+
#endif
|
57
|
+
#else
|
58
|
+
#define MDB_PID_T pid_t
|
59
|
+
#include <sys/param.h>
|
44
60
|
#include <sys/uio.h>
|
45
61
|
#include <sys/mman.h>
|
46
62
|
#ifdef HAVE_SYS_FILE_H
|
@@ -75,6 +91,7 @@
|
|
75
91
|
#ifndef _WIN32
|
76
92
|
#include <pthread.h>
|
77
93
|
#ifdef MDB_USE_POSIX_SEM
|
94
|
+
# define MDB_USE_HASH 1
|
78
95
|
#include <semaphore.h>
|
79
96
|
#endif
|
80
97
|
#endif
|
@@ -140,6 +157,7 @@
|
|
140
157
|
* @{
|
141
158
|
*/
|
142
159
|
#ifdef _WIN32
|
160
|
+
#define MDB_USE_HASH 1
|
143
161
|
#define MDB_PIDLOCK 0
|
144
162
|
#define pthread_t DWORD
|
145
163
|
#define pthread_mutex_t HANDLE
|
@@ -171,7 +189,7 @@
|
|
171
189
|
#define Z "I"
|
172
190
|
#else
|
173
191
|
|
174
|
-
#define Z "z"
|
192
|
+
#define Z "z" /**< printf format modifier for size_t */
|
175
193
|
|
176
194
|
/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
|
177
195
|
#define MDB_PIDLOCK 1
|
@@ -317,12 +335,18 @@ static txnid_t mdb_debug_start;
|
|
317
335
|
* The string is printed literally, with no format processing.
|
318
336
|
*/
|
319
337
|
#define DPUTS(arg) DPRINTF(("%s", arg))
|
338
|
+
/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
|
339
|
+
#define DDBI(mc) \
|
340
|
+
(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
|
320
341
|
/** @} */
|
321
342
|
|
322
|
-
/**
|
323
|
-
*
|
324
|
-
*
|
325
|
-
*
|
343
|
+
/** @brief The maximum size of a database page.
|
344
|
+
*
|
345
|
+
* This is 32k, since it must fit in #MDB_page.#mp_upper.
|
346
|
+
*
|
347
|
+
* LMDB will use database pages < OS pages if needed.
|
348
|
+
* That causes more I/O in write transactions: The OS must
|
349
|
+
* know (read) the whole page before writing a partial page.
|
326
350
|
*
|
327
351
|
* Note that we don't currently support Huge pages. On Linux,
|
328
352
|
* regular data files cannot use Huge pages, and in general
|
@@ -331,7 +355,7 @@ static txnid_t mdb_debug_start;
|
|
331
355
|
* pressure from other processes is high. So until OSs have
|
332
356
|
* actual paging support for Huge pages, they're not viable.
|
333
357
|
*/
|
334
|
-
#define
|
358
|
+
#define MAX_PAGESIZE 0x8000
|
335
359
|
|
336
360
|
/** The minimum number of keys required in a database page.
|
337
361
|
* Setting this to a larger value will place a smaller bound on the
|
@@ -365,7 +389,7 @@ static txnid_t mdb_debug_start;
|
|
365
389
|
*
|
366
390
|
* We require that keys all fit onto a regular page. This limit
|
367
391
|
* could be raised a bit further if needed; to something just
|
368
|
-
* under
|
392
|
+
* under (page size / #MDB_MINKEYS / 3).
|
369
393
|
*
|
370
394
|
* Note that data items in an #MDB_DUPSORT database are actually keys
|
371
395
|
* of a subDB, so they're also limited to this size.
|
@@ -425,7 +449,8 @@ typedef uint16_t indx_t;
|
|
425
449
|
*
|
426
450
|
* If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
|
427
451
|
*
|
428
|
-
* No reader table is used if the database is on a read-only filesystem
|
452
|
+
* No reader table is used if the database is on a read-only filesystem, or
|
453
|
+
* if #MDB_NOLOCK is set.
|
429
454
|
*
|
430
455
|
* Since the database uses multi-version concurrency control, readers don't
|
431
456
|
* actually need any locking. This table is used to keep track of which
|
@@ -488,7 +513,7 @@ typedef struct MDB_rxbody {
|
|
488
513
|
*/
|
489
514
|
txnid_t mrb_txnid;
|
490
515
|
/** The process ID of the process owning this reader txn. */
|
491
|
-
|
516
|
+
MDB_PID_T mrb_pid;
|
492
517
|
/** The thread ID of the thread owning this txn. */
|
493
518
|
pthread_t mrb_tid;
|
494
519
|
} MDB_rxbody;
|
@@ -600,7 +625,7 @@ typedef struct MDB_page {
|
|
600
625
|
#define P_LEAF 0x02 /**< leaf page */
|
601
626
|
#define P_OVERFLOW 0x04 /**< overflow page */
|
602
627
|
#define P_META 0x08 /**< meta page */
|
603
|
-
#define P_DIRTY 0x10 /**< dirty page */
|
628
|
+
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
|
604
629
|
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
|
605
630
|
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
|
606
631
|
#define P_KEEP 0x8000 /**< leave this page alone during spill */
|
@@ -786,7 +811,10 @@ typedef struct MDB_db {
|
|
786
811
|
/** Handle for the default DB. */
|
787
812
|
#define MAIN_DBI 1
|
788
813
|
|
789
|
-
/** Meta page content.
|
814
|
+
/** Meta page content.
|
815
|
+
* A meta page is the start point for accessing a database snapshot.
|
816
|
+
* Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
|
817
|
+
*/
|
790
818
|
typedef struct MDB_meta {
|
791
819
|
/** Stamp identifying this as an MDB file. It must be set
|
792
820
|
* to #MDB_MAGIC. */
|
@@ -804,19 +832,18 @@ typedef struct MDB_meta {
|
|
804
832
|
txnid_t mm_txnid; /**< txnid that committed this page */
|
805
833
|
} MDB_meta;
|
806
834
|
|
807
|
-
/** Buffer for a stack-allocated
|
835
|
+
/** Buffer for a stack-allocated meta page.
|
808
836
|
* The members define size and alignment, and silence type
|
809
837
|
* aliasing warnings. They are not used directly; that could
|
810
838
|
* mean incorrectly using several union members in parallel.
|
811
839
|
*/
|
812
|
-
typedef union
|
813
|
-
char mb_raw[MDB_PAGESIZE];
|
840
|
+
typedef union MDB_metabuf {
|
814
841
|
MDB_page mb_page;
|
815
842
|
struct {
|
816
843
|
char mm_pad[PAGEHDRSZ];
|
817
844
|
MDB_meta mm_meta;
|
818
845
|
} mb_metabuf;
|
819
|
-
}
|
846
|
+
} MDB_metabuf;
|
820
847
|
|
821
848
|
/** Auxiliary DB info.
|
822
849
|
* The information here is mostly static/read-only. There is
|
@@ -865,9 +892,9 @@ struct MDB_txn {
|
|
865
892
|
* @ingroup internal
|
866
893
|
* @{
|
867
894
|
*/
|
868
|
-
#define DB_DIRTY 0x01 /**< DB was
|
869
|
-
#define DB_STALE 0x02 /**< DB record is older than txnID */
|
870
|
-
#define DB_NEW 0x04 /**< DB handle opened in this txn */
|
895
|
+
#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */
|
896
|
+
#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
|
897
|
+
#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
|
871
898
|
#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
|
872
899
|
/** @} */
|
873
900
|
/** In write txns, array of cursors for each DB */
|
@@ -889,12 +916,12 @@ struct MDB_txn {
|
|
889
916
|
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
|
890
917
|
/** @} */
|
891
918
|
unsigned int mt_flags; /**< @ref mdb_txn */
|
892
|
-
/** dirty_list
|
893
|
-
|
894
|
-
|
895
|
-
*
|
919
|
+
/** dirty_list room: Array size - #dirty pages visible to this txn.
|
920
|
+
* Includes ancestor txns' dirty pages not hidden by other txns'
|
921
|
+
* dirty/spilled pages. Thus commit(nested txn) has room to merge
|
922
|
+
* dirty_list into mt_parent after freeing hidden mt_parent pages.
|
896
923
|
*/
|
897
|
-
unsigned int
|
924
|
+
unsigned int mt_dirty_room;
|
898
925
|
};
|
899
926
|
|
900
927
|
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
|
@@ -905,7 +932,14 @@ struct MDB_txn {
|
|
905
932
|
|
906
933
|
struct MDB_xcursor;
|
907
934
|
|
908
|
-
/** Cursors are used for all DB operations
|
935
|
+
/** Cursors are used for all DB operations.
|
936
|
+
* A cursor holds a path of (page pointer, key index) from the DB
|
937
|
+
* root to a position in the DB, plus other state. #MDB_DUPSORT
|
938
|
+
* cursors include an xcursor to the current data item. Write txns
|
939
|
+
* track their cursors and keep them up to date when data moves.
|
940
|
+
* Exception: An xcursor's pointer to a #P_SUBP page can be stale.
|
941
|
+
* (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
|
942
|
+
*/
|
909
943
|
struct MDB_cursor {
|
910
944
|
/** Next cursor on this DB in this txn */
|
911
945
|
MDB_cursor *mc_next;
|
@@ -978,16 +1012,18 @@ struct MDB_env {
|
|
978
1012
|
/** Have liveness lock in reader table */
|
979
1013
|
#define MDB_LIVE_READER 0x08000000U
|
980
1014
|
uint32_t me_flags; /**< @ref mdb_env */
|
981
|
-
unsigned int me_psize; /**<
|
1015
|
+
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
|
1016
|
+
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
|
982
1017
|
unsigned int me_maxreaders; /**< size of the reader table */
|
983
1018
|
unsigned int me_numreaders; /**< max numreaders set by this env */
|
984
1019
|
MDB_dbi me_numdbs; /**< number of DBs opened */
|
985
1020
|
MDB_dbi me_maxdbs; /**< size of the DB table */
|
986
|
-
|
1021
|
+
MDB_PID_T me_pid; /**< process ID of this env */
|
987
1022
|
char *me_path; /**< path to the DB files */
|
988
1023
|
char *me_map; /**< the memory map of the data file */
|
989
1024
|
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
|
990
1025
|
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
|
1026
|
+
void *me_pbuf; /**< scratch area for DUPSORT put() */
|
991
1027
|
MDB_txn *me_txn; /**< current write transaction */
|
992
1028
|
size_t me_mapsize; /**< size of the data memory map */
|
993
1029
|
off_t me_size; /**< current file size */
|
@@ -1019,8 +1055,8 @@ struct MDB_env {
|
|
1019
1055
|
|
1020
1056
|
/** Nested transaction */
|
1021
1057
|
typedef struct MDB_ntxn {
|
1022
|
-
MDB_txn mnt_txn;
|
1023
|
-
MDB_pgstate mnt_pgstate;
|
1058
|
+
MDB_txn mnt_txn; /**< the transaction */
|
1059
|
+
MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
|
1024
1060
|
} MDB_ntxn;
|
1025
1061
|
|
1026
1062
|
/** max number of pages to commit in one writev() call */
|
@@ -1042,6 +1078,8 @@ static int mdb_page_search_root(MDB_cursor *mc,
|
|
1042
1078
|
MDB_val *key, int modify);
|
1043
1079
|
#define MDB_PS_MODIFY 1
|
1044
1080
|
#define MDB_PS_ROOTONLY 2
|
1081
|
+
#define MDB_PS_FIRST 4
|
1082
|
+
#define MDB_PS_LAST 8
|
1045
1083
|
static int mdb_page_search(MDB_cursor *mc,
|
1046
1084
|
MDB_val *key, int flags);
|
1047
1085
|
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
|
@@ -1255,7 +1293,7 @@ static void mdb_audit(MDB_txn *txn)
|
|
1255
1293
|
txn->mt_dbs[i].md_leaf_pages +
|
1256
1294
|
txn->mt_dbs[i].md_overflow_pages;
|
1257
1295
|
if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
|
1258
|
-
mdb_page_search(&mc, NULL,
|
1296
|
+
mdb_page_search(&mc, NULL, MDB_PS_FIRST);
|
1259
1297
|
do {
|
1260
1298
|
unsigned j;
|
1261
1299
|
MDB_page *mp;
|
@@ -1300,7 +1338,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
|
|
1300
1338
|
{
|
1301
1339
|
MDB_env *env = txn->mt_env;
|
1302
1340
|
MDB_page *ret = env->me_dpages;
|
1303
|
-
size_t
|
1341
|
+
size_t psize = env->me_psize, sz = psize, off;
|
1342
|
+
/* For ! #MDB_NOMEMINIT, psize counts how much to init.
|
1343
|
+
* For a single page alloc, we init everything after the page header.
|
1344
|
+
* For multi-page, we init the final page; if the caller needed that
|
1345
|
+
* many pages they will be filling in at least up to the last page.
|
1346
|
+
*/
|
1304
1347
|
if (num == 1) {
|
1305
1348
|
if (ret) {
|
1306
1349
|
VGMEMP_ALLOC(env, ret, sz);
|
@@ -1308,10 +1351,16 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
|
|
1308
1351
|
env->me_dpages = ret->mp_next;
|
1309
1352
|
return ret;
|
1310
1353
|
}
|
1354
|
+
psize -= off = PAGEHDRSZ;
|
1311
1355
|
} else {
|
1312
1356
|
sz *= num;
|
1357
|
+
off = sz - psize;
|
1313
1358
|
}
|
1314
1359
|
if ((ret = malloc(sz)) != NULL) {
|
1360
|
+
if (!(env->me_flags & MDB_NOMEMINIT)) {
|
1361
|
+
memset((char *)ret + off, 0, psize);
|
1362
|
+
ret->mp_pad = 0;
|
1363
|
+
}
|
1315
1364
|
VGMEMP_ALLOC(env, ret, sz);
|
1316
1365
|
}
|
1317
1366
|
return ret;
|
@@ -1329,7 +1378,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
|
|
1329
1378
|
env->me_dpages = mp;
|
1330
1379
|
}
|
1331
1380
|
|
1332
|
-
|
1381
|
+
/** Free a dirty page */
|
1333
1382
|
static void
|
1334
1383
|
mdb_dpage_free(MDB_env *env, MDB_page *dp)
|
1335
1384
|
{
|
@@ -1356,7 +1405,7 @@ mdb_dlist_free(MDB_txn *txn)
|
|
1356
1405
|
dl[0].mid = 0;
|
1357
1406
|
}
|
1358
1407
|
|
1359
|
-
|
1408
|
+
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
|
1360
1409
|
* @param[in] mc A cursor handle for the current operation.
|
1361
1410
|
* @param[in] pflags Flags of the pages to update:
|
1362
1411
|
* P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
|
@@ -1366,10 +1415,12 @@ mdb_dlist_free(MDB_txn *txn)
|
|
1366
1415
|
static int
|
1367
1416
|
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
1368
1417
|
{
|
1418
|
+
enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
|
1369
1419
|
MDB_txn *txn = mc->mc_txn;
|
1370
1420
|
MDB_cursor *m3;
|
1371
1421
|
MDB_xcursor *mx;
|
1372
|
-
MDB_page *dp;
|
1422
|
+
MDB_page *dp, *mp;
|
1423
|
+
MDB_node *leaf;
|
1373
1424
|
unsigned i, j;
|
1374
1425
|
int rc = MDB_SUCCESS, level;
|
1375
1426
|
|
@@ -1378,14 +1429,24 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
|
1378
1429
|
mc = NULL; /* will find mc in mt_cursors */
|
1379
1430
|
for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
|
1380
1431
|
for (; mc; mc=mc->mc_next) {
|
1381
|
-
|
1382
|
-
|
1383
|
-
|
1384
|
-
|
1385
|
-
|
1386
|
-
|
1387
|
-
if (
|
1388
|
-
|
1432
|
+
if (!(mc->mc_flags & C_INITIALIZED))
|
1433
|
+
continue;
|
1434
|
+
for (m3 = mc;; m3 = &mx->mx_cursor) {
|
1435
|
+
mp = NULL;
|
1436
|
+
for (j=0; j<m3->mc_snum; j++) {
|
1437
|
+
mp = m3->mc_pg[j];
|
1438
|
+
if ((mp->mp_flags & Mask) == pflags)
|
1439
|
+
mp->mp_flags ^= P_KEEP;
|
1440
|
+
}
|
1441
|
+
mx = m3->mc_xcursor;
|
1442
|
+
/* Proceed to mx if it is at a sub-database */
|
1443
|
+
if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
|
1444
|
+
break;
|
1445
|
+
if (! (mp && (mp->mp_flags & P_LEAF)))
|
1446
|
+
break;
|
1447
|
+
leaf = NODEPTR(mp, m3->mc_ki[j-1]);
|
1448
|
+
if (!(leaf->mn_flags & F_SUBDATA))
|
1449
|
+
break;
|
1389
1450
|
}
|
1390
1451
|
}
|
1391
1452
|
if (i == 0)
|
@@ -1401,7 +1462,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
|
1401
1462
|
continue;
|
1402
1463
|
if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
|
1403
1464
|
break;
|
1404
|
-
if ((dp->mp_flags &
|
1465
|
+
if ((dp->mp_flags & Mask) == pflags && level <= 1)
|
1405
1466
|
dp->mp_flags ^= P_KEEP;
|
1406
1467
|
}
|
1407
1468
|
}
|
@@ -1415,15 +1476,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
|
|
1415
1476
|
/** Spill pages from the dirty list back to disk.
|
1416
1477
|
* This is intended to prevent running into #MDB_TXN_FULL situations,
|
1417
1478
|
* but note that they may still occur in a few cases:
|
1418
|
-
* 1)
|
1419
|
-
*
|
1420
|
-
* too full.
|
1479
|
+
* 1) our estimate of the txn size could be too small. Currently this
|
1480
|
+
* seems unlikely, except with a large number of #MDB_MULTIPLE items.
|
1421
1481
|
* 2) child txns may run out of space if their parents dirtied a
|
1422
1482
|
* lot of pages and never spilled them. TODO: we probably should do
|
1423
1483
|
* a preemptive spill during #mdb_txn_begin() of a child txn, if
|
1424
1484
|
* the parent's dirty_room is below a given threshold.
|
1425
|
-
* 3) our estimate of the txn size could be too small. At the
|
1426
|
-
* moment this seems unlikely.
|
1427
1485
|
*
|
1428
1486
|
* Otherwise, if not using nested txns, it is expected that apps will
|
1429
1487
|
* not run into #MDB_TXN_FULL any more. The pages are flushed to disk
|
@@ -1541,31 +1599,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
|
|
1541
1599
|
rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
|
1542
1600
|
|
1543
1601
|
done:
|
1544
|
-
|
1545
|
-
if (txn->mt_parent) {
|
1546
|
-
txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid;
|
1547
|
-
/* dirty pages that are dirty in an ancestor don't
|
1548
|
-
* count against this txn's dirty_room.
|
1549
|
-
*/
|
1550
|
-
for (i=1; i<=dl[0].mid; i++) {
|
1551
|
-
pgno_t pgno = dl[i].mid;
|
1552
|
-
MDB_txn *tx2;
|
1553
|
-
for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
|
1554
|
-
j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
|
1555
|
-
if (j <= tx2->mt_u.dirty_list[0].mid &&
|
1556
|
-
tx2->mt_u.dirty_list[j].mid == pgno) {
|
1557
|
-
txn->mt_dirty_room++;
|
1558
|
-
break;
|
1559
|
-
}
|
1560
|
-
}
|
1561
|
-
}
|
1562
|
-
} else {
|
1563
|
-
txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid;
|
1564
|
-
}
|
1565
|
-
txn->mt_flags |= MDB_TXN_SPILLS;
|
1566
|
-
} else {
|
1567
|
-
txn->mt_flags |= MDB_TXN_ERROR;
|
1568
|
-
}
|
1602
|
+
txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
|
1569
1603
|
return rc;
|
1570
1604
|
}
|
1571
1605
|
|
@@ -1575,12 +1609,14 @@ mdb_find_oldest(MDB_txn *txn)
|
|
1575
1609
|
{
|
1576
1610
|
int i;
|
1577
1611
|
txnid_t mr, oldest = txn->mt_txnid - 1;
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
oldest
|
1612
|
+
if (txn->mt_env->me_txns) {
|
1613
|
+
MDB_reader *r = txn->mt_env->me_txns->mti_readers;
|
1614
|
+
for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
|
1615
|
+
if (r[i].mr_pid) {
|
1616
|
+
mr = r[i].mr_txnid;
|
1617
|
+
if (oldest > mr)
|
1618
|
+
oldest = mr;
|
1619
|
+
}
|
1584
1620
|
}
|
1585
1621
|
}
|
1586
1622
|
return oldest;
|
@@ -1790,26 +1826,28 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
|
|
1790
1826
|
/** Pull a page off the txn's spill list, if present.
|
1791
1827
|
* If a page being referenced was spilled to disk in this txn, bring
|
1792
1828
|
* it back and make it dirty/writable again.
|
1793
|
-
* @param[in]
|
1829
|
+
* @param[in] txn the transaction handle.
|
1794
1830
|
* @param[in] mp the page being referenced.
|
1795
1831
|
* @param[out] ret the writable page, if any. ret is unchanged if
|
1796
1832
|
* mp wasn't spilled.
|
1797
1833
|
*/
|
1798
1834
|
static int
|
1799
|
-
mdb_page_unspill(MDB_txn *
|
1835
|
+
mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
|
1800
1836
|
{
|
1801
|
-
MDB_env *env =
|
1802
|
-
MDB_txn *
|
1837
|
+
MDB_env *env = txn->mt_env;
|
1838
|
+
const MDB_txn *tx2;
|
1803
1839
|
unsigned x;
|
1804
1840
|
pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
|
1805
1841
|
|
1806
|
-
for (
|
1807
|
-
if (!
|
1842
|
+
for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
|
1843
|
+
if (!tx2->mt_spill_pgs)
|
1808
1844
|
continue;
|
1809
|
-
x = mdb_midl_search(
|
1810
|
-
if (x <=
|
1845
|
+
x = mdb_midl_search(tx2->mt_spill_pgs, pn);
|
1846
|
+
if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
|
1811
1847
|
MDB_page *np;
|
1812
1848
|
int num;
|
1849
|
+
if (txn->mt_dirty_room == 0)
|
1850
|
+
return MDB_TXN_FULL;
|
1813
1851
|
if (IS_OVERFLOW(mp))
|
1814
1852
|
num = mp->mp_pages;
|
1815
1853
|
else
|
@@ -1825,7 +1863,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
|
|
1825
1863
|
else
|
1826
1864
|
mdb_page_copy(np, mp, env->me_psize);
|
1827
1865
|
}
|
1828
|
-
if (
|
1866
|
+
if (tx2 == txn) {
|
1829
1867
|
/* If in current txn, this page is no longer spilled.
|
1830
1868
|
* If it happens to be the last page, truncate the spill list.
|
1831
1869
|
* Otherwise mark it as deleted by setting the LSB.
|
@@ -1838,22 +1876,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
|
|
1838
1876
|
* page remains spilled until child commits
|
1839
1877
|
*/
|
1840
1878
|
|
1841
|
-
|
1842
|
-
MDB_txn *tx2;
|
1843
|
-
/* If this page is also in a parent's dirty list, then
|
1844
|
-
* it's already accounted in dirty_room, and we need to
|
1845
|
-
* cancel out the decrement that mdb_page_dirty does.
|
1846
|
-
*/
|
1847
|
-
for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
|
1848
|
-
x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
|
1849
|
-
if (x <= tx2->mt_u.dirty_list[0].mid &&
|
1850
|
-
tx2->mt_u.dirty_list[x].mid == pgno) {
|
1851
|
-
tx0->mt_dirty_room++;
|
1852
|
-
break;
|
1853
|
-
}
|
1854
|
-
}
|
1855
|
-
}
|
1856
|
-
mdb_page_dirty(tx0, np);
|
1879
|
+
mdb_page_dirty(txn, np);
|
1857
1880
|
np->mp_flags |= P_DIRTY;
|
1858
1881
|
*ret = np;
|
1859
1882
|
break;
|
@@ -1872,7 +1895,6 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1872
1895
|
MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
|
1873
1896
|
MDB_txn *txn = mc->mc_txn;
|
1874
1897
|
MDB_cursor *m2, *m3;
|
1875
|
-
MDB_dbi dbi;
|
1876
1898
|
pgno_t pgno;
|
1877
1899
|
int rc;
|
1878
1900
|
|
@@ -1889,7 +1911,8 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1889
1911
|
(rc = mdb_page_alloc(mc, 1, &np)))
|
1890
1912
|
return rc;
|
1891
1913
|
pgno = np->mp_pgno;
|
1892
|
-
DPRINTF(("touched db %
|
1914
|
+
DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
|
1915
|
+
mp->mp_pgno, pgno));
|
1893
1916
|
assert(mp->mp_pgno != pgno);
|
1894
1917
|
mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
|
1895
1918
|
/* Update the parent page, if any, to point to the new page */
|
@@ -1935,17 +1958,16 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1935
1958
|
done:
|
1936
1959
|
/* Adjust cursors pointing to mp */
|
1937
1960
|
mc->mc_pg[mc->mc_top] = np;
|
1938
|
-
|
1961
|
+
m2 = txn->mt_cursors[mc->mc_dbi];
|
1939
1962
|
if (mc->mc_flags & C_SUB) {
|
1940
|
-
|
1941
|
-
for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
1963
|
+
for (; m2; m2=m2->mc_next) {
|
1942
1964
|
m3 = &m2->mc_xcursor->mx_cursor;
|
1943
1965
|
if (m3->mc_snum < mc->mc_snum) continue;
|
1944
1966
|
if (m3->mc_pg[mc->mc_top] == mp)
|
1945
1967
|
m3->mc_pg[mc->mc_top] = np;
|
1946
1968
|
}
|
1947
1969
|
} else {
|
1948
|
-
for (
|
1970
|
+
for (; m2; m2=m2->mc_next) {
|
1949
1971
|
if (m2->mc_snum < mc->mc_snum) continue;
|
1950
1972
|
if (m2->mc_pg[mc->mc_top] == mp) {
|
1951
1973
|
m2->mc_pg[mc->mc_top] = np;
|
@@ -2087,7 +2109,7 @@ enum Pidlock_op {
|
|
2087
2109
|
* lock on the lockfile, set at an offset equal to the pid.
|
2088
2110
|
*/
|
2089
2111
|
static int
|
2090
|
-
mdb_reader_pid(MDB_env *env, enum Pidlock_op op,
|
2112
|
+
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
|
2091
2113
|
{
|
2092
2114
|
#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
|
2093
2115
|
int ret = 0;
|
@@ -2130,7 +2152,9 @@ static int
|
|
2130
2152
|
mdb_txn_renew0(MDB_txn *txn)
|
2131
2153
|
{
|
2132
2154
|
MDB_env *env = txn->mt_env;
|
2133
|
-
|
2155
|
+
MDB_txninfo *ti = env->me_txns;
|
2156
|
+
MDB_meta *meta;
|
2157
|
+
unsigned int i, nr;
|
2134
2158
|
uint16_t x;
|
2135
2159
|
int rc, new_notls = 0;
|
2136
2160
|
|
@@ -2139,9 +2163,9 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2139
2163
|
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
|
2140
2164
|
|
2141
2165
|
if (txn->mt_flags & MDB_TXN_RDONLY) {
|
2142
|
-
if (!
|
2143
|
-
|
2144
|
-
txn->mt_txnid =
|
2166
|
+
if (!ti) {
|
2167
|
+
meta = env->me_metas[ mdb_env_pick_meta(env) ];
|
2168
|
+
txn->mt_txnid = meta->mm_txnid;
|
2145
2169
|
txn->mt_u.reader = NULL;
|
2146
2170
|
} else {
|
2147
2171
|
MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
|
@@ -2150,7 +2174,7 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2150
2174
|
if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
|
2151
2175
|
return MDB_BAD_RSLOT;
|
2152
2176
|
} else {
|
2153
|
-
|
2177
|
+
MDB_PID_T pid = env->me_pid;
|
2154
2178
|
pthread_t tid = pthread_self();
|
2155
2179
|
|
2156
2180
|
if (!(env->me_flags & MDB_LIVE_READER)) {
|
@@ -2163,36 +2187,43 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2163
2187
|
}
|
2164
2188
|
|
2165
2189
|
LOCK_MUTEX_R(env);
|
2166
|
-
|
2167
|
-
|
2190
|
+
nr = ti->mti_numreaders;
|
2191
|
+
for (i=0; i<nr; i++)
|
2192
|
+
if (ti->mti_readers[i].mr_pid == 0)
|
2168
2193
|
break;
|
2169
2194
|
if (i == env->me_maxreaders) {
|
2170
2195
|
UNLOCK_MUTEX_R(env);
|
2171
2196
|
return MDB_READERS_FULL;
|
2172
2197
|
}
|
2173
|
-
|
2174
|
-
|
2175
|
-
if (i
|
2176
|
-
|
2198
|
+
ti->mti_readers[i].mr_pid = pid;
|
2199
|
+
ti->mti_readers[i].mr_tid = tid;
|
2200
|
+
if (i == nr)
|
2201
|
+
ti->mti_numreaders = ++nr;
|
2177
2202
|
/* Save numreaders for un-mutexed mdb_env_close() */
|
2178
|
-
env->me_numreaders =
|
2203
|
+
env->me_numreaders = nr;
|
2179
2204
|
UNLOCK_MUTEX_R(env);
|
2180
|
-
|
2205
|
+
|
2206
|
+
r = &ti->mti_readers[i];
|
2181
2207
|
new_notls = (env->me_flags & MDB_NOTLS);
|
2182
2208
|
if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
|
2183
2209
|
r->mr_pid = 0;
|
2184
2210
|
return rc;
|
2185
2211
|
}
|
2186
2212
|
}
|
2187
|
-
txn->mt_txnid = r->mr_txnid =
|
2213
|
+
txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
|
2188
2214
|
txn->mt_u.reader = r;
|
2215
|
+
meta = env->me_metas[txn->mt_txnid & 1];
|
2189
2216
|
}
|
2190
|
-
txn->mt_toggle = txn->mt_txnid & 1;
|
2191
2217
|
} else {
|
2192
|
-
|
2218
|
+
if (ti) {
|
2219
|
+
LOCK_MUTEX_W(env);
|
2193
2220
|
|
2194
|
-
|
2195
|
-
|
2221
|
+
txn->mt_txnid = ti->mti_txnid;
|
2222
|
+
meta = env->me_metas[txn->mt_txnid & 1];
|
2223
|
+
} else {
|
2224
|
+
meta = env->me_metas[ mdb_env_pick_meta(env) ];
|
2225
|
+
txn->mt_txnid = meta->mm_txnid;
|
2226
|
+
}
|
2196
2227
|
txn->mt_txnid++;
|
2197
2228
|
#if MDB_DEBUG
|
2198
2229
|
if (txn->mt_txnid == mdb_debug_start)
|
@@ -2208,10 +2239,10 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2208
2239
|
}
|
2209
2240
|
|
2210
2241
|
/* Copy the DB info and flags */
|
2211
|
-
memcpy(txn->mt_dbs,
|
2242
|
+
memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
|
2212
2243
|
|
2213
2244
|
/* Moved to here to avoid a data race in read TXNs */
|
2214
|
-
txn->mt_next_pgno =
|
2245
|
+
txn->mt_next_pgno = meta->mm_last_pg+1;
|
2215
2246
|
|
2216
2247
|
for (i=2; i<txn->mt_numdbs; i++) {
|
2217
2248
|
x = env->me_dbflags[i];
|
@@ -2307,7 +2338,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
|
|
2307
2338
|
return ENOMEM;
|
2308
2339
|
}
|
2309
2340
|
txn->mt_txnid = parent->mt_txnid;
|
2310
|
-
txn->mt_toggle = parent->mt_toggle;
|
2311
2341
|
txn->mt_dirty_room = parent->mt_dirty_room;
|
2312
2342
|
txn->mt_u.dirty_list[0].mid = 0;
|
2313
2343
|
txn->mt_spill_pgs = NULL;
|
@@ -2433,7 +2463,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act)
|
|
2433
2463
|
|
2434
2464
|
env->me_txn = NULL;
|
2435
2465
|
/* The writer mutex was locked in mdb_txn_begin. */
|
2436
|
-
|
2466
|
+
if (env->me_txns)
|
2467
|
+
UNLOCK_MUTEX_W(env);
|
2437
2468
|
}
|
2438
2469
|
}
|
2439
2470
|
|
@@ -2482,20 +2513,26 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2482
2513
|
int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
|
2483
2514
|
txnid_t pglast = 0, head_id = 0;
|
2484
2515
|
pgno_t freecnt = 0, *free_pgs, *mop;
|
2485
|
-
ssize_t head_room = 0, total_room = 0, mop_len;
|
2516
|
+
ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
|
2486
2517
|
|
2487
2518
|
mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
|
2488
2519
|
|
2489
2520
|
if (env->me_pghead) {
|
2490
2521
|
/* Make sure first page of freeDB is touched and on freelist */
|
2491
|
-
rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
|
2522
|
+
rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
|
2492
2523
|
if (rc && rc != MDB_NOTFOUND)
|
2493
2524
|
return rc;
|
2494
2525
|
}
|
2495
2526
|
|
2527
|
+
/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
|
2528
|
+
clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
|
2529
|
+
? SSIZE_MAX : maxfree_1pg;
|
2530
|
+
|
2496
2531
|
for (;;) {
|
2497
2532
|
/* Come back here after each Put() in case freelist changed */
|
2498
2533
|
MDB_val key, data;
|
2534
|
+
pgno_t *pgs;
|
2535
|
+
ssize_t j;
|
2499
2536
|
|
2500
2537
|
/* If using records from freeDB which we have not yet
|
2501
2538
|
* deleted, delete them and any we reserved for me_pghead.
|
@@ -2516,9 +2553,7 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2516
2553
|
if (freecnt < txn->mt_free_pgs[0]) {
|
2517
2554
|
if (!freecnt) {
|
2518
2555
|
/* Make sure last page of freeDB is touched and on freelist */
|
2519
|
-
|
2520
|
-
key.mv_data = NULL;
|
2521
|
-
rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
|
2556
|
+
rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
|
2522
2557
|
if (rc && rc != MDB_NOTFOUND)
|
2523
2558
|
return rc;
|
2524
2559
|
}
|
@@ -2581,11 +2616,16 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2581
2616
|
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
|
2582
2617
|
if (rc)
|
2583
2618
|
return rc;
|
2584
|
-
|
2619
|
+
/* IDL is initially empty, zero out at least the length */
|
2620
|
+
pgs = (pgno_t *)data.mv_data;
|
2621
|
+
j = head_room > clean_limit ? head_room : 0;
|
2622
|
+
do {
|
2623
|
+
pgs[j] = 0;
|
2624
|
+
} while (--j >= 0);
|
2585
2625
|
total_room += head_room;
|
2586
2626
|
}
|
2587
2627
|
|
2588
|
-
/* Fill in the reserved
|
2628
|
+
/* Fill in the reserved me_pghead records */
|
2589
2629
|
rc = MDB_SUCCESS;
|
2590
2630
|
if (mop_len) {
|
2591
2631
|
MDB_val key, data;
|
@@ -2655,8 +2695,7 @@ mdb_page_flush(MDB_txn *txn, int keep)
|
|
2655
2695
|
}
|
2656
2696
|
dp->mp_flags &= ~P_DIRTY;
|
2657
2697
|
}
|
2658
|
-
|
2659
|
-
return MDB_SUCCESS;
|
2698
|
+
goto done;
|
2660
2699
|
}
|
2661
2700
|
|
2662
2701
|
/* Write the pages */
|
@@ -2750,8 +2789,11 @@ mdb_page_flush(MDB_txn *txn, int keep)
|
|
2750
2789
|
}
|
2751
2790
|
mdb_dpage_free(env, dp);
|
2752
2791
|
}
|
2753
|
-
dl[0].mid = j;
|
2754
2792
|
|
2793
|
+
done:
|
2794
|
+
i--;
|
2795
|
+
txn->mt_dirty_room += i - j;
|
2796
|
+
dl[0].mid = j;
|
2755
2797
|
return MDB_SUCCESS;
|
2756
2798
|
}
|
2757
2799
|
|
@@ -2791,14 +2833,18 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2791
2833
|
|
2792
2834
|
if (txn->mt_parent) {
|
2793
2835
|
MDB_txn *parent = txn->mt_parent;
|
2794
|
-
unsigned x, y, len;
|
2795
2836
|
MDB_ID2L dst, src;
|
2837
|
+
MDB_IDL pspill;
|
2838
|
+
unsigned x, y, len, ps_len;
|
2796
2839
|
|
2797
2840
|
/* Append our free list to parent's */
|
2798
2841
|
rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
|
2799
2842
|
if (rc)
|
2800
2843
|
goto fail;
|
2801
2844
|
mdb_midl_free(txn->mt_free_pgs);
|
2845
|
+
/* Failures after this must either undo the changes
|
2846
|
+
* to the parent or set MDB_TXN_ERROR in the parent.
|
2847
|
+
*/
|
2802
2848
|
|
2803
2849
|
parent->mt_next_pgno = txn->mt_next_pgno;
|
2804
2850
|
parent->mt_flags = txn->mt_flags;
|
@@ -2820,37 +2866,26 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2820
2866
|
dst = parent->mt_u.dirty_list;
|
2821
2867
|
src = txn->mt_u.dirty_list;
|
2822
2868
|
/* Remove anything in our dirty list from parent's spill list */
|
2823
|
-
if (parent->mt_spill_pgs) {
|
2824
|
-
x =
|
2825
|
-
|
2826
|
-
/*
|
2827
|
-
for (i=
|
2869
|
+
if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
|
2870
|
+
x = y = ps_len;
|
2871
|
+
pspill[0] = (pgno_t)-1;
|
2872
|
+
/* Mark our dirty pages as deleted in parent spill list */
|
2873
|
+
for (i=0, len=src[0].mid; ++i <= len; ) {
|
2828
2874
|
MDB_ID pn = src[i].mid << 1;
|
2829
|
-
|
2830
|
-
continue;
|
2831
|
-
if (pn > parent->mt_spill_pgs[x]) {
|
2832
|
-
if (x <= 1)
|
2833
|
-
break;
|
2875
|
+
while (pn > pspill[x])
|
2834
2876
|
x--;
|
2835
|
-
|
2836
|
-
|
2837
|
-
|
2838
|
-
len--;
|
2839
|
-
}
|
2840
|
-
/* OK, we had a few hits, squash zeros from the spill list */
|
2841
|
-
if (len < parent->mt_spill_pgs[0]) {
|
2842
|
-
x=1;
|
2843
|
-
for (y=1; y<=parent->mt_spill_pgs[0]; y++) {
|
2844
|
-
if (parent->mt_spill_pgs[y]) {
|
2845
|
-
if (y != x) {
|
2846
|
-
parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y];
|
2847
|
-
}
|
2848
|
-
x++;
|
2849
|
-
}
|
2877
|
+
if (pn == pspill[x]) {
|
2878
|
+
pspill[x] = 1;
|
2879
|
+
y = --x;
|
2850
2880
|
}
|
2851
|
-
parent->mt_spill_pgs[0] = len;
|
2852
2881
|
}
|
2882
|
+
/* Squash deleted pagenums if we deleted any */
|
2883
|
+
for (x=y; ++x <= ps_len; )
|
2884
|
+
if (!(pspill[x] & 1))
|
2885
|
+
pspill[++y] = pspill[x];
|
2886
|
+
pspill[0] = y;
|
2853
2887
|
}
|
2888
|
+
|
2854
2889
|
/* Find len = length of merging our dirty list with parent's */
|
2855
2890
|
x = dst[0].mid;
|
2856
2891
|
dst[0].mid = 0; /* simplify loops */
|
@@ -2884,7 +2919,10 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2884
2919
|
parent->mt_dirty_room = txn->mt_dirty_room;
|
2885
2920
|
if (txn->mt_spill_pgs) {
|
2886
2921
|
if (parent->mt_spill_pgs) {
|
2887
|
-
|
2922
|
+
/* TODO: Prevent failure here, so parent does not fail */
|
2923
|
+
rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
|
2924
|
+
if (rc)
|
2925
|
+
parent->mt_flags |= MDB_TXN_ERROR;
|
2888
2926
|
mdb_midl_free(txn->mt_spill_pgs);
|
2889
2927
|
mdb_midl_sort(parent->mt_spill_pgs);
|
2890
2928
|
} else {
|
@@ -2895,7 +2933,7 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2895
2933
|
parent->mt_child = NULL;
|
2896
2934
|
mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
|
2897
2935
|
free(txn);
|
2898
|
-
return
|
2936
|
+
return rc;
|
2899
2937
|
}
|
2900
2938
|
|
2901
2939
|
if (txn != env->me_txn) {
|
@@ -2954,7 +2992,8 @@ done:
|
|
2954
2992
|
env->me_txn = NULL;
|
2955
2993
|
mdb_dbis_update(txn, 1);
|
2956
2994
|
|
2957
|
-
|
2995
|
+
if (env->me_txns)
|
2996
|
+
UNLOCK_MUTEX_W(env);
|
2958
2997
|
free(txn);
|
2959
2998
|
|
2960
2999
|
return MDB_SUCCESS;
|
@@ -2973,10 +3012,11 @@ fail:
|
|
2973
3012
|
static int
|
2974
3013
|
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
2975
3014
|
{
|
2976
|
-
|
3015
|
+
MDB_metabuf pbuf;
|
2977
3016
|
MDB_page *p;
|
2978
3017
|
MDB_meta *m;
|
2979
3018
|
int i, rc, off;
|
3019
|
+
enum { Size = sizeof(pbuf) };
|
2980
3020
|
|
2981
3021
|
/* We don't know the page size yet, so use a minimum value.
|
2982
3022
|
* Read both meta pages so we can use the latest one.
|
@@ -2988,13 +3028,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
|
2988
3028
|
OVERLAPPED ov;
|
2989
3029
|
memset(&ov, 0, sizeof(ov));
|
2990
3030
|
ov.Offset = off;
|
2991
|
-
rc = ReadFile(env->me_fd
|
3031
|
+
rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
|
2992
3032
|
if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
|
2993
3033
|
rc = 0;
|
2994
3034
|
#else
|
2995
|
-
rc = pread(env->me_fd, &pbuf,
|
3035
|
+
rc = pread(env->me_fd, &pbuf, Size, off);
|
2996
3036
|
#endif
|
2997
|
-
if (rc !=
|
3037
|
+
if (rc != Size) {
|
2998
3038
|
if (rc == 0 && off == 0)
|
2999
3039
|
return ENOENT;
|
3000
3040
|
rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
|
@@ -3109,7 +3149,7 @@ mdb_env_write_meta(MDB_txn *txn)
|
|
3109
3149
|
assert(txn != NULL);
|
3110
3150
|
assert(txn->mt_env != NULL);
|
3111
3151
|
|
3112
|
-
toggle =
|
3152
|
+
toggle = txn->mt_txnid & 1;
|
3113
3153
|
DPRINTF(("writing meta page %d for root page %"Z"u",
|
3114
3154
|
toggle, txn->mt_dbs[MAIN_DBI].md_root));
|
3115
3155
|
|
@@ -3125,11 +3165,18 @@ mdb_env_write_meta(MDB_txn *txn)
|
|
3125
3165
|
mp->mm_last_pg = txn->mt_next_pgno - 1;
|
3126
3166
|
mp->mm_txnid = txn->mt_txnid;
|
3127
3167
|
if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
|
3168
|
+
unsigned meta_size = env->me_psize;
|
3128
3169
|
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
|
3129
3170
|
ptr = env->me_map;
|
3130
|
-
if (toggle)
|
3131
|
-
|
3132
|
-
|
3171
|
+
if (toggle) {
|
3172
|
+
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
|
3173
|
+
if (meta_size < env->me_os_psize)
|
3174
|
+
meta_size += meta_size;
|
3175
|
+
else
|
3176
|
+
#endif
|
3177
|
+
ptr += meta_size;
|
3178
|
+
}
|
3179
|
+
if (MDB_MSYNC(ptr, meta_size, rc)) {
|
3133
3180
|
rc = ErrCode();
|
3134
3181
|
goto fail;
|
3135
3182
|
}
|
@@ -3200,7 +3247,8 @@ done:
|
|
3200
3247
|
* readers will get consistent data regardless of how fresh or
|
3201
3248
|
* how stale their view of these values is.
|
3202
3249
|
*/
|
3203
|
-
env->me_txns
|
3250
|
+
if (env->me_txns)
|
3251
|
+
env->me_txns->mti_txnid = txn->mt_txnid;
|
3204
3252
|
|
3205
3253
|
return MDB_SUCCESS;
|
3206
3254
|
}
|
@@ -3234,6 +3282,7 @@ mdb_env_create(MDB_env **env)
|
|
3234
3282
|
e->me_wmutex = SEM_FAILED;
|
3235
3283
|
#endif
|
3236
3284
|
e->me_pid = getpid();
|
3285
|
+
GET_PAGESIZE(e->me_os_psize);
|
3237
3286
|
VGMEMP_CREATE(e,0,0);
|
3238
3287
|
*env = e;
|
3239
3288
|
return MDB_SUCCESS;
|
@@ -3276,7 +3325,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
|
|
3276
3325
|
int prot = PROT_READ;
|
3277
3326
|
if (flags & MDB_WRITEMAP) {
|
3278
3327
|
prot |= PROT_WRITE;
|
3279
|
-
if (
|
3328
|
+
if (ftruncate(env->me_fd, env->me_mapsize) < 0)
|
3280
3329
|
return ErrCode();
|
3281
3330
|
}
|
3282
3331
|
env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
|
@@ -3285,14 +3334,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
|
|
3285
3334
|
env->me_map = NULL;
|
3286
3335
|
return ErrCode();
|
3287
3336
|
}
|
3288
|
-
|
3337
|
+
|
3338
|
+
if (flags & MDB_NORDAHEAD) {
|
3339
|
+
/* Turn off readahead. It's harmful when the DB is larger than RAM. */
|
3289
3340
|
#ifdef MADV_RANDOM
|
3290
|
-
|
3341
|
+
madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
|
3291
3342
|
#else
|
3292
3343
|
#ifdef POSIX_MADV_RANDOM
|
3293
|
-
|
3344
|
+
posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
|
3294
3345
|
#endif /* POSIX_MADV_RANDOM */
|
3295
3346
|
#endif /* MADV_RANDOM */
|
3347
|
+
}
|
3296
3348
|
#endif /* _WIN32 */
|
3297
3349
|
|
3298
3350
|
/* Can happen because the address argument to mmap() is just a
|
@@ -3323,6 +3375,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
|
|
3323
3375
|
return EINVAL;
|
3324
3376
|
if (!size)
|
3325
3377
|
size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
|
3378
|
+
else if (size < env->me_mapsize) {
|
3379
|
+
/* If the configured size is smaller, make sure it's
|
3380
|
+
* still big enough. Silently round up to minimum if not.
|
3381
|
+
*/
|
3382
|
+
size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
|
3383
|
+
if (size < minsize)
|
3384
|
+
size = minsize;
|
3385
|
+
}
|
3326
3386
|
munmap(env->me_map, env->me_mapsize);
|
3327
3387
|
env->me_mapsize = size;
|
3328
3388
|
old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
|
@@ -3388,7 +3448,9 @@ mdb_env_open2(MDB_env *env)
|
|
3388
3448
|
return i;
|
3389
3449
|
DPUTS("new mdbenv");
|
3390
3450
|
newenv = 1;
|
3391
|
-
|
3451
|
+
env->me_psize = env->me_os_psize;
|
3452
|
+
if (env->me_psize > MAX_PAGESIZE)
|
3453
|
+
env->me_psize = MAX_PAGESIZE;
|
3392
3454
|
} else {
|
3393
3455
|
env->me_psize = meta.mm_psize;
|
3394
3456
|
}
|
@@ -3499,7 +3561,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_
|
|
3499
3561
|
#pragma comment(linker, "/INCLUDE:_tls_used")
|
3500
3562
|
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
|
3501
3563
|
#pragma const_seg(".CRT$XLB")
|
3502
|
-
extern const PIMAGE_TLS_CALLBACK
|
3564
|
+
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
|
3503
3565
|
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
|
3504
3566
|
#pragma const_seg()
|
3505
3567
|
#else /* WIN32 */
|
@@ -3597,7 +3659,7 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
|
|
3597
3659
|
return rc;
|
3598
3660
|
}
|
3599
3661
|
|
3600
|
-
#
|
3662
|
+
#ifdef MDB_USE_HASH
|
3601
3663
|
/*
|
3602
3664
|
* hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
|
3603
3665
|
*
|
@@ -3763,7 +3825,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
|
|
3763
3825
|
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
|
3764
3826
|
if (size < rsize && *excl > 0) {
|
3765
3827
|
#ifdef _WIN32
|
3766
|
-
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
|
3828
|
+
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
|
3767
3829
|
|| !SetEndOfFile(env->me_lfd))
|
3768
3830
|
goto fail_errno;
|
3769
3831
|
#else
|
@@ -3919,8 +3981,9 @@ fail:
|
|
3919
3981
|
* at runtime. Changing other flags requires closing the
|
3920
3982
|
* environment and re-opening it with the new flags.
|
3921
3983
|
*/
|
3922
|
-
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
|
3923
|
-
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|
|
3984
|
+
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
|
3985
|
+
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
|
3986
|
+
MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
|
3924
3987
|
|
3925
3988
|
int
|
3926
3989
|
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
|
@@ -3973,7 +4036,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
3973
4036
|
}
|
3974
4037
|
|
3975
4038
|
/* For RDONLY, get lockfile after we know datafile exists */
|
3976
|
-
if (!
|
4039
|
+
if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
|
3977
4040
|
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
|
3978
4041
|
if (rc)
|
3979
4042
|
goto leave;
|
@@ -4003,7 +4066,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
4003
4066
|
goto leave;
|
4004
4067
|
}
|
4005
4068
|
|
4006
|
-
if (
|
4069
|
+
if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
|
4007
4070
|
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
|
4008
4071
|
if (rc)
|
4009
4072
|
goto leave;
|
@@ -4033,7 +4096,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
4033
4096
|
DPRINTF(("opened dbenv %p", (void *) env));
|
4034
4097
|
if (excl > 0) {
|
4035
4098
|
rc = mdb_env_share_locks(env, &excl);
|
4099
|
+
if (rc)
|
4100
|
+
goto leave;
|
4036
4101
|
}
|
4102
|
+
if (!((flags & MDB_RDONLY) ||
|
4103
|
+
(env->me_pbuf = calloc(1, env->me_psize))))
|
4104
|
+
rc = ENOMEM;
|
4037
4105
|
}
|
4038
4106
|
|
4039
4107
|
leave:
|
@@ -4057,6 +4125,7 @@ mdb_env_close0(MDB_env *env, int excl)
|
|
4057
4125
|
for (i = env->me_maxdbs; --i > MAIN_DBI; )
|
4058
4126
|
free(env->me_dbxs[i].md_name.mv_data);
|
4059
4127
|
|
4128
|
+
free(env->me_pbuf);
|
4060
4129
|
free(env->me_dbflags);
|
4061
4130
|
free(env->me_dbxs);
|
4062
4131
|
free(env->me_path);
|
@@ -4084,7 +4153,7 @@ mdb_env_close0(MDB_env *env, int excl)
|
|
4084
4153
|
if (env->me_fd != INVALID_HANDLE_VALUE)
|
4085
4154
|
(void) close(env->me_fd);
|
4086
4155
|
if (env->me_txns) {
|
4087
|
-
|
4156
|
+
MDB_PID_T pid = env->me_pid;
|
4088
4157
|
/* Clearing readers is done in this function because
|
4089
4158
|
* me_txkey with its destructor must be disabled first.
|
4090
4159
|
*/
|
@@ -4246,14 +4315,6 @@ mdb_env_copy(MDB_env *env, const char *path)
|
|
4246
4315
|
newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
|
4247
4316
|
FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
|
4248
4317
|
#else
|
4249
|
-
#ifdef O_DIRECT
|
4250
|
-
/* The OS supports O_DIRECT, try with it */
|
4251
|
-
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666);
|
4252
|
-
/* But open can fail if O_DIRECT isn't supported by the file system
|
4253
|
-
* so retry without the flag
|
4254
|
-
*/
|
4255
|
-
if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL)
|
4256
|
-
#endif
|
4257
4318
|
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
|
4258
4319
|
#endif
|
4259
4320
|
if (newfd == INVALID_HANDLE_VALUE) {
|
@@ -4261,6 +4322,11 @@ mdb_env_copy(MDB_env *env, const char *path)
|
|
4261
4322
|
goto leave;
|
4262
4323
|
}
|
4263
4324
|
|
4325
|
+
#ifdef O_DIRECT
|
4326
|
+
/* Set O_DIRECT if the file system supports it */
|
4327
|
+
if ((rc = fcntl(newfd, F_GETFL)) != -1)
|
4328
|
+
(void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
|
4329
|
+
#endif
|
4264
4330
|
#ifdef F_NOCACHE /* __APPLE__ */
|
4265
4331
|
rc = fcntl(newfd, F_NOCACHE, 1);
|
4266
4332
|
if (rc) {
|
@@ -4308,7 +4374,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
|
|
4308
4374
|
*(size_t *)a->mv_data > *(size_t *)b->mv_data;
|
4309
4375
|
}
|
4310
4376
|
|
4311
|
-
/** Compare two items pointing at aligned int's */
|
4377
|
+
/** Compare two items pointing at aligned unsigned int's */
|
4312
4378
|
static int
|
4313
4379
|
mdb_cmp_int(const MDB_val *a, const MDB_val *b)
|
4314
4380
|
{
|
@@ -4316,7 +4382,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
|
|
4316
4382
|
*(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
|
4317
4383
|
}
|
4318
4384
|
|
4319
|
-
/** Compare two items pointing at ints of unknown alignment.
|
4385
|
+
/** Compare two items pointing at unsigned ints of unknown alignment.
|
4320
4386
|
* Nodes and keys are guaranteed to be 2-byte aligned.
|
4321
4387
|
*/
|
4322
4388
|
static int
|
@@ -4514,8 +4580,8 @@ mdb_cursor_pop(MDB_cursor *mc)
|
|
4514
4580
|
if (mc->mc_snum)
|
4515
4581
|
mc->mc_top--;
|
4516
4582
|
|
4517
|
-
DPRINTF(("popped page %"Z"u off db %
|
4518
|
-
mc
|
4583
|
+
DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
|
4584
|
+
DDBI(mc), (void *) mc));
|
4519
4585
|
}
|
4520
4586
|
}
|
4521
4587
|
|
@@ -4523,8 +4589,8 @@ mdb_cursor_pop(MDB_cursor *mc)
|
|
4523
4589
|
static int
|
4524
4590
|
mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
|
4525
4591
|
{
|
4526
|
-
DPRINTF(("pushing page %"Z"u on db %
|
4527
|
-
mc
|
4592
|
+
DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
|
4593
|
+
DDBI(mc), (void *) mc));
|
4528
4594
|
|
4529
4595
|
if (mc->mc_snum >= CURSOR_STACK) {
|
4530
4596
|
assert(mc->mc_snum < CURSOR_STACK);
|
@@ -4598,18 +4664,11 @@ done:
|
|
4598
4664
|
return MDB_SUCCESS;
|
4599
4665
|
}
|
4600
4666
|
|
4601
|
-
/**
|
4602
|
-
*
|
4603
|
-
* search on a cursor that has already been initialized. (Usually by
|
4604
|
-
* #mdb_page_search() but also by #mdb_node_move().)
|
4605
|
-
* @param[in,out] mc the cursor for this operation.
|
4606
|
-
* @param[in] key the key to search for. If NULL, search for the lowest
|
4607
|
-
* page. (This is used by #mdb_cursor_first().)
|
4608
|
-
* @param[in] modify If true, visited pages are updated with new page numbers.
|
4609
|
-
* @return 0 on success, non-zero on failure.
|
4667
|
+
/** Finish #mdb_page_search() / #mdb_page_search_lowest().
|
4668
|
+
* The cursor is at the root page, set up the rest of it.
|
4610
4669
|
*/
|
4611
4670
|
static int
|
4612
|
-
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int
|
4671
|
+
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
|
4613
4672
|
{
|
4614
4673
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
4615
4674
|
int rc;
|
@@ -4623,11 +4682,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4623
4682
|
assert(NUMKEYS(mp) > 1);
|
4624
4683
|
DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
|
4625
4684
|
|
4626
|
-
if (
|
4685
|
+
if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
|
4627
4686
|
i = 0;
|
4628
|
-
|
4629
|
-
|
4630
|
-
i = NUMKEYS(mp)-1;
|
4687
|
+
if (flags & MDB_PS_LAST)
|
4688
|
+
i = NUMKEYS(mp) - 1;
|
4631
4689
|
} else {
|
4632
4690
|
int exact;
|
4633
4691
|
node = mdb_node_search(mc, key, &exact);
|
@@ -4640,10 +4698,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4640
4698
|
i--;
|
4641
4699
|
}
|
4642
4700
|
}
|
4701
|
+
DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
|
4643
4702
|
}
|
4644
4703
|
|
4645
|
-
if (key)
|
4646
|
-
DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
|
4647
4704
|
assert(i < NUMKEYS(mp));
|
4648
4705
|
node = NODEPTR(mp, i);
|
4649
4706
|
|
@@ -4654,7 +4711,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4654
4711
|
if ((rc = mdb_cursor_push(mc, mp)))
|
4655
4712
|
return rc;
|
4656
4713
|
|
4657
|
-
if (
|
4714
|
+
if (flags & MDB_PS_MODIFY) {
|
4658
4715
|
if ((rc = mdb_page_touch(mc)) != 0)
|
4659
4716
|
return rc;
|
4660
4717
|
mp = mc->mc_pg[mc->mc_top];
|
@@ -4668,7 +4725,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4668
4725
|
}
|
4669
4726
|
|
4670
4727
|
DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
|
4671
|
-
key ? DKEY(key) :
|
4728
|
+
key ? DKEY(key) : "null"));
|
4672
4729
|
mc->mc_flags |= C_INITIALIZED;
|
4673
4730
|
mc->mc_flags &= ~C_EOF;
|
4674
4731
|
|
@@ -4694,18 +4751,17 @@ mdb_page_search_lowest(MDB_cursor *mc)
|
|
4694
4751
|
mc->mc_ki[mc->mc_top] = 0;
|
4695
4752
|
if ((rc = mdb_cursor_push(mc, mp)))
|
4696
4753
|
return rc;
|
4697
|
-
return mdb_page_search_root(mc, NULL,
|
4754
|
+
return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
|
4698
4755
|
}
|
4699
4756
|
|
4700
4757
|
/** Search for the page a given key should be in.
|
4701
|
-
*
|
4702
|
-
* the search; it finds the root page for \b mc's database and sets this
|
4703
|
-
* as the root of the cursor's stack. Then #mdb_page_search_root() is
|
4704
|
-
* called to complete the search.
|
4758
|
+
* Push it and its parent pages on the cursor stack.
|
4705
4759
|
* @param[in,out] mc the cursor for this operation.
|
4706
|
-
* @param[in] key the key to search for
|
4707
|
-
*
|
4708
|
-
*
|
4760
|
+
* @param[in] key the key to search for, or NULL for first/last page.
|
4761
|
+
* @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
|
4762
|
+
* are touched (updated with new page numbers).
|
4763
|
+
* If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
|
4764
|
+
* This is used by #mdb_cursor_first() and #mdb_cursor_last().
|
4709
4765
|
* If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
|
4710
4766
|
* @return 0 on success, non-zero on failure.
|
4711
4767
|
*/
|
@@ -4716,23 +4772,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4716
4772
|
pgno_t root;
|
4717
4773
|
|
4718
4774
|
/* Make sure the txn is still viable, then find the root from
|
4719
|
-
* the txn's db table.
|
4775
|
+
* the txn's db table and set it as the root of the cursor's stack.
|
4720
4776
|
*/
|
4721
4777
|
if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
|
4722
4778
|
DPUTS("transaction has failed, must abort");
|
4723
4779
|
return MDB_BAD_TXN;
|
4724
4780
|
} else {
|
4725
4781
|
/* Make sure we're using an up-to-date root */
|
4726
|
-
if (mc->
|
4727
|
-
if ((*mc->mc_dbflag & DB_STALE) ||
|
4728
|
-
((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
|
4782
|
+
if (*mc->mc_dbflag & DB_STALE) {
|
4729
4783
|
MDB_cursor mc2;
|
4730
|
-
unsigned char dbflag = 0;
|
4731
4784
|
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
|
4732
|
-
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name,
|
4785
|
+
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
|
4733
4786
|
if (rc)
|
4734
4787
|
return rc;
|
4735
|
-
|
4788
|
+
{
|
4736
4789
|
MDB_val data;
|
4737
4790
|
int exact = 0;
|
4738
4791
|
uint16_t flags;
|
@@ -4752,11 +4805,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4752
4805
|
return MDB_INCOMPATIBLE;
|
4753
4806
|
memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
|
4754
4807
|
}
|
4755
|
-
if (flags & MDB_PS_MODIFY)
|
4756
|
-
dbflag = DB_DIRTY;
|
4757
4808
|
*mc->mc_dbflag &= ~DB_STALE;
|
4758
|
-
*mc->mc_dbflag |= dbflag;
|
4759
|
-
}
|
4760
4809
|
}
|
4761
4810
|
root = mc->mc_db->md_root;
|
4762
4811
|
|
@@ -4774,8 +4823,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4774
4823
|
mc->mc_snum = 1;
|
4775
4824
|
mc->mc_top = 0;
|
4776
4825
|
|
4777
|
-
DPRINTF(("db %
|
4778
|
-
mc
|
4826
|
+
DPRINTF(("db %d root page %"Z"u has flags 0x%X",
|
4827
|
+
DDBI(mc), root, mc->mc_pg[0]->mp_flags));
|
4779
4828
|
|
4780
4829
|
if (flags & MDB_PS_MODIFY) {
|
4781
4830
|
if ((rc = mdb_page_touch(mc)))
|
@@ -4914,7 +4963,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi,
|
|
4914
4963
|
if (txn->mt_flags & MDB_TXN_ERROR)
|
4915
4964
|
return MDB_BAD_TXN;
|
4916
4965
|
|
4917
|
-
if (key->mv_size
|
4966
|
+
if (key->mv_size > MDB_MAXKEYSIZE) {
|
4918
4967
|
return MDB_BAD_VALSIZE;
|
4919
4968
|
}
|
4920
4969
|
|
@@ -4966,8 +5015,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
|
|
4966
5015
|
assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
|
4967
5016
|
|
4968
5017
|
indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
4969
|
-
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0)
|
5018
|
+
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
|
5019
|
+
/* mc will be inconsistent if caller does mc_snum++ as above */
|
5020
|
+
mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
|
4970
5021
|
return rc;
|
5022
|
+
}
|
4971
5023
|
|
4972
5024
|
mdb_cursor_push(mc, mp);
|
4973
5025
|
if (!move_right)
|
@@ -5143,7 +5195,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5143
5195
|
|
5144
5196
|
assert(mc);
|
5145
5197
|
assert(key);
|
5146
|
-
|
5198
|
+
if (key->mv_size == 0)
|
5199
|
+
return MDB_BAD_VALSIZE;
|
5147
5200
|
|
5148
5201
|
if (mc->mc_xcursor)
|
5149
5202
|
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
|
@@ -5329,7 +5382,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
5329
5382
|
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
|
5330
5383
|
|
5331
5384
|
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
5332
|
-
rc = mdb_page_search(mc, NULL,
|
5385
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
|
5333
5386
|
if (rc != MDB_SUCCESS)
|
5334
5387
|
return rc;
|
5335
5388
|
}
|
@@ -5375,11 +5428,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
5375
5428
|
if (!(mc->mc_flags & C_EOF)) {
|
5376
5429
|
|
5377
5430
|
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
5378
|
-
|
5379
|
-
|
5380
|
-
lkey.mv_size = MDB_MAXKEYSIZE+1;
|
5381
|
-
lkey.mv_data = NULL;
|
5382
|
-
rc = mdb_page_search(mc, &lkey, 0);
|
5431
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
|
5383
5432
|
if (rc != MDB_SUCCESS)
|
5384
5433
|
return rc;
|
5385
5434
|
}
|
@@ -5431,8 +5480,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5431
5480
|
rc = EINVAL;
|
5432
5481
|
} else {
|
5433
5482
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
5434
|
-
|
5435
|
-
|
5483
|
+
int nkeys = NUMKEYS(mp);
|
5484
|
+
if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
|
5485
|
+
mc->mc_ki[mc->mc_top] = nkeys;
|
5436
5486
|
rc = MDB_NOTFOUND;
|
5437
5487
|
break;
|
5438
5488
|
}
|
@@ -5471,7 +5521,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5471
5521
|
case MDB_SET_RANGE:
|
5472
5522
|
if (key == NULL) {
|
5473
5523
|
rc = EINVAL;
|
5474
|
-
} else if (key->mv_size
|
5524
|
+
} else if (key->mv_size > MDB_MAXKEYSIZE) {
|
5475
5525
|
rc = MDB_BAD_VALSIZE;
|
5476
5526
|
} else if (op == MDB_SET_RANGE)
|
5477
5527
|
rc = mdb_cursor_set(mc, key, data, op, NULL);
|
@@ -5577,14 +5627,14 @@ fetchm:
|
|
5577
5627
|
return rc;
|
5578
5628
|
}
|
5579
5629
|
|
5580
|
-
/** Touch all the pages in the cursor stack.
|
5630
|
+
/** Touch all the pages in the cursor stack. Set mc_top.
|
5581
5631
|
* Makes sure all the pages are writable, before attempting a write operation.
|
5582
5632
|
* @param[in] mc The cursor to operate on.
|
5583
5633
|
*/
|
5584
5634
|
static int
|
5585
5635
|
mdb_cursor_touch(MDB_cursor *mc)
|
5586
5636
|
{
|
5587
|
-
int rc;
|
5637
|
+
int rc = MDB_SUCCESS;
|
5588
5638
|
|
5589
5639
|
if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
|
5590
5640
|
MDB_cursor mc2;
|
@@ -5595,13 +5645,14 @@ mdb_cursor_touch(MDB_cursor *mc)
|
|
5595
5645
|
return rc;
|
5596
5646
|
*mc->mc_dbflag |= DB_DIRTY;
|
5597
5647
|
}
|
5598
|
-
|
5599
|
-
|
5600
|
-
|
5601
|
-
|
5648
|
+
mc->mc_top = 0;
|
5649
|
+
if (mc->mc_snum) {
|
5650
|
+
do {
|
5651
|
+
rc = mdb_page_touch(mc);
|
5652
|
+
} while (!rc && ++(mc->mc_top) < mc->mc_snum);
|
5653
|
+
mc->mc_top = mc->mc_snum-1;
|
5602
5654
|
}
|
5603
|
-
|
5604
|
-
return MDB_SUCCESS;
|
5655
|
+
return rc;
|
5605
5656
|
}
|
5606
5657
|
|
5607
5658
|
/** Do not spill pages to disk if txn is getting full, may fail instead */
|
@@ -5612,15 +5663,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5612
5663
|
unsigned int flags)
|
5613
5664
|
{
|
5614
5665
|
enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
|
5666
|
+
MDB_env *env = mc->mc_txn->mt_env;
|
5615
5667
|
MDB_node *leaf = NULL;
|
5616
5668
|
MDB_val xdata, *rdata, dkey;
|
5617
|
-
MDB_page *fp;
|
5618
5669
|
MDB_db dummy;
|
5619
5670
|
int do_sub = 0, insert = 0;
|
5620
5671
|
unsigned int mcount = 0, dcount = 0, nospill;
|
5621
5672
|
size_t nsize;
|
5622
5673
|
int rc, rc2;
|
5623
|
-
MDB_pagebuf pbuf;
|
5624
5674
|
char dbuf[MDB_MAXKEYSIZE+1];
|
5625
5675
|
unsigned int nflags;
|
5626
5676
|
DKBUF;
|
@@ -5652,8 +5702,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5652
5702
|
return MDB_BAD_VALSIZE;
|
5653
5703
|
#endif
|
5654
5704
|
|
5655
|
-
DPRINTF(("==> put db %
|
5656
|
-
mc
|
5705
|
+
DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
|
5706
|
+
DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
|
5657
5707
|
|
5658
5708
|
dkey.mv_size = 0;
|
5659
5709
|
|
@@ -5664,6 +5714,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5664
5714
|
} else if (mc->mc_db->md_root == P_INVALID) {
|
5665
5715
|
/* new database, cursor has nothing to point to */
|
5666
5716
|
mc->mc_snum = 0;
|
5717
|
+
mc->mc_top = 0;
|
5667
5718
|
mc->mc_flags &= ~C_INITIALIZED;
|
5668
5719
|
rc = MDB_NO_ROOT;
|
5669
5720
|
} else {
|
@@ -5733,6 +5784,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5733
5784
|
|
5734
5785
|
/* The key already exists */
|
5735
5786
|
if (rc == MDB_SUCCESS) {
|
5787
|
+
MDB_page *fp, *mp;
|
5788
|
+
MDB_val olddata;
|
5789
|
+
|
5736
5790
|
/* there's only a key anyway, so this is a no-op */
|
5737
5791
|
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
5738
5792
|
unsigned int ksize = mc->mc_db->md_pad;
|
@@ -5745,19 +5799,23 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5745
5799
|
return MDB_SUCCESS;
|
5746
5800
|
}
|
5747
5801
|
|
5802
|
+
more:
|
5748
5803
|
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
5804
|
+
olddata.mv_size = NODEDSZ(leaf);
|
5805
|
+
olddata.mv_data = NODEDATA(leaf);
|
5749
5806
|
|
5750
5807
|
/* DB has dups? */
|
5751
5808
|
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
|
5809
|
+
mp = fp = xdata.mv_data = env->me_pbuf;
|
5810
|
+
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5811
|
+
|
5752
5812
|
/* Was a single item before, must convert now */
|
5753
|
-
more:
|
5754
5813
|
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
5755
5814
|
/* Just overwrite the current item */
|
5756
5815
|
if (flags == MDB_CURRENT)
|
5757
5816
|
goto current;
|
5758
5817
|
|
5759
|
-
dkey
|
5760
|
-
dkey.mv_data = NODEDATA(leaf);
|
5818
|
+
dkey = olddata;
|
5761
5819
|
#if UINT_MAX < SIZE_MAX
|
5762
5820
|
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
|
5763
5821
|
#ifdef MISALIGNED_OK
|
@@ -5780,85 +5838,76 @@ more:
|
|
5780
5838
|
/* create a fake page for the dup items */
|
5781
5839
|
memcpy(dbuf, dkey.mv_data, dkey.mv_size);
|
5782
5840
|
dkey.mv_data = dbuf;
|
5783
|
-
fp = (MDB_page *)&pbuf;
|
5784
|
-
fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5785
5841
|
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
|
5786
5842
|
fp->mp_lower = PAGEHDRSZ;
|
5787
|
-
|
5843
|
+
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
|
5788
5844
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5789
5845
|
fp->mp_flags |= P_LEAF2;
|
5790
5846
|
fp->mp_pad = data->mv_size;
|
5791
|
-
|
5847
|
+
xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
|
5792
5848
|
} else {
|
5793
|
-
|
5849
|
+
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
|
5794
5850
|
(dkey.mv_size & 1) + (data->mv_size & 1);
|
5795
5851
|
}
|
5796
|
-
|
5797
|
-
|
5798
|
-
|
5799
|
-
|
5800
|
-
|
5801
|
-
|
5802
|
-
goto new_sub;
|
5803
|
-
}
|
5804
|
-
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
5852
|
+
fp->mp_upper = xdata.mv_size;
|
5853
|
+
} else if (leaf->mn_flags & F_SUBDATA) {
|
5854
|
+
/* Data is on sub-DB, just store it */
|
5855
|
+
flags |= F_DUPDATA|F_SUBDATA;
|
5856
|
+
goto put_sub;
|
5857
|
+
} else {
|
5805
5858
|
/* See if we need to convert from fake page to subDB */
|
5806
|
-
MDB_page *mp;
|
5807
5859
|
unsigned int offset;
|
5808
5860
|
unsigned int i;
|
5809
5861
|
uint16_t fp_flags;
|
5810
5862
|
|
5811
|
-
fp =
|
5812
|
-
|
5813
|
-
|
5863
|
+
fp = olddata.mv_data;
|
5864
|
+
switch (flags) {
|
5865
|
+
default:
|
5866
|
+
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
|
5867
|
+
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
5868
|
+
offset += offset & 1;
|
5869
|
+
break;
|
5870
|
+
}
|
5871
|
+
offset = fp->mp_pad;
|
5872
|
+
if (SIZELEFT(fp) < offset) {
|
5873
|
+
offset *= 4; /* space for 4 more */
|
5874
|
+
break;
|
5875
|
+
}
|
5876
|
+
/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
|
5877
|
+
case MDB_CURRENT:
|
5814
5878
|
fp->mp_flags |= P_DIRTY;
|
5815
|
-
COPY_PGNO(fp->mp_pgno,
|
5879
|
+
COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
|
5816
5880
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
|
5817
5881
|
flags |= F_DUPDATA;
|
5818
5882
|
goto put_sub;
|
5819
5883
|
}
|
5820
|
-
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5821
|
-
offset = fp->mp_pad;
|
5822
|
-
if (SIZELEFT(fp) >= offset)
|
5823
|
-
goto reuse;
|
5824
|
-
offset *= 4; /* space for 4 more */
|
5825
|
-
} else {
|
5826
|
-
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
5827
|
-
}
|
5828
|
-
offset += offset & 1;
|
5829
5884
|
fp_flags = fp->mp_flags;
|
5830
|
-
|
5831
|
-
|
5885
|
+
xdata.mv_size = olddata.mv_size + offset;
|
5886
|
+
if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
|
5887
|
+
>= env->me_nodemax) {
|
5832
5888
|
/* yes, convert it */
|
5833
|
-
dummy.md_flags = 0;
|
5834
5889
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5835
5890
|
dummy.md_pad = fp->mp_pad;
|
5836
5891
|
dummy.md_flags = MDB_DUPFIXED;
|
5837
5892
|
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
|
5838
5893
|
dummy.md_flags |= MDB_INTEGERKEY;
|
5894
|
+
} else {
|
5895
|
+
dummy.md_pad = 0;
|
5896
|
+
dummy.md_flags = 0;
|
5839
5897
|
}
|
5840
5898
|
dummy.md_depth = 1;
|
5841
5899
|
dummy.md_branch_pages = 0;
|
5842
5900
|
dummy.md_leaf_pages = 1;
|
5843
5901
|
dummy.md_overflow_pages = 0;
|
5844
5902
|
dummy.md_entries = NUMKEYS(fp);
|
5845
|
-
rdata = &xdata;
|
5846
5903
|
xdata.mv_size = sizeof(MDB_db);
|
5847
5904
|
xdata.mv_data = &dummy;
|
5848
5905
|
if ((rc = mdb_page_alloc(mc, 1, &mp)))
|
5849
5906
|
return rc;
|
5850
|
-
offset =
|
5907
|
+
offset = env->me_psize - olddata.mv_size;
|
5851
5908
|
flags |= F_DUPDATA|F_SUBDATA;
|
5852
5909
|
dummy.md_root = mp->mp_pgno;
|
5853
5910
|
fp_flags &= ~P_SUBP;
|
5854
|
-
} else {
|
5855
|
-
/* no, just grow it */
|
5856
|
-
rdata = &xdata;
|
5857
|
-
xdata.mv_size = NODEDSZ(leaf) + offset;
|
5858
|
-
xdata.mv_data = &pbuf;
|
5859
|
-
mp = (MDB_page *)&pbuf;
|
5860
|
-
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5861
|
-
flags |= F_DUPDATA;
|
5862
5911
|
}
|
5863
5912
|
mp->mp_flags = fp_flags | P_DIRTY;
|
5864
5913
|
mp->mp_pad = fp->mp_pad;
|
@@ -5867,28 +5916,27 @@ reuse:
|
|
5867
5916
|
if (IS_LEAF2(fp)) {
|
5868
5917
|
memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
|
5869
5918
|
} else {
|
5870
|
-
|
5871
|
-
|
5919
|
+
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
|
5920
|
+
olddata.mv_size - fp->mp_upper);
|
5872
5921
|
for (i=0; i<NUMKEYS(fp); i++)
|
5873
5922
|
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
|
5874
5923
|
}
|
5875
|
-
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5876
|
-
do_sub = 1;
|
5877
|
-
goto new_sub;
|
5878
5924
|
}
|
5879
|
-
|
5880
|
-
|
5881
|
-
|
5925
|
+
|
5926
|
+
rdata = &xdata;
|
5927
|
+
flags |= F_DUPDATA;
|
5928
|
+
do_sub = 1;
|
5929
|
+
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5930
|
+
goto new_sub;
|
5882
5931
|
}
|
5883
5932
|
current:
|
5884
5933
|
/* overflow page overwrites need special handling */
|
5885
5934
|
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
5886
5935
|
MDB_page *omp;
|
5887
5936
|
pgno_t pg;
|
5888
|
-
|
5889
|
-
int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
|
5937
|
+
int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
|
5890
5938
|
|
5891
|
-
memcpy(&pg,
|
5939
|
+
memcpy(&pg, olddata.mv_data, sizeof(pg));
|
5892
5940
|
if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
|
5893
5941
|
return rc2;
|
5894
5942
|
ovpages = omp->mp_pages;
|
@@ -5896,7 +5944,7 @@ current:
|
|
5896
5944
|
/* Is the ov page large enough? */
|
5897
5945
|
if (ovpages >= dpages) {
|
5898
5946
|
if (!(omp->mp_flags & P_DIRTY) &&
|
5899
|
-
(level || (
|
5947
|
+
(level || (env->me_flags & MDB_WRITEMAP)))
|
5900
5948
|
{
|
5901
5949
|
rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
|
5902
5950
|
if (rc)
|
@@ -5911,7 +5959,7 @@ current:
|
|
5911
5959
|
*/
|
5912
5960
|
if (level > 1) {
|
5913
5961
|
/* It is writable only in a parent txn */
|
5914
|
-
size_t sz = (size_t)
|
5962
|
+
size_t sz = (size_t) env->me_psize * ovpages, off;
|
5915
5963
|
MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
|
5916
5964
|
MDB_ID2 id2;
|
5917
5965
|
if (!np)
|
@@ -5941,15 +5989,15 @@ current:
|
|
5941
5989
|
}
|
5942
5990
|
if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
|
5943
5991
|
return rc2;
|
5944
|
-
} else if (
|
5992
|
+
} else if (data->mv_size == olddata.mv_size) {
|
5945
5993
|
/* same size, just replace it. Note that we could
|
5946
5994
|
* also reuse this node if the new data is smaller,
|
5947
5995
|
* but instead we opt to shrink the node in that case.
|
5948
5996
|
*/
|
5949
5997
|
if (F_ISSET(flags, MDB_RESERVE))
|
5950
|
-
data->mv_data =
|
5998
|
+
data->mv_data = olddata.mv_data;
|
5951
5999
|
else if (data->mv_size)
|
5952
|
-
memcpy(
|
6000
|
+
memcpy(olddata.mv_data, data->mv_data, data->mv_size);
|
5953
6001
|
else
|
5954
6002
|
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
|
5955
6003
|
goto done;
|
@@ -5965,7 +6013,7 @@ current:
|
|
5965
6013
|
|
5966
6014
|
new_sub:
|
5967
6015
|
nflags = flags & NODE_ADD_FLAGS;
|
5968
|
-
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(
|
6016
|
+
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
|
5969
6017
|
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
|
5970
6018
|
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
|
5971
6019
|
nflags &= ~MDB_APPEND;
|
@@ -5982,9 +6030,6 @@ new_sub:
|
|
5982
6030
|
unsigned i = mc->mc_top;
|
5983
6031
|
MDB_page *mp = mc->mc_pg[i];
|
5984
6032
|
|
5985
|
-
if (mc->mc_flags & C_SUB)
|
5986
|
-
dbi--;
|
5987
|
-
|
5988
6033
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
5989
6034
|
if (mc->mc_flags & C_SUB)
|
5990
6035
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -6062,7 +6107,6 @@ next_mult:
|
|
6062
6107
|
data[1].mv_size = mcount;
|
6063
6108
|
if (mcount < dcount) {
|
6064
6109
|
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
|
6065
|
-
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
6066
6110
|
goto more;
|
6067
6111
|
}
|
6068
6112
|
}
|
@@ -6081,6 +6125,7 @@ int
|
|
6081
6125
|
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
6082
6126
|
{
|
6083
6127
|
MDB_node *leaf;
|
6128
|
+
MDB_page *mp;
|
6084
6129
|
int rc;
|
6085
6130
|
|
6086
6131
|
if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
@@ -6089,17 +6134,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
|
6089
6134
|
if (!(mc->mc_flags & C_INITIALIZED))
|
6090
6135
|
return EINVAL;
|
6091
6136
|
|
6137
|
+
if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
|
6138
|
+
return MDB_NOTFOUND;
|
6139
|
+
|
6092
6140
|
if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
|
6093
6141
|
return rc;
|
6094
|
-
flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */
|
6095
6142
|
|
6096
6143
|
rc = mdb_cursor_touch(mc);
|
6097
6144
|
if (rc)
|
6098
6145
|
return rc;
|
6099
6146
|
|
6100
|
-
|
6147
|
+
mp = mc->mc_pg[mc->mc_top];
|
6148
|
+
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
6101
6149
|
|
6102
|
-
if (!IS_LEAF2(
|
6150
|
+
if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
6103
6151
|
if (!(flags & MDB_NODUPDATA)) {
|
6104
6152
|
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
6105
6153
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
@@ -6114,13 +6162,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
|
6114
6162
|
} else {
|
6115
6163
|
MDB_cursor *m2;
|
6116
6164
|
/* shrink fake page */
|
6117
|
-
mdb_node_shrink(
|
6118
|
-
leaf = NODEPTR(
|
6165
|
+
mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
|
6166
|
+
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
6119
6167
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6120
6168
|
/* fix other sub-DB cursors pointed at this fake page */
|
6121
6169
|
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
|
6122
6170
|
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
|
6123
|
-
if (m2->mc_pg[mc->mc_top] ==
|
6171
|
+
if (m2->mc_pg[mc->mc_top] == mp &&
|
6124
6172
|
m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
|
6125
6173
|
m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6126
6174
|
}
|
@@ -6252,6 +6300,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6252
6300
|
{
|
6253
6301
|
unsigned int i;
|
6254
6302
|
size_t node_size = NODESIZE;
|
6303
|
+
ssize_t room;
|
6255
6304
|
indx_t ofs;
|
6256
6305
|
MDB_node *node;
|
6257
6306
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
@@ -6264,7 +6313,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6264
6313
|
IS_LEAF(mp) ? "leaf" : "branch",
|
6265
6314
|
IS_SUBP(mp) ? "sub-" : "",
|
6266
6315
|
mp->mp_pgno, indx, data ? data->mv_size : 0,
|
6267
|
-
key ? key->mv_size : 0, key ? DKEY(key) :
|
6316
|
+
key ? key->mv_size : 0, key ? DKEY(key) : "null"));
|
6268
6317
|
|
6269
6318
|
if (IS_LEAF2(mp)) {
|
6270
6319
|
/* Move higher keys up one slot. */
|
@@ -6282,9 +6331,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6282
6331
|
return MDB_SUCCESS;
|
6283
6332
|
}
|
6284
6333
|
|
6334
|
+
room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
|
6285
6335
|
if (key != NULL)
|
6286
6336
|
node_size += key->mv_size;
|
6287
|
-
|
6288
6337
|
if (IS_LEAF(mp)) {
|
6289
6338
|
assert(data);
|
6290
6339
|
if (F_ISSET(flags, F_BIGDATA)) {
|
@@ -6296,26 +6345,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6296
6345
|
/* Put data on overflow page. */
|
6297
6346
|
DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
|
6298
6347
|
data->mv_size, node_size+data->mv_size));
|
6299
|
-
node_size += sizeof(pgno_t);
|
6348
|
+
node_size += sizeof(pgno_t) + (node_size & 1);
|
6349
|
+
if ((ssize_t)node_size > room)
|
6350
|
+
goto full;
|
6300
6351
|
if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
|
6301
6352
|
return rc;
|
6302
6353
|
DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
|
6303
6354
|
flags |= F_BIGDATA;
|
6355
|
+
goto update;
|
6304
6356
|
} else {
|
6305
6357
|
node_size += data->mv_size;
|
6306
6358
|
}
|
6307
6359
|
}
|
6308
6360
|
node_size += node_size & 1;
|
6361
|
+
if ((ssize_t)node_size > room)
|
6362
|
+
goto full;
|
6309
6363
|
|
6310
|
-
|
6311
|
-
DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
|
6312
|
-
mp->mp_pgno, NUMKEYS(mp)));
|
6313
|
-
DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
|
6314
|
-
mp->mp_upper - mp->mp_lower));
|
6315
|
-
DPRINTF(("node size = %"Z"u", node_size));
|
6316
|
-
return MDB_PAGE_FULL;
|
6317
|
-
}
|
6318
|
-
|
6364
|
+
update:
|
6319
6365
|
/* Move higher pointers up one slot. */
|
6320
6366
|
for (i = NUMKEYS(mp); i > indx; i--)
|
6321
6367
|
mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
|
@@ -6361,6 +6407,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6361
6407
|
}
|
6362
6408
|
|
6363
6409
|
return MDB_SUCCESS;
|
6410
|
+
|
6411
|
+
full:
|
6412
|
+
DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
|
6413
|
+
mp->mp_pgno, NUMKEYS(mp)));
|
6414
|
+
DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
|
6415
|
+
DPRINTF(("node size = %"Z"u", node_size));
|
6416
|
+
return MDB_PAGE_FULL;
|
6364
6417
|
}
|
6365
6418
|
|
6366
6419
|
/** Delete the specified node from a page.
|
@@ -6495,11 +6548,13 @@ mdb_xcursor_init0(MDB_cursor *mc)
|
|
6495
6548
|
mx->mx_cursor.mc_txn = mc->mc_txn;
|
6496
6549
|
mx->mx_cursor.mc_db = &mx->mx_db;
|
6497
6550
|
mx->mx_cursor.mc_dbx = &mx->mx_dbx;
|
6498
|
-
mx->mx_cursor.mc_dbi = mc->mc_dbi
|
6551
|
+
mx->mx_cursor.mc_dbi = mc->mc_dbi;
|
6499
6552
|
mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
|
6500
6553
|
mx->mx_cursor.mc_snum = 0;
|
6501
6554
|
mx->mx_cursor.mc_top = 0;
|
6502
6555
|
mx->mx_cursor.mc_flags = C_SUB;
|
6556
|
+
mx->mx_dbx.md_name.mv_size = 0;
|
6557
|
+
mx->mx_dbx.md_name.mv_data = NULL;
|
6503
6558
|
mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
|
6504
6559
|
mx->mx_dbx.md_dcmp = NULL;
|
6505
6560
|
mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
|
@@ -6520,6 +6575,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6520
6575
|
memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
|
6521
6576
|
mx->mx_cursor.mc_pg[0] = 0;
|
6522
6577
|
mx->mx_cursor.mc_snum = 0;
|
6578
|
+
mx->mx_cursor.mc_top = 0;
|
6523
6579
|
mx->mx_cursor.mc_flags = C_SUB;
|
6524
6580
|
} else {
|
6525
6581
|
MDB_page *fp = NODEDATA(node);
|
@@ -6532,8 +6588,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6532
6588
|
mx->mx_db.md_entries = NUMKEYS(fp);
|
6533
6589
|
COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
|
6534
6590
|
mx->mx_cursor.mc_snum = 1;
|
6535
|
-
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
|
6536
6591
|
mx->mx_cursor.mc_top = 0;
|
6592
|
+
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
|
6537
6593
|
mx->mx_cursor.mc_pg[0] = fp;
|
6538
6594
|
mx->mx_cursor.mc_ki[0] = 0;
|
6539
6595
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
@@ -6543,12 +6599,9 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6543
6599
|
mx->mx_db.md_flags |= MDB_INTEGERKEY;
|
6544
6600
|
}
|
6545
6601
|
}
|
6546
|
-
DPRINTF(("Sub-db
|
6602
|
+
DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
|
6547
6603
|
mx->mx_db.md_root));
|
6548
|
-
mx->mx_dbflag = DB_VALID
|
6549
|
-
DB_DIRTY : 0);
|
6550
|
-
mx->mx_dbx.md_name.mv_data = NODEKEY(node);
|
6551
|
-
mx->mx_dbx.md_name.mv_size = node->mn_ksize;
|
6604
|
+
mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
|
6552
6605
|
#if UINT_MAX < SIZE_MAX
|
6553
6606
|
if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
|
6554
6607
|
#ifdef MISALIGNED_OK
|
@@ -6793,7 +6846,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6793
6846
|
flags = 0;
|
6794
6847
|
} else {
|
6795
6848
|
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
|
6796
|
-
assert(!((
|
6849
|
+
assert(!((size_t)srcnode&1));
|
6797
6850
|
srcpg = NODEPGNO(srcnode);
|
6798
6851
|
flags = srcnode->mn_flags;
|
6799
6852
|
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
@@ -6864,9 +6917,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6864
6917
|
MDB_dbi dbi = csrc->mc_dbi;
|
6865
6918
|
MDB_page *mp = csrc->mc_pg[csrc->mc_top];
|
6866
6919
|
|
6867
|
-
if (csrc->mc_flags & C_SUB)
|
6868
|
-
dbi--;
|
6869
|
-
|
6870
6920
|
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
6871
6921
|
if (csrc->mc_flags & C_SUB)
|
6872
6922
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7041,9 +7091,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
7041
7091
|
MDB_dbi dbi = csrc->mc_dbi;
|
7042
7092
|
MDB_page *mp = cdst->mc_pg[cdst->mc_top];
|
7043
7093
|
|
7044
|
-
if (csrc->mc_flags & C_SUB)
|
7045
|
-
dbi--;
|
7046
|
-
|
7047
7094
|
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7048
7095
|
if (csrc->mc_flags & C_SUB)
|
7049
7096
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7138,13 +7185,11 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7138
7185
|
/* Adjust cursors pointing to mp */
|
7139
7186
|
mc->mc_snum = 0;
|
7140
7187
|
mc->mc_top = 0;
|
7188
|
+
mc->mc_flags &= ~C_INITIALIZED;
|
7141
7189
|
{
|
7142
7190
|
MDB_cursor *m2, *m3;
|
7143
7191
|
MDB_dbi dbi = mc->mc_dbi;
|
7144
7192
|
|
7145
|
-
if (mc->mc_flags & C_SUB)
|
7146
|
-
dbi--;
|
7147
|
-
|
7148
7193
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7149
7194
|
if (mc->mc_flags & C_SUB)
|
7150
7195
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7154,6 +7199,7 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7154
7199
|
if (m3->mc_pg[0] == mp) {
|
7155
7200
|
m3->mc_snum = 0;
|
7156
7201
|
m3->mc_top = 0;
|
7202
|
+
m3->mc_flags &= ~C_INITIALIZED;
|
7157
7203
|
}
|
7158
7204
|
}
|
7159
7205
|
}
|
@@ -7174,9 +7220,6 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7174
7220
|
MDB_cursor *m2, *m3;
|
7175
7221
|
MDB_dbi dbi = mc->mc_dbi;
|
7176
7222
|
|
7177
|
-
if (mc->mc_flags & C_SUB)
|
7178
|
-
dbi--;
|
7179
|
-
|
7180
7223
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7181
7224
|
if (mc->mc_flags & C_SUB)
|
7182
7225
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7184,10 +7227,13 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7184
7227
|
m3 = m2;
|
7185
7228
|
if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
|
7186
7229
|
if (m3->mc_pg[0] == mp) {
|
7187
|
-
|
7188
|
-
m3->mc_snum
|
7189
|
-
m3->mc_top
|
7190
|
-
|
7230
|
+
int i;
|
7231
|
+
m3->mc_snum--;
|
7232
|
+
m3->mc_top--;
|
7233
|
+
for (i=0; i<m3->mc_snum; i++) {
|
7234
|
+
m3->mc_pg[i] = m3->mc_pg[i+1];
|
7235
|
+
m3->mc_ki[i] = m3->mc_ki[i+1];
|
7236
|
+
}
|
7191
7237
|
}
|
7192
7238
|
}
|
7193
7239
|
}
|
@@ -7300,7 +7346,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
|
|
7300
7346
|
|
7301
7347
|
/* Adjust other cursors pointing to mp */
|
7302
7348
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7303
|
-
if (m2 == mc)
|
7349
|
+
if (m2 == mc || m2->mc_snum < mc->mc_snum)
|
7304
7350
|
continue;
|
7305
7351
|
if (!(m2->mc_flags & C_INITIALIZED))
|
7306
7352
|
continue;
|
@@ -7341,7 +7387,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi,
|
|
7341
7387
|
if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
7342
7388
|
return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
|
7343
7389
|
|
7344
|
-
if (key->mv_size
|
7390
|
+
if (key->mv_size > MDB_MAXKEYSIZE) {
|
7345
7391
|
return MDB_BAD_VALSIZE;
|
7346
7392
|
}
|
7347
7393
|
|
@@ -7394,24 +7440,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
|
|
7394
7440
|
unsigned int nflags)
|
7395
7441
|
{
|
7396
7442
|
unsigned int flags;
|
7397
|
-
int rc = MDB_SUCCESS,
|
7443
|
+
int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
|
7398
7444
|
indx_t newindx;
|
7399
7445
|
pgno_t pgno = 0;
|
7400
|
-
|
7446
|
+
int i, j, split_indx, nkeys, pmax;
|
7447
|
+
MDB_env *env = mc->mc_txn->mt_env;
|
7401
7448
|
MDB_node *node;
|
7402
7449
|
MDB_val sepkey, rkey, xdata, *rdata = &xdata;
|
7403
|
-
MDB_page *copy;
|
7450
|
+
MDB_page *copy = NULL;
|
7404
7451
|
MDB_page *mp, *rp, *pp;
|
7405
|
-
|
7452
|
+
int ptop;
|
7406
7453
|
MDB_cursor mn;
|
7407
7454
|
DKBUF;
|
7408
7455
|
|
7409
7456
|
mp = mc->mc_pg[mc->mc_top];
|
7410
7457
|
newindx = mc->mc_ki[mc->mc_top];
|
7458
|
+
nkeys = NUMKEYS(mp);
|
7411
7459
|
|
7412
|
-
DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i",
|
7460
|
+
DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
|
7413
7461
|
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
|
7414
|
-
DKEY(newkey), mc->mc_ki[mc->mc_top]));
|
7462
|
+
DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
|
7415
7463
|
|
7416
7464
|
/* Create a right sibling. */
|
7417
7465
|
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
|
@@ -7458,141 +7506,139 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
|
|
7458
7506
|
sepkey = *newkey;
|
7459
7507
|
split_indx = newindx;
|
7460
7508
|
nkeys = 0;
|
7461
|
-
|
7462
|
-
}
|
7509
|
+
} else {
|
7463
7510
|
|
7464
|
-
|
7465
|
-
|
7466
|
-
|
7467
|
-
|
7468
|
-
|
7469
|
-
|
7470
|
-
|
7471
|
-
|
7472
|
-
|
7473
|
-
|
7474
|
-
|
7475
|
-
|
7476
|
-
|
7477
|
-
|
7478
|
-
|
7479
|
-
|
7480
|
-
|
7481
|
-
|
7482
|
-
|
7483
|
-
|
7484
|
-
|
7485
|
-
|
7486
|
-
|
7487
|
-
|
7488
|
-
|
7489
|
-
|
7490
|
-
|
7491
|
-
|
7492
|
-
|
7493
|
-
|
7494
|
-
|
7495
|
-
|
7496
|
-
|
7497
|
-
|
7511
|
+
split_indx = (nkeys+1) / 2;
|
7512
|
+
|
7513
|
+
if (IS_LEAF2(rp)) {
|
7514
|
+
char *split, *ins;
|
7515
|
+
int x;
|
7516
|
+
unsigned int lsize, rsize, ksize;
|
7517
|
+
/* Move half of the keys to the right sibling */
|
7518
|
+
copy = NULL;
|
7519
|
+
x = mc->mc_ki[mc->mc_top] - split_indx;
|
7520
|
+
ksize = mc->mc_db->md_pad;
|
7521
|
+
split = LEAF2KEY(mp, split_indx, ksize);
|
7522
|
+
rsize = (nkeys - split_indx) * ksize;
|
7523
|
+
lsize = (nkeys - split_indx) * sizeof(indx_t);
|
7524
|
+
mp->mp_lower -= lsize;
|
7525
|
+
rp->mp_lower += lsize;
|
7526
|
+
mp->mp_upper += rsize - lsize;
|
7527
|
+
rp->mp_upper -= rsize - lsize;
|
7528
|
+
sepkey.mv_size = ksize;
|
7529
|
+
if (newindx == split_indx) {
|
7530
|
+
sepkey.mv_data = newkey->mv_data;
|
7531
|
+
} else {
|
7532
|
+
sepkey.mv_data = split;
|
7533
|
+
}
|
7534
|
+
if (x<0) {
|
7535
|
+
ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
|
7536
|
+
memcpy(rp->mp_ptrs, split, rsize);
|
7537
|
+
sepkey.mv_data = rp->mp_ptrs;
|
7538
|
+
memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
|
7539
|
+
memcpy(ins, newkey->mv_data, ksize);
|
7540
|
+
mp->mp_lower += sizeof(indx_t);
|
7541
|
+
mp->mp_upper -= ksize - sizeof(indx_t);
|
7542
|
+
} else {
|
7543
|
+
if (x)
|
7544
|
+
memcpy(rp->mp_ptrs, split, x * ksize);
|
7545
|
+
ins = LEAF2KEY(rp, x, ksize);
|
7546
|
+
memcpy(ins, newkey->mv_data, ksize);
|
7547
|
+
memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
|
7548
|
+
rp->mp_lower += sizeof(indx_t);
|
7549
|
+
rp->mp_upper -= ksize - sizeof(indx_t);
|
7550
|
+
mc->mc_ki[mc->mc_top] = x;
|
7551
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7552
|
+
}
|
7498
7553
|
} else {
|
7499
|
-
|
7500
|
-
|
7501
|
-
|
7502
|
-
|
7503
|
-
|
7504
|
-
|
7505
|
-
|
7506
|
-
|
7507
|
-
mc->mc_pg[mc->mc_top] = rp;
|
7508
|
-
}
|
7509
|
-
goto newsep;
|
7510
|
-
}
|
7554
|
+
int psize, nsize, k;
|
7555
|
+
/* Maximum free space in an empty page */
|
7556
|
+
pmax = env->me_psize - PAGEHDRSZ;
|
7557
|
+
if (IS_LEAF(mp))
|
7558
|
+
nsize = mdb_leaf_size(env, newkey, newdata);
|
7559
|
+
else
|
7560
|
+
nsize = mdb_branch_size(env, newkey);
|
7561
|
+
nsize += nsize & 1;
|
7511
7562
|
|
7512
|
-
|
7513
|
-
|
7514
|
-
|
7515
|
-
|
7516
|
-
|
7517
|
-
|
7518
|
-
|
7519
|
-
|
7520
|
-
|
7521
|
-
|
7522
|
-
|
7523
|
-
|
7524
|
-
|
7525
|
-
|
7526
|
-
|
7527
|
-
|
7528
|
-
|
7529
|
-
|
7530
|
-
|
7531
|
-
|
7532
|
-
|
7533
|
-
|
7534
|
-
|
7535
|
-
|
7536
|
-
|
7537
|
-
|
7538
|
-
|
7539
|
-
|
7540
|
-
|
7541
|
-
|
7542
|
-
|
7543
|
-
|
7544
|
-
|
7545
|
-
|
7546
|
-
|
7563
|
+
/* grab a page to hold a temporary copy */
|
7564
|
+
copy = mdb_page_malloc(mc->mc_txn, 1);
|
7565
|
+
if (copy == NULL)
|
7566
|
+
return ENOMEM;
|
7567
|
+
copy->mp_pgno = mp->mp_pgno;
|
7568
|
+
copy->mp_flags = mp->mp_flags;
|
7569
|
+
copy->mp_lower = PAGEHDRSZ;
|
7570
|
+
copy->mp_upper = env->me_psize;
|
7571
|
+
|
7572
|
+
/* prepare to insert */
|
7573
|
+
for (i=0, j=0; i<nkeys; i++) {
|
7574
|
+
if (i == newindx) {
|
7575
|
+
copy->mp_ptrs[j++] = 0;
|
7576
|
+
}
|
7577
|
+
copy->mp_ptrs[j++] = mp->mp_ptrs[i];
|
7578
|
+
}
|
7579
|
+
|
7580
|
+
/* When items are relatively large the split point needs
|
7581
|
+
* to be checked, because being off-by-one will make the
|
7582
|
+
* difference between success or failure in mdb_node_add.
|
7583
|
+
*
|
7584
|
+
* It's also relevant if a page happens to be laid out
|
7585
|
+
* such that one half of its nodes are all "small" and
|
7586
|
+
* the other half of its nodes are "large." If the new
|
7587
|
+
* item is also "large" and falls on the half with
|
7588
|
+
* "large" nodes, it also may not fit.
|
7589
|
+
*
|
7590
|
+
* As a final tweak, if the new item goes on the last
|
7591
|
+
* spot on the page (and thus, onto the new page), bias
|
7592
|
+
* the split so the new page is emptier than the old page.
|
7593
|
+
* This yields better packing during sequential inserts.
|
7594
|
+
*/
|
7595
|
+
if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
|
7596
|
+
/* Find split point */
|
7597
|
+
psize = 0;
|
7598
|
+
if (newindx <= split_indx || newindx >= nkeys) {
|
7599
|
+
i = 0; j = 1;
|
7600
|
+
k = newindx >= nkeys ? nkeys : split_indx+2;
|
7601
|
+
} else {
|
7602
|
+
i = nkeys; j = -1;
|
7603
|
+
k = split_indx-1;
|
7604
|
+
}
|
7605
|
+
for (; i!=k; i+=j) {
|
7606
|
+
if (i == newindx) {
|
7607
|
+
psize += nsize;
|
7608
|
+
node = NULL;
|
7609
|
+
} else {
|
7610
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
|
7611
|
+
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
7612
|
+
if (IS_LEAF(mp)) {
|
7613
|
+
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
7614
|
+
psize += sizeof(pgno_t);
|
7615
|
+
else
|
7616
|
+
psize += NODEDSZ(node);
|
7547
7617
|
}
|
7548
|
-
|
7549
|
-
split_indx = i;
|
7550
|
-
break;
|
7618
|
+
psize += psize & 1;
|
7551
7619
|
}
|
7552
|
-
|
7553
|
-
|
7554
|
-
psize = nsize;
|
7555
|
-
for (i=nkeys-1; i>=split_indx; i--) {
|
7556
|
-
node = NODEPTR(mp, i);
|
7557
|
-
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
7558
|
-
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
7559
|
-
psize += sizeof(pgno_t);
|
7560
|
-
else
|
7561
|
-
psize += NODEDSZ(node);
|
7562
|
-
psize += psize & 1;
|
7563
|
-
if (psize > pmax) {
|
7564
|
-
if (i >= newindx) {
|
7565
|
-
split_indx = newindx;
|
7566
|
-
newpos = 0;
|
7567
|
-
} else
|
7568
|
-
split_indx = i+1;
|
7620
|
+
if (psize > pmax || i == k-j) {
|
7621
|
+
split_indx = i + (j<0);
|
7569
7622
|
break;
|
7570
7623
|
}
|
7571
7624
|
}
|
7572
7625
|
}
|
7626
|
+
if (split_indx == newindx) {
|
7627
|
+
sepkey.mv_size = newkey->mv_size;
|
7628
|
+
sepkey.mv_data = newkey->mv_data;
|
7629
|
+
} else {
|
7630
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
|
7631
|
+
sepkey.mv_size = node->mn_ksize;
|
7632
|
+
sepkey.mv_data = NODEKEY(node);
|
7633
|
+
}
|
7573
7634
|
}
|
7574
7635
|
}
|
7575
7636
|
|
7576
|
-
|
7577
|
-
* The case where newindx == split_indx is ambiguous; the
|
7578
|
-
* new item could go to the new page or stay on the original
|
7579
|
-
* page. If newpos == 1 it goes to the new page.
|
7580
|
-
*/
|
7581
|
-
if (newindx == split_indx && newpos) {
|
7582
|
-
sepkey.mv_size = newkey->mv_size;
|
7583
|
-
sepkey.mv_data = newkey->mv_data;
|
7584
|
-
} else {
|
7585
|
-
node = NODEPTR(mp, split_indx);
|
7586
|
-
sepkey.mv_size = node->mn_ksize;
|
7587
|
-
sepkey.mv_data = NODEKEY(node);
|
7588
|
-
}
|
7589
|
-
|
7590
|
-
newsep:
|
7591
|
-
DPRINTF(("separator is [%s]", DKEY(&sepkey)));
|
7637
|
+
DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
|
7592
7638
|
|
7593
7639
|
/* Copy separator key to the parent.
|
7594
7640
|
*/
|
7595
|
-
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(
|
7641
|
+
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
|
7596
7642
|
mn.mc_snum--;
|
7597
7643
|
mn.mc_top--;
|
7598
7644
|
did_split = 1;
|
@@ -7637,117 +7683,97 @@ newsep:
|
|
7637
7683
|
return rc;
|
7638
7684
|
for (i=0; i<mc->mc_top; i++)
|
7639
7685
|
mc->mc_ki[i] = mn.mc_ki[i];
|
7640
|
-
|
7641
|
-
|
7642
|
-
|
7643
|
-
|
7644
|
-
|
7645
|
-
|
7646
|
-
|
7686
|
+
} else if (!IS_LEAF2(mp)) {
|
7687
|
+
/* Move nodes */
|
7688
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7689
|
+
i = split_indx;
|
7690
|
+
j = 0;
|
7691
|
+
do {
|
7692
|
+
if (i == newindx) {
|
7693
|
+
rkey.mv_data = newkey->mv_data;
|
7694
|
+
rkey.mv_size = newkey->mv_size;
|
7695
|
+
if (IS_LEAF(mp)) {
|
7696
|
+
rdata = newdata;
|
7697
|
+
} else
|
7698
|
+
pgno = newpgno;
|
7699
|
+
flags = nflags;
|
7700
|
+
/* Update index for the new key. */
|
7701
|
+
mc->mc_ki[mc->mc_top] = j;
|
7702
|
+
} else {
|
7703
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
|
7704
|
+
rkey.mv_data = NODEKEY(node);
|
7705
|
+
rkey.mv_size = node->mn_ksize;
|
7706
|
+
if (IS_LEAF(mp)) {
|
7707
|
+
xdata.mv_data = NODEDATA(node);
|
7708
|
+
xdata.mv_size = NODEDSZ(node);
|
7709
|
+
rdata = &xdata;
|
7710
|
+
} else
|
7711
|
+
pgno = NODEPGNO(node);
|
7712
|
+
flags = node->mn_flags;
|
7713
|
+
}
|
7647
7714
|
|
7648
|
-
|
7649
|
-
|
7650
|
-
|
7651
|
-
|
7715
|
+
if (!IS_LEAF(mp) && j == 0) {
|
7716
|
+
/* First branch index doesn't need key data. */
|
7717
|
+
rkey.mv_size = 0;
|
7718
|
+
}
|
7652
7719
|
|
7653
|
-
|
7654
|
-
|
7655
|
-
|
7656
|
-
|
7657
|
-
|
7658
|
-
|
7659
|
-
|
7660
|
-
|
7661
|
-
/* Reset insert index for right sibling. */
|
7662
|
-
if (i != newindx || (newpos ^ ins_new)) {
|
7720
|
+
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
|
7721
|
+
if (rc) {
|
7722
|
+
/* return tmp page to freelist */
|
7723
|
+
mdb_page_free(env, copy);
|
7724
|
+
return rc;
|
7725
|
+
}
|
7726
|
+
if (i == nkeys) {
|
7727
|
+
i = 0;
|
7663
7728
|
j = 0;
|
7664
|
-
mc->mc_pg[mc->mc_top] =
|
7729
|
+
mc->mc_pg[mc->mc_top] = copy;
|
7730
|
+
} else {
|
7731
|
+
i++;
|
7732
|
+
j++;
|
7733
|
+
}
|
7734
|
+
} while (i != split_indx);
|
7735
|
+
|
7736
|
+
nkeys = NUMKEYS(copy);
|
7737
|
+
for (i=0; i<nkeys; i++)
|
7738
|
+
mp->mp_ptrs[i] = copy->mp_ptrs[i];
|
7739
|
+
mp->mp_lower = copy->mp_lower;
|
7740
|
+
mp->mp_upper = copy->mp_upper;
|
7741
|
+
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
|
7742
|
+
env->me_psize - copy->mp_upper);
|
7743
|
+
|
7744
|
+
/* reset back to original page */
|
7745
|
+
if (newindx < split_indx) {
|
7746
|
+
mc->mc_pg[mc->mc_top] = mp;
|
7747
|
+
if (nflags & MDB_RESERVE) {
|
7748
|
+
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
7749
|
+
if (!(node->mn_flags & F_BIGDATA))
|
7750
|
+
newdata->mv_data = NODEDATA(node);
|
7665
7751
|
}
|
7666
|
-
}
|
7667
|
-
|
7668
|
-
if (i == newindx && !ins_new) {
|
7669
|
-
/* Insert the original entry that caused the split. */
|
7670
|
-
rkey.mv_data = newkey->mv_data;
|
7671
|
-
rkey.mv_size = newkey->mv_size;
|
7672
|
-
if (IS_LEAF(mp)) {
|
7673
|
-
rdata = newdata;
|
7674
|
-
} else
|
7675
|
-
pgno = newpgno;
|
7676
|
-
flags = nflags;
|
7677
|
-
|
7678
|
-
ins_new = 1;
|
7679
|
-
|
7680
|
-
/* Update index for the new key. */
|
7681
|
-
mc->mc_ki[mc->mc_top] = j;
|
7682
|
-
} else if (i == nkeys) {
|
7683
|
-
break;
|
7684
7752
|
} else {
|
7685
|
-
|
7686
|
-
|
7687
|
-
|
7688
|
-
|
7689
|
-
|
7690
|
-
|
7691
|
-
|
7692
|
-
|
7693
|
-
|
7694
|
-
|
7695
|
-
|
7696
|
-
|
7697
|
-
}
|
7698
|
-
|
7699
|
-
if (!IS_LEAF(mp) && j == 0) {
|
7700
|
-
/* First branch index doesn't need key data. */
|
7701
|
-
rkey.mv_size = 0;
|
7702
|
-
}
|
7703
|
-
|
7704
|
-
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
|
7705
|
-
if (rc) break;
|
7706
|
-
}
|
7707
|
-
|
7708
|
-
nkeys = NUMKEYS(copy);
|
7709
|
-
for (i=0; i<nkeys; i++)
|
7710
|
-
mp->mp_ptrs[i] = copy->mp_ptrs[i];
|
7711
|
-
mp->mp_lower = copy->mp_lower;
|
7712
|
-
mp->mp_upper = copy->mp_upper;
|
7713
|
-
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
|
7714
|
-
mc->mc_txn->mt_env->me_psize - copy->mp_upper);
|
7715
|
-
|
7716
|
-
/* reset back to original page */
|
7717
|
-
if (newindx < split_indx || (!newpos && newindx == split_indx)) {
|
7718
|
-
mc->mc_pg[mc->mc_top] = mp;
|
7719
|
-
if (nflags & MDB_RESERVE) {
|
7720
|
-
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
7721
|
-
if (!(node->mn_flags & F_BIGDATA))
|
7722
|
-
newdata->mv_data = NODEDATA(node);
|
7723
|
-
}
|
7724
|
-
} else {
|
7725
|
-
mc->mc_ki[ptop]++;
|
7726
|
-
/* Make sure mc_ki is still valid.
|
7727
|
-
*/
|
7728
|
-
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
|
7729
|
-
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
|
7730
|
-
for (i=0; i<ptop; i++) {
|
7731
|
-
mc->mc_pg[i] = mn.mc_pg[i];
|
7732
|
-
mc->mc_ki[i] = mn.mc_ki[i];
|
7753
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7754
|
+
mc->mc_ki[ptop]++;
|
7755
|
+
/* Make sure mc_ki is still valid.
|
7756
|
+
*/
|
7757
|
+
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
|
7758
|
+
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
|
7759
|
+
for (i=0; i<ptop; i++) {
|
7760
|
+
mc->mc_pg[i] = mn.mc_pg[i];
|
7761
|
+
mc->mc_ki[i] = mn.mc_ki[i];
|
7762
|
+
}
|
7763
|
+
mc->mc_pg[ptop] = mn.mc_pg[ptop];
|
7764
|
+
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
|
7733
7765
|
}
|
7734
|
-
mc->mc_pg[ptop] = mn.mc_pg[ptop];
|
7735
|
-
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
|
7736
7766
|
}
|
7767
|
+
/* return tmp page to freelist */
|
7768
|
+
mdb_page_free(env, copy);
|
7737
7769
|
}
|
7738
7770
|
|
7739
|
-
/* return tmp page to freelist */
|
7740
|
-
mdb_page_free(mc->mc_txn->mt_env, copy);
|
7741
|
-
done:
|
7742
7771
|
{
|
7743
7772
|
/* Adjust other cursors pointing to mp */
|
7744
7773
|
MDB_cursor *m2, *m3;
|
7745
7774
|
MDB_dbi dbi = mc->mc_dbi;
|
7746
7775
|
int fixup = NUMKEYS(mp);
|
7747
7776
|
|
7748
|
-
if (mc->mc_flags & C_SUB)
|
7749
|
-
dbi--;
|
7750
|
-
|
7751
7777
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7752
7778
|
if (mc->mc_flags & C_SUB)
|
7753
7779
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7789,6 +7815,7 @@ done:
|
|
7789
7815
|
}
|
7790
7816
|
}
|
7791
7817
|
}
|
7818
|
+
DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
|
7792
7819
|
return rc;
|
7793
7820
|
}
|
7794
7821
|
|
@@ -7805,13 +7832,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
|
|
7805
7832
|
if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
7806
7833
|
return EINVAL;
|
7807
7834
|
|
7808
|
-
if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
7809
|
-
return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
|
7810
|
-
|
7811
|
-
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
7812
|
-
return MDB_BAD_VALSIZE;
|
7813
|
-
}
|
7814
|
-
|
7815
7835
|
if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
|
7816
7836
|
return EINVAL;
|
7817
7837
|
|
@@ -7851,6 +7871,16 @@ mdb_env_get_path(MDB_env *env, const char **arg)
|
|
7851
7871
|
return MDB_SUCCESS;
|
7852
7872
|
}
|
7853
7873
|
|
7874
|
+
int
|
7875
|
+
mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
|
7876
|
+
{
|
7877
|
+
if (!env || !arg)
|
7878
|
+
return EINVAL;
|
7879
|
+
|
7880
|
+
*arg = env->me_fd;
|
7881
|
+
return MDB_SUCCESS;
|
7882
|
+
}
|
7883
|
+
|
7854
7884
|
/** Common code for #mdb_stat() and #mdb_env_stat().
|
7855
7885
|
* @param[in] env the environment to operate in.
|
7856
7886
|
* @param[in] db the #MDB_db record containing the stats to return.
|
@@ -8075,7 +8105,7 @@ mdb_drop0(MDB_cursor *mc, int subs)
|
|
8075
8105
|
{
|
8076
8106
|
int rc;
|
8077
8107
|
|
8078
|
-
rc = mdb_page_search(mc, NULL,
|
8108
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
|
8079
8109
|
if (rc == MDB_SUCCESS) {
|
8080
8110
|
MDB_txn *txn = mc->mc_txn;
|
8081
8111
|
MDB_node *ni;
|
@@ -8273,10 +8303,10 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
|
|
8273
8303
|
return 0;
|
8274
8304
|
}
|
8275
8305
|
|
8276
|
-
|
8306
|
+
/** Insert pid into list if not already present.
|
8277
8307
|
* return -1 if already present.
|
8278
8308
|
*/
|
8279
|
-
static int mdb_pid_insert(
|
8309
|
+
static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
|
8280
8310
|
{
|
8281
8311
|
/* binary search of pid in list */
|
8282
8312
|
unsigned base = 0;
|
@@ -8301,7 +8331,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid)
|
|
8301
8331
|
return -1;
|
8302
8332
|
}
|
8303
8333
|
}
|
8304
|
-
|
8334
|
+
|
8305
8335
|
if( val > 0 ) {
|
8306
8336
|
++cursor;
|
8307
8337
|
}
|
@@ -8316,7 +8346,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8316
8346
|
{
|
8317
8347
|
unsigned int i, j, rdrs;
|
8318
8348
|
MDB_reader *mr;
|
8319
|
-
|
8349
|
+
MDB_PID_T *pids, pid;
|
8320
8350
|
int count = 0;
|
8321
8351
|
|
8322
8352
|
if (!env)
|
@@ -8326,7 +8356,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8326
8356
|
if (!env->me_txns)
|
8327
8357
|
return MDB_SUCCESS;
|
8328
8358
|
rdrs = env->me_txns->mti_numreaders;
|
8329
|
-
pids = malloc((rdrs+1) * sizeof(
|
8359
|
+
pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
|
8330
8360
|
if (!pids)
|
8331
8361
|
return ENOMEM;
|
8332
8362
|
pids[0] = 0;
|
@@ -8342,6 +8372,8 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8342
8372
|
if (!mdb_reader_pid(env, Pidcheck, pid)) {
|
8343
8373
|
for (j=i; j<rdrs; j++)
|
8344
8374
|
if (mr[j].mr_pid == pid) {
|
8375
|
+
DPRINTF(("clear stale reader pid %u txn %"Z"d",
|
8376
|
+
(unsigned) pid, mr[j].mr_txnid));
|
8345
8377
|
mr[j].mr_pid = 0;
|
8346
8378
|
count++;
|
8347
8379
|
}
|