lmdb 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGES +4 -0
- data/README.md +4 -1
- data/ext/lmdb_ext/liblmdb/CHANGES +26 -1
- data/ext/lmdb_ext/liblmdb/lmdb.h +80 -10
- data/ext/lmdb_ext/liblmdb/mdb.c +648 -616
- data/ext/lmdb_ext/liblmdb/midl.c +8 -8
- data/ext/lmdb_ext/liblmdb/midl.h +1 -1
- data/ext/lmdb_ext/lmdb_ext.c +28 -45
- data/ext/lmdb_ext/lmdb_ext.h +4 -7
- data/lib/lmdb/version.rb +1 -1
- data/spec/lmdb_spec.rb +7 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e16a42693e5150e076829337a0d44feb46bc3f3
|
4
|
+
data.tar.gz: ce0bc77de0cf3c7e17df2522c8eab69e760e1221
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e02924de6a59386ec215b45baf321a6d098d818900f7816a3805185ab5b7441427cc5af866d2d8141eecfd1d166a659ebc2a87d5ae7626a9a574bd4d1bd4bf44
|
7
|
+
data.tar.gz: 3ee55c5879fd385e53d25eeffd694dab0de03c5b152fe07562f9b631de25ccd90f09ae886a1d30b7f9abd6ea3a8da3fedac0b1f9d69dbffc3874bfad18570dd9
|
data/.travis.yml
CHANGED
data/CHANGES
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# LMDB
|
2
2
|
|
3
|
-
|
3
|
+
[![Gittip donate button](http://img.shields.io/gittip/bevry.png)](https://www.gittip.com/min4d/ "Donate weekly to this project using Gittip")
|
4
|
+
[![Flattr this git repo](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=min4d&url=https://github.com/minad/lmdb&title=LMDB&language=&tags=github&category=software)
|
5
|
+
|
6
|
+
Ruby bindings for OpenLDAP's amazing Lightning Memory-Mapped Database (LMDB)
|
4
7
|
http://symas.com/mdb/
|
5
8
|
|
6
9
|
### Installation
|
@@ -1,6 +1,31 @@
|
|
1
1
|
LMDB 0.9 Change Log
|
2
2
|
|
3
|
-
LMDB 0.9.
|
3
|
+
LMDB 0.9.10 Release (2013/11/12)
|
4
|
+
Add MDB_NOMEMINIT option
|
5
|
+
Fix mdb_page_split() again (ITS#7589)
|
6
|
+
Fix MDB_NORDAHEAD definition (ITS#7734)
|
7
|
+
Fix mdb_cursor_del() positioning (ITS#7733)
|
8
|
+
Partial fix for larger page sizes (ITS#7713)
|
9
|
+
Fix Windows64/MSVC build issues
|
10
|
+
|
11
|
+
LMDB 0.9.9 Release (2013/10/24)
|
12
|
+
Add mdb_env_get_fd()
|
13
|
+
Add MDB_NORDAHEAD option
|
14
|
+
Add MDB_NOLOCK option
|
15
|
+
Avoid wasting space in mdb_page_split() (ITS#7589)
|
16
|
+
Fix mdb_page_merge() cursor fixup (ITS#7722)
|
17
|
+
Fix mdb_cursor_del() on last delete (ITS#7718)
|
18
|
+
Fix adding WRITEMAP on existing env (ITS#7715)
|
19
|
+
Fixes for nested txns (ITS#7515)
|
20
|
+
Fix mdb_env_copy() O_DIRECT bug (ITS#7682)
|
21
|
+
Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681)
|
22
|
+
Fix mdb_rebalance() cursor fixup (ITS#7701)
|
23
|
+
Misc code cleanup
|
24
|
+
Documentation
|
25
|
+
Note that by default, readers need write access
|
26
|
+
|
27
|
+
|
28
|
+
LMDB 0.9.8 Release (2013/09/09)
|
4
29
|
Allow mdb_env_set_mapsize() on an open environment
|
5
30
|
Fix mdb_dbi_flags() (ITS#7672)
|
6
31
|
Fix mdb_page_unspill() in nested txns
|
data/ext/lmdb_ext/liblmdb/lmdb.h
CHANGED
@@ -66,6 +66,20 @@
|
|
66
66
|
* BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
|
67
67
|
* Multiple users can cause startup to fail later, as noted above.
|
68
68
|
*
|
69
|
+
* - There is normally no pure read-only mode, since readers need write
|
70
|
+
* access to locks and lock file. Exceptions: On read-only filesystems
|
71
|
+
* or with the #MDB_NOLOCK flag described under #mdb_env_open().
|
72
|
+
*
|
73
|
+
* - By default, in versions before 0.9.10, unused portions of the data
|
74
|
+
* file might receive garbage data from memory freed by other code.
|
75
|
+
* (This does not happen when using the #MDB_WRITEMAP flag.) As of
|
76
|
+
* 0.9.10 the default behavior is to initialize such memory before
|
77
|
+
* writing to the data file. Since there may be a slight performance
|
78
|
+
* cost due to this initialization, applications may disable it using
|
79
|
+
* the #MDB_NOMEMINIT flag. Applications handling sensitive data
|
80
|
+
* which must not be written should not use this flag. This flag is
|
81
|
+
* irrelevant when using #MDB_WRITEMAP.
|
82
|
+
*
|
69
83
|
* - A thread can only use one transaction at a time, plus any child
|
70
84
|
* transactions. Each transaction belongs to one thread. See below.
|
71
85
|
* The #MDB_NOTLS flag changes this for read-only transactions.
|
@@ -170,7 +184,7 @@ typedef int mdb_filehandle_t;
|
|
170
184
|
/** Library minor version */
|
171
185
|
#define MDB_VERSION_MINOR 9
|
172
186
|
/** Library patch version */
|
173
|
-
#define MDB_VERSION_PATCH
|
187
|
+
#define MDB_VERSION_PATCH 10
|
174
188
|
|
175
189
|
/** Combine args a,b,c into a single integer for easy version comparisons */
|
176
190
|
#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
|
@@ -180,7 +194,7 @@ typedef int mdb_filehandle_t;
|
|
180
194
|
MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
|
181
195
|
|
182
196
|
/** The release date of this library version */
|
183
|
-
#define MDB_VERSION_DATE "
|
197
|
+
#define MDB_VERSION_DATE "November 11, 2013"
|
184
198
|
|
185
199
|
/** A stringifier for the version info */
|
186
200
|
#define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
|
@@ -216,13 +230,13 @@ typedef struct MDB_cursor MDB_cursor;
|
|
216
230
|
/** @brief Generic structure used for passing keys and data in and out
|
217
231
|
* of the database.
|
218
232
|
*
|
219
|
-
* Key sizes must be between 1 and the liblmdb build-time constant
|
220
|
-
* #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
|
221
|
-
* same applies to data sizes in databases with the #MDB_DUPSORT flag.
|
222
|
-
* Other data items can in theory be from 0 to 0xffffffff bytes long.
|
223
|
-
*
|
224
233
|
* Values returned from the database are valid only until a subsequent
|
225
|
-
* update operation, or the end of the transaction.
|
234
|
+
* update operation, or the end of the transaction. Do not modify or
|
235
|
+
* free them; they commonly point into the database itself.
|
236
|
+
*
|
237
|
+
* Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
|
238
|
+
* The same applies to data sizes in databases with the #MDB_DUPSORT flag.
|
239
|
+
* Other data items can in theory be from 0 to 0xffffffff bytes long.
|
226
240
|
*/
|
227
241
|
typedef struct MDB_val {
|
228
242
|
size_t mv_size; /**< size of the data item */
|
@@ -265,10 +279,16 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
|
|
265
279
|
#define MDB_NOMETASYNC 0x40000
|
266
280
|
/** use writable mmap */
|
267
281
|
#define MDB_WRITEMAP 0x80000
|
268
|
-
/** use asynchronous msync when MDB_WRITEMAP is used */
|
282
|
+
/** use asynchronous msync when #MDB_WRITEMAP is used */
|
269
283
|
#define MDB_MAPASYNC 0x100000
|
270
284
|
/** tie reader locktable slots to #MDB_txn objects instead of to threads */
|
271
285
|
#define MDB_NOTLS 0x200000
|
286
|
+
/** don't do any locking, caller must manage their own locks */
|
287
|
+
#define MDB_NOLOCK 0x400000
|
288
|
+
/** don't do readahead (no effect on Windows) */
|
289
|
+
#define MDB_NORDAHEAD 0x800000
|
290
|
+
/** don't initialize malloc'd memory before writing to datafile */
|
291
|
+
#define MDB_NOMEMINIT 0x1000000
|
272
292
|
/** @} */
|
273
293
|
|
274
294
|
/** @defgroup mdb_dbi_open Database Flags
|
@@ -486,6 +506,8 @@ int mdb_env_create(MDB_env **env);
|
|
486
506
|
* and uses fewer mallocs, but loses protection from application bugs
|
487
507
|
* like wild pointer writes and other bad updates into the database.
|
488
508
|
* Incompatible with nested transactions.
|
509
|
+
* Processes with and without MDB_WRITEMAP on the same environment do
|
510
|
+
* not cooperate well.
|
489
511
|
* <li>#MDB_NOMETASYNC
|
490
512
|
* Flush system buffers to disk only once per transaction, omit the
|
491
513
|
* metadata flush. Defer that until the system flushes files to disk,
|
@@ -523,6 +545,38 @@ int mdb_env_create(MDB_env **env);
|
|
523
545
|
* user threads over individual OS threads need this option. Such an
|
524
546
|
* application must also serialize the write transactions in an OS
|
525
547
|
* thread, since MDB's write locking is unaware of the user threads.
|
548
|
+
* <li>#MDB_NOLOCK
|
549
|
+
* Don't do any locking. If concurrent access is anticipated, the
|
550
|
+
* caller must manage all concurrency itself. For proper operation
|
551
|
+
* the caller must enforce single-writer semantics, and must ensure
|
552
|
+
* that no readers are using old transactions while a writer is
|
553
|
+
* active. The simplest approach is to use an exclusive lock so that
|
554
|
+
* no readers may be active at all when a writer begins.
|
555
|
+
* <li>#MDB_NORDAHEAD
|
556
|
+
* Turn off readahead. Most operating systems perform readahead on
|
557
|
+
* read requests by default. This option turns it off if the OS
|
558
|
+
* supports it. Turning it off may help random read performance
|
559
|
+
* when the DB is larger than RAM and system RAM is full.
|
560
|
+
* The option is not implemented on Windows.
|
561
|
+
* <li>#MDB_NOMEMINIT
|
562
|
+
* Don't initialize malloc'd memory before writing to unused spaces
|
563
|
+
* in the data file. By default, memory for pages written to the data
|
564
|
+
* file is obtained using malloc. While these pages may be reused in
|
565
|
+
* subsequent transactions, freshly malloc'd pages will be initialized
|
566
|
+
* to zeroes before use. This avoids persisting leftover data from other
|
567
|
+
* code (that used the heap and subsequently freed the memory) into the
|
568
|
+
* data file. Note that many other system libraries may allocate
|
569
|
+
* and free memory from the heap for arbitrary uses. E.g., stdio may
|
570
|
+
* use the heap for file I/O buffers. This initialization step has a
|
571
|
+
* modest performance cost so some applications may want to disable
|
572
|
+
* it using this flag. This option can be a problem for applications
|
573
|
+
* which handle sensitive data like passwords, and it makes memory
|
574
|
+
* checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
|
575
|
+
* which writes directly to the mmap instead of using malloc for pages. The
|
576
|
+
* initialization is also skipped if #MDB_RESERVE is used; the
|
577
|
+
* caller is expected to overwrite all of the memory that was
|
578
|
+
* reserved in that case.
|
579
|
+
* This flag may be changed at any time using #mdb_env_set_flags().
|
526
580
|
* </ul>
|
527
581
|
* @param[in] mode The UNIX permissions to set on created files. This parameter
|
528
582
|
* is ignored on Windows.
|
@@ -656,6 +710,18 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags);
|
|
656
710
|
*/
|
657
711
|
int mdb_env_get_path(MDB_env *env, const char **path);
|
658
712
|
|
713
|
+
/** @brief Return the file descriptor for the given environment.
|
714
|
+
*
|
715
|
+
* @param[in] env An environment handle returned by #mdb_env_create()
|
716
|
+
* @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
|
717
|
+
* @return A non-zero error value on failure and 0 on success. Some possible
|
718
|
+
* errors are:
|
719
|
+
* <ul>
|
720
|
+
* <li>EINVAL - an invalid parameter was specified.
|
721
|
+
* </ul>
|
722
|
+
*/
|
723
|
+
int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
|
724
|
+
|
659
725
|
/** @brief Set the size of the memory map to use for this environment.
|
660
726
|
*
|
661
727
|
* The size should be a multiple of the OS page size. The default is
|
@@ -733,8 +799,10 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
|
|
733
799
|
|
734
800
|
/** @brief Get the maximum size of a key for the environment.
|
735
801
|
*
|
802
|
+
* This is the compile-time constant #MDB_MAXKEYSIZE, default 511.
|
803
|
+
* See @ref MDB_val.
|
736
804
|
* @param[in] env An environment handle returned by #mdb_env_create()
|
737
|
-
* @return The maximum size of a key
|
805
|
+
* @return The maximum size of a key
|
738
806
|
*/
|
739
807
|
int mdb_env_get_maxkeysize(MDB_env *env);
|
740
808
|
|
@@ -1094,6 +1162,8 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
|
|
1094
1162
|
* reserved space, which the caller can fill in later - before
|
1095
1163
|
* the next update operation or the transaction ends. This saves
|
1096
1164
|
* an extra memcpy if the data is being generated later.
|
1165
|
+
* MDB does nothing else with this memory; the caller is expected
|
1166
|
+
* to modify all of the space requested.
|
1097
1167
|
* <li>#MDB_APPEND - append the given key/data pair to the end of the
|
1098
1168
|
* database. No key comparisons are performed. This option allows
|
1099
1169
|
* fast bulk loading when keys are already known to be in the
|
data/ext/lmdb_ext/liblmdb/mdb.c
CHANGED
@@ -37,10 +37,26 @@
|
|
37
37
|
#endif
|
38
38
|
#include <sys/types.h>
|
39
39
|
#include <sys/stat.h>
|
40
|
-
#include <sys/param.h>
|
41
40
|
#ifdef _WIN32
|
42
41
|
#include <windows.h>
|
42
|
+
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
|
43
|
+
* as int64 which is wrong. MSVC doesn't define it at all, so just
|
44
|
+
* don't use it.
|
45
|
+
*/
|
46
|
+
#define MDB_PID_T int
|
47
|
+
#ifdef __GNUC__
|
48
|
+
# include <sys/param.h>
|
43
49
|
#else
|
50
|
+
# define LITTLE_ENDIAN 1234
|
51
|
+
# define BIG_ENDIAN 4321
|
52
|
+
# define BYTE_ORDER LITTLE_ENDIAN
|
53
|
+
# ifndef SSIZE_MAX
|
54
|
+
# define SSIZE_MAX INT_MAX
|
55
|
+
# endif
|
56
|
+
#endif
|
57
|
+
#else
|
58
|
+
#define MDB_PID_T pid_t
|
59
|
+
#include <sys/param.h>
|
44
60
|
#include <sys/uio.h>
|
45
61
|
#include <sys/mman.h>
|
46
62
|
#ifdef HAVE_SYS_FILE_H
|
@@ -75,6 +91,7 @@
|
|
75
91
|
#ifndef _WIN32
|
76
92
|
#include <pthread.h>
|
77
93
|
#ifdef MDB_USE_POSIX_SEM
|
94
|
+
# define MDB_USE_HASH 1
|
78
95
|
#include <semaphore.h>
|
79
96
|
#endif
|
80
97
|
#endif
|
@@ -140,6 +157,7 @@
|
|
140
157
|
* @{
|
141
158
|
*/
|
142
159
|
#ifdef _WIN32
|
160
|
+
#define MDB_USE_HASH 1
|
143
161
|
#define MDB_PIDLOCK 0
|
144
162
|
#define pthread_t DWORD
|
145
163
|
#define pthread_mutex_t HANDLE
|
@@ -171,7 +189,7 @@
|
|
171
189
|
#define Z "I"
|
172
190
|
#else
|
173
191
|
|
174
|
-
#define Z "z"
|
192
|
+
#define Z "z" /**< printf format modifier for size_t */
|
175
193
|
|
176
194
|
/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
|
177
195
|
#define MDB_PIDLOCK 1
|
@@ -317,12 +335,18 @@ static txnid_t mdb_debug_start;
|
|
317
335
|
* The string is printed literally, with no format processing.
|
318
336
|
*/
|
319
337
|
#define DPUTS(arg) DPRINTF(("%s", arg))
|
338
|
+
/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
|
339
|
+
#define DDBI(mc) \
|
340
|
+
(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
|
320
341
|
/** @} */
|
321
342
|
|
322
|
-
/**
|
323
|
-
*
|
324
|
-
*
|
325
|
-
*
|
343
|
+
/** @brief The maximum size of a database page.
|
344
|
+
*
|
345
|
+
* This is 32k, since it must fit in #MDB_page.#mp_upper.
|
346
|
+
*
|
347
|
+
* LMDB will use database pages < OS pages if needed.
|
348
|
+
* That causes more I/O in write transactions: The OS must
|
349
|
+
* know (read) the whole page before writing a partial page.
|
326
350
|
*
|
327
351
|
* Note that we don't currently support Huge pages. On Linux,
|
328
352
|
* regular data files cannot use Huge pages, and in general
|
@@ -331,7 +355,7 @@ static txnid_t mdb_debug_start;
|
|
331
355
|
* pressure from other processes is high. So until OSs have
|
332
356
|
* actual paging support for Huge pages, they're not viable.
|
333
357
|
*/
|
334
|
-
#define
|
358
|
+
#define MAX_PAGESIZE 0x8000
|
335
359
|
|
336
360
|
/** The minimum number of keys required in a database page.
|
337
361
|
* Setting this to a larger value will place a smaller bound on the
|
@@ -365,7 +389,7 @@ static txnid_t mdb_debug_start;
|
|
365
389
|
*
|
366
390
|
* We require that keys all fit onto a regular page. This limit
|
367
391
|
* could be raised a bit further if needed; to something just
|
368
|
-
* under
|
392
|
+
* under (page size / #MDB_MINKEYS / 3).
|
369
393
|
*
|
370
394
|
* Note that data items in an #MDB_DUPSORT database are actually keys
|
371
395
|
* of a subDB, so they're also limited to this size.
|
@@ -425,7 +449,8 @@ typedef uint16_t indx_t;
|
|
425
449
|
*
|
426
450
|
* If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
|
427
451
|
*
|
428
|
-
* No reader table is used if the database is on a read-only filesystem
|
452
|
+
* No reader table is used if the database is on a read-only filesystem, or
|
453
|
+
* if #MDB_NOLOCK is set.
|
429
454
|
*
|
430
455
|
* Since the database uses multi-version concurrency control, readers don't
|
431
456
|
* actually need any locking. This table is used to keep track of which
|
@@ -488,7 +513,7 @@ typedef struct MDB_rxbody {
|
|
488
513
|
*/
|
489
514
|
txnid_t mrb_txnid;
|
490
515
|
/** The process ID of the process owning this reader txn. */
|
491
|
-
|
516
|
+
MDB_PID_T mrb_pid;
|
492
517
|
/** The thread ID of the thread owning this txn. */
|
493
518
|
pthread_t mrb_tid;
|
494
519
|
} MDB_rxbody;
|
@@ -600,7 +625,7 @@ typedef struct MDB_page {
|
|
600
625
|
#define P_LEAF 0x02 /**< leaf page */
|
601
626
|
#define P_OVERFLOW 0x04 /**< overflow page */
|
602
627
|
#define P_META 0x08 /**< meta page */
|
603
|
-
#define P_DIRTY 0x10 /**< dirty page */
|
628
|
+
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
|
604
629
|
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
|
605
630
|
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
|
606
631
|
#define P_KEEP 0x8000 /**< leave this page alone during spill */
|
@@ -786,7 +811,10 @@ typedef struct MDB_db {
|
|
786
811
|
/** Handle for the default DB. */
|
787
812
|
#define MAIN_DBI 1
|
788
813
|
|
789
|
-
/** Meta page content.
|
814
|
+
/** Meta page content.
|
815
|
+
* A meta page is the start point for accessing a database snapshot.
|
816
|
+
* Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
|
817
|
+
*/
|
790
818
|
typedef struct MDB_meta {
|
791
819
|
/** Stamp identifying this as an MDB file. It must be set
|
792
820
|
* to #MDB_MAGIC. */
|
@@ -804,19 +832,18 @@ typedef struct MDB_meta {
|
|
804
832
|
txnid_t mm_txnid; /**< txnid that committed this page */
|
805
833
|
} MDB_meta;
|
806
834
|
|
807
|
-
/** Buffer for a stack-allocated
|
835
|
+
/** Buffer for a stack-allocated meta page.
|
808
836
|
* The members define size and alignment, and silence type
|
809
837
|
* aliasing warnings. They are not used directly; that could
|
810
838
|
* mean incorrectly using several union members in parallel.
|
811
839
|
*/
|
812
|
-
typedef union
|
813
|
-
char mb_raw[MDB_PAGESIZE];
|
840
|
+
typedef union MDB_metabuf {
|
814
841
|
MDB_page mb_page;
|
815
842
|
struct {
|
816
843
|
char mm_pad[PAGEHDRSZ];
|
817
844
|
MDB_meta mm_meta;
|
818
845
|
} mb_metabuf;
|
819
|
-
}
|
846
|
+
} MDB_metabuf;
|
820
847
|
|
821
848
|
/** Auxiliary DB info.
|
822
849
|
* The information here is mostly static/read-only. There is
|
@@ -865,9 +892,9 @@ struct MDB_txn {
|
|
865
892
|
* @ingroup internal
|
866
893
|
* @{
|
867
894
|
*/
|
868
|
-
#define DB_DIRTY 0x01 /**< DB was
|
869
|
-
#define DB_STALE 0x02 /**< DB record is older than txnID */
|
870
|
-
#define DB_NEW 0x04 /**< DB handle opened in this txn */
|
895
|
+
#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */
|
896
|
+
#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
|
897
|
+
#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
|
871
898
|
#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
|
872
899
|
/** @} */
|
873
900
|
/** In write txns, array of cursors for each DB */
|
@@ -889,12 +916,12 @@ struct MDB_txn {
|
|
889
916
|
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
|
890
917
|
/** @} */
|
891
918
|
unsigned int mt_flags; /**< @ref mdb_txn */
|
892
|
-
/** dirty_list
|
893
|
-
|
894
|
-
|
895
|
-
*
|
919
|
+
/** dirty_list room: Array size - #dirty pages visible to this txn.
|
920
|
+
* Includes ancestor txns' dirty pages not hidden by other txns'
|
921
|
+
* dirty/spilled pages. Thus commit(nested txn) has room to merge
|
922
|
+
* dirty_list into mt_parent after freeing hidden mt_parent pages.
|
896
923
|
*/
|
897
|
-
unsigned int
|
924
|
+
unsigned int mt_dirty_room;
|
898
925
|
};
|
899
926
|
|
900
927
|
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
|
@@ -905,7 +932,14 @@ struct MDB_txn {
|
|
905
932
|
|
906
933
|
struct MDB_xcursor;
|
907
934
|
|
908
|
-
/** Cursors are used for all DB operations
|
935
|
+
/** Cursors are used for all DB operations.
|
936
|
+
* A cursor holds a path of (page pointer, key index) from the DB
|
937
|
+
* root to a position in the DB, plus other state. #MDB_DUPSORT
|
938
|
+
* cursors include an xcursor to the current data item. Write txns
|
939
|
+
* track their cursors and keep them up to date when data moves.
|
940
|
+
* Exception: An xcursor's pointer to a #P_SUBP page can be stale.
|
941
|
+
* (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
|
942
|
+
*/
|
909
943
|
struct MDB_cursor {
|
910
944
|
/** Next cursor on this DB in this txn */
|
911
945
|
MDB_cursor *mc_next;
|
@@ -978,16 +1012,18 @@ struct MDB_env {
|
|
978
1012
|
/** Have liveness lock in reader table */
|
979
1013
|
#define MDB_LIVE_READER 0x08000000U
|
980
1014
|
uint32_t me_flags; /**< @ref mdb_env */
|
981
|
-
unsigned int me_psize; /**<
|
1015
|
+
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
|
1016
|
+
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
|
982
1017
|
unsigned int me_maxreaders; /**< size of the reader table */
|
983
1018
|
unsigned int me_numreaders; /**< max numreaders set by this env */
|
984
1019
|
MDB_dbi me_numdbs; /**< number of DBs opened */
|
985
1020
|
MDB_dbi me_maxdbs; /**< size of the DB table */
|
986
|
-
|
1021
|
+
MDB_PID_T me_pid; /**< process ID of this env */
|
987
1022
|
char *me_path; /**< path to the DB files */
|
988
1023
|
char *me_map; /**< the memory map of the data file */
|
989
1024
|
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
|
990
1025
|
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
|
1026
|
+
void *me_pbuf; /**< scratch area for DUPSORT put() */
|
991
1027
|
MDB_txn *me_txn; /**< current write transaction */
|
992
1028
|
size_t me_mapsize; /**< size of the data memory map */
|
993
1029
|
off_t me_size; /**< current file size */
|
@@ -1019,8 +1055,8 @@ struct MDB_env {
|
|
1019
1055
|
|
1020
1056
|
/** Nested transaction */
|
1021
1057
|
typedef struct MDB_ntxn {
|
1022
|
-
MDB_txn mnt_txn;
|
1023
|
-
MDB_pgstate mnt_pgstate;
|
1058
|
+
MDB_txn mnt_txn; /**< the transaction */
|
1059
|
+
MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
|
1024
1060
|
} MDB_ntxn;
|
1025
1061
|
|
1026
1062
|
/** max number of pages to commit in one writev() call */
|
@@ -1042,6 +1078,8 @@ static int mdb_page_search_root(MDB_cursor *mc,
|
|
1042
1078
|
MDB_val *key, int modify);
|
1043
1079
|
#define MDB_PS_MODIFY 1
|
1044
1080
|
#define MDB_PS_ROOTONLY 2
|
1081
|
+
#define MDB_PS_FIRST 4
|
1082
|
+
#define MDB_PS_LAST 8
|
1045
1083
|
static int mdb_page_search(MDB_cursor *mc,
|
1046
1084
|
MDB_val *key, int flags);
|
1047
1085
|
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
|
@@ -1255,7 +1293,7 @@ static void mdb_audit(MDB_txn *txn)
|
|
1255
1293
|
txn->mt_dbs[i].md_leaf_pages +
|
1256
1294
|
txn->mt_dbs[i].md_overflow_pages;
|
1257
1295
|
if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
|
1258
|
-
mdb_page_search(&mc, NULL,
|
1296
|
+
mdb_page_search(&mc, NULL, MDB_PS_FIRST);
|
1259
1297
|
do {
|
1260
1298
|
unsigned j;
|
1261
1299
|
MDB_page *mp;
|
@@ -1300,7 +1338,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
|
|
1300
1338
|
{
|
1301
1339
|
MDB_env *env = txn->mt_env;
|
1302
1340
|
MDB_page *ret = env->me_dpages;
|
1303
|
-
size_t
|
1341
|
+
size_t psize = env->me_psize, sz = psize, off;
|
1342
|
+
/* For ! #MDB_NOMEMINIT, psize counts how much to init.
|
1343
|
+
* For a single page alloc, we init everything after the page header.
|
1344
|
+
* For multi-page, we init the final page; if the caller needed that
|
1345
|
+
* many pages they will be filling in at least up to the last page.
|
1346
|
+
*/
|
1304
1347
|
if (num == 1) {
|
1305
1348
|
if (ret) {
|
1306
1349
|
VGMEMP_ALLOC(env, ret, sz);
|
@@ -1308,10 +1351,16 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
|
|
1308
1351
|
env->me_dpages = ret->mp_next;
|
1309
1352
|
return ret;
|
1310
1353
|
}
|
1354
|
+
psize -= off = PAGEHDRSZ;
|
1311
1355
|
} else {
|
1312
1356
|
sz *= num;
|
1357
|
+
off = sz - psize;
|
1313
1358
|
}
|
1314
1359
|
if ((ret = malloc(sz)) != NULL) {
|
1360
|
+
if (!(env->me_flags & MDB_NOMEMINIT)) {
|
1361
|
+
memset((char *)ret + off, 0, psize);
|
1362
|
+
ret->mp_pad = 0;
|
1363
|
+
}
|
1315
1364
|
VGMEMP_ALLOC(env, ret, sz);
|
1316
1365
|
}
|
1317
1366
|
return ret;
|
@@ -1329,7 +1378,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
|
|
1329
1378
|
env->me_dpages = mp;
|
1330
1379
|
}
|
1331
1380
|
|
1332
|
-
|
1381
|
+
/** Free a dirty page */
|
1333
1382
|
static void
|
1334
1383
|
mdb_dpage_free(MDB_env *env, MDB_page *dp)
|
1335
1384
|
{
|
@@ -1356,7 +1405,7 @@ mdb_dlist_free(MDB_txn *txn)
|
|
1356
1405
|
dl[0].mid = 0;
|
1357
1406
|
}
|
1358
1407
|
|
1359
|
-
|
1408
|
+
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
|
1360
1409
|
* @param[in] mc A cursor handle for the current operation.
|
1361
1410
|
* @param[in] pflags Flags of the pages to update:
|
1362
1411
|
* P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
|
@@ -1366,10 +1415,12 @@ mdb_dlist_free(MDB_txn *txn)
|
|
1366
1415
|
static int
|
1367
1416
|
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
1368
1417
|
{
|
1418
|
+
enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
|
1369
1419
|
MDB_txn *txn = mc->mc_txn;
|
1370
1420
|
MDB_cursor *m3;
|
1371
1421
|
MDB_xcursor *mx;
|
1372
|
-
MDB_page *dp;
|
1422
|
+
MDB_page *dp, *mp;
|
1423
|
+
MDB_node *leaf;
|
1373
1424
|
unsigned i, j;
|
1374
1425
|
int rc = MDB_SUCCESS, level;
|
1375
1426
|
|
@@ -1378,14 +1429,24 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
|
1378
1429
|
mc = NULL; /* will find mc in mt_cursors */
|
1379
1430
|
for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
|
1380
1431
|
for (; mc; mc=mc->mc_next) {
|
1381
|
-
|
1382
|
-
|
1383
|
-
|
1384
|
-
|
1385
|
-
|
1386
|
-
|
1387
|
-
if (
|
1388
|
-
|
1432
|
+
if (!(mc->mc_flags & C_INITIALIZED))
|
1433
|
+
continue;
|
1434
|
+
for (m3 = mc;; m3 = &mx->mx_cursor) {
|
1435
|
+
mp = NULL;
|
1436
|
+
for (j=0; j<m3->mc_snum; j++) {
|
1437
|
+
mp = m3->mc_pg[j];
|
1438
|
+
if ((mp->mp_flags & Mask) == pflags)
|
1439
|
+
mp->mp_flags ^= P_KEEP;
|
1440
|
+
}
|
1441
|
+
mx = m3->mc_xcursor;
|
1442
|
+
/* Proceed to mx if it is at a sub-database */
|
1443
|
+
if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
|
1444
|
+
break;
|
1445
|
+
if (! (mp && (mp->mp_flags & P_LEAF)))
|
1446
|
+
break;
|
1447
|
+
leaf = NODEPTR(mp, m3->mc_ki[j-1]);
|
1448
|
+
if (!(leaf->mn_flags & F_SUBDATA))
|
1449
|
+
break;
|
1389
1450
|
}
|
1390
1451
|
}
|
1391
1452
|
if (i == 0)
|
@@ -1401,7 +1462,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
|
|
1401
1462
|
continue;
|
1402
1463
|
if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
|
1403
1464
|
break;
|
1404
|
-
if ((dp->mp_flags &
|
1465
|
+
if ((dp->mp_flags & Mask) == pflags && level <= 1)
|
1405
1466
|
dp->mp_flags ^= P_KEEP;
|
1406
1467
|
}
|
1407
1468
|
}
|
@@ -1415,15 +1476,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
|
|
1415
1476
|
/** Spill pages from the dirty list back to disk.
|
1416
1477
|
* This is intended to prevent running into #MDB_TXN_FULL situations,
|
1417
1478
|
* but note that they may still occur in a few cases:
|
1418
|
-
* 1)
|
1419
|
-
*
|
1420
|
-
* too full.
|
1479
|
+
* 1) our estimate of the txn size could be too small. Currently this
|
1480
|
+
* seems unlikely, except with a large number of #MDB_MULTIPLE items.
|
1421
1481
|
* 2) child txns may run out of space if their parents dirtied a
|
1422
1482
|
* lot of pages and never spilled them. TODO: we probably should do
|
1423
1483
|
* a preemptive spill during #mdb_txn_begin() of a child txn, if
|
1424
1484
|
* the parent's dirty_room is below a given threshold.
|
1425
|
-
* 3) our estimate of the txn size could be too small. At the
|
1426
|
-
* moment this seems unlikely.
|
1427
1485
|
*
|
1428
1486
|
* Otherwise, if not using nested txns, it is expected that apps will
|
1429
1487
|
* not run into #MDB_TXN_FULL any more. The pages are flushed to disk
|
@@ -1541,31 +1599,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
|
|
1541
1599
|
rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
|
1542
1600
|
|
1543
1601
|
done:
|
1544
|
-
|
1545
|
-
if (txn->mt_parent) {
|
1546
|
-
txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid;
|
1547
|
-
/* dirty pages that are dirty in an ancestor don't
|
1548
|
-
* count against this txn's dirty_room.
|
1549
|
-
*/
|
1550
|
-
for (i=1; i<=dl[0].mid; i++) {
|
1551
|
-
pgno_t pgno = dl[i].mid;
|
1552
|
-
MDB_txn *tx2;
|
1553
|
-
for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
|
1554
|
-
j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
|
1555
|
-
if (j <= tx2->mt_u.dirty_list[0].mid &&
|
1556
|
-
tx2->mt_u.dirty_list[j].mid == pgno) {
|
1557
|
-
txn->mt_dirty_room++;
|
1558
|
-
break;
|
1559
|
-
}
|
1560
|
-
}
|
1561
|
-
}
|
1562
|
-
} else {
|
1563
|
-
txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid;
|
1564
|
-
}
|
1565
|
-
txn->mt_flags |= MDB_TXN_SPILLS;
|
1566
|
-
} else {
|
1567
|
-
txn->mt_flags |= MDB_TXN_ERROR;
|
1568
|
-
}
|
1602
|
+
txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
|
1569
1603
|
return rc;
|
1570
1604
|
}
|
1571
1605
|
|
@@ -1575,12 +1609,14 @@ mdb_find_oldest(MDB_txn *txn)
|
|
1575
1609
|
{
|
1576
1610
|
int i;
|
1577
1611
|
txnid_t mr, oldest = txn->mt_txnid - 1;
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
oldest
|
1612
|
+
if (txn->mt_env->me_txns) {
|
1613
|
+
MDB_reader *r = txn->mt_env->me_txns->mti_readers;
|
1614
|
+
for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
|
1615
|
+
if (r[i].mr_pid) {
|
1616
|
+
mr = r[i].mr_txnid;
|
1617
|
+
if (oldest > mr)
|
1618
|
+
oldest = mr;
|
1619
|
+
}
|
1584
1620
|
}
|
1585
1621
|
}
|
1586
1622
|
return oldest;
|
@@ -1790,26 +1826,28 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
|
|
1790
1826
|
/** Pull a page off the txn's spill list, if present.
|
1791
1827
|
* If a page being referenced was spilled to disk in this txn, bring
|
1792
1828
|
* it back and make it dirty/writable again.
|
1793
|
-
* @param[in]
|
1829
|
+
* @param[in] txn the transaction handle.
|
1794
1830
|
* @param[in] mp the page being referenced.
|
1795
1831
|
* @param[out] ret the writable page, if any. ret is unchanged if
|
1796
1832
|
* mp wasn't spilled.
|
1797
1833
|
*/
|
1798
1834
|
static int
|
1799
|
-
mdb_page_unspill(MDB_txn *
|
1835
|
+
mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
|
1800
1836
|
{
|
1801
|
-
MDB_env *env =
|
1802
|
-
MDB_txn *
|
1837
|
+
MDB_env *env = txn->mt_env;
|
1838
|
+
const MDB_txn *tx2;
|
1803
1839
|
unsigned x;
|
1804
1840
|
pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
|
1805
1841
|
|
1806
|
-
for (
|
1807
|
-
if (!
|
1842
|
+
for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
|
1843
|
+
if (!tx2->mt_spill_pgs)
|
1808
1844
|
continue;
|
1809
|
-
x = mdb_midl_search(
|
1810
|
-
if (x <=
|
1845
|
+
x = mdb_midl_search(tx2->mt_spill_pgs, pn);
|
1846
|
+
if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
|
1811
1847
|
MDB_page *np;
|
1812
1848
|
int num;
|
1849
|
+
if (txn->mt_dirty_room == 0)
|
1850
|
+
return MDB_TXN_FULL;
|
1813
1851
|
if (IS_OVERFLOW(mp))
|
1814
1852
|
num = mp->mp_pages;
|
1815
1853
|
else
|
@@ -1825,7 +1863,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
|
|
1825
1863
|
else
|
1826
1864
|
mdb_page_copy(np, mp, env->me_psize);
|
1827
1865
|
}
|
1828
|
-
if (
|
1866
|
+
if (tx2 == txn) {
|
1829
1867
|
/* If in current txn, this page is no longer spilled.
|
1830
1868
|
* If it happens to be the last page, truncate the spill list.
|
1831
1869
|
* Otherwise mark it as deleted by setting the LSB.
|
@@ -1838,22 +1876,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
|
|
1838
1876
|
* page remains spilled until child commits
|
1839
1877
|
*/
|
1840
1878
|
|
1841
|
-
|
1842
|
-
MDB_txn *tx2;
|
1843
|
-
/* If this page is also in a parent's dirty list, then
|
1844
|
-
* it's already accounted in dirty_room, and we need to
|
1845
|
-
* cancel out the decrement that mdb_page_dirty does.
|
1846
|
-
*/
|
1847
|
-
for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
|
1848
|
-
x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
|
1849
|
-
if (x <= tx2->mt_u.dirty_list[0].mid &&
|
1850
|
-
tx2->mt_u.dirty_list[x].mid == pgno) {
|
1851
|
-
tx0->mt_dirty_room++;
|
1852
|
-
break;
|
1853
|
-
}
|
1854
|
-
}
|
1855
|
-
}
|
1856
|
-
mdb_page_dirty(tx0, np);
|
1879
|
+
mdb_page_dirty(txn, np);
|
1857
1880
|
np->mp_flags |= P_DIRTY;
|
1858
1881
|
*ret = np;
|
1859
1882
|
break;
|
@@ -1872,7 +1895,6 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1872
1895
|
MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
|
1873
1896
|
MDB_txn *txn = mc->mc_txn;
|
1874
1897
|
MDB_cursor *m2, *m3;
|
1875
|
-
MDB_dbi dbi;
|
1876
1898
|
pgno_t pgno;
|
1877
1899
|
int rc;
|
1878
1900
|
|
@@ -1889,7 +1911,8 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1889
1911
|
(rc = mdb_page_alloc(mc, 1, &np)))
|
1890
1912
|
return rc;
|
1891
1913
|
pgno = np->mp_pgno;
|
1892
|
-
DPRINTF(("touched db %
|
1914
|
+
DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
|
1915
|
+
mp->mp_pgno, pgno));
|
1893
1916
|
assert(mp->mp_pgno != pgno);
|
1894
1917
|
mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
|
1895
1918
|
/* Update the parent page, if any, to point to the new page */
|
@@ -1935,17 +1958,16 @@ mdb_page_touch(MDB_cursor *mc)
|
|
1935
1958
|
done:
|
1936
1959
|
/* Adjust cursors pointing to mp */
|
1937
1960
|
mc->mc_pg[mc->mc_top] = np;
|
1938
|
-
|
1961
|
+
m2 = txn->mt_cursors[mc->mc_dbi];
|
1939
1962
|
if (mc->mc_flags & C_SUB) {
|
1940
|
-
|
1941
|
-
for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
1963
|
+
for (; m2; m2=m2->mc_next) {
|
1942
1964
|
m3 = &m2->mc_xcursor->mx_cursor;
|
1943
1965
|
if (m3->mc_snum < mc->mc_snum) continue;
|
1944
1966
|
if (m3->mc_pg[mc->mc_top] == mp)
|
1945
1967
|
m3->mc_pg[mc->mc_top] = np;
|
1946
1968
|
}
|
1947
1969
|
} else {
|
1948
|
-
for (
|
1970
|
+
for (; m2; m2=m2->mc_next) {
|
1949
1971
|
if (m2->mc_snum < mc->mc_snum) continue;
|
1950
1972
|
if (m2->mc_pg[mc->mc_top] == mp) {
|
1951
1973
|
m2->mc_pg[mc->mc_top] = np;
|
@@ -2087,7 +2109,7 @@ enum Pidlock_op {
|
|
2087
2109
|
* lock on the lockfile, set at an offset equal to the pid.
|
2088
2110
|
*/
|
2089
2111
|
static int
|
2090
|
-
mdb_reader_pid(MDB_env *env, enum Pidlock_op op,
|
2112
|
+
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
|
2091
2113
|
{
|
2092
2114
|
#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
|
2093
2115
|
int ret = 0;
|
@@ -2130,7 +2152,9 @@ static int
|
|
2130
2152
|
mdb_txn_renew0(MDB_txn *txn)
|
2131
2153
|
{
|
2132
2154
|
MDB_env *env = txn->mt_env;
|
2133
|
-
|
2155
|
+
MDB_txninfo *ti = env->me_txns;
|
2156
|
+
MDB_meta *meta;
|
2157
|
+
unsigned int i, nr;
|
2134
2158
|
uint16_t x;
|
2135
2159
|
int rc, new_notls = 0;
|
2136
2160
|
|
@@ -2139,9 +2163,9 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2139
2163
|
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
|
2140
2164
|
|
2141
2165
|
if (txn->mt_flags & MDB_TXN_RDONLY) {
|
2142
|
-
if (!
|
2143
|
-
|
2144
|
-
txn->mt_txnid =
|
2166
|
+
if (!ti) {
|
2167
|
+
meta = env->me_metas[ mdb_env_pick_meta(env) ];
|
2168
|
+
txn->mt_txnid = meta->mm_txnid;
|
2145
2169
|
txn->mt_u.reader = NULL;
|
2146
2170
|
} else {
|
2147
2171
|
MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
|
@@ -2150,7 +2174,7 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2150
2174
|
if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
|
2151
2175
|
return MDB_BAD_RSLOT;
|
2152
2176
|
} else {
|
2153
|
-
|
2177
|
+
MDB_PID_T pid = env->me_pid;
|
2154
2178
|
pthread_t tid = pthread_self();
|
2155
2179
|
|
2156
2180
|
if (!(env->me_flags & MDB_LIVE_READER)) {
|
@@ -2163,36 +2187,43 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2163
2187
|
}
|
2164
2188
|
|
2165
2189
|
LOCK_MUTEX_R(env);
|
2166
|
-
|
2167
|
-
|
2190
|
+
nr = ti->mti_numreaders;
|
2191
|
+
for (i=0; i<nr; i++)
|
2192
|
+
if (ti->mti_readers[i].mr_pid == 0)
|
2168
2193
|
break;
|
2169
2194
|
if (i == env->me_maxreaders) {
|
2170
2195
|
UNLOCK_MUTEX_R(env);
|
2171
2196
|
return MDB_READERS_FULL;
|
2172
2197
|
}
|
2173
|
-
|
2174
|
-
|
2175
|
-
if (i
|
2176
|
-
|
2198
|
+
ti->mti_readers[i].mr_pid = pid;
|
2199
|
+
ti->mti_readers[i].mr_tid = tid;
|
2200
|
+
if (i == nr)
|
2201
|
+
ti->mti_numreaders = ++nr;
|
2177
2202
|
/* Save numreaders for un-mutexed mdb_env_close() */
|
2178
|
-
env->me_numreaders =
|
2203
|
+
env->me_numreaders = nr;
|
2179
2204
|
UNLOCK_MUTEX_R(env);
|
2180
|
-
|
2205
|
+
|
2206
|
+
r = &ti->mti_readers[i];
|
2181
2207
|
new_notls = (env->me_flags & MDB_NOTLS);
|
2182
2208
|
if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
|
2183
2209
|
r->mr_pid = 0;
|
2184
2210
|
return rc;
|
2185
2211
|
}
|
2186
2212
|
}
|
2187
|
-
txn->mt_txnid = r->mr_txnid =
|
2213
|
+
txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
|
2188
2214
|
txn->mt_u.reader = r;
|
2215
|
+
meta = env->me_metas[txn->mt_txnid & 1];
|
2189
2216
|
}
|
2190
|
-
txn->mt_toggle = txn->mt_txnid & 1;
|
2191
2217
|
} else {
|
2192
|
-
|
2218
|
+
if (ti) {
|
2219
|
+
LOCK_MUTEX_W(env);
|
2193
2220
|
|
2194
|
-
|
2195
|
-
|
2221
|
+
txn->mt_txnid = ti->mti_txnid;
|
2222
|
+
meta = env->me_metas[txn->mt_txnid & 1];
|
2223
|
+
} else {
|
2224
|
+
meta = env->me_metas[ mdb_env_pick_meta(env) ];
|
2225
|
+
txn->mt_txnid = meta->mm_txnid;
|
2226
|
+
}
|
2196
2227
|
txn->mt_txnid++;
|
2197
2228
|
#if MDB_DEBUG
|
2198
2229
|
if (txn->mt_txnid == mdb_debug_start)
|
@@ -2208,10 +2239,10 @@ mdb_txn_renew0(MDB_txn *txn)
|
|
2208
2239
|
}
|
2209
2240
|
|
2210
2241
|
/* Copy the DB info and flags */
|
2211
|
-
memcpy(txn->mt_dbs,
|
2242
|
+
memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
|
2212
2243
|
|
2213
2244
|
/* Moved to here to avoid a data race in read TXNs */
|
2214
|
-
txn->mt_next_pgno =
|
2245
|
+
txn->mt_next_pgno = meta->mm_last_pg+1;
|
2215
2246
|
|
2216
2247
|
for (i=2; i<txn->mt_numdbs; i++) {
|
2217
2248
|
x = env->me_dbflags[i];
|
@@ -2307,7 +2338,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
|
|
2307
2338
|
return ENOMEM;
|
2308
2339
|
}
|
2309
2340
|
txn->mt_txnid = parent->mt_txnid;
|
2310
|
-
txn->mt_toggle = parent->mt_toggle;
|
2311
2341
|
txn->mt_dirty_room = parent->mt_dirty_room;
|
2312
2342
|
txn->mt_u.dirty_list[0].mid = 0;
|
2313
2343
|
txn->mt_spill_pgs = NULL;
|
@@ -2433,7 +2463,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act)
|
|
2433
2463
|
|
2434
2464
|
env->me_txn = NULL;
|
2435
2465
|
/* The writer mutex was locked in mdb_txn_begin. */
|
2436
|
-
|
2466
|
+
if (env->me_txns)
|
2467
|
+
UNLOCK_MUTEX_W(env);
|
2437
2468
|
}
|
2438
2469
|
}
|
2439
2470
|
|
@@ -2482,20 +2513,26 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2482
2513
|
int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
|
2483
2514
|
txnid_t pglast = 0, head_id = 0;
|
2484
2515
|
pgno_t freecnt = 0, *free_pgs, *mop;
|
2485
|
-
ssize_t head_room = 0, total_room = 0, mop_len;
|
2516
|
+
ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
|
2486
2517
|
|
2487
2518
|
mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
|
2488
2519
|
|
2489
2520
|
if (env->me_pghead) {
|
2490
2521
|
/* Make sure first page of freeDB is touched and on freelist */
|
2491
|
-
rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
|
2522
|
+
rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
|
2492
2523
|
if (rc && rc != MDB_NOTFOUND)
|
2493
2524
|
return rc;
|
2494
2525
|
}
|
2495
2526
|
|
2527
|
+
/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
|
2528
|
+
clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
|
2529
|
+
? SSIZE_MAX : maxfree_1pg;
|
2530
|
+
|
2496
2531
|
for (;;) {
|
2497
2532
|
/* Come back here after each Put() in case freelist changed */
|
2498
2533
|
MDB_val key, data;
|
2534
|
+
pgno_t *pgs;
|
2535
|
+
ssize_t j;
|
2499
2536
|
|
2500
2537
|
/* If using records from freeDB which we have not yet
|
2501
2538
|
* deleted, delete them and any we reserved for me_pghead.
|
@@ -2516,9 +2553,7 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2516
2553
|
if (freecnt < txn->mt_free_pgs[0]) {
|
2517
2554
|
if (!freecnt) {
|
2518
2555
|
/* Make sure last page of freeDB is touched and on freelist */
|
2519
|
-
|
2520
|
-
key.mv_data = NULL;
|
2521
|
-
rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
|
2556
|
+
rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
|
2522
2557
|
if (rc && rc != MDB_NOTFOUND)
|
2523
2558
|
return rc;
|
2524
2559
|
}
|
@@ -2581,11 +2616,16 @@ mdb_freelist_save(MDB_txn *txn)
|
|
2581
2616
|
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
|
2582
2617
|
if (rc)
|
2583
2618
|
return rc;
|
2584
|
-
|
2619
|
+
/* IDL is initially empty, zero out at least the length */
|
2620
|
+
pgs = (pgno_t *)data.mv_data;
|
2621
|
+
j = head_room > clean_limit ? head_room : 0;
|
2622
|
+
do {
|
2623
|
+
pgs[j] = 0;
|
2624
|
+
} while (--j >= 0);
|
2585
2625
|
total_room += head_room;
|
2586
2626
|
}
|
2587
2627
|
|
2588
|
-
/* Fill in the reserved
|
2628
|
+
/* Fill in the reserved me_pghead records */
|
2589
2629
|
rc = MDB_SUCCESS;
|
2590
2630
|
if (mop_len) {
|
2591
2631
|
MDB_val key, data;
|
@@ -2655,8 +2695,7 @@ mdb_page_flush(MDB_txn *txn, int keep)
|
|
2655
2695
|
}
|
2656
2696
|
dp->mp_flags &= ~P_DIRTY;
|
2657
2697
|
}
|
2658
|
-
|
2659
|
-
return MDB_SUCCESS;
|
2698
|
+
goto done;
|
2660
2699
|
}
|
2661
2700
|
|
2662
2701
|
/* Write the pages */
|
@@ -2750,8 +2789,11 @@ mdb_page_flush(MDB_txn *txn, int keep)
|
|
2750
2789
|
}
|
2751
2790
|
mdb_dpage_free(env, dp);
|
2752
2791
|
}
|
2753
|
-
dl[0].mid = j;
|
2754
2792
|
|
2793
|
+
done:
|
2794
|
+
i--;
|
2795
|
+
txn->mt_dirty_room += i - j;
|
2796
|
+
dl[0].mid = j;
|
2755
2797
|
return MDB_SUCCESS;
|
2756
2798
|
}
|
2757
2799
|
|
@@ -2791,14 +2833,18 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2791
2833
|
|
2792
2834
|
if (txn->mt_parent) {
|
2793
2835
|
MDB_txn *parent = txn->mt_parent;
|
2794
|
-
unsigned x, y, len;
|
2795
2836
|
MDB_ID2L dst, src;
|
2837
|
+
MDB_IDL pspill;
|
2838
|
+
unsigned x, y, len, ps_len;
|
2796
2839
|
|
2797
2840
|
/* Append our free list to parent's */
|
2798
2841
|
rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
|
2799
2842
|
if (rc)
|
2800
2843
|
goto fail;
|
2801
2844
|
mdb_midl_free(txn->mt_free_pgs);
|
2845
|
+
/* Failures after this must either undo the changes
|
2846
|
+
* to the parent or set MDB_TXN_ERROR in the parent.
|
2847
|
+
*/
|
2802
2848
|
|
2803
2849
|
parent->mt_next_pgno = txn->mt_next_pgno;
|
2804
2850
|
parent->mt_flags = txn->mt_flags;
|
@@ -2820,37 +2866,26 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2820
2866
|
dst = parent->mt_u.dirty_list;
|
2821
2867
|
src = txn->mt_u.dirty_list;
|
2822
2868
|
/* Remove anything in our dirty list from parent's spill list */
|
2823
|
-
if (parent->mt_spill_pgs) {
|
2824
|
-
x =
|
2825
|
-
|
2826
|
-
/*
|
2827
|
-
for (i=
|
2869
|
+
if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
|
2870
|
+
x = y = ps_len;
|
2871
|
+
pspill[0] = (pgno_t)-1;
|
2872
|
+
/* Mark our dirty pages as deleted in parent spill list */
|
2873
|
+
for (i=0, len=src[0].mid; ++i <= len; ) {
|
2828
2874
|
MDB_ID pn = src[i].mid << 1;
|
2829
|
-
|
2830
|
-
continue;
|
2831
|
-
if (pn > parent->mt_spill_pgs[x]) {
|
2832
|
-
if (x <= 1)
|
2833
|
-
break;
|
2875
|
+
while (pn > pspill[x])
|
2834
2876
|
x--;
|
2835
|
-
|
2836
|
-
|
2837
|
-
|
2838
|
-
len--;
|
2839
|
-
}
|
2840
|
-
/* OK, we had a few hits, squash zeros from the spill list */
|
2841
|
-
if (len < parent->mt_spill_pgs[0]) {
|
2842
|
-
x=1;
|
2843
|
-
for (y=1; y<=parent->mt_spill_pgs[0]; y++) {
|
2844
|
-
if (parent->mt_spill_pgs[y]) {
|
2845
|
-
if (y != x) {
|
2846
|
-
parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y];
|
2847
|
-
}
|
2848
|
-
x++;
|
2849
|
-
}
|
2877
|
+
if (pn == pspill[x]) {
|
2878
|
+
pspill[x] = 1;
|
2879
|
+
y = --x;
|
2850
2880
|
}
|
2851
|
-
parent->mt_spill_pgs[0] = len;
|
2852
2881
|
}
|
2882
|
+
/* Squash deleted pagenums if we deleted any */
|
2883
|
+
for (x=y; ++x <= ps_len; )
|
2884
|
+
if (!(pspill[x] & 1))
|
2885
|
+
pspill[++y] = pspill[x];
|
2886
|
+
pspill[0] = y;
|
2853
2887
|
}
|
2888
|
+
|
2854
2889
|
/* Find len = length of merging our dirty list with parent's */
|
2855
2890
|
x = dst[0].mid;
|
2856
2891
|
dst[0].mid = 0; /* simplify loops */
|
@@ -2884,7 +2919,10 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2884
2919
|
parent->mt_dirty_room = txn->mt_dirty_room;
|
2885
2920
|
if (txn->mt_spill_pgs) {
|
2886
2921
|
if (parent->mt_spill_pgs) {
|
2887
|
-
|
2922
|
+
/* TODO: Prevent failure here, so parent does not fail */
|
2923
|
+
rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
|
2924
|
+
if (rc)
|
2925
|
+
parent->mt_flags |= MDB_TXN_ERROR;
|
2888
2926
|
mdb_midl_free(txn->mt_spill_pgs);
|
2889
2927
|
mdb_midl_sort(parent->mt_spill_pgs);
|
2890
2928
|
} else {
|
@@ -2895,7 +2933,7 @@ mdb_txn_commit(MDB_txn *txn)
|
|
2895
2933
|
parent->mt_child = NULL;
|
2896
2934
|
mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
|
2897
2935
|
free(txn);
|
2898
|
-
return
|
2936
|
+
return rc;
|
2899
2937
|
}
|
2900
2938
|
|
2901
2939
|
if (txn != env->me_txn) {
|
@@ -2954,7 +2992,8 @@ done:
|
|
2954
2992
|
env->me_txn = NULL;
|
2955
2993
|
mdb_dbis_update(txn, 1);
|
2956
2994
|
|
2957
|
-
|
2995
|
+
if (env->me_txns)
|
2996
|
+
UNLOCK_MUTEX_W(env);
|
2958
2997
|
free(txn);
|
2959
2998
|
|
2960
2999
|
return MDB_SUCCESS;
|
@@ -2973,10 +3012,11 @@ fail:
|
|
2973
3012
|
static int
|
2974
3013
|
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
2975
3014
|
{
|
2976
|
-
|
3015
|
+
MDB_metabuf pbuf;
|
2977
3016
|
MDB_page *p;
|
2978
3017
|
MDB_meta *m;
|
2979
3018
|
int i, rc, off;
|
3019
|
+
enum { Size = sizeof(pbuf) };
|
2980
3020
|
|
2981
3021
|
/* We don't know the page size yet, so use a minimum value.
|
2982
3022
|
* Read both meta pages so we can use the latest one.
|
@@ -2988,13 +3028,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
|
2988
3028
|
OVERLAPPED ov;
|
2989
3029
|
memset(&ov, 0, sizeof(ov));
|
2990
3030
|
ov.Offset = off;
|
2991
|
-
rc = ReadFile(env->me_fd
|
3031
|
+
rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
|
2992
3032
|
if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
|
2993
3033
|
rc = 0;
|
2994
3034
|
#else
|
2995
|
-
rc = pread(env->me_fd, &pbuf,
|
3035
|
+
rc = pread(env->me_fd, &pbuf, Size, off);
|
2996
3036
|
#endif
|
2997
|
-
if (rc !=
|
3037
|
+
if (rc != Size) {
|
2998
3038
|
if (rc == 0 && off == 0)
|
2999
3039
|
return ENOENT;
|
3000
3040
|
rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
|
@@ -3109,7 +3149,7 @@ mdb_env_write_meta(MDB_txn *txn)
|
|
3109
3149
|
assert(txn != NULL);
|
3110
3150
|
assert(txn->mt_env != NULL);
|
3111
3151
|
|
3112
|
-
toggle =
|
3152
|
+
toggle = txn->mt_txnid & 1;
|
3113
3153
|
DPRINTF(("writing meta page %d for root page %"Z"u",
|
3114
3154
|
toggle, txn->mt_dbs[MAIN_DBI].md_root));
|
3115
3155
|
|
@@ -3125,11 +3165,18 @@ mdb_env_write_meta(MDB_txn *txn)
|
|
3125
3165
|
mp->mm_last_pg = txn->mt_next_pgno - 1;
|
3126
3166
|
mp->mm_txnid = txn->mt_txnid;
|
3127
3167
|
if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
|
3168
|
+
unsigned meta_size = env->me_psize;
|
3128
3169
|
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
|
3129
3170
|
ptr = env->me_map;
|
3130
|
-
if (toggle)
|
3131
|
-
|
3132
|
-
|
3171
|
+
if (toggle) {
|
3172
|
+
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
|
3173
|
+
if (meta_size < env->me_os_psize)
|
3174
|
+
meta_size += meta_size;
|
3175
|
+
else
|
3176
|
+
#endif
|
3177
|
+
ptr += meta_size;
|
3178
|
+
}
|
3179
|
+
if (MDB_MSYNC(ptr, meta_size, rc)) {
|
3133
3180
|
rc = ErrCode();
|
3134
3181
|
goto fail;
|
3135
3182
|
}
|
@@ -3200,7 +3247,8 @@ done:
|
|
3200
3247
|
* readers will get consistent data regardless of how fresh or
|
3201
3248
|
* how stale their view of these values is.
|
3202
3249
|
*/
|
3203
|
-
env->me_txns
|
3250
|
+
if (env->me_txns)
|
3251
|
+
env->me_txns->mti_txnid = txn->mt_txnid;
|
3204
3252
|
|
3205
3253
|
return MDB_SUCCESS;
|
3206
3254
|
}
|
@@ -3234,6 +3282,7 @@ mdb_env_create(MDB_env **env)
|
|
3234
3282
|
e->me_wmutex = SEM_FAILED;
|
3235
3283
|
#endif
|
3236
3284
|
e->me_pid = getpid();
|
3285
|
+
GET_PAGESIZE(e->me_os_psize);
|
3237
3286
|
VGMEMP_CREATE(e,0,0);
|
3238
3287
|
*env = e;
|
3239
3288
|
return MDB_SUCCESS;
|
@@ -3276,7 +3325,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
|
|
3276
3325
|
int prot = PROT_READ;
|
3277
3326
|
if (flags & MDB_WRITEMAP) {
|
3278
3327
|
prot |= PROT_WRITE;
|
3279
|
-
if (
|
3328
|
+
if (ftruncate(env->me_fd, env->me_mapsize) < 0)
|
3280
3329
|
return ErrCode();
|
3281
3330
|
}
|
3282
3331
|
env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
|
@@ -3285,14 +3334,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
|
|
3285
3334
|
env->me_map = NULL;
|
3286
3335
|
return ErrCode();
|
3287
3336
|
}
|
3288
|
-
|
3337
|
+
|
3338
|
+
if (flags & MDB_NORDAHEAD) {
|
3339
|
+
/* Turn off readahead. It's harmful when the DB is larger than RAM. */
|
3289
3340
|
#ifdef MADV_RANDOM
|
3290
|
-
|
3341
|
+
madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
|
3291
3342
|
#else
|
3292
3343
|
#ifdef POSIX_MADV_RANDOM
|
3293
|
-
|
3344
|
+
posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
|
3294
3345
|
#endif /* POSIX_MADV_RANDOM */
|
3295
3346
|
#endif /* MADV_RANDOM */
|
3347
|
+
}
|
3296
3348
|
#endif /* _WIN32 */
|
3297
3349
|
|
3298
3350
|
/* Can happen because the address argument to mmap() is just a
|
@@ -3323,6 +3375,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
|
|
3323
3375
|
return EINVAL;
|
3324
3376
|
if (!size)
|
3325
3377
|
size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
|
3378
|
+
else if (size < env->me_mapsize) {
|
3379
|
+
/* If the configured size is smaller, make sure it's
|
3380
|
+
* still big enough. Silently round up to minimum if not.
|
3381
|
+
*/
|
3382
|
+
size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
|
3383
|
+
if (size < minsize)
|
3384
|
+
size = minsize;
|
3385
|
+
}
|
3326
3386
|
munmap(env->me_map, env->me_mapsize);
|
3327
3387
|
env->me_mapsize = size;
|
3328
3388
|
old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
|
@@ -3388,7 +3448,9 @@ mdb_env_open2(MDB_env *env)
|
|
3388
3448
|
return i;
|
3389
3449
|
DPUTS("new mdbenv");
|
3390
3450
|
newenv = 1;
|
3391
|
-
|
3451
|
+
env->me_psize = env->me_os_psize;
|
3452
|
+
if (env->me_psize > MAX_PAGESIZE)
|
3453
|
+
env->me_psize = MAX_PAGESIZE;
|
3392
3454
|
} else {
|
3393
3455
|
env->me_psize = meta.mm_psize;
|
3394
3456
|
}
|
@@ -3499,7 +3561,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_
|
|
3499
3561
|
#pragma comment(linker, "/INCLUDE:_tls_used")
|
3500
3562
|
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
|
3501
3563
|
#pragma const_seg(".CRT$XLB")
|
3502
|
-
extern const PIMAGE_TLS_CALLBACK
|
3564
|
+
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
|
3503
3565
|
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
|
3504
3566
|
#pragma const_seg()
|
3505
3567
|
#else /* WIN32 */
|
@@ -3597,7 +3659,7 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
|
|
3597
3659
|
return rc;
|
3598
3660
|
}
|
3599
3661
|
|
3600
|
-
#
|
3662
|
+
#ifdef MDB_USE_HASH
|
3601
3663
|
/*
|
3602
3664
|
* hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
|
3603
3665
|
*
|
@@ -3763,7 +3825,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
|
|
3763
3825
|
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
|
3764
3826
|
if (size < rsize && *excl > 0) {
|
3765
3827
|
#ifdef _WIN32
|
3766
|
-
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
|
3828
|
+
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
|
3767
3829
|
|| !SetEndOfFile(env->me_lfd))
|
3768
3830
|
goto fail_errno;
|
3769
3831
|
#else
|
@@ -3919,8 +3981,9 @@ fail:
|
|
3919
3981
|
* at runtime. Changing other flags requires closing the
|
3920
3982
|
* environment and re-opening it with the new flags.
|
3921
3983
|
*/
|
3922
|
-
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
|
3923
|
-
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|
|
3984
|
+
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
|
3985
|
+
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
|
3986
|
+
MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
|
3924
3987
|
|
3925
3988
|
int
|
3926
3989
|
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
|
@@ -3973,7 +4036,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
3973
4036
|
}
|
3974
4037
|
|
3975
4038
|
/* For RDONLY, get lockfile after we know datafile exists */
|
3976
|
-
if (!
|
4039
|
+
if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
|
3977
4040
|
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
|
3978
4041
|
if (rc)
|
3979
4042
|
goto leave;
|
@@ -4003,7 +4066,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
4003
4066
|
goto leave;
|
4004
4067
|
}
|
4005
4068
|
|
4006
|
-
if (
|
4069
|
+
if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
|
4007
4070
|
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
|
4008
4071
|
if (rc)
|
4009
4072
|
goto leave;
|
@@ -4033,7 +4096,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
|
|
4033
4096
|
DPRINTF(("opened dbenv %p", (void *) env));
|
4034
4097
|
if (excl > 0) {
|
4035
4098
|
rc = mdb_env_share_locks(env, &excl);
|
4099
|
+
if (rc)
|
4100
|
+
goto leave;
|
4036
4101
|
}
|
4102
|
+
if (!((flags & MDB_RDONLY) ||
|
4103
|
+
(env->me_pbuf = calloc(1, env->me_psize))))
|
4104
|
+
rc = ENOMEM;
|
4037
4105
|
}
|
4038
4106
|
|
4039
4107
|
leave:
|
@@ -4057,6 +4125,7 @@ mdb_env_close0(MDB_env *env, int excl)
|
|
4057
4125
|
for (i = env->me_maxdbs; --i > MAIN_DBI; )
|
4058
4126
|
free(env->me_dbxs[i].md_name.mv_data);
|
4059
4127
|
|
4128
|
+
free(env->me_pbuf);
|
4060
4129
|
free(env->me_dbflags);
|
4061
4130
|
free(env->me_dbxs);
|
4062
4131
|
free(env->me_path);
|
@@ -4084,7 +4153,7 @@ mdb_env_close0(MDB_env *env, int excl)
|
|
4084
4153
|
if (env->me_fd != INVALID_HANDLE_VALUE)
|
4085
4154
|
(void) close(env->me_fd);
|
4086
4155
|
if (env->me_txns) {
|
4087
|
-
|
4156
|
+
MDB_PID_T pid = env->me_pid;
|
4088
4157
|
/* Clearing readers is done in this function because
|
4089
4158
|
* me_txkey with its destructor must be disabled first.
|
4090
4159
|
*/
|
@@ -4246,14 +4315,6 @@ mdb_env_copy(MDB_env *env, const char *path)
|
|
4246
4315
|
newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
|
4247
4316
|
FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
|
4248
4317
|
#else
|
4249
|
-
#ifdef O_DIRECT
|
4250
|
-
/* The OS supports O_DIRECT, try with it */
|
4251
|
-
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666);
|
4252
|
-
/* But open can fail if O_DIRECT isn't supported by the file system
|
4253
|
-
* so retry without the flag
|
4254
|
-
*/
|
4255
|
-
if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL)
|
4256
|
-
#endif
|
4257
4318
|
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
|
4258
4319
|
#endif
|
4259
4320
|
if (newfd == INVALID_HANDLE_VALUE) {
|
@@ -4261,6 +4322,11 @@ mdb_env_copy(MDB_env *env, const char *path)
|
|
4261
4322
|
goto leave;
|
4262
4323
|
}
|
4263
4324
|
|
4325
|
+
#ifdef O_DIRECT
|
4326
|
+
/* Set O_DIRECT if the file system supports it */
|
4327
|
+
if ((rc = fcntl(newfd, F_GETFL)) != -1)
|
4328
|
+
(void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
|
4329
|
+
#endif
|
4264
4330
|
#ifdef F_NOCACHE /* __APPLE__ */
|
4265
4331
|
rc = fcntl(newfd, F_NOCACHE, 1);
|
4266
4332
|
if (rc) {
|
@@ -4308,7 +4374,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
|
|
4308
4374
|
*(size_t *)a->mv_data > *(size_t *)b->mv_data;
|
4309
4375
|
}
|
4310
4376
|
|
4311
|
-
/** Compare two items pointing at aligned int's */
|
4377
|
+
/** Compare two items pointing at aligned unsigned int's */
|
4312
4378
|
static int
|
4313
4379
|
mdb_cmp_int(const MDB_val *a, const MDB_val *b)
|
4314
4380
|
{
|
@@ -4316,7 +4382,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
|
|
4316
4382
|
*(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
|
4317
4383
|
}
|
4318
4384
|
|
4319
|
-
/** Compare two items pointing at ints of unknown alignment.
|
4385
|
+
/** Compare two items pointing at unsigned ints of unknown alignment.
|
4320
4386
|
* Nodes and keys are guaranteed to be 2-byte aligned.
|
4321
4387
|
*/
|
4322
4388
|
static int
|
@@ -4514,8 +4580,8 @@ mdb_cursor_pop(MDB_cursor *mc)
|
|
4514
4580
|
if (mc->mc_snum)
|
4515
4581
|
mc->mc_top--;
|
4516
4582
|
|
4517
|
-
DPRINTF(("popped page %"Z"u off db %
|
4518
|
-
mc
|
4583
|
+
DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
|
4584
|
+
DDBI(mc), (void *) mc));
|
4519
4585
|
}
|
4520
4586
|
}
|
4521
4587
|
|
@@ -4523,8 +4589,8 @@ mdb_cursor_pop(MDB_cursor *mc)
|
|
4523
4589
|
static int
|
4524
4590
|
mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
|
4525
4591
|
{
|
4526
|
-
DPRINTF(("pushing page %"Z"u on db %
|
4527
|
-
mc
|
4592
|
+
DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
|
4593
|
+
DDBI(mc), (void *) mc));
|
4528
4594
|
|
4529
4595
|
if (mc->mc_snum >= CURSOR_STACK) {
|
4530
4596
|
assert(mc->mc_snum < CURSOR_STACK);
|
@@ -4598,18 +4664,11 @@ done:
|
|
4598
4664
|
return MDB_SUCCESS;
|
4599
4665
|
}
|
4600
4666
|
|
4601
|
-
/**
|
4602
|
-
*
|
4603
|
-
* search on a cursor that has already been initialized. (Usually by
|
4604
|
-
* #mdb_page_search() but also by #mdb_node_move().)
|
4605
|
-
* @param[in,out] mc the cursor for this operation.
|
4606
|
-
* @param[in] key the key to search for. If NULL, search for the lowest
|
4607
|
-
* page. (This is used by #mdb_cursor_first().)
|
4608
|
-
* @param[in] modify If true, visited pages are updated with new page numbers.
|
4609
|
-
* @return 0 on success, non-zero on failure.
|
4667
|
+
/** Finish #mdb_page_search() / #mdb_page_search_lowest().
|
4668
|
+
* The cursor is at the root page, set up the rest of it.
|
4610
4669
|
*/
|
4611
4670
|
static int
|
4612
|
-
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int
|
4671
|
+
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
|
4613
4672
|
{
|
4614
4673
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
4615
4674
|
int rc;
|
@@ -4623,11 +4682,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4623
4682
|
assert(NUMKEYS(mp) > 1);
|
4624
4683
|
DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
|
4625
4684
|
|
4626
|
-
if (
|
4685
|
+
if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
|
4627
4686
|
i = 0;
|
4628
|
-
|
4629
|
-
|
4630
|
-
i = NUMKEYS(mp)-1;
|
4687
|
+
if (flags & MDB_PS_LAST)
|
4688
|
+
i = NUMKEYS(mp) - 1;
|
4631
4689
|
} else {
|
4632
4690
|
int exact;
|
4633
4691
|
node = mdb_node_search(mc, key, &exact);
|
@@ -4640,10 +4698,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4640
4698
|
i--;
|
4641
4699
|
}
|
4642
4700
|
}
|
4701
|
+
DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
|
4643
4702
|
}
|
4644
4703
|
|
4645
|
-
if (key)
|
4646
|
-
DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
|
4647
4704
|
assert(i < NUMKEYS(mp));
|
4648
4705
|
node = NODEPTR(mp, i);
|
4649
4706
|
|
@@ -4654,7 +4711,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4654
4711
|
if ((rc = mdb_cursor_push(mc, mp)))
|
4655
4712
|
return rc;
|
4656
4713
|
|
4657
|
-
if (
|
4714
|
+
if (flags & MDB_PS_MODIFY) {
|
4658
4715
|
if ((rc = mdb_page_touch(mc)) != 0)
|
4659
4716
|
return rc;
|
4660
4717
|
mp = mc->mc_pg[mc->mc_top];
|
@@ -4668,7 +4725,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4668
4725
|
}
|
4669
4726
|
|
4670
4727
|
DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
|
4671
|
-
key ? DKEY(key) :
|
4728
|
+
key ? DKEY(key) : "null"));
|
4672
4729
|
mc->mc_flags |= C_INITIALIZED;
|
4673
4730
|
mc->mc_flags &= ~C_EOF;
|
4674
4731
|
|
@@ -4694,18 +4751,17 @@ mdb_page_search_lowest(MDB_cursor *mc)
|
|
4694
4751
|
mc->mc_ki[mc->mc_top] = 0;
|
4695
4752
|
if ((rc = mdb_cursor_push(mc, mp)))
|
4696
4753
|
return rc;
|
4697
|
-
return mdb_page_search_root(mc, NULL,
|
4754
|
+
return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
|
4698
4755
|
}
|
4699
4756
|
|
4700
4757
|
/** Search for the page a given key should be in.
|
4701
|
-
*
|
4702
|
-
* the search; it finds the root page for \b mc's database and sets this
|
4703
|
-
* as the root of the cursor's stack. Then #mdb_page_search_root() is
|
4704
|
-
* called to complete the search.
|
4758
|
+
* Push it and its parent pages on the cursor stack.
|
4705
4759
|
* @param[in,out] mc the cursor for this operation.
|
4706
|
-
* @param[in] key the key to search for
|
4707
|
-
*
|
4708
|
-
*
|
4760
|
+
* @param[in] key the key to search for, or NULL for first/last page.
|
4761
|
+
* @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
|
4762
|
+
* are touched (updated with new page numbers).
|
4763
|
+
* If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
|
4764
|
+
* This is used by #mdb_cursor_first() and #mdb_cursor_last().
|
4709
4765
|
* If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
|
4710
4766
|
* @return 0 on success, non-zero on failure.
|
4711
4767
|
*/
|
@@ -4716,23 +4772,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4716
4772
|
pgno_t root;
|
4717
4773
|
|
4718
4774
|
/* Make sure the txn is still viable, then find the root from
|
4719
|
-
* the txn's db table.
|
4775
|
+
* the txn's db table and set it as the root of the cursor's stack.
|
4720
4776
|
*/
|
4721
4777
|
if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
|
4722
4778
|
DPUTS("transaction has failed, must abort");
|
4723
4779
|
return MDB_BAD_TXN;
|
4724
4780
|
} else {
|
4725
4781
|
/* Make sure we're using an up-to-date root */
|
4726
|
-
if (mc->
|
4727
|
-
if ((*mc->mc_dbflag & DB_STALE) ||
|
4728
|
-
((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
|
4782
|
+
if (*mc->mc_dbflag & DB_STALE) {
|
4729
4783
|
MDB_cursor mc2;
|
4730
|
-
unsigned char dbflag = 0;
|
4731
4784
|
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
|
4732
|
-
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name,
|
4785
|
+
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
|
4733
4786
|
if (rc)
|
4734
4787
|
return rc;
|
4735
|
-
|
4788
|
+
{
|
4736
4789
|
MDB_val data;
|
4737
4790
|
int exact = 0;
|
4738
4791
|
uint16_t flags;
|
@@ -4752,11 +4805,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4752
4805
|
return MDB_INCOMPATIBLE;
|
4753
4806
|
memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
|
4754
4807
|
}
|
4755
|
-
if (flags & MDB_PS_MODIFY)
|
4756
|
-
dbflag = DB_DIRTY;
|
4757
4808
|
*mc->mc_dbflag &= ~DB_STALE;
|
4758
|
-
*mc->mc_dbflag |= dbflag;
|
4759
|
-
}
|
4760
4809
|
}
|
4761
4810
|
root = mc->mc_db->md_root;
|
4762
4811
|
|
@@ -4774,8 +4823,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4774
4823
|
mc->mc_snum = 1;
|
4775
4824
|
mc->mc_top = 0;
|
4776
4825
|
|
4777
|
-
DPRINTF(("db %
|
4778
|
-
mc
|
4826
|
+
DPRINTF(("db %d root page %"Z"u has flags 0x%X",
|
4827
|
+
DDBI(mc), root, mc->mc_pg[0]->mp_flags));
|
4779
4828
|
|
4780
4829
|
if (flags & MDB_PS_MODIFY) {
|
4781
4830
|
if ((rc = mdb_page_touch(mc)))
|
@@ -4914,7 +4963,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi,
|
|
4914
4963
|
if (txn->mt_flags & MDB_TXN_ERROR)
|
4915
4964
|
return MDB_BAD_TXN;
|
4916
4965
|
|
4917
|
-
if (key->mv_size
|
4966
|
+
if (key->mv_size > MDB_MAXKEYSIZE) {
|
4918
4967
|
return MDB_BAD_VALSIZE;
|
4919
4968
|
}
|
4920
4969
|
|
@@ -4966,8 +5015,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
|
|
4966
5015
|
assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
|
4967
5016
|
|
4968
5017
|
indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
4969
|
-
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0)
|
5018
|
+
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
|
5019
|
+
/* mc will be inconsistent if caller does mc_snum++ as above */
|
5020
|
+
mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
|
4970
5021
|
return rc;
|
5022
|
+
}
|
4971
5023
|
|
4972
5024
|
mdb_cursor_push(mc, mp);
|
4973
5025
|
if (!move_right)
|
@@ -5143,7 +5195,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5143
5195
|
|
5144
5196
|
assert(mc);
|
5145
5197
|
assert(key);
|
5146
|
-
|
5198
|
+
if (key->mv_size == 0)
|
5199
|
+
return MDB_BAD_VALSIZE;
|
5147
5200
|
|
5148
5201
|
if (mc->mc_xcursor)
|
5149
5202
|
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
|
@@ -5329,7 +5382,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
5329
5382
|
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
|
5330
5383
|
|
5331
5384
|
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
5332
|
-
rc = mdb_page_search(mc, NULL,
|
5385
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
|
5333
5386
|
if (rc != MDB_SUCCESS)
|
5334
5387
|
return rc;
|
5335
5388
|
}
|
@@ -5375,11 +5428,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
5375
5428
|
if (!(mc->mc_flags & C_EOF)) {
|
5376
5429
|
|
5377
5430
|
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
5378
|
-
|
5379
|
-
|
5380
|
-
lkey.mv_size = MDB_MAXKEYSIZE+1;
|
5381
|
-
lkey.mv_data = NULL;
|
5382
|
-
rc = mdb_page_search(mc, &lkey, 0);
|
5431
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
|
5383
5432
|
if (rc != MDB_SUCCESS)
|
5384
5433
|
return rc;
|
5385
5434
|
}
|
@@ -5431,8 +5480,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5431
5480
|
rc = EINVAL;
|
5432
5481
|
} else {
|
5433
5482
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
5434
|
-
|
5435
|
-
|
5483
|
+
int nkeys = NUMKEYS(mp);
|
5484
|
+
if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
|
5485
|
+
mc->mc_ki[mc->mc_top] = nkeys;
|
5436
5486
|
rc = MDB_NOTFOUND;
|
5437
5487
|
break;
|
5438
5488
|
}
|
@@ -5471,7 +5521,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5471
5521
|
case MDB_SET_RANGE:
|
5472
5522
|
if (key == NULL) {
|
5473
5523
|
rc = EINVAL;
|
5474
|
-
} else if (key->mv_size
|
5524
|
+
} else if (key->mv_size > MDB_MAXKEYSIZE) {
|
5475
5525
|
rc = MDB_BAD_VALSIZE;
|
5476
5526
|
} else if (op == MDB_SET_RANGE)
|
5477
5527
|
rc = mdb_cursor_set(mc, key, data, op, NULL);
|
@@ -5577,14 +5627,14 @@ fetchm:
|
|
5577
5627
|
return rc;
|
5578
5628
|
}
|
5579
5629
|
|
5580
|
-
/** Touch all the pages in the cursor stack.
|
5630
|
+
/** Touch all the pages in the cursor stack. Set mc_top.
|
5581
5631
|
* Makes sure all the pages are writable, before attempting a write operation.
|
5582
5632
|
* @param[in] mc The cursor to operate on.
|
5583
5633
|
*/
|
5584
5634
|
static int
|
5585
5635
|
mdb_cursor_touch(MDB_cursor *mc)
|
5586
5636
|
{
|
5587
|
-
int rc;
|
5637
|
+
int rc = MDB_SUCCESS;
|
5588
5638
|
|
5589
5639
|
if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
|
5590
5640
|
MDB_cursor mc2;
|
@@ -5595,13 +5645,14 @@ mdb_cursor_touch(MDB_cursor *mc)
|
|
5595
5645
|
return rc;
|
5596
5646
|
*mc->mc_dbflag |= DB_DIRTY;
|
5597
5647
|
}
|
5598
|
-
|
5599
|
-
|
5600
|
-
|
5601
|
-
|
5648
|
+
mc->mc_top = 0;
|
5649
|
+
if (mc->mc_snum) {
|
5650
|
+
do {
|
5651
|
+
rc = mdb_page_touch(mc);
|
5652
|
+
} while (!rc && ++(mc->mc_top) < mc->mc_snum);
|
5653
|
+
mc->mc_top = mc->mc_snum-1;
|
5602
5654
|
}
|
5603
|
-
|
5604
|
-
return MDB_SUCCESS;
|
5655
|
+
return rc;
|
5605
5656
|
}
|
5606
5657
|
|
5607
5658
|
/** Do not spill pages to disk if txn is getting full, may fail instead */
|
@@ -5612,15 +5663,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5612
5663
|
unsigned int flags)
|
5613
5664
|
{
|
5614
5665
|
enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
|
5666
|
+
MDB_env *env = mc->mc_txn->mt_env;
|
5615
5667
|
MDB_node *leaf = NULL;
|
5616
5668
|
MDB_val xdata, *rdata, dkey;
|
5617
|
-
MDB_page *fp;
|
5618
5669
|
MDB_db dummy;
|
5619
5670
|
int do_sub = 0, insert = 0;
|
5620
5671
|
unsigned int mcount = 0, dcount = 0, nospill;
|
5621
5672
|
size_t nsize;
|
5622
5673
|
int rc, rc2;
|
5623
|
-
MDB_pagebuf pbuf;
|
5624
5674
|
char dbuf[MDB_MAXKEYSIZE+1];
|
5625
5675
|
unsigned int nflags;
|
5626
5676
|
DKBUF;
|
@@ -5652,8 +5702,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5652
5702
|
return MDB_BAD_VALSIZE;
|
5653
5703
|
#endif
|
5654
5704
|
|
5655
|
-
DPRINTF(("==> put db %
|
5656
|
-
mc
|
5705
|
+
DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
|
5706
|
+
DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
|
5657
5707
|
|
5658
5708
|
dkey.mv_size = 0;
|
5659
5709
|
|
@@ -5664,6 +5714,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5664
5714
|
} else if (mc->mc_db->md_root == P_INVALID) {
|
5665
5715
|
/* new database, cursor has nothing to point to */
|
5666
5716
|
mc->mc_snum = 0;
|
5717
|
+
mc->mc_top = 0;
|
5667
5718
|
mc->mc_flags &= ~C_INITIALIZED;
|
5668
5719
|
rc = MDB_NO_ROOT;
|
5669
5720
|
} else {
|
@@ -5733,6 +5784,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5733
5784
|
|
5734
5785
|
/* The key already exists */
|
5735
5786
|
if (rc == MDB_SUCCESS) {
|
5787
|
+
MDB_page *fp, *mp;
|
5788
|
+
MDB_val olddata;
|
5789
|
+
|
5736
5790
|
/* there's only a key anyway, so this is a no-op */
|
5737
5791
|
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
5738
5792
|
unsigned int ksize = mc->mc_db->md_pad;
|
@@ -5745,19 +5799,23 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
5745
5799
|
return MDB_SUCCESS;
|
5746
5800
|
}
|
5747
5801
|
|
5802
|
+
more:
|
5748
5803
|
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
5804
|
+
olddata.mv_size = NODEDSZ(leaf);
|
5805
|
+
olddata.mv_data = NODEDATA(leaf);
|
5749
5806
|
|
5750
5807
|
/* DB has dups? */
|
5751
5808
|
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
|
5809
|
+
mp = fp = xdata.mv_data = env->me_pbuf;
|
5810
|
+
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5811
|
+
|
5752
5812
|
/* Was a single item before, must convert now */
|
5753
|
-
more:
|
5754
5813
|
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
5755
5814
|
/* Just overwrite the current item */
|
5756
5815
|
if (flags == MDB_CURRENT)
|
5757
5816
|
goto current;
|
5758
5817
|
|
5759
|
-
dkey
|
5760
|
-
dkey.mv_data = NODEDATA(leaf);
|
5818
|
+
dkey = olddata;
|
5761
5819
|
#if UINT_MAX < SIZE_MAX
|
5762
5820
|
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
|
5763
5821
|
#ifdef MISALIGNED_OK
|
@@ -5780,85 +5838,76 @@ more:
|
|
5780
5838
|
/* create a fake page for the dup items */
|
5781
5839
|
memcpy(dbuf, dkey.mv_data, dkey.mv_size);
|
5782
5840
|
dkey.mv_data = dbuf;
|
5783
|
-
fp = (MDB_page *)&pbuf;
|
5784
|
-
fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5785
5841
|
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
|
5786
5842
|
fp->mp_lower = PAGEHDRSZ;
|
5787
|
-
|
5843
|
+
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
|
5788
5844
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5789
5845
|
fp->mp_flags |= P_LEAF2;
|
5790
5846
|
fp->mp_pad = data->mv_size;
|
5791
|
-
|
5847
|
+
xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
|
5792
5848
|
} else {
|
5793
|
-
|
5849
|
+
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
|
5794
5850
|
(dkey.mv_size & 1) + (data->mv_size & 1);
|
5795
5851
|
}
|
5796
|
-
|
5797
|
-
|
5798
|
-
|
5799
|
-
|
5800
|
-
|
5801
|
-
|
5802
|
-
goto new_sub;
|
5803
|
-
}
|
5804
|
-
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
5852
|
+
fp->mp_upper = xdata.mv_size;
|
5853
|
+
} else if (leaf->mn_flags & F_SUBDATA) {
|
5854
|
+
/* Data is on sub-DB, just store it */
|
5855
|
+
flags |= F_DUPDATA|F_SUBDATA;
|
5856
|
+
goto put_sub;
|
5857
|
+
} else {
|
5805
5858
|
/* See if we need to convert from fake page to subDB */
|
5806
|
-
MDB_page *mp;
|
5807
5859
|
unsigned int offset;
|
5808
5860
|
unsigned int i;
|
5809
5861
|
uint16_t fp_flags;
|
5810
5862
|
|
5811
|
-
fp =
|
5812
|
-
|
5813
|
-
|
5863
|
+
fp = olddata.mv_data;
|
5864
|
+
switch (flags) {
|
5865
|
+
default:
|
5866
|
+
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
|
5867
|
+
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
5868
|
+
offset += offset & 1;
|
5869
|
+
break;
|
5870
|
+
}
|
5871
|
+
offset = fp->mp_pad;
|
5872
|
+
if (SIZELEFT(fp) < offset) {
|
5873
|
+
offset *= 4; /* space for 4 more */
|
5874
|
+
break;
|
5875
|
+
}
|
5876
|
+
/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
|
5877
|
+
case MDB_CURRENT:
|
5814
5878
|
fp->mp_flags |= P_DIRTY;
|
5815
|
-
COPY_PGNO(fp->mp_pgno,
|
5879
|
+
COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
|
5816
5880
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
|
5817
5881
|
flags |= F_DUPDATA;
|
5818
5882
|
goto put_sub;
|
5819
5883
|
}
|
5820
|
-
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5821
|
-
offset = fp->mp_pad;
|
5822
|
-
if (SIZELEFT(fp) >= offset)
|
5823
|
-
goto reuse;
|
5824
|
-
offset *= 4; /* space for 4 more */
|
5825
|
-
} else {
|
5826
|
-
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
5827
|
-
}
|
5828
|
-
offset += offset & 1;
|
5829
5884
|
fp_flags = fp->mp_flags;
|
5830
|
-
|
5831
|
-
|
5885
|
+
xdata.mv_size = olddata.mv_size + offset;
|
5886
|
+
if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
|
5887
|
+
>= env->me_nodemax) {
|
5832
5888
|
/* yes, convert it */
|
5833
|
-
dummy.md_flags = 0;
|
5834
5889
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5835
5890
|
dummy.md_pad = fp->mp_pad;
|
5836
5891
|
dummy.md_flags = MDB_DUPFIXED;
|
5837
5892
|
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
|
5838
5893
|
dummy.md_flags |= MDB_INTEGERKEY;
|
5894
|
+
} else {
|
5895
|
+
dummy.md_pad = 0;
|
5896
|
+
dummy.md_flags = 0;
|
5839
5897
|
}
|
5840
5898
|
dummy.md_depth = 1;
|
5841
5899
|
dummy.md_branch_pages = 0;
|
5842
5900
|
dummy.md_leaf_pages = 1;
|
5843
5901
|
dummy.md_overflow_pages = 0;
|
5844
5902
|
dummy.md_entries = NUMKEYS(fp);
|
5845
|
-
rdata = &xdata;
|
5846
5903
|
xdata.mv_size = sizeof(MDB_db);
|
5847
5904
|
xdata.mv_data = &dummy;
|
5848
5905
|
if ((rc = mdb_page_alloc(mc, 1, &mp)))
|
5849
5906
|
return rc;
|
5850
|
-
offset =
|
5907
|
+
offset = env->me_psize - olddata.mv_size;
|
5851
5908
|
flags |= F_DUPDATA|F_SUBDATA;
|
5852
5909
|
dummy.md_root = mp->mp_pgno;
|
5853
5910
|
fp_flags &= ~P_SUBP;
|
5854
|
-
} else {
|
5855
|
-
/* no, just grow it */
|
5856
|
-
rdata = &xdata;
|
5857
|
-
xdata.mv_size = NODEDSZ(leaf) + offset;
|
5858
|
-
xdata.mv_data = &pbuf;
|
5859
|
-
mp = (MDB_page *)&pbuf;
|
5860
|
-
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5861
|
-
flags |= F_DUPDATA;
|
5862
5911
|
}
|
5863
5912
|
mp->mp_flags = fp_flags | P_DIRTY;
|
5864
5913
|
mp->mp_pad = fp->mp_pad;
|
@@ -5867,28 +5916,27 @@ reuse:
|
|
5867
5916
|
if (IS_LEAF2(fp)) {
|
5868
5917
|
memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
|
5869
5918
|
} else {
|
5870
|
-
|
5871
|
-
|
5919
|
+
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
|
5920
|
+
olddata.mv_size - fp->mp_upper);
|
5872
5921
|
for (i=0; i<NUMKEYS(fp); i++)
|
5873
5922
|
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
|
5874
5923
|
}
|
5875
|
-
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5876
|
-
do_sub = 1;
|
5877
|
-
goto new_sub;
|
5878
5924
|
}
|
5879
|
-
|
5880
|
-
|
5881
|
-
|
5925
|
+
|
5926
|
+
rdata = &xdata;
|
5927
|
+
flags |= F_DUPDATA;
|
5928
|
+
do_sub = 1;
|
5929
|
+
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5930
|
+
goto new_sub;
|
5882
5931
|
}
|
5883
5932
|
current:
|
5884
5933
|
/* overflow page overwrites need special handling */
|
5885
5934
|
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
5886
5935
|
MDB_page *omp;
|
5887
5936
|
pgno_t pg;
|
5888
|
-
|
5889
|
-
int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
|
5937
|
+
int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
|
5890
5938
|
|
5891
|
-
memcpy(&pg,
|
5939
|
+
memcpy(&pg, olddata.mv_data, sizeof(pg));
|
5892
5940
|
if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
|
5893
5941
|
return rc2;
|
5894
5942
|
ovpages = omp->mp_pages;
|
@@ -5896,7 +5944,7 @@ current:
|
|
5896
5944
|
/* Is the ov page large enough? */
|
5897
5945
|
if (ovpages >= dpages) {
|
5898
5946
|
if (!(omp->mp_flags & P_DIRTY) &&
|
5899
|
-
(level || (
|
5947
|
+
(level || (env->me_flags & MDB_WRITEMAP)))
|
5900
5948
|
{
|
5901
5949
|
rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
|
5902
5950
|
if (rc)
|
@@ -5911,7 +5959,7 @@ current:
|
|
5911
5959
|
*/
|
5912
5960
|
if (level > 1) {
|
5913
5961
|
/* It is writable only in a parent txn */
|
5914
|
-
size_t sz = (size_t)
|
5962
|
+
size_t sz = (size_t) env->me_psize * ovpages, off;
|
5915
5963
|
MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
|
5916
5964
|
MDB_ID2 id2;
|
5917
5965
|
if (!np)
|
@@ -5941,15 +5989,15 @@ current:
|
|
5941
5989
|
}
|
5942
5990
|
if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
|
5943
5991
|
return rc2;
|
5944
|
-
} else if (
|
5992
|
+
} else if (data->mv_size == olddata.mv_size) {
|
5945
5993
|
/* same size, just replace it. Note that we could
|
5946
5994
|
* also reuse this node if the new data is smaller,
|
5947
5995
|
* but instead we opt to shrink the node in that case.
|
5948
5996
|
*/
|
5949
5997
|
if (F_ISSET(flags, MDB_RESERVE))
|
5950
|
-
data->mv_data =
|
5998
|
+
data->mv_data = olddata.mv_data;
|
5951
5999
|
else if (data->mv_size)
|
5952
|
-
memcpy(
|
6000
|
+
memcpy(olddata.mv_data, data->mv_data, data->mv_size);
|
5953
6001
|
else
|
5954
6002
|
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
|
5955
6003
|
goto done;
|
@@ -5965,7 +6013,7 @@ current:
|
|
5965
6013
|
|
5966
6014
|
new_sub:
|
5967
6015
|
nflags = flags & NODE_ADD_FLAGS;
|
5968
|
-
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(
|
6016
|
+
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
|
5969
6017
|
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
|
5970
6018
|
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
|
5971
6019
|
nflags &= ~MDB_APPEND;
|
@@ -5982,9 +6030,6 @@ new_sub:
|
|
5982
6030
|
unsigned i = mc->mc_top;
|
5983
6031
|
MDB_page *mp = mc->mc_pg[i];
|
5984
6032
|
|
5985
|
-
if (mc->mc_flags & C_SUB)
|
5986
|
-
dbi--;
|
5987
|
-
|
5988
6033
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
5989
6034
|
if (mc->mc_flags & C_SUB)
|
5990
6035
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -6062,7 +6107,6 @@ next_mult:
|
|
6062
6107
|
data[1].mv_size = mcount;
|
6063
6108
|
if (mcount < dcount) {
|
6064
6109
|
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
|
6065
|
-
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
6066
6110
|
goto more;
|
6067
6111
|
}
|
6068
6112
|
}
|
@@ -6081,6 +6125,7 @@ int
|
|
6081
6125
|
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
6082
6126
|
{
|
6083
6127
|
MDB_node *leaf;
|
6128
|
+
MDB_page *mp;
|
6084
6129
|
int rc;
|
6085
6130
|
|
6086
6131
|
if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
@@ -6089,17 +6134,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
|
6089
6134
|
if (!(mc->mc_flags & C_INITIALIZED))
|
6090
6135
|
return EINVAL;
|
6091
6136
|
|
6137
|
+
if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
|
6138
|
+
return MDB_NOTFOUND;
|
6139
|
+
|
6092
6140
|
if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
|
6093
6141
|
return rc;
|
6094
|
-
flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */
|
6095
6142
|
|
6096
6143
|
rc = mdb_cursor_touch(mc);
|
6097
6144
|
if (rc)
|
6098
6145
|
return rc;
|
6099
6146
|
|
6100
|
-
|
6147
|
+
mp = mc->mc_pg[mc->mc_top];
|
6148
|
+
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
6101
6149
|
|
6102
|
-
if (!IS_LEAF2(
|
6150
|
+
if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
6103
6151
|
if (!(flags & MDB_NODUPDATA)) {
|
6104
6152
|
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
6105
6153
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
@@ -6114,13 +6162,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
|
6114
6162
|
} else {
|
6115
6163
|
MDB_cursor *m2;
|
6116
6164
|
/* shrink fake page */
|
6117
|
-
mdb_node_shrink(
|
6118
|
-
leaf = NODEPTR(
|
6165
|
+
mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
|
6166
|
+
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
6119
6167
|
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6120
6168
|
/* fix other sub-DB cursors pointed at this fake page */
|
6121
6169
|
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
|
6122
6170
|
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
|
6123
|
-
if (m2->mc_pg[mc->mc_top] ==
|
6171
|
+
if (m2->mc_pg[mc->mc_top] == mp &&
|
6124
6172
|
m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
|
6125
6173
|
m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6126
6174
|
}
|
@@ -6252,6 +6300,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6252
6300
|
{
|
6253
6301
|
unsigned int i;
|
6254
6302
|
size_t node_size = NODESIZE;
|
6303
|
+
ssize_t room;
|
6255
6304
|
indx_t ofs;
|
6256
6305
|
MDB_node *node;
|
6257
6306
|
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
@@ -6264,7 +6313,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6264
6313
|
IS_LEAF(mp) ? "leaf" : "branch",
|
6265
6314
|
IS_SUBP(mp) ? "sub-" : "",
|
6266
6315
|
mp->mp_pgno, indx, data ? data->mv_size : 0,
|
6267
|
-
key ? key->mv_size : 0, key ? DKEY(key) :
|
6316
|
+
key ? key->mv_size : 0, key ? DKEY(key) : "null"));
|
6268
6317
|
|
6269
6318
|
if (IS_LEAF2(mp)) {
|
6270
6319
|
/* Move higher keys up one slot. */
|
@@ -6282,9 +6331,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6282
6331
|
return MDB_SUCCESS;
|
6283
6332
|
}
|
6284
6333
|
|
6334
|
+
room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
|
6285
6335
|
if (key != NULL)
|
6286
6336
|
node_size += key->mv_size;
|
6287
|
-
|
6288
6337
|
if (IS_LEAF(mp)) {
|
6289
6338
|
assert(data);
|
6290
6339
|
if (F_ISSET(flags, F_BIGDATA)) {
|
@@ -6296,26 +6345,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6296
6345
|
/* Put data on overflow page. */
|
6297
6346
|
DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
|
6298
6347
|
data->mv_size, node_size+data->mv_size));
|
6299
|
-
node_size += sizeof(pgno_t);
|
6348
|
+
node_size += sizeof(pgno_t) + (node_size & 1);
|
6349
|
+
if ((ssize_t)node_size > room)
|
6350
|
+
goto full;
|
6300
6351
|
if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
|
6301
6352
|
return rc;
|
6302
6353
|
DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
|
6303
6354
|
flags |= F_BIGDATA;
|
6355
|
+
goto update;
|
6304
6356
|
} else {
|
6305
6357
|
node_size += data->mv_size;
|
6306
6358
|
}
|
6307
6359
|
}
|
6308
6360
|
node_size += node_size & 1;
|
6361
|
+
if ((ssize_t)node_size > room)
|
6362
|
+
goto full;
|
6309
6363
|
|
6310
|
-
|
6311
|
-
DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
|
6312
|
-
mp->mp_pgno, NUMKEYS(mp)));
|
6313
|
-
DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
|
6314
|
-
mp->mp_upper - mp->mp_lower));
|
6315
|
-
DPRINTF(("node size = %"Z"u", node_size));
|
6316
|
-
return MDB_PAGE_FULL;
|
6317
|
-
}
|
6318
|
-
|
6364
|
+
update:
|
6319
6365
|
/* Move higher pointers up one slot. */
|
6320
6366
|
for (i = NUMKEYS(mp); i > indx; i--)
|
6321
6367
|
mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
|
@@ -6361,6 +6407,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
6361
6407
|
}
|
6362
6408
|
|
6363
6409
|
return MDB_SUCCESS;
|
6410
|
+
|
6411
|
+
full:
|
6412
|
+
DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
|
6413
|
+
mp->mp_pgno, NUMKEYS(mp)));
|
6414
|
+
DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
|
6415
|
+
DPRINTF(("node size = %"Z"u", node_size));
|
6416
|
+
return MDB_PAGE_FULL;
|
6364
6417
|
}
|
6365
6418
|
|
6366
6419
|
/** Delete the specified node from a page.
|
@@ -6495,11 +6548,13 @@ mdb_xcursor_init0(MDB_cursor *mc)
|
|
6495
6548
|
mx->mx_cursor.mc_txn = mc->mc_txn;
|
6496
6549
|
mx->mx_cursor.mc_db = &mx->mx_db;
|
6497
6550
|
mx->mx_cursor.mc_dbx = &mx->mx_dbx;
|
6498
|
-
mx->mx_cursor.mc_dbi = mc->mc_dbi
|
6551
|
+
mx->mx_cursor.mc_dbi = mc->mc_dbi;
|
6499
6552
|
mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
|
6500
6553
|
mx->mx_cursor.mc_snum = 0;
|
6501
6554
|
mx->mx_cursor.mc_top = 0;
|
6502
6555
|
mx->mx_cursor.mc_flags = C_SUB;
|
6556
|
+
mx->mx_dbx.md_name.mv_size = 0;
|
6557
|
+
mx->mx_dbx.md_name.mv_data = NULL;
|
6503
6558
|
mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
|
6504
6559
|
mx->mx_dbx.md_dcmp = NULL;
|
6505
6560
|
mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
|
@@ -6520,6 +6575,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6520
6575
|
memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
|
6521
6576
|
mx->mx_cursor.mc_pg[0] = 0;
|
6522
6577
|
mx->mx_cursor.mc_snum = 0;
|
6578
|
+
mx->mx_cursor.mc_top = 0;
|
6523
6579
|
mx->mx_cursor.mc_flags = C_SUB;
|
6524
6580
|
} else {
|
6525
6581
|
MDB_page *fp = NODEDATA(node);
|
@@ -6532,8 +6588,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6532
6588
|
mx->mx_db.md_entries = NUMKEYS(fp);
|
6533
6589
|
COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
|
6534
6590
|
mx->mx_cursor.mc_snum = 1;
|
6535
|
-
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
|
6536
6591
|
mx->mx_cursor.mc_top = 0;
|
6592
|
+
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
|
6537
6593
|
mx->mx_cursor.mc_pg[0] = fp;
|
6538
6594
|
mx->mx_cursor.mc_ki[0] = 0;
|
6539
6595
|
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
@@ -6543,12 +6599,9 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
6543
6599
|
mx->mx_db.md_flags |= MDB_INTEGERKEY;
|
6544
6600
|
}
|
6545
6601
|
}
|
6546
|
-
DPRINTF(("Sub-db
|
6602
|
+
DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
|
6547
6603
|
mx->mx_db.md_root));
|
6548
|
-
mx->mx_dbflag = DB_VALID
|
6549
|
-
DB_DIRTY : 0);
|
6550
|
-
mx->mx_dbx.md_name.mv_data = NODEKEY(node);
|
6551
|
-
mx->mx_dbx.md_name.mv_size = node->mn_ksize;
|
6604
|
+
mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
|
6552
6605
|
#if UINT_MAX < SIZE_MAX
|
6553
6606
|
if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
|
6554
6607
|
#ifdef MISALIGNED_OK
|
@@ -6793,7 +6846,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6793
6846
|
flags = 0;
|
6794
6847
|
} else {
|
6795
6848
|
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
|
6796
|
-
assert(!((
|
6849
|
+
assert(!((size_t)srcnode&1));
|
6797
6850
|
srcpg = NODEPGNO(srcnode);
|
6798
6851
|
flags = srcnode->mn_flags;
|
6799
6852
|
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
@@ -6864,9 +6917,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6864
6917
|
MDB_dbi dbi = csrc->mc_dbi;
|
6865
6918
|
MDB_page *mp = csrc->mc_pg[csrc->mc_top];
|
6866
6919
|
|
6867
|
-
if (csrc->mc_flags & C_SUB)
|
6868
|
-
dbi--;
|
6869
|
-
|
6870
6920
|
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
6871
6921
|
if (csrc->mc_flags & C_SUB)
|
6872
6922
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7041,9 +7091,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
7041
7091
|
MDB_dbi dbi = csrc->mc_dbi;
|
7042
7092
|
MDB_page *mp = cdst->mc_pg[cdst->mc_top];
|
7043
7093
|
|
7044
|
-
if (csrc->mc_flags & C_SUB)
|
7045
|
-
dbi--;
|
7046
|
-
|
7047
7094
|
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7048
7095
|
if (csrc->mc_flags & C_SUB)
|
7049
7096
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7138,13 +7185,11 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7138
7185
|
/* Adjust cursors pointing to mp */
|
7139
7186
|
mc->mc_snum = 0;
|
7140
7187
|
mc->mc_top = 0;
|
7188
|
+
mc->mc_flags &= ~C_INITIALIZED;
|
7141
7189
|
{
|
7142
7190
|
MDB_cursor *m2, *m3;
|
7143
7191
|
MDB_dbi dbi = mc->mc_dbi;
|
7144
7192
|
|
7145
|
-
if (mc->mc_flags & C_SUB)
|
7146
|
-
dbi--;
|
7147
|
-
|
7148
7193
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7149
7194
|
if (mc->mc_flags & C_SUB)
|
7150
7195
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7154,6 +7199,7 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7154
7199
|
if (m3->mc_pg[0] == mp) {
|
7155
7200
|
m3->mc_snum = 0;
|
7156
7201
|
m3->mc_top = 0;
|
7202
|
+
m3->mc_flags &= ~C_INITIALIZED;
|
7157
7203
|
}
|
7158
7204
|
}
|
7159
7205
|
}
|
@@ -7174,9 +7220,6 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7174
7220
|
MDB_cursor *m2, *m3;
|
7175
7221
|
MDB_dbi dbi = mc->mc_dbi;
|
7176
7222
|
|
7177
|
-
if (mc->mc_flags & C_SUB)
|
7178
|
-
dbi--;
|
7179
|
-
|
7180
7223
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7181
7224
|
if (mc->mc_flags & C_SUB)
|
7182
7225
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7184,10 +7227,13 @@ mdb_rebalance(MDB_cursor *mc)
|
|
7184
7227
|
m3 = m2;
|
7185
7228
|
if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
|
7186
7229
|
if (m3->mc_pg[0] == mp) {
|
7187
|
-
|
7188
|
-
m3->mc_snum
|
7189
|
-
m3->mc_top
|
7190
|
-
|
7230
|
+
int i;
|
7231
|
+
m3->mc_snum--;
|
7232
|
+
m3->mc_top--;
|
7233
|
+
for (i=0; i<m3->mc_snum; i++) {
|
7234
|
+
m3->mc_pg[i] = m3->mc_pg[i+1];
|
7235
|
+
m3->mc_ki[i] = m3->mc_ki[i+1];
|
7236
|
+
}
|
7191
7237
|
}
|
7192
7238
|
}
|
7193
7239
|
}
|
@@ -7300,7 +7346,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
|
|
7300
7346
|
|
7301
7347
|
/* Adjust other cursors pointing to mp */
|
7302
7348
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7303
|
-
if (m2 == mc)
|
7349
|
+
if (m2 == mc || m2->mc_snum < mc->mc_snum)
|
7304
7350
|
continue;
|
7305
7351
|
if (!(m2->mc_flags & C_INITIALIZED))
|
7306
7352
|
continue;
|
@@ -7341,7 +7387,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi,
|
|
7341
7387
|
if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
7342
7388
|
return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
|
7343
7389
|
|
7344
|
-
if (key->mv_size
|
7390
|
+
if (key->mv_size > MDB_MAXKEYSIZE) {
|
7345
7391
|
return MDB_BAD_VALSIZE;
|
7346
7392
|
}
|
7347
7393
|
|
@@ -7394,24 +7440,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
|
|
7394
7440
|
unsigned int nflags)
|
7395
7441
|
{
|
7396
7442
|
unsigned int flags;
|
7397
|
-
int rc = MDB_SUCCESS,
|
7443
|
+
int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
|
7398
7444
|
indx_t newindx;
|
7399
7445
|
pgno_t pgno = 0;
|
7400
|
-
|
7446
|
+
int i, j, split_indx, nkeys, pmax;
|
7447
|
+
MDB_env *env = mc->mc_txn->mt_env;
|
7401
7448
|
MDB_node *node;
|
7402
7449
|
MDB_val sepkey, rkey, xdata, *rdata = &xdata;
|
7403
|
-
MDB_page *copy;
|
7450
|
+
MDB_page *copy = NULL;
|
7404
7451
|
MDB_page *mp, *rp, *pp;
|
7405
|
-
|
7452
|
+
int ptop;
|
7406
7453
|
MDB_cursor mn;
|
7407
7454
|
DKBUF;
|
7408
7455
|
|
7409
7456
|
mp = mc->mc_pg[mc->mc_top];
|
7410
7457
|
newindx = mc->mc_ki[mc->mc_top];
|
7458
|
+
nkeys = NUMKEYS(mp);
|
7411
7459
|
|
7412
|
-
DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i",
|
7460
|
+
DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
|
7413
7461
|
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
|
7414
|
-
DKEY(newkey), mc->mc_ki[mc->mc_top]));
|
7462
|
+
DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
|
7415
7463
|
|
7416
7464
|
/* Create a right sibling. */
|
7417
7465
|
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
|
@@ -7458,141 +7506,139 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
|
|
7458
7506
|
sepkey = *newkey;
|
7459
7507
|
split_indx = newindx;
|
7460
7508
|
nkeys = 0;
|
7461
|
-
|
7462
|
-
}
|
7509
|
+
} else {
|
7463
7510
|
|
7464
|
-
|
7465
|
-
|
7466
|
-
|
7467
|
-
|
7468
|
-
|
7469
|
-
|
7470
|
-
|
7471
|
-
|
7472
|
-
|
7473
|
-
|
7474
|
-
|
7475
|
-
|
7476
|
-
|
7477
|
-
|
7478
|
-
|
7479
|
-
|
7480
|
-
|
7481
|
-
|
7482
|
-
|
7483
|
-
|
7484
|
-
|
7485
|
-
|
7486
|
-
|
7487
|
-
|
7488
|
-
|
7489
|
-
|
7490
|
-
|
7491
|
-
|
7492
|
-
|
7493
|
-
|
7494
|
-
|
7495
|
-
|
7496
|
-
|
7497
|
-
|
7511
|
+
split_indx = (nkeys+1) / 2;
|
7512
|
+
|
7513
|
+
if (IS_LEAF2(rp)) {
|
7514
|
+
char *split, *ins;
|
7515
|
+
int x;
|
7516
|
+
unsigned int lsize, rsize, ksize;
|
7517
|
+
/* Move half of the keys to the right sibling */
|
7518
|
+
copy = NULL;
|
7519
|
+
x = mc->mc_ki[mc->mc_top] - split_indx;
|
7520
|
+
ksize = mc->mc_db->md_pad;
|
7521
|
+
split = LEAF2KEY(mp, split_indx, ksize);
|
7522
|
+
rsize = (nkeys - split_indx) * ksize;
|
7523
|
+
lsize = (nkeys - split_indx) * sizeof(indx_t);
|
7524
|
+
mp->mp_lower -= lsize;
|
7525
|
+
rp->mp_lower += lsize;
|
7526
|
+
mp->mp_upper += rsize - lsize;
|
7527
|
+
rp->mp_upper -= rsize - lsize;
|
7528
|
+
sepkey.mv_size = ksize;
|
7529
|
+
if (newindx == split_indx) {
|
7530
|
+
sepkey.mv_data = newkey->mv_data;
|
7531
|
+
} else {
|
7532
|
+
sepkey.mv_data = split;
|
7533
|
+
}
|
7534
|
+
if (x<0) {
|
7535
|
+
ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
|
7536
|
+
memcpy(rp->mp_ptrs, split, rsize);
|
7537
|
+
sepkey.mv_data = rp->mp_ptrs;
|
7538
|
+
memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
|
7539
|
+
memcpy(ins, newkey->mv_data, ksize);
|
7540
|
+
mp->mp_lower += sizeof(indx_t);
|
7541
|
+
mp->mp_upper -= ksize - sizeof(indx_t);
|
7542
|
+
} else {
|
7543
|
+
if (x)
|
7544
|
+
memcpy(rp->mp_ptrs, split, x * ksize);
|
7545
|
+
ins = LEAF2KEY(rp, x, ksize);
|
7546
|
+
memcpy(ins, newkey->mv_data, ksize);
|
7547
|
+
memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
|
7548
|
+
rp->mp_lower += sizeof(indx_t);
|
7549
|
+
rp->mp_upper -= ksize - sizeof(indx_t);
|
7550
|
+
mc->mc_ki[mc->mc_top] = x;
|
7551
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7552
|
+
}
|
7498
7553
|
} else {
|
7499
|
-
|
7500
|
-
|
7501
|
-
|
7502
|
-
|
7503
|
-
|
7504
|
-
|
7505
|
-
|
7506
|
-
|
7507
|
-
mc->mc_pg[mc->mc_top] = rp;
|
7508
|
-
}
|
7509
|
-
goto newsep;
|
7510
|
-
}
|
7554
|
+
int psize, nsize, k;
|
7555
|
+
/* Maximum free space in an empty page */
|
7556
|
+
pmax = env->me_psize - PAGEHDRSZ;
|
7557
|
+
if (IS_LEAF(mp))
|
7558
|
+
nsize = mdb_leaf_size(env, newkey, newdata);
|
7559
|
+
else
|
7560
|
+
nsize = mdb_branch_size(env, newkey);
|
7561
|
+
nsize += nsize & 1;
|
7511
7562
|
|
7512
|
-
|
7513
|
-
|
7514
|
-
|
7515
|
-
|
7516
|
-
|
7517
|
-
|
7518
|
-
|
7519
|
-
|
7520
|
-
|
7521
|
-
|
7522
|
-
|
7523
|
-
|
7524
|
-
|
7525
|
-
|
7526
|
-
|
7527
|
-
|
7528
|
-
|
7529
|
-
|
7530
|
-
|
7531
|
-
|
7532
|
-
|
7533
|
-
|
7534
|
-
|
7535
|
-
|
7536
|
-
|
7537
|
-
|
7538
|
-
|
7539
|
-
|
7540
|
-
|
7541
|
-
|
7542
|
-
|
7543
|
-
|
7544
|
-
|
7545
|
-
|
7546
|
-
|
7563
|
+
/* grab a page to hold a temporary copy */
|
7564
|
+
copy = mdb_page_malloc(mc->mc_txn, 1);
|
7565
|
+
if (copy == NULL)
|
7566
|
+
return ENOMEM;
|
7567
|
+
copy->mp_pgno = mp->mp_pgno;
|
7568
|
+
copy->mp_flags = mp->mp_flags;
|
7569
|
+
copy->mp_lower = PAGEHDRSZ;
|
7570
|
+
copy->mp_upper = env->me_psize;
|
7571
|
+
|
7572
|
+
/* prepare to insert */
|
7573
|
+
for (i=0, j=0; i<nkeys; i++) {
|
7574
|
+
if (i == newindx) {
|
7575
|
+
copy->mp_ptrs[j++] = 0;
|
7576
|
+
}
|
7577
|
+
copy->mp_ptrs[j++] = mp->mp_ptrs[i];
|
7578
|
+
}
|
7579
|
+
|
7580
|
+
/* When items are relatively large the split point needs
|
7581
|
+
* to be checked, because being off-by-one will make the
|
7582
|
+
* difference between success or failure in mdb_node_add.
|
7583
|
+
*
|
7584
|
+
* It's also relevant if a page happens to be laid out
|
7585
|
+
* such that one half of its nodes are all "small" and
|
7586
|
+
* the other half of its nodes are "large." If the new
|
7587
|
+
* item is also "large" and falls on the half with
|
7588
|
+
* "large" nodes, it also may not fit.
|
7589
|
+
*
|
7590
|
+
* As a final tweak, if the new item goes on the last
|
7591
|
+
* spot on the page (and thus, onto the new page), bias
|
7592
|
+
* the split so the new page is emptier than the old page.
|
7593
|
+
* This yields better packing during sequential inserts.
|
7594
|
+
*/
|
7595
|
+
if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
|
7596
|
+
/* Find split point */
|
7597
|
+
psize = 0;
|
7598
|
+
if (newindx <= split_indx || newindx >= nkeys) {
|
7599
|
+
i = 0; j = 1;
|
7600
|
+
k = newindx >= nkeys ? nkeys : split_indx+2;
|
7601
|
+
} else {
|
7602
|
+
i = nkeys; j = -1;
|
7603
|
+
k = split_indx-1;
|
7604
|
+
}
|
7605
|
+
for (; i!=k; i+=j) {
|
7606
|
+
if (i == newindx) {
|
7607
|
+
psize += nsize;
|
7608
|
+
node = NULL;
|
7609
|
+
} else {
|
7610
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
|
7611
|
+
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
7612
|
+
if (IS_LEAF(mp)) {
|
7613
|
+
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
7614
|
+
psize += sizeof(pgno_t);
|
7615
|
+
else
|
7616
|
+
psize += NODEDSZ(node);
|
7547
7617
|
}
|
7548
|
-
|
7549
|
-
split_indx = i;
|
7550
|
-
break;
|
7618
|
+
psize += psize & 1;
|
7551
7619
|
}
|
7552
|
-
|
7553
|
-
|
7554
|
-
psize = nsize;
|
7555
|
-
for (i=nkeys-1; i>=split_indx; i--) {
|
7556
|
-
node = NODEPTR(mp, i);
|
7557
|
-
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
7558
|
-
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
7559
|
-
psize += sizeof(pgno_t);
|
7560
|
-
else
|
7561
|
-
psize += NODEDSZ(node);
|
7562
|
-
psize += psize & 1;
|
7563
|
-
if (psize > pmax) {
|
7564
|
-
if (i >= newindx) {
|
7565
|
-
split_indx = newindx;
|
7566
|
-
newpos = 0;
|
7567
|
-
} else
|
7568
|
-
split_indx = i+1;
|
7620
|
+
if (psize > pmax || i == k-j) {
|
7621
|
+
split_indx = i + (j<0);
|
7569
7622
|
break;
|
7570
7623
|
}
|
7571
7624
|
}
|
7572
7625
|
}
|
7626
|
+
if (split_indx == newindx) {
|
7627
|
+
sepkey.mv_size = newkey->mv_size;
|
7628
|
+
sepkey.mv_data = newkey->mv_data;
|
7629
|
+
} else {
|
7630
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
|
7631
|
+
sepkey.mv_size = node->mn_ksize;
|
7632
|
+
sepkey.mv_data = NODEKEY(node);
|
7633
|
+
}
|
7573
7634
|
}
|
7574
7635
|
}
|
7575
7636
|
|
7576
|
-
|
7577
|
-
* The case where newindx == split_indx is ambiguous; the
|
7578
|
-
* new item could go to the new page or stay on the original
|
7579
|
-
* page. If newpos == 1 it goes to the new page.
|
7580
|
-
*/
|
7581
|
-
if (newindx == split_indx && newpos) {
|
7582
|
-
sepkey.mv_size = newkey->mv_size;
|
7583
|
-
sepkey.mv_data = newkey->mv_data;
|
7584
|
-
} else {
|
7585
|
-
node = NODEPTR(mp, split_indx);
|
7586
|
-
sepkey.mv_size = node->mn_ksize;
|
7587
|
-
sepkey.mv_data = NODEKEY(node);
|
7588
|
-
}
|
7589
|
-
|
7590
|
-
newsep:
|
7591
|
-
DPRINTF(("separator is [%s]", DKEY(&sepkey)));
|
7637
|
+
DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
|
7592
7638
|
|
7593
7639
|
/* Copy separator key to the parent.
|
7594
7640
|
*/
|
7595
|
-
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(
|
7641
|
+
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
|
7596
7642
|
mn.mc_snum--;
|
7597
7643
|
mn.mc_top--;
|
7598
7644
|
did_split = 1;
|
@@ -7637,117 +7683,97 @@ newsep:
|
|
7637
7683
|
return rc;
|
7638
7684
|
for (i=0; i<mc->mc_top; i++)
|
7639
7685
|
mc->mc_ki[i] = mn.mc_ki[i];
|
7640
|
-
|
7641
|
-
|
7642
|
-
|
7643
|
-
|
7644
|
-
|
7645
|
-
|
7646
|
-
|
7686
|
+
} else if (!IS_LEAF2(mp)) {
|
7687
|
+
/* Move nodes */
|
7688
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7689
|
+
i = split_indx;
|
7690
|
+
j = 0;
|
7691
|
+
do {
|
7692
|
+
if (i == newindx) {
|
7693
|
+
rkey.mv_data = newkey->mv_data;
|
7694
|
+
rkey.mv_size = newkey->mv_size;
|
7695
|
+
if (IS_LEAF(mp)) {
|
7696
|
+
rdata = newdata;
|
7697
|
+
} else
|
7698
|
+
pgno = newpgno;
|
7699
|
+
flags = nflags;
|
7700
|
+
/* Update index for the new key. */
|
7701
|
+
mc->mc_ki[mc->mc_top] = j;
|
7702
|
+
} else {
|
7703
|
+
node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
|
7704
|
+
rkey.mv_data = NODEKEY(node);
|
7705
|
+
rkey.mv_size = node->mn_ksize;
|
7706
|
+
if (IS_LEAF(mp)) {
|
7707
|
+
xdata.mv_data = NODEDATA(node);
|
7708
|
+
xdata.mv_size = NODEDSZ(node);
|
7709
|
+
rdata = &xdata;
|
7710
|
+
} else
|
7711
|
+
pgno = NODEPGNO(node);
|
7712
|
+
flags = node->mn_flags;
|
7713
|
+
}
|
7647
7714
|
|
7648
|
-
|
7649
|
-
|
7650
|
-
|
7651
|
-
|
7715
|
+
if (!IS_LEAF(mp) && j == 0) {
|
7716
|
+
/* First branch index doesn't need key data. */
|
7717
|
+
rkey.mv_size = 0;
|
7718
|
+
}
|
7652
7719
|
|
7653
|
-
|
7654
|
-
|
7655
|
-
|
7656
|
-
|
7657
|
-
|
7658
|
-
|
7659
|
-
|
7660
|
-
|
7661
|
-
/* Reset insert index for right sibling. */
|
7662
|
-
if (i != newindx || (newpos ^ ins_new)) {
|
7720
|
+
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
|
7721
|
+
if (rc) {
|
7722
|
+
/* return tmp page to freelist */
|
7723
|
+
mdb_page_free(env, copy);
|
7724
|
+
return rc;
|
7725
|
+
}
|
7726
|
+
if (i == nkeys) {
|
7727
|
+
i = 0;
|
7663
7728
|
j = 0;
|
7664
|
-
mc->mc_pg[mc->mc_top] =
|
7729
|
+
mc->mc_pg[mc->mc_top] = copy;
|
7730
|
+
} else {
|
7731
|
+
i++;
|
7732
|
+
j++;
|
7733
|
+
}
|
7734
|
+
} while (i != split_indx);
|
7735
|
+
|
7736
|
+
nkeys = NUMKEYS(copy);
|
7737
|
+
for (i=0; i<nkeys; i++)
|
7738
|
+
mp->mp_ptrs[i] = copy->mp_ptrs[i];
|
7739
|
+
mp->mp_lower = copy->mp_lower;
|
7740
|
+
mp->mp_upper = copy->mp_upper;
|
7741
|
+
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
|
7742
|
+
env->me_psize - copy->mp_upper);
|
7743
|
+
|
7744
|
+
/* reset back to original page */
|
7745
|
+
if (newindx < split_indx) {
|
7746
|
+
mc->mc_pg[mc->mc_top] = mp;
|
7747
|
+
if (nflags & MDB_RESERVE) {
|
7748
|
+
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
7749
|
+
if (!(node->mn_flags & F_BIGDATA))
|
7750
|
+
newdata->mv_data = NODEDATA(node);
|
7665
7751
|
}
|
7666
|
-
}
|
7667
|
-
|
7668
|
-
if (i == newindx && !ins_new) {
|
7669
|
-
/* Insert the original entry that caused the split. */
|
7670
|
-
rkey.mv_data = newkey->mv_data;
|
7671
|
-
rkey.mv_size = newkey->mv_size;
|
7672
|
-
if (IS_LEAF(mp)) {
|
7673
|
-
rdata = newdata;
|
7674
|
-
} else
|
7675
|
-
pgno = newpgno;
|
7676
|
-
flags = nflags;
|
7677
|
-
|
7678
|
-
ins_new = 1;
|
7679
|
-
|
7680
|
-
/* Update index for the new key. */
|
7681
|
-
mc->mc_ki[mc->mc_top] = j;
|
7682
|
-
} else if (i == nkeys) {
|
7683
|
-
break;
|
7684
7752
|
} else {
|
7685
|
-
|
7686
|
-
|
7687
|
-
|
7688
|
-
|
7689
|
-
|
7690
|
-
|
7691
|
-
|
7692
|
-
|
7693
|
-
|
7694
|
-
|
7695
|
-
|
7696
|
-
|
7697
|
-
}
|
7698
|
-
|
7699
|
-
if (!IS_LEAF(mp) && j == 0) {
|
7700
|
-
/* First branch index doesn't need key data. */
|
7701
|
-
rkey.mv_size = 0;
|
7702
|
-
}
|
7703
|
-
|
7704
|
-
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
|
7705
|
-
if (rc) break;
|
7706
|
-
}
|
7707
|
-
|
7708
|
-
nkeys = NUMKEYS(copy);
|
7709
|
-
for (i=0; i<nkeys; i++)
|
7710
|
-
mp->mp_ptrs[i] = copy->mp_ptrs[i];
|
7711
|
-
mp->mp_lower = copy->mp_lower;
|
7712
|
-
mp->mp_upper = copy->mp_upper;
|
7713
|
-
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
|
7714
|
-
mc->mc_txn->mt_env->me_psize - copy->mp_upper);
|
7715
|
-
|
7716
|
-
/* reset back to original page */
|
7717
|
-
if (newindx < split_indx || (!newpos && newindx == split_indx)) {
|
7718
|
-
mc->mc_pg[mc->mc_top] = mp;
|
7719
|
-
if (nflags & MDB_RESERVE) {
|
7720
|
-
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
7721
|
-
if (!(node->mn_flags & F_BIGDATA))
|
7722
|
-
newdata->mv_data = NODEDATA(node);
|
7723
|
-
}
|
7724
|
-
} else {
|
7725
|
-
mc->mc_ki[ptop]++;
|
7726
|
-
/* Make sure mc_ki is still valid.
|
7727
|
-
*/
|
7728
|
-
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
|
7729
|
-
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
|
7730
|
-
for (i=0; i<ptop; i++) {
|
7731
|
-
mc->mc_pg[i] = mn.mc_pg[i];
|
7732
|
-
mc->mc_ki[i] = mn.mc_ki[i];
|
7753
|
+
mc->mc_pg[mc->mc_top] = rp;
|
7754
|
+
mc->mc_ki[ptop]++;
|
7755
|
+
/* Make sure mc_ki is still valid.
|
7756
|
+
*/
|
7757
|
+
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
|
7758
|
+
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
|
7759
|
+
for (i=0; i<ptop; i++) {
|
7760
|
+
mc->mc_pg[i] = mn.mc_pg[i];
|
7761
|
+
mc->mc_ki[i] = mn.mc_ki[i];
|
7762
|
+
}
|
7763
|
+
mc->mc_pg[ptop] = mn.mc_pg[ptop];
|
7764
|
+
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
|
7733
7765
|
}
|
7734
|
-
mc->mc_pg[ptop] = mn.mc_pg[ptop];
|
7735
|
-
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
|
7736
7766
|
}
|
7767
|
+
/* return tmp page to freelist */
|
7768
|
+
mdb_page_free(env, copy);
|
7737
7769
|
}
|
7738
7770
|
|
7739
|
-
/* return tmp page to freelist */
|
7740
|
-
mdb_page_free(mc->mc_txn->mt_env, copy);
|
7741
|
-
done:
|
7742
7771
|
{
|
7743
7772
|
/* Adjust other cursors pointing to mp */
|
7744
7773
|
MDB_cursor *m2, *m3;
|
7745
7774
|
MDB_dbi dbi = mc->mc_dbi;
|
7746
7775
|
int fixup = NUMKEYS(mp);
|
7747
7776
|
|
7748
|
-
if (mc->mc_flags & C_SUB)
|
7749
|
-
dbi--;
|
7750
|
-
|
7751
7777
|
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7752
7778
|
if (mc->mc_flags & C_SUB)
|
7753
7779
|
m3 = &m2->mc_xcursor->mx_cursor;
|
@@ -7789,6 +7815,7 @@ done:
|
|
7789
7815
|
}
|
7790
7816
|
}
|
7791
7817
|
}
|
7818
|
+
DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
|
7792
7819
|
return rc;
|
7793
7820
|
}
|
7794
7821
|
|
@@ -7805,13 +7832,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
|
|
7805
7832
|
if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
7806
7833
|
return EINVAL;
|
7807
7834
|
|
7808
|
-
if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
7809
|
-
return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
|
7810
|
-
|
7811
|
-
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
7812
|
-
return MDB_BAD_VALSIZE;
|
7813
|
-
}
|
7814
|
-
|
7815
7835
|
if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
|
7816
7836
|
return EINVAL;
|
7817
7837
|
|
@@ -7851,6 +7871,16 @@ mdb_env_get_path(MDB_env *env, const char **arg)
|
|
7851
7871
|
return MDB_SUCCESS;
|
7852
7872
|
}
|
7853
7873
|
|
7874
|
+
int
|
7875
|
+
mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
|
7876
|
+
{
|
7877
|
+
if (!env || !arg)
|
7878
|
+
return EINVAL;
|
7879
|
+
|
7880
|
+
*arg = env->me_fd;
|
7881
|
+
return MDB_SUCCESS;
|
7882
|
+
}
|
7883
|
+
|
7854
7884
|
/** Common code for #mdb_stat() and #mdb_env_stat().
|
7855
7885
|
* @param[in] env the environment to operate in.
|
7856
7886
|
* @param[in] db the #MDB_db record containing the stats to return.
|
@@ -8075,7 +8105,7 @@ mdb_drop0(MDB_cursor *mc, int subs)
|
|
8075
8105
|
{
|
8076
8106
|
int rc;
|
8077
8107
|
|
8078
|
-
rc = mdb_page_search(mc, NULL,
|
8108
|
+
rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
|
8079
8109
|
if (rc == MDB_SUCCESS) {
|
8080
8110
|
MDB_txn *txn = mc->mc_txn;
|
8081
8111
|
MDB_node *ni;
|
@@ -8273,10 +8303,10 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
|
|
8273
8303
|
return 0;
|
8274
8304
|
}
|
8275
8305
|
|
8276
|
-
|
8306
|
+
/** Insert pid into list if not already present.
|
8277
8307
|
* return -1 if already present.
|
8278
8308
|
*/
|
8279
|
-
static int mdb_pid_insert(
|
8309
|
+
static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
|
8280
8310
|
{
|
8281
8311
|
/* binary search of pid in list */
|
8282
8312
|
unsigned base = 0;
|
@@ -8301,7 +8331,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid)
|
|
8301
8331
|
return -1;
|
8302
8332
|
}
|
8303
8333
|
}
|
8304
|
-
|
8334
|
+
|
8305
8335
|
if( val > 0 ) {
|
8306
8336
|
++cursor;
|
8307
8337
|
}
|
@@ -8316,7 +8346,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8316
8346
|
{
|
8317
8347
|
unsigned int i, j, rdrs;
|
8318
8348
|
MDB_reader *mr;
|
8319
|
-
|
8349
|
+
MDB_PID_T *pids, pid;
|
8320
8350
|
int count = 0;
|
8321
8351
|
|
8322
8352
|
if (!env)
|
@@ -8326,7 +8356,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8326
8356
|
if (!env->me_txns)
|
8327
8357
|
return MDB_SUCCESS;
|
8328
8358
|
rdrs = env->me_txns->mti_numreaders;
|
8329
|
-
pids = malloc((rdrs+1) * sizeof(
|
8359
|
+
pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
|
8330
8360
|
if (!pids)
|
8331
8361
|
return ENOMEM;
|
8332
8362
|
pids[0] = 0;
|
@@ -8342,6 +8372,8 @@ int mdb_reader_check(MDB_env *env, int *dead)
|
|
8342
8372
|
if (!mdb_reader_pid(env, Pidcheck, pid)) {
|
8343
8373
|
for (j=i; j<rdrs; j++)
|
8344
8374
|
if (mr[j].mr_pid == pid) {
|
8375
|
+
DPRINTF(("clear stale reader pid %u txn %"Z"d",
|
8376
|
+
(unsigned) pid, mr[j].mr_txnid));
|
8345
8377
|
mr[j].mr_pid = 0;
|
8346
8378
|
count++;
|
8347
8379
|
}
|