lmdb 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c192851303ba3bd76a139142c94322233284d3ed
4
- data.tar.gz: b9482f90f6599d71f49989246fe9f2c526f47986
3
+ metadata.gz: 1e16a42693e5150e076829337a0d44feb46bc3f3
4
+ data.tar.gz: ce0bc77de0cf3c7e17df2522c8eab69e760e1221
5
5
  SHA512:
6
- metadata.gz: e8fa0b74c2854ebd13ba07663ee65be141cefe08ae55075f6fd8817c805e21f48d32c886d687f4e85334958bfa1d9f39935a9764d343755dec3adbd7b3d67b08
7
- data.tar.gz: c760928ef6d4fb1edb6a5a5d5432b9a8823af38e12457efaef74640aef3b7a6e7956bd258cf22b68c54a89136742fc48329c78ac0ce9db2bbfa1b135d8dd3a8a
6
+ metadata.gz: e02924de6a59386ec215b45baf321a6d098d818900f7816a3805185ab5b7441427cc5af866d2d8141eecfd1d166a659ebc2a87d5ae7626a9a574bd4d1bd4bf44
7
+ data.tar.gz: 3ee55c5879fd385e53d25eeffd694dab0de03c5b152fe07562f9b631de25ccd90f09ae886a1d30b7f9abd6ea3a8da3fedac0b1f9d69dbffc3874bfad18570dd9
data/.travis.yml CHANGED
@@ -3,6 +3,7 @@ rvm:
3
3
  - 1.8.7
4
4
  - 1.9.3
5
5
  - 2.0.0
6
+ - 2.1.0
6
7
  - ruby-head
7
8
  - rbx-18mode
8
9
  - rbx-19mode
data/CHANGES CHANGED
@@ -1,3 +1,7 @@
1
+ 0.4.0
2
+
3
+ * Print warnings if open LMDB objects are found during garbage collection
4
+
1
5
  0.3.1
2
6
 
3
7
  * Minor fixes
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # LMDB
2
2
 
3
- Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LLMDB)
3
+ [![Gittip donate button](http://img.shields.io/gittip/bevry.png)](https://www.gittip.com/min4d/ "Donate weekly to this project using Gittip")
4
+ [![Flattr this git repo](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=min4d&url=https://github.com/minad/lmdb&title=LMDB&language=&tags=github&category=software)
5
+
6
+ Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LMDB)
4
7
  http://symas.com/mdb/
5
8
 
6
9
  ### Installation
@@ -1,6 +1,31 @@
1
1
  LMDB 0.9 Change Log
2
2
 
3
- LMDB 0.9.8 Engineering
3
+ LMDB 0.9.10 Release (2013/11/12)
4
+ Add MDB_NOMEMINIT option
5
+ Fix mdb_page_split() again (ITS#7589)
6
+ Fix MDB_NORDAHEAD definition (ITS#7734)
7
+ Fix mdb_cursor_del() positioning (ITS#7733)
8
+ Partial fix for larger page sizes (ITS#7713)
9
+ Fix Windows64/MSVC build issues
10
+
11
+ LMDB 0.9.9 Release (2013/10/24)
12
+ Add mdb_env_get_fd()
13
+ Add MDB_NORDAHEAD option
14
+ Add MDB_NOLOCK option
15
+ Avoid wasting space in mdb_page_split() (ITS#7589)
16
+ Fix mdb_page_merge() cursor fixup (ITS#7722)
17
+ Fix mdb_cursor_del() on last delete (ITS#7718)
18
+ Fix adding WRITEMAP on existing env (ITS#7715)
19
+ Fixes for nested txns (ITS#7515)
20
+ Fix mdb_env_copy() O_DIRECT bug (ITS#7682)
21
+ Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681)
22
+ Fix mdb_rebalance() cursor fixup (ITS#7701)
23
+ Misc code cleanup
24
+ Documentation
25
+ Note that by default, readers need write access
26
+
27
+
28
+ LMDB 0.9.8 Release (2013/09/09)
4
29
  Allow mdb_env_set_mapsize() on an open environment
5
30
  Fix mdb_dbi_flags() (ITS#7672)
6
31
  Fix mdb_page_unspill() in nested txns
@@ -66,6 +66,20 @@
66
66
  * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
67
67
  * Multiple users can cause startup to fail later, as noted above.
68
68
  *
69
+ * - There is normally no pure read-only mode, since readers need write
70
+ * access to locks and lock file. Exceptions: On read-only filesystems
71
+ * or with the #MDB_NOLOCK flag described under #mdb_env_open().
72
+ *
73
+ * - By default, in versions before 0.9.10, unused portions of the data
74
+ * file might receive garbage data from memory freed by other code.
75
+ * (This does not happen when using the #MDB_WRITEMAP flag.) As of
76
+ * 0.9.10 the default behavior is to initialize such memory before
77
+ * writing to the data file. Since there may be a slight performance
78
+ * cost due to this initialization, applications may disable it using
79
+ * the #MDB_NOMEMINIT flag. Applications handling sensitive data
80
+ * which must not be written should not use this flag. This flag is
81
+ * irrelevant when using #MDB_WRITEMAP.
82
+ *
69
83
  * - A thread can only use one transaction at a time, plus any child
70
84
  * transactions. Each transaction belongs to one thread. See below.
71
85
  * The #MDB_NOTLS flag changes this for read-only transactions.
@@ -170,7 +184,7 @@ typedef int mdb_filehandle_t;
170
184
  /** Library minor version */
171
185
  #define MDB_VERSION_MINOR 9
172
186
  /** Library patch version */
173
- #define MDB_VERSION_PATCH 8
187
+ #define MDB_VERSION_PATCH 10
174
188
 
175
189
  /** Combine args a,b,c into a single integer for easy version comparisons */
176
190
  #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@@ -180,7 +194,7 @@ typedef int mdb_filehandle_t;
180
194
  MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
181
195
 
182
196
  /** The release date of this library version */
183
- #define MDB_VERSION_DATE "September 9, 2013"
197
+ #define MDB_VERSION_DATE "November 11, 2013"
184
198
 
185
199
  /** A stringifier for the version info */
186
200
  #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
@@ -216,13 +230,13 @@ typedef struct MDB_cursor MDB_cursor;
216
230
  /** @brief Generic structure used for passing keys and data in and out
217
231
  * of the database.
218
232
  *
219
- * Key sizes must be between 1 and the liblmdb build-time constant
220
- * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
221
- * same applies to data sizes in databases with the #MDB_DUPSORT flag.
222
- * Other data items can in theory be from 0 to 0xffffffff bytes long.
223
- *
224
233
  * Values returned from the database are valid only until a subsequent
225
- * update operation, or the end of the transaction.
234
+ * update operation, or the end of the transaction. Do not modify or
235
+ * free them; they commonly point into the database itself.
236
+ *
237
+ * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
238
+ * The same applies to data sizes in databases with the #MDB_DUPSORT flag.
239
+ * Other data items can in theory be from 0 to 0xffffffff bytes long.
226
240
  */
227
241
  typedef struct MDB_val {
228
242
  size_t mv_size; /**< size of the data item */
@@ -265,10 +279,16 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
265
279
  #define MDB_NOMETASYNC 0x40000
266
280
  /** use writable mmap */
267
281
  #define MDB_WRITEMAP 0x80000
268
- /** use asynchronous msync when MDB_WRITEMAP is used */
282
+ /** use asynchronous msync when #MDB_WRITEMAP is used */
269
283
  #define MDB_MAPASYNC 0x100000
270
284
  /** tie reader locktable slots to #MDB_txn objects instead of to threads */
271
285
  #define MDB_NOTLS 0x200000
286
+ /** don't do any locking, caller must manage their own locks */
287
+ #define MDB_NOLOCK 0x400000
288
+ /** don't do readahead (no effect on Windows) */
289
+ #define MDB_NORDAHEAD 0x800000
290
+ /** don't initialize malloc'd memory before writing to datafile */
291
+ #define MDB_NOMEMINIT 0x1000000
272
292
  /** @} */
273
293
 
274
294
  /** @defgroup mdb_dbi_open Database Flags
@@ -486,6 +506,8 @@ int mdb_env_create(MDB_env **env);
486
506
  * and uses fewer mallocs, but loses protection from application bugs
487
507
  * like wild pointer writes and other bad updates into the database.
488
508
  * Incompatible with nested transactions.
509
+ * Processes with and without MDB_WRITEMAP on the same environment do
510
+ * not cooperate well.
489
511
  * <li>#MDB_NOMETASYNC
490
512
  * Flush system buffers to disk only once per transaction, omit the
491
513
  * metadata flush. Defer that until the system flushes files to disk,
@@ -523,6 +545,38 @@ int mdb_env_create(MDB_env **env);
523
545
  * user threads over individual OS threads need this option. Such an
524
546
  * application must also serialize the write transactions in an OS
525
547
  * thread, since MDB's write locking is unaware of the user threads.
548
+ * <li>#MDB_NOLOCK
549
+ * Don't do any locking. If concurrent access is anticipated, the
550
+ * caller must manage all concurrency itself. For proper operation
551
+ * the caller must enforce single-writer semantics, and must ensure
552
+ * that no readers are using old transactions while a writer is
553
+ * active. The simplest approach is to use an exclusive lock so that
554
+ * no readers may be active at all when a writer begins.
555
+ * <li>#MDB_NORDAHEAD
556
+ * Turn off readahead. Most operating systems perform readahead on
557
+ * read requests by default. This option turns it off if the OS
558
+ * supports it. Turning it off may help random read performance
559
+ * when the DB is larger than RAM and system RAM is full.
560
+ * The option is not implemented on Windows.
561
+ * <li>#MDB_NOMEMINIT
562
+ * Don't initialize malloc'd memory before writing to unused spaces
563
+ * in the data file. By default, memory for pages written to the data
564
+ * file is obtained using malloc. While these pages may be reused in
565
+ * subsequent transactions, freshly malloc'd pages will be initialized
566
+ * to zeroes before use. This avoids persisting leftover data from other
567
+ * code (that used the heap and subsequently freed the memory) into the
568
+ * data file. Note that many other system libraries may allocate
569
+ * and free memory from the heap for arbitrary uses. E.g., stdio may
570
+ * use the heap for file I/O buffers. This initialization step has a
571
+ * modest performance cost so some applications may want to disable
572
+ * it using this flag. This option can be a problem for applications
573
+ * which handle sensitive data like passwords, and it makes memory
574
+ * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
575
+ * which writes directly to the mmap instead of using malloc for pages. The
576
+ * initialization is also skipped if #MDB_RESERVE is used; the
577
+ * caller is expected to overwrite all of the memory that was
578
+ * reserved in that case.
579
+ * This flag may be changed at any time using #mdb_env_set_flags().
526
580
  * </ul>
527
581
  * @param[in] mode The UNIX permissions to set on created files. This parameter
528
582
  * is ignored on Windows.
@@ -656,6 +710,18 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags);
656
710
  */
657
711
  int mdb_env_get_path(MDB_env *env, const char **path);
658
712
 
713
+ /** @brief Return the file descriptor for the given environment.
714
+ *
715
+ * @param[in] env An environment handle returned by #mdb_env_create()
716
+ * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
717
+ * @return A non-zero error value on failure and 0 on success. Some possible
718
+ * errors are:
719
+ * <ul>
720
+ * <li>EINVAL - an invalid parameter was specified.
721
+ * </ul>
722
+ */
723
+ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
724
+
659
725
  /** @brief Set the size of the memory map to use for this environment.
660
726
  *
661
727
  * The size should be a multiple of the OS page size. The default is
@@ -733,8 +799,10 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
733
799
 
734
800
  /** @brief Get the maximum size of a key for the environment.
735
801
  *
802
+ * This is the compile-time constant #MDB_MAXKEYSIZE, default 511.
803
+ * See @ref MDB_val.
736
804
  * @param[in] env An environment handle returned by #mdb_env_create()
737
- * @return The maximum size of a key. (#MDB_MAXKEYSIZE)
805
+ * @return The maximum size of a key
738
806
  */
739
807
  int mdb_env_get_maxkeysize(MDB_env *env);
740
808
 
@@ -1094,6 +1162,8 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
1094
1162
  * reserved space, which the caller can fill in later - before
1095
1163
  * the next update operation or the transaction ends. This saves
1096
1164
  * an extra memcpy if the data is being generated later.
1165
+ * MDB does nothing else with this memory; the caller is expected
1166
+ * to modify all of the space requested.
1097
1167
  * <li>#MDB_APPEND - append the given key/data pair to the end of the
1098
1168
  * database. No key comparisons are performed. This option allows
1099
1169
  * fast bulk loading when keys are already known to be in the
@@ -37,10 +37,26 @@
37
37
  #endif
38
38
  #include <sys/types.h>
39
39
  #include <sys/stat.h>
40
- #include <sys/param.h>
41
40
  #ifdef _WIN32
42
41
  #include <windows.h>
42
+ /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
43
+ * as int64 which is wrong. MSVC doesn't define it at all, so just
44
+ * don't use it.
45
+ */
46
+ #define MDB_PID_T int
47
+ #ifdef __GNUC__
48
+ # include <sys/param.h>
43
49
  #else
50
+ # define LITTLE_ENDIAN 1234
51
+ # define BIG_ENDIAN 4321
52
+ # define BYTE_ORDER LITTLE_ENDIAN
53
+ # ifndef SSIZE_MAX
54
+ # define SSIZE_MAX INT_MAX
55
+ # endif
56
+ #endif
57
+ #else
58
+ #define MDB_PID_T pid_t
59
+ #include <sys/param.h>
44
60
  #include <sys/uio.h>
45
61
  #include <sys/mman.h>
46
62
  #ifdef HAVE_SYS_FILE_H
@@ -75,6 +91,7 @@
75
91
  #ifndef _WIN32
76
92
  #include <pthread.h>
77
93
  #ifdef MDB_USE_POSIX_SEM
94
+ # define MDB_USE_HASH 1
78
95
  #include <semaphore.h>
79
96
  #endif
80
97
  #endif
@@ -140,6 +157,7 @@
140
157
  * @{
141
158
  */
142
159
  #ifdef _WIN32
160
+ #define MDB_USE_HASH 1
143
161
  #define MDB_PIDLOCK 0
144
162
  #define pthread_t DWORD
145
163
  #define pthread_mutex_t HANDLE
@@ -171,7 +189,7 @@
171
189
  #define Z "I"
172
190
  #else
173
191
 
174
- #define Z "z"
192
+ #define Z "z" /**< printf format modifier for size_t */
175
193
 
176
194
  /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
177
195
  #define MDB_PIDLOCK 1
@@ -317,12 +335,18 @@ static txnid_t mdb_debug_start;
317
335
  * The string is printed literally, with no format processing.
318
336
  */
319
337
  #define DPUTS(arg) DPRINTF(("%s", arg))
338
+ /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
339
+ #define DDBI(mc) \
340
+ (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
320
341
  /** @} */
321
342
 
322
- /** A default memory page size.
323
- * The actual size is platform-dependent, but we use this for
324
- * boot-strapping. We probably should not be using this any more.
325
- * The #GET_PAGESIZE() macro is used to get the actual size.
343
+ /** @brief The maximum size of a database page.
344
+ *
345
+ * This is 32k, since it must fit in #MDB_page.#mp_upper.
346
+ *
347
+ * LMDB will use database pages < OS pages if needed.
348
+ * That causes more I/O in write transactions: The OS must
349
+ * know (read) the whole page before writing a partial page.
326
350
  *
327
351
  * Note that we don't currently support Huge pages. On Linux,
328
352
  * regular data files cannot use Huge pages, and in general
@@ -331,7 +355,7 @@ static txnid_t mdb_debug_start;
331
355
  * pressure from other processes is high. So until OSs have
332
356
  * actual paging support for Huge pages, they're not viable.
333
357
  */
334
- #define MDB_PAGESIZE 4096
358
+ #define MAX_PAGESIZE 0x8000
335
359
 
336
360
  /** The minimum number of keys required in a database page.
337
361
  * Setting this to a larger value will place a smaller bound on the
@@ -365,7 +389,7 @@ static txnid_t mdb_debug_start;
365
389
  *
366
390
  * We require that keys all fit onto a regular page. This limit
367
391
  * could be raised a bit further if needed; to something just
368
- * under #MDB_PAGESIZE / #MDB_MINKEYS.
392
+ * under (page size / #MDB_MINKEYS / 3).
369
393
  *
370
394
  * Note that data items in an #MDB_DUPSORT database are actually keys
371
395
  * of a subDB, so they're also limited to this size.
@@ -425,7 +449,8 @@ typedef uint16_t indx_t;
425
449
  *
426
450
  * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
427
451
  *
428
- * No reader table is used if the database is on a read-only filesystem.
452
+ * No reader table is used if the database is on a read-only filesystem, or
453
+ * if #MDB_NOLOCK is set.
429
454
  *
430
455
  * Since the database uses multi-version concurrency control, readers don't
431
456
  * actually need any locking. This table is used to keep track of which
@@ -488,7 +513,7 @@ typedef struct MDB_rxbody {
488
513
  */
489
514
  txnid_t mrb_txnid;
490
515
  /** The process ID of the process owning this reader txn. */
491
- pid_t mrb_pid;
516
+ MDB_PID_T mrb_pid;
492
517
  /** The thread ID of the thread owning this txn. */
493
518
  pthread_t mrb_tid;
494
519
  } MDB_rxbody;
@@ -600,7 +625,7 @@ typedef struct MDB_page {
600
625
  #define P_LEAF 0x02 /**< leaf page */
601
626
  #define P_OVERFLOW 0x04 /**< overflow page */
602
627
  #define P_META 0x08 /**< meta page */
603
- #define P_DIRTY 0x10 /**< dirty page */
628
+ #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
604
629
  #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
605
630
  #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
606
631
  #define P_KEEP 0x8000 /**< leave this page alone during spill */
@@ -786,7 +811,10 @@ typedef struct MDB_db {
786
811
  /** Handle for the default DB. */
787
812
  #define MAIN_DBI 1
788
813
 
789
- /** Meta page content. */
814
+ /** Meta page content.
815
+ * A meta page is the start point for accessing a database snapshot.
816
+ * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
817
+ */
790
818
  typedef struct MDB_meta {
791
819
  /** Stamp identifying this as an MDB file. It must be set
792
820
  * to #MDB_MAGIC. */
@@ -804,19 +832,18 @@ typedef struct MDB_meta {
804
832
  txnid_t mm_txnid; /**< txnid that committed this page */
805
833
  } MDB_meta;
806
834
 
807
- /** Buffer for a stack-allocated dirty page.
835
+ /** Buffer for a stack-allocated meta page.
808
836
  * The members define size and alignment, and silence type
809
837
  * aliasing warnings. They are not used directly; that could
810
838
  * mean incorrectly using several union members in parallel.
811
839
  */
812
- typedef union MDB_pagebuf {
813
- char mb_raw[MDB_PAGESIZE];
840
+ typedef union MDB_metabuf {
814
841
  MDB_page mb_page;
815
842
  struct {
816
843
  char mm_pad[PAGEHDRSZ];
817
844
  MDB_meta mm_meta;
818
845
  } mb_metabuf;
819
- } MDB_pagebuf;
846
+ } MDB_metabuf;
820
847
 
821
848
  /** Auxiliary DB info.
822
849
  * The information here is mostly static/read-only. There is
@@ -865,9 +892,9 @@ struct MDB_txn {
865
892
  * @ingroup internal
866
893
  * @{
867
894
  */
868
- #define DB_DIRTY 0x01 /**< DB was written in this txn */
869
- #define DB_STALE 0x02 /**< DB record is older than txnID */
870
- #define DB_NEW 0x04 /**< DB handle opened in this txn */
895
+ #define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */
896
+ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */
897
+ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
871
898
  #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
872
899
  /** @} */
873
900
  /** In write txns, array of cursors for each DB */
@@ -889,12 +916,12 @@ struct MDB_txn {
889
916
  #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
890
917
  /** @} */
891
918
  unsigned int mt_flags; /**< @ref mdb_txn */
892
- /** dirty_list maxsize - # of allocated pages allowed, including in parent txns */
893
- unsigned int mt_dirty_room;
894
- /** Tracks which of the two meta pages was used at the start
895
- * of this transaction.
919
+ /** dirty_list room: Array size - #dirty pages visible to this txn.
920
+ * Includes ancestor txns' dirty pages not hidden by other txns'
921
+ * dirty/spilled pages. Thus commit(nested txn) has room to merge
922
+ * dirty_list into mt_parent after freeing hidden mt_parent pages.
896
923
  */
897
- unsigned int mt_toggle;
924
+ unsigned int mt_dirty_room;
898
925
  };
899
926
 
900
927
  /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
@@ -905,7 +932,14 @@ struct MDB_txn {
905
932
 
906
933
  struct MDB_xcursor;
907
934
 
908
- /** Cursors are used for all DB operations */
935
+ /** Cursors are used for all DB operations.
936
+ * A cursor holds a path of (page pointer, key index) from the DB
937
+ * root to a position in the DB, plus other state. #MDB_DUPSORT
938
+ * cursors include an xcursor to the current data item. Write txns
939
+ * track their cursors and keep them up to date when data moves.
940
+ * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
941
+ * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
942
+ */
909
943
  struct MDB_cursor {
910
944
  /** Next cursor on this DB in this txn */
911
945
  MDB_cursor *mc_next;
@@ -978,16 +1012,18 @@ struct MDB_env {
978
1012
  /** Have liveness lock in reader table */
979
1013
  #define MDB_LIVE_READER 0x08000000U
980
1014
  uint32_t me_flags; /**< @ref mdb_env */
981
- unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
1015
+ unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1016
+ unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
982
1017
  unsigned int me_maxreaders; /**< size of the reader table */
983
1018
  unsigned int me_numreaders; /**< max numreaders set by this env */
984
1019
  MDB_dbi me_numdbs; /**< number of DBs opened */
985
1020
  MDB_dbi me_maxdbs; /**< size of the DB table */
986
- pid_t me_pid; /**< process ID of this env */
1021
+ MDB_PID_T me_pid; /**< process ID of this env */
987
1022
  char *me_path; /**< path to the DB files */
988
1023
  char *me_map; /**< the memory map of the data file */
989
1024
  MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
990
1025
  MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1026
+ void *me_pbuf; /**< scratch area for DUPSORT put() */
991
1027
  MDB_txn *me_txn; /**< current write transaction */
992
1028
  size_t me_mapsize; /**< size of the data memory map */
993
1029
  off_t me_size; /**< current file size */
@@ -1019,8 +1055,8 @@ struct MDB_env {
1019
1055
 
1020
1056
  /** Nested transaction */
1021
1057
  typedef struct MDB_ntxn {
1022
- MDB_txn mnt_txn; /* the transaction */
1023
- MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */
1058
+ MDB_txn mnt_txn; /**< the transaction */
1059
+ MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
1024
1060
  } MDB_ntxn;
1025
1061
 
1026
1062
  /** max number of pages to commit in one writev() call */
@@ -1042,6 +1078,8 @@ static int mdb_page_search_root(MDB_cursor *mc,
1042
1078
  MDB_val *key, int modify);
1043
1079
  #define MDB_PS_MODIFY 1
1044
1080
  #define MDB_PS_ROOTONLY 2
1081
+ #define MDB_PS_FIRST 4
1082
+ #define MDB_PS_LAST 8
1045
1083
  static int mdb_page_search(MDB_cursor *mc,
1046
1084
  MDB_val *key, int flags);
1047
1085
  static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
@@ -1255,7 +1293,7 @@ static void mdb_audit(MDB_txn *txn)
1255
1293
  txn->mt_dbs[i].md_leaf_pages +
1256
1294
  txn->mt_dbs[i].md_overflow_pages;
1257
1295
  if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1258
- mdb_page_search(&mc, NULL, 0);
1296
+ mdb_page_search(&mc, NULL, MDB_PS_FIRST);
1259
1297
  do {
1260
1298
  unsigned j;
1261
1299
  MDB_page *mp;
@@ -1300,7 +1338,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1300
1338
  {
1301
1339
  MDB_env *env = txn->mt_env;
1302
1340
  MDB_page *ret = env->me_dpages;
1303
- size_t sz = env->me_psize;
1341
+ size_t psize = env->me_psize, sz = psize, off;
1342
+ /* For ! #MDB_NOMEMINIT, psize counts how much to init.
1343
+ * For a single page alloc, we init everything after the page header.
1344
+ * For multi-page, we init the final page; if the caller needed that
1345
+ * many pages they will be filling in at least up to the last page.
1346
+ */
1304
1347
  if (num == 1) {
1305
1348
  if (ret) {
1306
1349
  VGMEMP_ALLOC(env, ret, sz);
@@ -1308,10 +1351,16 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1308
1351
  env->me_dpages = ret->mp_next;
1309
1352
  return ret;
1310
1353
  }
1354
+ psize -= off = PAGEHDRSZ;
1311
1355
  } else {
1312
1356
  sz *= num;
1357
+ off = sz - psize;
1313
1358
  }
1314
1359
  if ((ret = malloc(sz)) != NULL) {
1360
+ if (!(env->me_flags & MDB_NOMEMINIT)) {
1361
+ memset((char *)ret + off, 0, psize);
1362
+ ret->mp_pad = 0;
1363
+ }
1315
1364
  VGMEMP_ALLOC(env, ret, sz);
1316
1365
  }
1317
1366
  return ret;
@@ -1329,7 +1378,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
1329
1378
  env->me_dpages = mp;
1330
1379
  }
1331
1380
 
1332
- /* Free a dirty page */
1381
+ /** Free a dirty page */
1333
1382
  static void
1334
1383
  mdb_dpage_free(MDB_env *env, MDB_page *dp)
1335
1384
  {
@@ -1356,7 +1405,7 @@ mdb_dlist_free(MDB_txn *txn)
1356
1405
  dl[0].mid = 0;
1357
1406
  }
1358
1407
 
1359
- /* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1408
+ /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1360
1409
  * @param[in] mc A cursor handle for the current operation.
1361
1410
  * @param[in] pflags Flags of the pages to update:
1362
1411
  * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
@@ -1366,10 +1415,12 @@ mdb_dlist_free(MDB_txn *txn)
1366
1415
  static int
1367
1416
  mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1368
1417
  {
1418
+ enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
1369
1419
  MDB_txn *txn = mc->mc_txn;
1370
1420
  MDB_cursor *m3;
1371
1421
  MDB_xcursor *mx;
1372
- MDB_page *dp;
1422
+ MDB_page *dp, *mp;
1423
+ MDB_node *leaf;
1373
1424
  unsigned i, j;
1374
1425
  int rc = MDB_SUCCESS, level;
1375
1426
 
@@ -1378,14 +1429,24 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1378
1429
  mc = NULL; /* will find mc in mt_cursors */
1379
1430
  for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
1380
1431
  for (; mc; mc=mc->mc_next) {
1381
- for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) {
1382
- for (j=0; j<m3->mc_snum; j++)
1383
- if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP))
1384
- == pflags)
1385
- m3->mc_pg[j]->mp_flags ^= P_KEEP;
1386
- mx = m3->mc_xcursor;
1387
- if (mx == NULL)
1388
- break;
1432
+ if (!(mc->mc_flags & C_INITIALIZED))
1433
+ continue;
1434
+ for (m3 = mc;; m3 = &mx->mx_cursor) {
1435
+ mp = NULL;
1436
+ for (j=0; j<m3->mc_snum; j++) {
1437
+ mp = m3->mc_pg[j];
1438
+ if ((mp->mp_flags & Mask) == pflags)
1439
+ mp->mp_flags ^= P_KEEP;
1440
+ }
1441
+ mx = m3->mc_xcursor;
1442
+ /* Proceed to mx if it is at a sub-database */
1443
+ if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
1444
+ break;
1445
+ if (! (mp && (mp->mp_flags & P_LEAF)))
1446
+ break;
1447
+ leaf = NODEPTR(mp, m3->mc_ki[j-1]);
1448
+ if (!(leaf->mn_flags & F_SUBDATA))
1449
+ break;
1389
1450
  }
1390
1451
  }
1391
1452
  if (i == 0)
@@ -1401,7 +1462,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1401
1462
  continue;
1402
1463
  if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
1403
1464
  break;
1404
- if ((dp->mp_flags & (P_DIRTY|P_KEEP)) == pflags && level <= 1)
1465
+ if ((dp->mp_flags & Mask) == pflags && level <= 1)
1405
1466
  dp->mp_flags ^= P_KEEP;
1406
1467
  }
1407
1468
  }
@@ -1415,15 +1476,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
1415
1476
  /** Spill pages from the dirty list back to disk.
1416
1477
  * This is intended to prevent running into #MDB_TXN_FULL situations,
1417
1478
  * but note that they may still occur in a few cases:
1418
- * 1) pages in #MDB_DUPSORT sub-DBs are never spilled, so if there
1419
- * are too many of these dirtied in one txn, the txn may still get
1420
- * too full.
1479
+ * 1) our estimate of the txn size could be too small. Currently this
1480
+ * seems unlikely, except with a large number of #MDB_MULTIPLE items.
1421
1481
  * 2) child txns may run out of space if their parents dirtied a
1422
1482
  * lot of pages and never spilled them. TODO: we probably should do
1423
1483
  * a preemptive spill during #mdb_txn_begin() of a child txn, if
1424
1484
  * the parent's dirty_room is below a given threshold.
1425
- * 3) our estimate of the txn size could be too small. At the
1426
- * moment this seems unlikely.
1427
1485
  *
1428
1486
  * Otherwise, if not using nested txns, it is expected that apps will
1429
1487
  * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
@@ -1541,31 +1599,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1541
1599
  rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
1542
1600
 
1543
1601
  done:
1544
- if (rc == 0) {
1545
- if (txn->mt_parent) {
1546
- txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid;
1547
- /* dirty pages that are dirty in an ancestor don't
1548
- * count against this txn's dirty_room.
1549
- */
1550
- for (i=1; i<=dl[0].mid; i++) {
1551
- pgno_t pgno = dl[i].mid;
1552
- MDB_txn *tx2;
1553
- for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1554
- j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
1555
- if (j <= tx2->mt_u.dirty_list[0].mid &&
1556
- tx2->mt_u.dirty_list[j].mid == pgno) {
1557
- txn->mt_dirty_room++;
1558
- break;
1559
- }
1560
- }
1561
- }
1562
- } else {
1563
- txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid;
1564
- }
1565
- txn->mt_flags |= MDB_TXN_SPILLS;
1566
- } else {
1567
- txn->mt_flags |= MDB_TXN_ERROR;
1568
- }
1602
+ txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
1569
1603
  return rc;
1570
1604
  }
1571
1605
 
@@ -1575,12 +1609,14 @@ mdb_find_oldest(MDB_txn *txn)
1575
1609
  {
1576
1610
  int i;
1577
1611
  txnid_t mr, oldest = txn->mt_txnid - 1;
1578
- MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1579
- for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1580
- if (r[i].mr_pid) {
1581
- mr = r[i].mr_txnid;
1582
- if (oldest > mr)
1583
- oldest = mr;
1612
+ if (txn->mt_env->me_txns) {
1613
+ MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1614
+ for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1615
+ if (r[i].mr_pid) {
1616
+ mr = r[i].mr_txnid;
1617
+ if (oldest > mr)
1618
+ oldest = mr;
1619
+ }
1584
1620
  }
1585
1621
  }
1586
1622
  return oldest;
@@ -1790,26 +1826,28 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1790
1826
  /** Pull a page off the txn's spill list, if present.
1791
1827
  * If a page being referenced was spilled to disk in this txn, bring
1792
1828
  * it back and make it dirty/writable again.
1793
- * @param[in] tx0 the transaction handle.
1829
+ * @param[in] txn the transaction handle.
1794
1830
  * @param[in] mp the page being referenced.
1795
1831
  * @param[out] ret the writable page, if any. ret is unchanged if
1796
1832
  * mp wasn't spilled.
1797
1833
  */
1798
1834
  static int
1799
- mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1835
+ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
1800
1836
  {
1801
- MDB_env *env = tx0->mt_env;
1802
- MDB_txn *txn;
1837
+ MDB_env *env = txn->mt_env;
1838
+ const MDB_txn *tx2;
1803
1839
  unsigned x;
1804
1840
  pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
1805
1841
 
1806
- for (txn = tx0; txn; txn=txn->mt_parent) {
1807
- if (!txn->mt_spill_pgs)
1842
+ for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
1843
+ if (!tx2->mt_spill_pgs)
1808
1844
  continue;
1809
- x = mdb_midl_search(txn->mt_spill_pgs, pn);
1810
- if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) {
1845
+ x = mdb_midl_search(tx2->mt_spill_pgs, pn);
1846
+ if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
1811
1847
  MDB_page *np;
1812
1848
  int num;
1849
+ if (txn->mt_dirty_room == 0)
1850
+ return MDB_TXN_FULL;
1813
1851
  if (IS_OVERFLOW(mp))
1814
1852
  num = mp->mp_pages;
1815
1853
  else
@@ -1825,7 +1863,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1825
1863
  else
1826
1864
  mdb_page_copy(np, mp, env->me_psize);
1827
1865
  }
1828
- if (txn == tx0) {
1866
+ if (tx2 == txn) {
1829
1867
  /* If in current txn, this page is no longer spilled.
1830
1868
  * If it happens to be the last page, truncate the spill list.
1831
1869
  * Otherwise mark it as deleted by setting the LSB.
@@ -1838,22 +1876,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1838
1876
  * page remains spilled until child commits
1839
1877
  */
1840
1878
 
1841
- if (txn->mt_parent) {
1842
- MDB_txn *tx2;
1843
- /* If this page is also in a parent's dirty list, then
1844
- * it's already accounted in dirty_room, and we need to
1845
- * cancel out the decrement that mdb_page_dirty does.
1846
- */
1847
- for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1848
- x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
1849
- if (x <= tx2->mt_u.dirty_list[0].mid &&
1850
- tx2->mt_u.dirty_list[x].mid == pgno) {
1851
- tx0->mt_dirty_room++;
1852
- break;
1853
- }
1854
- }
1855
- }
1856
- mdb_page_dirty(tx0, np);
1879
+ mdb_page_dirty(txn, np);
1857
1880
  np->mp_flags |= P_DIRTY;
1858
1881
  *ret = np;
1859
1882
  break;
@@ -1872,7 +1895,6 @@ mdb_page_touch(MDB_cursor *mc)
1872
1895
  MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
1873
1896
  MDB_txn *txn = mc->mc_txn;
1874
1897
  MDB_cursor *m2, *m3;
1875
- MDB_dbi dbi;
1876
1898
  pgno_t pgno;
1877
1899
  int rc;
1878
1900
 
@@ -1889,7 +1911,8 @@ mdb_page_touch(MDB_cursor *mc)
1889
1911
  (rc = mdb_page_alloc(mc, 1, &np)))
1890
1912
  return rc;
1891
1913
  pgno = np->mp_pgno;
1892
- DPRINTF(("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno));
1914
+ DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
1915
+ mp->mp_pgno, pgno));
1893
1916
  assert(mp->mp_pgno != pgno);
1894
1917
  mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
1895
1918
  /* Update the parent page, if any, to point to the new page */
@@ -1935,17 +1958,16 @@ mdb_page_touch(MDB_cursor *mc)
1935
1958
  done:
1936
1959
  /* Adjust cursors pointing to mp */
1937
1960
  mc->mc_pg[mc->mc_top] = np;
1938
- dbi = mc->mc_dbi;
1961
+ m2 = txn->mt_cursors[mc->mc_dbi];
1939
1962
  if (mc->mc_flags & C_SUB) {
1940
- dbi--;
1941
- for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
1963
+ for (; m2; m2=m2->mc_next) {
1942
1964
  m3 = &m2->mc_xcursor->mx_cursor;
1943
1965
  if (m3->mc_snum < mc->mc_snum) continue;
1944
1966
  if (m3->mc_pg[mc->mc_top] == mp)
1945
1967
  m3->mc_pg[mc->mc_top] = np;
1946
1968
  }
1947
1969
  } else {
1948
- for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
1970
+ for (; m2; m2=m2->mc_next) {
1949
1971
  if (m2->mc_snum < mc->mc_snum) continue;
1950
1972
  if (m2->mc_pg[mc->mc_top] == mp) {
1951
1973
  m2->mc_pg[mc->mc_top] = np;
@@ -2087,7 +2109,7 @@ enum Pidlock_op {
2087
2109
  * lock on the lockfile, set at an offset equal to the pid.
2088
2110
  */
2089
2111
  static int
2090
- mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
2112
+ mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2091
2113
  {
2092
2114
  #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2093
2115
  int ret = 0;
@@ -2130,7 +2152,9 @@ static int
2130
2152
  mdb_txn_renew0(MDB_txn *txn)
2131
2153
  {
2132
2154
  MDB_env *env = txn->mt_env;
2133
- unsigned int i;
2155
+ MDB_txninfo *ti = env->me_txns;
2156
+ MDB_meta *meta;
2157
+ unsigned int i, nr;
2134
2158
  uint16_t x;
2135
2159
  int rc, new_notls = 0;
2136
2160
 
@@ -2139,9 +2163,9 @@ mdb_txn_renew0(MDB_txn *txn)
2139
2163
  txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
2140
2164
 
2141
2165
  if (txn->mt_flags & MDB_TXN_RDONLY) {
2142
- if (!env->me_txns) {
2143
- i = mdb_env_pick_meta(env);
2144
- txn->mt_txnid = env->me_metas[i]->mm_txnid;
2166
+ if (!ti) {
2167
+ meta = env->me_metas[ mdb_env_pick_meta(env) ];
2168
+ txn->mt_txnid = meta->mm_txnid;
2145
2169
  txn->mt_u.reader = NULL;
2146
2170
  } else {
2147
2171
  MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
@@ -2150,7 +2174,7 @@ mdb_txn_renew0(MDB_txn *txn)
2150
2174
  if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2151
2175
  return MDB_BAD_RSLOT;
2152
2176
  } else {
2153
- pid_t pid = env->me_pid;
2177
+ MDB_PID_T pid = env->me_pid;
2154
2178
  pthread_t tid = pthread_self();
2155
2179
 
2156
2180
  if (!(env->me_flags & MDB_LIVE_READER)) {
@@ -2163,36 +2187,43 @@ mdb_txn_renew0(MDB_txn *txn)
2163
2187
  }
2164
2188
 
2165
2189
  LOCK_MUTEX_R(env);
2166
- for (i=0; i<env->me_txns->mti_numreaders; i++)
2167
- if (env->me_txns->mti_readers[i].mr_pid == 0)
2190
+ nr = ti->mti_numreaders;
2191
+ for (i=0; i<nr; i++)
2192
+ if (ti->mti_readers[i].mr_pid == 0)
2168
2193
  break;
2169
2194
  if (i == env->me_maxreaders) {
2170
2195
  UNLOCK_MUTEX_R(env);
2171
2196
  return MDB_READERS_FULL;
2172
2197
  }
2173
- env->me_txns->mti_readers[i].mr_pid = pid;
2174
- env->me_txns->mti_readers[i].mr_tid = tid;
2175
- if (i >= env->me_txns->mti_numreaders)
2176
- env->me_txns->mti_numreaders = i+1;
2198
+ ti->mti_readers[i].mr_pid = pid;
2199
+ ti->mti_readers[i].mr_tid = tid;
2200
+ if (i == nr)
2201
+ ti->mti_numreaders = ++nr;
2177
2202
  /* Save numreaders for un-mutexed mdb_env_close() */
2178
- env->me_numreaders = env->me_txns->mti_numreaders;
2203
+ env->me_numreaders = nr;
2179
2204
  UNLOCK_MUTEX_R(env);
2180
- r = &env->me_txns->mti_readers[i];
2205
+
2206
+ r = &ti->mti_readers[i];
2181
2207
  new_notls = (env->me_flags & MDB_NOTLS);
2182
2208
  if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
2183
2209
  r->mr_pid = 0;
2184
2210
  return rc;
2185
2211
  }
2186
2212
  }
2187
- txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
2213
+ txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
2188
2214
  txn->mt_u.reader = r;
2215
+ meta = env->me_metas[txn->mt_txnid & 1];
2189
2216
  }
2190
- txn->mt_toggle = txn->mt_txnid & 1;
2191
2217
  } else {
2192
- LOCK_MUTEX_W(env);
2218
+ if (ti) {
2219
+ LOCK_MUTEX_W(env);
2193
2220
 
2194
- txn->mt_txnid = env->me_txns->mti_txnid;
2195
- txn->mt_toggle = txn->mt_txnid & 1;
2221
+ txn->mt_txnid = ti->mti_txnid;
2222
+ meta = env->me_metas[txn->mt_txnid & 1];
2223
+ } else {
2224
+ meta = env->me_metas[ mdb_env_pick_meta(env) ];
2225
+ txn->mt_txnid = meta->mm_txnid;
2226
+ }
2196
2227
  txn->mt_txnid++;
2197
2228
  #if MDB_DEBUG
2198
2229
  if (txn->mt_txnid == mdb_debug_start)
@@ -2208,10 +2239,10 @@ mdb_txn_renew0(MDB_txn *txn)
2208
2239
  }
2209
2240
 
2210
2241
  /* Copy the DB info and flags */
2211
- memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
2242
+ memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
2212
2243
 
2213
2244
  /* Moved to here to avoid a data race in read TXNs */
2214
- txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
2245
+ txn->mt_next_pgno = meta->mm_last_pg+1;
2215
2246
 
2216
2247
  for (i=2; i<txn->mt_numdbs; i++) {
2217
2248
  x = env->me_dbflags[i];
@@ -2307,7 +2338,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2307
2338
  return ENOMEM;
2308
2339
  }
2309
2340
  txn->mt_txnid = parent->mt_txnid;
2310
- txn->mt_toggle = parent->mt_toggle;
2311
2341
  txn->mt_dirty_room = parent->mt_dirty_room;
2312
2342
  txn->mt_u.dirty_list[0].mid = 0;
2313
2343
  txn->mt_spill_pgs = NULL;
@@ -2433,7 +2463,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act)
2433
2463
 
2434
2464
  env->me_txn = NULL;
2435
2465
  /* The writer mutex was locked in mdb_txn_begin. */
2436
- UNLOCK_MUTEX_W(env);
2466
+ if (env->me_txns)
2467
+ UNLOCK_MUTEX_W(env);
2437
2468
  }
2438
2469
  }
2439
2470
 
@@ -2482,20 +2513,26 @@ mdb_freelist_save(MDB_txn *txn)
2482
2513
  int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
2483
2514
  txnid_t pglast = 0, head_id = 0;
2484
2515
  pgno_t freecnt = 0, *free_pgs, *mop;
2485
- ssize_t head_room = 0, total_room = 0, mop_len;
2516
+ ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
2486
2517
 
2487
2518
  mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
2488
2519
 
2489
2520
  if (env->me_pghead) {
2490
2521
  /* Make sure first page of freeDB is touched and on freelist */
2491
- rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
2522
+ rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
2492
2523
  if (rc && rc != MDB_NOTFOUND)
2493
2524
  return rc;
2494
2525
  }
2495
2526
 
2527
+ /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2528
+ clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2529
+ ? SSIZE_MAX : maxfree_1pg;
2530
+
2496
2531
  for (;;) {
2497
2532
  /* Come back here after each Put() in case freelist changed */
2498
2533
  MDB_val key, data;
2534
+ pgno_t *pgs;
2535
+ ssize_t j;
2499
2536
 
2500
2537
  /* If using records from freeDB which we have not yet
2501
2538
  * deleted, delete them and any we reserved for me_pghead.
@@ -2516,9 +2553,7 @@ mdb_freelist_save(MDB_txn *txn)
2516
2553
  if (freecnt < txn->mt_free_pgs[0]) {
2517
2554
  if (!freecnt) {
2518
2555
  /* Make sure last page of freeDB is touched and on freelist */
2519
- key.mv_size = MDB_MAXKEYSIZE+1;
2520
- key.mv_data = NULL;
2521
- rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
2556
+ rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
2522
2557
  if (rc && rc != MDB_NOTFOUND)
2523
2558
  return rc;
2524
2559
  }
@@ -2581,11 +2616,16 @@ mdb_freelist_save(MDB_txn *txn)
2581
2616
  rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2582
2617
  if (rc)
2583
2618
  return rc;
2584
- *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
2619
+ /* IDL is initially empty, zero out at least the length */
2620
+ pgs = (pgno_t *)data.mv_data;
2621
+ j = head_room > clean_limit ? head_room : 0;
2622
+ do {
2623
+ pgs[j] = 0;
2624
+ } while (--j >= 0);
2585
2625
  total_room += head_room;
2586
2626
  }
2587
2627
 
2588
- /* Fill in the reserved, touched me_pghead records */
2628
+ /* Fill in the reserved me_pghead records */
2589
2629
  rc = MDB_SUCCESS;
2590
2630
  if (mop_len) {
2591
2631
  MDB_val key, data;
@@ -2655,8 +2695,7 @@ mdb_page_flush(MDB_txn *txn, int keep)
2655
2695
  }
2656
2696
  dp->mp_flags &= ~P_DIRTY;
2657
2697
  }
2658
- dl[0].mid = j;
2659
- return MDB_SUCCESS;
2698
+ goto done;
2660
2699
  }
2661
2700
 
2662
2701
  /* Write the pages */
@@ -2750,8 +2789,11 @@ mdb_page_flush(MDB_txn *txn, int keep)
2750
2789
  }
2751
2790
  mdb_dpage_free(env, dp);
2752
2791
  }
2753
- dl[0].mid = j;
2754
2792
 
2793
+ done:
2794
+ i--;
2795
+ txn->mt_dirty_room += i - j;
2796
+ dl[0].mid = j;
2755
2797
  return MDB_SUCCESS;
2756
2798
  }
2757
2799
 
@@ -2791,14 +2833,18 @@ mdb_txn_commit(MDB_txn *txn)
2791
2833
 
2792
2834
  if (txn->mt_parent) {
2793
2835
  MDB_txn *parent = txn->mt_parent;
2794
- unsigned x, y, len;
2795
2836
  MDB_ID2L dst, src;
2837
+ MDB_IDL pspill;
2838
+ unsigned x, y, len, ps_len;
2796
2839
 
2797
2840
  /* Append our free list to parent's */
2798
2841
  rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
2799
2842
  if (rc)
2800
2843
  goto fail;
2801
2844
  mdb_midl_free(txn->mt_free_pgs);
2845
+ /* Failures after this must either undo the changes
2846
+ * to the parent or set MDB_TXN_ERROR in the parent.
2847
+ */
2802
2848
 
2803
2849
  parent->mt_next_pgno = txn->mt_next_pgno;
2804
2850
  parent->mt_flags = txn->mt_flags;
@@ -2820,37 +2866,26 @@ mdb_txn_commit(MDB_txn *txn)
2820
2866
  dst = parent->mt_u.dirty_list;
2821
2867
  src = txn->mt_u.dirty_list;
2822
2868
  /* Remove anything in our dirty list from parent's spill list */
2823
- if (parent->mt_spill_pgs) {
2824
- x = parent->mt_spill_pgs[0];
2825
- len = x;
2826
- /* zero out our dirty pages in parent spill list */
2827
- for (i=1; i<=src[0].mid; i++) {
2869
+ if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
2870
+ x = y = ps_len;
2871
+ pspill[0] = (pgno_t)-1;
2872
+ /* Mark our dirty pages as deleted in parent spill list */
2873
+ for (i=0, len=src[0].mid; ++i <= len; ) {
2828
2874
  MDB_ID pn = src[i].mid << 1;
2829
- if (pn < parent->mt_spill_pgs[x])
2830
- continue;
2831
- if (pn > parent->mt_spill_pgs[x]) {
2832
- if (x <= 1)
2833
- break;
2875
+ while (pn > pspill[x])
2834
2876
  x--;
2835
- continue;
2836
- }
2837
- parent->mt_spill_pgs[x] = 0;
2838
- len--;
2839
- }
2840
- /* OK, we had a few hits, squash zeros from the spill list */
2841
- if (len < parent->mt_spill_pgs[0]) {
2842
- x=1;
2843
- for (y=1; y<=parent->mt_spill_pgs[0]; y++) {
2844
- if (parent->mt_spill_pgs[y]) {
2845
- if (y != x) {
2846
- parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y];
2847
- }
2848
- x++;
2849
- }
2877
+ if (pn == pspill[x]) {
2878
+ pspill[x] = 1;
2879
+ y = --x;
2850
2880
  }
2851
- parent->mt_spill_pgs[0] = len;
2852
2881
  }
2882
+ /* Squash deleted pagenums if we deleted any */
2883
+ for (x=y; ++x <= ps_len; )
2884
+ if (!(pspill[x] & 1))
2885
+ pspill[++y] = pspill[x];
2886
+ pspill[0] = y;
2853
2887
  }
2888
+
2854
2889
  /* Find len = length of merging our dirty list with parent's */
2855
2890
  x = dst[0].mid;
2856
2891
  dst[0].mid = 0; /* simplify loops */
@@ -2884,7 +2919,10 @@ mdb_txn_commit(MDB_txn *txn)
2884
2919
  parent->mt_dirty_room = txn->mt_dirty_room;
2885
2920
  if (txn->mt_spill_pgs) {
2886
2921
  if (parent->mt_spill_pgs) {
2887
- mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
2922
+ /* TODO: Prevent failure here, so parent does not fail */
2923
+ rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
2924
+ if (rc)
2925
+ parent->mt_flags |= MDB_TXN_ERROR;
2888
2926
  mdb_midl_free(txn->mt_spill_pgs);
2889
2927
  mdb_midl_sort(parent->mt_spill_pgs);
2890
2928
  } else {
@@ -2895,7 +2933,7 @@ mdb_txn_commit(MDB_txn *txn)
2895
2933
  parent->mt_child = NULL;
2896
2934
  mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
2897
2935
  free(txn);
2898
- return MDB_SUCCESS;
2936
+ return rc;
2899
2937
  }
2900
2938
 
2901
2939
  if (txn != env->me_txn) {
@@ -2954,7 +2992,8 @@ done:
2954
2992
  env->me_txn = NULL;
2955
2993
  mdb_dbis_update(txn, 1);
2956
2994
 
2957
- UNLOCK_MUTEX_W(env);
2995
+ if (env->me_txns)
2996
+ UNLOCK_MUTEX_W(env);
2958
2997
  free(txn);
2959
2998
 
2960
2999
  return MDB_SUCCESS;
@@ -2973,10 +3012,11 @@ fail:
2973
3012
  static int
2974
3013
  mdb_env_read_header(MDB_env *env, MDB_meta *meta)
2975
3014
  {
2976
- MDB_pagebuf pbuf;
3015
+ MDB_metabuf pbuf;
2977
3016
  MDB_page *p;
2978
3017
  MDB_meta *m;
2979
3018
  int i, rc, off;
3019
+ enum { Size = sizeof(pbuf) };
2980
3020
 
2981
3021
  /* We don't know the page size yet, so use a minimum value.
2982
3022
  * Read both meta pages so we can use the latest one.
@@ -2988,13 +3028,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
2988
3028
  OVERLAPPED ov;
2989
3029
  memset(&ov, 0, sizeof(ov));
2990
3030
  ov.Offset = off;
2991
- rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1;
3031
+ rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
2992
3032
  if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
2993
3033
  rc = 0;
2994
3034
  #else
2995
- rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off);
3035
+ rc = pread(env->me_fd, &pbuf, Size, off);
2996
3036
  #endif
2997
- if (rc != MDB_PAGESIZE) {
3037
+ if (rc != Size) {
2998
3038
  if (rc == 0 && off == 0)
2999
3039
  return ENOENT;
3000
3040
  rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
@@ -3109,7 +3149,7 @@ mdb_env_write_meta(MDB_txn *txn)
3109
3149
  assert(txn != NULL);
3110
3150
  assert(txn->mt_env != NULL);
3111
3151
 
3112
- toggle = !txn->mt_toggle;
3152
+ toggle = txn->mt_txnid & 1;
3113
3153
  DPRINTF(("writing meta page %d for root page %"Z"u",
3114
3154
  toggle, txn->mt_dbs[MAIN_DBI].md_root));
3115
3155
 
@@ -3125,11 +3165,18 @@ mdb_env_write_meta(MDB_txn *txn)
3125
3165
  mp->mm_last_pg = txn->mt_next_pgno - 1;
3126
3166
  mp->mm_txnid = txn->mt_txnid;
3127
3167
  if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3168
+ unsigned meta_size = env->me_psize;
3128
3169
  rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
3129
3170
  ptr = env->me_map;
3130
- if (toggle)
3131
- ptr += env->me_psize;
3132
- if (MDB_MSYNC(ptr, env->me_psize, rc)) {
3171
+ if (toggle) {
3172
+ #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
3173
+ if (meta_size < env->me_os_psize)
3174
+ meta_size += meta_size;
3175
+ else
3176
+ #endif
3177
+ ptr += meta_size;
3178
+ }
3179
+ if (MDB_MSYNC(ptr, meta_size, rc)) {
3133
3180
  rc = ErrCode();
3134
3181
  goto fail;
3135
3182
  }
@@ -3200,7 +3247,8 @@ done:
3200
3247
  * readers will get consistent data regardless of how fresh or
3201
3248
  * how stale their view of these values is.
3202
3249
  */
3203
- env->me_txns->mti_txnid = txn->mt_txnid;
3250
+ if (env->me_txns)
3251
+ env->me_txns->mti_txnid = txn->mt_txnid;
3204
3252
 
3205
3253
  return MDB_SUCCESS;
3206
3254
  }
@@ -3234,6 +3282,7 @@ mdb_env_create(MDB_env **env)
3234
3282
  e->me_wmutex = SEM_FAILED;
3235
3283
  #endif
3236
3284
  e->me_pid = getpid();
3285
+ GET_PAGESIZE(e->me_os_psize);
3237
3286
  VGMEMP_CREATE(e,0,0);
3238
3287
  *env = e;
3239
3288
  return MDB_SUCCESS;
@@ -3276,7 +3325,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3276
3325
  int prot = PROT_READ;
3277
3326
  if (flags & MDB_WRITEMAP) {
3278
3327
  prot |= PROT_WRITE;
3279
- if (newsize && ftruncate(env->me_fd, env->me_mapsize) < 0)
3328
+ if (ftruncate(env->me_fd, env->me_mapsize) < 0)
3280
3329
  return ErrCode();
3281
3330
  }
3282
3331
  env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
@@ -3285,14 +3334,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3285
3334
  env->me_map = NULL;
3286
3335
  return ErrCode();
3287
3336
  }
3288
- /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3337
+
3338
+ if (flags & MDB_NORDAHEAD) {
3339
+ /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3289
3340
  #ifdef MADV_RANDOM
3290
- madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3341
+ madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3291
3342
  #else
3292
3343
  #ifdef POSIX_MADV_RANDOM
3293
- posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3344
+ posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3294
3345
  #endif /* POSIX_MADV_RANDOM */
3295
3346
  #endif /* MADV_RANDOM */
3347
+ }
3296
3348
  #endif /* _WIN32 */
3297
3349
 
3298
3350
  /* Can happen because the address argument to mmap() is just a
@@ -3323,6 +3375,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3323
3375
  return EINVAL;
3324
3376
  if (!size)
3325
3377
  size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
3378
+ else if (size < env->me_mapsize) {
3379
+ /* If the configured size is smaller, make sure it's
3380
+ * still big enough. Silently round up to minimum if not.
3381
+ */
3382
+ size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
3383
+ if (size < minsize)
3384
+ size = minsize;
3385
+ }
3326
3386
  munmap(env->me_map, env->me_mapsize);
3327
3387
  env->me_mapsize = size;
3328
3388
  old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
@@ -3388,7 +3448,9 @@ mdb_env_open2(MDB_env *env)
3388
3448
  return i;
3389
3449
  DPUTS("new mdbenv");
3390
3450
  newenv = 1;
3391
- GET_PAGESIZE(env->me_psize);
3451
+ env->me_psize = env->me_os_psize;
3452
+ if (env->me_psize > MAX_PAGESIZE)
3453
+ env->me_psize = MAX_PAGESIZE;
3392
3454
  } else {
3393
3455
  env->me_psize = meta.mm_psize;
3394
3456
  }
@@ -3499,7 +3561,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_
3499
3561
  #pragma comment(linker, "/INCLUDE:_tls_used")
3500
3562
  #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
3501
3563
  #pragma const_seg(".CRT$XLB")
3502
- extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
3564
+ extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
3503
3565
  const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
3504
3566
  #pragma const_seg()
3505
3567
  #else /* WIN32 */
@@ -3597,7 +3659,7 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
3597
3659
  return rc;
3598
3660
  }
3599
3661
 
3600
- #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
3662
+ #ifdef MDB_USE_HASH
3601
3663
  /*
3602
3664
  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
3603
3665
  *
@@ -3763,7 +3825,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
3763
3825
  rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
3764
3826
  if (size < rsize && *excl > 0) {
3765
3827
  #ifdef _WIN32
3766
- if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
3828
+ if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
3767
3829
  || !SetEndOfFile(env->me_lfd))
3768
3830
  goto fail_errno;
3769
3831
  #else
@@ -3919,8 +3981,9 @@ fail:
3919
3981
  * at runtime. Changing other flags requires closing the
3920
3982
  * environment and re-opening it with the new flags.
3921
3983
  */
3922
- #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
3923
- #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS)
3984
+ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
3985
+ #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
3986
+ MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
3924
3987
 
3925
3988
  int
3926
3989
  mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
@@ -3973,7 +4036,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
3973
4036
  }
3974
4037
 
3975
4038
  /* For RDONLY, get lockfile after we know datafile exists */
3976
- if (!F_ISSET(flags, MDB_RDONLY)) {
4039
+ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
3977
4040
  rc = mdb_env_setup_locks(env, lpath, mode, &excl);
3978
4041
  if (rc)
3979
4042
  goto leave;
@@ -4003,7 +4066,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4003
4066
  goto leave;
4004
4067
  }
4005
4068
 
4006
- if (F_ISSET(flags, MDB_RDONLY)) {
4069
+ if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
4007
4070
  rc = mdb_env_setup_locks(env, lpath, mode, &excl);
4008
4071
  if (rc)
4009
4072
  goto leave;
@@ -4033,7 +4096,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4033
4096
  DPRINTF(("opened dbenv %p", (void *) env));
4034
4097
  if (excl > 0) {
4035
4098
  rc = mdb_env_share_locks(env, &excl);
4099
+ if (rc)
4100
+ goto leave;
4036
4101
  }
4102
+ if (!((flags & MDB_RDONLY) ||
4103
+ (env->me_pbuf = calloc(1, env->me_psize))))
4104
+ rc = ENOMEM;
4037
4105
  }
4038
4106
 
4039
4107
  leave:
@@ -4057,6 +4125,7 @@ mdb_env_close0(MDB_env *env, int excl)
4057
4125
  for (i = env->me_maxdbs; --i > MAIN_DBI; )
4058
4126
  free(env->me_dbxs[i].md_name.mv_data);
4059
4127
 
4128
+ free(env->me_pbuf);
4060
4129
  free(env->me_dbflags);
4061
4130
  free(env->me_dbxs);
4062
4131
  free(env->me_path);
@@ -4084,7 +4153,7 @@ mdb_env_close0(MDB_env *env, int excl)
4084
4153
  if (env->me_fd != INVALID_HANDLE_VALUE)
4085
4154
  (void) close(env->me_fd);
4086
4155
  if (env->me_txns) {
4087
- pid_t pid = env->me_pid;
4156
+ MDB_PID_T pid = env->me_pid;
4088
4157
  /* Clearing readers is done in this function because
4089
4158
  * me_txkey with its destructor must be disabled first.
4090
4159
  */
@@ -4246,14 +4315,6 @@ mdb_env_copy(MDB_env *env, const char *path)
4246
4315
  newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
4247
4316
  FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
4248
4317
  #else
4249
- #ifdef O_DIRECT
4250
- /* The OS supports O_DIRECT, try with it */
4251
- newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666);
4252
- /* But open can fail if O_DIRECT isn't supported by the file system
4253
- * so retry without the flag
4254
- */
4255
- if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL)
4256
- #endif
4257
4318
  newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
4258
4319
  #endif
4259
4320
  if (newfd == INVALID_HANDLE_VALUE) {
@@ -4261,6 +4322,11 @@ mdb_env_copy(MDB_env *env, const char *path)
4261
4322
  goto leave;
4262
4323
  }
4263
4324
 
4325
+ #ifdef O_DIRECT
4326
+ /* Set O_DIRECT if the file system supports it */
4327
+ if ((rc = fcntl(newfd, F_GETFL)) != -1)
4328
+ (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
4329
+ #endif
4264
4330
  #ifdef F_NOCACHE /* __APPLE__ */
4265
4331
  rc = fcntl(newfd, F_NOCACHE, 1);
4266
4332
  if (rc) {
@@ -4308,7 +4374,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4308
4374
  *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4309
4375
  }
4310
4376
 
4311
- /** Compare two items pointing at aligned int's */
4377
+ /** Compare two items pointing at aligned unsigned int's */
4312
4378
  static int
4313
4379
  mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4314
4380
  {
@@ -4316,7 +4382,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4316
4382
  *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4317
4383
  }
4318
4384
 
4319
- /** Compare two items pointing at ints of unknown alignment.
4385
+ /** Compare two items pointing at unsigned ints of unknown alignment.
4320
4386
  * Nodes and keys are guaranteed to be 2-byte aligned.
4321
4387
  */
4322
4388
  static int
@@ -4514,8 +4580,8 @@ mdb_cursor_pop(MDB_cursor *mc)
4514
4580
  if (mc->mc_snum)
4515
4581
  mc->mc_top--;
4516
4582
 
4517
- DPRINTF(("popped page %"Z"u off db %u cursor %p", top->mp_pgno,
4518
- mc->mc_dbi, (void *) mc));
4583
+ DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
4584
+ DDBI(mc), (void *) mc));
4519
4585
  }
4520
4586
  }
4521
4587
 
@@ -4523,8 +4589,8 @@ mdb_cursor_pop(MDB_cursor *mc)
4523
4589
  static int
4524
4590
  mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
4525
4591
  {
4526
- DPRINTF(("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno,
4527
- mc->mc_dbi, (void *) mc));
4592
+ DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
4593
+ DDBI(mc), (void *) mc));
4528
4594
 
4529
4595
  if (mc->mc_snum >= CURSOR_STACK) {
4530
4596
  assert(mc->mc_snum < CURSOR_STACK);
@@ -4598,18 +4664,11 @@ done:
4598
4664
  return MDB_SUCCESS;
4599
4665
  }
4600
4666
 
4601
- /** Search for the page a given key should be in.
4602
- * Pushes parent pages on the cursor stack. This function continues a
4603
- * search on a cursor that has already been initialized. (Usually by
4604
- * #mdb_page_search() but also by #mdb_node_move().)
4605
- * @param[in,out] mc the cursor for this operation.
4606
- * @param[in] key the key to search for. If NULL, search for the lowest
4607
- * page. (This is used by #mdb_cursor_first().)
4608
- * @param[in] modify If true, visited pages are updated with new page numbers.
4609
- * @return 0 on success, non-zero on failure.
4667
+ /** Finish #mdb_page_search() / #mdb_page_search_lowest().
4668
+ * The cursor is at the root page, set up the rest of it.
4610
4669
  */
4611
4670
  static int
4612
- mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4671
+ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
4613
4672
  {
4614
4673
  MDB_page *mp = mc->mc_pg[mc->mc_top];
4615
4674
  int rc;
@@ -4623,11 +4682,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4623
4682
  assert(NUMKEYS(mp) > 1);
4624
4683
  DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
4625
4684
 
4626
- if (key == NULL) /* Initialize cursor to first page. */
4685
+ if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
4627
4686
  i = 0;
4628
- else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) {
4629
- /* cursor to last page */
4630
- i = NUMKEYS(mp)-1;
4687
+ if (flags & MDB_PS_LAST)
4688
+ i = NUMKEYS(mp) - 1;
4631
4689
  } else {
4632
4690
  int exact;
4633
4691
  node = mdb_node_search(mc, key, &exact);
@@ -4640,10 +4698,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4640
4698
  i--;
4641
4699
  }
4642
4700
  }
4701
+ DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
4643
4702
  }
4644
4703
 
4645
- if (key)
4646
- DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
4647
4704
  assert(i < NUMKEYS(mp));
4648
4705
  node = NODEPTR(mp, i);
4649
4706
 
@@ -4654,7 +4711,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4654
4711
  if ((rc = mdb_cursor_push(mc, mp)))
4655
4712
  return rc;
4656
4713
 
4657
- if (modify) {
4714
+ if (flags & MDB_PS_MODIFY) {
4658
4715
  if ((rc = mdb_page_touch(mc)) != 0)
4659
4716
  return rc;
4660
4717
  mp = mc->mc_pg[mc->mc_top];
@@ -4668,7 +4725,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4668
4725
  }
4669
4726
 
4670
4727
  DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
4671
- key ? DKEY(key) : NULL));
4728
+ key ? DKEY(key) : "null"));
4672
4729
  mc->mc_flags |= C_INITIALIZED;
4673
4730
  mc->mc_flags &= ~C_EOF;
4674
4731
 
@@ -4694,18 +4751,17 @@ mdb_page_search_lowest(MDB_cursor *mc)
4694
4751
  mc->mc_ki[mc->mc_top] = 0;
4695
4752
  if ((rc = mdb_cursor_push(mc, mp)))
4696
4753
  return rc;
4697
- return mdb_page_search_root(mc, NULL, 0);
4754
+ return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
4698
4755
  }
4699
4756
 
4700
4757
  /** Search for the page a given key should be in.
4701
- * Pushes parent pages on the cursor stack. This function just sets up
4702
- * the search; it finds the root page for \b mc's database and sets this
4703
- * as the root of the cursor's stack. Then #mdb_page_search_root() is
4704
- * called to complete the search.
4758
+ * Push it and its parent pages on the cursor stack.
4705
4759
  * @param[in,out] mc the cursor for this operation.
4706
- * @param[in] key the key to search for. If NULL, search for the lowest
4707
- * page. (This is used by #mdb_cursor_first().)
4708
- * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers.
4760
+ * @param[in] key the key to search for, or NULL for first/last page.
4761
+ * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
4762
+ * are touched (updated with new page numbers).
4763
+ * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
4764
+ * This is used by #mdb_cursor_first() and #mdb_cursor_last().
4709
4765
  * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
4710
4766
  * @return 0 on success, non-zero on failure.
4711
4767
  */
@@ -4716,23 +4772,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4716
4772
  pgno_t root;
4717
4773
 
4718
4774
  /* Make sure the txn is still viable, then find the root from
4719
- * the txn's db table.
4775
+ * the txn's db table and set it as the root of the cursor's stack.
4720
4776
  */
4721
4777
  if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
4722
4778
  DPUTS("transaction has failed, must abort");
4723
4779
  return MDB_BAD_TXN;
4724
4780
  } else {
4725
4781
  /* Make sure we're using an up-to-date root */
4726
- if (mc->mc_dbi > MAIN_DBI) {
4727
- if ((*mc->mc_dbflag & DB_STALE) ||
4728
- ((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
4782
+ if (*mc->mc_dbflag & DB_STALE) {
4729
4783
  MDB_cursor mc2;
4730
- unsigned char dbflag = 0;
4731
4784
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4732
- rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY);
4785
+ rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
4733
4786
  if (rc)
4734
4787
  return rc;
4735
- if (*mc->mc_dbflag & DB_STALE) {
4788
+ {
4736
4789
  MDB_val data;
4737
4790
  int exact = 0;
4738
4791
  uint16_t flags;
@@ -4752,11 +4805,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4752
4805
  return MDB_INCOMPATIBLE;
4753
4806
  memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
4754
4807
  }
4755
- if (flags & MDB_PS_MODIFY)
4756
- dbflag = DB_DIRTY;
4757
4808
  *mc->mc_dbflag &= ~DB_STALE;
4758
- *mc->mc_dbflag |= dbflag;
4759
- }
4760
4809
  }
4761
4810
  root = mc->mc_db->md_root;
4762
4811
 
@@ -4774,8 +4823,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4774
4823
  mc->mc_snum = 1;
4775
4824
  mc->mc_top = 0;
4776
4825
 
4777
- DPRINTF(("db %u root page %"Z"u has flags 0x%X",
4778
- mc->mc_dbi, root, mc->mc_pg[0]->mp_flags));
4826
+ DPRINTF(("db %d root page %"Z"u has flags 0x%X",
4827
+ DDBI(mc), root, mc->mc_pg[0]->mp_flags));
4779
4828
 
4780
4829
  if (flags & MDB_PS_MODIFY) {
4781
4830
  if ((rc = mdb_page_touch(mc)))
@@ -4914,7 +4963,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi,
4914
4963
  if (txn->mt_flags & MDB_TXN_ERROR)
4915
4964
  return MDB_BAD_TXN;
4916
4965
 
4917
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
4966
+ if (key->mv_size > MDB_MAXKEYSIZE) {
4918
4967
  return MDB_BAD_VALSIZE;
4919
4968
  }
4920
4969
 
@@ -4966,8 +5015,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
4966
5015
  assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
4967
5016
 
4968
5017
  indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4969
- if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0))
5018
+ if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
5019
+ /* mc will be inconsistent if caller does mc_snum++ as above */
5020
+ mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
4970
5021
  return rc;
5022
+ }
4971
5023
 
4972
5024
  mdb_cursor_push(mc, mp);
4973
5025
  if (!move_right)
@@ -5143,7 +5195,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5143
5195
 
5144
5196
  assert(mc);
5145
5197
  assert(key);
5146
- assert(key->mv_size > 0);
5198
+ if (key->mv_size == 0)
5199
+ return MDB_BAD_VALSIZE;
5147
5200
 
5148
5201
  if (mc->mc_xcursor)
5149
5202
  mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
@@ -5329,7 +5382,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5329
5382
  mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5330
5383
 
5331
5384
  if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5332
- rc = mdb_page_search(mc, NULL, 0);
5385
+ rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
5333
5386
  if (rc != MDB_SUCCESS)
5334
5387
  return rc;
5335
5388
  }
@@ -5375,11 +5428,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5375
5428
  if (!(mc->mc_flags & C_EOF)) {
5376
5429
 
5377
5430
  if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5378
- MDB_val lkey;
5379
-
5380
- lkey.mv_size = MDB_MAXKEYSIZE+1;
5381
- lkey.mv_data = NULL;
5382
- rc = mdb_page_search(mc, &lkey, 0);
5431
+ rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
5383
5432
  if (rc != MDB_SUCCESS)
5384
5433
  return rc;
5385
5434
  }
@@ -5431,8 +5480,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5431
5480
  rc = EINVAL;
5432
5481
  } else {
5433
5482
  MDB_page *mp = mc->mc_pg[mc->mc_top];
5434
- if (!NUMKEYS(mp)) {
5435
- mc->mc_ki[mc->mc_top] = 0;
5483
+ int nkeys = NUMKEYS(mp);
5484
+ if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5485
+ mc->mc_ki[mc->mc_top] = nkeys;
5436
5486
  rc = MDB_NOTFOUND;
5437
5487
  break;
5438
5488
  }
@@ -5471,7 +5521,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5471
5521
  case MDB_SET_RANGE:
5472
5522
  if (key == NULL) {
5473
5523
  rc = EINVAL;
5474
- } else if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
5524
+ } else if (key->mv_size > MDB_MAXKEYSIZE) {
5475
5525
  rc = MDB_BAD_VALSIZE;
5476
5526
  } else if (op == MDB_SET_RANGE)
5477
5527
  rc = mdb_cursor_set(mc, key, data, op, NULL);
@@ -5577,14 +5627,14 @@ fetchm:
5577
5627
  return rc;
5578
5628
  }
5579
5629
 
5580
- /** Touch all the pages in the cursor stack.
5630
+ /** Touch all the pages in the cursor stack. Set mc_top.
5581
5631
  * Makes sure all the pages are writable, before attempting a write operation.
5582
5632
  * @param[in] mc The cursor to operate on.
5583
5633
  */
5584
5634
  static int
5585
5635
  mdb_cursor_touch(MDB_cursor *mc)
5586
5636
  {
5587
- int rc;
5637
+ int rc = MDB_SUCCESS;
5588
5638
 
5589
5639
  if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
5590
5640
  MDB_cursor mc2;
@@ -5595,13 +5645,14 @@ mdb_cursor_touch(MDB_cursor *mc)
5595
5645
  return rc;
5596
5646
  *mc->mc_dbflag |= DB_DIRTY;
5597
5647
  }
5598
- for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
5599
- rc = mdb_page_touch(mc);
5600
- if (rc)
5601
- return rc;
5648
+ mc->mc_top = 0;
5649
+ if (mc->mc_snum) {
5650
+ do {
5651
+ rc = mdb_page_touch(mc);
5652
+ } while (!rc && ++(mc->mc_top) < mc->mc_snum);
5653
+ mc->mc_top = mc->mc_snum-1;
5602
5654
  }
5603
- mc->mc_top = mc->mc_snum-1;
5604
- return MDB_SUCCESS;
5655
+ return rc;
5605
5656
  }
5606
5657
 
5607
5658
  /** Do not spill pages to disk if txn is getting full, may fail instead */
@@ -5612,15 +5663,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5612
5663
  unsigned int flags)
5613
5664
  {
5614
5665
  enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
5666
+ MDB_env *env = mc->mc_txn->mt_env;
5615
5667
  MDB_node *leaf = NULL;
5616
5668
  MDB_val xdata, *rdata, dkey;
5617
- MDB_page *fp;
5618
5669
  MDB_db dummy;
5619
5670
  int do_sub = 0, insert = 0;
5620
5671
  unsigned int mcount = 0, dcount = 0, nospill;
5621
5672
  size_t nsize;
5622
5673
  int rc, rc2;
5623
- MDB_pagebuf pbuf;
5624
5674
  char dbuf[MDB_MAXKEYSIZE+1];
5625
5675
  unsigned int nflags;
5626
5676
  DKBUF;
@@ -5652,8 +5702,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5652
5702
  return MDB_BAD_VALSIZE;
5653
5703
  #endif
5654
5704
 
5655
- DPRINTF(("==> put db %u key [%s], size %"Z"u, data size %"Z"u",
5656
- mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size));
5705
+ DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
5706
+ DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
5657
5707
 
5658
5708
  dkey.mv_size = 0;
5659
5709
 
@@ -5664,6 +5714,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5664
5714
  } else if (mc->mc_db->md_root == P_INVALID) {
5665
5715
  /* new database, cursor has nothing to point to */
5666
5716
  mc->mc_snum = 0;
5717
+ mc->mc_top = 0;
5667
5718
  mc->mc_flags &= ~C_INITIALIZED;
5668
5719
  rc = MDB_NO_ROOT;
5669
5720
  } else {
@@ -5733,6 +5784,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5733
5784
 
5734
5785
  /* The key already exists */
5735
5786
  if (rc == MDB_SUCCESS) {
5787
+ MDB_page *fp, *mp;
5788
+ MDB_val olddata;
5789
+
5736
5790
  /* there's only a key anyway, so this is a no-op */
5737
5791
  if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5738
5792
  unsigned int ksize = mc->mc_db->md_pad;
@@ -5745,19 +5799,23 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5745
5799
  return MDB_SUCCESS;
5746
5800
  }
5747
5801
 
5802
+ more:
5748
5803
  leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5804
+ olddata.mv_size = NODEDSZ(leaf);
5805
+ olddata.mv_data = NODEDATA(leaf);
5749
5806
 
5750
5807
  /* DB has dups? */
5751
5808
  if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
5809
+ mp = fp = xdata.mv_data = env->me_pbuf;
5810
+ mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5811
+
5752
5812
  /* Was a single item before, must convert now */
5753
- more:
5754
5813
  if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5755
5814
  /* Just overwrite the current item */
5756
5815
  if (flags == MDB_CURRENT)
5757
5816
  goto current;
5758
5817
 
5759
- dkey.mv_size = NODEDSZ(leaf);
5760
- dkey.mv_data = NODEDATA(leaf);
5818
+ dkey = olddata;
5761
5819
  #if UINT_MAX < SIZE_MAX
5762
5820
  if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
5763
5821
  #ifdef MISALIGNED_OK
@@ -5780,85 +5838,76 @@ more:
5780
5838
  /* create a fake page for the dup items */
5781
5839
  memcpy(dbuf, dkey.mv_data, dkey.mv_size);
5782
5840
  dkey.mv_data = dbuf;
5783
- fp = (MDB_page *)&pbuf;
5784
- fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5785
5841
  fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
5786
5842
  fp->mp_lower = PAGEHDRSZ;
5787
- fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5843
+ xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5788
5844
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5789
5845
  fp->mp_flags |= P_LEAF2;
5790
5846
  fp->mp_pad = data->mv_size;
5791
- fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */
5847
+ xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
5792
5848
  } else {
5793
- fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
5849
+ xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
5794
5850
  (dkey.mv_size & 1) + (data->mv_size & 1);
5795
5851
  }
5796
- mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5797
- do_sub = 1;
5798
- rdata = &xdata;
5799
- xdata.mv_size = fp->mp_upper;
5800
- xdata.mv_data = fp;
5801
- flags |= F_DUPDATA;
5802
- goto new_sub;
5803
- }
5804
- if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
5852
+ fp->mp_upper = xdata.mv_size;
5853
+ } else if (leaf->mn_flags & F_SUBDATA) {
5854
+ /* Data is on sub-DB, just store it */
5855
+ flags |= F_DUPDATA|F_SUBDATA;
5856
+ goto put_sub;
5857
+ } else {
5805
5858
  /* See if we need to convert from fake page to subDB */
5806
- MDB_page *mp;
5807
5859
  unsigned int offset;
5808
5860
  unsigned int i;
5809
5861
  uint16_t fp_flags;
5810
5862
 
5811
- fp = NODEDATA(leaf);
5812
- if (flags == MDB_CURRENT) {
5813
- reuse:
5863
+ fp = olddata.mv_data;
5864
+ switch (flags) {
5865
+ default:
5866
+ if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5867
+ offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5868
+ offset += offset & 1;
5869
+ break;
5870
+ }
5871
+ offset = fp->mp_pad;
5872
+ if (SIZELEFT(fp) < offset) {
5873
+ offset *= 4; /* space for 4 more */
5874
+ break;
5875
+ }
5876
+ /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
5877
+ case MDB_CURRENT:
5814
5878
  fp->mp_flags |= P_DIRTY;
5815
- COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
5879
+ COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
5816
5880
  mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
5817
5881
  flags |= F_DUPDATA;
5818
5882
  goto put_sub;
5819
5883
  }
5820
- if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5821
- offset = fp->mp_pad;
5822
- if (SIZELEFT(fp) >= offset)
5823
- goto reuse;
5824
- offset *= 4; /* space for 4 more */
5825
- } else {
5826
- offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5827
- }
5828
- offset += offset & 1;
5829
5884
  fp_flags = fp->mp_flags;
5830
- if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
5831
- offset >= mc->mc_txn->mt_env->me_nodemax) {
5885
+ xdata.mv_size = olddata.mv_size + offset;
5886
+ if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
5887
+ >= env->me_nodemax) {
5832
5888
  /* yes, convert it */
5833
- dummy.md_flags = 0;
5834
5889
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5835
5890
  dummy.md_pad = fp->mp_pad;
5836
5891
  dummy.md_flags = MDB_DUPFIXED;
5837
5892
  if (mc->mc_db->md_flags & MDB_INTEGERDUP)
5838
5893
  dummy.md_flags |= MDB_INTEGERKEY;
5894
+ } else {
5895
+ dummy.md_pad = 0;
5896
+ dummy.md_flags = 0;
5839
5897
  }
5840
5898
  dummy.md_depth = 1;
5841
5899
  dummy.md_branch_pages = 0;
5842
5900
  dummy.md_leaf_pages = 1;
5843
5901
  dummy.md_overflow_pages = 0;
5844
5902
  dummy.md_entries = NUMKEYS(fp);
5845
- rdata = &xdata;
5846
5903
  xdata.mv_size = sizeof(MDB_db);
5847
5904
  xdata.mv_data = &dummy;
5848
5905
  if ((rc = mdb_page_alloc(mc, 1, &mp)))
5849
5906
  return rc;
5850
- offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
5907
+ offset = env->me_psize - olddata.mv_size;
5851
5908
  flags |= F_DUPDATA|F_SUBDATA;
5852
5909
  dummy.md_root = mp->mp_pgno;
5853
5910
  fp_flags &= ~P_SUBP;
5854
- } else {
5855
- /* no, just grow it */
5856
- rdata = &xdata;
5857
- xdata.mv_size = NODEDSZ(leaf) + offset;
5858
- xdata.mv_data = &pbuf;
5859
- mp = (MDB_page *)&pbuf;
5860
- mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5861
- flags |= F_DUPDATA;
5862
5911
  }
5863
5912
  mp->mp_flags = fp_flags | P_DIRTY;
5864
5913
  mp->mp_pad = fp->mp_pad;
@@ -5867,28 +5916,27 @@ reuse:
5867
5916
  if (IS_LEAF2(fp)) {
5868
5917
  memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
5869
5918
  } else {
5870
- nsize = NODEDSZ(leaf) - fp->mp_upper;
5871
- memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
5919
+ memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
5920
+ olddata.mv_size - fp->mp_upper);
5872
5921
  for (i=0; i<NUMKEYS(fp); i++)
5873
5922
  mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
5874
5923
  }
5875
- mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5876
- do_sub = 1;
5877
- goto new_sub;
5878
5924
  }
5879
- /* data is on sub-DB, just store it */
5880
- flags |= F_DUPDATA|F_SUBDATA;
5881
- goto put_sub;
5925
+
5926
+ rdata = &xdata;
5927
+ flags |= F_DUPDATA;
5928
+ do_sub = 1;
5929
+ mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5930
+ goto new_sub;
5882
5931
  }
5883
5932
  current:
5884
5933
  /* overflow page overwrites need special handling */
5885
5934
  if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5886
5935
  MDB_page *omp;
5887
5936
  pgno_t pg;
5888
- unsigned psize = mc->mc_txn->mt_env->me_psize;
5889
- int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
5937
+ int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
5890
5938
 
5891
- memcpy(&pg, NODEDATA(leaf), sizeof(pg));
5939
+ memcpy(&pg, olddata.mv_data, sizeof(pg));
5892
5940
  if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
5893
5941
  return rc2;
5894
5942
  ovpages = omp->mp_pages;
@@ -5896,7 +5944,7 @@ current:
5896
5944
  /* Is the ov page large enough? */
5897
5945
  if (ovpages >= dpages) {
5898
5946
  if (!(omp->mp_flags & P_DIRTY) &&
5899
- (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP)))
5947
+ (level || (env->me_flags & MDB_WRITEMAP)))
5900
5948
  {
5901
5949
  rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
5902
5950
  if (rc)
@@ -5911,7 +5959,7 @@ current:
5911
5959
  */
5912
5960
  if (level > 1) {
5913
5961
  /* It is writable only in a parent txn */
5914
- size_t sz = (size_t) psize * ovpages, off;
5962
+ size_t sz = (size_t) env->me_psize * ovpages, off;
5915
5963
  MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
5916
5964
  MDB_ID2 id2;
5917
5965
  if (!np)
@@ -5941,15 +5989,15 @@ current:
5941
5989
  }
5942
5990
  if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
5943
5991
  return rc2;
5944
- } else if (NODEDSZ(leaf) == data->mv_size) {
5992
+ } else if (data->mv_size == olddata.mv_size) {
5945
5993
  /* same size, just replace it. Note that we could
5946
5994
  * also reuse this node if the new data is smaller,
5947
5995
  * but instead we opt to shrink the node in that case.
5948
5996
  */
5949
5997
  if (F_ISSET(flags, MDB_RESERVE))
5950
- data->mv_data = NODEDATA(leaf);
5998
+ data->mv_data = olddata.mv_data;
5951
5999
  else if (data->mv_size)
5952
- memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
6000
+ memcpy(olddata.mv_data, data->mv_data, data->mv_size);
5953
6001
  else
5954
6002
  memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
5955
6003
  goto done;
@@ -5965,7 +6013,7 @@ current:
5965
6013
 
5966
6014
  new_sub:
5967
6015
  nflags = flags & NODE_ADD_FLAGS;
5968
- nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
6016
+ nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
5969
6017
  if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
5970
6018
  if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
5971
6019
  nflags &= ~MDB_APPEND;
@@ -5982,9 +6030,6 @@ new_sub:
5982
6030
  unsigned i = mc->mc_top;
5983
6031
  MDB_page *mp = mc->mc_pg[i];
5984
6032
 
5985
- if (mc->mc_flags & C_SUB)
5986
- dbi--;
5987
-
5988
6033
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5989
6034
  if (mc->mc_flags & C_SUB)
5990
6035
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -6062,7 +6107,6 @@ next_mult:
6062
6107
  data[1].mv_size = mcount;
6063
6108
  if (mcount < dcount) {
6064
6109
  data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6065
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6066
6110
  goto more;
6067
6111
  }
6068
6112
  }
@@ -6081,6 +6125,7 @@ int
6081
6125
  mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6082
6126
  {
6083
6127
  MDB_node *leaf;
6128
+ MDB_page *mp;
6084
6129
  int rc;
6085
6130
 
6086
6131
  if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
@@ -6089,17 +6134,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6089
6134
  if (!(mc->mc_flags & C_INITIALIZED))
6090
6135
  return EINVAL;
6091
6136
 
6137
+ if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6138
+ return MDB_NOTFOUND;
6139
+
6092
6140
  if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
6093
6141
  return rc;
6094
- flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */
6095
6142
 
6096
6143
  rc = mdb_cursor_touch(mc);
6097
6144
  if (rc)
6098
6145
  return rc;
6099
6146
 
6100
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6147
+ mp = mc->mc_pg[mc->mc_top];
6148
+ leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6101
6149
 
6102
- if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6150
+ if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6103
6151
  if (!(flags & MDB_NODUPDATA)) {
6104
6152
  if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
6105
6153
  mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
@@ -6114,13 +6162,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6114
6162
  } else {
6115
6163
  MDB_cursor *m2;
6116
6164
  /* shrink fake page */
6117
- mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6118
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6165
+ mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6166
+ leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6119
6167
  mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6120
6168
  /* fix other sub-DB cursors pointed at this fake page */
6121
6169
  for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6122
6170
  if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6123
- if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] &&
6171
+ if (m2->mc_pg[mc->mc_top] == mp &&
6124
6172
  m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
6125
6173
  m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6126
6174
  }
@@ -6252,6 +6300,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6252
6300
  {
6253
6301
  unsigned int i;
6254
6302
  size_t node_size = NODESIZE;
6303
+ ssize_t room;
6255
6304
  indx_t ofs;
6256
6305
  MDB_node *node;
6257
6306
  MDB_page *mp = mc->mc_pg[mc->mc_top];
@@ -6264,7 +6313,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6264
6313
  IS_LEAF(mp) ? "leaf" : "branch",
6265
6314
  IS_SUBP(mp) ? "sub-" : "",
6266
6315
  mp->mp_pgno, indx, data ? data->mv_size : 0,
6267
- key ? key->mv_size : 0, key ? DKEY(key) : NULL));
6316
+ key ? key->mv_size : 0, key ? DKEY(key) : "null"));
6268
6317
 
6269
6318
  if (IS_LEAF2(mp)) {
6270
6319
  /* Move higher keys up one slot. */
@@ -6282,9 +6331,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6282
6331
  return MDB_SUCCESS;
6283
6332
  }
6284
6333
 
6334
+ room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
6285
6335
  if (key != NULL)
6286
6336
  node_size += key->mv_size;
6287
-
6288
6337
  if (IS_LEAF(mp)) {
6289
6338
  assert(data);
6290
6339
  if (F_ISSET(flags, F_BIGDATA)) {
@@ -6296,26 +6345,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6296
6345
  /* Put data on overflow page. */
6297
6346
  DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
6298
6347
  data->mv_size, node_size+data->mv_size));
6299
- node_size += sizeof(pgno_t);
6348
+ node_size += sizeof(pgno_t) + (node_size & 1);
6349
+ if ((ssize_t)node_size > room)
6350
+ goto full;
6300
6351
  if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
6301
6352
  return rc;
6302
6353
  DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
6303
6354
  flags |= F_BIGDATA;
6355
+ goto update;
6304
6356
  } else {
6305
6357
  node_size += data->mv_size;
6306
6358
  }
6307
6359
  }
6308
6360
  node_size += node_size & 1;
6361
+ if ((ssize_t)node_size > room)
6362
+ goto full;
6309
6363
 
6310
- if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
6311
- DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6312
- mp->mp_pgno, NUMKEYS(mp)));
6313
- DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
6314
- mp->mp_upper - mp->mp_lower));
6315
- DPRINTF(("node size = %"Z"u", node_size));
6316
- return MDB_PAGE_FULL;
6317
- }
6318
-
6364
+ update:
6319
6365
  /* Move higher pointers up one slot. */
6320
6366
  for (i = NUMKEYS(mp); i > indx; i--)
6321
6367
  mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
@@ -6361,6 +6407,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6361
6407
  }
6362
6408
 
6363
6409
  return MDB_SUCCESS;
6410
+
6411
+ full:
6412
+ DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6413
+ mp->mp_pgno, NUMKEYS(mp)));
6414
+ DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
6415
+ DPRINTF(("node size = %"Z"u", node_size));
6416
+ return MDB_PAGE_FULL;
6364
6417
  }
6365
6418
 
6366
6419
  /** Delete the specified node from a page.
@@ -6495,11 +6548,13 @@ mdb_xcursor_init0(MDB_cursor *mc)
6495
6548
  mx->mx_cursor.mc_txn = mc->mc_txn;
6496
6549
  mx->mx_cursor.mc_db = &mx->mx_db;
6497
6550
  mx->mx_cursor.mc_dbx = &mx->mx_dbx;
6498
- mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
6551
+ mx->mx_cursor.mc_dbi = mc->mc_dbi;
6499
6552
  mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
6500
6553
  mx->mx_cursor.mc_snum = 0;
6501
6554
  mx->mx_cursor.mc_top = 0;
6502
6555
  mx->mx_cursor.mc_flags = C_SUB;
6556
+ mx->mx_dbx.md_name.mv_size = 0;
6557
+ mx->mx_dbx.md_name.mv_data = NULL;
6503
6558
  mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
6504
6559
  mx->mx_dbx.md_dcmp = NULL;
6505
6560
  mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
@@ -6520,6 +6575,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6520
6575
  memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
6521
6576
  mx->mx_cursor.mc_pg[0] = 0;
6522
6577
  mx->mx_cursor.mc_snum = 0;
6578
+ mx->mx_cursor.mc_top = 0;
6523
6579
  mx->mx_cursor.mc_flags = C_SUB;
6524
6580
  } else {
6525
6581
  MDB_page *fp = NODEDATA(node);
@@ -6532,8 +6588,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6532
6588
  mx->mx_db.md_entries = NUMKEYS(fp);
6533
6589
  COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
6534
6590
  mx->mx_cursor.mc_snum = 1;
6535
- mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
6536
6591
  mx->mx_cursor.mc_top = 0;
6592
+ mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
6537
6593
  mx->mx_cursor.mc_pg[0] = fp;
6538
6594
  mx->mx_cursor.mc_ki[0] = 0;
6539
6595
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
@@ -6543,12 +6599,9 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6543
6599
  mx->mx_db.md_flags |= MDB_INTEGERKEY;
6544
6600
  }
6545
6601
  }
6546
- DPRINTF(("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi,
6602
+ DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
6547
6603
  mx->mx_db.md_root));
6548
- mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
6549
- DB_DIRTY : 0);
6550
- mx->mx_dbx.md_name.mv_data = NODEKEY(node);
6551
- mx->mx_dbx.md_name.mv_size = node->mn_ksize;
6604
+ mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
6552
6605
  #if UINT_MAX < SIZE_MAX
6553
6606
  if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
6554
6607
  #ifdef MISALIGNED_OK
@@ -6793,7 +6846,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
6793
6846
  flags = 0;
6794
6847
  } else {
6795
6848
  srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
6796
- assert(!((long)srcnode&1));
6849
+ assert(!((size_t)srcnode&1));
6797
6850
  srcpg = NODEPGNO(srcnode);
6798
6851
  flags = srcnode->mn_flags;
6799
6852
  if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
@@ -6864,9 +6917,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
6864
6917
  MDB_dbi dbi = csrc->mc_dbi;
6865
6918
  MDB_page *mp = csrc->mc_pg[csrc->mc_top];
6866
6919
 
6867
- if (csrc->mc_flags & C_SUB)
6868
- dbi--;
6869
-
6870
6920
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6871
6921
  if (csrc->mc_flags & C_SUB)
6872
6922
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7041,9 +7091,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7041
7091
  MDB_dbi dbi = csrc->mc_dbi;
7042
7092
  MDB_page *mp = cdst->mc_pg[cdst->mc_top];
7043
7093
 
7044
- if (csrc->mc_flags & C_SUB)
7045
- dbi--;
7046
-
7047
7094
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7048
7095
  if (csrc->mc_flags & C_SUB)
7049
7096
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7138,13 +7185,11 @@ mdb_rebalance(MDB_cursor *mc)
7138
7185
  /* Adjust cursors pointing to mp */
7139
7186
  mc->mc_snum = 0;
7140
7187
  mc->mc_top = 0;
7188
+ mc->mc_flags &= ~C_INITIALIZED;
7141
7189
  {
7142
7190
  MDB_cursor *m2, *m3;
7143
7191
  MDB_dbi dbi = mc->mc_dbi;
7144
7192
 
7145
- if (mc->mc_flags & C_SUB)
7146
- dbi--;
7147
-
7148
7193
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7149
7194
  if (mc->mc_flags & C_SUB)
7150
7195
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7154,6 +7199,7 @@ mdb_rebalance(MDB_cursor *mc)
7154
7199
  if (m3->mc_pg[0] == mp) {
7155
7200
  m3->mc_snum = 0;
7156
7201
  m3->mc_top = 0;
7202
+ m3->mc_flags &= ~C_INITIALIZED;
7157
7203
  }
7158
7204
  }
7159
7205
  }
@@ -7174,9 +7220,6 @@ mdb_rebalance(MDB_cursor *mc)
7174
7220
  MDB_cursor *m2, *m3;
7175
7221
  MDB_dbi dbi = mc->mc_dbi;
7176
7222
 
7177
- if (mc->mc_flags & C_SUB)
7178
- dbi--;
7179
-
7180
7223
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7181
7224
  if (mc->mc_flags & C_SUB)
7182
7225
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7184,10 +7227,13 @@ mdb_rebalance(MDB_cursor *mc)
7184
7227
  m3 = m2;
7185
7228
  if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
7186
7229
  if (m3->mc_pg[0] == mp) {
7187
- m3->mc_pg[0] = mc->mc_pg[0];
7188
- m3->mc_snum = 1;
7189
- m3->mc_top = 0;
7190
- m3->mc_ki[0] = m3->mc_ki[1];
7230
+ int i;
7231
+ m3->mc_snum--;
7232
+ m3->mc_top--;
7233
+ for (i=0; i<m3->mc_snum; i++) {
7234
+ m3->mc_pg[i] = m3->mc_pg[i+1];
7235
+ m3->mc_ki[i] = m3->mc_ki[i+1];
7236
+ }
7191
7237
  }
7192
7238
  }
7193
7239
  }
@@ -7300,7 +7346,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
7300
7346
 
7301
7347
  /* Adjust other cursors pointing to mp */
7302
7348
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7303
- if (m2 == mc)
7349
+ if (m2 == mc || m2->mc_snum < mc->mc_snum)
7304
7350
  continue;
7305
7351
  if (!(m2->mc_flags & C_INITIALIZED))
7306
7352
  continue;
@@ -7341,7 +7387,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi,
7341
7387
  if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7342
7388
  return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7343
7389
 
7344
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
7390
+ if (key->mv_size > MDB_MAXKEYSIZE) {
7345
7391
  return MDB_BAD_VALSIZE;
7346
7392
  }
7347
7393
 
@@ -7394,24 +7440,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7394
7440
  unsigned int nflags)
7395
7441
  {
7396
7442
  unsigned int flags;
7397
- int rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0;
7443
+ int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
7398
7444
  indx_t newindx;
7399
7445
  pgno_t pgno = 0;
7400
- unsigned int i, j, split_indx, nkeys, pmax;
7446
+ int i, j, split_indx, nkeys, pmax;
7447
+ MDB_env *env = mc->mc_txn->mt_env;
7401
7448
  MDB_node *node;
7402
7449
  MDB_val sepkey, rkey, xdata, *rdata = &xdata;
7403
- MDB_page *copy;
7450
+ MDB_page *copy = NULL;
7404
7451
  MDB_page *mp, *rp, *pp;
7405
- unsigned int ptop;
7452
+ int ptop;
7406
7453
  MDB_cursor mn;
7407
7454
  DKBUF;
7408
7455
 
7409
7456
  mp = mc->mc_pg[mc->mc_top];
7410
7457
  newindx = mc->mc_ki[mc->mc_top];
7458
+ nkeys = NUMKEYS(mp);
7411
7459
 
7412
- DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i",
7460
+ DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
7413
7461
  IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
7414
- DKEY(newkey), mc->mc_ki[mc->mc_top]));
7462
+ DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
7415
7463
 
7416
7464
  /* Create a right sibling. */
7417
7465
  if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
@@ -7458,141 +7506,139 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7458
7506
  sepkey = *newkey;
7459
7507
  split_indx = newindx;
7460
7508
  nkeys = 0;
7461
- goto newsep;
7462
- }
7509
+ } else {
7463
7510
 
7464
- nkeys = NUMKEYS(mp);
7465
- split_indx = nkeys / 2;
7466
- if (newindx < split_indx)
7467
- newpos = 0;
7468
-
7469
- if (IS_LEAF2(rp)) {
7470
- char *split, *ins;
7471
- int x;
7472
- unsigned int lsize, rsize, ksize;
7473
- /* Move half of the keys to the right sibling */
7474
- copy = NULL;
7475
- x = mc->mc_ki[mc->mc_top] - split_indx;
7476
- ksize = mc->mc_db->md_pad;
7477
- split = LEAF2KEY(mp, split_indx, ksize);
7478
- rsize = (nkeys - split_indx) * ksize;
7479
- lsize = (nkeys - split_indx) * sizeof(indx_t);
7480
- mp->mp_lower -= lsize;
7481
- rp->mp_lower += lsize;
7482
- mp->mp_upper += rsize - lsize;
7483
- rp->mp_upper -= rsize - lsize;
7484
- sepkey.mv_size = ksize;
7485
- if (newindx == split_indx) {
7486
- sepkey.mv_data = newkey->mv_data;
7487
- } else {
7488
- sepkey.mv_data = split;
7489
- }
7490
- if (x<0) {
7491
- ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7492
- memcpy(rp->mp_ptrs, split, rsize);
7493
- sepkey.mv_data = rp->mp_ptrs;
7494
- memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7495
- memcpy(ins, newkey->mv_data, ksize);
7496
- mp->mp_lower += sizeof(indx_t);
7497
- mp->mp_upper -= ksize - sizeof(indx_t);
7511
+ split_indx = (nkeys+1) / 2;
7512
+
7513
+ if (IS_LEAF2(rp)) {
7514
+ char *split, *ins;
7515
+ int x;
7516
+ unsigned int lsize, rsize, ksize;
7517
+ /* Move half of the keys to the right sibling */
7518
+ copy = NULL;
7519
+ x = mc->mc_ki[mc->mc_top] - split_indx;
7520
+ ksize = mc->mc_db->md_pad;
7521
+ split = LEAF2KEY(mp, split_indx, ksize);
7522
+ rsize = (nkeys - split_indx) * ksize;
7523
+ lsize = (nkeys - split_indx) * sizeof(indx_t);
7524
+ mp->mp_lower -= lsize;
7525
+ rp->mp_lower += lsize;
7526
+ mp->mp_upper += rsize - lsize;
7527
+ rp->mp_upper -= rsize - lsize;
7528
+ sepkey.mv_size = ksize;
7529
+ if (newindx == split_indx) {
7530
+ sepkey.mv_data = newkey->mv_data;
7531
+ } else {
7532
+ sepkey.mv_data = split;
7533
+ }
7534
+ if (x<0) {
7535
+ ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7536
+ memcpy(rp->mp_ptrs, split, rsize);
7537
+ sepkey.mv_data = rp->mp_ptrs;
7538
+ memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7539
+ memcpy(ins, newkey->mv_data, ksize);
7540
+ mp->mp_lower += sizeof(indx_t);
7541
+ mp->mp_upper -= ksize - sizeof(indx_t);
7542
+ } else {
7543
+ if (x)
7544
+ memcpy(rp->mp_ptrs, split, x * ksize);
7545
+ ins = LEAF2KEY(rp, x, ksize);
7546
+ memcpy(ins, newkey->mv_data, ksize);
7547
+ memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
7548
+ rp->mp_lower += sizeof(indx_t);
7549
+ rp->mp_upper -= ksize - sizeof(indx_t);
7550
+ mc->mc_ki[mc->mc_top] = x;
7551
+ mc->mc_pg[mc->mc_top] = rp;
7552
+ }
7498
7553
  } else {
7499
- if (x)
7500
- memcpy(rp->mp_ptrs, split, x * ksize);
7501
- ins = LEAF2KEY(rp, x, ksize);
7502
- memcpy(ins, newkey->mv_data, ksize);
7503
- memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
7504
- rp->mp_lower += sizeof(indx_t);
7505
- rp->mp_upper -= ksize - sizeof(indx_t);
7506
- mc->mc_ki[mc->mc_top] = x;
7507
- mc->mc_pg[mc->mc_top] = rp;
7508
- }
7509
- goto newsep;
7510
- }
7554
+ int psize, nsize, k;
7555
+ /* Maximum free space in an empty page */
7556
+ pmax = env->me_psize - PAGEHDRSZ;
7557
+ if (IS_LEAF(mp))
7558
+ nsize = mdb_leaf_size(env, newkey, newdata);
7559
+ else
7560
+ nsize = mdb_branch_size(env, newkey);
7561
+ nsize += nsize & 1;
7511
7562
 
7512
- /* For leaf pages, check the split point based on what
7513
- * fits where, since otherwise mdb_node_add can fail.
7514
- *
7515
- * This check is only needed when the data items are
7516
- * relatively large, such that being off by one will
7517
- * make the difference between success or failure.
7518
- *
7519
- * It's also relevant if a page happens to be laid out
7520
- * such that one half of its nodes are all "small" and
7521
- * the other half of its nodes are "large." If the new
7522
- * item is also "large" and falls on the half with
7523
- * "large" nodes, it also may not fit.
7524
- */
7525
- if (IS_LEAF(mp)) {
7526
- unsigned int psize, nsize;
7527
- /* Maximum free space in an empty page */
7528
- pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
7529
- nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
7530
- if ((nkeys < 20) || (nsize > pmax/16)) {
7531
- if (newindx <= split_indx) {
7532
- psize = nsize;
7533
- newpos = 0;
7534
- for (i=0; i<split_indx; i++) {
7535
- node = NODEPTR(mp, i);
7536
- psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7537
- if (F_ISSET(node->mn_flags, F_BIGDATA))
7538
- psize += sizeof(pgno_t);
7539
- else
7540
- psize += NODEDSZ(node);
7541
- psize += psize & 1;
7542
- if (psize > pmax) {
7543
- if (i <= newindx) {
7544
- split_indx = newindx;
7545
- if (i < newindx)
7546
- newpos = 1;
7563
+ /* grab a page to hold a temporary copy */
7564
+ copy = mdb_page_malloc(mc->mc_txn, 1);
7565
+ if (copy == NULL)
7566
+ return ENOMEM;
7567
+ copy->mp_pgno = mp->mp_pgno;
7568
+ copy->mp_flags = mp->mp_flags;
7569
+ copy->mp_lower = PAGEHDRSZ;
7570
+ copy->mp_upper = env->me_psize;
7571
+
7572
+ /* prepare to insert */
7573
+ for (i=0, j=0; i<nkeys; i++) {
7574
+ if (i == newindx) {
7575
+ copy->mp_ptrs[j++] = 0;
7576
+ }
7577
+ copy->mp_ptrs[j++] = mp->mp_ptrs[i];
7578
+ }
7579
+
7580
+ /* When items are relatively large the split point needs
7581
+ * to be checked, because being off-by-one will make the
7582
+ * difference between success or failure in mdb_node_add.
7583
+ *
7584
+ * It's also relevant if a page happens to be laid out
7585
+ * such that one half of its nodes are all "small" and
7586
+ * the other half of its nodes are "large." If the new
7587
+ * item is also "large" and falls on the half with
7588
+ * "large" nodes, it also may not fit.
7589
+ *
7590
+ * As a final tweak, if the new item goes on the last
7591
+ * spot on the page (and thus, onto the new page), bias
7592
+ * the split so the new page is emptier than the old page.
7593
+ * This yields better packing during sequential inserts.
7594
+ */
7595
+ if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
7596
+ /* Find split point */
7597
+ psize = 0;
7598
+ if (newindx <= split_indx || newindx >= nkeys) {
7599
+ i = 0; j = 1;
7600
+ k = newindx >= nkeys ? nkeys : split_indx+2;
7601
+ } else {
7602
+ i = nkeys; j = -1;
7603
+ k = split_indx-1;
7604
+ }
7605
+ for (; i!=k; i+=j) {
7606
+ if (i == newindx) {
7607
+ psize += nsize;
7608
+ node = NULL;
7609
+ } else {
7610
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7611
+ psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7612
+ if (IS_LEAF(mp)) {
7613
+ if (F_ISSET(node->mn_flags, F_BIGDATA))
7614
+ psize += sizeof(pgno_t);
7615
+ else
7616
+ psize += NODEDSZ(node);
7547
7617
  }
7548
- else
7549
- split_indx = i;
7550
- break;
7618
+ psize += psize & 1;
7551
7619
  }
7552
- }
7553
- } else {
7554
- psize = nsize;
7555
- for (i=nkeys-1; i>=split_indx; i--) {
7556
- node = NODEPTR(mp, i);
7557
- psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7558
- if (F_ISSET(node->mn_flags, F_BIGDATA))
7559
- psize += sizeof(pgno_t);
7560
- else
7561
- psize += NODEDSZ(node);
7562
- psize += psize & 1;
7563
- if (psize > pmax) {
7564
- if (i >= newindx) {
7565
- split_indx = newindx;
7566
- newpos = 0;
7567
- } else
7568
- split_indx = i+1;
7620
+ if (psize > pmax || i == k-j) {
7621
+ split_indx = i + (j<0);
7569
7622
  break;
7570
7623
  }
7571
7624
  }
7572
7625
  }
7626
+ if (split_indx == newindx) {
7627
+ sepkey.mv_size = newkey->mv_size;
7628
+ sepkey.mv_data = newkey->mv_data;
7629
+ } else {
7630
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
7631
+ sepkey.mv_size = node->mn_ksize;
7632
+ sepkey.mv_data = NODEKEY(node);
7633
+ }
7573
7634
  }
7574
7635
  }
7575
7636
 
7576
- /* First find the separating key between the split pages.
7577
- * The case where newindx == split_indx is ambiguous; the
7578
- * new item could go to the new page or stay on the original
7579
- * page. If newpos == 1 it goes to the new page.
7580
- */
7581
- if (newindx == split_indx && newpos) {
7582
- sepkey.mv_size = newkey->mv_size;
7583
- sepkey.mv_data = newkey->mv_data;
7584
- } else {
7585
- node = NODEPTR(mp, split_indx);
7586
- sepkey.mv_size = node->mn_ksize;
7587
- sepkey.mv_data = NODEKEY(node);
7588
- }
7589
-
7590
- newsep:
7591
- DPRINTF(("separator is [%s]", DKEY(&sepkey)));
7637
+ DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
7592
7638
 
7593
7639
  /* Copy separator key to the parent.
7594
7640
  */
7595
- if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
7641
+ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
7596
7642
  mn.mc_snum--;
7597
7643
  mn.mc_top--;
7598
7644
  did_split = 1;
@@ -7637,117 +7683,97 @@ newsep:
7637
7683
  return rc;
7638
7684
  for (i=0; i<mc->mc_top; i++)
7639
7685
  mc->mc_ki[i] = mn.mc_ki[i];
7640
- goto done;
7641
- }
7642
- if (IS_LEAF2(rp)) {
7643
- goto done;
7644
- }
7645
-
7646
- /* Move half of the keys to the right sibling. */
7686
+ } else if (!IS_LEAF2(mp)) {
7687
+ /* Move nodes */
7688
+ mc->mc_pg[mc->mc_top] = rp;
7689
+ i = split_indx;
7690
+ j = 0;
7691
+ do {
7692
+ if (i == newindx) {
7693
+ rkey.mv_data = newkey->mv_data;
7694
+ rkey.mv_size = newkey->mv_size;
7695
+ if (IS_LEAF(mp)) {
7696
+ rdata = newdata;
7697
+ } else
7698
+ pgno = newpgno;
7699
+ flags = nflags;
7700
+ /* Update index for the new key. */
7701
+ mc->mc_ki[mc->mc_top] = j;
7702
+ } else {
7703
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7704
+ rkey.mv_data = NODEKEY(node);
7705
+ rkey.mv_size = node->mn_ksize;
7706
+ if (IS_LEAF(mp)) {
7707
+ xdata.mv_data = NODEDATA(node);
7708
+ xdata.mv_size = NODEDSZ(node);
7709
+ rdata = &xdata;
7710
+ } else
7711
+ pgno = NODEPGNO(node);
7712
+ flags = node->mn_flags;
7713
+ }
7647
7714
 
7648
- /* grab a page to hold a temporary copy */
7649
- copy = mdb_page_malloc(mc->mc_txn, 1);
7650
- if (copy == NULL)
7651
- return ENOMEM;
7715
+ if (!IS_LEAF(mp) && j == 0) {
7716
+ /* First branch index doesn't need key data. */
7717
+ rkey.mv_size = 0;
7718
+ }
7652
7719
 
7653
- copy->mp_pgno = mp->mp_pgno;
7654
- copy->mp_flags = mp->mp_flags;
7655
- copy->mp_lower = PAGEHDRSZ;
7656
- copy->mp_upper = mc->mc_txn->mt_env->me_psize;
7657
- mc->mc_pg[mc->mc_top] = copy;
7658
- for (i = j = 0; i <= nkeys; j++) {
7659
- if (i == split_indx) {
7660
- /* Insert in right sibling. */
7661
- /* Reset insert index for right sibling. */
7662
- if (i != newindx || (newpos ^ ins_new)) {
7720
+ rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
7721
+ if (rc) {
7722
+ /* return tmp page to freelist */
7723
+ mdb_page_free(env, copy);
7724
+ return rc;
7725
+ }
7726
+ if (i == nkeys) {
7727
+ i = 0;
7663
7728
  j = 0;
7664
- mc->mc_pg[mc->mc_top] = rp;
7729
+ mc->mc_pg[mc->mc_top] = copy;
7730
+ } else {
7731
+ i++;
7732
+ j++;
7733
+ }
7734
+ } while (i != split_indx);
7735
+
7736
+ nkeys = NUMKEYS(copy);
7737
+ for (i=0; i<nkeys; i++)
7738
+ mp->mp_ptrs[i] = copy->mp_ptrs[i];
7739
+ mp->mp_lower = copy->mp_lower;
7740
+ mp->mp_upper = copy->mp_upper;
7741
+ memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7742
+ env->me_psize - copy->mp_upper);
7743
+
7744
+ /* reset back to original page */
7745
+ if (newindx < split_indx) {
7746
+ mc->mc_pg[mc->mc_top] = mp;
7747
+ if (nflags & MDB_RESERVE) {
7748
+ node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7749
+ if (!(node->mn_flags & F_BIGDATA))
7750
+ newdata->mv_data = NODEDATA(node);
7665
7751
  }
7666
- }
7667
-
7668
- if (i == newindx && !ins_new) {
7669
- /* Insert the original entry that caused the split. */
7670
- rkey.mv_data = newkey->mv_data;
7671
- rkey.mv_size = newkey->mv_size;
7672
- if (IS_LEAF(mp)) {
7673
- rdata = newdata;
7674
- } else
7675
- pgno = newpgno;
7676
- flags = nflags;
7677
-
7678
- ins_new = 1;
7679
-
7680
- /* Update index for the new key. */
7681
- mc->mc_ki[mc->mc_top] = j;
7682
- } else if (i == nkeys) {
7683
- break;
7684
7752
  } else {
7685
- node = NODEPTR(mp, i);
7686
- rkey.mv_data = NODEKEY(node);
7687
- rkey.mv_size = node->mn_ksize;
7688
- if (IS_LEAF(mp)) {
7689
- xdata.mv_data = NODEDATA(node);
7690
- xdata.mv_size = NODEDSZ(node);
7691
- rdata = &xdata;
7692
- } else
7693
- pgno = NODEPGNO(node);
7694
- flags = node->mn_flags;
7695
-
7696
- i++;
7697
- }
7698
-
7699
- if (!IS_LEAF(mp) && j == 0) {
7700
- /* First branch index doesn't need key data. */
7701
- rkey.mv_size = 0;
7702
- }
7703
-
7704
- rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
7705
- if (rc) break;
7706
- }
7707
-
7708
- nkeys = NUMKEYS(copy);
7709
- for (i=0; i<nkeys; i++)
7710
- mp->mp_ptrs[i] = copy->mp_ptrs[i];
7711
- mp->mp_lower = copy->mp_lower;
7712
- mp->mp_upper = copy->mp_upper;
7713
- memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7714
- mc->mc_txn->mt_env->me_psize - copy->mp_upper);
7715
-
7716
- /* reset back to original page */
7717
- if (newindx < split_indx || (!newpos && newindx == split_indx)) {
7718
- mc->mc_pg[mc->mc_top] = mp;
7719
- if (nflags & MDB_RESERVE) {
7720
- node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7721
- if (!(node->mn_flags & F_BIGDATA))
7722
- newdata->mv_data = NODEDATA(node);
7723
- }
7724
- } else {
7725
- mc->mc_ki[ptop]++;
7726
- /* Make sure mc_ki is still valid.
7727
- */
7728
- if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7729
- mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7730
- for (i=0; i<ptop; i++) {
7731
- mc->mc_pg[i] = mn.mc_pg[i];
7732
- mc->mc_ki[i] = mn.mc_ki[i];
7753
+ mc->mc_pg[mc->mc_top] = rp;
7754
+ mc->mc_ki[ptop]++;
7755
+ /* Make sure mc_ki is still valid.
7756
+ */
7757
+ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7758
+ mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7759
+ for (i=0; i<ptop; i++) {
7760
+ mc->mc_pg[i] = mn.mc_pg[i];
7761
+ mc->mc_ki[i] = mn.mc_ki[i];
7762
+ }
7763
+ mc->mc_pg[ptop] = mn.mc_pg[ptop];
7764
+ mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7733
7765
  }
7734
- mc->mc_pg[ptop] = mn.mc_pg[ptop];
7735
- mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7736
7766
  }
7767
+ /* return tmp page to freelist */
7768
+ mdb_page_free(env, copy);
7737
7769
  }
7738
7770
 
7739
- /* return tmp page to freelist */
7740
- mdb_page_free(mc->mc_txn->mt_env, copy);
7741
- done:
7742
7771
  {
7743
7772
  /* Adjust other cursors pointing to mp */
7744
7773
  MDB_cursor *m2, *m3;
7745
7774
  MDB_dbi dbi = mc->mc_dbi;
7746
7775
  int fixup = NUMKEYS(mp);
7747
7776
 
7748
- if (mc->mc_flags & C_SUB)
7749
- dbi--;
7750
-
7751
7777
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7752
7778
  if (mc->mc_flags & C_SUB)
7753
7779
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7789,6 +7815,7 @@ done:
7789
7815
  }
7790
7816
  }
7791
7817
  }
7818
+ DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
7792
7819
  return rc;
7793
7820
  }
7794
7821
 
@@ -7805,13 +7832,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
7805
7832
  if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
7806
7833
  return EINVAL;
7807
7834
 
7808
- if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7809
- return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7810
-
7811
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
7812
- return MDB_BAD_VALSIZE;
7813
- }
7814
-
7815
7835
  if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
7816
7836
  return EINVAL;
7817
7837
 
@@ -7851,6 +7871,16 @@ mdb_env_get_path(MDB_env *env, const char **arg)
7851
7871
  return MDB_SUCCESS;
7852
7872
  }
7853
7873
 
7874
+ int
7875
+ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
7876
+ {
7877
+ if (!env || !arg)
7878
+ return EINVAL;
7879
+
7880
+ *arg = env->me_fd;
7881
+ return MDB_SUCCESS;
7882
+ }
7883
+
7854
7884
  /** Common code for #mdb_stat() and #mdb_env_stat().
7855
7885
  * @param[in] env the environment to operate in.
7856
7886
  * @param[in] db the #MDB_db record containing the stats to return.
@@ -8075,7 +8105,7 @@ mdb_drop0(MDB_cursor *mc, int subs)
8075
8105
  {
8076
8106
  int rc;
8077
8107
 
8078
- rc = mdb_page_search(mc, NULL, 0);
8108
+ rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
8079
8109
  if (rc == MDB_SUCCESS) {
8080
8110
  MDB_txn *txn = mc->mc_txn;
8081
8111
  MDB_node *ni;
@@ -8273,10 +8303,10 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8273
8303
  return 0;
8274
8304
  }
8275
8305
 
8276
- /* insert pid into list if not already present.
8306
+ /** Insert pid into list if not already present.
8277
8307
  * return -1 if already present.
8278
8308
  */
8279
- static int mdb_pid_insert(pid_t *ids, pid_t pid)
8309
+ static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8280
8310
  {
8281
8311
  /* binary search of pid in list */
8282
8312
  unsigned base = 0;
@@ -8301,7 +8331,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid)
8301
8331
  return -1;
8302
8332
  }
8303
8333
  }
8304
-
8334
+
8305
8335
  if( val > 0 ) {
8306
8336
  ++cursor;
8307
8337
  }
@@ -8316,7 +8346,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
8316
8346
  {
8317
8347
  unsigned int i, j, rdrs;
8318
8348
  MDB_reader *mr;
8319
- pid_t *pids, pid;
8349
+ MDB_PID_T *pids, pid;
8320
8350
  int count = 0;
8321
8351
 
8322
8352
  if (!env)
@@ -8326,7 +8356,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
8326
8356
  if (!env->me_txns)
8327
8357
  return MDB_SUCCESS;
8328
8358
  rdrs = env->me_txns->mti_numreaders;
8329
- pids = malloc((rdrs+1) * sizeof(pid_t));
8359
+ pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
8330
8360
  if (!pids)
8331
8361
  return ENOMEM;
8332
8362
  pids[0] = 0;
@@ -8342,6 +8372,8 @@ int mdb_reader_check(MDB_env *env, int *dead)
8342
8372
  if (!mdb_reader_pid(env, Pidcheck, pid)) {
8343
8373
  for (j=i; j<rdrs; j++)
8344
8374
  if (mr[j].mr_pid == pid) {
8375
+ DPRINTF(("clear stale reader pid %u txn %"Z"d",
8376
+ (unsigned) pid, mr[j].mr_txnid));
8345
8377
  mr[j].mr_pid = 0;
8346
8378
  count++;
8347
8379
  }