lmdb 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c192851303ba3bd76a139142c94322233284d3ed
4
- data.tar.gz: b9482f90f6599d71f49989246fe9f2c526f47986
3
+ metadata.gz: 1e16a42693e5150e076829337a0d44feb46bc3f3
4
+ data.tar.gz: ce0bc77de0cf3c7e17df2522c8eab69e760e1221
5
5
  SHA512:
6
- metadata.gz: e8fa0b74c2854ebd13ba07663ee65be141cefe08ae55075f6fd8817c805e21f48d32c886d687f4e85334958bfa1d9f39935a9764d343755dec3adbd7b3d67b08
7
- data.tar.gz: c760928ef6d4fb1edb6a5a5d5432b9a8823af38e12457efaef74640aef3b7a6e7956bd258cf22b68c54a89136742fc48329c78ac0ce9db2bbfa1b135d8dd3a8a
6
+ metadata.gz: e02924de6a59386ec215b45baf321a6d098d818900f7816a3805185ab5b7441427cc5af866d2d8141eecfd1d166a659ebc2a87d5ae7626a9a574bd4d1bd4bf44
7
+ data.tar.gz: 3ee55c5879fd385e53d25eeffd694dab0de03c5b152fe07562f9b631de25ccd90f09ae886a1d30b7f9abd6ea3a8da3fedac0b1f9d69dbffc3874bfad18570dd9
data/.travis.yml CHANGED
@@ -3,6 +3,7 @@ rvm:
3
3
  - 1.8.7
4
4
  - 1.9.3
5
5
  - 2.0.0
6
+ - 2.1.0
6
7
  - ruby-head
7
8
  - rbx-18mode
8
9
  - rbx-19mode
data/CHANGES CHANGED
@@ -1,3 +1,7 @@
1
+ 0.4.0
2
+
3
+ * Print warnings if open LMDB objects are found during garbage collection
4
+
1
5
  0.3.1
2
6
 
3
7
  * Minor fixes
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # LMDB
2
2
 
3
- Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LLMDB)
3
+ [![Gittip donate button](http://img.shields.io/gittip/bevry.png)](https://www.gittip.com/min4d/ "Donate weekly to this project using Gittip")
4
+ [![Flattr this git repo](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=min4d&url=https://github.com/minad/lmdb&title=LMDB&language=&tags=github&category=software)
5
+
6
+ Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LMDB)
4
7
  http://symas.com/mdb/
5
8
 
6
9
  ### Installation
@@ -1,6 +1,31 @@
1
1
  LMDB 0.9 Change Log
2
2
 
3
- LMDB 0.9.8 Engineering
3
+ LMDB 0.9.10 Release (2013/11/12)
4
+ Add MDB_NOMEMINIT option
5
+ Fix mdb_page_split() again (ITS#7589)
6
+ Fix MDB_NORDAHEAD definition (ITS#7734)
7
+ Fix mdb_cursor_del() positioning (ITS#7733)
8
+ Partial fix for larger page sizes (ITS#7713)
9
+ Fix Windows64/MSVC build issues
10
+
11
+ LMDB 0.9.9 Release (2013/10/24)
12
+ Add mdb_env_get_fd()
13
+ Add MDB_NORDAHEAD option
14
+ Add MDB_NOLOCK option
15
+ Avoid wasting space in mdb_page_split() (ITS#7589)
16
+ Fix mdb_page_merge() cursor fixup (ITS#7722)
17
+ Fix mdb_cursor_del() on last delete (ITS#7718)
18
+ Fix adding WRITEMAP on existing env (ITS#7715)
19
+ Fixes for nested txns (ITS#7515)
20
+ Fix mdb_env_copy() O_DIRECT bug (ITS#7682)
21
+ Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681)
22
+ Fix mdb_rebalance() cursor fixup (ITS#7701)
23
+ Misc code cleanup
24
+ Documentation
25
+ Note that by default, readers need write access
26
+
27
+
28
+ LMDB 0.9.8 Release (2013/09/09)
4
29
  Allow mdb_env_set_mapsize() on an open environment
5
30
  Fix mdb_dbi_flags() (ITS#7672)
6
31
  Fix mdb_page_unspill() in nested txns
@@ -66,6 +66,20 @@
66
66
  * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
67
67
  * Multiple users can cause startup to fail later, as noted above.
68
68
  *
69
+ * - There is normally no pure read-only mode, since readers need write
70
+ * access to locks and lock file. Exceptions: On read-only filesystems
71
+ * or with the #MDB_NOLOCK flag described under #mdb_env_open().
72
+ *
73
+ * - By default, in versions before 0.9.10, unused portions of the data
74
+ * file might receive garbage data from memory freed by other code.
75
+ * (This does not happen when using the #MDB_WRITEMAP flag.) As of
76
+ * 0.9.10 the default behavior is to initialize such memory before
77
+ * writing to the data file. Since there may be a slight performance
78
+ * cost due to this initialization, applications may disable it using
79
+ * the #MDB_NOMEMINIT flag. Applications handling sensitive data
80
+ * which must not be written should not use this flag. This flag is
81
+ * irrelevant when using #MDB_WRITEMAP.
82
+ *
69
83
  * - A thread can only use one transaction at a time, plus any child
70
84
  * transactions. Each transaction belongs to one thread. See below.
71
85
  * The #MDB_NOTLS flag changes this for read-only transactions.
@@ -170,7 +184,7 @@ typedef int mdb_filehandle_t;
170
184
  /** Library minor version */
171
185
  #define MDB_VERSION_MINOR 9
172
186
  /** Library patch version */
173
- #define MDB_VERSION_PATCH 8
187
+ #define MDB_VERSION_PATCH 10
174
188
 
175
189
  /** Combine args a,b,c into a single integer for easy version comparisons */
176
190
  #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@@ -180,7 +194,7 @@ typedef int mdb_filehandle_t;
180
194
  MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
181
195
 
182
196
  /** The release date of this library version */
183
- #define MDB_VERSION_DATE "September 9, 2013"
197
+ #define MDB_VERSION_DATE "November 11, 2013"
184
198
 
185
199
  /** A stringifier for the version info */
186
200
  #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
@@ -216,13 +230,13 @@ typedef struct MDB_cursor MDB_cursor;
216
230
  /** @brief Generic structure used for passing keys and data in and out
217
231
  * of the database.
218
232
  *
219
- * Key sizes must be between 1 and the liblmdb build-time constant
220
- * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
221
- * same applies to data sizes in databases with the #MDB_DUPSORT flag.
222
- * Other data items can in theory be from 0 to 0xffffffff bytes long.
223
- *
224
233
  * Values returned from the database are valid only until a subsequent
225
- * update operation, or the end of the transaction.
234
+ * update operation, or the end of the transaction. Do not modify or
235
+ * free them, since they commonly point into the database itself.
236
+ *
237
+ * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
238
+ * The same applies to data sizes in databases with the #MDB_DUPSORT flag.
239
+ * Other data items can in theory be from 0 to 0xffffffff bytes long.
226
240
  */
227
241
  typedef struct MDB_val {
228
242
  size_t mv_size; /**< size of the data item */
@@ -265,10 +279,16 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
265
279
  #define MDB_NOMETASYNC 0x40000
266
280
  /** use writable mmap */
267
281
  #define MDB_WRITEMAP 0x80000
268
- /** use asynchronous msync when MDB_WRITEMAP is used */
282
+ /** use asynchronous msync when #MDB_WRITEMAP is used */
269
283
  #define MDB_MAPASYNC 0x100000
270
284
  /** tie reader locktable slots to #MDB_txn objects instead of to threads */
271
285
  #define MDB_NOTLS 0x200000
286
+ /** don't do any locking, caller must manage their own locks */
287
+ #define MDB_NOLOCK 0x400000
288
+ /** don't do readahead (no effect on Windows) */
289
+ #define MDB_NORDAHEAD 0x800000
290
+ /** don't initialize malloc'd memory before writing to datafile */
291
+ #define MDB_NOMEMINIT 0x1000000
272
292
  /** @} */
273
293
 
274
294
  /** @defgroup mdb_dbi_open Database Flags
@@ -486,6 +506,8 @@ int mdb_env_create(MDB_env **env);
486
506
  * and uses fewer mallocs, but loses protection from application bugs
487
507
  * like wild pointer writes and other bad updates into the database.
488
508
  * Incompatible with nested transactions.
509
+ * Processes with and without MDB_WRITEMAP on the same environment do
510
+ * not cooperate well.
489
511
  * <li>#MDB_NOMETASYNC
490
512
  * Flush system buffers to disk only once per transaction, omit the
491
513
  * metadata flush. Defer that until the system flushes files to disk,
@@ -523,6 +545,38 @@ int mdb_env_create(MDB_env **env);
523
545
  * user threads over individual OS threads need this option. Such an
524
546
  * application must also serialize the write transactions in an OS
525
547
  * thread, since MDB's write locking is unaware of the user threads.
548
+ * <li>#MDB_NOLOCK
549
+ * Don't do any locking. If concurrent access is anticipated, the
550
+ * caller must manage all concurrency itself. For proper operation
551
+ * the caller must enforce single-writer semantics, and must ensure
552
+ * that no readers are using old transactions while a writer is
553
+ * active. The simplest approach is to use an exclusive lock so that
554
+ * no readers may be active at all when a writer begins.
555
+ * <li>#MDB_NORDAHEAD
556
+ * Turn off readahead. Most operating systems perform readahead on
557
+ * read requests by default. This option turns it off if the OS
558
+ * supports it. Turning it off may help random read performance
559
+ * when the DB is larger than RAM and system RAM is full.
560
+ * The option is not implemented on Windows.
561
+ * <li>#MDB_NOMEMINIT
562
+ * Don't initialize malloc'd memory before writing to unused spaces
563
+ * in the data file. By default, memory for pages written to the data
564
+ * file is obtained using malloc. While these pages may be reused in
565
+ * subsequent transactions, freshly malloc'd pages will be initialized
566
+ * to zeroes before use. This avoids persisting leftover data from other
567
+ * code (that used the heap and subsequently freed the memory) into the
568
+ * data file. Note that many other system libraries may allocate
569
+ * and free memory from the heap for arbitrary uses. E.g., stdio may
570
+ * use the heap for file I/O buffers. This initialization step has a
571
+ * modest performance cost so some applications may want to disable
572
+ * it using this flag. This option can be a problem for applications
573
+ * which handle sensitive data like passwords, and it makes memory
574
+ * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
575
+ * which writes directly to the mmap instead of using malloc for pages. The
576
+ * initialization is also skipped if #MDB_RESERVE is used; the
577
+ * caller is expected to overwrite all of the memory that was
578
+ * reserved in that case.
579
+ * This flag may be changed at any time using #mdb_env_set_flags().
526
580
  * </ul>
527
581
  * @param[in] mode The UNIX permissions to set on created files. This parameter
528
582
  * is ignored on Windows.
@@ -656,6 +710,18 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags);
656
710
  */
657
711
  int mdb_env_get_path(MDB_env *env, const char **path);
658
712
 
713
+ /** @brief Return the file descriptor for the given environment.
714
+ *
715
+ * @param[in] env An environment handle returned by #mdb_env_create()
716
+ * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
717
+ * @return A non-zero error value on failure and 0 on success. Some possible
718
+ * errors are:
719
+ * <ul>
720
+ * <li>EINVAL - an invalid parameter was specified.
721
+ * </ul>
722
+ */
723
+ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
724
+
659
725
  /** @brief Set the size of the memory map to use for this environment.
660
726
  *
661
727
  * The size should be a multiple of the OS page size. The default is
@@ -733,8 +799,10 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
733
799
 
734
800
  /** @brief Get the maximum size of a key for the environment.
735
801
  *
802
+ * This is the compile-time constant #MDB_MAXKEYSIZE, default 511.
803
+ * See @ref MDB_val.
736
804
  * @param[in] env An environment handle returned by #mdb_env_create()
737
- * @return The maximum size of a key. (#MDB_MAXKEYSIZE)
805
+ * @return The maximum size of a key
738
806
  */
739
807
  int mdb_env_get_maxkeysize(MDB_env *env);
740
808
 
@@ -1094,6 +1162,8 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
1094
1162
  * reserved space, which the caller can fill in later - before
1095
1163
  * the next update operation or the transaction ends. This saves
1096
1164
  * an extra memcpy if the data is being generated later.
1165
+ * MDB does nothing else with this memory; the caller is expected
1166
+ * to modify all of the space requested.
1097
1167
  * <li>#MDB_APPEND - append the given key/data pair to the end of the
1098
1168
  * database. No key comparisons are performed. This option allows
1099
1169
  * fast bulk loading when keys are already known to be in the
@@ -37,10 +37,26 @@
37
37
  #endif
38
38
  #include <sys/types.h>
39
39
  #include <sys/stat.h>
40
- #include <sys/param.h>
41
40
  #ifdef _WIN32
42
41
  #include <windows.h>
42
+ /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
43
+ * as int64 which is wrong. MSVC doesn't define it at all, so just
44
+ * don't use it.
45
+ */
46
+ #define MDB_PID_T int
47
+ #ifdef __GNUC__
48
+ # include <sys/param.h>
43
49
  #else
50
+ # define LITTLE_ENDIAN 1234
51
+ # define BIG_ENDIAN 4321
52
+ # define BYTE_ORDER LITTLE_ENDIAN
53
+ # ifndef SSIZE_MAX
54
+ # define SSIZE_MAX INT_MAX
55
+ # endif
56
+ #endif
57
+ #else
58
+ #define MDB_PID_T pid_t
59
+ #include <sys/param.h>
44
60
  #include <sys/uio.h>
45
61
  #include <sys/mman.h>
46
62
  #ifdef HAVE_SYS_FILE_H
@@ -75,6 +91,7 @@
75
91
  #ifndef _WIN32
76
92
  #include <pthread.h>
77
93
  #ifdef MDB_USE_POSIX_SEM
94
+ # define MDB_USE_HASH 1
78
95
  #include <semaphore.h>
79
96
  #endif
80
97
  #endif
@@ -140,6 +157,7 @@
140
157
  * @{
141
158
  */
142
159
  #ifdef _WIN32
160
+ #define MDB_USE_HASH 1
143
161
  #define MDB_PIDLOCK 0
144
162
  #define pthread_t DWORD
145
163
  #define pthread_mutex_t HANDLE
@@ -171,7 +189,7 @@
171
189
  #define Z "I"
172
190
  #else
173
191
 
174
- #define Z "z"
192
+ #define Z "z" /**< printf format modifier for size_t */
175
193
 
176
194
  /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
177
195
  #define MDB_PIDLOCK 1
@@ -317,12 +335,18 @@ static txnid_t mdb_debug_start;
317
335
  * The string is printed literally, with no format processing.
318
336
  */
319
337
  #define DPUTS(arg) DPRINTF(("%s", arg))
338
+ /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
339
+ #define DDBI(mc) \
340
+ (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
320
341
  /** @} */
321
342
 
322
- /** A default memory page size.
323
- * The actual size is platform-dependent, but we use this for
324
- * boot-strapping. We probably should not be using this any more.
325
- * The #GET_PAGESIZE() macro is used to get the actual size.
343
+ /** @brief The maximum size of a database page.
344
+ *
345
+ * This is 32k, since it must fit in #MDB_page.#mp_upper.
346
+ *
347
+ * LMDB will use database pages < OS pages if needed.
348
+ * That causes more I/O in write transactions: The OS must
349
+ * know (read) the whole page before writing a partial page.
326
350
  *
327
351
  * Note that we don't currently support Huge pages. On Linux,
328
352
  * regular data files cannot use Huge pages, and in general
@@ -331,7 +355,7 @@ static txnid_t mdb_debug_start;
331
355
  * pressure from other processes is high. So until OSs have
332
356
  * actual paging support for Huge pages, they're not viable.
333
357
  */
334
- #define MDB_PAGESIZE 4096
358
+ #define MAX_PAGESIZE 0x8000
335
359
 
336
360
  /** The minimum number of keys required in a database page.
337
361
  * Setting this to a larger value will place a smaller bound on the
@@ -365,7 +389,7 @@ static txnid_t mdb_debug_start;
365
389
  *
366
390
  * We require that keys all fit onto a regular page. This limit
367
391
  * could be raised a bit further if needed; to something just
368
- * under #MDB_PAGESIZE / #MDB_MINKEYS.
392
+ * under (page size / #MDB_MINKEYS / 3).
369
393
  *
370
394
  * Note that data items in an #MDB_DUPSORT database are actually keys
371
395
  * of a subDB, so they're also limited to this size.
@@ -425,7 +449,8 @@ typedef uint16_t indx_t;
425
449
  *
426
450
  * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
427
451
  *
428
- * No reader table is used if the database is on a read-only filesystem.
452
+ * No reader table is used if the database is on a read-only filesystem, or
453
+ * if #MDB_NOLOCK is set.
429
454
  *
430
455
  * Since the database uses multi-version concurrency control, readers don't
431
456
  * actually need any locking. This table is used to keep track of which
@@ -488,7 +513,7 @@ typedef struct MDB_rxbody {
488
513
  */
489
514
  txnid_t mrb_txnid;
490
515
  /** The process ID of the process owning this reader txn. */
491
- pid_t mrb_pid;
516
+ MDB_PID_T mrb_pid;
492
517
  /** The thread ID of the thread owning this txn. */
493
518
  pthread_t mrb_tid;
494
519
  } MDB_rxbody;
@@ -600,7 +625,7 @@ typedef struct MDB_page {
600
625
  #define P_LEAF 0x02 /**< leaf page */
601
626
  #define P_OVERFLOW 0x04 /**< overflow page */
602
627
  #define P_META 0x08 /**< meta page */
603
- #define P_DIRTY 0x10 /**< dirty page */
628
+ #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
604
629
  #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
605
630
  #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
606
631
  #define P_KEEP 0x8000 /**< leave this page alone during spill */
@@ -786,7 +811,10 @@ typedef struct MDB_db {
786
811
  /** Handle for the default DB. */
787
812
  #define MAIN_DBI 1
788
813
 
789
- /** Meta page content. */
814
+ /** Meta page content.
815
+ * A meta page is the start point for accessing a database snapshot.
816
+ * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
817
+ */
790
818
  typedef struct MDB_meta {
791
819
  /** Stamp identifying this as an MDB file. It must be set
792
820
  * to #MDB_MAGIC. */
@@ -804,19 +832,18 @@ typedef struct MDB_meta {
804
832
  txnid_t mm_txnid; /**< txnid that committed this page */
805
833
  } MDB_meta;
806
834
 
807
- /** Buffer for a stack-allocated dirty page.
835
+ /** Buffer for a stack-allocated meta page.
808
836
  * The members define size and alignment, and silence type
809
837
  * aliasing warnings. They are not used directly; that could
810
838
  * mean incorrectly using several union members in parallel.
811
839
  */
812
- typedef union MDB_pagebuf {
813
- char mb_raw[MDB_PAGESIZE];
840
+ typedef union MDB_metabuf {
814
841
  MDB_page mb_page;
815
842
  struct {
816
843
  char mm_pad[PAGEHDRSZ];
817
844
  MDB_meta mm_meta;
818
845
  } mb_metabuf;
819
- } MDB_pagebuf;
846
+ } MDB_metabuf;
820
847
 
821
848
  /** Auxiliary DB info.
822
849
  * The information here is mostly static/read-only. There is
@@ -865,9 +892,9 @@ struct MDB_txn {
865
892
  * @ingroup internal
866
893
  * @{
867
894
  */
868
- #define DB_DIRTY 0x01 /**< DB was written in this txn */
869
- #define DB_STALE 0x02 /**< DB record is older than txnID */
870
- #define DB_NEW 0x04 /**< DB handle opened in this txn */
895
+ #define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */
896
+ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */
897
+ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
871
898
  #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
872
899
  /** @} */
873
900
  /** In write txns, array of cursors for each DB */
@@ -889,12 +916,12 @@ struct MDB_txn {
889
916
  #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
890
917
  /** @} */
891
918
  unsigned int mt_flags; /**< @ref mdb_txn */
892
- /** dirty_list maxsize - # of allocated pages allowed, including in parent txns */
893
- unsigned int mt_dirty_room;
894
- /** Tracks which of the two meta pages was used at the start
895
- * of this transaction.
919
+ /** dirty_list room: Array size - #dirty pages visible to this txn.
920
+ * Includes ancestor txns' dirty pages not hidden by other txns'
921
+ * dirty/spilled pages. Thus commit(nested txn) has room to merge
922
+ * dirty_list into mt_parent after freeing hidden mt_parent pages.
896
923
  */
897
- unsigned int mt_toggle;
924
+ unsigned int mt_dirty_room;
898
925
  };
899
926
 
900
927
  /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
@@ -905,7 +932,14 @@ struct MDB_txn {
905
932
 
906
933
  struct MDB_xcursor;
907
934
 
908
- /** Cursors are used for all DB operations */
935
+ /** Cursors are used for all DB operations.
936
+ * A cursor holds a path of (page pointer, key index) from the DB
937
+ * root to a position in the DB, plus other state. #MDB_DUPSORT
938
+ * cursors include an xcursor to the current data item. Write txns
939
+ * track their cursors and keep them up to date when data moves.
940
+ * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
941
+ * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
942
+ */
909
943
  struct MDB_cursor {
910
944
  /** Next cursor on this DB in this txn */
911
945
  MDB_cursor *mc_next;
@@ -978,16 +1012,18 @@ struct MDB_env {
978
1012
  /** Have liveness lock in reader table */
979
1013
  #define MDB_LIVE_READER 0x08000000U
980
1014
  uint32_t me_flags; /**< @ref mdb_env */
981
- unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
1015
+ unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1016
+ unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
982
1017
  unsigned int me_maxreaders; /**< size of the reader table */
983
1018
  unsigned int me_numreaders; /**< max numreaders set by this env */
984
1019
  MDB_dbi me_numdbs; /**< number of DBs opened */
985
1020
  MDB_dbi me_maxdbs; /**< size of the DB table */
986
- pid_t me_pid; /**< process ID of this env */
1021
+ MDB_PID_T me_pid; /**< process ID of this env */
987
1022
  char *me_path; /**< path to the DB files */
988
1023
  char *me_map; /**< the memory map of the data file */
989
1024
  MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
990
1025
  MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1026
+ void *me_pbuf; /**< scratch area for DUPSORT put() */
991
1027
  MDB_txn *me_txn; /**< current write transaction */
992
1028
  size_t me_mapsize; /**< size of the data memory map */
993
1029
  off_t me_size; /**< current file size */
@@ -1019,8 +1055,8 @@ struct MDB_env {
1019
1055
 
1020
1056
  /** Nested transaction */
1021
1057
  typedef struct MDB_ntxn {
1022
- MDB_txn mnt_txn; /* the transaction */
1023
- MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */
1058
+ MDB_txn mnt_txn; /**< the transaction */
1059
+ MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
1024
1060
  } MDB_ntxn;
1025
1061
 
1026
1062
  /** max number of pages to commit in one writev() call */
@@ -1042,6 +1078,8 @@ static int mdb_page_search_root(MDB_cursor *mc,
1042
1078
  MDB_val *key, int modify);
1043
1079
  #define MDB_PS_MODIFY 1
1044
1080
  #define MDB_PS_ROOTONLY 2
1081
+ #define MDB_PS_FIRST 4
1082
+ #define MDB_PS_LAST 8
1045
1083
  static int mdb_page_search(MDB_cursor *mc,
1046
1084
  MDB_val *key, int flags);
1047
1085
  static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
@@ -1255,7 +1293,7 @@ static void mdb_audit(MDB_txn *txn)
1255
1293
  txn->mt_dbs[i].md_leaf_pages +
1256
1294
  txn->mt_dbs[i].md_overflow_pages;
1257
1295
  if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1258
- mdb_page_search(&mc, NULL, 0);
1296
+ mdb_page_search(&mc, NULL, MDB_PS_FIRST);
1259
1297
  do {
1260
1298
  unsigned j;
1261
1299
  MDB_page *mp;
@@ -1300,7 +1338,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1300
1338
  {
1301
1339
  MDB_env *env = txn->mt_env;
1302
1340
  MDB_page *ret = env->me_dpages;
1303
- size_t sz = env->me_psize;
1341
+ size_t psize = env->me_psize, sz = psize, off;
1342
+ /* Unless #MDB_NOMEMINIT is set, psize counts how many bytes to init.
1343
+ * For a single page alloc, we init everything after the page header.
1344
+ * For multi-page, we init the final page; if the caller needed that
1345
+ * many pages they will be filling in at least up to the last page.
1346
+ */
1304
1347
  if (num == 1) {
1305
1348
  if (ret) {
1306
1349
  VGMEMP_ALLOC(env, ret, sz);
@@ -1308,10 +1351,16 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1308
1351
  env->me_dpages = ret->mp_next;
1309
1352
  return ret;
1310
1353
  }
1354
+ psize -= off = PAGEHDRSZ;
1311
1355
  } else {
1312
1356
  sz *= num;
1357
+ off = sz - psize;
1313
1358
  }
1314
1359
  if ((ret = malloc(sz)) != NULL) {
1360
+ if (!(env->me_flags & MDB_NOMEMINIT)) {
1361
+ memset((char *)ret + off, 0, psize);
1362
+ ret->mp_pad = 0;
1363
+ }
1315
1364
  VGMEMP_ALLOC(env, ret, sz);
1316
1365
  }
1317
1366
  return ret;
@@ -1329,7 +1378,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
1329
1378
  env->me_dpages = mp;
1330
1379
  }
1331
1380
 
1332
- /* Free a dirty page */
1381
+ /** Free a dirty page */
1333
1382
  static void
1334
1383
  mdb_dpage_free(MDB_env *env, MDB_page *dp)
1335
1384
  {
@@ -1356,7 +1405,7 @@ mdb_dlist_free(MDB_txn *txn)
1356
1405
  dl[0].mid = 0;
1357
1406
  }
1358
1407
 
1359
- /* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1408
+ /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1360
1409
  * @param[in] mc A cursor handle for the current operation.
1361
1410
  * @param[in] pflags Flags of the pages to update:
1362
1411
  * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
@@ -1366,10 +1415,12 @@ mdb_dlist_free(MDB_txn *txn)
1366
1415
  static int
1367
1416
  mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1368
1417
  {
1418
+ enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
1369
1419
  MDB_txn *txn = mc->mc_txn;
1370
1420
  MDB_cursor *m3;
1371
1421
  MDB_xcursor *mx;
1372
- MDB_page *dp;
1422
+ MDB_page *dp, *mp;
1423
+ MDB_node *leaf;
1373
1424
  unsigned i, j;
1374
1425
  int rc = MDB_SUCCESS, level;
1375
1426
 
@@ -1378,14 +1429,24 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1378
1429
  mc = NULL; /* will find mc in mt_cursors */
1379
1430
  for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
1380
1431
  for (; mc; mc=mc->mc_next) {
1381
- for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) {
1382
- for (j=0; j<m3->mc_snum; j++)
1383
- if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP))
1384
- == pflags)
1385
- m3->mc_pg[j]->mp_flags ^= P_KEEP;
1386
- mx = m3->mc_xcursor;
1387
- if (mx == NULL)
1388
- break;
1432
+ if (!(mc->mc_flags & C_INITIALIZED))
1433
+ continue;
1434
+ for (m3 = mc;; m3 = &mx->mx_cursor) {
1435
+ mp = NULL;
1436
+ for (j=0; j<m3->mc_snum; j++) {
1437
+ mp = m3->mc_pg[j];
1438
+ if ((mp->mp_flags & Mask) == pflags)
1439
+ mp->mp_flags ^= P_KEEP;
1440
+ }
1441
+ mx = m3->mc_xcursor;
1442
+ /* Proceed to mx if it is at a sub-database */
1443
+ if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
1444
+ break;
1445
+ if (! (mp && (mp->mp_flags & P_LEAF)))
1446
+ break;
1447
+ leaf = NODEPTR(mp, m3->mc_ki[j-1]);
1448
+ if (!(leaf->mn_flags & F_SUBDATA))
1449
+ break;
1389
1450
  }
1390
1451
  }
1391
1452
  if (i == 0)
@@ -1401,7 +1462,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1401
1462
  continue;
1402
1463
  if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
1403
1464
  break;
1404
- if ((dp->mp_flags & (P_DIRTY|P_KEEP)) == pflags && level <= 1)
1465
+ if ((dp->mp_flags & Mask) == pflags && level <= 1)
1405
1466
  dp->mp_flags ^= P_KEEP;
1406
1467
  }
1407
1468
  }
@@ -1415,15 +1476,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
1415
1476
  /** Spill pages from the dirty list back to disk.
1416
1477
  * This is intended to prevent running into #MDB_TXN_FULL situations,
1417
1478
  * but note that they may still occur in a few cases:
1418
- * 1) pages in #MDB_DUPSORT sub-DBs are never spilled, so if there
1419
- * are too many of these dirtied in one txn, the txn may still get
1420
- * too full.
1479
+ * 1) our estimate of the txn size could be too small. Currently this
1480
+ * seems unlikely, except with a large number of #MDB_MULTIPLE items.
1421
1481
  * 2) child txns may run out of space if their parents dirtied a
1422
1482
  * lot of pages and never spilled them. TODO: we probably should do
1423
1483
  * a preemptive spill during #mdb_txn_begin() of a child txn, if
1424
1484
  * the parent's dirty_room is below a given threshold.
1425
- * 3) our estimate of the txn size could be too small. At the
1426
- * moment this seems unlikely.
1427
1485
  *
1428
1486
  * Otherwise, if not using nested txns, it is expected that apps will
1429
1487
  * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
@@ -1541,31 +1599,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1541
1599
  rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
1542
1600
 
1543
1601
  done:
1544
- if (rc == 0) {
1545
- if (txn->mt_parent) {
1546
- txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid;
1547
- /* dirty pages that are dirty in an ancestor don't
1548
- * count against this txn's dirty_room.
1549
- */
1550
- for (i=1; i<=dl[0].mid; i++) {
1551
- pgno_t pgno = dl[i].mid;
1552
- MDB_txn *tx2;
1553
- for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1554
- j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
1555
- if (j <= tx2->mt_u.dirty_list[0].mid &&
1556
- tx2->mt_u.dirty_list[j].mid == pgno) {
1557
- txn->mt_dirty_room++;
1558
- break;
1559
- }
1560
- }
1561
- }
1562
- } else {
1563
- txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid;
1564
- }
1565
- txn->mt_flags |= MDB_TXN_SPILLS;
1566
- } else {
1567
- txn->mt_flags |= MDB_TXN_ERROR;
1568
- }
1602
+ txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
1569
1603
  return rc;
1570
1604
  }
1571
1605
 
@@ -1575,12 +1609,14 @@ mdb_find_oldest(MDB_txn *txn)
1575
1609
  {
1576
1610
  int i;
1577
1611
  txnid_t mr, oldest = txn->mt_txnid - 1;
1578
- MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1579
- for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1580
- if (r[i].mr_pid) {
1581
- mr = r[i].mr_txnid;
1582
- if (oldest > mr)
1583
- oldest = mr;
1612
+ if (txn->mt_env->me_txns) {
1613
+ MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1614
+ for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1615
+ if (r[i].mr_pid) {
1616
+ mr = r[i].mr_txnid;
1617
+ if (oldest > mr)
1618
+ oldest = mr;
1619
+ }
1584
1620
  }
1585
1621
  }
1586
1622
  return oldest;
@@ -1790,26 +1826,28 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1790
1826
  /** Pull a page off the txn's spill list, if present.
1791
1827
  * If a page being referenced was spilled to disk in this txn, bring
1792
1828
  * it back and make it dirty/writable again.
1793
- * @param[in] tx0 the transaction handle.
1829
+ * @param[in] txn the transaction handle.
1794
1830
  * @param[in] mp the page being referenced.
1795
1831
  * @param[out] ret the writable page, if any. ret is unchanged if
1796
1832
  * mp wasn't spilled.
1797
1833
  */
1798
1834
  static int
1799
- mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1835
+ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
1800
1836
  {
1801
- MDB_env *env = tx0->mt_env;
1802
- MDB_txn *txn;
1837
+ MDB_env *env = txn->mt_env;
1838
+ const MDB_txn *tx2;
1803
1839
  unsigned x;
1804
1840
  pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
1805
1841
 
1806
- for (txn = tx0; txn; txn=txn->mt_parent) {
1807
- if (!txn->mt_spill_pgs)
1842
+ for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
1843
+ if (!tx2->mt_spill_pgs)
1808
1844
  continue;
1809
- x = mdb_midl_search(txn->mt_spill_pgs, pn);
1810
- if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) {
1845
+ x = mdb_midl_search(tx2->mt_spill_pgs, pn);
1846
+ if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
1811
1847
  MDB_page *np;
1812
1848
  int num;
1849
+ if (txn->mt_dirty_room == 0)
1850
+ return MDB_TXN_FULL;
1813
1851
  if (IS_OVERFLOW(mp))
1814
1852
  num = mp->mp_pages;
1815
1853
  else
@@ -1825,7 +1863,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1825
1863
  else
1826
1864
  mdb_page_copy(np, mp, env->me_psize);
1827
1865
  }
1828
- if (txn == tx0) {
1866
+ if (tx2 == txn) {
1829
1867
  /* If in current txn, this page is no longer spilled.
1830
1868
  * If it happens to be the last page, truncate the spill list.
1831
1869
  * Otherwise mark it as deleted by setting the LSB.
@@ -1838,22 +1876,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
1838
1876
  * page remains spilled until child commits
1839
1877
  */
1840
1878
 
1841
- if (txn->mt_parent) {
1842
- MDB_txn *tx2;
1843
- /* If this page is also in a parent's dirty list, then
1844
- * it's already accounted in dirty_room, and we need to
1845
- * cancel out the decrement that mdb_page_dirty does.
1846
- */
1847
- for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
1848
- x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
1849
- if (x <= tx2->mt_u.dirty_list[0].mid &&
1850
- tx2->mt_u.dirty_list[x].mid == pgno) {
1851
- tx0->mt_dirty_room++;
1852
- break;
1853
- }
1854
- }
1855
- }
1856
- mdb_page_dirty(tx0, np);
1879
+ mdb_page_dirty(txn, np);
1857
1880
  np->mp_flags |= P_DIRTY;
1858
1881
  *ret = np;
1859
1882
  break;
@@ -1872,7 +1895,6 @@ mdb_page_touch(MDB_cursor *mc)
1872
1895
  MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
1873
1896
  MDB_txn *txn = mc->mc_txn;
1874
1897
  MDB_cursor *m2, *m3;
1875
- MDB_dbi dbi;
1876
1898
  pgno_t pgno;
1877
1899
  int rc;
1878
1900
 
@@ -1889,7 +1911,8 @@ mdb_page_touch(MDB_cursor *mc)
1889
1911
  (rc = mdb_page_alloc(mc, 1, &np)))
1890
1912
  return rc;
1891
1913
  pgno = np->mp_pgno;
1892
- DPRINTF(("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno));
1914
+ DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
1915
+ mp->mp_pgno, pgno));
1893
1916
  assert(mp->mp_pgno != pgno);
1894
1917
  mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
1895
1918
  /* Update the parent page, if any, to point to the new page */
@@ -1935,17 +1958,16 @@ mdb_page_touch(MDB_cursor *mc)
1935
1958
  done:
1936
1959
  /* Adjust cursors pointing to mp */
1937
1960
  mc->mc_pg[mc->mc_top] = np;
1938
- dbi = mc->mc_dbi;
1961
+ m2 = txn->mt_cursors[mc->mc_dbi];
1939
1962
  if (mc->mc_flags & C_SUB) {
1940
- dbi--;
1941
- for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
1963
+ for (; m2; m2=m2->mc_next) {
1942
1964
  m3 = &m2->mc_xcursor->mx_cursor;
1943
1965
  if (m3->mc_snum < mc->mc_snum) continue;
1944
1966
  if (m3->mc_pg[mc->mc_top] == mp)
1945
1967
  m3->mc_pg[mc->mc_top] = np;
1946
1968
  }
1947
1969
  } else {
1948
- for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
1970
+ for (; m2; m2=m2->mc_next) {
1949
1971
  if (m2->mc_snum < mc->mc_snum) continue;
1950
1972
  if (m2->mc_pg[mc->mc_top] == mp) {
1951
1973
  m2->mc_pg[mc->mc_top] = np;
@@ -2087,7 +2109,7 @@ enum Pidlock_op {
2087
2109
  * lock on the lockfile, set at an offset equal to the pid.
2088
2110
  */
2089
2111
  static int
2090
- mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
2112
+ mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2091
2113
  {
2092
2114
  #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2093
2115
  int ret = 0;
@@ -2130,7 +2152,9 @@ static int
2130
2152
  mdb_txn_renew0(MDB_txn *txn)
2131
2153
  {
2132
2154
  MDB_env *env = txn->mt_env;
2133
- unsigned int i;
2155
+ MDB_txninfo *ti = env->me_txns;
2156
+ MDB_meta *meta;
2157
+ unsigned int i, nr;
2134
2158
  uint16_t x;
2135
2159
  int rc, new_notls = 0;
2136
2160
 
@@ -2139,9 +2163,9 @@ mdb_txn_renew0(MDB_txn *txn)
2139
2163
  txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
2140
2164
 
2141
2165
  if (txn->mt_flags & MDB_TXN_RDONLY) {
2142
- if (!env->me_txns) {
2143
- i = mdb_env_pick_meta(env);
2144
- txn->mt_txnid = env->me_metas[i]->mm_txnid;
2166
+ if (!ti) {
2167
+ meta = env->me_metas[ mdb_env_pick_meta(env) ];
2168
+ txn->mt_txnid = meta->mm_txnid;
2145
2169
  txn->mt_u.reader = NULL;
2146
2170
  } else {
2147
2171
  MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
@@ -2150,7 +2174,7 @@ mdb_txn_renew0(MDB_txn *txn)
2150
2174
  if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2151
2175
  return MDB_BAD_RSLOT;
2152
2176
  } else {
2153
- pid_t pid = env->me_pid;
2177
+ MDB_PID_T pid = env->me_pid;
2154
2178
  pthread_t tid = pthread_self();
2155
2179
 
2156
2180
  if (!(env->me_flags & MDB_LIVE_READER)) {
@@ -2163,36 +2187,43 @@ mdb_txn_renew0(MDB_txn *txn)
2163
2187
  }
2164
2188
 
2165
2189
  LOCK_MUTEX_R(env);
2166
- for (i=0; i<env->me_txns->mti_numreaders; i++)
2167
- if (env->me_txns->mti_readers[i].mr_pid == 0)
2190
+ nr = ti->mti_numreaders;
2191
+ for (i=0; i<nr; i++)
2192
+ if (ti->mti_readers[i].mr_pid == 0)
2168
2193
  break;
2169
2194
  if (i == env->me_maxreaders) {
2170
2195
  UNLOCK_MUTEX_R(env);
2171
2196
  return MDB_READERS_FULL;
2172
2197
  }
2173
- env->me_txns->mti_readers[i].mr_pid = pid;
2174
- env->me_txns->mti_readers[i].mr_tid = tid;
2175
- if (i >= env->me_txns->mti_numreaders)
2176
- env->me_txns->mti_numreaders = i+1;
2198
+ ti->mti_readers[i].mr_pid = pid;
2199
+ ti->mti_readers[i].mr_tid = tid;
2200
+ if (i == nr)
2201
+ ti->mti_numreaders = ++nr;
2177
2202
  /* Save numreaders for un-mutexed mdb_env_close() */
2178
- env->me_numreaders = env->me_txns->mti_numreaders;
2203
+ env->me_numreaders = nr;
2179
2204
  UNLOCK_MUTEX_R(env);
2180
- r = &env->me_txns->mti_readers[i];
2205
+
2206
+ r = &ti->mti_readers[i];
2181
2207
  new_notls = (env->me_flags & MDB_NOTLS);
2182
2208
  if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
2183
2209
  r->mr_pid = 0;
2184
2210
  return rc;
2185
2211
  }
2186
2212
  }
2187
- txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
2213
+ txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
2188
2214
  txn->mt_u.reader = r;
2215
+ meta = env->me_metas[txn->mt_txnid & 1];
2189
2216
  }
2190
- txn->mt_toggle = txn->mt_txnid & 1;
2191
2217
  } else {
2192
- LOCK_MUTEX_W(env);
2218
+ if (ti) {
2219
+ LOCK_MUTEX_W(env);
2193
2220
 
2194
- txn->mt_txnid = env->me_txns->mti_txnid;
2195
- txn->mt_toggle = txn->mt_txnid & 1;
2221
+ txn->mt_txnid = ti->mti_txnid;
2222
+ meta = env->me_metas[txn->mt_txnid & 1];
2223
+ } else {
2224
+ meta = env->me_metas[ mdb_env_pick_meta(env) ];
2225
+ txn->mt_txnid = meta->mm_txnid;
2226
+ }
2196
2227
  txn->mt_txnid++;
2197
2228
  #if MDB_DEBUG
2198
2229
  if (txn->mt_txnid == mdb_debug_start)
@@ -2208,10 +2239,10 @@ mdb_txn_renew0(MDB_txn *txn)
2208
2239
  }
2209
2240
 
2210
2241
  /* Copy the DB info and flags */
2211
- memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
2242
+ memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
2212
2243
 
2213
2244
  /* Moved to here to avoid a data race in read TXNs */
2214
- txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
2245
+ txn->mt_next_pgno = meta->mm_last_pg+1;
2215
2246
 
2216
2247
  for (i=2; i<txn->mt_numdbs; i++) {
2217
2248
  x = env->me_dbflags[i];
@@ -2307,7 +2338,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2307
2338
  return ENOMEM;
2308
2339
  }
2309
2340
  txn->mt_txnid = parent->mt_txnid;
2310
- txn->mt_toggle = parent->mt_toggle;
2311
2341
  txn->mt_dirty_room = parent->mt_dirty_room;
2312
2342
  txn->mt_u.dirty_list[0].mid = 0;
2313
2343
  txn->mt_spill_pgs = NULL;
@@ -2433,7 +2463,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act)
2433
2463
 
2434
2464
  env->me_txn = NULL;
2435
2465
  /* The writer mutex was locked in mdb_txn_begin. */
2436
- UNLOCK_MUTEX_W(env);
2466
+ if (env->me_txns)
2467
+ UNLOCK_MUTEX_W(env);
2437
2468
  }
2438
2469
  }
2439
2470
 
@@ -2482,20 +2513,26 @@ mdb_freelist_save(MDB_txn *txn)
2482
2513
  int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
2483
2514
  txnid_t pglast = 0, head_id = 0;
2484
2515
  pgno_t freecnt = 0, *free_pgs, *mop;
2485
- ssize_t head_room = 0, total_room = 0, mop_len;
2516
+ ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
2486
2517
 
2487
2518
  mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
2488
2519
 
2489
2520
  if (env->me_pghead) {
2490
2521
  /* Make sure first page of freeDB is touched and on freelist */
2491
- rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
2522
+ rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
2492
2523
  if (rc && rc != MDB_NOTFOUND)
2493
2524
  return rc;
2494
2525
  }
2495
2526
 
2527
+ /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2528
+ clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2529
+ ? SSIZE_MAX : maxfree_1pg;
2530
+
2496
2531
  for (;;) {
2497
2532
  /* Come back here after each Put() in case freelist changed */
2498
2533
  MDB_val key, data;
2534
+ pgno_t *pgs;
2535
+ ssize_t j;
2499
2536
 
2500
2537
  /* If using records from freeDB which we have not yet
2501
2538
  * deleted, delete them and any we reserved for me_pghead.
@@ -2516,9 +2553,7 @@ mdb_freelist_save(MDB_txn *txn)
2516
2553
  if (freecnt < txn->mt_free_pgs[0]) {
2517
2554
  if (!freecnt) {
2518
2555
  /* Make sure last page of freeDB is touched and on freelist */
2519
- key.mv_size = MDB_MAXKEYSIZE+1;
2520
- key.mv_data = NULL;
2521
- rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
2556
+ rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
2522
2557
  if (rc && rc != MDB_NOTFOUND)
2523
2558
  return rc;
2524
2559
  }
@@ -2581,11 +2616,16 @@ mdb_freelist_save(MDB_txn *txn)
2581
2616
  rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2582
2617
  if (rc)
2583
2618
  return rc;
2584
- *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
2619
+ /* IDL is initially empty, zero out at least the length */
2620
+ pgs = (pgno_t *)data.mv_data;
2621
+ j = head_room > clean_limit ? head_room : 0;
2622
+ do {
2623
+ pgs[j] = 0;
2624
+ } while (--j >= 0);
2585
2625
  total_room += head_room;
2586
2626
  }
2587
2627
 
2588
- /* Fill in the reserved, touched me_pghead records */
2628
+ /* Fill in the reserved me_pghead records */
2589
2629
  rc = MDB_SUCCESS;
2590
2630
  if (mop_len) {
2591
2631
  MDB_val key, data;
@@ -2655,8 +2695,7 @@ mdb_page_flush(MDB_txn *txn, int keep)
2655
2695
  }
2656
2696
  dp->mp_flags &= ~P_DIRTY;
2657
2697
  }
2658
- dl[0].mid = j;
2659
- return MDB_SUCCESS;
2698
+ goto done;
2660
2699
  }
2661
2700
 
2662
2701
  /* Write the pages */
@@ -2750,8 +2789,11 @@ mdb_page_flush(MDB_txn *txn, int keep)
2750
2789
  }
2751
2790
  mdb_dpage_free(env, dp);
2752
2791
  }
2753
- dl[0].mid = j;
2754
2792
 
2793
+ done:
2794
+ i--;
2795
+ txn->mt_dirty_room += i - j;
2796
+ dl[0].mid = j;
2755
2797
  return MDB_SUCCESS;
2756
2798
  }
2757
2799
 
@@ -2791,14 +2833,18 @@ mdb_txn_commit(MDB_txn *txn)
2791
2833
 
2792
2834
  if (txn->mt_parent) {
2793
2835
  MDB_txn *parent = txn->mt_parent;
2794
- unsigned x, y, len;
2795
2836
  MDB_ID2L dst, src;
2837
+ MDB_IDL pspill;
2838
+ unsigned x, y, len, ps_len;
2796
2839
 
2797
2840
  /* Append our free list to parent's */
2798
2841
  rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
2799
2842
  if (rc)
2800
2843
  goto fail;
2801
2844
  mdb_midl_free(txn->mt_free_pgs);
2845
+ /* Failures after this must either undo the changes
2846
+ * to the parent or set MDB_TXN_ERROR in the parent.
2847
+ */
2802
2848
 
2803
2849
  parent->mt_next_pgno = txn->mt_next_pgno;
2804
2850
  parent->mt_flags = txn->mt_flags;
@@ -2820,37 +2866,26 @@ mdb_txn_commit(MDB_txn *txn)
2820
2866
  dst = parent->mt_u.dirty_list;
2821
2867
  src = txn->mt_u.dirty_list;
2822
2868
  /* Remove anything in our dirty list from parent's spill list */
2823
- if (parent->mt_spill_pgs) {
2824
- x = parent->mt_spill_pgs[0];
2825
- len = x;
2826
- /* zero out our dirty pages in parent spill list */
2827
- for (i=1; i<=src[0].mid; i++) {
2869
+ if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
2870
+ x = y = ps_len;
2871
+ pspill[0] = (pgno_t)-1;
2872
+ /* Mark our dirty pages as deleted in parent spill list */
2873
+ for (i=0, len=src[0].mid; ++i <= len; ) {
2828
2874
  MDB_ID pn = src[i].mid << 1;
2829
- if (pn < parent->mt_spill_pgs[x])
2830
- continue;
2831
- if (pn > parent->mt_spill_pgs[x]) {
2832
- if (x <= 1)
2833
- break;
2875
+ while (pn > pspill[x])
2834
2876
  x--;
2835
- continue;
2836
- }
2837
- parent->mt_spill_pgs[x] = 0;
2838
- len--;
2839
- }
2840
- /* OK, we had a few hits, squash zeros from the spill list */
2841
- if (len < parent->mt_spill_pgs[0]) {
2842
- x=1;
2843
- for (y=1; y<=parent->mt_spill_pgs[0]; y++) {
2844
- if (parent->mt_spill_pgs[y]) {
2845
- if (y != x) {
2846
- parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y];
2847
- }
2848
- x++;
2849
- }
2877
+ if (pn == pspill[x]) {
2878
+ pspill[x] = 1;
2879
+ y = --x;
2850
2880
  }
2851
- parent->mt_spill_pgs[0] = len;
2852
2881
  }
2882
+ /* Squash deleted pagenums if we deleted any */
2883
+ for (x=y; ++x <= ps_len; )
2884
+ if (!(pspill[x] & 1))
2885
+ pspill[++y] = pspill[x];
2886
+ pspill[0] = y;
2853
2887
  }
2888
+
2854
2889
  /* Find len = length of merging our dirty list with parent's */
2855
2890
  x = dst[0].mid;
2856
2891
  dst[0].mid = 0; /* simplify loops */
@@ -2884,7 +2919,10 @@ mdb_txn_commit(MDB_txn *txn)
2884
2919
  parent->mt_dirty_room = txn->mt_dirty_room;
2885
2920
  if (txn->mt_spill_pgs) {
2886
2921
  if (parent->mt_spill_pgs) {
2887
- mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
2922
+ /* TODO: Prevent failure here, so parent does not fail */
2923
+ rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
2924
+ if (rc)
2925
+ parent->mt_flags |= MDB_TXN_ERROR;
2888
2926
  mdb_midl_free(txn->mt_spill_pgs);
2889
2927
  mdb_midl_sort(parent->mt_spill_pgs);
2890
2928
  } else {
@@ -2895,7 +2933,7 @@ mdb_txn_commit(MDB_txn *txn)
2895
2933
  parent->mt_child = NULL;
2896
2934
  mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
2897
2935
  free(txn);
2898
- return MDB_SUCCESS;
2936
+ return rc;
2899
2937
  }
2900
2938
 
2901
2939
  if (txn != env->me_txn) {
@@ -2954,7 +2992,8 @@ done:
2954
2992
  env->me_txn = NULL;
2955
2993
  mdb_dbis_update(txn, 1);
2956
2994
 
2957
- UNLOCK_MUTEX_W(env);
2995
+ if (env->me_txns)
2996
+ UNLOCK_MUTEX_W(env);
2958
2997
  free(txn);
2959
2998
 
2960
2999
  return MDB_SUCCESS;
@@ -2973,10 +3012,11 @@ fail:
2973
3012
  static int
2974
3013
  mdb_env_read_header(MDB_env *env, MDB_meta *meta)
2975
3014
  {
2976
- MDB_pagebuf pbuf;
3015
+ MDB_metabuf pbuf;
2977
3016
  MDB_page *p;
2978
3017
  MDB_meta *m;
2979
3018
  int i, rc, off;
3019
+ enum { Size = sizeof(pbuf) };
2980
3020
 
2981
3021
  /* We don't know the page size yet, so use a minimum value.
2982
3022
  * Read both meta pages so we can use the latest one.
@@ -2988,13 +3028,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
2988
3028
  OVERLAPPED ov;
2989
3029
  memset(&ov, 0, sizeof(ov));
2990
3030
  ov.Offset = off;
2991
- rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1;
3031
+ rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
2992
3032
  if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
2993
3033
  rc = 0;
2994
3034
  #else
2995
- rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off);
3035
+ rc = pread(env->me_fd, &pbuf, Size, off);
2996
3036
  #endif
2997
- if (rc != MDB_PAGESIZE) {
3037
+ if (rc != Size) {
2998
3038
  if (rc == 0 && off == 0)
2999
3039
  return ENOENT;
3000
3040
  rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
@@ -3109,7 +3149,7 @@ mdb_env_write_meta(MDB_txn *txn)
3109
3149
  assert(txn != NULL);
3110
3150
  assert(txn->mt_env != NULL);
3111
3151
 
3112
- toggle = !txn->mt_toggle;
3152
+ toggle = txn->mt_txnid & 1;
3113
3153
  DPRINTF(("writing meta page %d for root page %"Z"u",
3114
3154
  toggle, txn->mt_dbs[MAIN_DBI].md_root));
3115
3155
 
@@ -3125,11 +3165,18 @@ mdb_env_write_meta(MDB_txn *txn)
3125
3165
  mp->mm_last_pg = txn->mt_next_pgno - 1;
3126
3166
  mp->mm_txnid = txn->mt_txnid;
3127
3167
  if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3168
+ unsigned meta_size = env->me_psize;
3128
3169
  rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
3129
3170
  ptr = env->me_map;
3130
- if (toggle)
3131
- ptr += env->me_psize;
3132
- if (MDB_MSYNC(ptr, env->me_psize, rc)) {
3171
+ if (toggle) {
3172
+ #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
3173
+ if (meta_size < env->me_os_psize)
3174
+ meta_size += meta_size;
3175
+ else
3176
+ #endif
3177
+ ptr += meta_size;
3178
+ }
3179
+ if (MDB_MSYNC(ptr, meta_size, rc)) {
3133
3180
  rc = ErrCode();
3134
3181
  goto fail;
3135
3182
  }
@@ -3200,7 +3247,8 @@ done:
3200
3247
  * readers will get consistent data regardless of how fresh or
3201
3248
  * how stale their view of these values is.
3202
3249
  */
3203
- env->me_txns->mti_txnid = txn->mt_txnid;
3250
+ if (env->me_txns)
3251
+ env->me_txns->mti_txnid = txn->mt_txnid;
3204
3252
 
3205
3253
  return MDB_SUCCESS;
3206
3254
  }
@@ -3234,6 +3282,7 @@ mdb_env_create(MDB_env **env)
3234
3282
  e->me_wmutex = SEM_FAILED;
3235
3283
  #endif
3236
3284
  e->me_pid = getpid();
3285
+ GET_PAGESIZE(e->me_os_psize);
3237
3286
  VGMEMP_CREATE(e,0,0);
3238
3287
  *env = e;
3239
3288
  return MDB_SUCCESS;
@@ -3276,7 +3325,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3276
3325
  int prot = PROT_READ;
3277
3326
  if (flags & MDB_WRITEMAP) {
3278
3327
  prot |= PROT_WRITE;
3279
- if (newsize && ftruncate(env->me_fd, env->me_mapsize) < 0)
3328
+ if (ftruncate(env->me_fd, env->me_mapsize) < 0)
3280
3329
  return ErrCode();
3281
3330
  }
3282
3331
  env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
@@ -3285,14 +3334,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3285
3334
  env->me_map = NULL;
3286
3335
  return ErrCode();
3287
3336
  }
3288
- /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3337
+
3338
+ if (flags & MDB_NORDAHEAD) {
3339
+ /* Turn off readahead. It's harmful when the DB is larger than RAM. */
3289
3340
  #ifdef MADV_RANDOM
3290
- madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3341
+ madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3291
3342
  #else
3292
3343
  #ifdef POSIX_MADV_RANDOM
3293
- posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3344
+ posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3294
3345
  #endif /* POSIX_MADV_RANDOM */
3295
3346
  #endif /* MADV_RANDOM */
3347
+ }
3296
3348
  #endif /* _WIN32 */
3297
3349
 
3298
3350
  /* Can happen because the address argument to mmap() is just a
@@ -3323,6 +3375,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3323
3375
  return EINVAL;
3324
3376
  if (!size)
3325
3377
  size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
3378
+ else if (size < env->me_mapsize) {
3379
+ /* If the configured size is smaller, make sure it's
3380
+ * still big enough. Silently round up to minimum if not.
3381
+ */
3382
+ size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
3383
+ if (size < minsize)
3384
+ size = minsize;
3385
+ }
3326
3386
  munmap(env->me_map, env->me_mapsize);
3327
3387
  env->me_mapsize = size;
3328
3388
  old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
@@ -3388,7 +3448,9 @@ mdb_env_open2(MDB_env *env)
3388
3448
  return i;
3389
3449
  DPUTS("new mdbenv");
3390
3450
  newenv = 1;
3391
- GET_PAGESIZE(env->me_psize);
3451
+ env->me_psize = env->me_os_psize;
3452
+ if (env->me_psize > MAX_PAGESIZE)
3453
+ env->me_psize = MAX_PAGESIZE;
3392
3454
  } else {
3393
3455
  env->me_psize = meta.mm_psize;
3394
3456
  }
@@ -3499,7 +3561,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_
3499
3561
  #pragma comment(linker, "/INCLUDE:_tls_used")
3500
3562
  #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
3501
3563
  #pragma const_seg(".CRT$XLB")
3502
- extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
3564
+ extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
3503
3565
  const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
3504
3566
  #pragma const_seg()
3505
3567
  #else /* WIN32 */
@@ -3597,7 +3659,7 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
3597
3659
  return rc;
3598
3660
  }
3599
3661
 
3600
- #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
3662
+ #ifdef MDB_USE_HASH
3601
3663
  /*
3602
3664
  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
3603
3665
  *
@@ -3763,7 +3825,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
3763
3825
  rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
3764
3826
  if (size < rsize && *excl > 0) {
3765
3827
  #ifdef _WIN32
3766
- if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
3828
+ if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
3767
3829
  || !SetEndOfFile(env->me_lfd))
3768
3830
  goto fail_errno;
3769
3831
  #else
@@ -3919,8 +3981,9 @@ fail:
3919
3981
  * at runtime. Changing other flags requires closing the
3920
3982
  * environment and re-opening it with the new flags.
3921
3983
  */
3922
- #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
3923
- #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS)
3984
+ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
3985
+ #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
3986
+ MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
3924
3987
 
3925
3988
  int
3926
3989
  mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
@@ -3973,7 +4036,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
3973
4036
  }
3974
4037
 
3975
4038
  /* For RDONLY, get lockfile after we know datafile exists */
3976
- if (!F_ISSET(flags, MDB_RDONLY)) {
4039
+ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
3977
4040
  rc = mdb_env_setup_locks(env, lpath, mode, &excl);
3978
4041
  if (rc)
3979
4042
  goto leave;
@@ -4003,7 +4066,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4003
4066
  goto leave;
4004
4067
  }
4005
4068
 
4006
- if (F_ISSET(flags, MDB_RDONLY)) {
4069
+ if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
4007
4070
  rc = mdb_env_setup_locks(env, lpath, mode, &excl);
4008
4071
  if (rc)
4009
4072
  goto leave;
@@ -4033,7 +4096,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4033
4096
  DPRINTF(("opened dbenv %p", (void *) env));
4034
4097
  if (excl > 0) {
4035
4098
  rc = mdb_env_share_locks(env, &excl);
4099
+ if (rc)
4100
+ goto leave;
4036
4101
  }
4102
+ if (!((flags & MDB_RDONLY) ||
4103
+ (env->me_pbuf = calloc(1, env->me_psize))))
4104
+ rc = ENOMEM;
4037
4105
  }
4038
4106
 
4039
4107
  leave:
@@ -4057,6 +4125,7 @@ mdb_env_close0(MDB_env *env, int excl)
4057
4125
  for (i = env->me_maxdbs; --i > MAIN_DBI; )
4058
4126
  free(env->me_dbxs[i].md_name.mv_data);
4059
4127
 
4128
+ free(env->me_pbuf);
4060
4129
  free(env->me_dbflags);
4061
4130
  free(env->me_dbxs);
4062
4131
  free(env->me_path);
@@ -4084,7 +4153,7 @@ mdb_env_close0(MDB_env *env, int excl)
4084
4153
  if (env->me_fd != INVALID_HANDLE_VALUE)
4085
4154
  (void) close(env->me_fd);
4086
4155
  if (env->me_txns) {
4087
- pid_t pid = env->me_pid;
4156
+ MDB_PID_T pid = env->me_pid;
4088
4157
  /* Clearing readers is done in this function because
4089
4158
  * me_txkey with its destructor must be disabled first.
4090
4159
  */
@@ -4246,14 +4315,6 @@ mdb_env_copy(MDB_env *env, const char *path)
4246
4315
  newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
4247
4316
  FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
4248
4317
  #else
4249
- #ifdef O_DIRECT
4250
- /* The OS supports O_DIRECT, try with it */
4251
- newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666);
4252
- /* But open can fail if O_DIRECT isn't supported by the file system
4253
- * so retry without the flag
4254
- */
4255
- if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL)
4256
- #endif
4257
4318
  newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
4258
4319
  #endif
4259
4320
  if (newfd == INVALID_HANDLE_VALUE) {
@@ -4261,6 +4322,11 @@ mdb_env_copy(MDB_env *env, const char *path)
4261
4322
  goto leave;
4262
4323
  }
4263
4324
 
4325
+ #ifdef O_DIRECT
4326
+ /* Set O_DIRECT if the file system supports it */
4327
+ if ((rc = fcntl(newfd, F_GETFL)) != -1)
4328
+ (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
4329
+ #endif
4264
4330
  #ifdef F_NOCACHE /* __APPLE__ */
4265
4331
  rc = fcntl(newfd, F_NOCACHE, 1);
4266
4332
  if (rc) {
@@ -4308,7 +4374,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4308
4374
  *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4309
4375
  }
4310
4376
 
4311
- /** Compare two items pointing at aligned int's */
4377
+ /** Compare two items pointing at aligned unsigned int's */
4312
4378
  static int
4313
4379
  mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4314
4380
  {
@@ -4316,7 +4382,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4316
4382
  *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4317
4383
  }
4318
4384
 
4319
- /** Compare two items pointing at ints of unknown alignment.
4385
+ /** Compare two items pointing at unsigned ints of unknown alignment.
4320
4386
  * Nodes and keys are guaranteed to be 2-byte aligned.
4321
4387
  */
4322
4388
  static int
@@ -4514,8 +4580,8 @@ mdb_cursor_pop(MDB_cursor *mc)
4514
4580
  if (mc->mc_snum)
4515
4581
  mc->mc_top--;
4516
4582
 
4517
- DPRINTF(("popped page %"Z"u off db %u cursor %p", top->mp_pgno,
4518
- mc->mc_dbi, (void *) mc));
4583
+ DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
4584
+ DDBI(mc), (void *) mc));
4519
4585
  }
4520
4586
  }
4521
4587
 
@@ -4523,8 +4589,8 @@ mdb_cursor_pop(MDB_cursor *mc)
4523
4589
  static int
4524
4590
  mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
4525
4591
  {
4526
- DPRINTF(("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno,
4527
- mc->mc_dbi, (void *) mc));
4592
+ DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
4593
+ DDBI(mc), (void *) mc));
4528
4594
 
4529
4595
  if (mc->mc_snum >= CURSOR_STACK) {
4530
4596
  assert(mc->mc_snum < CURSOR_STACK);
@@ -4598,18 +4664,11 @@ done:
4598
4664
  return MDB_SUCCESS;
4599
4665
  }
4600
4666
 
4601
- /** Search for the page a given key should be in.
4602
- * Pushes parent pages on the cursor stack. This function continues a
4603
- * search on a cursor that has already been initialized. (Usually by
4604
- * #mdb_page_search() but also by #mdb_node_move().)
4605
- * @param[in,out] mc the cursor for this operation.
4606
- * @param[in] key the key to search for. If NULL, search for the lowest
4607
- * page. (This is used by #mdb_cursor_first().)
4608
- * @param[in] modify If true, visited pages are updated with new page numbers.
4609
- * @return 0 on success, non-zero on failure.
4667
+ /** Finish #mdb_page_search() / #mdb_page_search_lowest().
4668
+ * The cursor is at the root page, set up the rest of it.
4610
4669
  */
4611
4670
  static int
4612
- mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4671
+ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
4613
4672
  {
4614
4673
  MDB_page *mp = mc->mc_pg[mc->mc_top];
4615
4674
  int rc;
@@ -4623,11 +4682,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4623
4682
  assert(NUMKEYS(mp) > 1);
4624
4683
  DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
4625
4684
 
4626
- if (key == NULL) /* Initialize cursor to first page. */
4685
+ if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
4627
4686
  i = 0;
4628
- else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) {
4629
- /* cursor to last page */
4630
- i = NUMKEYS(mp)-1;
4687
+ if (flags & MDB_PS_LAST)
4688
+ i = NUMKEYS(mp) - 1;
4631
4689
  } else {
4632
4690
  int exact;
4633
4691
  node = mdb_node_search(mc, key, &exact);
@@ -4640,10 +4698,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4640
4698
  i--;
4641
4699
  }
4642
4700
  }
4701
+ DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
4643
4702
  }
4644
4703
 
4645
- if (key)
4646
- DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
4647
4704
  assert(i < NUMKEYS(mp));
4648
4705
  node = NODEPTR(mp, i);
4649
4706
 
@@ -4654,7 +4711,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4654
4711
  if ((rc = mdb_cursor_push(mc, mp)))
4655
4712
  return rc;
4656
4713
 
4657
- if (modify) {
4714
+ if (flags & MDB_PS_MODIFY) {
4658
4715
  if ((rc = mdb_page_touch(mc)) != 0)
4659
4716
  return rc;
4660
4717
  mp = mc->mc_pg[mc->mc_top];
@@ -4668,7 +4725,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
4668
4725
  }
4669
4726
 
4670
4727
  DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
4671
- key ? DKEY(key) : NULL));
4728
+ key ? DKEY(key) : "null"));
4672
4729
  mc->mc_flags |= C_INITIALIZED;
4673
4730
  mc->mc_flags &= ~C_EOF;
4674
4731
 
@@ -4694,18 +4751,17 @@ mdb_page_search_lowest(MDB_cursor *mc)
4694
4751
  mc->mc_ki[mc->mc_top] = 0;
4695
4752
  if ((rc = mdb_cursor_push(mc, mp)))
4696
4753
  return rc;
4697
- return mdb_page_search_root(mc, NULL, 0);
4754
+ return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
4698
4755
  }
4699
4756
 
4700
4757
  /** Search for the page a given key should be in.
4701
- * Pushes parent pages on the cursor stack. This function just sets up
4702
- * the search; it finds the root page for \b mc's database and sets this
4703
- * as the root of the cursor's stack. Then #mdb_page_search_root() is
4704
- * called to complete the search.
4758
+ * Push it and its parent pages on the cursor stack.
4705
4759
  * @param[in,out] mc the cursor for this operation.
4706
- * @param[in] key the key to search for. If NULL, search for the lowest
4707
- * page. (This is used by #mdb_cursor_first().)
4708
- * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers.
4760
+ * @param[in] key the key to search for, or NULL for first/last page.
4761
+ * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
4762
+ * are touched (updated with new page numbers).
4763
+ * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
4764
+ * This is used by #mdb_cursor_first() and #mdb_cursor_last().
4709
4765
  * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
4710
4766
  * @return 0 on success, non-zero on failure.
4711
4767
  */
@@ -4716,23 +4772,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4716
4772
  pgno_t root;
4717
4773
 
4718
4774
  /* Make sure the txn is still viable, then find the root from
4719
- * the txn's db table.
4775
+ * the txn's db table and set it as the root of the cursor's stack.
4720
4776
  */
4721
4777
  if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
4722
4778
  DPUTS("transaction has failed, must abort");
4723
4779
  return MDB_BAD_TXN;
4724
4780
  } else {
4725
4781
  /* Make sure we're using an up-to-date root */
4726
- if (mc->mc_dbi > MAIN_DBI) {
4727
- if ((*mc->mc_dbflag & DB_STALE) ||
4728
- ((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
4782
+ if (*mc->mc_dbflag & DB_STALE) {
4729
4783
  MDB_cursor mc2;
4730
- unsigned char dbflag = 0;
4731
4784
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4732
- rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY);
4785
+ rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
4733
4786
  if (rc)
4734
4787
  return rc;
4735
- if (*mc->mc_dbflag & DB_STALE) {
4788
+ {
4736
4789
  MDB_val data;
4737
4790
  int exact = 0;
4738
4791
  uint16_t flags;
@@ -4752,11 +4805,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4752
4805
  return MDB_INCOMPATIBLE;
4753
4806
  memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
4754
4807
  }
4755
- if (flags & MDB_PS_MODIFY)
4756
- dbflag = DB_DIRTY;
4757
4808
  *mc->mc_dbflag &= ~DB_STALE;
4758
- *mc->mc_dbflag |= dbflag;
4759
- }
4760
4809
  }
4761
4810
  root = mc->mc_db->md_root;
4762
4811
 
@@ -4774,8 +4823,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4774
4823
  mc->mc_snum = 1;
4775
4824
  mc->mc_top = 0;
4776
4825
 
4777
- DPRINTF(("db %u root page %"Z"u has flags 0x%X",
4778
- mc->mc_dbi, root, mc->mc_pg[0]->mp_flags));
4826
+ DPRINTF(("db %d root page %"Z"u has flags 0x%X",
4827
+ DDBI(mc), root, mc->mc_pg[0]->mp_flags));
4779
4828
 
4780
4829
  if (flags & MDB_PS_MODIFY) {
4781
4830
  if ((rc = mdb_page_touch(mc)))
@@ -4914,7 +4963,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi,
4914
4963
  if (txn->mt_flags & MDB_TXN_ERROR)
4915
4964
  return MDB_BAD_TXN;
4916
4965
 
4917
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
4966
+ if (key->mv_size > MDB_MAXKEYSIZE) {
4918
4967
  return MDB_BAD_VALSIZE;
4919
4968
  }
4920
4969
 
@@ -4966,8 +5015,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
4966
5015
  assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
4967
5016
 
4968
5017
  indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
4969
- if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0))
5018
+ if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
5019
+ /* mc will be inconsistent if caller does mc_snum++ as above */
5020
+ mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
4970
5021
  return rc;
5022
+ }
4971
5023
 
4972
5024
  mdb_cursor_push(mc, mp);
4973
5025
  if (!move_right)
@@ -5143,7 +5195,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5143
5195
 
5144
5196
  assert(mc);
5145
5197
  assert(key);
5146
- assert(key->mv_size > 0);
5198
+ if (key->mv_size == 0)
5199
+ return MDB_BAD_VALSIZE;
5147
5200
 
5148
5201
  if (mc->mc_xcursor)
5149
5202
  mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
@@ -5329,7 +5382,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5329
5382
  mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5330
5383
 
5331
5384
  if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5332
- rc = mdb_page_search(mc, NULL, 0);
5385
+ rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
5333
5386
  if (rc != MDB_SUCCESS)
5334
5387
  return rc;
5335
5388
  }
@@ -5375,11 +5428,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5375
5428
  if (!(mc->mc_flags & C_EOF)) {
5376
5429
 
5377
5430
  if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5378
- MDB_val lkey;
5379
-
5380
- lkey.mv_size = MDB_MAXKEYSIZE+1;
5381
- lkey.mv_data = NULL;
5382
- rc = mdb_page_search(mc, &lkey, 0);
5431
+ rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
5383
5432
  if (rc != MDB_SUCCESS)
5384
5433
  return rc;
5385
5434
  }
@@ -5431,8 +5480,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5431
5480
  rc = EINVAL;
5432
5481
  } else {
5433
5482
  MDB_page *mp = mc->mc_pg[mc->mc_top];
5434
- if (!NUMKEYS(mp)) {
5435
- mc->mc_ki[mc->mc_top] = 0;
5483
+ int nkeys = NUMKEYS(mp);
5484
+ if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5485
+ mc->mc_ki[mc->mc_top] = nkeys;
5436
5486
  rc = MDB_NOTFOUND;
5437
5487
  break;
5438
5488
  }
@@ -5471,7 +5521,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5471
5521
  case MDB_SET_RANGE:
5472
5522
  if (key == NULL) {
5473
5523
  rc = EINVAL;
5474
- } else if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
5524
+ } else if (key->mv_size > MDB_MAXKEYSIZE) {
5475
5525
  rc = MDB_BAD_VALSIZE;
5476
5526
  } else if (op == MDB_SET_RANGE)
5477
5527
  rc = mdb_cursor_set(mc, key, data, op, NULL);
@@ -5577,14 +5627,14 @@ fetchm:
5577
5627
  return rc;
5578
5628
  }
5579
5629
 
5580
- /** Touch all the pages in the cursor stack.
5630
+ /** Touch all the pages in the cursor stack. Set mc_top.
5581
5631
  * Makes sure all the pages are writable, before attempting a write operation.
5582
5632
  * @param[in] mc The cursor to operate on.
5583
5633
  */
5584
5634
  static int
5585
5635
  mdb_cursor_touch(MDB_cursor *mc)
5586
5636
  {
5587
- int rc;
5637
+ int rc = MDB_SUCCESS;
5588
5638
 
5589
5639
  if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
5590
5640
  MDB_cursor mc2;
@@ -5595,13 +5645,14 @@ mdb_cursor_touch(MDB_cursor *mc)
5595
5645
  return rc;
5596
5646
  *mc->mc_dbflag |= DB_DIRTY;
5597
5647
  }
5598
- for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
5599
- rc = mdb_page_touch(mc);
5600
- if (rc)
5601
- return rc;
5648
+ mc->mc_top = 0;
5649
+ if (mc->mc_snum) {
5650
+ do {
5651
+ rc = mdb_page_touch(mc);
5652
+ } while (!rc && ++(mc->mc_top) < mc->mc_snum);
5653
+ mc->mc_top = mc->mc_snum-1;
5602
5654
  }
5603
- mc->mc_top = mc->mc_snum-1;
5604
- return MDB_SUCCESS;
5655
+ return rc;
5605
5656
  }
5606
5657
 
5607
5658
  /** Do not spill pages to disk if txn is getting full, may fail instead */
@@ -5612,15 +5663,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5612
5663
  unsigned int flags)
5613
5664
  {
5614
5665
  enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
5666
+ MDB_env *env = mc->mc_txn->mt_env;
5615
5667
  MDB_node *leaf = NULL;
5616
5668
  MDB_val xdata, *rdata, dkey;
5617
- MDB_page *fp;
5618
5669
  MDB_db dummy;
5619
5670
  int do_sub = 0, insert = 0;
5620
5671
  unsigned int mcount = 0, dcount = 0, nospill;
5621
5672
  size_t nsize;
5622
5673
  int rc, rc2;
5623
- MDB_pagebuf pbuf;
5624
5674
  char dbuf[MDB_MAXKEYSIZE+1];
5625
5675
  unsigned int nflags;
5626
5676
  DKBUF;
@@ -5652,8 +5702,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5652
5702
  return MDB_BAD_VALSIZE;
5653
5703
  #endif
5654
5704
 
5655
- DPRINTF(("==> put db %u key [%s], size %"Z"u, data size %"Z"u",
5656
- mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size));
5705
+ DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
5706
+ DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
5657
5707
 
5658
5708
  dkey.mv_size = 0;
5659
5709
 
@@ -5664,6 +5714,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5664
5714
  } else if (mc->mc_db->md_root == P_INVALID) {
5665
5715
  /* new database, cursor has nothing to point to */
5666
5716
  mc->mc_snum = 0;
5717
+ mc->mc_top = 0;
5667
5718
  mc->mc_flags &= ~C_INITIALIZED;
5668
5719
  rc = MDB_NO_ROOT;
5669
5720
  } else {
@@ -5733,6 +5784,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5733
5784
 
5734
5785
  /* The key already exists */
5735
5786
  if (rc == MDB_SUCCESS) {
5787
+ MDB_page *fp, *mp;
5788
+ MDB_val olddata;
5789
+
5736
5790
  /* there's only a key anyway, so this is a no-op */
5737
5791
  if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5738
5792
  unsigned int ksize = mc->mc_db->md_pad;
@@ -5745,19 +5799,23 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5745
5799
  return MDB_SUCCESS;
5746
5800
  }
5747
5801
 
5802
+ more:
5748
5803
  leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5804
+ olddata.mv_size = NODEDSZ(leaf);
5805
+ olddata.mv_data = NODEDATA(leaf);
5749
5806
 
5750
5807
  /* DB has dups? */
5751
5808
  if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
5809
+ mp = fp = xdata.mv_data = env->me_pbuf;
5810
+ mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5811
+
5752
5812
  /* Was a single item before, must convert now */
5753
- more:
5754
5813
  if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5755
5814
  /* Just overwrite the current item */
5756
5815
  if (flags == MDB_CURRENT)
5757
5816
  goto current;
5758
5817
 
5759
- dkey.mv_size = NODEDSZ(leaf);
5760
- dkey.mv_data = NODEDATA(leaf);
5818
+ dkey = olddata;
5761
5819
  #if UINT_MAX < SIZE_MAX
5762
5820
  if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
5763
5821
  #ifdef MISALIGNED_OK
@@ -5780,85 +5838,76 @@ more:
5780
5838
  /* create a fake page for the dup items */
5781
5839
  memcpy(dbuf, dkey.mv_data, dkey.mv_size);
5782
5840
  dkey.mv_data = dbuf;
5783
- fp = (MDB_page *)&pbuf;
5784
- fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5785
5841
  fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
5786
5842
  fp->mp_lower = PAGEHDRSZ;
5787
- fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5843
+ xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5788
5844
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5789
5845
  fp->mp_flags |= P_LEAF2;
5790
5846
  fp->mp_pad = data->mv_size;
5791
- fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */
5847
+ xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
5792
5848
  } else {
5793
- fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
5849
+ xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
5794
5850
  (dkey.mv_size & 1) + (data->mv_size & 1);
5795
5851
  }
5796
- mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5797
- do_sub = 1;
5798
- rdata = &xdata;
5799
- xdata.mv_size = fp->mp_upper;
5800
- xdata.mv_data = fp;
5801
- flags |= F_DUPDATA;
5802
- goto new_sub;
5803
- }
5804
- if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
5852
+ fp->mp_upper = xdata.mv_size;
5853
+ } else if (leaf->mn_flags & F_SUBDATA) {
5854
+ /* Data is on sub-DB, just store it */
5855
+ flags |= F_DUPDATA|F_SUBDATA;
5856
+ goto put_sub;
5857
+ } else {
5805
5858
  /* See if we need to convert from fake page to subDB */
5806
- MDB_page *mp;
5807
5859
  unsigned int offset;
5808
5860
  unsigned int i;
5809
5861
  uint16_t fp_flags;
5810
5862
 
5811
- fp = NODEDATA(leaf);
5812
- if (flags == MDB_CURRENT) {
5813
- reuse:
5863
+ fp = olddata.mv_data;
5864
+ switch (flags) {
5865
+ default:
5866
+ if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5867
+ offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5868
+ offset += offset & 1;
5869
+ break;
5870
+ }
5871
+ offset = fp->mp_pad;
5872
+ if (SIZELEFT(fp) < offset) {
5873
+ offset *= 4; /* space for 4 more */
5874
+ break;
5875
+ }
5876
+ /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
5877
+ case MDB_CURRENT:
5814
5878
  fp->mp_flags |= P_DIRTY;
5815
- COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
5879
+ COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
5816
5880
  mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
5817
5881
  flags |= F_DUPDATA;
5818
5882
  goto put_sub;
5819
5883
  }
5820
- if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5821
- offset = fp->mp_pad;
5822
- if (SIZELEFT(fp) >= offset)
5823
- goto reuse;
5824
- offset *= 4; /* space for 4 more */
5825
- } else {
5826
- offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5827
- }
5828
- offset += offset & 1;
5829
5884
  fp_flags = fp->mp_flags;
5830
- if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
5831
- offset >= mc->mc_txn->mt_env->me_nodemax) {
5885
+ xdata.mv_size = olddata.mv_size + offset;
5886
+ if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
5887
+ >= env->me_nodemax) {
5832
5888
  /* yes, convert it */
5833
- dummy.md_flags = 0;
5834
5889
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5835
5890
  dummy.md_pad = fp->mp_pad;
5836
5891
  dummy.md_flags = MDB_DUPFIXED;
5837
5892
  if (mc->mc_db->md_flags & MDB_INTEGERDUP)
5838
5893
  dummy.md_flags |= MDB_INTEGERKEY;
5894
+ } else {
5895
+ dummy.md_pad = 0;
5896
+ dummy.md_flags = 0;
5839
5897
  }
5840
5898
  dummy.md_depth = 1;
5841
5899
  dummy.md_branch_pages = 0;
5842
5900
  dummy.md_leaf_pages = 1;
5843
5901
  dummy.md_overflow_pages = 0;
5844
5902
  dummy.md_entries = NUMKEYS(fp);
5845
- rdata = &xdata;
5846
5903
  xdata.mv_size = sizeof(MDB_db);
5847
5904
  xdata.mv_data = &dummy;
5848
5905
  if ((rc = mdb_page_alloc(mc, 1, &mp)))
5849
5906
  return rc;
5850
- offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
5907
+ offset = env->me_psize - olddata.mv_size;
5851
5908
  flags |= F_DUPDATA|F_SUBDATA;
5852
5909
  dummy.md_root = mp->mp_pgno;
5853
5910
  fp_flags &= ~P_SUBP;
5854
- } else {
5855
- /* no, just grow it */
5856
- rdata = &xdata;
5857
- xdata.mv_size = NODEDSZ(leaf) + offset;
5858
- xdata.mv_data = &pbuf;
5859
- mp = (MDB_page *)&pbuf;
5860
- mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5861
- flags |= F_DUPDATA;
5862
5911
  }
5863
5912
  mp->mp_flags = fp_flags | P_DIRTY;
5864
5913
  mp->mp_pad = fp->mp_pad;
@@ -5867,28 +5916,27 @@ reuse:
5867
5916
  if (IS_LEAF2(fp)) {
5868
5917
  memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
5869
5918
  } else {
5870
- nsize = NODEDSZ(leaf) - fp->mp_upper;
5871
- memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
5919
+ memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
5920
+ olddata.mv_size - fp->mp_upper);
5872
5921
  for (i=0; i<NUMKEYS(fp); i++)
5873
5922
  mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
5874
5923
  }
5875
- mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5876
- do_sub = 1;
5877
- goto new_sub;
5878
5924
  }
5879
- /* data is on sub-DB, just store it */
5880
- flags |= F_DUPDATA|F_SUBDATA;
5881
- goto put_sub;
5925
+
5926
+ rdata = &xdata;
5927
+ flags |= F_DUPDATA;
5928
+ do_sub = 1;
5929
+ mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5930
+ goto new_sub;
5882
5931
  }
5883
5932
  current:
5884
5933
  /* overflow page overwrites need special handling */
5885
5934
  if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5886
5935
  MDB_page *omp;
5887
5936
  pgno_t pg;
5888
- unsigned psize = mc->mc_txn->mt_env->me_psize;
5889
- int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
5937
+ int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
5890
5938
 
5891
- memcpy(&pg, NODEDATA(leaf), sizeof(pg));
5939
+ memcpy(&pg, olddata.mv_data, sizeof(pg));
5892
5940
  if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
5893
5941
  return rc2;
5894
5942
  ovpages = omp->mp_pages;
@@ -5896,7 +5944,7 @@ current:
5896
5944
  /* Is the ov page large enough? */
5897
5945
  if (ovpages >= dpages) {
5898
5946
  if (!(omp->mp_flags & P_DIRTY) &&
5899
- (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP)))
5947
+ (level || (env->me_flags & MDB_WRITEMAP)))
5900
5948
  {
5901
5949
  rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
5902
5950
  if (rc)
@@ -5911,7 +5959,7 @@ current:
5911
5959
  */
5912
5960
  if (level > 1) {
5913
5961
  /* It is writable only in a parent txn */
5914
- size_t sz = (size_t) psize * ovpages, off;
5962
+ size_t sz = (size_t) env->me_psize * ovpages, off;
5915
5963
  MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
5916
5964
  MDB_ID2 id2;
5917
5965
  if (!np)
@@ -5941,15 +5989,15 @@ current:
5941
5989
  }
5942
5990
  if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
5943
5991
  return rc2;
5944
- } else if (NODEDSZ(leaf) == data->mv_size) {
5992
+ } else if (data->mv_size == olddata.mv_size) {
5945
5993
  /* same size, just replace it. Note that we could
5946
5994
  * also reuse this node if the new data is smaller,
5947
5995
  * but instead we opt to shrink the node in that case.
5948
5996
  */
5949
5997
  if (F_ISSET(flags, MDB_RESERVE))
5950
- data->mv_data = NODEDATA(leaf);
5998
+ data->mv_data = olddata.mv_data;
5951
5999
  else if (data->mv_size)
5952
- memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
6000
+ memcpy(olddata.mv_data, data->mv_data, data->mv_size);
5953
6001
  else
5954
6002
  memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
5955
6003
  goto done;
@@ -5965,7 +6013,7 @@ current:
5965
6013
 
5966
6014
  new_sub:
5967
6015
  nflags = flags & NODE_ADD_FLAGS;
5968
- nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
6016
+ nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
5969
6017
  if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
5970
6018
  if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
5971
6019
  nflags &= ~MDB_APPEND;
@@ -5982,9 +6030,6 @@ new_sub:
5982
6030
  unsigned i = mc->mc_top;
5983
6031
  MDB_page *mp = mc->mc_pg[i];
5984
6032
 
5985
- if (mc->mc_flags & C_SUB)
5986
- dbi--;
5987
-
5988
6033
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
5989
6034
  if (mc->mc_flags & C_SUB)
5990
6035
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -6062,7 +6107,6 @@ next_mult:
6062
6107
  data[1].mv_size = mcount;
6063
6108
  if (mcount < dcount) {
6064
6109
  data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6065
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6066
6110
  goto more;
6067
6111
  }
6068
6112
  }
@@ -6081,6 +6125,7 @@ int
6081
6125
  mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6082
6126
  {
6083
6127
  MDB_node *leaf;
6128
+ MDB_page *mp;
6084
6129
  int rc;
6085
6130
 
6086
6131
  if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
@@ -6089,17 +6134,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6089
6134
  if (!(mc->mc_flags & C_INITIALIZED))
6090
6135
  return EINVAL;
6091
6136
 
6137
+ if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6138
+ return MDB_NOTFOUND;
6139
+
6092
6140
  if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
6093
6141
  return rc;
6094
- flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */
6095
6142
 
6096
6143
  rc = mdb_cursor_touch(mc);
6097
6144
  if (rc)
6098
6145
  return rc;
6099
6146
 
6100
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6147
+ mp = mc->mc_pg[mc->mc_top];
6148
+ leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6101
6149
 
6102
- if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6150
+ if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6103
6151
  if (!(flags & MDB_NODUPDATA)) {
6104
6152
  if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
6105
6153
  mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
@@ -6114,13 +6162,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6114
6162
  } else {
6115
6163
  MDB_cursor *m2;
6116
6164
  /* shrink fake page */
6117
- mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6118
- leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6165
+ mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6166
+ leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6119
6167
  mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6120
6168
  /* fix other sub-DB cursors pointed at this fake page */
6121
6169
  for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6122
6170
  if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6123
- if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] &&
6171
+ if (m2->mc_pg[mc->mc_top] == mp &&
6124
6172
  m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
6125
6173
  m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6126
6174
  }
@@ -6252,6 +6300,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6252
6300
  {
6253
6301
  unsigned int i;
6254
6302
  size_t node_size = NODESIZE;
6303
+ ssize_t room;
6255
6304
  indx_t ofs;
6256
6305
  MDB_node *node;
6257
6306
  MDB_page *mp = mc->mc_pg[mc->mc_top];
@@ -6264,7 +6313,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6264
6313
  IS_LEAF(mp) ? "leaf" : "branch",
6265
6314
  IS_SUBP(mp) ? "sub-" : "",
6266
6315
  mp->mp_pgno, indx, data ? data->mv_size : 0,
6267
- key ? key->mv_size : 0, key ? DKEY(key) : NULL));
6316
+ key ? key->mv_size : 0, key ? DKEY(key) : "null"));
6268
6317
 
6269
6318
  if (IS_LEAF2(mp)) {
6270
6319
  /* Move higher keys up one slot. */
@@ -6282,9 +6331,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6282
6331
  return MDB_SUCCESS;
6283
6332
  }
6284
6333
 
6334
+ room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
6285
6335
  if (key != NULL)
6286
6336
  node_size += key->mv_size;
6287
-
6288
6337
  if (IS_LEAF(mp)) {
6289
6338
  assert(data);
6290
6339
  if (F_ISSET(flags, F_BIGDATA)) {
@@ -6296,26 +6345,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6296
6345
  /* Put data on overflow page. */
6297
6346
  DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
6298
6347
  data->mv_size, node_size+data->mv_size));
6299
- node_size += sizeof(pgno_t);
6348
+ node_size += sizeof(pgno_t) + (node_size & 1);
6349
+ if ((ssize_t)node_size > room)
6350
+ goto full;
6300
6351
  if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
6301
6352
  return rc;
6302
6353
  DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
6303
6354
  flags |= F_BIGDATA;
6355
+ goto update;
6304
6356
  } else {
6305
6357
  node_size += data->mv_size;
6306
6358
  }
6307
6359
  }
6308
6360
  node_size += node_size & 1;
6361
+ if ((ssize_t)node_size > room)
6362
+ goto full;
6309
6363
 
6310
- if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
6311
- DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6312
- mp->mp_pgno, NUMKEYS(mp)));
6313
- DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
6314
- mp->mp_upper - mp->mp_lower));
6315
- DPRINTF(("node size = %"Z"u", node_size));
6316
- return MDB_PAGE_FULL;
6317
- }
6318
-
6364
+ update:
6319
6365
  /* Move higher pointers up one slot. */
6320
6366
  for (i = NUMKEYS(mp); i > indx; i--)
6321
6367
  mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
@@ -6361,6 +6407,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
6361
6407
  }
6362
6408
 
6363
6409
  return MDB_SUCCESS;
6410
+
6411
+ full:
6412
+ DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6413
+ mp->mp_pgno, NUMKEYS(mp)));
6414
+ DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
6415
+ DPRINTF(("node size = %"Z"u", node_size));
6416
+ return MDB_PAGE_FULL;
6364
6417
  }
6365
6418
 
6366
6419
  /** Delete the specified node from a page.
@@ -6495,11 +6548,13 @@ mdb_xcursor_init0(MDB_cursor *mc)
6495
6548
  mx->mx_cursor.mc_txn = mc->mc_txn;
6496
6549
  mx->mx_cursor.mc_db = &mx->mx_db;
6497
6550
  mx->mx_cursor.mc_dbx = &mx->mx_dbx;
6498
- mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
6551
+ mx->mx_cursor.mc_dbi = mc->mc_dbi;
6499
6552
  mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
6500
6553
  mx->mx_cursor.mc_snum = 0;
6501
6554
  mx->mx_cursor.mc_top = 0;
6502
6555
  mx->mx_cursor.mc_flags = C_SUB;
6556
+ mx->mx_dbx.md_name.mv_size = 0;
6557
+ mx->mx_dbx.md_name.mv_data = NULL;
6503
6558
  mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
6504
6559
  mx->mx_dbx.md_dcmp = NULL;
6505
6560
  mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
@@ -6520,6 +6575,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6520
6575
  memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
6521
6576
  mx->mx_cursor.mc_pg[0] = 0;
6522
6577
  mx->mx_cursor.mc_snum = 0;
6578
+ mx->mx_cursor.mc_top = 0;
6523
6579
  mx->mx_cursor.mc_flags = C_SUB;
6524
6580
  } else {
6525
6581
  MDB_page *fp = NODEDATA(node);
@@ -6532,8 +6588,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6532
6588
  mx->mx_db.md_entries = NUMKEYS(fp);
6533
6589
  COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
6534
6590
  mx->mx_cursor.mc_snum = 1;
6535
- mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
6536
6591
  mx->mx_cursor.mc_top = 0;
6592
+ mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
6537
6593
  mx->mx_cursor.mc_pg[0] = fp;
6538
6594
  mx->mx_cursor.mc_ki[0] = 0;
6539
6595
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
@@ -6543,12 +6599,9 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6543
6599
  mx->mx_db.md_flags |= MDB_INTEGERKEY;
6544
6600
  }
6545
6601
  }
6546
- DPRINTF(("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi,
6602
+ DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
6547
6603
  mx->mx_db.md_root));
6548
- mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
6549
- DB_DIRTY : 0);
6550
- mx->mx_dbx.md_name.mv_data = NODEKEY(node);
6551
- mx->mx_dbx.md_name.mv_size = node->mn_ksize;
6604
+ mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
6552
6605
  #if UINT_MAX < SIZE_MAX
6553
6606
  if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
6554
6607
  #ifdef MISALIGNED_OK
@@ -6793,7 +6846,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
6793
6846
  flags = 0;
6794
6847
  } else {
6795
6848
  srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
6796
- assert(!((long)srcnode&1));
6849
+ assert(!((size_t)srcnode&1));
6797
6850
  srcpg = NODEPGNO(srcnode);
6798
6851
  flags = srcnode->mn_flags;
6799
6852
  if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
@@ -6864,9 +6917,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
6864
6917
  MDB_dbi dbi = csrc->mc_dbi;
6865
6918
  MDB_page *mp = csrc->mc_pg[csrc->mc_top];
6866
6919
 
6867
- if (csrc->mc_flags & C_SUB)
6868
- dbi--;
6869
-
6870
6920
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6871
6921
  if (csrc->mc_flags & C_SUB)
6872
6922
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7041,9 +7091,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7041
7091
  MDB_dbi dbi = csrc->mc_dbi;
7042
7092
  MDB_page *mp = cdst->mc_pg[cdst->mc_top];
7043
7093
 
7044
- if (csrc->mc_flags & C_SUB)
7045
- dbi--;
7046
-
7047
7094
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7048
7095
  if (csrc->mc_flags & C_SUB)
7049
7096
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7138,13 +7185,11 @@ mdb_rebalance(MDB_cursor *mc)
7138
7185
  /* Adjust cursors pointing to mp */
7139
7186
  mc->mc_snum = 0;
7140
7187
  mc->mc_top = 0;
7188
+ mc->mc_flags &= ~C_INITIALIZED;
7141
7189
  {
7142
7190
  MDB_cursor *m2, *m3;
7143
7191
  MDB_dbi dbi = mc->mc_dbi;
7144
7192
 
7145
- if (mc->mc_flags & C_SUB)
7146
- dbi--;
7147
-
7148
7193
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7149
7194
  if (mc->mc_flags & C_SUB)
7150
7195
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7154,6 +7199,7 @@ mdb_rebalance(MDB_cursor *mc)
7154
7199
  if (m3->mc_pg[0] == mp) {
7155
7200
  m3->mc_snum = 0;
7156
7201
  m3->mc_top = 0;
7202
+ m3->mc_flags &= ~C_INITIALIZED;
7157
7203
  }
7158
7204
  }
7159
7205
  }
@@ -7174,9 +7220,6 @@ mdb_rebalance(MDB_cursor *mc)
7174
7220
  MDB_cursor *m2, *m3;
7175
7221
  MDB_dbi dbi = mc->mc_dbi;
7176
7222
 
7177
- if (mc->mc_flags & C_SUB)
7178
- dbi--;
7179
-
7180
7223
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7181
7224
  if (mc->mc_flags & C_SUB)
7182
7225
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7184,10 +7227,13 @@ mdb_rebalance(MDB_cursor *mc)
7184
7227
  m3 = m2;
7185
7228
  if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
7186
7229
  if (m3->mc_pg[0] == mp) {
7187
- m3->mc_pg[0] = mc->mc_pg[0];
7188
- m3->mc_snum = 1;
7189
- m3->mc_top = 0;
7190
- m3->mc_ki[0] = m3->mc_ki[1];
7230
+ int i;
7231
+ m3->mc_snum--;
7232
+ m3->mc_top--;
7233
+ for (i=0; i<m3->mc_snum; i++) {
7234
+ m3->mc_pg[i] = m3->mc_pg[i+1];
7235
+ m3->mc_ki[i] = m3->mc_ki[i+1];
7236
+ }
7191
7237
  }
7192
7238
  }
7193
7239
  }
@@ -7300,7 +7346,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
7300
7346
 
7301
7347
  /* Adjust other cursors pointing to mp */
7302
7348
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7303
- if (m2 == mc)
7349
+ if (m2 == mc || m2->mc_snum < mc->mc_snum)
7304
7350
  continue;
7305
7351
  if (!(m2->mc_flags & C_INITIALIZED))
7306
7352
  continue;
@@ -7341,7 +7387,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi,
7341
7387
  if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7342
7388
  return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7343
7389
 
7344
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
7390
+ if (key->mv_size > MDB_MAXKEYSIZE) {
7345
7391
  return MDB_BAD_VALSIZE;
7346
7392
  }
7347
7393
 
@@ -7394,24 +7440,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7394
7440
  unsigned int nflags)
7395
7441
  {
7396
7442
  unsigned int flags;
7397
- int rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0;
7443
+ int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
7398
7444
  indx_t newindx;
7399
7445
  pgno_t pgno = 0;
7400
- unsigned int i, j, split_indx, nkeys, pmax;
7446
+ int i, j, split_indx, nkeys, pmax;
7447
+ MDB_env *env = mc->mc_txn->mt_env;
7401
7448
  MDB_node *node;
7402
7449
  MDB_val sepkey, rkey, xdata, *rdata = &xdata;
7403
- MDB_page *copy;
7450
+ MDB_page *copy = NULL;
7404
7451
  MDB_page *mp, *rp, *pp;
7405
- unsigned int ptop;
7452
+ int ptop;
7406
7453
  MDB_cursor mn;
7407
7454
  DKBUF;
7408
7455
 
7409
7456
  mp = mc->mc_pg[mc->mc_top];
7410
7457
  newindx = mc->mc_ki[mc->mc_top];
7458
+ nkeys = NUMKEYS(mp);
7411
7459
 
7412
- DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i",
7460
+ DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
7413
7461
  IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
7414
- DKEY(newkey), mc->mc_ki[mc->mc_top]));
7462
+ DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
7415
7463
 
7416
7464
  /* Create a right sibling. */
7417
7465
  if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
@@ -7458,141 +7506,139 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7458
7506
  sepkey = *newkey;
7459
7507
  split_indx = newindx;
7460
7508
  nkeys = 0;
7461
- goto newsep;
7462
- }
7509
+ } else {
7463
7510
 
7464
- nkeys = NUMKEYS(mp);
7465
- split_indx = nkeys / 2;
7466
- if (newindx < split_indx)
7467
- newpos = 0;
7468
-
7469
- if (IS_LEAF2(rp)) {
7470
- char *split, *ins;
7471
- int x;
7472
- unsigned int lsize, rsize, ksize;
7473
- /* Move half of the keys to the right sibling */
7474
- copy = NULL;
7475
- x = mc->mc_ki[mc->mc_top] - split_indx;
7476
- ksize = mc->mc_db->md_pad;
7477
- split = LEAF2KEY(mp, split_indx, ksize);
7478
- rsize = (nkeys - split_indx) * ksize;
7479
- lsize = (nkeys - split_indx) * sizeof(indx_t);
7480
- mp->mp_lower -= lsize;
7481
- rp->mp_lower += lsize;
7482
- mp->mp_upper += rsize - lsize;
7483
- rp->mp_upper -= rsize - lsize;
7484
- sepkey.mv_size = ksize;
7485
- if (newindx == split_indx) {
7486
- sepkey.mv_data = newkey->mv_data;
7487
- } else {
7488
- sepkey.mv_data = split;
7489
- }
7490
- if (x<0) {
7491
- ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7492
- memcpy(rp->mp_ptrs, split, rsize);
7493
- sepkey.mv_data = rp->mp_ptrs;
7494
- memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7495
- memcpy(ins, newkey->mv_data, ksize);
7496
- mp->mp_lower += sizeof(indx_t);
7497
- mp->mp_upper -= ksize - sizeof(indx_t);
7511
+ split_indx = (nkeys+1) / 2;
7512
+
7513
+ if (IS_LEAF2(rp)) {
7514
+ char *split, *ins;
7515
+ int x;
7516
+ unsigned int lsize, rsize, ksize;
7517
+ /* Move half of the keys to the right sibling */
7518
+ copy = NULL;
7519
+ x = mc->mc_ki[mc->mc_top] - split_indx;
7520
+ ksize = mc->mc_db->md_pad;
7521
+ split = LEAF2KEY(mp, split_indx, ksize);
7522
+ rsize = (nkeys - split_indx) * ksize;
7523
+ lsize = (nkeys - split_indx) * sizeof(indx_t);
7524
+ mp->mp_lower -= lsize;
7525
+ rp->mp_lower += lsize;
7526
+ mp->mp_upper += rsize - lsize;
7527
+ rp->mp_upper -= rsize - lsize;
7528
+ sepkey.mv_size = ksize;
7529
+ if (newindx == split_indx) {
7530
+ sepkey.mv_data = newkey->mv_data;
7531
+ } else {
7532
+ sepkey.mv_data = split;
7533
+ }
7534
+ if (x<0) {
7535
+ ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7536
+ memcpy(rp->mp_ptrs, split, rsize);
7537
+ sepkey.mv_data = rp->mp_ptrs;
7538
+ memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7539
+ memcpy(ins, newkey->mv_data, ksize);
7540
+ mp->mp_lower += sizeof(indx_t);
7541
+ mp->mp_upper -= ksize - sizeof(indx_t);
7542
+ } else {
7543
+ if (x)
7544
+ memcpy(rp->mp_ptrs, split, x * ksize);
7545
+ ins = LEAF2KEY(rp, x, ksize);
7546
+ memcpy(ins, newkey->mv_data, ksize);
7547
+ memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
7548
+ rp->mp_lower += sizeof(indx_t);
7549
+ rp->mp_upper -= ksize - sizeof(indx_t);
7550
+ mc->mc_ki[mc->mc_top] = x;
7551
+ mc->mc_pg[mc->mc_top] = rp;
7552
+ }
7498
7553
  } else {
7499
- if (x)
7500
- memcpy(rp->mp_ptrs, split, x * ksize);
7501
- ins = LEAF2KEY(rp, x, ksize);
7502
- memcpy(ins, newkey->mv_data, ksize);
7503
- memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
7504
- rp->mp_lower += sizeof(indx_t);
7505
- rp->mp_upper -= ksize - sizeof(indx_t);
7506
- mc->mc_ki[mc->mc_top] = x;
7507
- mc->mc_pg[mc->mc_top] = rp;
7508
- }
7509
- goto newsep;
7510
- }
7554
+ int psize, nsize, k;
7555
+ /* Maximum free space in an empty page */
7556
+ pmax = env->me_psize - PAGEHDRSZ;
7557
+ if (IS_LEAF(mp))
7558
+ nsize = mdb_leaf_size(env, newkey, newdata);
7559
+ else
7560
+ nsize = mdb_branch_size(env, newkey);
7561
+ nsize += nsize & 1;
7511
7562
 
7512
- /* For leaf pages, check the split point based on what
7513
- * fits where, since otherwise mdb_node_add can fail.
7514
- *
7515
- * This check is only needed when the data items are
7516
- * relatively large, such that being off by one will
7517
- * make the difference between success or failure.
7518
- *
7519
- * It's also relevant if a page happens to be laid out
7520
- * such that one half of its nodes are all "small" and
7521
- * the other half of its nodes are "large." If the new
7522
- * item is also "large" and falls on the half with
7523
- * "large" nodes, it also may not fit.
7524
- */
7525
- if (IS_LEAF(mp)) {
7526
- unsigned int psize, nsize;
7527
- /* Maximum free space in an empty page */
7528
- pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
7529
- nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
7530
- if ((nkeys < 20) || (nsize > pmax/16)) {
7531
- if (newindx <= split_indx) {
7532
- psize = nsize;
7533
- newpos = 0;
7534
- for (i=0; i<split_indx; i++) {
7535
- node = NODEPTR(mp, i);
7536
- psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7537
- if (F_ISSET(node->mn_flags, F_BIGDATA))
7538
- psize += sizeof(pgno_t);
7539
- else
7540
- psize += NODEDSZ(node);
7541
- psize += psize & 1;
7542
- if (psize > pmax) {
7543
- if (i <= newindx) {
7544
- split_indx = newindx;
7545
- if (i < newindx)
7546
- newpos = 1;
7563
+ /* grab a page to hold a temporary copy */
7564
+ copy = mdb_page_malloc(mc->mc_txn, 1);
7565
+ if (copy == NULL)
7566
+ return ENOMEM;
7567
+ copy->mp_pgno = mp->mp_pgno;
7568
+ copy->mp_flags = mp->mp_flags;
7569
+ copy->mp_lower = PAGEHDRSZ;
7570
+ copy->mp_upper = env->me_psize;
7571
+
7572
+ /* prepare to insert */
7573
+ for (i=0, j=0; i<nkeys; i++) {
7574
+ if (i == newindx) {
7575
+ copy->mp_ptrs[j++] = 0;
7576
+ }
7577
+ copy->mp_ptrs[j++] = mp->mp_ptrs[i];
7578
+ }
7579
+
7580
+ /* When items are relatively large the split point needs
7581
+ * to be checked, because being off-by-one will make the
7582
+ * difference between success or failure in mdb_node_add.
7583
+ *
7584
+ * It's also relevant if a page happens to be laid out
7585
+ * such that one half of its nodes are all "small" and
7586
+ * the other half of its nodes are "large." If the new
7587
+ * item is also "large" and falls on the half with
7588
+ * "large" nodes, it also may not fit.
7589
+ *
7590
+ * As a final tweak, if the new item goes on the last
7591
+ * spot on the page (and thus, onto the new page), bias
7592
+ * the split so the new page is emptier than the old page.
7593
+ * This yields better packing during sequential inserts.
7594
+ */
7595
+ if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
7596
+ /* Find split point */
7597
+ psize = 0;
7598
+ if (newindx <= split_indx || newindx >= nkeys) {
7599
+ i = 0; j = 1;
7600
+ k = newindx >= nkeys ? nkeys : split_indx+2;
7601
+ } else {
7602
+ i = nkeys; j = -1;
7603
+ k = split_indx-1;
7604
+ }
7605
+ for (; i!=k; i+=j) {
7606
+ if (i == newindx) {
7607
+ psize += nsize;
7608
+ node = NULL;
7609
+ } else {
7610
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7611
+ psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7612
+ if (IS_LEAF(mp)) {
7613
+ if (F_ISSET(node->mn_flags, F_BIGDATA))
7614
+ psize += sizeof(pgno_t);
7615
+ else
7616
+ psize += NODEDSZ(node);
7547
7617
  }
7548
- else
7549
- split_indx = i;
7550
- break;
7618
+ psize += psize & 1;
7551
7619
  }
7552
- }
7553
- } else {
7554
- psize = nsize;
7555
- for (i=nkeys-1; i>=split_indx; i--) {
7556
- node = NODEPTR(mp, i);
7557
- psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7558
- if (F_ISSET(node->mn_flags, F_BIGDATA))
7559
- psize += sizeof(pgno_t);
7560
- else
7561
- psize += NODEDSZ(node);
7562
- psize += psize & 1;
7563
- if (psize > pmax) {
7564
- if (i >= newindx) {
7565
- split_indx = newindx;
7566
- newpos = 0;
7567
- } else
7568
- split_indx = i+1;
7620
+ if (psize > pmax || i == k-j) {
7621
+ split_indx = i + (j<0);
7569
7622
  break;
7570
7623
  }
7571
7624
  }
7572
7625
  }
7626
+ if (split_indx == newindx) {
7627
+ sepkey.mv_size = newkey->mv_size;
7628
+ sepkey.mv_data = newkey->mv_data;
7629
+ } else {
7630
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
7631
+ sepkey.mv_size = node->mn_ksize;
7632
+ sepkey.mv_data = NODEKEY(node);
7633
+ }
7573
7634
  }
7574
7635
  }
7575
7636
 
7576
- /* First find the separating key between the split pages.
7577
- * The case where newindx == split_indx is ambiguous; the
7578
- * new item could go to the new page or stay on the original
7579
- * page. If newpos == 1 it goes to the new page.
7580
- */
7581
- if (newindx == split_indx && newpos) {
7582
- sepkey.mv_size = newkey->mv_size;
7583
- sepkey.mv_data = newkey->mv_data;
7584
- } else {
7585
- node = NODEPTR(mp, split_indx);
7586
- sepkey.mv_size = node->mn_ksize;
7587
- sepkey.mv_data = NODEKEY(node);
7588
- }
7589
-
7590
- newsep:
7591
- DPRINTF(("separator is [%s]", DKEY(&sepkey)));
7637
+ DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
7592
7638
 
7593
7639
  /* Copy separator key to the parent.
7594
7640
  */
7595
- if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
7641
+ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
7596
7642
  mn.mc_snum--;
7597
7643
  mn.mc_top--;
7598
7644
  did_split = 1;
@@ -7637,117 +7683,97 @@ newsep:
7637
7683
  return rc;
7638
7684
  for (i=0; i<mc->mc_top; i++)
7639
7685
  mc->mc_ki[i] = mn.mc_ki[i];
7640
- goto done;
7641
- }
7642
- if (IS_LEAF2(rp)) {
7643
- goto done;
7644
- }
7645
-
7646
- /* Move half of the keys to the right sibling. */
7686
+ } else if (!IS_LEAF2(mp)) {
7687
+ /* Move nodes */
7688
+ mc->mc_pg[mc->mc_top] = rp;
7689
+ i = split_indx;
7690
+ j = 0;
7691
+ do {
7692
+ if (i == newindx) {
7693
+ rkey.mv_data = newkey->mv_data;
7694
+ rkey.mv_size = newkey->mv_size;
7695
+ if (IS_LEAF(mp)) {
7696
+ rdata = newdata;
7697
+ } else
7698
+ pgno = newpgno;
7699
+ flags = nflags;
7700
+ /* Update index for the new key. */
7701
+ mc->mc_ki[mc->mc_top] = j;
7702
+ } else {
7703
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7704
+ rkey.mv_data = NODEKEY(node);
7705
+ rkey.mv_size = node->mn_ksize;
7706
+ if (IS_LEAF(mp)) {
7707
+ xdata.mv_data = NODEDATA(node);
7708
+ xdata.mv_size = NODEDSZ(node);
7709
+ rdata = &xdata;
7710
+ } else
7711
+ pgno = NODEPGNO(node);
7712
+ flags = node->mn_flags;
7713
+ }
7647
7714
 
7648
- /* grab a page to hold a temporary copy */
7649
- copy = mdb_page_malloc(mc->mc_txn, 1);
7650
- if (copy == NULL)
7651
- return ENOMEM;
7715
+ if (!IS_LEAF(mp) && j == 0) {
7716
+ /* First branch index doesn't need key data. */
7717
+ rkey.mv_size = 0;
7718
+ }
7652
7719
 
7653
- copy->mp_pgno = mp->mp_pgno;
7654
- copy->mp_flags = mp->mp_flags;
7655
- copy->mp_lower = PAGEHDRSZ;
7656
- copy->mp_upper = mc->mc_txn->mt_env->me_psize;
7657
- mc->mc_pg[mc->mc_top] = copy;
7658
- for (i = j = 0; i <= nkeys; j++) {
7659
- if (i == split_indx) {
7660
- /* Insert in right sibling. */
7661
- /* Reset insert index for right sibling. */
7662
- if (i != newindx || (newpos ^ ins_new)) {
7720
+ rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
7721
+ if (rc) {
7722
+ /* return tmp page to freelist */
7723
+ mdb_page_free(env, copy);
7724
+ return rc;
7725
+ }
7726
+ if (i == nkeys) {
7727
+ i = 0;
7663
7728
  j = 0;
7664
- mc->mc_pg[mc->mc_top] = rp;
7729
+ mc->mc_pg[mc->mc_top] = copy;
7730
+ } else {
7731
+ i++;
7732
+ j++;
7733
+ }
7734
+ } while (i != split_indx);
7735
+
7736
+ nkeys = NUMKEYS(copy);
7737
+ for (i=0; i<nkeys; i++)
7738
+ mp->mp_ptrs[i] = copy->mp_ptrs[i];
7739
+ mp->mp_lower = copy->mp_lower;
7740
+ mp->mp_upper = copy->mp_upper;
7741
+ memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7742
+ env->me_psize - copy->mp_upper);
7743
+
7744
+ /* reset back to original page */
7745
+ if (newindx < split_indx) {
7746
+ mc->mc_pg[mc->mc_top] = mp;
7747
+ if (nflags & MDB_RESERVE) {
7748
+ node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7749
+ if (!(node->mn_flags & F_BIGDATA))
7750
+ newdata->mv_data = NODEDATA(node);
7665
7751
  }
7666
- }
7667
-
7668
- if (i == newindx && !ins_new) {
7669
- /* Insert the original entry that caused the split. */
7670
- rkey.mv_data = newkey->mv_data;
7671
- rkey.mv_size = newkey->mv_size;
7672
- if (IS_LEAF(mp)) {
7673
- rdata = newdata;
7674
- } else
7675
- pgno = newpgno;
7676
- flags = nflags;
7677
-
7678
- ins_new = 1;
7679
-
7680
- /* Update index for the new key. */
7681
- mc->mc_ki[mc->mc_top] = j;
7682
- } else if (i == nkeys) {
7683
- break;
7684
7752
  } else {
7685
- node = NODEPTR(mp, i);
7686
- rkey.mv_data = NODEKEY(node);
7687
- rkey.mv_size = node->mn_ksize;
7688
- if (IS_LEAF(mp)) {
7689
- xdata.mv_data = NODEDATA(node);
7690
- xdata.mv_size = NODEDSZ(node);
7691
- rdata = &xdata;
7692
- } else
7693
- pgno = NODEPGNO(node);
7694
- flags = node->mn_flags;
7695
-
7696
- i++;
7697
- }
7698
-
7699
- if (!IS_LEAF(mp) && j == 0) {
7700
- /* First branch index doesn't need key data. */
7701
- rkey.mv_size = 0;
7702
- }
7703
-
7704
- rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
7705
- if (rc) break;
7706
- }
7707
-
7708
- nkeys = NUMKEYS(copy);
7709
- for (i=0; i<nkeys; i++)
7710
- mp->mp_ptrs[i] = copy->mp_ptrs[i];
7711
- mp->mp_lower = copy->mp_lower;
7712
- mp->mp_upper = copy->mp_upper;
7713
- memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7714
- mc->mc_txn->mt_env->me_psize - copy->mp_upper);
7715
-
7716
- /* reset back to original page */
7717
- if (newindx < split_indx || (!newpos && newindx == split_indx)) {
7718
- mc->mc_pg[mc->mc_top] = mp;
7719
- if (nflags & MDB_RESERVE) {
7720
- node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7721
- if (!(node->mn_flags & F_BIGDATA))
7722
- newdata->mv_data = NODEDATA(node);
7723
- }
7724
- } else {
7725
- mc->mc_ki[ptop]++;
7726
- /* Make sure mc_ki is still valid.
7727
- */
7728
- if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7729
- mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7730
- for (i=0; i<ptop; i++) {
7731
- mc->mc_pg[i] = mn.mc_pg[i];
7732
- mc->mc_ki[i] = mn.mc_ki[i];
7753
+ mc->mc_pg[mc->mc_top] = rp;
7754
+ mc->mc_ki[ptop]++;
7755
+ /* Make sure mc_ki is still valid.
7756
+ */
7757
+ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7758
+ mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7759
+ for (i=0; i<ptop; i++) {
7760
+ mc->mc_pg[i] = mn.mc_pg[i];
7761
+ mc->mc_ki[i] = mn.mc_ki[i];
7762
+ }
7763
+ mc->mc_pg[ptop] = mn.mc_pg[ptop];
7764
+ mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7733
7765
  }
7734
- mc->mc_pg[ptop] = mn.mc_pg[ptop];
7735
- mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7736
7766
  }
7767
+ /* return tmp page to freelist */
7768
+ mdb_page_free(env, copy);
7737
7769
  }
7738
7770
 
7739
- /* return tmp page to freelist */
7740
- mdb_page_free(mc->mc_txn->mt_env, copy);
7741
- done:
7742
7771
  {
7743
7772
  /* Adjust other cursors pointing to mp */
7744
7773
  MDB_cursor *m2, *m3;
7745
7774
  MDB_dbi dbi = mc->mc_dbi;
7746
7775
  int fixup = NUMKEYS(mp);
7747
7776
 
7748
- if (mc->mc_flags & C_SUB)
7749
- dbi--;
7750
-
7751
7777
  for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7752
7778
  if (mc->mc_flags & C_SUB)
7753
7779
  m3 = &m2->mc_xcursor->mx_cursor;
@@ -7789,6 +7815,7 @@ done:
7789
7815
  }
7790
7816
  }
7791
7817
  }
7818
+ DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
7792
7819
  return rc;
7793
7820
  }
7794
7821
 
@@ -7805,13 +7832,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
7805
7832
  if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
7806
7833
  return EINVAL;
7807
7834
 
7808
- if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7809
- return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7810
-
7811
- if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
7812
- return MDB_BAD_VALSIZE;
7813
- }
7814
-
7815
7835
  if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
7816
7836
  return EINVAL;
7817
7837
 
@@ -7851,6 +7871,16 @@ mdb_env_get_path(MDB_env *env, const char **arg)
7851
7871
  return MDB_SUCCESS;
7852
7872
  }
7853
7873
 
7874
+ int
7875
+ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
7876
+ {
7877
+ if (!env || !arg)
7878
+ return EINVAL;
7879
+
7880
+ *arg = env->me_fd;
7881
+ return MDB_SUCCESS;
7882
+ }
7883
+
7854
7884
  /** Common code for #mdb_stat() and #mdb_env_stat().
7855
7885
  * @param[in] env the environment to operate in.
7856
7886
  * @param[in] db the #MDB_db record containing the stats to return.
@@ -8075,7 +8105,7 @@ mdb_drop0(MDB_cursor *mc, int subs)
8075
8105
  {
8076
8106
  int rc;
8077
8107
 
8078
- rc = mdb_page_search(mc, NULL, 0);
8108
+ rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
8079
8109
  if (rc == MDB_SUCCESS) {
8080
8110
  MDB_txn *txn = mc->mc_txn;
8081
8111
  MDB_node *ni;
@@ -8273,10 +8303,10 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8273
8303
  return 0;
8274
8304
  }
8275
8305
 
8276
- /* insert pid into list if not already present.
8306
+ /** Insert pid into list if not already present.
8277
8307
  * return -1 if already present.
8278
8308
  */
8279
- static int mdb_pid_insert(pid_t *ids, pid_t pid)
8309
+ static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8280
8310
  {
8281
8311
  /* binary search of pid in list */
8282
8312
  unsigned base = 0;
@@ -8301,7 +8331,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid)
8301
8331
  return -1;
8302
8332
  }
8303
8333
  }
8304
-
8334
+
8305
8335
  if( val > 0 ) {
8306
8336
  ++cursor;
8307
8337
  }
@@ -8316,7 +8346,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
8316
8346
  {
8317
8347
  unsigned int i, j, rdrs;
8318
8348
  MDB_reader *mr;
8319
- pid_t *pids, pid;
8349
+ MDB_PID_T *pids, pid;
8320
8350
  int count = 0;
8321
8351
 
8322
8352
  if (!env)
@@ -8326,7 +8356,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
8326
8356
  if (!env->me_txns)
8327
8357
  return MDB_SUCCESS;
8328
8358
  rdrs = env->me_txns->mti_numreaders;
8329
- pids = malloc((rdrs+1) * sizeof(pid_t));
8359
+ pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
8330
8360
  if (!pids)
8331
8361
  return ENOMEM;
8332
8362
  pids[0] = 0;
@@ -8342,6 +8372,8 @@ int mdb_reader_check(MDB_env *env, int *dead)
8342
8372
  if (!mdb_reader_pid(env, Pidcheck, pid)) {
8343
8373
  for (j=i; j<rdrs; j++)
8344
8374
  if (mr[j].mr_pid == pid) {
8375
+ DPRINTF(("clear stale reader pid %u txn %"Z"d",
8376
+ (unsigned) pid, mr[j].mr_txnid));
8345
8377
  mr[j].mr_pid = 0;
8346
8378
  count++;
8347
8379
  }