RubyGems - lmdb - Versions diffs - 0.3.1 → 0.4.0 - Mend

lmdb 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/.travis.yml +1 -0
data/CHANGES +4 -0
data/README.md +4 -1
data/ext/lmdb_ext/liblmdb/CHANGES +26 -1
data/ext/lmdb_ext/liblmdb/lmdb.h +80 -10
data/ext/lmdb_ext/liblmdb/mdb.c +648 -616
data/ext/lmdb_ext/liblmdb/midl.c +8 -8
data/ext/lmdb_ext/liblmdb/midl.h +1 -1
data/ext/lmdb_ext/lmdb_ext.c +28 -45
data/ext/lmdb_ext/lmdb_ext.h +4 -7
data/lib/lmdb/version.rb +1 -1
data/spec/lmdb_spec.rb +7 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c192851303ba3bd76a139142c94322233284d3ed
-  data.tar.gz: b9482f90f6599d71f49989246fe9f2c526f47986
+  metadata.gz: 1e16a42693e5150e076829337a0d44feb46bc3f3
+  data.tar.gz: ce0bc77de0cf3c7e17df2522c8eab69e760e1221
 SHA512:
-  metadata.gz: e8fa0b74c2854ebd13ba07663ee65be141cefe08ae55075f6fd8817c805e21f48d32c886d687f4e85334958bfa1d9f39935a9764d343755dec3adbd7b3d67b08
-  data.tar.gz: c760928ef6d4fb1edb6a5a5d5432b9a8823af38e12457efaef74640aef3b7a6e7956bd258cf22b68c54a89136742fc48329c78ac0ce9db2bbfa1b135d8dd3a8a
+  metadata.gz: e02924de6a59386ec215b45baf321a6d098d818900f7816a3805185ab5b7441427cc5af866d2d8141eecfd1d166a659ebc2a87d5ae7626a9a574bd4d1bd4bf44
+  data.tar.gz: 3ee55c5879fd385e53d25eeffd694dab0de03c5b152fe07562f9b631de25ccd90f09ae886a1d30b7f9abd6ea3a8da3fedac0b1f9d69dbffc3874bfad18570dd9

data/.travis.yml CHANGED Viewed

@@ -3,6 +3,7 @@ rvm:
   - 1.8.7
   - 1.9.3
   - 2.0.0
+  - 2.1.0
   - ruby-head
   - rbx-18mode
   - rbx-19mode

data/CHANGES CHANGED Viewed

@@ -1,3 +1,7 @@
+0.4.0
+  * Print warnings if open LMDB objects are found during garbage collection
 0.3.1
   * Minor fixes

data/README.md CHANGED Viewed

@@ -1,6 +1,9 @@
 # LMDB
-Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LLMDB)
+[![Gittip donate button](http://img.shields.io/gittip/bevry.png)](https://www.gittip.com/min4d/ "Donate weekly to this project using Gittip")
+[![Flattr this git repo](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=min4d&url=https://github.com/minad/lmdb&title=LMDB&language=&tags=github&category=software)
+Ruby bindings for the amazing OpenLDAP's Lightning Memory-Mapped Database (LMDB)
 http://symas.com/mdb/
 ### Installation

data/ext/lmdb_ext/liblmdb/CHANGES CHANGED Viewed

@@ -1,6 +1,31 @@
 LMDB 0.9 Change Log
-LMDB 0.9.8 Engineering
+LMDB 0.9.10 Release (2013/11/12)
+	Add MDB_NOMEMINIT option
+	Fix mdb_page_split() again (ITS#7589)
+	Fix MDB_NORDAHEAD definition (ITS#7734)
+	Fix mdb_cursor_del() positioning (ITS#7733)
+	Partial fix for larger page sizes (ITS#7713)
+	Fix Windows64/MSVC build issues
+LMDB 0.9.9 Release (2013/10/24)
+	Add mdb_env_get_fd()
+	Add MDB_NORDAHEAD option
+	Add MDB_NOLOCK option
+	Avoid wasting space in mdb_page_split() (ITS#7589)
+	Fix mdb_page_merge() cursor fixup (ITS#7722)
+	Fix mdb_cursor_del() on last delete (ITS#7718)
+	Fix adding WRITEMAP on existing env (ITS#7715)
+	Fixes for nested txns (ITS#7515)
+	Fix mdb_env_copy() O_DIRECT bug (ITS#7682)
+	Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681)
+	Fix mdb_rebalance() cursor fixup (ITS#7701)
+	Misc code cleanup
+	Documentation
+		Note that by default, readers need write access
+LMDB 0.9.8 Release (2013/09/09)
 	Allow mdb_env_set_mapsize() on an open environment
 	Fix mdb_dbi_flags() (ITS#7672)
 	Fix mdb_page_unspill() in nested txns

data/ext/lmdb_ext/liblmdb/lmdb.h CHANGED Viewed

@@ -66,6 +66,20 @@
  *	  BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
  *	  Multiple users can cause startup to fail later, as noted above.
  *
+ *	- There is normally no pure read-only mode, since readers need write
+ *	  access to locks and lock file. Exceptions: On read-only filesystems
+ *	  or with the #MDB_NOLOCK flag described under #mdb_env_open().
+ *
+ *	- By default, in versions before 0.9.10, unused portions of the data
+ *	  file might receive garbage data from memory freed by other code.
+ *	  (This does not happen when using the #MDB_WRITEMAP flag.) As of
+ *	  0.9.10 the default behavior is to initialize such memory before
+ *	  writing to the data file. Since there may be a slight performance
+ *	  cost due to this initialization, applications may disable it using
+ *	  the #MDB_NOMEMINIT flag. Applications handling sensitive data
+ *	  which must not be written should not use this flag. This flag is
+ *	  irrelevant when using #MDB_WRITEMAP.
+ *
  *	- A thread can only use one transaction at a time, plus any child
  *	  transactions.  Each transaction belongs to one thread.  See below.
  *	  The #MDB_NOTLS flag changes this for read-only transactions.
@@ -170,7 +184,7 @@ typedef int mdb_filehandle_t;
 /** Library minor version */
 #define MDB_VERSION_MINOR	9
 /** Library patch version */
-#define MDB_VERSION_PATCH	8
+#define MDB_VERSION_PATCH	10
 /** Combine args a,b,c into a single integer for easy version comparisons */
 #define MDB_VERINT(a,b,c)	(((a) << 24) | ((b) << 16) | (c))
@@ -180,7 +194,7 @@ typedef int mdb_filehandle_t;
 	MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
 /** The release date of this library version */
-#define MDB_VERSION_DATE	"September 9, 2013"
+#define MDB_VERSION_DATE	"November 11, 2013"
 /** A stringifier for the version info */
 #define MDB_VERSTR(a,b,c,d)	"MDB " #a "." #b "." #c ": (" d ")"
@@ -216,13 +230,13 @@ typedef struct MDB_cursor MDB_cursor;
 /** @brief Generic structure used for passing keys and data in and out
  * of the database.
  *
- * Key sizes must be between 1 and the liblmdb build-time constant
- * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
- * same applies to data sizes in databases with the #MDB_DUPSORT flag.
- * Other data items can in theory be from 0 to 0xffffffff bytes long.
- *
  * Values returned from the database are valid only until a subsequent
- * update operation, or the end of the transaction.
+ * update operation, or the end of the transaction. Do not modify or
+ * free them, they commonly point into the database itself.
+ *
+ * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
+ * The same applies to data sizes in databases with the #MDB_DUPSORT flag.
+ * Other data items can in theory be from 0 to 0xffffffff bytes long.
  */
 typedef struct MDB_val {
 	size_t		 mv_size;	/**< size of the data item */
@@ -265,10 +279,16 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
 #define MDB_NOMETASYNC		0x40000
 	/** use writable mmap */
 #define MDB_WRITEMAP		0x80000
-	/** use asynchronous msync when MDB_WRITEMAP is used */
+	/** use asynchronous msync when #MDB_WRITEMAP is used */
 #define MDB_MAPASYNC		0x100000
 	/** tie reader locktable slots to #MDB_txn objects instead of to threads */
 #define MDB_NOTLS		0x200000
+	/** don't do any locking, caller must manage their own locks */
+#define MDB_NOLOCK		0x400000
+	/** don't do readahead (no effect on Windows) */
+#define MDB_NORDAHEAD	0x800000
+	/** don't initialize malloc'd memory before writing to datafile */
+#define MDB_NOMEMINIT	0x1000000
 /** @} */
 /**	@defgroup	mdb_dbi_open	Database Flags
@@ -486,6 +506,8 @@ int  mdb_env_create(MDB_env **env);
 	 *		and uses fewer mallocs, but loses protection from application bugs
 	 *		like wild pointer writes and other bad updates into the database.
 	 *		Incompatible with nested transactions.
+	 *		Processes with and without MDB_WRITEMAP on the same environment do
+	 *		not cooperate well.
 	 *	<li>#MDB_NOMETASYNC
 	 *		Flush system buffers to disk only once per transaction, omit the
 	 *		metadata flush. Defer that until the system flushes files to disk,
@@ -523,6 +545,38 @@ int  mdb_env_create(MDB_env **env);
 	 *		user threads over individual OS threads need this option. Such an
 	 *		application must also serialize the write transactions in an OS
 	 *		thread, since MDB's write locking is unaware of the user threads.
+	 *	<li>#MDB_NOLOCK
+	 *		Don't do any locking. If concurrent access is anticipated, the
+	 *		caller must manage all concurrency itself. For proper operation
+	 *		the caller must enforce single-writer semantics, and must ensure
+	 *		that no readers are using old transactions while a writer is
+	 *		active. The simplest approach is to use an exclusive lock so that
+	 *		no readers may be active at all when a writer begins.
+	 *	<li>#MDB_NORDAHEAD
+	 *		Turn off readahead. Most operating systems perform readahead on
+	 *		read requests by default. This option turns it off if the OS
+	 *		supports it. Turning it off may help random read performance
+	 *		when the DB is larger than RAM and system RAM is full.
+	 *		The option is not implemented on Windows.
+	 *	<li>#MDB_NOMEMINIT
+	 *		Don't initialize malloc'd memory before writing to unused spaces
+	 *		in the data file. By default, memory for pages written to the data
+	 *		file is obtained using malloc. While these pages may be reused in
+	 *		subsequent transactions, freshly malloc'd pages will be initialized
+	 *		to zeroes before use. This avoids persisting leftover data from other
+	 *		code (that used the heap and subsequently freed the memory) into the
+	 *		data file. Note that many other system libraries may allocate
+	 *		and free memory from the heap for arbitrary uses. E.g., stdio may
+	 *		use the heap for file I/O buffers. This initialization step has a
+	 *		modest performance cost so some applications may want to disable
+	 *		it using this flag. This option can be a problem for applications
+	 *		which handle sensitive data like passwords, and it makes memory
+	 *		checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
+	 *		which writes directly to the mmap instead of using malloc for pages. The
+	 *		initialization is also skipped if #MDB_RESERVE is used; the
+	 *		caller is expected to overwrite all of the memory that was
+	 *		reserved in that case.
+	 *		This flag may be changed at any time using #mdb_env_set_flags().
 	 * </ul>
 	 * @param[in] mode The UNIX permissions to set on created files. This parameter
 	 * is ignored on Windows.
@@ -656,6 +710,18 @@ int  mdb_env_get_flags(MDB_env *env, unsigned int *flags);
 	 */
 int  mdb_env_get_path(MDB_env *env, const char **path);
+	/** @brief Return the filedescriptor for the given environment.
+	 *
+	 * @param[in] env An environment handle returned by #mdb_env_create()
+	 * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
+	 * @return A non-zero error value on failure and 0 on success. Some possible
+	 * errors are:
+	 * <ul>
+	 *	<li>EINVAL - an invalid parameter was specified.
+	 * </ul>
+	 */
+int  mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
 	/** @brief Set the size of the memory map to use for this environment.
 	 *
 	 * The size should be a multiple of the OS page size. The default is
@@ -733,8 +799,10 @@ int  mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
 	/** @brief Get the maximum size of a key for the environment.
 	 *
+	 * This is the compile-time constant #MDB_MAXKEYSIZE, default 511.
+	 * See @ref MDB_val.
 	 * @param[in] env An environment handle returned by #mdb_env_create()
-	 * @return The maximum size of a key. (#MDB_MAXKEYSIZE)
+	 * @return The maximum size of a key
 	 */
 int  mdb_env_get_maxkeysize(MDB_env *env);
@@ -1094,6 +1162,8 @@ int  mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
 	 *		reserved space, which the caller can fill in later - before
 	 *		the next update operation or the transaction ends. This saves
 	 *		an extra memcpy if the data is being generated later.
+	 *		MDB does nothing else with this memory, the caller is expected
+	 *		to modify all of the space requested.
 	 *	<li>#MDB_APPEND - append the given key/data pair to the end of the
 	 *		database. No key comparisons are performed. This option allows
 	 *		fast bulk loading when keys are already known to be in the

data/ext/lmdb_ext/liblmdb/mdb.c CHANGED Viewed

@@ -37,10 +37,26 @@
 #endif
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <sys/param.h>
 #ifdef _WIN32
 #include <windows.h>
+/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
+ *  as int64 which is wrong. MSVC doesn't define it at all, so just
+ *  don't use it.
+ */
+#define MDB_PID_T	int
+#ifdef __GNUC__
+# include <sys/param.h>
 #else
+# define LITTLE_ENDIAN	1234
+# define BIG_ENDIAN	4321
+# define BYTE_ORDER	LITTLE_ENDIAN
+# ifndef SSIZE_MAX
+#  define SSIZE_MAX	INT_MAX
+# endif
+#endif
+#else
+#define MDB_PID_T	pid_t
+#include <sys/param.h>
 #include <sys/uio.h>
 #include <sys/mman.h>
 #ifdef HAVE_SYS_FILE_H
@@ -75,6 +91,7 @@
 #ifndef _WIN32
 #include <pthread.h>
 #ifdef MDB_USE_POSIX_SEM
+# define MDB_USE_HASH		1
 #include <semaphore.h>
 #endif
 #endif
@@ -140,6 +157,7 @@
  *	@{
  */
 #ifdef _WIN32
+#define MDB_USE_HASH	1
 #define MDB_PIDLOCK	0
 #define pthread_t	DWORD
 #define pthread_mutex_t	HANDLE
@@ -171,7 +189,7 @@
 #define	Z	"I"
 #else
-#define	Z	"z"
+#define	Z	"z"			/**< printf format modifier for size_t */
 	/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
 #define MDB_PIDLOCK			1
@@ -317,12 +335,18 @@ static txnid_t mdb_debug_start;
 	 *	The string is printed literally, with no format processing.
 	 */
 #define DPUTS(arg)	DPRINTF(("%s", arg))
+	/** Debuging output value of a cursor DBI: Negative in a sub-cursor. */
+#define DDBI(mc) \
+	(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
 /** @} */
-	/** A default memory page size.
-	 *	The actual size is platform-dependent, but we use this for
-	 *	boot-strapping. We probably should not be using this any more.
-	 *	The #GET_PAGESIZE() macro is used to get the actual size.
+	/**	@brief The maximum size of a database page.
+	 *
+	 *	This is 32k, since it must fit in #MDB_page.#mp_upper.
+	 *
+	 *	LMDB will use database pages < OS pages if needed.
+	 *	That causes more I/O in write transactions: The OS must
+	 *	know (read) the whole page before writing a partial page.
 	 *
 	 *	Note that we don't currently support Huge pages. On Linux,
 	 *	regular data files cannot use Huge pages, and in general
@@ -331,7 +355,7 @@ static txnid_t mdb_debug_start;
 	 *	pressure from other processes is high. So until OSs have
 	 *	actual paging support for Huge pages, they're not viable.
 	 */
-#define MDB_PAGESIZE	 4096
+#define MAX_PAGESIZE	 0x8000
 	/** The minimum number of keys required in a database page.
 	 *	Setting this to a larger value will place a smaller bound on the
@@ -365,7 +389,7 @@ static txnid_t mdb_debug_start;
 	 *
 	 *	We require that keys all fit onto a regular page. This limit
 	 *	could be raised a bit further if needed; to something just
-	 *	under #MDB_PAGESIZE / #MDB_MINKEYS.
+	 *	under (page size / #MDB_MINKEYS / 3).
 	 *
 	 *	Note that data items in an #MDB_DUPSORT database are actually keys
 	 *	of a subDB, so they're also limited to this size.
@@ -425,7 +449,8 @@ typedef uint16_t	 indx_t;
  *
  *	If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
  *
- *	No reader table is used if the database is on a read-only filesystem.
+ *	No reader table is used if the database is on a read-only filesystem, or
+ *	if #MDB_NOLOCK is set.
  *
  *	Since the database uses multi-version concurrency control, readers don't
  *	actually need any locking. This table is used to keep track of which
@@ -488,7 +513,7 @@ typedef struct MDB_rxbody {
 	 */
 	txnid_t		mrb_txnid;
 	/** The process ID of the process owning this reader txn. */
-	pid_t		mrb_pid;
+	MDB_PID_T	mrb_pid;
 	/** The thread ID of the thread owning this txn. */
 	pthread_t	mrb_tid;
 } MDB_rxbody;
@@ -600,7 +625,7 @@ typedef struct MDB_page {
 #define	P_LEAF		 0x02		/**< leaf page */
 #define	P_OVERFLOW	 0x04		/**< overflow page */
 #define	P_META		 0x08		/**< meta page */
-#define	P_DIRTY		 0x10		/**< dirty page */
+#define	P_DIRTY		 0x10		/**< dirty page, also set for #P_SUBP pages */
 #define	P_LEAF2		 0x20		/**< for #MDB_DUPFIXED records */
 #define	P_SUBP		 0x40		/**< for #MDB_DUPSORT sub-pages */
 #define	P_KEEP		 0x8000		/**< leave this page alone during spill */
@@ -786,7 +811,10 @@ typedef struct MDB_db {
 	/** Handle for the default DB. */
 #define	MAIN_DBI	1
-	/** Meta page content. */
+	/** Meta page content.
+	 *	A meta page is the start point for accessing a database snapshot.
+	 *	Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
+	 */
 typedef struct MDB_meta {
 		/** Stamp identifying this as an MDB file. It must be set
 		 *	to #MDB_MAGIC. */
@@ -804,19 +832,18 @@ typedef struct MDB_meta {
 	txnid_t		mm_txnid;			/**< txnid that committed this page */
 } MDB_meta;
-	/** Buffer for a stack-allocated dirty page.
+	/** Buffer for a stack-allocated meta page.
 	 *	The members define size and alignment, and silence type
 	 *	aliasing warnings.  They are not used directly; that could
 	 *	mean incorrectly using several union members in parallel.
 	 */
-typedef union MDB_pagebuf {
-	char		mb_raw[MDB_PAGESIZE];
+typedef union MDB_metabuf {
 	MDB_page	mb_page;
 	struct {
 		char		mm_pad[PAGEHDRSZ];
 		MDB_meta	mm_meta;
 	} mb_metabuf;
-} MDB_pagebuf;
+} MDB_metabuf;
 	/** Auxiliary DB info.
 	 *	The information here is mostly static/read-only. There is
@@ -865,9 +892,9 @@ struct MDB_txn {
  *	@ingroup internal
  * @{
  */
-#define DB_DIRTY	0x01		/**< DB was written in this txn */
-#define DB_STALE	0x02		/**< DB record is older than txnID */
-#define DB_NEW		0x04		/**< DB handle opened in this txn */
+#define DB_DIRTY	0x01		/**< DB was modified or is DUPSORT data */
+#define DB_STALE	0x02		/**< Named-DB record is older than txnID */
+#define DB_NEW		0x04		/**< Named-DB handle opened in this txn */
 #define DB_VALID	0x08		/**< DB handle is valid, see also #MDB_VALID */
 /** @} */
 	/** In write txns, array of cursors for each DB */
@@ -889,12 +916,12 @@ struct MDB_txn {
 #define MDB_TXN_SPILLS		0x08		/**< txn or a parent has spilled pages */
 /** @} */
 	unsigned int	mt_flags;		/**< @ref mdb_txn */
-	/** dirty_list maxsize - # of allocated pages allowed, including in parent txns */
-	unsigned int	mt_dirty_room;
-	/** Tracks which of the two meta pages was used at the start
-	 * 	of this transaction.
+	/** dirty_list room: Array size - #dirty pages visible to this txn.
+	 *	Includes ancestor txns' dirty pages not hidden by other txns'
+	 *	dirty/spilled pages. Thus commit(nested txn) has room to merge
+	 *	dirty_list into mt_parent after freeing hidden mt_parent pages.
 	 */
-	unsigned int	mt_toggle;
+	unsigned int	mt_dirty_room;
 };
 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
@@ -905,7 +932,14 @@ struct MDB_txn {
 struct MDB_xcursor;
-	/** Cursors are used for all DB operations */
+	/** Cursors are used for all DB operations.
+	 *	A cursor holds a path of (page pointer, key index) from the DB
+	 *	root to a position in the DB, plus other state. #MDB_DUPSORT
+	 *	cursors include an xcursor to the current data item. Write txns
+	 *	track their cursors and keep them up to date when data moves.
+	 *	Exception: An xcursor's pointer to a #P_SUBP page can be stale.
+	 *	(A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
+	 */
 struct MDB_cursor {
 	/** Next cursor on this DB in this txn */
 	MDB_cursor	*mc_next;
@@ -978,16 +1012,18 @@ struct MDB_env {
 	/** Have liveness lock in reader table */
 #define	MDB_LIVE_READER	0x08000000U
 	uint32_t 	me_flags;		/**< @ref mdb_env */
-	unsigned int	me_psize;	/**< size of a page, from #GET_PAGESIZE */
+	unsigned int	me_psize;	/**< DB page size, inited from me_os_psize */
+	unsigned int	me_os_psize;	/**< OS page size, from #GET_PAGESIZE */
 	unsigned int	me_maxreaders;	/**< size of the reader table */
 	unsigned int	me_numreaders;	/**< max numreaders set by this env */
 	MDB_dbi		me_numdbs;		/**< number of DBs opened */
 	MDB_dbi		me_maxdbs;		/**< size of the DB table */
-	pid_t		me_pid;		/**< process ID of this env */
+	MDB_PID_T	me_pid;		/**< process ID of this env */
 	char		*me_path;		/**< path to the DB files */
 	char		*me_map;		/**< the memory map of the data file */
 	MDB_txninfo	*me_txns;		/**< the memory map of the lock file or NULL */
 	MDB_meta	*me_metas[2];	/**< pointers to the two meta pages */
+	void		*me_pbuf;		/**< scratch area for DUPSORT put() */
 	MDB_txn		*me_txn;		/**< current write transaction */
 	size_t		me_mapsize;		/**< size of the data memory map */
 	off_t		me_size;		/**< current file size */
@@ -1019,8 +1055,8 @@ struct MDB_env {
 	/** Nested transaction */
 typedef struct MDB_ntxn {
-	MDB_txn		mnt_txn;		/* the transaction */
-	MDB_pgstate	mnt_pgstate;	/* parent transaction's saved freestate */
+	MDB_txn		mnt_txn;		/**< the transaction */
+	MDB_pgstate	mnt_pgstate;	/**< parent transaction's saved freestate */
 } MDB_ntxn;
 	/** max number of pages to commit in one writev() call */
@@ -1042,6 +1078,8 @@ static int  mdb_page_search_root(MDB_cursor *mc,
 			    MDB_val *key, int modify);
 #define MDB_PS_MODIFY	1
 #define MDB_PS_ROOTONLY	2
+#define MDB_PS_FIRST	4
+#define MDB_PS_LAST		8
 static int  mdb_page_search(MDB_cursor *mc,
 			    MDB_val *key, int flags);
 static int	mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
@@ -1255,7 +1293,7 @@ static void mdb_audit(MDB_txn *txn)
 			txn->mt_dbs[i].md_leaf_pages +
 			txn->mt_dbs[i].md_overflow_pages;
 		if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
-			mdb_page_search(&mc, NULL, 0);
+			mdb_page_search(&mc, NULL, MDB_PS_FIRST);
 			do {
 				unsigned j;
 				MDB_page *mp;
@@ -1300,7 +1338,12 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
 {
 	MDB_env *env = txn->mt_env;
 	MDB_page *ret = env->me_dpages;
-	size_t sz = env->me_psize;
+	size_t psize = env->me_psize, sz = psize, off;
+	/* For ! #MDB_NOMEMINIT, psize counts how much to init.
+	 * For a single page alloc, we init everything after the page header.
+	 * For multi-page, we init the final page; if the caller needed that
+	 * many pages they will be filling in at least up to the last page.
+	 */
 	if (num == 1) {
 		if (ret) {
 			VGMEMP_ALLOC(env, ret, sz);
@@ -1308,10 +1351,16 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
 			env->me_dpages = ret->mp_next;
 			return ret;
 		}
+		psize -= off = PAGEHDRSZ;
 	} else {
 		sz *= num;
+		off = sz - psize;
 	}
 	if ((ret = malloc(sz)) != NULL) {
+		if (!(env->me_flags & MDB_NOMEMINIT)) {
+			memset((char *)ret + off, 0, psize);
+			ret->mp_pad = 0;
+		}
 		VGMEMP_ALLOC(env, ret, sz);
 	}
 	return ret;
@@ -1329,7 +1378,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
 	env->me_dpages = mp;
 }
-/* Free a dirty page */
+/** Free a dirty page */
 static void
 mdb_dpage_free(MDB_env *env, MDB_page *dp)
 {
@@ -1356,7 +1405,7 @@ mdb_dlist_free(MDB_txn *txn)
 	dl[0].mid = 0;
 }
-/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
+/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
  * @param[in] mc A cursor handle for the current operation.
  * @param[in] pflags Flags of the pages to update:
  * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
@@ -1366,10 +1415,12 @@ mdb_dlist_free(MDB_txn *txn)
 static int
 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
 {
+	enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
 	MDB_txn *txn = mc->mc_txn;
 	MDB_cursor *m3;
 	MDB_xcursor *mx;
-	MDB_page *dp;
+	MDB_page *dp, *mp;
+	MDB_node *leaf;
 	unsigned i, j;
 	int rc = MDB_SUCCESS, level;
@@ -1378,14 +1429,24 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
 		mc = NULL;				/* will find mc in mt_cursors */
 	for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
 		for (; mc; mc=mc->mc_next) {
-			for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) {
-					for (j=0; j<m3->mc_snum; j++)
-						if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP))
-								== pflags)
-							m3->mc_pg[j]->mp_flags ^= P_KEEP;
-					mx = m3->mc_xcursor;
-					if (mx == NULL)
-						break;
+			if (!(mc->mc_flags & C_INITIALIZED))
+				continue;
+			for (m3 = mc;; m3 = &mx->mx_cursor) {
+				mp = NULL;
+				for (j=0; j<m3->mc_snum; j++) {
+					mp = m3->mc_pg[j];
+					if ((mp->mp_flags & Mask) == pflags)
+						mp->mp_flags ^= P_KEEP;
+				}
+				mx = m3->mc_xcursor;
+				/* Proceed to mx if it is at a sub-database */
+				if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
+					break;
+				if (! (mp && (mp->mp_flags & P_LEAF)))
+					break;
+				leaf = NODEPTR(mp, m3->mc_ki[j-1]);
+				if (!(leaf->mn_flags & F_SUBDATA))
+					break;
 			}
 		}
 		if (i == 0)
@@ -1401,7 +1462,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
 					continue;
 				if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
 					break;
-				if ((dp->mp_flags & (P_DIRTY|P_KEEP)) == pflags && level <= 1)
+				if ((dp->mp_flags & Mask) == pflags && level <= 1)
 					dp->mp_flags ^= P_KEEP;
 			}
 		}
@@ -1415,15 +1476,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
 /**	Spill pages from the dirty list back to disk.
  * This is intended to prevent running into #MDB_TXN_FULL situations,
  * but note that they may still occur in a few cases:
- *	1) pages in #MDB_DUPSORT sub-DBs are never spilled, so if there
- *	 are too many of these dirtied in one txn, the txn may still get
- *	 too full.
+ *	1) our estimate of the txn size could be too small. Currently this
+ *	 seems unlikely, except with a large number of #MDB_MULTIPLE items.
  *	2) child txns may run out of space if their parents dirtied a
  *	 lot of pages and never spilled them. TODO: we probably should do
  *	 a preemptive spill during #mdb_txn_begin() of a child txn, if
  *	 the parent's dirty_room is below a given threshold.
- *	3) our estimate of the txn size could be too small. At the
- *	 moment this seems unlikely.
  *
  * Otherwise, if not using nested txns, it is expected that apps will
  * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
@@ -1541,31 +1599,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
 	rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
 done:
-	if (rc == 0) {
-		if (txn->mt_parent) {
-			txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid;
-			/* dirty pages that are dirty in an ancestor don't
-			 * count against this txn's dirty_room.
-			 */
-			for (i=1; i<=dl[0].mid; i++) {
-				pgno_t pgno = dl[i].mid;
-				MDB_txn *tx2;
-				for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
-					j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
-					if (j <= tx2->mt_u.dirty_list[0].mid &&
-						tx2->mt_u.dirty_list[j].mid == pgno) {
-						txn->mt_dirty_room++;
-						break;
-					}
-				}
-			}
-		} else {
-			txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid;
-		}
-		txn->mt_flags |= MDB_TXN_SPILLS;
-	} else {
-		txn->mt_flags |= MDB_TXN_ERROR;
-	}
+	txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
 	return rc;
 }
@@ -1575,12 +1609,14 @@ mdb_find_oldest(MDB_txn *txn)
 {
 	int i;
 	txnid_t mr, oldest = txn->mt_txnid - 1;
-	MDB_reader *r = txn->mt_env->me_txns->mti_readers;
-	for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
-		if (r[i].mr_pid) {
-			mr = r[i].mr_txnid;
-			if (oldest > mr)
-				oldest = mr;
+	if (txn->mt_env->me_txns) {
+		MDB_reader *r = txn->mt_env->me_txns->mti_readers;
+		for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
+			if (r[i].mr_pid) {
+				mr = r[i].mr_txnid;
+				if (oldest > mr)
+					oldest = mr;
+			}
 		}
 	}
 	return oldest;
@@ -1790,26 +1826,28 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
 /** Pull a page off the txn's spill list, if present.
  * If a page being referenced was spilled to disk in this txn, bring
  * it back and make it dirty/writable again.
- * @param[in] tx0 the transaction handle.
+ * @param[in] txn the transaction handle.
  * @param[in] mp the page being referenced.
  * @param[out] ret the writable page, if any. ret is unchanged if
  * mp wasn't spilled.
  */
 static int
-mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
+mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
 {
-	MDB_env *env = tx0->mt_env;
-	MDB_txn *txn;
+	MDB_env *env = txn->mt_env;
+	const MDB_txn *tx2;
 	unsigned x;
 	pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
-	for (txn = tx0; txn; txn=txn->mt_parent) {
-		if (!txn->mt_spill_pgs)
+	for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
+		if (!tx2->mt_spill_pgs)
 			continue;
-		x = mdb_midl_search(txn->mt_spill_pgs, pn);
-		if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) {
+		x = mdb_midl_search(tx2->mt_spill_pgs, pn);
+		if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
 			MDB_page *np;
 			int num;
+			if (txn->mt_dirty_room == 0)
+				return MDB_TXN_FULL;
 			if (IS_OVERFLOW(mp))
 				num = mp->mp_pages;
 			else
@@ -1825,7 +1863,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
 				else
 					mdb_page_copy(np, mp, env->me_psize);
 			}
-			if (txn == tx0) {
+			if (tx2 == txn) {
 				/* If in current txn, this page is no longer spilled.
 				 * If it happens to be the last page, truncate the spill list.
 				 * Otherwise mark it as deleted by setting the LSB.
@@ -1838,22 +1876,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret)
 				 * page remains spilled until child commits
 				 */
-			if (txn->mt_parent) {
-				MDB_txn *tx2;
-				/* If this page is also in a parent's dirty list, then
-				 * it's already accounted in dirty_room, and we need to
-				 * cancel out the decrement that mdb_page_dirty does.
-				 */
-				for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
-					x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno);
-					if (x <= tx2->mt_u.dirty_list[0].mid &&
-						tx2->mt_u.dirty_list[x].mid == pgno) {
-						tx0->mt_dirty_room++;
-						break;
-					}
-				}
-			}
-			mdb_page_dirty(tx0, np);
+			mdb_page_dirty(txn, np);
 			np->mp_flags |= P_DIRTY;
 			*ret = np;
 			break;
@@ -1872,7 +1895,6 @@ mdb_page_touch(MDB_cursor *mc)
 	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
 	MDB_txn *txn = mc->mc_txn;
 	MDB_cursor *m2, *m3;
-	MDB_dbi dbi;
 	pgno_t	pgno;
 	int rc;
@@ -1889,7 +1911,8 @@ mdb_page_touch(MDB_cursor *mc)
 			(rc = mdb_page_alloc(mc, 1, &np)))
 			return rc;
 		pgno = np->mp_pgno;
-		DPRINTF(("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno));
+		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
+			mp->mp_pgno, pgno));
 		assert(mp->mp_pgno != pgno);
 		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
 		/* Update the parent page, if any, to point to the new page */
@@ -1935,17 +1958,16 @@ mdb_page_touch(MDB_cursor *mc)
 done:
 	/* Adjust cursors pointing to mp */
 	mc->mc_pg[mc->mc_top] = np;
-	dbi = mc->mc_dbi;
+	m2 = txn->mt_cursors[mc->mc_dbi];
 	if (mc->mc_flags & C_SUB) {
-		dbi--;
-		for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
+		for (; m2; m2=m2->mc_next) {
 			m3 = &m2->mc_xcursor->mx_cursor;
 			if (m3->mc_snum < mc->mc_snum) continue;
 			if (m3->mc_pg[mc->mc_top] == mp)
 				m3->mc_pg[mc->mc_top] = np;
 		}
 	} else {
-		for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
+		for (; m2; m2=m2->mc_next) {
 			if (m2->mc_snum < mc->mc_snum) continue;
 			if (m2->mc_pg[mc->mc_top] == mp) {
 				m2->mc_pg[mc->mc_top] = np;
@@ -2087,7 +2109,7 @@ enum Pidlock_op {
  * lock on the lockfile, set at an offset equal to the pid.
  */
 static int
-mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
+mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
 {
 #if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
 	int ret = 0;
@@ -2130,7 +2152,9 @@ static int
 mdb_txn_renew0(MDB_txn *txn)
 {
 	MDB_env *env = txn->mt_env;
-	unsigned int i;
+	MDB_txninfo *ti = env->me_txns;
+	MDB_meta *meta;
+	unsigned int i, nr;
 	uint16_t x;
 	int rc, new_notls = 0;
@@ -2139,9 +2163,9 @@ mdb_txn_renew0(MDB_txn *txn)
 	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
 	if (txn->mt_flags & MDB_TXN_RDONLY) {
-		if (!env->me_txns) {
-			i = mdb_env_pick_meta(env);
-			txn->mt_txnid = env->me_metas[i]->mm_txnid;
+		if (!ti) {
+			meta = env->me_metas[ mdb_env_pick_meta(env) ];
+			txn->mt_txnid = meta->mm_txnid;
 			txn->mt_u.reader = NULL;
 		} else {
 			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
@@ -2150,7 +2174,7 @@ mdb_txn_renew0(MDB_txn *txn)
 				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
 					return MDB_BAD_RSLOT;
 			} else {
-				pid_t pid = env->me_pid;
+				MDB_PID_T pid = env->me_pid;
 				pthread_t tid = pthread_self();
 				if (!(env->me_flags & MDB_LIVE_READER)) {
@@ -2163,36 +2187,43 @@ mdb_txn_renew0(MDB_txn *txn)
 				}
 				LOCK_MUTEX_R(env);
-				for (i=0; i<env->me_txns->mti_numreaders; i++)
-					if (env->me_txns->mti_readers[i].mr_pid == 0)
+				nr = ti->mti_numreaders;
+				for (i=0; i<nr; i++)
+					if (ti->mti_readers[i].mr_pid == 0)
 						break;
 				if (i == env->me_maxreaders) {
 					UNLOCK_MUTEX_R(env);
 					return MDB_READERS_FULL;
 				}
-				env->me_txns->mti_readers[i].mr_pid = pid;
-				env->me_txns->mti_readers[i].mr_tid = tid;
-				if (i >= env->me_txns->mti_numreaders)
-					env->me_txns->mti_numreaders = i+1;
+				ti->mti_readers[i].mr_pid = pid;
+				ti->mti_readers[i].mr_tid = tid;
+				if (i == nr)
+					ti->mti_numreaders = ++nr;
 				/* Save numreaders for un-mutexed mdb_env_close() */
-				env->me_numreaders = env->me_txns->mti_numreaders;
+				env->me_numreaders = nr;
 				UNLOCK_MUTEX_R(env);
-				r = &env->me_txns->mti_readers[i];
+				r = &ti->mti_readers[i];
 				new_notls = (env->me_flags & MDB_NOTLS);
 				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
 					r->mr_pid = 0;
 					return rc;
 				}
 			}
-			txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
+			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
 			txn->mt_u.reader = r;
+			meta = env->me_metas[txn->mt_txnid & 1];
 		}
-		txn->mt_toggle = txn->mt_txnid & 1;
 	} else {
-		LOCK_MUTEX_W(env);
+		if (ti) {
+			LOCK_MUTEX_W(env);
-		txn->mt_txnid = env->me_txns->mti_txnid;
-		txn->mt_toggle = txn->mt_txnid & 1;
+			txn->mt_txnid = ti->mti_txnid;
+			meta = env->me_metas[txn->mt_txnid & 1];
+		} else {
+			meta = env->me_metas[ mdb_env_pick_meta(env) ];
+			txn->mt_txnid = meta->mm_txnid;
+		}
 		txn->mt_txnid++;
 #if MDB_DEBUG
 		if (txn->mt_txnid == mdb_debug_start)
@@ -2208,10 +2239,10 @@ mdb_txn_renew0(MDB_txn *txn)
 	}
 	/* Copy the DB info and flags */
-	memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
+	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
 	/* Moved to here to avoid a data race in read TXNs */
-	txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
+	txn->mt_next_pgno = meta->mm_last_pg+1;
 	for (i=2; i<txn->mt_numdbs; i++) {
 		x = env->me_dbflags[i];
@@ -2307,7 +2338,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
 			return ENOMEM;
 		}
 		txn->mt_txnid = parent->mt_txnid;
-		txn->mt_toggle = parent->mt_toggle;
 		txn->mt_dirty_room = parent->mt_dirty_room;
 		txn->mt_u.dirty_list[0].mid = 0;
 		txn->mt_spill_pgs = NULL;
@@ -2433,7 +2463,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act)
 		env->me_txn = NULL;
 		/* The writer mutex was locked in mdb_txn_begin. */
-		UNLOCK_MUTEX_W(env);
+		if (env->me_txns)
+			UNLOCK_MUTEX_W(env);
 	}
 }
@@ -2482,20 +2513,26 @@ mdb_freelist_save(MDB_txn *txn)
 	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
 	txnid_t	pglast = 0, head_id = 0;
 	pgno_t	freecnt = 0, *free_pgs, *mop;
-	ssize_t	head_room = 0, total_room = 0, mop_len;
+	ssize_t	head_room = 0, total_room = 0, mop_len, clean_limit;
 	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
 	if (env->me_pghead) {
 		/* Make sure first page of freeDB is touched and on freelist */
-		rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
+		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
 		if (rc && rc != MDB_NOTFOUND)
 			return rc;
 	}
+	/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
+	clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
+		? SSIZE_MAX : maxfree_1pg;
 	for (;;) {
 		/* Come back here after each Put() in case freelist changed */
 		MDB_val key, data;
+		pgno_t *pgs;
+		ssize_t j;
 		/* If using records from freeDB which we have not yet
 		 * deleted, delete them and any we reserved for me_pghead.
@@ -2516,9 +2553,7 @@ mdb_freelist_save(MDB_txn *txn)
 		if (freecnt < txn->mt_free_pgs[0]) {
 			if (!freecnt) {
 				/* Make sure last page of freeDB is touched and on freelist */
-				key.mv_size = MDB_MAXKEYSIZE+1;
-				key.mv_data = NULL;
-				rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
+				rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
 				if (rc && rc != MDB_NOTFOUND)
 					return rc;
 			}
@@ -2581,11 +2616,16 @@ mdb_freelist_save(MDB_txn *txn)
 		rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
 		if (rc)
 			return rc;
-		*(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
+		/* IDL is initially empty, zero out at least the length */
+		pgs = (pgno_t *)data.mv_data;
+		j = head_room > clean_limit ? head_room : 0;
+		do {
+			pgs[j] = 0;
+		} while (--j >= 0);
 		total_room += head_room;
 	}
-	/* Fill in the reserved, touched me_pghead records */
+	/* Fill in the reserved me_pghead records */
 	rc = MDB_SUCCESS;
 	if (mop_len) {
 		MDB_val key, data;
@@ -2655,8 +2695,7 @@ mdb_page_flush(MDB_txn *txn, int keep)
 			}
 			dp->mp_flags &= ~P_DIRTY;
 		}
-		dl[0].mid = j;
-		return MDB_SUCCESS;
+		goto done;
 	}
 	/* Write the pages */
@@ -2750,8 +2789,11 @@ mdb_page_flush(MDB_txn *txn, int keep)
 		}
 		mdb_dpage_free(env, dp);
 	}
-	dl[0].mid = j;
+done:
+	i--;
+	txn->mt_dirty_room += i - j;
+	dl[0].mid = j;
 	return MDB_SUCCESS;
 }
@@ -2791,14 +2833,18 @@ mdb_txn_commit(MDB_txn *txn)
 	if (txn->mt_parent) {
 		MDB_txn *parent = txn->mt_parent;
-		unsigned x, y, len;
 		MDB_ID2L dst, src;
+		MDB_IDL pspill;
+		unsigned x, y, len, ps_len;
 		/* Append our free list to parent's */
 		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
 		if (rc)
 			goto fail;
 		mdb_midl_free(txn->mt_free_pgs);
+		/* Failures after this must either undo the changes
+		 * to the parent or set MDB_TXN_ERROR in the parent.
+		 */
 		parent->mt_next_pgno = txn->mt_next_pgno;
 		parent->mt_flags = txn->mt_flags;
@@ -2820,37 +2866,26 @@ mdb_txn_commit(MDB_txn *txn)
 		dst = parent->mt_u.dirty_list;
 		src = txn->mt_u.dirty_list;
 		/* Remove anything in our dirty list from parent's spill list */
-		if (parent->mt_spill_pgs) {
-			x = parent->mt_spill_pgs[0];
-			len = x;
-			/* zero out our dirty pages in parent spill list */
-			for (i=1; i<=src[0].mid; i++) {
+		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
+			x = y = ps_len;
+			pspill[0] = (pgno_t)-1;
+			/* Mark our dirty pages as deleted in parent spill list */
+			for (i=0, len=src[0].mid; ++i <= len; ) {
 				MDB_ID pn = src[i].mid << 1;
-				if (pn < parent->mt_spill_pgs[x])
-					continue;
-				if (pn > parent->mt_spill_pgs[x]) {
-					if (x <= 1)
-						break;
+				while (pn > pspill[x])
 					x--;
-					continue;
-				}
-				parent->mt_spill_pgs[x] = 0;
-				len--;
-			}
-			/* OK, we had a few hits, squash zeros from the spill list */
-			if (len < parent->mt_spill_pgs[0]) {
-				x=1;
-				for (y=1; y<=parent->mt_spill_pgs[0]; y++) {
-					if (parent->mt_spill_pgs[y]) {
-						if (y != x) {
-							parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y];
-						}
-						x++;
-					}
+				if (pn == pspill[x]) {
+					pspill[x] = 1;
+					y = --x;
 				}
-				parent->mt_spill_pgs[0] = len;
 			}
+			/* Squash deleted pagenums if we deleted any */
+			for (x=y; ++x <= ps_len; )
+				if (!(pspill[x] & 1))
+					pspill[++y] = pspill[x];
+			pspill[0] = y;
 		}
 		/* Find len = length of merging our dirty list with parent's */
 		x = dst[0].mid;
 		dst[0].mid = 0;		/* simplify loops */
@@ -2884,7 +2919,10 @@ mdb_txn_commit(MDB_txn *txn)
 		parent->mt_dirty_room = txn->mt_dirty_room;
 		if (txn->mt_spill_pgs) {
 			if (parent->mt_spill_pgs) {
-				mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
+				/* TODO: Prevent failure here, so parent does not fail */
+				rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
+				if (rc)
+					parent->mt_flags |= MDB_TXN_ERROR;
 				mdb_midl_free(txn->mt_spill_pgs);
 				mdb_midl_sort(parent->mt_spill_pgs);
 			} else {
@@ -2895,7 +2933,7 @@ mdb_txn_commit(MDB_txn *txn)
 		parent->mt_child = NULL;
 		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
 		free(txn);
-		return MDB_SUCCESS;
+		return rc;
 	}
 	if (txn != env->me_txn) {
@@ -2954,7 +2992,8 @@ done:
 	env->me_txn = NULL;
 	mdb_dbis_update(txn, 1);
-	UNLOCK_MUTEX_W(env);
+	if (env->me_txns)
+		UNLOCK_MUTEX_W(env);
 	free(txn);
 	return MDB_SUCCESS;
@@ -2973,10 +3012,11 @@ fail:
 static int
 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
 {
-	MDB_pagebuf	pbuf;
+	MDB_metabuf	pbuf;
 	MDB_page	*p;
 	MDB_meta	*m;
 	int			i, rc, off;
+	enum { Size = sizeof(pbuf) };
 	/* We don't know the page size yet, so use a minimum value.
 	 * Read both meta pages so we can use the latest one.
@@ -2988,13 +3028,13 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
 		OVERLAPPED ov;
 		memset(&ov, 0, sizeof(ov));
 		ov.Offset = off;
-		rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1;
+		rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
 		if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
 			rc = 0;
 #else
-		rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off);
+		rc = pread(env->me_fd, &pbuf, Size, off);
 #endif
-		if (rc != MDB_PAGESIZE) {
+		if (rc != Size) {
 			if (rc == 0 && off == 0)
 				return ENOENT;
 			rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
@@ -3109,7 +3149,7 @@ mdb_env_write_meta(MDB_txn *txn)
 	assert(txn != NULL);
 	assert(txn->mt_env != NULL);
-	toggle = !txn->mt_toggle;
+	toggle = txn->mt_txnid & 1;
 	DPRINTF(("writing meta page %d for root page %"Z"u",
 		toggle, txn->mt_dbs[MAIN_DBI].md_root));
@@ -3125,11 +3165,18 @@ mdb_env_write_meta(MDB_txn *txn)
 		mp->mm_last_pg = txn->mt_next_pgno - 1;
 		mp->mm_txnid = txn->mt_txnid;
 		if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
+			unsigned meta_size = env->me_psize;
 			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
 			ptr = env->me_map;
-			if (toggle)
-				ptr += env->me_psize;
-			if (MDB_MSYNC(ptr, env->me_psize, rc)) {
+			if (toggle) {
+#ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
+				if (meta_size < env->me_os_psize)
+					meta_size += meta_size;
+				else
+#endif
+					ptr += meta_size;
+			}
+			if (MDB_MSYNC(ptr, meta_size, rc)) {
 				rc = ErrCode();
 				goto fail;
 			}
@@ -3200,7 +3247,8 @@ done:
 	 * readers will get consistent data regardless of how fresh or
 	 * how stale their view of these values is.
 	 */
-	env->me_txns->mti_txnid = txn->mt_txnid;
+	if (env->me_txns)
+		env->me_txns->mti_txnid = txn->mt_txnid;
 	return MDB_SUCCESS;
 }
@@ -3234,6 +3282,7 @@ mdb_env_create(MDB_env **env)
 	e->me_wmutex = SEM_FAILED;
 #endif
 	e->me_pid = getpid();
+	GET_PAGESIZE(e->me_os_psize);
 	VGMEMP_CREATE(e,0,0);
 	*env = e;
 	return MDB_SUCCESS;
@@ -3276,7 +3325,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
 	int prot = PROT_READ;
 	if (flags & MDB_WRITEMAP) {
 		prot |= PROT_WRITE;
-		if (newsize && ftruncate(env->me_fd, env->me_mapsize) < 0)
+		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
 			return ErrCode();
 	}
 	env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
@@ -3285,14 +3334,17 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
 		env->me_map = NULL;
 		return ErrCode();
 	}
-	/* Turn off readahead. It's harmful when the DB is larger than RAM. */
+	if (flags & MDB_NORDAHEAD) {
+		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
 #ifdef MADV_RANDOM
-	madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
+		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
 #else
 #ifdef POSIX_MADV_RANDOM
-	posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
+		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
 #endif /* POSIX_MADV_RANDOM */
 #endif /* MADV_RANDOM */
+	}
 #endif /* _WIN32 */
 	/* Can happen because the address argument to mmap() is just a
@@ -3323,6 +3375,14 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
 			return EINVAL;
 		if (!size)
 			size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
+		else if (size < env->me_mapsize) {
+			/* If the configured size is smaller, make sure it's
+			 * still big enough. Silently round up to minimum if not.
+			 */
+			size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
+			if (size < minsize)
+				size = minsize;
+		}
 		munmap(env->me_map, env->me_mapsize);
 		env->me_mapsize = size;
 		old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
@@ -3388,7 +3448,9 @@ mdb_env_open2(MDB_env *env)
 			return i;
 		DPUTS("new mdbenv");
 		newenv = 1;
-		GET_PAGESIZE(env->me_psize);
+		env->me_psize = env->me_os_psize;
+		if (env->me_psize > MAX_PAGESIZE)
+			env->me_psize = MAX_PAGESIZE;
 	} else {
 		env->me_psize = meta.mm_psize;
 	}
@@ -3499,7 +3561,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_
 #pragma comment(linker, "/INCLUDE:_tls_used")
 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
 #pragma const_seg(".CRT$XLB")
-extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
+extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
 #pragma const_seg()
 #else	/* WIN32 */
@@ -3597,7 +3659,7 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
 	return rc;
 }
-#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
+#ifdef MDB_USE_HASH
 /*
  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
  *
@@ -3763,7 +3825,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
 	rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
 	if (size < rsize && *excl > 0) {
 #ifdef _WIN32
-		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
+		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
 			|| !SetEndOfFile(env->me_lfd))
 			goto fail_errno;
 #else
@@ -3919,8 +3981,9 @@ fail:
 	 *	at runtime. Changing other flags requires closing the
 	 *	environment and re-opening it with the new flags.
 	 */
-#define	CHANGEABLE	(MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
-#define	CHANGELESS	(MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS)
+#define	CHANGEABLE	(MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
+#define	CHANGELESS	(MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
+	MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
 int
 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
@@ -3973,7 +4036,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
 	}
 	/* For RDONLY, get lockfile after we know datafile exists */
-	if (!F_ISSET(flags, MDB_RDONLY)) {
+	if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
 		rc = mdb_env_setup_locks(env, lpath, mode, &excl);
 		if (rc)
 			goto leave;
@@ -4003,7 +4066,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
 		goto leave;
 	}
-	if (F_ISSET(flags, MDB_RDONLY)) {
+	if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
 		rc = mdb_env_setup_locks(env, lpath, mode, &excl);
 		if (rc)
 			goto leave;
@@ -4033,7 +4096,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
 		DPRINTF(("opened dbenv %p", (void *) env));
 		if (excl > 0) {
 			rc = mdb_env_share_locks(env, &excl);
+			if (rc)
+				goto leave;
 		}
+		if (!((flags & MDB_RDONLY) ||
+			  (env->me_pbuf = calloc(1, env->me_psize))))
+			rc = ENOMEM;
 	}
 leave:
@@ -4057,6 +4125,7 @@ mdb_env_close0(MDB_env *env, int excl)
 	for (i = env->me_maxdbs; --i > MAIN_DBI; )
 		free(env->me_dbxs[i].md_name.mv_data);
+	free(env->me_pbuf);
 	free(env->me_dbflags);
 	free(env->me_dbxs);
 	free(env->me_path);
@@ -4084,7 +4153,7 @@ mdb_env_close0(MDB_env *env, int excl)
 	if (env->me_fd != INVALID_HANDLE_VALUE)
 		(void) close(env->me_fd);
 	if (env->me_txns) {
-		pid_t pid = env->me_pid;
+		MDB_PID_T pid = env->me_pid;
 		/* Clearing readers is done in this function because
 		 * me_txkey with its destructor must be disabled first.
 		 */
@@ -4246,14 +4315,6 @@ mdb_env_copy(MDB_env *env, const char *path)
 	newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
 				FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
 #else
-#ifdef O_DIRECT
-	/* The OS supports O_DIRECT, try with it */
-	newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666);
-	/* But open can fail if O_DIRECT isn't supported by the file system
-	 * so retry without the flag
-	 */
-	if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL)
-#endif
 	newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
 #endif
 	if (newfd == INVALID_HANDLE_VALUE) {
@@ -4261,6 +4322,11 @@ mdb_env_copy(MDB_env *env, const char *path)
 		goto leave;
 	}
+#ifdef O_DIRECT
+	/* Set O_DIRECT if the file system supports it */
+	if ((rc = fcntl(newfd, F_GETFL)) != -1)
+		(void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
+#endif
 #ifdef F_NOCACHE	/* __APPLE__ */
 	rc = fcntl(newfd, F_NOCACHE, 1);
 	if (rc) {
@@ -4308,7 +4374,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
 		*(size_t *)a->mv_data > *(size_t *)b->mv_data;
 }
-/** Compare two items pointing at aligned int's */
+/** Compare two items pointing at aligned unsigned int's */
 static int
 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
 {
@@ -4316,7 +4382,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
 		*(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
 }
-/** Compare two items pointing at ints of unknown alignment.
+/** Compare two items pointing at unsigned ints of unknown alignment.
  *	Nodes and keys are guaranteed to be 2-byte aligned.
  */
 static int
@@ -4514,8 +4580,8 @@ mdb_cursor_pop(MDB_cursor *mc)
 		if (mc->mc_snum)
 			mc->mc_top--;
-		DPRINTF(("popped page %"Z"u off db %u cursor %p", top->mp_pgno,
-			mc->mc_dbi, (void *) mc));
+		DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
+			DDBI(mc), (void *) mc));
 	}
 }
@@ -4523,8 +4589,8 @@ mdb_cursor_pop(MDB_cursor *mc)
 static int
 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
 {
-	DPRINTF(("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno,
-		mc->mc_dbi, (void *) mc));
+	DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
+		DDBI(mc), (void *) mc));
 	if (mc->mc_snum >= CURSOR_STACK) {
 		assert(mc->mc_snum < CURSOR_STACK);
@@ -4598,18 +4664,11 @@ done:
 	return MDB_SUCCESS;
 }
-/** Search for the page a given key should be in.
- * Pushes parent pages on the cursor stack. This function continues a
- * search on a cursor that has already been initialized. (Usually by
- * #mdb_page_search() but also by #mdb_node_move().)
- * @param[in,out] mc the cursor for this operation.
- * @param[in] key the key to search for. If NULL, search for the lowest
- * page. (This is used by #mdb_cursor_first().)
- * @param[in] modify If true, visited pages are updated with new page numbers.
- * @return 0 on success, non-zero on failure.
+/** Finish #mdb_page_search() / #mdb_page_search_lowest().
+ *	The cursor is at the root page, set up the rest of it.
  */
 static int
-mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
+mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
 {
 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
 	int rc;
@@ -4623,11 +4682,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
 		assert(NUMKEYS(mp) > 1);
 		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
-		if (key == NULL)	/* Initialize cursor to first page. */
+		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
 			i = 0;
-		else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) {
-							/* cursor to last page */
-			i = NUMKEYS(mp)-1;
+			if (flags & MDB_PS_LAST)
+				i = NUMKEYS(mp) - 1;
 		} else {
 			int	 exact;
 			node = mdb_node_search(mc, key, &exact);
@@ -4640,10 +4698,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
 					i--;
 				}
 			}
+			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
 		}
-		if (key)
-			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
 		assert(i < NUMKEYS(mp));
 		node = NODEPTR(mp, i);
@@ -4654,7 +4711,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
 		if ((rc = mdb_cursor_push(mc, mp)))
 			return rc;
-		if (modify) {
+		if (flags & MDB_PS_MODIFY) {
 			if ((rc = mdb_page_touch(mc)) != 0)
 				return rc;
 			mp = mc->mc_pg[mc->mc_top];
@@ -4668,7 +4725,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
 	}
 	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
-	    key ? DKEY(key) : NULL));
+	    key ? DKEY(key) : "null"));
 	mc->mc_flags |= C_INITIALIZED;
 	mc->mc_flags &= ~C_EOF;
@@ -4694,18 +4751,17 @@ mdb_page_search_lowest(MDB_cursor *mc)
 	mc->mc_ki[mc->mc_top] = 0;
 	if ((rc = mdb_cursor_push(mc, mp)))
 		return rc;
-	return mdb_page_search_root(mc, NULL, 0);
+	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
 }
 /** Search for the page a given key should be in.
- * Pushes parent pages on the cursor stack. This function just sets up
- * the search; it finds the root page for \b mc's database and sets this
- * as the root of the cursor's stack. Then #mdb_page_search_root() is
- * called to complete the search.
+ * Push it and its parent pages on the cursor stack.
  * @param[in,out] mc the cursor for this operation.
- * @param[in] key the key to search for. If NULL, search for the lowest
- * page. (This is used by #mdb_cursor_first().)
- * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers.
+ * @param[in] key the key to search for, or NULL for first/last page.
+ * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
+ *   are touched (updated with new page numbers).
+ *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
+ *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
  *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
  * @return 0 on success, non-zero on failure.
  */
@@ -4716,23 +4772,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
 	pgno_t		 root;
 	/* Make sure the txn is still viable, then find the root from
-	 * the txn's db table.
+	 * the txn's db table and set it as the root of the cursor's stack.
 	 */
 	if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
 		DPUTS("transaction has failed, must abort");
 		return MDB_BAD_TXN;
 	} else {
 		/* Make sure we're using an up-to-date root */
-		if (mc->mc_dbi > MAIN_DBI) {
-			if ((*mc->mc_dbflag & DB_STALE) ||
-			((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
+		if (*mc->mc_dbflag & DB_STALE) {
 				MDB_cursor mc2;
-				unsigned char dbflag = 0;
 				mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
-				rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY);
+				rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
 				if (rc)
 					return rc;
-				if (*mc->mc_dbflag & DB_STALE) {
+				{
 					MDB_val data;
 					int exact = 0;
 					uint16_t flags;
@@ -4752,11 +4805,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
 						return MDB_INCOMPATIBLE;
 					memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
 				}
-				if (flags & MDB_PS_MODIFY)
-					dbflag = DB_DIRTY;
 				*mc->mc_dbflag &= ~DB_STALE;
-				*mc->mc_dbflag |= dbflag;
-			}
 		}
 		root = mc->mc_db->md_root;
@@ -4774,8 +4823,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
 	mc->mc_snum = 1;
 	mc->mc_top = 0;
-	DPRINTF(("db %u root page %"Z"u has flags 0x%X",
-		mc->mc_dbi, root, mc->mc_pg[0]->mp_flags));
+	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
+		DDBI(mc), root, mc->mc_pg[0]->mp_flags));
 	if (flags & MDB_PS_MODIFY) {
 		if ((rc = mdb_page_touch(mc)))
@@ -4914,7 +4963,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi,
 	if (txn->mt_flags & MDB_TXN_ERROR)
 		return MDB_BAD_TXN;
-	if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
+	if (key->mv_size > MDB_MAXKEYSIZE) {
 		return MDB_BAD_VALSIZE;
 	}
@@ -4966,8 +5015,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
 	assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
 	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
-	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0))
+	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
+		/* mc will be inconsistent if caller does mc_snum++ as above */
+		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
 		return rc;
+	}
 	mdb_cursor_push(mc, mp);
 	if (!move_right)
@@ -5143,7 +5195,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 	assert(mc);
 	assert(key);
-	assert(key->mv_size > 0);
+	if (key->mv_size == 0)
+		return MDB_BAD_VALSIZE;
 	if (mc->mc_xcursor)
 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
@@ -5329,7 +5382,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
 	if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
-		rc = mdb_page_search(mc, NULL, 0);
+		rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
 		if (rc != MDB_SUCCESS)
 			return rc;
 	}
@@ -5375,11 +5428,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
 	if (!(mc->mc_flags & C_EOF)) {
 		if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
-			MDB_val	lkey;
-			lkey.mv_size = MDB_MAXKEYSIZE+1;
-			lkey.mv_data = NULL;
-			rc = mdb_page_search(mc, &lkey, 0);
+			rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
 			if (rc != MDB_SUCCESS)
 				return rc;
 		}
@@ -5431,8 +5480,9 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 			rc = EINVAL;
 		} else {
 			MDB_page *mp = mc->mc_pg[mc->mc_top];
-			if (!NUMKEYS(mp)) {
-				mc->mc_ki[mc->mc_top] = 0;
+			int nkeys = NUMKEYS(mp);
+			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
+				mc->mc_ki[mc->mc_top] = nkeys;
 				rc = MDB_NOTFOUND;
 				break;
 			}
@@ -5471,7 +5521,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 	case MDB_SET_RANGE:
 		if (key == NULL) {
 			rc = EINVAL;
-		} else if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
+		} else if (key->mv_size > MDB_MAXKEYSIZE) {
 			rc = MDB_BAD_VALSIZE;
 		} else if (op == MDB_SET_RANGE)
 			rc = mdb_cursor_set(mc, key, data, op, NULL);
@@ -5577,14 +5627,14 @@ fetchm:
 	return rc;
 }
-/** Touch all the pages in the cursor stack.
+/** Touch all the pages in the cursor stack. Set mc_top.
  *	Makes sure all the pages are writable, before attempting a write operation.
  * @param[in] mc The cursor to operate on.
  */
 static int
 mdb_cursor_touch(MDB_cursor *mc)
 {
-	int rc;
+	int rc = MDB_SUCCESS;
 	if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
 		MDB_cursor mc2;
@@ -5595,13 +5645,14 @@ mdb_cursor_touch(MDB_cursor *mc)
 			 return rc;
 		*mc->mc_dbflag |= DB_DIRTY;
 	}
-	for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
-		rc = mdb_page_touch(mc);
-		if (rc)
-			return rc;
+	mc->mc_top = 0;
+	if (mc->mc_snum) {
+		do {
+			rc = mdb_page_touch(mc);
+		} while (!rc && ++(mc->mc_top) < mc->mc_snum);
+		mc->mc_top = mc->mc_snum-1;
 	}
-	mc->mc_top = mc->mc_snum-1;
-	return MDB_SUCCESS;
+	return rc;
 }
 /** Do not spill pages to disk if txn is getting full, may fail instead */
@@ -5612,15 +5663,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
     unsigned int flags)
 {
 	enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
+	MDB_env		*env = mc->mc_txn->mt_env;
 	MDB_node	*leaf = NULL;
 	MDB_val	xdata, *rdata, dkey;
-	MDB_page	*fp;
 	MDB_db dummy;
 	int do_sub = 0, insert = 0;
 	unsigned int mcount = 0, dcount = 0, nospill;
 	size_t nsize;
 	int rc, rc2;
-	MDB_pagebuf pbuf;
 	char dbuf[MDB_MAXKEYSIZE+1];
 	unsigned int nflags;
 	DKBUF;
@@ -5652,8 +5702,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 		return MDB_BAD_VALSIZE;
 #endif
-	DPRINTF(("==> put db %u key [%s], size %"Z"u, data size %"Z"u",
-		mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size));
+	DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
+		DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
 	dkey.mv_size = 0;
@@ -5664,6 +5714,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 	} else if (mc->mc_db->md_root == P_INVALID) {
 		/* new database, cursor has nothing to point to */
 		mc->mc_snum = 0;
+		mc->mc_top = 0;
 		mc->mc_flags &= ~C_INITIALIZED;
 		rc = MDB_NO_ROOT;
 	} else {
@@ -5733,6 +5784,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 	/* The key already exists */
 	if (rc == MDB_SUCCESS) {
+		MDB_page	*fp, *mp;
+		MDB_val		olddata;
 		/* there's only a key anyway, so this is a no-op */
 		if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
 			unsigned int ksize = mc->mc_db->md_pad;
@@ -5745,19 +5799,23 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
 			return MDB_SUCCESS;
 		}
+more:
 		leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+		olddata.mv_size = NODEDSZ(leaf);
+		olddata.mv_data = NODEDATA(leaf);
 		/* DB has dups? */
 		if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
+			mp = fp = xdata.mv_data = env->me_pbuf;
+			mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
 			/* Was a single item before, must convert now */
-more:
 			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
 				/* Just overwrite the current item */
 				if (flags == MDB_CURRENT)
 					goto current;
-				dkey.mv_size = NODEDSZ(leaf);
-				dkey.mv_data = NODEDATA(leaf);
+				dkey = olddata;
 #if UINT_MAX < SIZE_MAX
 				if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
 #ifdef MISALIGNED_OK
@@ -5780,85 +5838,76 @@ more:
 				/* create a fake page for the dup items */
 				memcpy(dbuf, dkey.mv_data, dkey.mv_size);
 				dkey.mv_data = dbuf;
-				fp = (MDB_page *)&pbuf;
-				fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
 				fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
 				fp->mp_lower = PAGEHDRSZ;
-				fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
+				xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
 				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
 					fp->mp_flags |= P_LEAF2;
 					fp->mp_pad = data->mv_size;
-					fp->mp_upper += 2 * data->mv_size;	/* leave space for 2 more */
+					xdata.mv_size += 2 * data->mv_size;	/* leave space for 2 more */
 				} else {
-					fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
+					xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
 						(dkey.mv_size & 1) + (data->mv_size & 1);
 				}
-				mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
-				do_sub = 1;
-				rdata = &xdata;
-				xdata.mv_size = fp->mp_upper;
-				xdata.mv_data = fp;
-				flags |= F_DUPDATA;
-				goto new_sub;
-			}
-			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
+				fp->mp_upper = xdata.mv_size;
+			} else if (leaf->mn_flags & F_SUBDATA) {
+				/* Data is on sub-DB, just store it */
+				flags |= F_DUPDATA|F_SUBDATA;
+				goto put_sub;
+			} else {
 				/* See if we need to convert from fake page to subDB */
-				MDB_page *mp;
 				unsigned int offset;
 				unsigned int i;
 				uint16_t fp_flags;
-				fp = NODEDATA(leaf);
-				if (flags == MDB_CURRENT) {
-reuse:
+				fp = olddata.mv_data;
+				switch (flags) {
+				default:
+					if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
+						offset = NODESIZE + sizeof(indx_t) + data->mv_size;
+						offset += offset & 1;
+						break;
+					}
+					offset = fp->mp_pad;
+					if (SIZELEFT(fp) < offset) {
+						offset *= 4; /* space for 4 more */
+						break;
+					}
+					/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
+				case MDB_CURRENT:
 					fp->mp_flags |= P_DIRTY;
-					COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
+					COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
 					mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
 					flags |= F_DUPDATA;
 					goto put_sub;
 				}
-				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
-					offset = fp->mp_pad;
-					if (SIZELEFT(fp) >= offset)
-						goto reuse;
-					offset *= 4;	/* space for 4 more */
-				} else {
-					offset = NODESIZE + sizeof(indx_t) + data->mv_size;
-				}
-				offset += offset & 1;
 				fp_flags = fp->mp_flags;
-				if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
-					offset >= mc->mc_txn->mt_env->me_nodemax) {
+				xdata.mv_size = olddata.mv_size + offset;
+				if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
+					>= env->me_nodemax) {
 					/* yes, convert it */
-					dummy.md_flags = 0;
 					if (mc->mc_db->md_flags & MDB_DUPFIXED) {
 						dummy.md_pad = fp->mp_pad;
 						dummy.md_flags = MDB_DUPFIXED;
 						if (mc->mc_db->md_flags & MDB_INTEGERDUP)
 							dummy.md_flags |= MDB_INTEGERKEY;
+					} else {
+						dummy.md_pad = 0;
+						dummy.md_flags = 0;
 					}
 					dummy.md_depth = 1;
 					dummy.md_branch_pages = 0;
 					dummy.md_leaf_pages = 1;
 					dummy.md_overflow_pages = 0;
 					dummy.md_entries = NUMKEYS(fp);
-					rdata = &xdata;
 					xdata.mv_size = sizeof(MDB_db);
 					xdata.mv_data = &dummy;
 					if ((rc = mdb_page_alloc(mc, 1, &mp)))
 						return rc;
-					offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
+					offset = env->me_psize - olddata.mv_size;
 					flags |= F_DUPDATA|F_SUBDATA;
 					dummy.md_root = mp->mp_pgno;
 					fp_flags &= ~P_SUBP;
-				} else {
-					/* no, just grow it */
-					rdata = &xdata;
-					xdata.mv_size = NODEDSZ(leaf) + offset;
-					xdata.mv_data = &pbuf;
-					mp = (MDB_page *)&pbuf;
-					mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
-					flags |= F_DUPDATA;
 				}
 				mp->mp_flags = fp_flags | P_DIRTY;
 				mp->mp_pad   = fp->mp_pad;
@@ -5867,28 +5916,27 @@ reuse:
 				if (IS_LEAF2(fp)) {
 					memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
 				} else {
-					nsize = NODEDSZ(leaf) - fp->mp_upper;
-					memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
+					memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
+						olddata.mv_size - fp->mp_upper);
 					for (i=0; i<NUMKEYS(fp); i++)
 						mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
 				}
-				mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
-				do_sub = 1;
-				goto new_sub;
 			}
-			/* data is on sub-DB, just store it */
-			flags |= F_DUPDATA|F_SUBDATA;
-			goto put_sub;
+			rdata = &xdata;
+			flags |= F_DUPDATA;
+			do_sub = 1;
+			mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
+			goto new_sub;
 		}
 current:
 		/* overflow page overwrites need special handling */
 		if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
 			MDB_page *omp;
 			pgno_t pg;
-			unsigned psize = mc->mc_txn->mt_env->me_psize;
-			int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
+			int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
-			memcpy(&pg, NODEDATA(leaf), sizeof(pg));
+			memcpy(&pg, olddata.mv_data, sizeof(pg));
 			if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
 				return rc2;
 			ovpages = omp->mp_pages;
@@ -5896,7 +5944,7 @@ current:
 			/* Is the ov page large enough? */
 			if (ovpages >= dpages) {
 			  if (!(omp->mp_flags & P_DIRTY) &&
-				  (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP)))
+				  (level || (env->me_flags & MDB_WRITEMAP)))
 			  {
 				rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
 				if (rc)
@@ -5911,7 +5959,7 @@ current:
 				 */
 				if (level > 1) {
 					/* It is writable only in a parent txn */
-					size_t sz = (size_t) psize * ovpages, off;
+					size_t sz = (size_t) env->me_psize * ovpages, off;
 					MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
 					MDB_ID2 id2;
 					if (!np)
@@ -5941,15 +5989,15 @@ current:
 			}
 			if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
 				return rc2;
-		} else if (NODEDSZ(leaf) == data->mv_size) {
+		} else if (data->mv_size == olddata.mv_size) {
 			/* same size, just replace it. Note that we could
 			 * also reuse this node if the new data is smaller,
 			 * but instead we opt to shrink the node in that case.
 			 */
 			if (F_ISSET(flags, MDB_RESERVE))
-				data->mv_data = NODEDATA(leaf);
+				data->mv_data = olddata.mv_data;
 			else if (data->mv_size)
-				memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
+				memcpy(olddata.mv_data, data->mv_data, data->mv_size);
 			else
 				memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
 			goto done;
@@ -5965,7 +6013,7 @@ current:
 new_sub:
 	nflags = flags & NODE_ADD_FLAGS;
-	nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
+	nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
 	if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
 		if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
 			nflags &= ~MDB_APPEND;
@@ -5982,9 +6030,6 @@ new_sub:
 			unsigned i = mc->mc_top;
 			MDB_page *mp = mc->mc_pg[i];
-			if (mc->mc_flags & C_SUB)
-				dbi--;
 			for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 				if (mc->mc_flags & C_SUB)
 					m3 = &m2->mc_xcursor->mx_cursor;
@@ -6062,7 +6107,6 @@ next_mult:
 				data[1].mv_size = mcount;
 				if (mcount < dcount) {
 					data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
-					leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 					goto more;
 				}
 			}
@@ -6081,6 +6125,7 @@ int
 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
 {
 	MDB_node	*leaf;
+	MDB_page	*mp;
 	int rc;
 	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
@@ -6089,17 +6134,20 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
 	if (!(mc->mc_flags & C_INITIALIZED))
 		return EINVAL;
+	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
+		return MDB_NOTFOUND;
 	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
 		return rc;
-	flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */
 	rc = mdb_cursor_touch(mc);
 	if (rc)
 		return rc;
-	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+	mp = mc->mc_pg[mc->mc_top];
+	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
-	if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+	if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
 		if (!(flags & MDB_NODUPDATA)) {
 			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
 				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
@@ -6114,13 +6162,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
 				} else {
 					MDB_cursor *m2;
 					/* shrink fake page */
-					mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
-					leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+					mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
+					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
 					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
 					/* fix other sub-DB cursors pointed at this fake page */
 					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
 						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
-						if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] &&
+						if (m2->mc_pg[mc->mc_top] == mp &&
 							m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
 							m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
 					}
@@ -6252,6 +6300,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
 {
 	unsigned int	 i;
 	size_t		 node_size = NODESIZE;
+	ssize_t		 room;
 	indx_t		 ofs;
 	MDB_node	*node;
 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
@@ -6264,7 +6313,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
 	    IS_LEAF(mp) ? "leaf" : "branch",
 		IS_SUBP(mp) ? "sub-" : "",
 	    mp->mp_pgno, indx, data ? data->mv_size : 0,
-		key ? key->mv_size : 0, key ? DKEY(key) : NULL));
+		key ? key->mv_size : 0, key ? DKEY(key) : "null"));
 	if (IS_LEAF2(mp)) {
 		/* Move higher keys up one slot. */
@@ -6282,9 +6331,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
 		return MDB_SUCCESS;
 	}
+	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
 	if (key != NULL)
 		node_size += key->mv_size;
 	if (IS_LEAF(mp)) {
 		assert(data);
 		if (F_ISSET(flags, F_BIGDATA)) {
@@ -6296,26 +6345,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
 			/* Put data on overflow page. */
 			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
 			    data->mv_size, node_size+data->mv_size));
-			node_size += sizeof(pgno_t);
+			node_size += sizeof(pgno_t) + (node_size & 1);
+			if ((ssize_t)node_size > room)
+				goto full;
 			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
 				return rc;
 			DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
 			flags |= F_BIGDATA;
+			goto update;
 		} else {
 			node_size += data->mv_size;
 		}
 	}
 	node_size += node_size & 1;
+	if ((ssize_t)node_size > room)
+		goto full;
-	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
-		DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
-		    mp->mp_pgno, NUMKEYS(mp)));
-		DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
-		    mp->mp_upper - mp->mp_lower));
-		DPRINTF(("node size = %"Z"u", node_size));
-		return MDB_PAGE_FULL;
-	}
+update:
 	/* Move higher pointers up one slot. */
 	for (i = NUMKEYS(mp); i > indx; i--)
 		mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
@@ -6361,6 +6407,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
 	}
 	return MDB_SUCCESS;
+full:
+	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
+		mp->mp_pgno, NUMKEYS(mp)));
+	DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
+	DPRINTF(("node size = %"Z"u", node_size));
+	return MDB_PAGE_FULL;
 }
 /** Delete the specified node from a page.
@@ -6495,11 +6548,13 @@ mdb_xcursor_init0(MDB_cursor *mc)
 	mx->mx_cursor.mc_txn = mc->mc_txn;
 	mx->mx_cursor.mc_db = &mx->mx_db;
 	mx->mx_cursor.mc_dbx = &mx->mx_dbx;
-	mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
+	mx->mx_cursor.mc_dbi = mc->mc_dbi;
 	mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
 	mx->mx_cursor.mc_snum = 0;
 	mx->mx_cursor.mc_top = 0;
 	mx->mx_cursor.mc_flags = C_SUB;
+	mx->mx_dbx.md_name.mv_size = 0;
+	mx->mx_dbx.md_name.mv_data = NULL;
 	mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
 	mx->mx_dbx.md_dcmp = NULL;
 	mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
@@ -6520,6 +6575,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
 		memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
 		mx->mx_cursor.mc_pg[0] = 0;
 		mx->mx_cursor.mc_snum = 0;
+		mx->mx_cursor.mc_top = 0;
 		mx->mx_cursor.mc_flags = C_SUB;
 	} else {
 		MDB_page *fp = NODEDATA(node);
@@ -6532,8 +6588,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
 		mx->mx_db.md_entries = NUMKEYS(fp);
 		COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
 		mx->mx_cursor.mc_snum = 1;
-		mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
 		mx->mx_cursor.mc_top = 0;
+		mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
 		mx->mx_cursor.mc_pg[0] = fp;
 		mx->mx_cursor.mc_ki[0] = 0;
 		if (mc->mc_db->md_flags & MDB_DUPFIXED) {
@@ -6543,12 +6599,9 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
 				mx->mx_db.md_flags |= MDB_INTEGERKEY;
 		}
 	}
-	DPRINTF(("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi,
+	DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
 		mx->mx_db.md_root));
-	mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
-		DB_DIRTY : 0);
-	mx->mx_dbx.md_name.mv_data = NODEKEY(node);
-	mx->mx_dbx.md_name.mv_size = node->mn_ksize;
+	mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
 #if UINT_MAX < SIZE_MAX
 	if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
 #ifdef MISALIGNED_OK
@@ -6793,7 +6846,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
 		flags = 0;
 	} else {
 		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
-		assert(!((long)srcnode&1));
+		assert(!((size_t)srcnode&1));
 		srcpg = NODEPGNO(srcnode);
 		flags = srcnode->mn_flags;
 		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
@@ -6864,9 +6917,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
 		MDB_dbi dbi = csrc->mc_dbi;
 		MDB_page *mp = csrc->mc_pg[csrc->mc_top];
-		if (csrc->mc_flags & C_SUB)
-			dbi--;
 		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 			if (csrc->mc_flags & C_SUB)
 				m3 = &m2->mc_xcursor->mx_cursor;
@@ -7041,9 +7091,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
 		MDB_dbi dbi = csrc->mc_dbi;
 		MDB_page *mp = cdst->mc_pg[cdst->mc_top];
-		if (csrc->mc_flags & C_SUB)
-			dbi--;
 		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 			if (csrc->mc_flags & C_SUB)
 				m3 = &m2->mc_xcursor->mx_cursor;
@@ -7138,13 +7185,11 @@ mdb_rebalance(MDB_cursor *mc)
 			/* Adjust cursors pointing to mp */
 			mc->mc_snum = 0;
 			mc->mc_top = 0;
+			mc->mc_flags &= ~C_INITIALIZED;
 			{
 				MDB_cursor *m2, *m3;
 				MDB_dbi dbi = mc->mc_dbi;
-				if (mc->mc_flags & C_SUB)
-					dbi--;
 				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 					if (mc->mc_flags & C_SUB)
 						m3 = &m2->mc_xcursor->mx_cursor;
@@ -7154,6 +7199,7 @@ mdb_rebalance(MDB_cursor *mc)
 					if (m3->mc_pg[0] == mp) {
 						m3->mc_snum = 0;
 						m3->mc_top = 0;
+						m3->mc_flags &= ~C_INITIALIZED;
 					}
 				}
 			}
@@ -7174,9 +7220,6 @@ mdb_rebalance(MDB_cursor *mc)
 				MDB_cursor *m2, *m3;
 				MDB_dbi dbi = mc->mc_dbi;
-				if (mc->mc_flags & C_SUB)
-					dbi--;
 				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 					if (mc->mc_flags & C_SUB)
 						m3 = &m2->mc_xcursor->mx_cursor;
@@ -7184,10 +7227,13 @@ mdb_rebalance(MDB_cursor *mc)
 						m3 = m2;
 					if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
 					if (m3->mc_pg[0] == mp) {
-						m3->mc_pg[0] = mc->mc_pg[0];
-						m3->mc_snum = 1;
-						m3->mc_top = 0;
-						m3->mc_ki[0] = m3->mc_ki[1];
+						int i;
+						m3->mc_snum--;
+						m3->mc_top--;
+						for (i=0; i<m3->mc_snum; i++) {
+							m3->mc_pg[i] = m3->mc_pg[i+1];
+							m3->mc_ki[i] = m3->mc_ki[i+1];
+						}
 					}
 				}
 			}
@@ -7300,7 +7346,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
 		/* Adjust other cursors pointing to mp */
 		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
-			if (m2 == mc)
+			if (m2 == mc || m2->mc_snum < mc->mc_snum)
 				continue;
 			if (!(m2->mc_flags & C_INITIALIZED))
 				continue;
@@ -7341,7 +7387,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi,
 	if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
 		return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
-	if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
+	if (key->mv_size > MDB_MAXKEYSIZE) {
 		return MDB_BAD_VALSIZE;
 	}
@@ -7394,24 +7440,26 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
 	unsigned int nflags)
 {
 	unsigned int flags;
-	int		 rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0;
+	int		 rc = MDB_SUCCESS, new_root = 0, did_split = 0;
 	indx_t		 newindx;
 	pgno_t		 pgno = 0;
-	unsigned int	 i, j, split_indx, nkeys, pmax;
+	int	 i, j, split_indx, nkeys, pmax;
+	MDB_env 	*env = mc->mc_txn->mt_env;
 	MDB_node	*node;
 	MDB_val	 sepkey, rkey, xdata, *rdata = &xdata;
-	MDB_page	*copy;
+	MDB_page	*copy = NULL;
 	MDB_page	*mp, *rp, *pp;
-	unsigned int ptop;
+	int ptop;
 	MDB_cursor	mn;
 	DKBUF;
 	mp = mc->mc_pg[mc->mc_top];
 	newindx = mc->mc_ki[mc->mc_top];
+	nkeys = NUMKEYS(mp);
-	DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i",
+	DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
 	    IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
-	    DKEY(newkey), mc->mc_ki[mc->mc_top]));
+	    DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
 	/* Create a right sibling. */
 	if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
@@ -7458,141 +7506,139 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
 		sepkey = *newkey;
 		split_indx = newindx;
 		nkeys = 0;
-		goto newsep;
-	}
+	} else {
-	nkeys = NUMKEYS(mp);
-	split_indx = nkeys / 2;
-	if (newindx < split_indx)
-		newpos = 0;
-	if (IS_LEAF2(rp)) {
-		char *split, *ins;
-		int x;
-		unsigned int lsize, rsize, ksize;
-		/* Move half of the keys to the right sibling */
-		copy = NULL;
-		x = mc->mc_ki[mc->mc_top] - split_indx;
-		ksize = mc->mc_db->md_pad;
-		split = LEAF2KEY(mp, split_indx, ksize);
-		rsize = (nkeys - split_indx) * ksize;
-		lsize = (nkeys - split_indx) * sizeof(indx_t);
-		mp->mp_lower -= lsize;
-		rp->mp_lower += lsize;
-		mp->mp_upper += rsize - lsize;
-		rp->mp_upper -= rsize - lsize;
-		sepkey.mv_size = ksize;
-		if (newindx == split_indx) {
-			sepkey.mv_data = newkey->mv_data;
-		} else {
-			sepkey.mv_data = split;
-		}
-		if (x<0) {
-			ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
-			memcpy(rp->mp_ptrs, split, rsize);
-			sepkey.mv_data = rp->mp_ptrs;
-			memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
-			memcpy(ins, newkey->mv_data, ksize);
-			mp->mp_lower += sizeof(indx_t);
-			mp->mp_upper -= ksize - sizeof(indx_t);
+		split_indx = (nkeys+1) / 2;
+		if (IS_LEAF2(rp)) {
+			char *split, *ins;
+			int x;
+			unsigned int lsize, rsize, ksize;
+			/* Move half of the keys to the right sibling */
+			copy = NULL;
+			x = mc->mc_ki[mc->mc_top] - split_indx;
+			ksize = mc->mc_db->md_pad;
+			split = LEAF2KEY(mp, split_indx, ksize);
+			rsize = (nkeys - split_indx) * ksize;
+			lsize = (nkeys - split_indx) * sizeof(indx_t);
+			mp->mp_lower -= lsize;
+			rp->mp_lower += lsize;
+			mp->mp_upper += rsize - lsize;
+			rp->mp_upper -= rsize - lsize;
+			sepkey.mv_size = ksize;
+			if (newindx == split_indx) {
+				sepkey.mv_data = newkey->mv_data;
+			} else {
+				sepkey.mv_data = split;
+			}
+			if (x<0) {
+				ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
+				memcpy(rp->mp_ptrs, split, rsize);
+				sepkey.mv_data = rp->mp_ptrs;
+				memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
+				memcpy(ins, newkey->mv_data, ksize);
+				mp->mp_lower += sizeof(indx_t);
+				mp->mp_upper -= ksize - sizeof(indx_t);
+			} else {
+				if (x)
+					memcpy(rp->mp_ptrs, split, x * ksize);
+				ins = LEAF2KEY(rp, x, ksize);
+				memcpy(ins, newkey->mv_data, ksize);
+				memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
+				rp->mp_lower += sizeof(indx_t);
+				rp->mp_upper -= ksize - sizeof(indx_t);
+				mc->mc_ki[mc->mc_top] = x;
+				mc->mc_pg[mc->mc_top] = rp;
+			}
 		} else {
-			if (x)
-				memcpy(rp->mp_ptrs, split, x * ksize);
-			ins = LEAF2KEY(rp, x, ksize);
-			memcpy(ins, newkey->mv_data, ksize);
-			memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
-			rp->mp_lower += sizeof(indx_t);
-			rp->mp_upper -= ksize - sizeof(indx_t);
-			mc->mc_ki[mc->mc_top] = x;
-			mc->mc_pg[mc->mc_top] = rp;
-		}
-		goto newsep;
-	}
+			int psize, nsize, k;
+			/* Maximum free space in an empty page */
+			pmax = env->me_psize - PAGEHDRSZ;
+			if (IS_LEAF(mp))
+				nsize = mdb_leaf_size(env, newkey, newdata);
+			else
+				nsize = mdb_branch_size(env, newkey);
+			nsize += nsize & 1;
-	/* For leaf pages, check the split point based on what
-	 * fits where, since otherwise mdb_node_add can fail.
-	 *
-	 * This check is only needed when the data items are
-	 * relatively large, such that being off by one will
-	 * make the difference between success or failure.
-	 *
-	 * It's also relevant if a page happens to be laid out
-	 * such that one half of its nodes are all "small" and
-	 * the other half of its nodes are "large." If the new
-	 * item is also "large" and falls on the half with
-	 * "large" nodes, it also may not fit.
-	 */
-	if (IS_LEAF(mp)) {
-		unsigned int psize, nsize;
-		/* Maximum free space in an empty page */
-		pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
-		nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
-		if ((nkeys < 20) || (nsize > pmax/16)) {
-			if (newindx <= split_indx) {
-				psize = nsize;
-				newpos = 0;
-				for (i=0; i<split_indx; i++) {
-					node = NODEPTR(mp, i);
-					psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
-					if (F_ISSET(node->mn_flags, F_BIGDATA))
-						psize += sizeof(pgno_t);
-					else
-						psize += NODEDSZ(node);
-					psize += psize & 1;
-					if (psize > pmax) {
-						if (i <= newindx) {
-							split_indx = newindx;
-							if (i < newindx)
-								newpos = 1;
+			/* grab a page to hold a temporary copy */
+			copy = mdb_page_malloc(mc->mc_txn, 1);
+			if (copy == NULL)
+				return ENOMEM;
+			copy->mp_pgno  = mp->mp_pgno;
+			copy->mp_flags = mp->mp_flags;
+			copy->mp_lower = PAGEHDRSZ;
+			copy->mp_upper = env->me_psize;
+			/* prepare to insert */
+			for (i=0, j=0; i<nkeys; i++) {
+				if (i == newindx) {
+					copy->mp_ptrs[j++] = 0;
+				}
+				copy->mp_ptrs[j++] = mp->mp_ptrs[i];
+			}
+			/* When items are relatively large the split point needs
+			 * to be checked, because being off-by-one will make the
+			 * difference between success or failure in mdb_node_add.
+			 *
+			 * It's also relevant if a page happens to be laid out
+			 * such that one half of its nodes are all "small" and
+			 * the other half of its nodes are "large." If the new
+			 * item is also "large" and falls on the half with
+			 * "large" nodes, it also may not fit.
+			 *
+			 * As a final tweak, if the new item goes on the last
+			 * spot on the page (and thus, onto the new page), bias
+			 * the split so the new page is emptier than the old page.
+			 * This yields better packing during sequential inserts.
+			 */
+			if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
+				/* Find split point */
+				psize = 0;
+				if (newindx <= split_indx || newindx >= nkeys) {
+					i = 0; j = 1;
+					k = newindx >= nkeys ? nkeys : split_indx+2;
+				} else {
+					i = nkeys; j = -1;
+					k = split_indx-1;
+				}
+				for (; i!=k; i+=j) {
+					if (i == newindx) {
+						psize += nsize;
+						node = NULL;
+					} else {
+						node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
+						psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
+						if (IS_LEAF(mp)) {
+							if (F_ISSET(node->mn_flags, F_BIGDATA))
+								psize += sizeof(pgno_t);
+							else
+								psize += NODEDSZ(node);
 						}
-						else
-							split_indx = i;
-						break;
+						psize += psize & 1;
 					}
-				}
-			} else {
-				psize = nsize;
-				for (i=nkeys-1; i>=split_indx; i--) {
-					node = NODEPTR(mp, i);
-					psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
-					if (F_ISSET(node->mn_flags, F_BIGDATA))
-						psize += sizeof(pgno_t);
-					else
-						psize += NODEDSZ(node);
-					psize += psize & 1;
-					if (psize > pmax) {
-						if (i >= newindx) {
-							split_indx = newindx;
-							newpos = 0;
-						} else
-							split_indx = i+1;
+					if (psize > pmax || i == k-j) {
+						split_indx = i + (j<0);
 						break;
 					}
 				}
 			}
+			if (split_indx == newindx) {
+				sepkey.mv_size = newkey->mv_size;
+				sepkey.mv_data = newkey->mv_data;
+			} else {
+				node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
+				sepkey.mv_size = node->mn_ksize;
+				sepkey.mv_data = NODEKEY(node);
+			}
 		}
 	}
-	/* First find the separating key between the split pages.
-	 * The case where newindx == split_indx is ambiguous; the
-	 * new item could go to the new page or stay on the original
-	 * page. If newpos == 1 it goes to the new page.
-	 */
-	if (newindx == split_indx && newpos) {
-		sepkey.mv_size = newkey->mv_size;
-		sepkey.mv_data = newkey->mv_data;
-	} else {
-		node = NODEPTR(mp, split_indx);
-		sepkey.mv_size = node->mn_ksize;
-		sepkey.mv_data = NODEKEY(node);
-	}
-newsep:
-	DPRINTF(("separator is [%s]", DKEY(&sepkey)));
+	DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
 	/* Copy separator key to the parent.
 	 */
-	if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
+	if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
 		mn.mc_snum--;
 		mn.mc_top--;
 		did_split = 1;
@@ -7637,117 +7683,97 @@ newsep:
 			return rc;
 		for (i=0; i<mc->mc_top; i++)
 			mc->mc_ki[i] = mn.mc_ki[i];
-		goto done;
-	}
-	if (IS_LEAF2(rp)) {
-		goto done;
-	}
-	/* Move half of the keys to the right sibling. */
+	} else if (!IS_LEAF2(mp)) {
+		/* Move nodes */
+		mc->mc_pg[mc->mc_top] = rp;
+		i = split_indx;
+		j = 0;
+		do {
+			if (i == newindx) {
+				rkey.mv_data = newkey->mv_data;
+				rkey.mv_size = newkey->mv_size;
+				if (IS_LEAF(mp)) {
+					rdata = newdata;
+				} else
+					pgno = newpgno;
+				flags = nflags;
+				/* Update index for the new key. */
+				mc->mc_ki[mc->mc_top] = j;
+			} else {
+				node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
+				rkey.mv_data = NODEKEY(node);
+				rkey.mv_size = node->mn_ksize;
+				if (IS_LEAF(mp)) {
+					xdata.mv_data = NODEDATA(node);
+					xdata.mv_size = NODEDSZ(node);
+					rdata = &xdata;
+				} else
+					pgno = NODEPGNO(node);
+				flags = node->mn_flags;
+			}
-	/* grab a page to hold a temporary copy */
-	copy = mdb_page_malloc(mc->mc_txn, 1);
-	if (copy == NULL)
-		return ENOMEM;
+			if (!IS_LEAF(mp) && j == 0) {
+				/* First branch index doesn't need key data. */
+				rkey.mv_size = 0;
+			}
-	copy->mp_pgno  = mp->mp_pgno;
-	copy->mp_flags = mp->mp_flags;
-	copy->mp_lower = PAGEHDRSZ;
-	copy->mp_upper = mc->mc_txn->mt_env->me_psize;
-	mc->mc_pg[mc->mc_top] = copy;
-	for (i = j = 0; i <= nkeys; j++) {
-		if (i == split_indx) {
-		/* Insert in right sibling. */
-		/* Reset insert index for right sibling. */
-			if (i != newindx || (newpos ^ ins_new)) {
+			rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
+			if (rc) {
+				/* return tmp page to freelist */
+				mdb_page_free(env, copy);
+				return rc;
+			}
+			if (i == nkeys) {
+				i = 0;
 				j = 0;
-				mc->mc_pg[mc->mc_top] = rp;
+				mc->mc_pg[mc->mc_top] = copy;
+			} else {
+				i++;
+				j++;
+			}
+		} while (i != split_indx);
+		nkeys = NUMKEYS(copy);
+		for (i=0; i<nkeys; i++)
+			mp->mp_ptrs[i] = copy->mp_ptrs[i];
+		mp->mp_lower = copy->mp_lower;
+		mp->mp_upper = copy->mp_upper;
+		memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
+			env->me_psize - copy->mp_upper);
+		/* reset back to original page */
+		if (newindx < split_indx) {
+			mc->mc_pg[mc->mc_top] = mp;
+			if (nflags & MDB_RESERVE) {
+				node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
+				if (!(node->mn_flags & F_BIGDATA))
+					newdata->mv_data = NODEDATA(node);
 			}
-		}
-		if (i == newindx && !ins_new) {
-			/* Insert the original entry that caused the split. */
-			rkey.mv_data = newkey->mv_data;
-			rkey.mv_size = newkey->mv_size;
-			if (IS_LEAF(mp)) {
-				rdata = newdata;
-			} else
-				pgno = newpgno;
-			flags = nflags;
-			ins_new = 1;
-			/* Update index for the new key. */
-			mc->mc_ki[mc->mc_top] = j;
-		} else if (i == nkeys) {
-			break;
 		} else {
-			node = NODEPTR(mp, i);
-			rkey.mv_data = NODEKEY(node);
-			rkey.mv_size = node->mn_ksize;
-			if (IS_LEAF(mp)) {
-				xdata.mv_data = NODEDATA(node);
-				xdata.mv_size = NODEDSZ(node);
-				rdata = &xdata;
-			} else
-				pgno = NODEPGNO(node);
-			flags = node->mn_flags;
-			i++;
-		}
-		if (!IS_LEAF(mp) && j == 0) {
-			/* First branch index doesn't need key data. */
-			rkey.mv_size = 0;
-		}
-		rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
-		if (rc) break;
-	}
-	nkeys = NUMKEYS(copy);
-	for (i=0; i<nkeys; i++)
-		mp->mp_ptrs[i] = copy->mp_ptrs[i];
-	mp->mp_lower = copy->mp_lower;
-	mp->mp_upper = copy->mp_upper;
-	memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
-		mc->mc_txn->mt_env->me_psize - copy->mp_upper);
-	/* reset back to original page */
-	if (newindx < split_indx || (!newpos && newindx == split_indx)) {
-		mc->mc_pg[mc->mc_top] = mp;
-		if (nflags & MDB_RESERVE) {
-			node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
-			if (!(node->mn_flags & F_BIGDATA))
-				newdata->mv_data = NODEDATA(node);
-		}
-	} else {
-		mc->mc_ki[ptop]++;
-		/* Make sure mc_ki is still valid.
-		 */
-		if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
-		    mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
-			for (i=0; i<ptop; i++) {
-				mc->mc_pg[i] = mn.mc_pg[i];
-				mc->mc_ki[i] = mn.mc_ki[i];
+			mc->mc_pg[mc->mc_top] = rp;
+			mc->mc_ki[ptop]++;
+			/* Make sure mc_ki is still valid.
+			 */
+			if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
+				mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
+				for (i=0; i<ptop; i++) {
+					mc->mc_pg[i] = mn.mc_pg[i];
+					mc->mc_ki[i] = mn.mc_ki[i];
+				}
+				mc->mc_pg[ptop] = mn.mc_pg[ptop];
+				mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
 			}
-			mc->mc_pg[ptop] = mn.mc_pg[ptop];
-			mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
 		}
+		/* return tmp page to freelist */
+		mdb_page_free(env, copy);
 	}
-	/* return tmp page to freelist */
-	mdb_page_free(mc->mc_txn->mt_env, copy);
-done:
 	{
 		/* Adjust other cursors pointing to mp */
 		MDB_cursor *m2, *m3;
 		MDB_dbi dbi = mc->mc_dbi;
 		int fixup = NUMKEYS(mp);
-		if (mc->mc_flags & C_SUB)
-			dbi--;
 		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
 			if (mc->mc_flags & C_SUB)
 				m3 = &m2->mc_xcursor->mx_cursor;
@@ -7789,6 +7815,7 @@ done:
 			}
 		}
 	}
+	DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
 	return rc;
 }
@@ -7805,13 +7832,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
 		return EINVAL;
-	if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
-		return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
-	if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
-		return MDB_BAD_VALSIZE;
-	}
 	if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
 		return EINVAL;
@@ -7851,6 +7871,16 @@ mdb_env_get_path(MDB_env *env, const char **arg)
 	return MDB_SUCCESS;
 }
+int
+mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
+{
+	if (!env || !arg)
+		return EINVAL;
+	*arg = env->me_fd;
+	return MDB_SUCCESS;
+}
 /** Common code for #mdb_stat() and #mdb_env_stat().
  * @param[in] env the environment to operate in.
  * @param[in] db the #MDB_db record containing the stats to return.
@@ -8075,7 +8105,7 @@ mdb_drop0(MDB_cursor *mc, int subs)
 {
 	int rc;
-	rc = mdb_page_search(mc, NULL, 0);
+	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
 	if (rc == MDB_SUCCESS) {
 		MDB_txn *txn = mc->mc_txn;
 		MDB_node *ni;
@@ -8273,10 +8303,10 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
 	return 0;
 }
-/* insert pid into list if not already present.
+/** Insert pid into list if not already present.
  * return -1 if already present.
  */
-static int mdb_pid_insert(pid_t *ids, pid_t pid)
+static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
 {
 	/* binary search of pid in list */
 	unsigned base = 0;
@@ -8301,7 +8331,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid)
 			return -1;
 		}
 	}
 	if( val > 0 ) {
 		++cursor;
 	}
@@ -8316,7 +8346,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
 {
 	unsigned int i, j, rdrs;
 	MDB_reader *mr;
-	pid_t *pids, pid;
+	MDB_PID_T *pids, pid;
 	int count = 0;
 	if (!env)
@@ -8326,7 +8356,7 @@ int mdb_reader_check(MDB_env *env, int *dead)
 	if (!env->me_txns)
 		return MDB_SUCCESS;
 	rdrs = env->me_txns->mti_numreaders;
-	pids = malloc((rdrs+1) * sizeof(pid_t));
+	pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
 	if (!pids)
 		return ENOMEM;
 	pids[0] = 0;
@@ -8342,6 +8372,8 @@ int mdb_reader_check(MDB_env *env, int *dead)
 					if (!mdb_reader_pid(env, Pidcheck, pid)) {
 						for (j=i; j<rdrs; j++)
 							if (mr[j].mr_pid == pid) {
+								DPRINTF(("clear stale reader pid %u txn %"Z"d",
+									(unsigned) pid, mr[j].mr_txnid));
 								mr[j].mr_pid = 0;
 								count++;
 							}