lmdb 0.4.5 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 14c11d6e3fb6489302524938d95191ac9db4131a
4
- data.tar.gz: e5e16ec5a67bc0e44a30753daf12c46c22f38fc1
3
+ metadata.gz: cfbe0e0fc20cfe471e48ab16b7a6f03a868f5fa0
4
+ data.tar.gz: cd1e90a95ee5eef3bf2bfe0f0db3e389feb22a51
5
5
  SHA512:
6
- metadata.gz: 621f7a59503333b623e1073c382cc0c2832e2e2648abed644faf0e812667a4282887ae5da1a78c67f54c15bcc5d9a504f0eb95dc9e15147d6f80dffaa9b7d5d7
7
- data.tar.gz: c634c5c777b29f49804e3748349e750be8998450d6e0f0bf77a208c3a5a97ea918f7026f51a587a585d7c9421d9c9b486f9a489498c5a43009477479f00afaa4
6
+ metadata.gz: 23b7d820ead899db18c95d87e819b9d8a533a118a86bffe2bd996cd1352ca73bf8c3399150ed7b4fdaea7a5e4ab1901bc3844e14d25672874744572d383a1985
7
+ data.tar.gz: e3d13fe0515fd9ca626ed81526b3be34a4b9f0affe73afef4862033bb8b124c1c84687115f8f5f40a3524a4308d44ac86e877113362a6b08e87f002f081052b5
@@ -1,5 +1,27 @@
1
1
  LMDB 0.9 Change Log
2
2
 
3
+ LMDB 0.9.14 Release (2014/09/15)
4
+ Fix to support 64K page size (ITS#7713)
5
+ Fix to persist decreased as well as increased mapsizes (ITS#7789)
6
+ Fix cursor bug when deleting last node of a DUPSORT key
7
+ Fix mdb_env_info to return FIXEDMAP address
8
+ Fix ambiguous error code from writing to closed DBI (ITS#7825)
9
+ Fix mdb_copy copying past end of file (ITS#7886)
10
+ Fix cursor bugs from page_merge/rebalance
11
+ Fix to dirty fewer pages in deletes (mdb_page_loose())
12
+ Fix mdb_dbi_open creating subDBs (ITS#7917)
13
+ Fix mdb_cursor_get(_DUP) with single value (ITS#7913)
14
+ Fix Windows compat issues in mtests (ITS#7879)
15
+ Add compacting variant of mdb_copy
16
+ Add BigEndian integer key compare code
17
+ Add mdb_dump/mdb_load utilities
18
+
19
+ LMDB 0.9.13 Release (2014/06/18)
20
+ Fix mdb_page_alloc unlimited overflow page search
21
+ Documentation
22
+ Re-fix MDB_CURRENT doc (ITS#7793)
23
+ Fix MDB_GET_MULTIPLE/MDB_NEXT_MULTIPLE doc
24
+
3
25
  LMDB 0.9.12 Release (2014/06/13)
4
26
  Fix MDB_GET_BOTH regression (ITS#7875,#7681)
5
27
  Fix MDB_MULTIPLE writing multiple keys (ITS#7834)
@@ -1,10 +1,10 @@
1
1
  /** @file lmdb.h
2
2
  * @brief Lightning memory-mapped database library
3
3
  *
4
- * @mainpage Lightning Memory-Mapped Database Manager (MDB)
4
+ * @mainpage Lightning Memory-Mapped Database Manager (LMDB)
5
5
  *
6
6
  * @section intro_sec Introduction
7
- * MDB is a Btree-based database management library modeled loosely on the
7
+ * LMDB is a Btree-based database management library modeled loosely on the
8
8
  * BerkeleyDB API, but much simplified. The entire database is exposed
9
9
  * in a memory map, and all data fetches return data directly
10
10
  * from the mapped memory, so no malloc's or memcpy's occur during
@@ -26,10 +26,10 @@
26
26
  * readers, and readers don't block writers.
27
27
  *
28
28
  * Unlike other well-known database mechanisms which use either write-ahead
29
- * transaction logs or append-only data writes, MDB requires no maintenance
29
+ * transaction logs or append-only data writes, LMDB requires no maintenance
30
30
  * during operation. Both write-ahead loggers and append-only databases
31
31
  * require periodic checkpointing and/or compaction of their log or database
32
- * files otherwise they grow without bound. MDB tracks free pages within
32
+ * files otherwise they grow without bound. LMDB tracks free pages within
33
33
  * the database and re-uses them for new write operations, so the database
34
34
  * size does not grow without bound in normal use.
35
35
  *
@@ -49,7 +49,7 @@
49
49
  * stale locks can block further operation.
50
50
  *
51
51
  * Fix: Check for stale readers periodically, using the
52
- * #mdb_reader_check function or the mdb_stat tool. Or just
52
+ * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. Or just
53
53
  * make all programs using the database close it; the lockfile
54
54
  * is always reset on first open of the environment.
55
55
  *
@@ -86,7 +86,7 @@
86
86
  *
87
87
  * - Use an MDB_env* in the process which opened it, without fork()ing.
88
88
  *
89
- * - Do not have open an MDB database twice in the same process at
89
+ * - Do not have an LMDB database open twice in the same process at
90
90
  * the same time. Not even from a plain open() call - close()ing it
91
91
  * breaks flock() advisory locking.
92
92
  *
@@ -109,7 +109,7 @@
109
109
  * - If you do that anyway, do a periodic check for stale readers. Or
110
110
  * close the environment once in a while, so the lockfile can get reset.
111
111
  *
112
- * - Do not use MDB databases on remote filesystems, even between
112
+ * - Do not use LMDB databases on remote filesystems, even between
113
113
  * processes on the same host. This breaks flock() on some OSes,
114
114
  * possibly memory map sync, and certainly sync between programs
115
115
  * on different hosts.
@@ -172,7 +172,7 @@ typedef void *mdb_filehandle_t;
172
172
  typedef int mdb_filehandle_t;
173
173
  #endif
174
174
 
175
- /** @defgroup mdb MDB API
175
+ /** @defgroup mdb LMDB API
176
176
  * @{
177
177
  * @brief OpenLDAP Lightning Memory-Mapped Database Manager
178
178
  */
@@ -184,7 +184,7 @@ typedef int mdb_filehandle_t;
184
184
  /** Library minor version */
185
185
  #define MDB_VERSION_MINOR 9
186
186
  /** Library patch version */
187
- #define MDB_VERSION_PATCH 12
187
+ #define MDB_VERSION_PATCH 14
188
188
 
189
189
  /** Combine args a,b,c into a single integer for easy version comparisons */
190
190
  #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@@ -194,10 +194,10 @@ typedef int mdb_filehandle_t;
194
194
  MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
195
195
 
196
196
  /** The release date of this library version */
197
- #define MDB_VERSION_DATE "June 13, 2014"
197
+ #define MDB_VERSION_DATE "September 15, 2014"
198
198
 
199
199
  /** A stringifier for the version info */
200
- #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
200
+ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")"
201
201
 
202
202
  /** A helper for the stringifier macro */
203
203
  #define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d)
@@ -333,6 +333,15 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
333
333
  #define MDB_MULTIPLE 0x80000
334
334
  /* @} */
335
335
 
336
+ /** @defgroup mdb_copy Copy Flags
337
+ * @{
338
+ */
339
+ /** Compacting copy: Omit free space from copy, and renumber all
340
+ * pages sequentially.
341
+ */
342
+ #define MDB_CP_COMPACT 0x01
343
+ /* @} */
344
+
336
345
  /** @brief Cursor Get operations.
337
346
  *
338
347
  * This is the set of all operations for retrieving data
@@ -345,16 +354,18 @@ typedef enum MDB_cursor_op {
345
354
  MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */
346
355
  MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */
347
356
  MDB_GET_CURRENT, /**< Return key/data at current cursor position */
348
- MDB_GET_MULTIPLE, /**< Return all the duplicate data items at the current
349
- cursor position. Only for #MDB_DUPFIXED */
357
+ MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items
358
+ from current cursor position. Move cursor to prepare
359
+ for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */
350
360
  MDB_LAST, /**< Position at last key/data item */
351
361
  MDB_LAST_DUP, /**< Position at last data item of current key.
352
362
  Only for #MDB_DUPSORT */
353
363
  MDB_NEXT, /**< Position at next data item */
354
364
  MDB_NEXT_DUP, /**< Position at next data item of current key.
355
365
  Only for #MDB_DUPSORT */
356
- MDB_NEXT_MULTIPLE, /**< Return all duplicate data items at the next
357
- cursor position. Only for #MDB_DUPFIXED */
366
+ MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items
367
+ from next cursor position. Move cursor to prepare
368
+ for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */
358
369
  MDB_NEXT_NODUP, /**< Position at first data item of next key */
359
370
  MDB_PREV, /**< Position at previous data item */
360
371
  MDB_PREV_DUP, /**< Position at previous data item of current key.
@@ -384,7 +395,7 @@ typedef enum MDB_cursor_op {
384
395
  #define MDB_PANIC (-30795)
385
396
  /** Environment version mismatch */
386
397
  #define MDB_VERSION_MISMATCH (-30794)
387
- /** File is not a valid MDB file */
398
+ /** File is not a valid LMDB file */
388
399
  #define MDB_INVALID (-30793)
389
400
  /** Environment mapsize reached */
390
401
  #define MDB_MAP_FULL (-30792)
@@ -410,7 +421,10 @@ typedef enum MDB_cursor_op {
410
421
  #define MDB_BAD_TXN (-30782)
411
422
  /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */
412
423
  #define MDB_BAD_VALSIZE (-30781)
413
- #define MDB_LAST_ERRCODE MDB_BAD_VALSIZE
424
+ /** The specified DBI was changed unexpectedly */
425
+ #define MDB_BAD_DBI (-30780)
426
+ /** The last defined error code */
427
+ #define MDB_LAST_ERRCODE MDB_BAD_DBI
414
428
  /** @} */
415
429
 
416
430
  /** @brief Statistics for a database in the environment */
@@ -434,7 +448,7 @@ typedef struct MDB_envinfo {
434
448
  unsigned int me_numreaders; /**< max reader slots used in the environment */
435
449
  } MDB_envinfo;
436
450
 
437
- /** @brief Return the mdb library version information.
451
+ /** @brief Return the LMDB library version information.
438
452
  *
439
453
  * @param[out] major if non-NULL, the library major version number is copied here
440
454
  * @param[out] minor if non-NULL, the library minor version number is copied here
@@ -448,14 +462,14 @@ char *mdb_version(int *major, int *minor, int *patch);
448
462
  * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3)
449
463
  * function. If the error code is greater than or equal to 0, then the string
450
464
  * returned by the system function strerror(3) is returned. If the error code
451
- * is less than 0, an error string corresponding to the MDB library error is
452
- * returned. See @ref errors for a list of MDB-specific error codes.
465
+ * is less than 0, an error string corresponding to the LMDB library error is
466
+ * returned. See @ref errors for a list of LMDB-specific error codes.
453
467
  * @param[in] err The error code
454
468
  * @retval "error message" The description of the error
455
469
  */
456
470
  char *mdb_strerror(int err);
457
471
 
458
- /** @brief Create an MDB environment handle.
472
+ /** @brief Create an LMDB environment handle.
459
473
  *
460
474
  * This function allocates memory for a #MDB_env structure. To release
461
475
  * the allocated memory and discard the handle, call #mdb_env_close().
@@ -488,15 +502,15 @@ int mdb_env_create(MDB_env **env);
488
502
  * how the operating system has allocated memory to shared libraries and other uses.
489
503
  * The feature is highly experimental.
490
504
  * <li>#MDB_NOSUBDIR
491
- * By default, MDB creates its environment in a directory whose
505
+ * By default, LMDB creates its environment in a directory whose
492
506
  * pathname is given in \b path, and creates its data and lock files
493
507
  * under that directory. With this option, \b path is used as-is for
494
508
  * the database main data file. The database lock file is the \b path
495
509
  * with "-lock" appended.
496
510
  * <li>#MDB_RDONLY
497
511
  * Open the environment in read-only mode. No write operations will be
498
- * allowed. MDB will still modify the lock file - except on read-only
499
- * filesystems, where MDB does not use locks.
512
+ * allowed. LMDB will still modify the lock file - except on read-only
513
+ * filesystems, where LMDB does not use locks.
500
514
  * <li>#MDB_WRITEMAP
501
515
  * Use a writeable memory map unless MDB_RDONLY is set. This is faster
502
516
  * and uses fewer mallocs, but loses protection from application bugs
@@ -540,7 +554,7 @@ int mdb_env_create(MDB_env **env);
540
554
  * the user synchronizes its use. Applications that multiplex many
541
555
  * user threads over individual OS threads need this option. Such an
542
556
  * application must also serialize the write transactions in an OS
543
- * thread, since MDB's write locking is unaware of the user threads.
557
+ * thread, since LMDB's write locking is unaware of the user threads.
544
558
  * <li>#MDB_NOLOCK
545
559
  * Don't do any locking. If concurrent access is anticipated, the
546
560
  * caller must manage all concurrency itself. For proper operation
@@ -579,7 +593,7 @@ int mdb_env_create(MDB_env **env);
579
593
  * @return A non-zero error value on failure and 0 on success. Some possible
580
594
  * errors are:
581
595
  * <ul>
582
- * <li>#MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the
596
+ * <li>#MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the
583
597
  * version that created the database environment.
584
598
  * <li>#MDB_INVALID - the environment file headers are corrupted.
585
599
  * <li>ENOENT - the directory specified by the path parameter doesn't exist.
@@ -589,7 +603,7 @@ int mdb_env_create(MDB_env **env);
589
603
  */
590
604
  int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode);
591
605
 
592
- /** @brief Copy an MDB environment to the specified path.
606
+ /** @brief Copy an LMDB environment to the specified path.
593
607
  *
594
608
  * This function may be used to make a backup of an existing environment.
595
609
  * No lockfile is created, since it gets recreated at need.
@@ -605,7 +619,7 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t
605
619
  */
606
620
  int mdb_env_copy(MDB_env *env, const char *path);
607
621
 
608
- /** @brief Copy an MDB environment to the specified file descriptor.
622
+ /** @brief Copy an LMDB environment to the specified file descriptor.
609
623
  *
610
624
  * This function may be used to make a backup of an existing environment.
611
625
  * No lockfile is created, since it gets recreated at need.
@@ -620,7 +634,50 @@ int mdb_env_copy(MDB_env *env, const char *path);
620
634
  */
621
635
  int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
622
636
 
623
- /** @brief Return statistics about the MDB environment.
637
+ /** @brief Copy an LMDB environment to the specified path, with options.
638
+ *
639
+ * This function may be used to make a backup of an existing environment.
640
+ * No lockfile is created, since it gets recreated at need.
641
+ * @note This call can trigger significant file size growth if run in
642
+ * parallel with write transactions, because it employs a read-only
643
+ * transaction. See long-lived transactions under @ref caveats_sec.
644
+ * @param[in] env An environment handle returned by #mdb_env_create(). It
645
+ * must have already been opened successfully.
646
+ * @param[in] path The directory in which the copy will reside. This
647
+ * directory must already exist and be writable but must otherwise be
648
+ * empty.
649
+ * @param[in] flags Special options for this operation. This parameter
650
+ * must be set to 0 or by bitwise OR'ing together one or more of the
651
+ * values described here.
652
+ * <ul>
653
+ * <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free
654
+ * pages and sequentially renumber all pages in output. This option
655
+ * consumes more CPU and runs more slowly than the default.
656
+ * </ul>
657
+ * @return A non-zero error value on failure and 0 on success.
658
+ */
659
+ int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags);
660
+
661
+ /** @brief Copy an LMDB environment to the specified file descriptor,
662
+ * with options.
663
+ *
664
+ * This function may be used to make a backup of an existing environment.
665
+ * No lockfile is created, since it gets recreated at need. See
666
+ * #mdb_env_copy2() for further details.
667
+ * @note This call can trigger significant file size growth if run in
668
+ * parallel with write transactions, because it employs a read-only
669
+ * transaction. See long-lived transactions under @ref caveats_sec.
670
+ * @param[in] env An environment handle returned by #mdb_env_create(). It
671
+ * must have already been opened successfully.
672
+ * @param[in] fd The file descriptor to write the copy to. It must
673
+ * have already been opened for Write access.
674
+ * @param[in] flags Special options for this operation.
675
+ * See #mdb_env_copy2() for options.
676
+ * @return A non-zero error value on failure and 0 on success.
677
+ */
678
+ int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags);
679
+
680
+ /** @brief Return statistics about the LMDB environment.
624
681
  *
625
682
  * @param[in] env An environment handle returned by #mdb_env_create()
626
683
  * @param[out] stat The address of an #MDB_stat structure
@@ -628,7 +685,7 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
628
685
  */
629
686
  int mdb_env_stat(MDB_env *env, MDB_stat *stat);
630
687
 
631
- /** @brief Return information about the MDB environment.
688
+ /** @brief Return information about the LMDB environment.
632
689
  *
633
690
  * @param[in] env An environment handle returned by #mdb_env_create()
634
691
  * @param[out] stat The address of an #MDB_envinfo structure
@@ -639,7 +696,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
639
696
  /** @brief Flush the data buffers to disk.
640
697
  *
641
698
  * Data is always written to disk when #mdb_txn_commit() is called,
642
- * but the operating system may keep it buffered. MDB always flushes
699
+ * but the operating system may keep it buffered. LMDB always flushes
643
700
  * the OS buffers upon commit as well, unless the environment was
644
701
  * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
645
702
  * @param[in] env An environment handle returned by #mdb_env_create()
@@ -730,7 +787,13 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
730
787
  * this process. Note that the library does not check for this condition,
731
788
  * the caller must ensure it explicitly.
732
789
  *
733
- * If the mapsize is changed by another process, #mdb_txn_begin() will
790
+ * The new size takes effect immediately for the current process but
791
+ * will not be persisted to any others until a write transaction has been
792
+ * committed by the current process. Also, only mapsize increases are
793
+ * persisted into the environment.
794
+ *
795
+ * If the mapsize is increased by another process, and data has grown
796
+ * beyond the range of the current mapsize, #mdb_txn_begin() will
734
797
  * return #MDB_MAP_RESIZED. This function may be called with a size
735
798
  * of zero to adopt the new size.
736
799
  *
@@ -822,7 +885,7 @@ int mdb_env_set_userctx(MDB_env *env, void *ctx);
822
885
  */
823
886
  void *mdb_env_get_userctx(MDB_env *env);
824
887
 
825
- /** @brief A callback function for most MDB assert() failures,
888
+ /** @brief A callback function for most LMDB assert() failures,
826
889
  * called before printing the message and aborting.
827
890
  *
828
891
  * @param[in] env An environment handle returned by #mdb_env_create().
@@ -1204,7 +1267,7 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
1204
1267
  * reserved space, which the caller can fill in later - before
1205
1268
  * the next update operation or the transaction ends. This saves
1206
1269
  * an extra memcpy if the data is being generated later.
1207
- * MDB does nothing else with this memory, the caller is expected
1270
+ * LMDB does nothing else with this memory, the caller is expected
1208
1271
  * to modify all of the space requested.
1209
1272
  * <li>#MDB_APPEND - append the given key/data pair to the end of the
1210
1273
  * database. No key comparisons are performed. This option allows
@@ -1345,11 +1408,12 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
1345
1408
  * @param[in] flags Options for this operation. This parameter
1346
1409
  * must be set to 0 or one of the values described here.
1347
1410
  * <ul>
1348
- * <li>#MDB_CURRENT - overwrite the data of the key/data pair to which
1349
- * the cursor refers with the specified data item. The \b key
1350
- * parameter is not used for positioning the cursor, but should
1351
- * still be provided. If using sorted duplicates (#MDB_DUPSORT)
1352
- * the data item must still sort into the same place.
1411
+ * <li>#MDB_CURRENT - replace the item at the current cursor position.
1412
+ * The \b key parameter must still be provided, and must match it.
1413
+ * If using sorted duplicates (#MDB_DUPSORT) the data item must still
1414
+ * sort into the same place. This is intended to be used when the
1415
+ * new data is the same size as the old. Otherwise it will simply
1416
+ * perform a delete of the old record followed by an insert.
1353
1417
  * <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not
1354
1418
  * already appear in the database. This flag may only be specified
1355
1419
  * if the database was opened with #MDB_DUPSORT. The function will
@@ -1478,4 +1542,12 @@ int mdb_reader_check(MDB_env *env, int *dead);
1478
1542
  #ifdef __cplusplus
1479
1543
  }
1480
1544
  #endif
1545
+ /** @page tools LMDB Command Line Tools
1546
+ The following describes the command line tools that are available for LMDB.
1547
+ \li \ref mdb_copy_1
1548
+ \li \ref mdb_dump_1
1549
+ \li \ref mdb_load_1
1550
+ \li \ref mdb_stat_1
1551
+ */
1552
+
1481
1553
  #endif /* _LMDB_H_ */
@@ -1,11 +1,11 @@
1
1
  /** @file mdb.c
2
- * @brief memory-mapped database library
2
+ * @brief Lightning memory-mapped database library
3
3
  *
4
4
  * A Btree-based database management library modeled loosely on the
5
5
  * BerkeleyDB API, but much simplified.
6
6
  */
7
7
  /*
8
- * Copyright 2011-2013 Howard Chu, Symas Corp.
8
+ * Copyright 2011-2014 Howard Chu, Symas Corp.
9
9
  * All rights reserved.
10
10
  *
11
11
  * Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,17 @@
35
35
  #ifndef _GNU_SOURCE
36
36
  #define _GNU_SOURCE 1
37
37
  #endif
38
- #include <sys/types.h>
39
- #include <sys/stat.h>
40
38
  #ifdef _WIN32
39
+ #include <malloc.h>
41
40
  #include <windows.h>
42
41
  /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
43
42
  * as int64 which is wrong. MSVC doesn't define it at all, so just
44
43
  * don't use it.
45
44
  */
46
45
  #define MDB_PID_T int
46
+ #define MDB_THR_T DWORD
47
+ #include <sys/types.h>
48
+ #include <sys/stat.h>
47
49
  #ifdef __GNUC__
48
50
  # include <sys/param.h>
49
51
  #else
@@ -55,7 +57,10 @@
55
57
  # endif
56
58
  #endif
57
59
  #else
60
+ #include <sys/types.h>
61
+ #include <sys/stat.h>
58
62
  #define MDB_PID_T pid_t
63
+ #define MDB_THR_T pthread_t
59
64
  #include <sys/param.h>
60
65
  #include <sys/uio.h>
61
66
  #include <sys/mman.h>
@@ -65,6 +70,16 @@
65
70
  #include <fcntl.h>
66
71
  #endif
67
72
 
73
+ #if defined(__mips) && defined(__linux)
74
+ /* MIPS has cache coherency issues, requires explicit cache control */
75
+ #include <asm/cachectl.h>
76
+ extern int cacheflush(char *addr, int nbytes, int cache);
77
+ #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
78
+ #else
79
+ #define CACHEFLUSH(addr, bytes, cache)
80
+ #endif
81
+
82
+
68
83
  #include <errno.h>
69
84
  #include <limits.h>
70
85
  #include <stddef.h>
@@ -75,6 +90,12 @@
75
90
  #include <time.h>
76
91
  #include <unistd.h>
77
92
 
93
+ #if defined(__sun)
94
+ /* Most platforms have posix_memalign, older may only have memalign */
95
+ #define HAVE_MEMALIGN 1
96
+ #include <malloc.h>
97
+ #endif
98
+
78
99
  #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
79
100
  #include <netinet/in.h>
80
101
  #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
@@ -145,7 +166,18 @@
145
166
  # error "Two's complement, reasonably sized integer types, please"
146
167
  #endif
147
168
 
148
- /** @defgroup internal MDB Internals
169
+ #ifdef __GNUC__
170
+ /** Put infrequently used env functions in separate section */
171
+ # ifdef __APPLE__
172
+ # define ESECT __attribute__ ((section("__TEXT,text_env")))
173
+ # else
174
+ # define ESECT __attribute__ ((section("text_env")))
175
+ # endif
176
+ #else
177
+ #define ESECT
178
+ #endif
179
+
180
+ /** @defgroup internal LMDB Internals
149
181
  * @{
150
182
  */
151
183
  /** @defgroup compat Compatibility Macros
@@ -156,6 +188,11 @@
156
188
  * @{
157
189
  */
158
190
 
191
+ /** Features under development */
192
+ #ifndef MDB_DEVEL
193
+ #define MDB_DEVEL 0
194
+ #endif
195
+
159
196
  /** Wrapper around __func__, which is a C99 feature */
160
197
  #if __STDC_VERSION__ >= 199901L
161
198
  # define mdb_func_ __func__
@@ -169,8 +206,10 @@
169
206
  #ifdef _WIN32
170
207
  #define MDB_USE_HASH 1
171
208
  #define MDB_PIDLOCK 0
172
- #define pthread_t DWORD
209
+ #define THREAD_RET DWORD
210
+ #define pthread_t HANDLE
173
211
  #define pthread_mutex_t HANDLE
212
+ #define pthread_cond_t HANDLE
174
213
  #define pthread_key_t DWORD
175
214
  #define pthread_self() GetCurrentThreadId()
176
215
  #define pthread_key_create(x,y) \
@@ -178,12 +217,16 @@
178
217
  #define pthread_key_delete(x) TlsFree(x)
179
218
  #define pthread_getspecific(x) TlsGetValue(x)
180
219
  #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
181
- #define pthread_mutex_unlock(x) ReleaseMutex(x)
182
- #define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
183
- #define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex)
184
- #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex)
185
- #define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex)
186
- #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex)
220
+ #define pthread_mutex_unlock(x) ReleaseMutex(*x)
221
+ #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
222
+ #define pthread_cond_signal(x) SetEvent(*x)
223
+ #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
224
+ #define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL)
225
+ #define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE)
226
+ #define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_rmutex)
227
+ #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_rmutex)
228
+ #define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_wmutex)
229
+ #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_wmutex)
187
230
  #define getpid() GetCurrentProcessId()
188
231
  #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
189
232
  #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
@@ -198,7 +241,9 @@
198
241
  #endif
199
242
  #define Z "I"
200
243
  #else
201
-
244
+ #define THREAD_RET void *
245
+ #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
246
+ #define THREAD_FINISH(thr) pthread_join(thr,NULL)
202
247
  #define Z "z" /**< printf format modifier for size_t */
203
248
 
204
249
  /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
@@ -352,7 +397,8 @@ static txnid_t mdb_debug_start;
352
397
 
353
398
  /** @brief The maximum size of a database page.
354
399
  *
355
- * This is 32k, since it must fit in #MDB_page.%mp_upper.
400
+ * It is 32k or 64k, since value-PAGEBASE must fit in
401
+ * #MDB_page.%mp_upper.
356
402
  *
357
403
  * LMDB will use database pages < OS pages if needed.
358
404
  * That causes more I/O in write transactions: The OS must
@@ -365,7 +411,7 @@ static txnid_t mdb_debug_start;
365
411
  * pressure from other processes is high. So until OSs have
366
412
  * actual paging support for Huge pages, they're not viable.
367
413
  */
368
- #define MAX_PAGESIZE 0x8000
414
+ #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
369
415
 
370
416
  /** The minimum number of keys required in a database page.
371
417
  * Setting this to a larger value will place a smaller bound on the
@@ -381,14 +427,14 @@ static txnid_t mdb_debug_start;
381
427
  */
382
428
  #define MDB_MINKEYS 2
383
429
 
384
- /** A stamp that identifies a file as an MDB file.
430
+ /** A stamp that identifies a file as an LMDB file.
385
431
  * There's nothing special about this value other than that it is easily
386
432
  * recognizable, and it will reflect any byte order mismatches.
387
433
  */
388
434
  #define MDB_MAGIC 0xBEEFC0DE
389
435
 
390
436
  /** The version number for a database's datafile format. */
391
- #define MDB_DATA_VERSION 1
437
+ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
392
438
  /** The version number for a database's lockfile format. */
393
439
  #define MDB_LOCK_VERSION 1
394
440
 
@@ -397,13 +443,14 @@ static txnid_t mdb_debug_start;
397
443
  * Define this as 0 to compute the max from the page size. 511
398
444
  * is default for backwards compat: liblmdb <= 0.9.10 can break
399
445
  * when modifying a DB with keys/dupsort data bigger than its max.
446
+ * #MDB_DEVEL sets the default to 0.
400
447
  *
401
448
  * Data items in an #MDB_DUPSORT database are also limited to
402
449
  * this size, since they're actually keys of a sub-DB. Keys and
403
450
  * #MDB_DUPSORT data items must fit on a node in a regular page.
404
451
  */
405
452
  #ifndef MDB_MAXKEYSIZE
406
- #define MDB_MAXKEYSIZE 511
453
+ #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
407
454
  #endif
408
455
 
409
456
  /** The maximum size of a key we can write to the environment. */
@@ -537,7 +584,7 @@ typedef struct MDB_rxbody {
537
584
  /** The process ID of the process owning this reader txn. */
538
585
  MDB_PID_T mrb_pid;
539
586
  /** The thread ID of the thread owning this txn. */
540
- pthread_t mrb_tid;
587
+ MDB_THR_T mrb_tid;
541
588
  } MDB_rxbody;
542
589
 
543
590
  /** The actual reader record, with cacheline padding. */
@@ -568,7 +615,7 @@ typedef struct MDB_reader {
568
615
  * unlikely. If a collision occurs, the results are unpredictable.
569
616
  */
570
617
  typedef struct MDB_txbody {
571
- /** Stamp identifying this as an MDB file. It must be set
618
+ /** Stamp identifying this as an LMDB file. It must be set
572
619
  * to #MDB_MAGIC. */
573
620
  uint32_t mtb_magic;
574
621
  /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
@@ -635,7 +682,7 @@ typedef struct MDB_page {
635
682
  #define mp_next mp_p.p_next
636
683
  union {
637
684
  pgno_t p_pgno; /**< page number */
638
- void * p_next; /**< for in-memory list of freed structs */
685
+ struct MDB_page *p_next; /**< for in-memory list of freed pages */
639
686
  } mp_p;
640
687
  uint16_t mp_pad;
641
688
  /** @defgroup mdb_page Page Flags
@@ -650,6 +697,7 @@ typedef struct MDB_page {
650
697
  #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
651
698
  #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
652
699
  #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
700
+ #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
653
701
  #define P_KEEP 0x8000 /**< leave this page alone during spill */
654
702
  /** @} */
655
703
  uint16_t mp_flags; /**< @ref mdb_page */
@@ -672,8 +720,11 @@ typedef struct MDB_page {
672
720
  /** Address of first usable data byte in a page, after the header */
673
721
  #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
674
722
 
723
+ /** ITS#7713, change PAGEBASE to handle 65536 byte pages */
724
+ #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)
725
+
675
726
  /** Number of nodes on a page */
676
- #define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1)
727
+ #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
677
728
 
678
729
  /** The amount of space remaining in the page */
679
730
  #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
@@ -700,6 +751,9 @@ typedef struct MDB_page {
700
751
  /** The number of overflow pages needed to store the given size. */
701
752
  #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
702
753
 
754
+ /** Link in #MDB_txn.%mt_loose_pgs list */
755
+ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
756
+
703
757
  /** Header for a single key/data pair within a page.
704
758
  * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
705
759
  * We guarantee 2-byte alignment for 'MDB_node's.
@@ -751,7 +805,7 @@ typedef struct MDB_node {
751
805
  #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
752
806
 
753
807
  /** Address of node \b i in page \b p */
754
- #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
808
+ #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
755
809
 
756
810
  /** Address of the key for the node */
757
811
  #define NODEKEY(node) (void *)((node)->mn_data)
@@ -841,7 +895,7 @@ typedef struct MDB_db {
841
895
  * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
842
896
  */
843
897
  typedef struct MDB_meta {
844
- /** Stamp identifying this as an MDB file. It must be set
898
+ /** Stamp identifying this as an LMDB file. It must be set
845
899
  * to #MDB_MAGIC. */
846
900
  uint32_t mm_magic;
847
901
  /** Version number of this data file. Must be set to #MDB_DATA_VERSION. */
@@ -898,6 +952,12 @@ struct MDB_txn {
898
952
  /** The list of pages that became unused during this transaction.
899
953
  */
900
954
  MDB_IDL mt_free_pgs;
955
+ /** The list of loose pages that became unused and may be reused
956
+ * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
957
+ */
958
+ MDB_page *mt_loose_pgs;
959
+ /** Number of loose pages (#mt_loose_pgs) */
960
+ int mt_loose_count;
901
961
  /** The sorted list of dirty pages we temporarily wrote to disk
902
962
  * because the dirty list was full. page numbers in here are
903
963
  * shifted left by 1, deleted slots have the LSB set.
@@ -913,6 +973,8 @@ struct MDB_txn {
913
973
  MDB_dbx *mt_dbxs;
914
974
  /** Array of MDB_db records for each known DB */
915
975
  MDB_db *mt_dbs;
976
+ /** Array of sequence numbers for each DB handle */
977
+ unsigned int *mt_dbiseqs;
916
978
  /** @defgroup mt_dbflag Transaction DB Flags
917
979
  * @ingroup internal
918
980
  * @{
@@ -1048,12 +1110,15 @@ struct MDB_env {
1048
1110
  MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1049
1111
  void *me_pbuf; /**< scratch area for DUPSORT put() */
1050
1112
  MDB_txn *me_txn; /**< current write transaction */
1113
+ MDB_txn *me_txn0; /**< prealloc'd write transaction */
1051
1114
  size_t me_mapsize; /**< size of the data memory map */
1052
1115
  off_t me_size; /**< current file size */
1053
1116
  pgno_t me_maxpg; /**< me_mapsize / me_psize */
1054
1117
  MDB_dbx *me_dbxs; /**< array of static DB info */
1055
1118
  uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
1119
+ unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
1056
1120
  pthread_key_t me_txkey; /**< thread-key for readers */
1121
+ txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
1057
1122
  MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
1058
1123
  # define me_pglast me_pgstate.mf_pglast
1059
1124
  # define me_pghead me_pgstate.mf_pghead
@@ -1102,6 +1167,10 @@ typedef struct MDB_ntxn {
1102
1167
  #define TXN_DBI_EXIST(txn, dbi) \
1103
1168
  ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID))
1104
1169
 
1170
+ /** Check for misused \b dbi handles */
1171
+ #define TXN_DBI_CHANGED(txn, dbi) \
1172
+ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1173
+
1105
1174
  static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1106
1175
  static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1107
1176
  static int mdb_page_touch(MDB_cursor *mc);
@@ -1182,7 +1251,7 @@ mdb_version(int *major, int *minor, int *patch)
1182
1251
  return MDB_VERSION_STRING;
1183
1252
  }
1184
1253
 
1185
- /** Table of descriptions for MDB @ref errors */
1254
+ /** Table of descriptions for LMDB @ref errors */
1186
1255
  static char *const mdb_errstr[] = {
1187
1256
  "MDB_KEYEXIST: Key/data pair already exists",
1188
1257
  "MDB_NOTFOUND: No matching key/data pair found",
@@ -1190,7 +1259,7 @@ static char *const mdb_errstr[] = {
1190
1259
  "MDB_CORRUPTED: Located page was wrong type",
1191
1260
  "MDB_PANIC: Update of meta page failed",
1192
1261
  "MDB_VERSION_MISMATCH: Database environment version mismatch",
1193
- "MDB_INVALID: File is not an MDB file",
1262
+ "MDB_INVALID: File is not an LMDB file",
1194
1263
  "MDB_MAP_FULL: Environment mapsize limit reached",
1195
1264
  "MDB_DBS_FULL: Environment maxdbs limit reached",
1196
1265
  "MDB_READERS_FULL: Environment maxreaders limit reached",
@@ -1203,11 +1272,20 @@ static char *const mdb_errstr[] = {
1203
1272
  "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1204
1273
  "MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
1205
1274
  "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1275
+ "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1206
1276
  };
1207
1277
 
1208
1278
  char *
1209
1279
  mdb_strerror(int err)
1210
1280
  {
1281
+ #ifdef _WIN32
1282
+ /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1283
+ * This works as long as no function between the call to mdb_strerror
1284
+ * and the actual use of the message uses more than 4K of stack.
1285
+ */
1286
+ char pad[4096];
1287
+ char buf[1024], *ptr = buf;
1288
+ #endif
1211
1289
  int i;
1212
1290
  if (!err)
1213
1291
  return ("Successful return: 0");
@@ -1217,7 +1295,32 @@ mdb_strerror(int err)
1217
1295
  return mdb_errstr[i];
1218
1296
  }
1219
1297
 
1298
+ #ifdef _WIN32
1299
+ /* These are the C-runtime error codes we use. The comment indicates
1300
+ * their numeric value, and the Win32 error they would correspond to
1301
+ * if the error actually came from a Win32 API. A major mess, we should
1302
+ * have used LMDB-specific error codes for everything.
1303
+ */
1304
+ switch(err) {
1305
+ case ENOENT: /* 2, FILE_NOT_FOUND */
1306
+ case EIO: /* 5, ACCESS_DENIED */
1307
+ case ENOMEM: /* 12, INVALID_ACCESS */
1308
+ case EACCES: /* 13, INVALID_DATA */
1309
+ case EBUSY: /* 16, CURRENT_DIRECTORY */
1310
+ case EINVAL: /* 22, BAD_COMMAND */
1311
+ case ENOSPC: /* 28, OUT_OF_PAPER */
1312
+ return strerror(err);
1313
+ default:
1314
+ ;
1315
+ }
1316
+ buf[0] = 0;
1317
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM |
1318
+ FORMAT_MESSAGE_IGNORE_INSERTS,
1319
+ NULL, err, 0, ptr, sizeof(buf), pad);
1320
+ return ptr;
1321
+ #else
1220
1322
  return strerror(err);
1323
+ #endif
1221
1324
  }
1222
1325
 
1223
1326
  /** assert(3) variant in cursor context */
@@ -1357,7 +1460,7 @@ mdb_page_list(MDB_page *mp)
1357
1460
  total = EVEN(total);
1358
1461
  }
1359
1462
  fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1360
- IS_LEAF2(mp) ? PAGEHDRSZ : mp->mp_lower, total, SIZELEFT(mp));
1463
+ IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1361
1464
  }
1362
1465
 
1363
1466
  void
@@ -1485,7 +1588,6 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1485
1588
  }
1486
1589
  return ret;
1487
1590
  }
1488
-
1489
1591
  /** Free a single page.
1490
1592
  * Saves single pages to a list, for future reuse.
1491
1593
  * (This is not used for multi-page overflow pages.)
@@ -1525,6 +1627,62 @@ mdb_dlist_free(MDB_txn *txn)
1525
1627
  dl[0].mid = 0;
1526
1628
  }
1527
1629
 
1630
+ /** Loosen or free a single page.
1631
+ * Saves single pages to a list for future reuse
1632
+ * in this same txn. It has been pulled from the freeDB
1633
+ * and already resides on the dirty list, but has been
1634
+ * deleted. Use these pages first before pulling again
1635
+ * from the freeDB.
1636
+ *
1637
+ * If the page wasn't dirtied in this txn, just add it
1638
+ * to this txn's free list.
1639
+ */
1640
+ static int
1641
+ mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
1642
+ {
1643
+ int loose = 0;
1644
+ pgno_t pgno = mp->mp_pgno;
1645
+ MDB_txn *txn = mc->mc_txn;
1646
+
1647
+ if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
1648
+ if (txn->mt_parent) {
1649
+ MDB_ID2 *dl = txn->mt_u.dirty_list;
1650
+ /* If txn has a parent, make sure the page is in our
1651
+ * dirty list.
1652
+ */
1653
+ if (dl[0].mid) {
1654
+ unsigned x = mdb_mid2l_search(dl, pgno);
1655
+ if (x <= dl[0].mid && dl[x].mid == pgno) {
1656
+ if (mp != dl[x].mptr) { /* bad cursor? */
1657
+ mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
1658
+ txn->mt_flags |= MDB_TXN_ERROR;
1659
+ return MDB_CORRUPTED;
1660
+ }
1661
+ /* ok, it's ours */
1662
+ loose = 1;
1663
+ }
1664
+ }
1665
+ } else {
1666
+ /* no parent txn, so it's just ours */
1667
+ loose = 1;
1668
+ }
1669
+ }
1670
+ if (loose) {
1671
+ DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
1672
+ mp->mp_pgno));
1673
+ NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
1674
+ txn->mt_loose_pgs = mp;
1675
+ txn->mt_loose_count++;
1676
+ mp->mp_flags |= P_LOOSE;
1677
+ } else {
1678
+ int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
1679
+ if (rc)
1680
+ return rc;
1681
+ }
1682
+
1683
+ return MDB_SUCCESS;
1684
+ }
1685
+
1528
1686
  /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1529
1687
  * @param[in] mc A cursor handle for the current operation.
1530
1688
  * @param[in] pflags Flags of the pages to update:
@@ -1535,7 +1693,7 @@ mdb_dlist_free(MDB_txn *txn)
1535
1693
  static int
1536
1694
  mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1537
1695
  {
1538
- enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
1696
+ enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
1539
1697
  MDB_txn *txn = mc->mc_txn;
1540
1698
  MDB_cursor *m3;
1541
1699
  MDB_xcursor *mx;
@@ -1686,7 +1844,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1686
1844
  for (i=dl[0].mid; i && need; i--) {
1687
1845
  MDB_ID pn = dl[i].mid << 1;
1688
1846
  dp = dl[i].mptr;
1689
- if (dp->mp_flags & P_KEEP)
1847
+ if (dp->mp_flags & (P_LOOSE|P_KEEP))
1690
1848
  continue;
1691
1849
  /* Can't spill twice, make sure it's not already in a parent's
1692
1850
  * spill list.
@@ -1790,15 +1948,27 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1790
1948
  #else
1791
1949
  enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
1792
1950
  #endif
1793
- int rc, retry = Max_retries;
1951
+ int rc, retry = num * 60;
1794
1952
  MDB_txn *txn = mc->mc_txn;
1795
1953
  MDB_env *env = txn->mt_env;
1796
1954
  pgno_t pgno, *mop = env->me_pghead;
1797
- unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1;
1955
+ unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
1798
1956
  MDB_page *np;
1799
1957
  txnid_t oldest = 0, last;
1800
1958
  MDB_cursor_op op;
1801
1959
  MDB_cursor m2;
1960
+ int found_old = 0;
1961
+
1962
+ /* If there are any loose pages, just use them */
1963
+ if (num == 1 && txn->mt_loose_pgs) {
1964
+ np = txn->mt_loose_pgs;
1965
+ txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
1966
+ txn->mt_loose_count--;
1967
+ DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
1968
+ np->mp_pgno));
1969
+ *mp = np;
1970
+ return MDB_SUCCESS;
1971
+ }
1802
1972
 
1803
1973
  *mp = NULL;
1804
1974
 
@@ -1811,7 +1981,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1811
1981
  for (op = MDB_FIRST;; op = MDB_NEXT) {
1812
1982
  MDB_val key, data;
1813
1983
  MDB_node *leaf;
1814
- pgno_t *idl, old_id, new_id;
1984
+ pgno_t *idl;
1815
1985
 
1816
1986
  /* Seek a big enough contiguous page range. Prefer
1817
1987
  * pages at the tail, just truncating the list.
@@ -1823,14 +1993,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1823
1993
  if (mop[i-n2] == pgno+n2)
1824
1994
  goto search_done;
1825
1995
  } while (--i > n2);
1826
- if (Max_retries < INT_MAX && --retry < 0)
1996
+ if (--retry < 0)
1827
1997
  break;
1828
1998
  }
1829
1999
 
1830
2000
  if (op == MDB_FIRST) { /* 1st iteration */
1831
2001
  /* Prepare to fetch more and coalesce */
1832
- oldest = mdb_find_oldest(txn);
1833
2002
  last = env->me_pglast;
2003
+ oldest = env->me_pgoldest;
1834
2004
  mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
1835
2005
  if (last) {
1836
2006
  op = MDB_SET_RANGE;
@@ -1845,8 +2015,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1845
2015
 
1846
2016
  last++;
1847
2017
  /* Do not fetch more if the record will be too recent */
1848
- if (oldest <= last)
1849
- break;
2018
+ if (oldest <= last) {
2019
+ if (!found_old) {
2020
+ oldest = mdb_find_oldest(txn);
2021
+ env->me_pgoldest = oldest;
2022
+ found_old = 1;
2023
+ }
2024
+ if (oldest <= last)
2025
+ break;
2026
+ }
1850
2027
  rc = mdb_cursor_get(&m2, &key, NULL, op);
1851
2028
  if (rc) {
1852
2029
  if (rc == MDB_NOTFOUND)
@@ -1854,8 +2031,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1854
2031
  goto fail;
1855
2032
  }
1856
2033
  last = *(txnid_t*)key.mv_data;
1857
- if (oldest <= last)
1858
- break;
2034
+ if (oldest <= last) {
2035
+ if (!found_old) {
2036
+ oldest = mdb_find_oldest(txn);
2037
+ env->me_pgoldest = oldest;
2038
+ found_old = 1;
2039
+ }
2040
+ if (oldest <= last)
2041
+ break;
2042
+ }
1859
2043
  np = m2.mc_pg[m2.mc_top];
1860
2044
  leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
1861
2045
  if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
@@ -1877,21 +2061,12 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1877
2061
  #if (MDB_DEBUG) > 1
1878
2062
  DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
1879
2063
  last, txn->mt_dbs[FREE_DBI].md_root, i));
1880
- for (k = i; k; k--)
1881
- DPRINTF(("IDL %"Z"u", idl[k]));
2064
+ for (j = i; j; j--)
2065
+ DPRINTF(("IDL %"Z"u", idl[j]));
1882
2066
  #endif
1883
2067
  /* Merge in descending sorted order */
1884
- j = mop_len;
1885
- k = mop_len += i;
1886
- mop[0] = (pgno_t)-1;
1887
- old_id = mop[j];
1888
- while (i) {
1889
- new_id = idl[i--];
1890
- for (; old_id < new_id; old_id = mop[--j])
1891
- mop[k--] = old_id;
1892
- mop[k--] = new_id;
1893
- }
1894
- mop[0] = mop_len;
2068
+ mdb_midl_xmerge(mop, idl);
2069
+ mop_len = mop[0];
1895
2070
  }
1896
2071
 
1897
2072
  /* Use new pages from the map when nothing suitable in the freeDB */
@@ -1946,8 +2121,8 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1946
2121
  * alignment so memcpy may copy words instead of bytes.
1947
2122
  */
1948
2123
  if ((unused &= -Align) && !IS_LEAF2(src)) {
1949
- upper &= -Align;
1950
- memcpy(dst, src, (lower + (Align-1)) & -Align);
2124
+ upper = (upper + PAGEBASE) & -Align;
2125
+ memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
1951
2126
  memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
1952
2127
  psize - upper);
1953
2128
  } else {
@@ -2314,7 +2489,7 @@ mdb_txn_renew0(MDB_txn *txn)
2314
2489
  return MDB_BAD_RSLOT;
2315
2490
  } else {
2316
2491
  MDB_PID_T pid = env->me_pid;
2317
- pthread_t tid = pthread_self();
2492
+ MDB_THR_T tid = pthread_self();
2318
2493
 
2319
2494
  if (!env->me_live_reader) {
2320
2495
  rc = mdb_reader_pid(env, Pidset, pid);
@@ -2373,6 +2548,7 @@ mdb_txn_renew0(MDB_txn *txn)
2373
2548
  txn->mt_free_pgs[0] = 0;
2374
2549
  txn->mt_spill_pgs = NULL;
2375
2550
  env->me_txn = txn;
2551
+ memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
2376
2552
  }
2377
2553
 
2378
2554
  /* Copy the DB info and flags */
@@ -2447,23 +2623,39 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2447
2623
  tsize = sizeof(MDB_ntxn);
2448
2624
  }
2449
2625
  size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
2450
- if (!(flags & MDB_RDONLY))
2626
+ if (!(flags & MDB_RDONLY)) {
2627
+ if (!parent) {
2628
+ txn = env->me_txn0;
2629
+ goto ok;
2630
+ }
2451
2631
  size += env->me_maxdbs * sizeof(MDB_cursor *);
2632
+ /* child txns use parent's dbiseqs */
2633
+ if (!parent)
2634
+ size += env->me_maxdbs * sizeof(unsigned int);
2635
+ }
2452
2636
 
2453
2637
  if ((txn = calloc(1, size)) == NULL) {
2454
- DPRINTF(("calloc: %s", strerror(ErrCode())));
2638
+ DPRINTF(("calloc: %s", strerror(errno)));
2455
2639
  return ENOMEM;
2456
2640
  }
2457
2641
  txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2458
2642
  if (flags & MDB_RDONLY) {
2459
2643
  txn->mt_flags |= MDB_TXN_RDONLY;
2460
2644
  txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
2645
+ txn->mt_dbiseqs = env->me_dbiseqs;
2461
2646
  } else {
2462
2647
  txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2463
- txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2648
+ if (parent) {
2649
+ txn->mt_dbiseqs = parent->mt_dbiseqs;
2650
+ txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2651
+ } else {
2652
+ txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
2653
+ txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
2654
+ }
2464
2655
  }
2465
2656
  txn->mt_env = env;
2466
2657
 
2658
+ ok:
2467
2659
  if (parent) {
2468
2660
  unsigned int i;
2469
2661
  txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
@@ -2506,9 +2698,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2506
2698
  } else {
2507
2699
  rc = mdb_txn_renew0(txn);
2508
2700
  }
2509
- if (rc)
2510
- free(txn);
2511
- else {
2701
+ if (rc) {
2702
+ if (txn != env->me_txn0)
2703
+ free(txn);
2704
+ } else {
2512
2705
  *ret = txn;
2513
2706
  DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2514
2707
  txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
@@ -2540,10 +2733,13 @@ mdb_dbis_update(MDB_txn *txn, int keep)
2540
2733
  env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2541
2734
  } else {
2542
2735
  char *ptr = env->me_dbxs[i].md_name.mv_data;
2543
- env->me_dbxs[i].md_name.mv_data = NULL;
2544
- env->me_dbxs[i].md_name.mv_size = 0;
2545
- env->me_dbflags[i] = 0;
2546
- free(ptr);
2736
+ if (ptr) {
2737
+ env->me_dbxs[i].md_name.mv_data = NULL;
2738
+ env->me_dbxs[i].md_name.mv_size = 0;
2739
+ env->me_dbflags[i] = 0;
2740
+ env->me_dbiseqs[i]++;
2741
+ free(ptr);
2742
+ }
2547
2743
  }
2548
2744
  }
2549
2745
  }
@@ -2632,7 +2828,8 @@ mdb_txn_abort(MDB_txn *txn)
2632
2828
  if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
2633
2829
  txn->mt_u.reader->mr_pid = 0;
2634
2830
 
2635
- free(txn);
2831
+ if (txn != txn->mt_env->me_txn0)
2832
+ free(txn);
2636
2833
  }
2637
2834
 
2638
2835
  /** Save the freelist as of this transaction to the freeDB.
@@ -2661,6 +2858,19 @@ mdb_freelist_save(MDB_txn *txn)
2661
2858
  return rc;
2662
2859
  }
2663
2860
 
2861
+ if (!env->me_pghead && txn->mt_loose_pgs) {
2862
+ /* Put loose page numbers in mt_free_pgs, since
2863
+ * we may be unable to return them to me_pghead.
2864
+ */
2865
+ MDB_page *mp = txn->mt_loose_pgs;
2866
+ if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
2867
+ return rc;
2868
+ for (; mp; mp = NEXT_LOOSE_PAGE(mp))
2869
+ mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2870
+ txn->mt_loose_pgs = NULL;
2871
+ txn->mt_loose_count = 0;
2872
+ }
2873
+
2664
2874
  /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2665
2875
  clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2666
2876
  ? SSIZE_MAX : maxfree_1pg;
@@ -2722,7 +2932,7 @@ mdb_freelist_save(MDB_txn *txn)
2722
2932
  }
2723
2933
 
2724
2934
  mop = env->me_pghead;
2725
- mop_len = mop ? mop[0] : 0;
2935
+ mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
2726
2936
 
2727
2937
  /* Reserve records for me_pghead[]. Split it if multi-page,
2728
2938
  * to avoid searching freeDB for a page range. Use keys in
@@ -2762,6 +2972,28 @@ mdb_freelist_save(MDB_txn *txn)
2762
2972
  total_room += head_room;
2763
2973
  }
2764
2974
 
2975
+ /* Return loose page numbers to me_pghead, though usually none are
2976
+ * left at this point. The pages themselves remain in dirty_list.
2977
+ */
2978
+ if (txn->mt_loose_pgs) {
2979
+ MDB_page *mp = txn->mt_loose_pgs;
2980
+ unsigned count = txn->mt_loose_count;
2981
+ MDB_IDL loose;
2982
+ /* Room for loose pages + temp IDL with same */
2983
+ if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
2984
+ return rc;
2985
+ mop = env->me_pghead;
2986
+ loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
2987
+ for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
2988
+ loose[ ++count ] = mp->mp_pgno;
2989
+ loose[0] = count;
2990
+ mdb_midl_sort(loose);
2991
+ mdb_midl_xmerge(mop, loose);
2992
+ txn->mt_loose_pgs = NULL;
2993
+ txn->mt_loose_count = 0;
2994
+ mop_len = mop[0];
2995
+ }
2996
+
2765
2997
  /* Fill in the reserved me_pghead records */
2766
2998
  rc = MDB_SUCCESS;
2767
2999
  if (mop_len) {
@@ -2823,8 +3055,8 @@ mdb_page_flush(MDB_txn *txn, int keep)
2823
3055
  while (++i <= pagecount) {
2824
3056
  dp = dl[i].mptr;
2825
3057
  /* Don't flush this page yet */
2826
- if (dp->mp_flags & P_KEEP) {
2827
- dp->mp_flags ^= P_KEEP;
3058
+ if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3059
+ dp->mp_flags &= ~P_KEEP;
2828
3060
  dl[++j] = dl[i];
2829
3061
  continue;
2830
3062
  }
@@ -2838,8 +3070,8 @@ mdb_page_flush(MDB_txn *txn, int keep)
2838
3070
  if (++i <= pagecount) {
2839
3071
  dp = dl[i].mptr;
2840
3072
  /* Don't flush this page yet */
2841
- if (dp->mp_flags & P_KEEP) {
2842
- dp->mp_flags ^= P_KEEP;
3073
+ if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3074
+ dp->mp_flags &= ~P_KEEP;
2843
3075
  dl[i].mid = 0;
2844
3076
  continue;
2845
3077
  }
@@ -2914,6 +3146,12 @@ mdb_page_flush(MDB_txn *txn, int keep)
2914
3146
  #endif /* _WIN32 */
2915
3147
  }
2916
3148
 
3149
+ /* MIPS has cache coherency issues; this is a no-op everywhere else.
3150
+ * Note: for any size >= on-chip cache size, entire on-chip cache is
3151
+ * flushed.
3152
+ */
3153
+ CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
3154
+
2917
3155
  for (i = keep; ++i <= pagecount; ) {
2918
3156
  dp = dl[i].mptr;
2919
3157
  /* This is a page we skipped above */
@@ -2968,6 +3206,7 @@ mdb_txn_commit(MDB_txn *txn)
2968
3206
 
2969
3207
  if (txn->mt_parent) {
2970
3208
  MDB_txn *parent = txn->mt_parent;
3209
+ MDB_page **lp;
2971
3210
  MDB_ID2L dst, src;
2972
3211
  MDB_IDL pspill;
2973
3212
  unsigned x, y, len, ps_len;
@@ -3065,6 +3304,12 @@ mdb_txn_commit(MDB_txn *txn)
3065
3304
  }
3066
3305
  }
3067
3306
 
3307
+ /* Append our loose page list to parent's */
3308
+ for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
3309
+ ;
3310
+ *lp = txn->mt_loose_pgs;
3311
+ parent->mt_loose_count += txn->mt_loose_count;
3312
+
3068
3313
  parent->mt_child = NULL;
3069
3314
  mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3070
3315
  free(txn);
@@ -3096,6 +3341,10 @@ mdb_txn_commit(MDB_txn *txn)
3096
3341
  mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3097
3342
  for (i = 2; i < txn->mt_numdbs; i++) {
3098
3343
  if (txn->mt_dbflags[i] & DB_DIRTY) {
3344
+ if (TXN_DBI_CHANGED(txn, i)) {
3345
+ rc = MDB_BAD_DBI;
3346
+ goto fail;
3347
+ }
3099
3348
  data.mv_data = &txn->mt_dbs[i];
3100
3349
  rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
3101
3350
  if (rc)
@@ -3122,6 +3371,10 @@ mdb_txn_commit(MDB_txn *txn)
3122
3371
  (rc = mdb_env_write_meta(txn)))
3123
3372
  goto fail;
3124
3373
 
3374
+ /* Free P_LOOSE pages left behind in dirty_list */
3375
+ if (!(env->me_flags & MDB_WRITEMAP))
3376
+ mdb_dlist_free(txn);
3377
+
3125
3378
  done:
3126
3379
  env->me_pglast = 0;
3127
3380
  env->me_txn = NULL;
@@ -3129,7 +3382,8 @@ done:
3129
3382
 
3130
3383
  if (env->me_txns)
3131
3384
  UNLOCK_MUTEX_W(env);
3132
- free(txn);
3385
+ if (txn != env->me_txn0)
3386
+ free(txn);
3133
3387
 
3134
3388
  return MDB_SUCCESS;
3135
3389
 
@@ -3144,7 +3398,7 @@ fail:
3144
3398
  * @param[out] meta address of where to store the meta information
3145
3399
  * @return 0 on success, non-zero on failure.
3146
3400
  */
3147
- static int
3401
+ static int ESECT
3148
3402
  mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3149
3403
  {
3150
3404
  MDB_metabuf pbuf;
@@ -3202,12 +3456,26 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3202
3456
  return 0;
3203
3457
  }
3204
3458
 
3459
+ static void ESECT
3460
+ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
3461
+ {
3462
+ meta->mm_magic = MDB_MAGIC;
3463
+ meta->mm_version = MDB_DATA_VERSION;
3464
+ meta->mm_mapsize = env->me_mapsize;
3465
+ meta->mm_psize = env->me_psize;
3466
+ meta->mm_last_pg = 1;
3467
+ meta->mm_flags = env->me_flags & 0xffff;
3468
+ meta->mm_flags |= MDB_INTEGERKEY;
3469
+ meta->mm_dbs[0].md_root = P_INVALID;
3470
+ meta->mm_dbs[1].md_root = P_INVALID;
3471
+ }
3472
+
3205
3473
  /** Write the environment parameters of a freshly created DB environment.
3206
3474
  * @param[in] env the environment handle
3207
3475
  * @param[out] meta address of where to store the meta information
3208
3476
  * @return 0 on success, non-zero on failure.
3209
3477
  */
3210
- static int
3478
+ static int ESECT
3211
3479
  mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3212
3480
  {
3213
3481
  MDB_page *p, *q;
@@ -3231,15 +3499,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3231
3499
 
3232
3500
  psize = env->me_psize;
3233
3501
 
3234
- meta->mm_magic = MDB_MAGIC;
3235
- meta->mm_version = MDB_DATA_VERSION;
3236
- meta->mm_mapsize = env->me_mapsize;
3237
- meta->mm_psize = psize;
3238
- meta->mm_last_pg = 1;
3239
- meta->mm_flags = env->me_flags & 0xffff;
3240
- meta->mm_flags |= MDB_INTEGERKEY;
3241
- meta->mm_dbs[0].md_root = P_INVALID;
3242
- meta->mm_dbs[1].md_root = P_INVALID;
3502
+ mdb_env_init_meta0(env, meta);
3243
3503
 
3244
3504
  p = calloc(2, psize);
3245
3505
  p->mp_pgno = 0;
@@ -3271,6 +3531,7 @@ mdb_env_write_meta(MDB_txn *txn)
3271
3531
  {
3272
3532
  MDB_env *env;
3273
3533
  MDB_meta meta, metab, *mp;
3534
+ size_t mapsize;
3274
3535
  off_t off;
3275
3536
  int rc, len, toggle;
3276
3537
  char *ptr;
@@ -3287,11 +3548,13 @@ mdb_env_write_meta(MDB_txn *txn)
3287
3548
 
3288
3549
  env = txn->mt_env;
3289
3550
  mp = env->me_metas[toggle];
3551
+ mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
3552
+ /* Persist any increases of mapsize config */
3553
+ if (mapsize < env->me_mapsize)
3554
+ mapsize = env->me_mapsize;
3290
3555
 
3291
3556
  if (env->me_flags & MDB_WRITEMAP) {
3292
- /* Persist any increases of mapsize config */
3293
- if (env->me_mapsize > mp->mm_mapsize)
3294
- mp->mm_mapsize = env->me_mapsize;
3557
+ mp->mm_mapsize = mapsize;
3295
3558
  mp->mm_dbs[0] = txn->mt_dbs[0];
3296
3559
  mp->mm_dbs[1] = txn->mt_dbs[1];
3297
3560
  mp->mm_last_pg = txn->mt_next_pgno - 1;
@@ -3318,22 +3581,15 @@ mdb_env_write_meta(MDB_txn *txn)
3318
3581
  metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
3319
3582
  metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
3320
3583
 
3321
- ptr = (char *)&meta;
3322
- if (env->me_mapsize > mp->mm_mapsize) {
3323
- /* Persist any increases of mapsize config */
3324
- meta.mm_mapsize = env->me_mapsize;
3325
- off = offsetof(MDB_meta, mm_mapsize);
3326
- } else {
3327
- off = offsetof(MDB_meta, mm_dbs[0].md_depth);
3328
- }
3329
- len = sizeof(MDB_meta) - off;
3330
-
3331
- ptr += off;
3584
+ meta.mm_mapsize = mapsize;
3332
3585
  meta.mm_dbs[0] = txn->mt_dbs[0];
3333
3586
  meta.mm_dbs[1] = txn->mt_dbs[1];
3334
3587
  meta.mm_last_pg = txn->mt_next_pgno - 1;
3335
3588
  meta.mm_txnid = txn->mt_txnid;
3336
3589
 
3590
+ off = offsetof(MDB_meta, mm_mapsize);
3591
+ ptr = (char *)&meta + off;
3592
+ len = sizeof(MDB_meta) - off;
3337
3593
  if (toggle)
3338
3594
  off += env->me_psize;
3339
3595
  off += PAGEHDRSZ;
@@ -3372,6 +3628,8 @@ fail:
3372
3628
  env->me_flags |= MDB_FATAL_ERROR;
3373
3629
  return rc;
3374
3630
  }
3631
+ /* MIPS has cache coherency issues, this is a no-op everywhere else */
3632
+ CACHEFLUSH(env->me_map + off, len, DCACHE);
3375
3633
  done:
3376
3634
  /* Memory ordering issues are irrelevant; since the entire writer
3377
3635
  * is wrapped by wmutex, all of these changes will become visible
@@ -3395,7 +3653,7 @@ mdb_env_pick_meta(const MDB_env *env)
3395
3653
  return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
3396
3654
  }
3397
3655
 
3398
- int
3656
+ int ESECT
3399
3657
  mdb_env_create(MDB_env **env)
3400
3658
  {
3401
3659
  MDB_env *e;
@@ -3420,8 +3678,8 @@ mdb_env_create(MDB_env **env)
3420
3678
  return MDB_SUCCESS;
3421
3679
  }
3422
3680
 
3423
- static int
3424
- mdb_env_map(MDB_env *env, void *addr, int newsize)
3681
+ static int ESECT
3682
+ mdb_env_map(MDB_env *env, void *addr)
3425
3683
  {
3426
3684
  MDB_page *p;
3427
3685
  unsigned int flags = env->me_flags;
@@ -3429,18 +3687,28 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3429
3687
  int rc;
3430
3688
  HANDLE mh;
3431
3689
  LONG sizelo, sizehi;
3432
- sizelo = env->me_mapsize & 0xffffffff;
3433
- sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */
3690
+ size_t msize;
3434
3691
 
3435
- /* Windows won't create mappings for zero length files.
3436
- * Just allocate the maxsize right now.
3437
- */
3438
- if (newsize) {
3692
+ if (flags & MDB_RDONLY) {
3693
+ /* Don't set explicit map size, use whatever exists */
3694
+ msize = 0;
3695
+ sizelo = 0;
3696
+ sizehi = 0;
3697
+ } else {
3698
+ msize = env->me_mapsize;
3699
+ sizelo = msize & 0xffffffff;
3700
+ sizehi = msize >> 16 >> 16; /* only needed on Win64 */
3701
+
3702
+ /* Windows won't create mappings for zero length files.
3703
+ * It also won't map more than the file size.
3704
+ * Just set the maxsize right now.
3705
+ */
3439
3706
  if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
3440
3707
  || !SetEndOfFile(env->me_fd)
3441
3708
  || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)
3442
3709
  return ErrCode();
3443
3710
  }
3711
+
3444
3712
  mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
3445
3713
  PAGE_READWRITE : PAGE_READONLY,
3446
3714
  sizehi, sizelo, NULL);
@@ -3448,7 +3716,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3448
3716
  return ErrCode();
3449
3717
  env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
3450
3718
  FILE_MAP_WRITE : FILE_MAP_READ,
3451
- 0, 0, env->me_mapsize, addr);
3719
+ 0, 0, msize, addr);
3452
3720
  rc = env->me_map ? 0 : ErrCode();
3453
3721
  CloseHandle(mh);
3454
3722
  if (rc)
@@ -3494,7 +3762,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3494
3762
  return MDB_SUCCESS;
3495
3763
  }
3496
3764
 
3497
- int
3765
+ int ESECT
3498
3766
  mdb_env_set_mapsize(MDB_env *env, size_t size)
3499
3767
  {
3500
3768
  /* If env is already open, caller is responsible for making
@@ -3518,7 +3786,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3518
3786
  munmap(env->me_map, env->me_mapsize);
3519
3787
  env->me_mapsize = size;
3520
3788
  old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
3521
- rc = mdb_env_map(env, old, 1);
3789
+ rc = mdb_env_map(env, old);
3522
3790
  if (rc)
3523
3791
  return rc;
3524
3792
  }
@@ -3528,7 +3796,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3528
3796
  return MDB_SUCCESS;
3529
3797
  }
3530
3798
 
3531
- int
3799
+ int ESECT
3532
3800
  mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3533
3801
  {
3534
3802
  if (env->me_map)
@@ -3537,7 +3805,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3537
3805
  return MDB_SUCCESS;
3538
3806
  }
3539
3807
 
3540
- int
3808
+ int ESECT
3541
3809
  mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3542
3810
  {
3543
3811
  if (env->me_map || readers < 1)
@@ -3546,7 +3814,7 @@ mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3546
3814
  return MDB_SUCCESS;
3547
3815
  }
3548
3816
 
3549
- int
3817
+ int ESECT
3550
3818
  mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3551
3819
  {
3552
3820
  if (!env || !readers)
@@ -3555,9 +3823,9 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3555
3823
  return MDB_SUCCESS;
3556
3824
  }
3557
3825
 
3558
- /** Further setup required for opening an MDB environment
3826
+ /** Further setup required for opening an LMDB environment
3559
3827
  */
3560
- static int
3828
+ static int ESECT
3561
3829
  mdb_env_open2(MDB_env *env)
3562
3830
  {
3563
3831
  unsigned int flags = env->me_flags;
@@ -3602,7 +3870,7 @@ mdb_env_open2(MDB_env *env)
3602
3870
  env->me_mapsize = minsize;
3603
3871
  }
3604
3872
 
3605
- rc = mdb_env_map(env, meta.mm_address, newenv || env->me_mapsize != meta.mm_mapsize);
3873
+ rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
3606
3874
  if (rc)
3607
3875
  return rc;
3608
3876
 
@@ -3714,7 +3982,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
3714
3982
  #endif
3715
3983
 
3716
3984
  /** Downgrade the exclusive lock on the region back to shared */
3717
- static int
3985
+ static int ESECT
3718
3986
  mdb_env_share_locks(MDB_env *env, int *excl)
3719
3987
  {
3720
3988
  int rc = 0, toggle = mdb_env_pick_meta(env);
@@ -3756,7 +4024,7 @@ mdb_env_share_locks(MDB_env *env, int *excl)
3756
4024
  /** Try to get exlusive lock, otherwise shared.
3757
4025
  * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
3758
4026
  */
3759
- static int
4027
+ static int ESECT
3760
4028
  mdb_env_excl_lock(MDB_env *env, int *excl)
3761
4029
  {
3762
4030
  int rc = 0;
@@ -3891,14 +4159,14 @@ mdb_hash_enc(MDB_val *val, char *encbuf)
3891
4159
  #endif
3892
4160
 
3893
4161
  /** Open and/or initialize the lock region for the environment.
3894
- * @param[in] env The MDB environment.
4162
+ * @param[in] env The LMDB environment.
3895
4163
  * @param[in] lpath The pathname of the file used for the lock region.
3896
4164
  * @param[in] mode The Unix permissions for the file, if we create it.
3897
4165
  * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive
3898
4166
  * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
3899
4167
  * @return 0 on success, non-zero on failure.
3900
4168
  */
3901
- static int
4169
+ static int ESECT
3902
4170
  mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
3903
4171
  {
3904
4172
  #ifdef _WIN32
@@ -4128,7 +4396,7 @@ fail:
4128
4396
  # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
4129
4397
  #endif
4130
4398
 
4131
- int
4399
+ int ESECT
4132
4400
  mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
4133
4401
  {
4134
4402
  int oflags, rc, len, excl = -1;
@@ -4173,7 +4441,8 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4173
4441
  env->me_path = strdup(path);
4174
4442
  env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
4175
4443
  env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
4176
- if (!(env->me_dbxs && env->me_path && env->me_dbflags)) {
4444
+ env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
4445
+ if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
4177
4446
  rc = ENOMEM;
4178
4447
  goto leave;
4179
4448
  }
@@ -4245,6 +4514,22 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4245
4514
  if (!((flags & MDB_RDONLY) ||
4246
4515
  (env->me_pbuf = calloc(1, env->me_psize))))
4247
4516
  rc = ENOMEM;
4517
+ if (!(flags & MDB_RDONLY)) {
4518
+ MDB_txn *txn;
4519
+ int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
4520
+ (sizeof(MDB_db)+sizeof(MDB_cursor)+sizeof(unsigned int)+1);
4521
+ txn = calloc(1, size);
4522
+ if (txn) {
4523
+ txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
4524
+ txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
4525
+ txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
4526
+ txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
4527
+ txn->mt_env = env;
4528
+ env->me_txn0 = txn;
4529
+ } else {
4530
+ rc = ENOMEM;
4531
+ }
4532
+ }
4248
4533
  }
4249
4534
 
4250
4535
  leave:
@@ -4256,7 +4541,7 @@ leave:
4256
4541
  }
4257
4542
 
4258
4543
  /** Destroy resources from mdb_env_open(), clear our readers & DBIs */
4259
- static void
4544
+ static void ESECT
4260
4545
  mdb_env_close0(MDB_env *env, int excl)
4261
4546
  {
4262
4547
  int i;
@@ -4269,6 +4554,7 @@ mdb_env_close0(MDB_env *env, int excl)
4269
4554
  free(env->me_dbxs[i].md_name.mv_data);
4270
4555
 
4271
4556
  free(env->me_pbuf);
4557
+ free(env->me_dbiseqs);
4272
4558
  free(env->me_dbflags);
4273
4559
  free(env->me_dbxs);
4274
4560
  free(env->me_path);
@@ -4344,186 +4630,41 @@ mdb_env_close0(MDB_env *env, int excl)
4344
4630
  env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
4345
4631
  }
4346
4632
 
4347
- int
4348
- mdb_env_copyfd(MDB_env *env, HANDLE fd)
4349
- {
4350
- MDB_txn *txn = NULL;
4351
- int rc;
4352
- size_t wsize;
4353
- char *ptr;
4354
- #ifdef _WIN32
4355
- DWORD len, w2;
4356
- #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
4357
- #else
4358
- ssize_t len;
4359
- size_t w2;
4360
- #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
4361
- #endif
4362
-
4363
- /* Do the lock/unlock of the reader mutex before starting the
4364
- * write txn. Otherwise other read txns could block writers.
4365
- */
4366
- rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
4367
- if (rc)
4368
- return rc;
4369
-
4370
- if (env->me_txns) {
4371
- /* We must start the actual read txn after blocking writers */
4372
- mdb_txn_reset0(txn, "reset-stage1");
4373
4633
 
4374
- /* Temporarily block writers until we snapshot the meta pages */
4375
- LOCK_MUTEX_W(env);
4634
+ void ESECT
4635
+ mdb_env_close(MDB_env *env)
4636
+ {
4637
+ MDB_page *dp;
4376
4638
 
4377
- rc = mdb_txn_renew0(txn);
4378
- if (rc) {
4379
- UNLOCK_MUTEX_W(env);
4380
- goto leave;
4381
- }
4382
- }
4639
+ if (env == NULL)
4640
+ return;
4383
4641
 
4384
- wsize = env->me_psize * 2;
4385
- ptr = env->me_map;
4386
- w2 = wsize;
4387
- while (w2 > 0) {
4388
- DO_WRITE(rc, fd, ptr, w2, len);
4389
- if (!rc) {
4390
- rc = ErrCode();
4391
- break;
4392
- } else if (len > 0) {
4393
- rc = MDB_SUCCESS;
4394
- ptr += len;
4395
- w2 -= len;
4396
- continue;
4397
- } else {
4398
- /* Non-blocking or async handles are not supported */
4399
- rc = EIO;
4400
- break;
4401
- }
4642
+ VGMEMP_DESTROY(env);
4643
+ while ((dp = env->me_dpages) != NULL) {
4644
+ VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4645
+ env->me_dpages = dp->mp_next;
4646
+ free(dp);
4402
4647
  }
4403
- if (env->me_txns)
4404
- UNLOCK_MUTEX_W(env);
4405
-
4406
- if (rc)
4407
- goto leave;
4408
4648
 
4409
- wsize = txn->mt_next_pgno * env->me_psize - wsize;
4410
- while (wsize > 0) {
4411
- if (wsize > MAX_WRITE)
4412
- w2 = MAX_WRITE;
4413
- else
4414
- w2 = wsize;
4415
- DO_WRITE(rc, fd, ptr, w2, len);
4416
- if (!rc) {
4417
- rc = ErrCode();
4418
- break;
4419
- } else if (len > 0) {
4420
- rc = MDB_SUCCESS;
4421
- ptr += len;
4422
- wsize -= len;
4423
- continue;
4424
- } else {
4425
- rc = EIO;
4426
- break;
4427
- }
4428
- }
4649
+ mdb_env_close0(env, 0);
4650
+ free(env);
4651
+ }
4429
4652
 
4430
- leave:
4431
- mdb_txn_abort(txn);
4432
- return rc;
4653
+ /** Compare two items pointing at aligned size_t's */
4654
+ static int
4655
+ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4656
+ {
4657
+ return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4658
+ *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4433
4659
  }
4434
4660
 
4435
- int
4436
- mdb_env_copy(MDB_env *env, const char *path)
4661
+ /** Compare two items pointing at aligned unsigned int's */
4662
+ static int
4663
+ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4437
4664
  {
4438
- int rc, len;
4439
- char *lpath;
4440
- HANDLE newfd = INVALID_HANDLE_VALUE;
4441
-
4442
- if (env->me_flags & MDB_NOSUBDIR) {
4443
- lpath = (char *)path;
4444
- } else {
4445
- len = strlen(path);
4446
- len += sizeof(DATANAME);
4447
- lpath = malloc(len);
4448
- if (!lpath)
4449
- return ENOMEM;
4450
- sprintf(lpath, "%s" DATANAME, path);
4451
- }
4452
-
4453
- /* The destination path must exist, but the destination file must not.
4454
- * We don't want the OS to cache the writes, since the source data is
4455
- * already in the OS cache.
4456
- */
4457
- #ifdef _WIN32
4458
- newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
4459
- FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
4460
- #else
4461
- newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
4462
- #endif
4463
- if (newfd == INVALID_HANDLE_VALUE) {
4464
- rc = ErrCode();
4465
- goto leave;
4466
- }
4467
-
4468
- #ifdef O_DIRECT
4469
- /* Set O_DIRECT if the file system supports it */
4470
- if ((rc = fcntl(newfd, F_GETFL)) != -1)
4471
- (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
4472
- #endif
4473
- #ifdef F_NOCACHE /* __APPLE__ */
4474
- rc = fcntl(newfd, F_NOCACHE, 1);
4475
- if (rc) {
4476
- rc = ErrCode();
4477
- goto leave;
4478
- }
4479
- #endif
4480
-
4481
- rc = mdb_env_copyfd(env, newfd);
4482
-
4483
- leave:
4484
- if (!(env->me_flags & MDB_NOSUBDIR))
4485
- free(lpath);
4486
- if (newfd != INVALID_HANDLE_VALUE)
4487
- if (close(newfd) < 0 && rc == MDB_SUCCESS)
4488
- rc = ErrCode();
4489
-
4490
- return rc;
4491
- }
4492
-
4493
- void
4494
- mdb_env_close(MDB_env *env)
4495
- {
4496
- MDB_page *dp;
4497
-
4498
- if (env == NULL)
4499
- return;
4500
-
4501
- VGMEMP_DESTROY(env);
4502
- while ((dp = env->me_dpages) != NULL) {
4503
- VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4504
- env->me_dpages = dp->mp_next;
4505
- free(dp);
4506
- }
4507
-
4508
- mdb_env_close0(env, 0);
4509
- free(env);
4510
- }
4511
-
4512
- /** Compare two items pointing at aligned size_t's */
4513
- static int
4514
- mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4515
- {
4516
- return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4517
- *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4518
- }
4519
-
4520
- /** Compare two items pointing at aligned unsigned int's */
4521
- static int
4522
- mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4523
- {
4524
- return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4525
- *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4526
- }
4665
+ return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4666
+ *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4667
+ }
4527
4668
 
4528
4669
  /** Compare two items pointing at unsigned ints of unknown alignment.
4529
4670
  * Nodes and keys are guaranteed to be 2-byte aligned.
@@ -4542,7 +4683,16 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
4542
4683
  } while(!x && u > (unsigned short *)a->mv_data);
4543
4684
  return x;
4544
4685
  #else
4545
- return memcmp(a->mv_data, b->mv_data, a->mv_size);
4686
+ unsigned short *u, *c, *end;
4687
+ int x;
4688
+
4689
+ end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4690
+ u = (unsigned short *)a->mv_data;
4691
+ c = (unsigned short *)b->mv_data;
4692
+ do {
4693
+ x = *u++ - *c++;
4694
+ } while(!x && u < end);
4695
+ return x;
4546
4696
  #endif
4547
4697
  }
4548
4698
 
@@ -4924,6 +5074,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4924
5074
  /* Make sure we're using an up-to-date root */
4925
5075
  if (*mc->mc_dbflag & DB_STALE) {
4926
5076
  MDB_cursor mc2;
5077
+ if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5078
+ return MDB_BAD_DBI;
4927
5079
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4928
5080
  rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
4929
5081
  if (rc)
@@ -5264,8 +5416,10 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5264
5416
  if (op == MDB_PREV || op == MDB_PREV_DUP) {
5265
5417
  rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5266
5418
  if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5267
- if (rc == MDB_SUCCESS)
5419
+ if (rc == MDB_SUCCESS) {
5268
5420
  MDB_GET_KEY(leaf, key);
5421
+ mc->mc_flags &= ~C_EOF;
5422
+ }
5269
5423
  return rc;
5270
5424
  }
5271
5425
  } else {
@@ -5457,8 +5611,10 @@ set1:
5457
5611
  mc->mc_flags &= ~C_EOF;
5458
5612
 
5459
5613
  if (IS_LEAF2(mp)) {
5460
- key->mv_size = mc->mc_db->md_pad;
5461
- key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5614
+ if (op == MDB_SET_RANGE || op == MDB_SET_KEY) {
5615
+ key->mv_size = mc->mc_db->md_pad;
5616
+ key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5617
+ }
5462
5618
  return MDB_SUCCESS;
5463
5619
  }
5464
5620
 
@@ -5740,6 +5896,14 @@ fetchm:
5740
5896
  rc = MDB_INCOMPATIBLE;
5741
5897
  break;
5742
5898
  }
5899
+ {
5900
+ MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5901
+ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5902
+ MDB_GET_KEY(leaf, key);
5903
+ rc = mdb_node_read(mc->mc_txn, leaf, data);
5904
+ break;
5905
+ }
5906
+ }
5743
5907
  if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
5744
5908
  rc = EINVAL;
5745
5909
  break;
@@ -5776,6 +5940,8 @@ mdb_cursor_touch(MDB_cursor *mc)
5776
5940
  if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
5777
5941
  MDB_cursor mc2;
5778
5942
  MDB_xcursor mcx;
5943
+ if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5944
+ return MDB_BAD_DBI;
5779
5945
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
5780
5946
  rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
5781
5947
  if (rc)
@@ -5932,22 +6098,42 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5932
6098
  if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
5933
6099
  LEAFSIZE(key, data) > env->me_nodemax)
5934
6100
  {
5935
- /* Too big for a node, insert in sub-DB */
6101
+ /* Too big for a node, insert in sub-DB. Set up an empty
6102
+ * "old sub-page" for prep_subDB to expand to a full page.
6103
+ */
5936
6104
  fp_flags = P_LEAF|P_DIRTY;
5937
6105
  fp = env->me_pbuf;
5938
6106
  fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
5939
- fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ;
6107
+ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
6108
+ olddata.mv_size = PAGEHDRSZ;
5940
6109
  goto prep_subDB;
5941
6110
  }
5942
6111
  } else {
5943
6112
  /* there's only a key anyway, so this is a no-op */
5944
6113
  if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6114
+ char *ptr;
5945
6115
  unsigned int ksize = mc->mc_db->md_pad;
5946
6116
  if (key->mv_size != ksize)
5947
6117
  return MDB_BAD_VALSIZE;
5948
- if (flags == MDB_CURRENT) {
5949
- char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
5950
- memcpy(ptr, key->mv_data, ksize);
6118
+ ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
6119
+ memcpy(ptr, key->mv_data, ksize);
6120
+ fix_parent:
6121
+ /* if overwriting slot 0 of leaf, need to
6122
+ * update branch key if there is a parent page
6123
+ */
6124
+ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6125
+ unsigned short top = mc->mc_top;
6126
+ mc->mc_top--;
6127
+ /* slot 0 is always an empty key, find real slot */
6128
+ while (mc->mc_top && !mc->mc_ki[mc->mc_top])
6129
+ mc->mc_top--;
6130
+ if (mc->mc_ki[mc->mc_top])
6131
+ rc2 = mdb_update_key(mc, key);
6132
+ else
6133
+ rc2 = MDB_SUCCESS;
6134
+ mc->mc_top = top;
6135
+ if (rc2)
6136
+ return rc2;
5951
6137
  }
5952
6138
  return MDB_SUCCESS;
5953
6139
  }
@@ -5978,12 +6164,12 @@ more:
5978
6164
  if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
5979
6165
  mc->mc_dbx->md_dcmp = mdb_cmp_clong;
5980
6166
  #endif
5981
- /* if data matches, skip it */
6167
+ /* does data match? */
5982
6168
  if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
5983
6169
  if (flags & MDB_NODUPDATA)
5984
6170
  return MDB_KEYEXIST;
5985
- rc = MDB_SUCCESS;
5986
- goto next_sub;
6171
+ /* overwrite it */
6172
+ goto current;
5987
6173
  }
5988
6174
 
5989
6175
  /* Back up original data item */
@@ -5992,7 +6178,7 @@ more:
5992
6178
 
5993
6179
  /* Make sub-page header for the dup items, with dummy body */
5994
6180
  fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
5995
- fp->mp_lower = PAGEHDRSZ;
6181
+ fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
5996
6182
  xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5997
6183
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5998
6184
  fp->mp_flags |= P_LEAF2;
@@ -6002,8 +6188,8 @@ more:
6002
6188
  xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
6003
6189
  (dkey.mv_size & 1) + (data->mv_size & 1);
6004
6190
  }
6005
- fp->mp_upper = xdata.mv_size;
6006
- olddata.mv_size = fp->mp_upper; /* pretend olddata is fp */
6191
+ fp->mp_upper = xdata.mv_size - PAGEBASE;
6192
+ olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
6007
6193
  } else if (leaf->mn_flags & F_SUBDATA) {
6008
6194
  /* Data is on sub-DB, just store it */
6009
6195
  flags |= F_DUPDATA|F_SUBDATA;
@@ -6070,8 +6256,8 @@ prep_subDB:
6070
6256
  if (fp_flags & P_LEAF2) {
6071
6257
  memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6072
6258
  } else {
6073
- memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
6074
- olddata.mv_size - fp->mp_upper);
6259
+ memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
6260
+ olddata.mv_size - fp->mp_upper - PAGEBASE);
6075
6261
  for (i=0; i<NUMKEYS(fp); i++)
6076
6262
  mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
6077
6263
  }
@@ -6154,8 +6340,10 @@ current:
6154
6340
  data->mv_data = olddata.mv_data;
6155
6341
  else if (!(mc->mc_flags & C_SUB))
6156
6342
  memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6157
- else
6343
+ else {
6158
6344
  memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6345
+ goto fix_parent;
6346
+ }
6159
6347
  return MDB_SUCCESS;
6160
6348
  }
6161
6349
  mdb_node_del(mc, 0);
@@ -6259,7 +6447,6 @@ put_sub:
6259
6447
  */
6260
6448
  mc->mc_flags |= C_INITIALIZED;
6261
6449
  }
6262
- next_sub:
6263
6450
  if (flags & MDB_MULTIPLE) {
6264
6451
  if (!rc) {
6265
6452
  mcount++;
@@ -6393,8 +6580,8 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
6393
6580
  DPRINTF(("allocated new mpage %"Z"u, page size %u",
6394
6581
  np->mp_pgno, mc->mc_txn->mt_env->me_psize));
6395
6582
  np->mp_flags = flags | P_DIRTY;
6396
- np->mp_lower = PAGEHDRSZ;
6397
- np->mp_upper = mc->mc_txn->mt_env->me_psize;
6583
+ np->mp_lower = (PAGEHDRSZ-PAGEBASE);
6584
+ np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
6398
6585
 
6399
6586
  if (IS_BRANCH(np))
6400
6587
  mc->mc_db->md_branch_pages++;
@@ -6647,7 +6834,7 @@ mdb_node_del(MDB_cursor *mc, int ksize)
6647
6834
  }
6648
6835
  }
6649
6836
 
6650
- base = (char *)mp + mp->mp_upper;
6837
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6651
6838
  memmove(base + sz, base, ptr - mp->mp_upper);
6652
6839
 
6653
6840
  mp->mp_lower -= sizeof(indx_t);
@@ -6701,7 +6888,7 @@ mdb_node_shrink(MDB_page *mp, indx_t indx)
6701
6888
  mp->mp_ptrs[i] += delta;
6702
6889
  }
6703
6890
 
6704
- base = (char *)mp + mp->mp_upper;
6891
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6705
6892
  memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
6706
6893
  mp->mp_upper += delta;
6707
6894
  }
@@ -6877,6 +7064,12 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp)
6877
7064
  if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
6878
7065
  return MDB_BAD_TXN;
6879
7066
 
7067
+ if (!(mc->mc_flags & C_INITIALIZED))
7068
+ return EINVAL;
7069
+
7070
+ if (!mc->mc_snum || (mc->mc_flags & C_EOF))
7071
+ return MDB_NOTFOUND;
7072
+
6880
7073
  leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6881
7074
  if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6882
7075
  *countp = 1;
@@ -6973,7 +7166,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key)
6973
7166
  mp->mp_ptrs[i] -= delta;
6974
7167
  }
6975
7168
 
6976
- base = (char *)mp + mp->mp_upper;
7169
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6977
7170
  len = ptr - mp->mp_upper + NODESIZE;
6978
7171
  memmove(base - delta, base, len);
6979
7172
  mp->mp_upper -= delta;
@@ -7054,20 +7247,20 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7054
7247
  MDB_node *s2;
7055
7248
  MDB_val bkey;
7056
7249
  /* must find the lowest key below dst */
7057
- rc = mdb_page_search_lowest(cdst);
7250
+ mdb_cursor_copy(cdst, &mn);
7251
+ rc = mdb_page_search_lowest(&mn);
7058
7252
  if (rc)
7059
7253
  return rc;
7060
- if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
7061
- bkey.mv_size = cdst->mc_db->md_pad;
7062
- bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
7254
+ if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7255
+ bkey.mv_size = mn.mc_db->md_pad;
7256
+ bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
7063
7257
  } else {
7064
- s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
7258
+ s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7065
7259
  bkey.mv_size = NODEKSZ(s2);
7066
7260
  bkey.mv_data = NODEKEY(s2);
7067
7261
  }
7068
- cdst->mc_snum = snum--;
7069
- cdst->mc_top = snum;
7070
- mdb_cursor_copy(cdst, &mn);
7262
+ mn.mc_snum = snum--;
7263
+ mn.mc_top = snum;
7071
7264
  mn.mc_ki[snum] = 0;
7072
7265
  rc = mdb_update_key(&mn, &bkey);
7073
7266
  if (rc)
@@ -7183,14 +7376,17 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7183
7376
  static int
7184
7377
  mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7185
7378
  {
7186
- int rc;
7187
- indx_t i, j;
7188
- MDB_node *srcnode;
7379
+ MDB_page *psrc, *pdst;
7380
+ MDB_node *srcnode;
7189
7381
  MDB_val key, data;
7190
- unsigned nkeys;
7382
+ unsigned nkeys;
7383
+ int rc;
7384
+ indx_t i, j;
7191
7385
 
7192
- DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno,
7193
- cdst->mc_pg[cdst->mc_top]->mp_pgno));
7386
+ psrc = csrc->mc_pg[csrc->mc_top];
7387
+ pdst = cdst->mc_pg[cdst->mc_top];
7388
+
7389
+ DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));
7194
7390
 
7195
7391
  mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
7196
7392
  mdb_cassert(csrc, cdst->mc_snum > 1);
@@ -7201,36 +7397,35 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7201
7397
 
7202
7398
  /* Move all nodes from src to dst.
7203
7399
  */
7204
- j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
7205
- if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7400
+ j = nkeys = NUMKEYS(pdst);
7401
+ if (IS_LEAF2(psrc)) {
7206
7402
  key.mv_size = csrc->mc_db->md_pad;
7207
- key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
7208
- for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7403
+ key.mv_data = METADATA(psrc);
7404
+ for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7209
7405
  rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
7210
7406
  if (rc != MDB_SUCCESS)
7211
7407
  return rc;
7212
7408
  key.mv_data = (char *)key.mv_data + key.mv_size;
7213
7409
  }
7214
7410
  } else {
7215
- for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7216
- srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
7217
- if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7218
- unsigned int snum = csrc->mc_snum;
7411
+ for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7412
+ srcnode = NODEPTR(psrc, i);
7413
+ if (i == 0 && IS_BRANCH(psrc)) {
7414
+ MDB_cursor mn;
7219
7415
  MDB_node *s2;
7416
+ mdb_cursor_copy(csrc, &mn);
7220
7417
  /* must find the lowest key below src */
7221
- rc = mdb_page_search_lowest(csrc);
7418
+ rc = mdb_page_search_lowest(&mn);
7222
7419
  if (rc)
7223
7420
  return rc;
7224
- if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7225
- key.mv_size = csrc->mc_db->md_pad;
7226
- key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7421
+ if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7422
+ key.mv_size = mn.mc_db->md_pad;
7423
+ key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
7227
7424
  } else {
7228
- s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7425
+ s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7229
7426
  key.mv_size = NODEKSZ(s2);
7230
7427
  key.mv_data = NODEKEY(s2);
7231
7428
  }
7232
- csrc->mc_snum = snum--;
7233
- csrc->mc_top = snum;
7234
7429
  } else {
7235
7430
  key.mv_size = srcnode->mn_ksize;
7236
7431
  key.mv_data = NODEKEY(srcnode);
@@ -7245,8 +7440,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7245
7440
  }
7246
7441
 
7247
7442
  DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
7248
- cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]),
7249
- (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10));
7443
+ pdst->mp_pgno, NUMKEYS(pdst),
7444
+ (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
7250
7445
 
7251
7446
  /* Unlink the src page from parent and add to free list.
7252
7447
  */
@@ -7262,11 +7457,14 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7262
7457
  }
7263
7458
  csrc->mc_top++;
7264
7459
 
7265
- rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs,
7266
- csrc->mc_pg[csrc->mc_top]->mp_pgno);
7460
+ psrc = csrc->mc_pg[csrc->mc_top];
7461
+ /* If not operating on FreeDB, allow this page to be reused
7462
+ * in this txn. Otherwise just add to free list.
7463
+ */
7464
+ rc = mdb_page_loose(csrc, psrc);
7267
7465
  if (rc)
7268
7466
  return rc;
7269
- if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
7467
+ if (IS_LEAF(psrc))
7270
7468
  csrc->mc_db->md_leaf_pages--;
7271
7469
  else
7272
7470
  csrc->mc_db->md_branch_pages--;
@@ -7274,7 +7472,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7274
7472
  /* Adjust other cursors pointing to mp */
7275
7473
  MDB_cursor *m2, *m3;
7276
7474
  MDB_dbi dbi = csrc->mc_dbi;
7277
- MDB_page *mp = cdst->mc_pg[cdst->mc_top];
7278
7475
 
7279
7476
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7280
7477
  if (csrc->mc_flags & C_SUB)
@@ -7283,8 +7480,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7283
7480
  m3 = m2;
7284
7481
  if (m3 == csrc) continue;
7285
7482
  if (m3->mc_snum < csrc->mc_snum) continue;
7286
- if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
7287
- m3->mc_pg[csrc->mc_top] = mp;
7483
+ if (m3->mc_pg[csrc->mc_top] == psrc) {
7484
+ m3->mc_pg[csrc->mc_top] = pdst;
7288
7485
  m3->mc_ki[csrc->mc_top] += nkeys;
7289
7486
  }
7290
7487
  }
@@ -7525,8 +7722,10 @@ mdb_cursor_del0(MDB_cursor *mc)
7525
7722
  /* if mc points past last node in page, find next sibling */
7526
7723
  if (mc->mc_ki[mc->mc_top] >= nkeys) {
7527
7724
  rc = mdb_cursor_sibling(mc, 1);
7528
- if (rc == MDB_NOTFOUND)
7725
+ if (rc == MDB_NOTFOUND) {
7726
+ mc->mc_flags |= C_EOF;
7529
7727
  rc = MDB_SUCCESS;
7728
+ }
7530
7729
  }
7531
7730
 
7532
7731
  /* Adjust other cursors pointing to mp */
@@ -7541,11 +7740,15 @@ mdb_cursor_del0(MDB_cursor *mc)
7541
7740
  m3->mc_flags |= C_DEL;
7542
7741
  if (m3->mc_ki[mc->mc_top] > ki)
7543
7742
  m3->mc_ki[mc->mc_top]--;
7743
+ else if (mc->mc_db->md_flags & MDB_DUPSORT)
7744
+ m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
7544
7745
  }
7545
7746
  if (m3->mc_ki[mc->mc_top] >= nkeys) {
7546
7747
  rc = mdb_cursor_sibling(m3, 1);
7547
- if (rc == MDB_NOTFOUND)
7748
+ if (rc == MDB_NOTFOUND) {
7749
+ m3->mc_flags |= C_EOF;
7548
7750
  rc = MDB_SUCCESS;
7751
+ }
7549
7752
  }
7550
7753
  }
7551
7754
  }
@@ -7760,8 +7963,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7760
7963
  }
7761
7964
  copy->mp_pgno = mp->mp_pgno;
7762
7965
  copy->mp_flags = mp->mp_flags;
7763
- copy->mp_lower = PAGEHDRSZ;
7764
- copy->mp_upper = env->me_psize;
7966
+ copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
7967
+ copy->mp_upper = env->me_psize - PAGEBASE;
7765
7968
 
7766
7969
  /* prepare to insert */
7767
7970
  for (i=0, j=0; i<nkeys; i++) {
@@ -7801,7 +8004,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7801
8004
  psize += nsize;
7802
8005
  node = NULL;
7803
8006
  } else {
7804
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
8007
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
7805
8008
  psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7806
8009
  if (IS_LEAF(mp)) {
7807
8010
  if (F_ISSET(node->mn_flags, F_BIGDATA))
@@ -7821,7 +8024,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7821
8024
  sepkey.mv_size = newkey->mv_size;
7822
8025
  sepkey.mv_data = newkey->mv_data;
7823
8026
  } else {
7824
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
8027
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
7825
8028
  sepkey.mv_size = node->mn_ksize;
7826
8029
  sepkey.mv_data = NODEKEY(node);
7827
8030
  }
@@ -7902,7 +8105,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7902
8105
  /* Update index for the new key. */
7903
8106
  mc->mc_ki[mc->mc_top] = j;
7904
8107
  } else {
7905
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
8108
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
7906
8109
  rkey.mv_data = NODEKEY(node);
7907
8110
  rkey.mv_size = node->mn_ksize;
7908
8111
  if (IS_LEAF(mp)) {
@@ -7938,7 +8141,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7938
8141
  mp->mp_lower = copy->mp_lower;
7939
8142
  mp->mp_upper = copy->mp_upper;
7940
8143
  memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7941
- env->me_psize - copy->mp_upper);
8144
+ env->me_psize - copy->mp_upper - PAGEBASE);
7942
8145
 
7943
8146
  /* reset back to original page */
7944
8147
  if (newindx < split_indx) {
@@ -8037,7 +8240,568 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
8037
8240
  return mdb_cursor_put(&mc, key, data, flags);
8038
8241
  }
8039
8242
 
8040
- int
8243
+ #ifndef MDB_WBUF
8244
+ #define MDB_WBUF (1024*1024)
8245
+ #endif
8246
+
8247
+ /** State needed for a compacting copy. */
8248
+ typedef struct mdb_copy {
8249
+ pthread_mutex_t mc_mutex;
8250
+ pthread_cond_t mc_cond;
8251
+ char *mc_wbuf[2];
8252
+ char *mc_over[2];
8253
+ MDB_env *mc_env;
8254
+ MDB_txn *mc_txn;
8255
+ int mc_wlen[2];
8256
+ int mc_olen[2];
8257
+ pgno_t mc_next_pgno;
8258
+ HANDLE mc_fd;
8259
+ int mc_status;
8260
+ volatile int mc_new;
8261
+ int mc_toggle;
8262
+
8263
+ } mdb_copy;
8264
+
8265
+ /** Dedicated writer thread for compacting copy. */
8266
+ static THREAD_RET ESECT
8267
+ mdb_env_copythr(void *arg)
8268
+ {
8269
+ mdb_copy *my = arg;
8270
+ char *ptr;
8271
+ int toggle = 0, wsize, rc;
8272
+ #ifdef _WIN32
8273
+ DWORD len;
8274
+ #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8275
+ #else
8276
+ int len;
8277
+ #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8278
+ #endif
8279
+
8280
+ pthread_mutex_lock(&my->mc_mutex);
8281
+ my->mc_new = 0;
8282
+ pthread_cond_signal(&my->mc_cond);
8283
+ for(;;) {
8284
+ while (!my->mc_new)
8285
+ pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8286
+ if (my->mc_new < 0) {
8287
+ my->mc_new = 0;
8288
+ break;
8289
+ }
8290
+ my->mc_new = 0;
8291
+ wsize = my->mc_wlen[toggle];
8292
+ ptr = my->mc_wbuf[toggle];
8293
+ again:
8294
+ while (wsize > 0) {
8295
+ DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
8296
+ if (!rc) {
8297
+ rc = ErrCode();
8298
+ break;
8299
+ } else if (len > 0) {
8300
+ rc = MDB_SUCCESS;
8301
+ ptr += len;
8302
+ wsize -= len;
8303
+ continue;
8304
+ } else {
8305
+ rc = EIO;
8306
+ break;
8307
+ }
8308
+ }
8309
+ if (rc) {
8310
+ my->mc_status = rc;
8311
+ break;
8312
+ }
8313
+ /* If there's an overflow page tail, write it too */
8314
+ if (my->mc_olen[toggle]) {
8315
+ wsize = my->mc_olen[toggle];
8316
+ ptr = my->mc_over[toggle];
8317
+ my->mc_olen[toggle] = 0;
8318
+ goto again;
8319
+ }
8320
+ my->mc_wlen[toggle] = 0;
8321
+ toggle ^= 1;
8322
+ pthread_cond_signal(&my->mc_cond);
8323
+ }
8324
+ pthread_cond_signal(&my->mc_cond);
8325
+ pthread_mutex_unlock(&my->mc_mutex);
8326
+ return (THREAD_RET)0;
8327
+ #undef DO_WRITE
8328
+ }
8329
+
8330
+ /** Tell the writer thread there's a buffer ready to write */
8331
+ static int ESECT
8332
+ mdb_env_cthr_toggle(mdb_copy *my, int st)
8333
+ {
8334
+ int toggle = my->mc_toggle ^ 1;
8335
+ pthread_mutex_lock(&my->mc_mutex);
8336
+ if (my->mc_status) {
8337
+ pthread_mutex_unlock(&my->mc_mutex);
8338
+ return my->mc_status;
8339
+ }
8340
+ while (my->mc_new == 1)
8341
+ pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8342
+ my->mc_new = st;
8343
+ my->mc_toggle = toggle;
8344
+ pthread_cond_signal(&my->mc_cond);
8345
+ pthread_mutex_unlock(&my->mc_mutex);
8346
+ return 0;
8347
+ }
8348
+
8349
+ /** Depth-first tree traversal for compacting copy. */
8350
+ static int ESECT
8351
+ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
8352
+ {
8353
+ MDB_cursor mc;
8354
+ MDB_txn *txn = my->mc_txn;
8355
+ MDB_node *ni;
8356
+ MDB_page *mo, *mp, *leaf;
8357
+ char *buf, *ptr;
8358
+ int rc, toggle;
8359
+ unsigned int i;
8360
+
8361
+ /* Empty DB, nothing to do */
8362
+ if (*pg == P_INVALID)
8363
+ return MDB_SUCCESS;
8364
+
8365
+ mc.mc_snum = 1;
8366
+ mc.mc_top = 0;
8367
+ mc.mc_txn = txn;
8368
+
8369
+ rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL);
8370
+ if (rc)
8371
+ return rc;
8372
+ rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
8373
+ if (rc)
8374
+ return rc;
8375
+
8376
+ /* Make cursor pages writable */
8377
+ buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
8378
+ if (buf == NULL)
8379
+ return ENOMEM;
8380
+
8381
+ for (i=0; i<mc.mc_top; i++) {
8382
+ mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
8383
+ mc.mc_pg[i] = (MDB_page *)ptr;
8384
+ ptr += my->mc_env->me_psize;
8385
+ }
8386
+
8387
+ /* This is writable space for a leaf page. Usually not needed. */
8388
+ leaf = (MDB_page *)ptr;
8389
+
8390
+ toggle = my->mc_toggle;
8391
+ while (mc.mc_snum > 0) {
8392
+ unsigned n;
8393
+ mp = mc.mc_pg[mc.mc_top];
8394
+ n = NUMKEYS(mp);
8395
+
8396
+ if (IS_LEAF(mp)) {
8397
+ if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
8398
+ for (i=0; i<n; i++) {
8399
+ ni = NODEPTR(mp, i);
8400
+ if (ni->mn_flags & F_BIGDATA) {
8401
+ MDB_page *omp;
8402
+ pgno_t pg;
8403
+
8404
+ /* Need writable leaf */
8405
+ if (mp != leaf) {
8406
+ mc.mc_pg[mc.mc_top] = leaf;
8407
+ mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8408
+ mp = leaf;
8409
+ ni = NODEPTR(mp, i);
8410
+ }
8411
+
8412
+ memcpy(&pg, NODEDATA(ni), sizeof(pg));
8413
+ rc = mdb_page_get(txn, pg, &omp, NULL);
8414
+ if (rc)
8415
+ goto done;
8416
+ if (my->mc_wlen[toggle] >= MDB_WBUF) {
8417
+ rc = mdb_env_cthr_toggle(my, 1);
8418
+ if (rc)
8419
+ goto done;
8420
+ toggle = my->mc_toggle;
8421
+ }
8422
+ mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8423
+ memcpy(mo, omp, my->mc_env->me_psize);
8424
+ mo->mp_pgno = my->mc_next_pgno;
8425
+ my->mc_next_pgno += omp->mp_pages;
8426
+ my->mc_wlen[toggle] += my->mc_env->me_psize;
8427
+ if (omp->mp_pages > 1) {
8428
+ my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
8429
+ my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
8430
+ rc = mdb_env_cthr_toggle(my, 1);
8431
+ if (rc)
8432
+ goto done;
8433
+ toggle = my->mc_toggle;
8434
+ }
8435
+ memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
8436
+ } else if (ni->mn_flags & F_SUBDATA) {
8437
+ MDB_db db;
8438
+
8439
+ /* Need writable leaf */
8440
+ if (mp != leaf) {
8441
+ mc.mc_pg[mc.mc_top] = leaf;
8442
+ mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8443
+ mp = leaf;
8444
+ ni = NODEPTR(mp, i);
8445
+ }
8446
+
8447
+ memcpy(&db, NODEDATA(ni), sizeof(db));
8448
+ my->mc_toggle = toggle;
8449
+ rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
8450
+ if (rc)
8451
+ goto done;
8452
+ toggle = my->mc_toggle;
8453
+ memcpy(NODEDATA(ni), &db, sizeof(db));
8454
+ }
8455
+ }
8456
+ }
8457
+ } else {
8458
+ mc.mc_ki[mc.mc_top]++;
8459
+ if (mc.mc_ki[mc.mc_top] < n) {
8460
+ pgno_t pg;
8461
+ again:
8462
+ ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
8463
+ pg = NODEPGNO(ni);
8464
+ rc = mdb_page_get(txn, pg, &mp, NULL);
8465
+ if (rc)
8466
+ goto done;
8467
+ mc.mc_top++;
8468
+ mc.mc_snum++;
8469
+ mc.mc_ki[mc.mc_top] = 0;
8470
+ if (IS_BRANCH(mp)) {
8471
+ /* Whenever we advance to a sibling branch page,
8472
+ * we must proceed all the way down to its first leaf.
8473
+ */
8474
+ mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
8475
+ goto again;
8476
+ } else
8477
+ mc.mc_pg[mc.mc_top] = mp;
8478
+ continue;
8479
+ }
8480
+ }
8481
+ if (my->mc_wlen[toggle] >= MDB_WBUF) {
8482
+ rc = mdb_env_cthr_toggle(my, 1);
8483
+ if (rc)
8484
+ goto done;
8485
+ toggle = my->mc_toggle;
8486
+ }
8487
+ mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8488
+ mdb_page_copy(mo, mp, my->mc_env->me_psize);
8489
+ mo->mp_pgno = my->mc_next_pgno++;
8490
+ my->mc_wlen[toggle] += my->mc_env->me_psize;
8491
+ if (mc.mc_top) {
8492
+ /* Update parent if there is one */
8493
+ ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
8494
+ SETPGNO(ni, mo->mp_pgno);
8495
+ mdb_cursor_pop(&mc);
8496
+ } else {
8497
+ /* Otherwise we're done */
8498
+ *pg = mo->mp_pgno;
8499
+ break;
8500
+ }
8501
+ }
8502
+ done:
8503
+ free(buf);
8504
+ return rc;
8505
+ }
8506
+
8507
+ /** Copy environment with compaction. */
8508
+ static int ESECT
8509
+ mdb_env_copyfd1(MDB_env *env, HANDLE fd)
8510
+ {
8511
+ MDB_meta *mm;
8512
+ MDB_page *mp;
8513
+ mdb_copy my;
8514
+ MDB_txn *txn = NULL;
8515
+ pthread_t thr;
8516
+ int rc;
8517
+
8518
+ #ifdef _WIN32
8519
+ my.mc_mutex = CreateMutex(NULL, FALSE, NULL);
8520
+ my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL);
8521
+ my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
8522
+ if (my.mc_wbuf[0] == NULL)
8523
+ return errno;
8524
+ #else
8525
+ pthread_mutex_init(&my.mc_mutex, NULL);
8526
+ pthread_cond_init(&my.mc_cond, NULL);
8527
+ #ifdef HAVE_MEMALIGN
8528
+ my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
8529
+ if (my.mc_wbuf[0] == NULL)
8530
+ return errno;
8531
+ #else
8532
+ rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2);
8533
+ if (rc)
8534
+ return rc;
8535
+ #endif
8536
+ #endif
8537
+ memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
8538
+ my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
8539
+ my.mc_wlen[0] = 0;
8540
+ my.mc_wlen[1] = 0;
8541
+ my.mc_olen[0] = 0;
8542
+ my.mc_olen[1] = 0;
8543
+ my.mc_next_pgno = 2;
8544
+ my.mc_status = 0;
8545
+ my.mc_new = 1;
8546
+ my.mc_toggle = 0;
8547
+ my.mc_env = env;
8548
+ my.mc_fd = fd;
8549
+ THREAD_CREATE(thr, mdb_env_copythr, &my);
8550
+
8551
+ rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8552
+ if (rc)
8553
+ return rc;
8554
+
8555
+ mp = (MDB_page *)my.mc_wbuf[0];
8556
+ memset(mp, 0, 2*env->me_psize);
8557
+ mp->mp_pgno = 0;
8558
+ mp->mp_flags = P_META;
8559
+ mm = (MDB_meta *)METADATA(mp);
8560
+ mdb_env_init_meta0(env, mm);
8561
+ mm->mm_address = env->me_metas[0]->mm_address;
8562
+
8563
+ mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
8564
+ mp->mp_pgno = 1;
8565
+ mp->mp_flags = P_META;
8566
+ *(MDB_meta *)METADATA(mp) = *mm;
8567
+ mm = (MDB_meta *)METADATA(mp);
8568
+
8569
+ /* Count the number of free pages, subtract from lastpg to find
8570
+ * number of active pages
8571
+ */
8572
+ {
8573
+ MDB_ID freecount = 0;
8574
+ MDB_cursor mc;
8575
+ MDB_val key, data;
8576
+ mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
8577
+ while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
8578
+ freecount += *(MDB_ID *)data.mv_data;
8579
+ freecount += txn->mt_dbs[0].md_branch_pages +
8580
+ txn->mt_dbs[0].md_leaf_pages +
8581
+ txn->mt_dbs[0].md_overflow_pages;
8582
+
8583
+ /* Set metapage 1 */
8584
+ mm->mm_last_pg = txn->mt_next_pgno - freecount - 1;
8585
+ mm->mm_dbs[1] = txn->mt_dbs[1];
8586
+ mm->mm_dbs[1].md_root = mm->mm_last_pg;
8587
+ mm->mm_txnid = 1;
8588
+ }
8589
+ my.mc_wlen[0] = env->me_psize * 2;
8590
+ my.mc_txn = txn;
8591
+ pthread_mutex_lock(&my.mc_mutex);
8592
+ while(my.mc_new)
8593
+ pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8594
+ pthread_mutex_unlock(&my.mc_mutex);
8595
+ rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0);
8596
+ if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
8597
+ rc = mdb_env_cthr_toggle(&my, 1);
8598
+ mdb_env_cthr_toggle(&my, -1);
8599
+ pthread_mutex_lock(&my.mc_mutex);
8600
+ while(my.mc_new)
8601
+ pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8602
+ pthread_mutex_unlock(&my.mc_mutex);
8603
+ THREAD_FINISH(thr);
8604
+
8605
+ mdb_txn_abort(txn);
8606
+ #ifdef _WIN32
8607
+ CloseHandle(my.mc_cond);
8608
+ CloseHandle(my.mc_mutex);
8609
+ _aligned_free(my.mc_wbuf[0]);
8610
+ #else
8611
+ pthread_cond_destroy(&my.mc_cond);
8612
+ pthread_mutex_destroy(&my.mc_mutex);
8613
+ free(my.mc_wbuf[0]);
8614
+ #endif
8615
+ return rc;
8616
+ }
8617
+
8618
+ /** Copy environment as-is. */
8619
+ static int ESECT
8620
+ mdb_env_copyfd0(MDB_env *env, HANDLE fd)
8621
+ {
8622
+ MDB_txn *txn = NULL;
8623
+ int rc;
8624
+ size_t wsize;
8625
+ char *ptr;
8626
+ #ifdef _WIN32
8627
+ DWORD len, w2;
8628
+ #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8629
+ #else
8630
+ ssize_t len;
8631
+ size_t w2;
8632
+ #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8633
+ #endif
8634
+
8635
+ /* Do the lock/unlock of the reader mutex before starting the
8636
+ * write txn. Otherwise other read txns could block writers.
8637
+ */
8638
+ rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8639
+ if (rc)
8640
+ return rc;
8641
+
8642
+ if (env->me_txns) {
8643
+ /* We must start the actual read txn after blocking writers */
8644
+ mdb_txn_reset0(txn, "reset-stage1");
8645
+
8646
+ /* Temporarily block writers until we snapshot the meta pages */
8647
+ LOCK_MUTEX_W(env);
8648
+
8649
+ rc = mdb_txn_renew0(txn);
8650
+ if (rc) {
8651
+ UNLOCK_MUTEX_W(env);
8652
+ goto leave;
8653
+ }
8654
+ }
8655
+
8656
+ wsize = env->me_psize * 2;
8657
+ ptr = env->me_map;
8658
+ w2 = wsize;
8659
+ while (w2 > 0) {
8660
+ DO_WRITE(rc, fd, ptr, w2, len);
8661
+ if (!rc) {
8662
+ rc = ErrCode();
8663
+ break;
8664
+ } else if (len > 0) {
8665
+ rc = MDB_SUCCESS;
8666
+ ptr += len;
8667
+ w2 -= len;
8668
+ continue;
8669
+ } else {
8670
+ /* Non-blocking or async handles are not supported */
8671
+ rc = EIO;
8672
+ break;
8673
+ }
8674
+ }
8675
+ if (env->me_txns)
8676
+ UNLOCK_MUTEX_W(env);
8677
+
8678
+ if (rc)
8679
+ goto leave;
8680
+
8681
+ w2 = txn->mt_next_pgno * env->me_psize;
8682
+ #ifdef WIN32
8683
+ {
8684
+ LARGE_INTEGER fsize;
8685
+ GetFileSizeEx(env->me_fd, &fsize);
8686
+ if (w2 > fsize.QuadPart)
8687
+ w2 = fsize.QuadPart;
8688
+ }
8689
+ #else
8690
+ {
8691
+ struct stat st;
8692
+ fstat(env->me_fd, &st);
8693
+ if (w2 > (size_t)st.st_size)
8694
+ w2 = st.st_size;
8695
+ }
8696
+ #endif
8697
+ wsize = w2 - wsize;
8698
+ while (wsize > 0) {
8699
+ if (wsize > MAX_WRITE)
8700
+ w2 = MAX_WRITE;
8701
+ else
8702
+ w2 = wsize;
8703
+ DO_WRITE(rc, fd, ptr, w2, len);
8704
+ if (!rc) {
8705
+ rc = ErrCode();
8706
+ break;
8707
+ } else if (len > 0) {
8708
+ rc = MDB_SUCCESS;
8709
+ ptr += len;
8710
+ wsize -= len;
8711
+ continue;
8712
+ } else {
8713
+ rc = EIO;
8714
+ break;
8715
+ }
8716
+ }
8717
+
8718
+ leave:
8719
+ mdb_txn_abort(txn);
8720
+ return rc;
8721
+ }
8722
+
8723
+ int ESECT
8724
+ mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
8725
+ {
8726
+ if (flags & MDB_CP_COMPACT)
8727
+ return mdb_env_copyfd1(env, fd);
8728
+ else
8729
+ return mdb_env_copyfd0(env, fd);
8730
+ }
8731
+
8732
+ int ESECT
8733
+ mdb_env_copyfd(MDB_env *env, HANDLE fd)
8734
+ {
8735
+ return mdb_env_copyfd2(env, fd, 0);
8736
+ }
8737
+
8738
+ int ESECT
8739
+ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
8740
+ {
8741
+ int rc, len;
8742
+ char *lpath;
8743
+ HANDLE newfd = INVALID_HANDLE_VALUE;
8744
+
8745
+ if (env->me_flags & MDB_NOSUBDIR) {
8746
+ lpath = (char *)path;
8747
+ } else {
8748
+ len = strlen(path);
8749
+ len += sizeof(DATANAME);
8750
+ lpath = malloc(len);
8751
+ if (!lpath)
8752
+ return ENOMEM;
8753
+ sprintf(lpath, "%s" DATANAME, path);
8754
+ }
8755
+
8756
+ /* The destination path must exist, but the destination file must not.
8757
+ * We don't want the OS to cache the writes, since the source data is
8758
+ * already in the OS cache.
8759
+ */
8760
+ #ifdef _WIN32
8761
+ newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
8762
+ FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
8763
+ #else
8764
+ newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
8765
+ #endif
8766
+ if (newfd == INVALID_HANDLE_VALUE) {
8767
+ rc = ErrCode();
8768
+ goto leave;
8769
+ }
8770
+
8771
+ if (env->me_psize >= env->me_os_psize) {
8772
+ #ifdef O_DIRECT
8773
+ /* Set O_DIRECT if the file system supports it */
8774
+ if ((rc = fcntl(newfd, F_GETFL)) != -1)
8775
+ (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
8776
+ #endif
8777
+ #ifdef F_NOCACHE /* __APPLE__ */
8778
+ rc = fcntl(newfd, F_NOCACHE, 1);
8779
+ if (rc) {
8780
+ rc = ErrCode();
8781
+ goto leave;
8782
+ }
8783
+ #endif
8784
+ }
8785
+
8786
+ rc = mdb_env_copyfd2(env, newfd, flags);
8787
+
8788
+ leave:
8789
+ if (!(env->me_flags & MDB_NOSUBDIR))
8790
+ free(lpath);
8791
+ if (newfd != INVALID_HANDLE_VALUE)
8792
+ if (close(newfd) < 0 && rc == MDB_SUCCESS)
8793
+ rc = ErrCode();
8794
+
8795
+ return rc;
8796
+ }
8797
+
8798
+ int ESECT
8799
+ mdb_env_copy(MDB_env *env, const char *path)
8800
+ {
8801
+ return mdb_env_copy2(env, path, 0);
8802
+ }
8803
+
8804
+ int ESECT
8041
8805
  mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8042
8806
  {
8043
8807
  if ((flag & CHANGEABLE) != flag)
@@ -8049,7 +8813,7 @@ mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8049
8813
  return MDB_SUCCESS;
8050
8814
  }
8051
8815
 
8052
- int
8816
+ int ESECT
8053
8817
  mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8054
8818
  {
8055
8819
  if (!env || !arg)
@@ -8059,7 +8823,7 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8059
8823
  return MDB_SUCCESS;
8060
8824
  }
8061
8825
 
8062
- int
8826
+ int ESECT
8063
8827
  mdb_env_set_userctx(MDB_env *env, void *ctx)
8064
8828
  {
8065
8829
  if (!env)
@@ -8068,13 +8832,13 @@ mdb_env_set_userctx(MDB_env *env, void *ctx)
8068
8832
  return MDB_SUCCESS;
8069
8833
  }
8070
8834
 
8071
- void *
8835
+ void * ESECT
8072
8836
  mdb_env_get_userctx(MDB_env *env)
8073
8837
  {
8074
8838
  return env ? env->me_userctx : NULL;
8075
8839
  }
8076
8840
 
8077
- int
8841
+ int ESECT
8078
8842
  mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8079
8843
  {
8080
8844
  if (!env)
@@ -8085,7 +8849,7 @@ mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8085
8849
  return MDB_SUCCESS;
8086
8850
  }
8087
8851
 
8088
- int
8852
+ int ESECT
8089
8853
  mdb_env_get_path(MDB_env *env, const char **arg)
8090
8854
  {
8091
8855
  if (!env || !arg)
@@ -8095,7 +8859,7 @@ mdb_env_get_path(MDB_env *env, const char **arg)
8095
8859
  return MDB_SUCCESS;
8096
8860
  }
8097
8861
 
8098
- int
8862
+ int ESECT
8099
8863
  mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
8100
8864
  {
8101
8865
  if (!env || !arg)
@@ -8111,7 +8875,7 @@ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
8111
8875
  * @param[out] arg the address of an #MDB_stat structure to receive the stats.
8112
8876
  * @return 0, this function always succeeds.
8113
8877
  */
8114
- static int
8878
+ static int ESECT
8115
8879
  mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8116
8880
  {
8117
8881
  arg->ms_psize = env->me_psize;
@@ -8123,7 +8887,8 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8123
8887
 
8124
8888
  return MDB_SUCCESS;
8125
8889
  }
8126
- int
8890
+
8891
+ int ESECT
8127
8892
  mdb_env_stat(MDB_env *env, MDB_stat *arg)
8128
8893
  {
8129
8894
  int toggle;
@@ -8136,7 +8901,7 @@ mdb_env_stat(MDB_env *env, MDB_stat *arg)
8136
8901
  return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
8137
8902
  }
8138
8903
 
8139
- int
8904
+ int ESECT
8140
8905
  mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8141
8906
  {
8142
8907
  int toggle;
@@ -8145,7 +8910,7 @@ mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8145
8910
  return EINVAL;
8146
8911
 
8147
8912
  toggle = mdb_env_pick_meta(env);
8148
- arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0;
8913
+ arg->me_mapaddr = env->me_metas[toggle]->mm_address;
8149
8914
  arg->me_mapsize = env->me_mapsize;
8150
8915
  arg->me_maxreaders = env->me_maxreaders;
8151
8916
 
@@ -8187,8 +8952,9 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8187
8952
  MDB_val key, data;
8188
8953
  MDB_dbi i;
8189
8954
  MDB_cursor mc;
8955
+ MDB_db dummy;
8190
8956
  int rc, dbflag, exact;
8191
- unsigned int unused = 0;
8957
+ unsigned int unused = 0, seq;
8192
8958
  size_t len;
8193
8959
 
8194
8960
  if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
@@ -8256,7 +9022,6 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8256
9022
  return MDB_INCOMPATIBLE;
8257
9023
  } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
8258
9024
  /* Create if requested */
8259
- MDB_db dummy;
8260
9025
  data.mv_size = sizeof(MDB_db);
8261
9026
  data.mv_data = &dummy;
8262
9027
  memset(&dummy, 0, sizeof(dummy));
@@ -8273,6 +9038,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8273
9038
  txn->mt_dbxs[slot].md_name.mv_size = len;
8274
9039
  txn->mt_dbxs[slot].md_rel = NULL;
8275
9040
  txn->mt_dbflags[slot] = dbflag;
9041
+ /* txn-> and env-> are the same in read txns, use
9042
+ * tmp variable to avoid undefined assignment
9043
+ */
9044
+ seq = ++txn->mt_env->me_dbiseqs[slot];
9045
+ txn->mt_dbiseqs[slot] = seq;
9046
+
8276
9047
  memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
8277
9048
  *dbi = slot;
8278
9049
  mdb_default_cmp(txn, slot);
@@ -8307,10 +9078,14 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
8307
9078
  if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
8308
9079
  return;
8309
9080
  ptr = env->me_dbxs[dbi].md_name.mv_data;
8310
- env->me_dbxs[dbi].md_name.mv_data = NULL;
8311
- env->me_dbxs[dbi].md_name.mv_size = 0;
8312
- env->me_dbflags[dbi] = 0;
8313
- free(ptr);
9081
+ /* If there was no name, this was already closed */
9082
+ if (ptr) {
9083
+ env->me_dbxs[dbi].md_name.mv_data = NULL;
9084
+ env->me_dbxs[dbi].md_name.mv_size = 0;
9085
+ env->me_dbflags[dbi] = 0;
9086
+ env->me_dbiseqs[dbi]++;
9087
+ free(ptr);
9088
+ }
8314
9089
  }
8315
9090
 
8316
9091
  int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
@@ -8420,6 +9195,9 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
8420
9195
  if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
8421
9196
  return EACCES;
8422
9197
 
9198
+ if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi))
9199
+ return MDB_BAD_DBI;
9200
+
8423
9201
  rc = mdb_cursor_open(txn, dbi, &mc);
8424
9202
  if (rc)
8425
9203
  return rc;
@@ -8493,12 +9271,14 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
8493
9271
  return MDB_SUCCESS;
8494
9272
  }
8495
9273
 
8496
- int mdb_env_get_maxkeysize(MDB_env *env)
9274
+ int ESECT
9275
+ mdb_env_get_maxkeysize(MDB_env *env)
8497
9276
  {
8498
9277
  return ENV_MAXKEY(env);
8499
9278
  }
8500
9279
 
8501
- int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
9280
+ int ESECT
9281
+ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8502
9282
  {
8503
9283
  unsigned int i, rdrs;
8504
9284
  MDB_reader *mr;
@@ -8538,7 +9318,8 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8538
9318
  /** Insert pid into list if not already present.
8539
9319
  * return -1 if already present.
8540
9320
  */
8541
- static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
9321
+ static int ESECT
9322
+ mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8542
9323
  {
8543
9324
  /* binary search of pid in list */
8544
9325
  unsigned base = 0;
@@ -8574,7 +9355,8 @@ static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8574
9355
  return 0;
8575
9356
  }
8576
9357
 
8577
- int mdb_reader_check(MDB_env *env, int *dead)
9358
+ int ESECT
9359
+ mdb_reader_check(MDB_env *env, int *dead)
8578
9360
  {
8579
9361
  unsigned int i, j, rdrs;
8580
9362
  MDB_reader *mr;