lmdb 0.4.5 → 0.4.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 14c11d6e3fb6489302524938d95191ac9db4131a
4
- data.tar.gz: e5e16ec5a67bc0e44a30753daf12c46c22f38fc1
3
+ metadata.gz: cfbe0e0fc20cfe471e48ab16b7a6f03a868f5fa0
4
+ data.tar.gz: cd1e90a95ee5eef3bf2bfe0f0db3e389feb22a51
5
5
  SHA512:
6
- metadata.gz: 621f7a59503333b623e1073c382cc0c2832e2e2648abed644faf0e812667a4282887ae5da1a78c67f54c15bcc5d9a504f0eb95dc9e15147d6f80dffaa9b7d5d7
7
- data.tar.gz: c634c5c777b29f49804e3748349e750be8998450d6e0f0bf77a208c3a5a97ea918f7026f51a587a585d7c9421d9c9b486f9a489498c5a43009477479f00afaa4
6
+ metadata.gz: 23b7d820ead899db18c95d87e819b9d8a533a118a86bffe2bd996cd1352ca73bf8c3399150ed7b4fdaea7a5e4ab1901bc3844e14d25672874744572d383a1985
7
+ data.tar.gz: e3d13fe0515fd9ca626ed81526b3be34a4b9f0affe73afef4862033bb8b124c1c84687115f8f5f40a3524a4308d44ac86e877113362a6b08e87f002f081052b5
@@ -1,5 +1,27 @@
1
1
  LMDB 0.9 Change Log
2
2
 
3
+ LMDB 0.9.14 Release (2014/09/15)
4
+ Fix to support 64K page size (ITS#7713)
5
+ Fix to persist decreased as well as increased mapsizes (ITS#7789)
6
+ Fix cursor bug when deleting last node of a DUPSORT key
7
+ Fix mdb_env_info to return FIXEDMAP address
8
+ Fix ambiguous error code from writing to closed DBI (ITS#7825)
9
+ Fix mdb_copy copying past end of file (ITS#7886)
10
+ Fix cursor bugs from page_merge/rebalance
11
+ Fix to dirty fewer pages in deletes (mdb_page_loose())
12
+ Fix mdb_dbi_open creating subDBs (ITS#7917)
13
+ Fix mdb_cursor_get(_DUP) with single value (ITS#7913)
14
+ Fix Windows compat issues in mtests (ITS#7879)
15
+ Add compacting variant of mdb_copy
16
+ Add BigEndian integer key compare code
17
+ Add mdb_dump/mdb_load utilities
18
+
19
+ LMDB 0.9.13 Release (2014/06/18)
20
+ Fix mdb_page_alloc unlimited overflow page search
21
+ Documentation
22
+ Re-fix MDB_CURRENT doc (ITS#7793)
23
+ Fix MDB_GET_MULTIPLE/MDB_NEXT_MULTIPLE doc
24
+
3
25
  LMDB 0.9.12 Release (2014/06/13)
4
26
  Fix MDB_GET_BOTH regression (ITS#7875,#7681)
5
27
  Fix MDB_MULTIPLE writing multiple keys (ITS#7834)
@@ -1,10 +1,10 @@
1
1
  /** @file lmdb.h
2
2
  * @brief Lightning memory-mapped database library
3
3
  *
4
- * @mainpage Lightning Memory-Mapped Database Manager (MDB)
4
+ * @mainpage Lightning Memory-Mapped Database Manager (LMDB)
5
5
  *
6
6
  * @section intro_sec Introduction
7
- * MDB is a Btree-based database management library modeled loosely on the
7
+ * LMDB is a Btree-based database management library modeled loosely on the
8
8
  * BerkeleyDB API, but much simplified. The entire database is exposed
9
9
  * in a memory map, and all data fetches return data directly
10
10
  * from the mapped memory, so no malloc's or memcpy's occur during
@@ -26,10 +26,10 @@
26
26
  * readers, and readers don't block writers.
27
27
  *
28
28
  * Unlike other well-known database mechanisms which use either write-ahead
29
- * transaction logs or append-only data writes, MDB requires no maintenance
29
+ * transaction logs or append-only data writes, LMDB requires no maintenance
30
30
  * during operation. Both write-ahead loggers and append-only databases
31
31
  * require periodic checkpointing and/or compaction of their log or database
32
- * files otherwise they grow without bound. MDB tracks free pages within
32
+ * files otherwise they grow without bound. LMDB tracks free pages within
33
33
  * the database and re-uses them for new write operations, so the database
34
34
  * size does not grow without bound in normal use.
35
35
  *
@@ -49,7 +49,7 @@
49
49
  * stale locks can block further operation.
50
50
  *
51
51
  * Fix: Check for stale readers periodically, using the
52
- * #mdb_reader_check function or the mdb_stat tool. Or just
52
+ * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. Or just
53
53
  * make all programs using the database close it; the lockfile
54
54
  * is always reset on first open of the environment.
55
55
  *
@@ -86,7 +86,7 @@
86
86
  *
87
87
  * - Use an MDB_env* in the process which opened it, without fork()ing.
88
88
  *
89
- * - Do not have open an MDB database twice in the same process at
89
+ * - Do not have an LMDB database open twice in the same process at
90
90
  * the same time. Not even from a plain open() call - close()ing it
91
91
  * breaks flock() advisory locking.
92
92
  *
@@ -109,7 +109,7 @@
109
109
  * - If you do that anyway, do a periodic check for stale readers. Or
110
110
  * close the environment once in a while, so the lockfile can get reset.
111
111
  *
112
- * - Do not use MDB databases on remote filesystems, even between
112
+ * - Do not use LMDB databases on remote filesystems, even between
113
113
  * processes on the same host. This breaks flock() on some OSes,
114
114
  * possibly memory map sync, and certainly sync between programs
115
115
  * on different hosts.
@@ -172,7 +172,7 @@ typedef void *mdb_filehandle_t;
172
172
  typedef int mdb_filehandle_t;
173
173
  #endif
174
174
 
175
- /** @defgroup mdb MDB API
175
+ /** @defgroup mdb LMDB API
176
176
  * @{
177
177
  * @brief OpenLDAP Lightning Memory-Mapped Database Manager
178
178
  */
@@ -184,7 +184,7 @@ typedef int mdb_filehandle_t;
184
184
  /** Library minor version */
185
185
  #define MDB_VERSION_MINOR 9
186
186
  /** Library patch version */
187
- #define MDB_VERSION_PATCH 12
187
+ #define MDB_VERSION_PATCH 14
188
188
 
189
189
  /** Combine args a,b,c into a single integer for easy version comparisons */
190
190
  #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@@ -194,10 +194,10 @@ typedef int mdb_filehandle_t;
194
194
  MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
195
195
 
196
196
  /** The release date of this library version */
197
- #define MDB_VERSION_DATE "June 13, 2014"
197
+ #define MDB_VERSION_DATE "September 15, 2014"
198
198
 
199
199
  /** A stringifier for the version info */
200
- #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
200
+ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")"
201
201
 
202
202
  /** A helper for the stringifier macro */
203
203
  #define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d)
@@ -333,6 +333,15 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
333
333
  #define MDB_MULTIPLE 0x80000
334
334
  /* @} */
335
335
 
336
+ /** @defgroup mdb_copy Copy Flags
337
+ * @{
338
+ */
339
+ /** Compacting copy: Omit free space from copy, and renumber all
340
+ * pages sequentially.
341
+ */
342
+ #define MDB_CP_COMPACT 0x01
343
+ /* @} */
344
+
336
345
  /** @brief Cursor Get operations.
337
346
  *
338
347
  * This is the set of all operations for retrieving data
@@ -345,16 +354,18 @@ typedef enum MDB_cursor_op {
345
354
  MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */
346
355
  MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */
347
356
  MDB_GET_CURRENT, /**< Return key/data at current cursor position */
348
- MDB_GET_MULTIPLE, /**< Return all the duplicate data items at the current
349
- cursor position. Only for #MDB_DUPFIXED */
357
+ MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items
358
+ from current cursor position. Move cursor to prepare
359
+ for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */
350
360
  MDB_LAST, /**< Position at last key/data item */
351
361
  MDB_LAST_DUP, /**< Position at last data item of current key.
352
362
  Only for #MDB_DUPSORT */
353
363
  MDB_NEXT, /**< Position at next data item */
354
364
  MDB_NEXT_DUP, /**< Position at next data item of current key.
355
365
  Only for #MDB_DUPSORT */
356
- MDB_NEXT_MULTIPLE, /**< Return all duplicate data items at the next
357
- cursor position. Only for #MDB_DUPFIXED */
366
+ MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items
367
+ from next cursor position. Move cursor to prepare
368
+ for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */
358
369
  MDB_NEXT_NODUP, /**< Position at first data item of next key */
359
370
  MDB_PREV, /**< Position at previous data item */
360
371
  MDB_PREV_DUP, /**< Position at previous data item of current key.
@@ -384,7 +395,7 @@ typedef enum MDB_cursor_op {
384
395
  #define MDB_PANIC (-30795)
385
396
  /** Environment version mismatch */
386
397
  #define MDB_VERSION_MISMATCH (-30794)
387
- /** File is not a valid MDB file */
398
+ /** File is not a valid LMDB file */
388
399
  #define MDB_INVALID (-30793)
389
400
  /** Environment mapsize reached */
390
401
  #define MDB_MAP_FULL (-30792)
@@ -410,7 +421,10 @@ typedef enum MDB_cursor_op {
410
421
  #define MDB_BAD_TXN (-30782)
411
422
  /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */
412
423
  #define MDB_BAD_VALSIZE (-30781)
413
- #define MDB_LAST_ERRCODE MDB_BAD_VALSIZE
424
+ /** The specified DBI was changed unexpectedly */
425
+ #define MDB_BAD_DBI (-30780)
426
+ /** The last defined error code */
427
+ #define MDB_LAST_ERRCODE MDB_BAD_DBI
414
428
  /** @} */
415
429
 
416
430
  /** @brief Statistics for a database in the environment */
@@ -434,7 +448,7 @@ typedef struct MDB_envinfo {
434
448
  unsigned int me_numreaders; /**< max reader slots used in the environment */
435
449
  } MDB_envinfo;
436
450
 
437
- /** @brief Return the mdb library version information.
451
+ /** @brief Return the LMDB library version information.
438
452
  *
439
453
  * @param[out] major if non-NULL, the library major version number is copied here
440
454
  * @param[out] minor if non-NULL, the library minor version number is copied here
@@ -448,14 +462,14 @@ char *mdb_version(int *major, int *minor, int *patch);
448
462
  * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3)
449
463
  * function. If the error code is greater than or equal to 0, then the string
450
464
  * returned by the system function strerror(3) is returned. If the error code
451
- * is less than 0, an error string corresponding to the MDB library error is
452
- * returned. See @ref errors for a list of MDB-specific error codes.
465
+ * is less than 0, an error string corresponding to the LMDB library error is
466
+ * returned. See @ref errors for a list of LMDB-specific error codes.
453
467
  * @param[in] err The error code
454
468
  * @retval "error message" The description of the error
455
469
  */
456
470
  char *mdb_strerror(int err);
457
471
 
458
- /** @brief Create an MDB environment handle.
472
+ /** @brief Create an LMDB environment handle.
459
473
  *
460
474
  * This function allocates memory for a #MDB_env structure. To release
461
475
  * the allocated memory and discard the handle, call #mdb_env_close().
@@ -488,15 +502,15 @@ int mdb_env_create(MDB_env **env);
488
502
  * how the operating system has allocated memory to shared libraries and other uses.
489
503
  * The feature is highly experimental.
490
504
  * <li>#MDB_NOSUBDIR
491
- * By default, MDB creates its environment in a directory whose
505
+ * By default, LMDB creates its environment in a directory whose
492
506
  * pathname is given in \b path, and creates its data and lock files
493
507
  * under that directory. With this option, \b path is used as-is for
494
508
  * the database main data file. The database lock file is the \b path
495
509
  * with "-lock" appended.
496
510
  * <li>#MDB_RDONLY
497
511
  * Open the environment in read-only mode. No write operations will be
498
- * allowed. MDB will still modify the lock file - except on read-only
499
- * filesystems, where MDB does not use locks.
512
+ * allowed. LMDB will still modify the lock file - except on read-only
513
+ * filesystems, where LMDB does not use locks.
500
514
  * <li>#MDB_WRITEMAP
501
515
  * Use a writeable memory map unless MDB_RDONLY is set. This is faster
502
516
  * and uses fewer mallocs, but loses protection from application bugs
@@ -540,7 +554,7 @@ int mdb_env_create(MDB_env **env);
540
554
  * the user synchronizes its use. Applications that multiplex many
541
555
  * user threads over individual OS threads need this option. Such an
542
556
  * application must also serialize the write transactions in an OS
543
- * thread, since MDB's write locking is unaware of the user threads.
557
+ * thread, since LMDB's write locking is unaware of the user threads.
544
558
  * <li>#MDB_NOLOCK
545
559
  * Don't do any locking. If concurrent access is anticipated, the
546
560
  * caller must manage all concurrency itself. For proper operation
@@ -579,7 +593,7 @@ int mdb_env_create(MDB_env **env);
579
593
  * @return A non-zero error value on failure and 0 on success. Some possible
580
594
  * errors are:
581
595
  * <ul>
582
- * <li>#MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the
596
+ * <li>#MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the
583
597
  * version that created the database environment.
584
598
  * <li>#MDB_INVALID - the environment file headers are corrupted.
585
599
  * <li>ENOENT - the directory specified by the path parameter doesn't exist.
@@ -589,7 +603,7 @@ int mdb_env_create(MDB_env **env);
589
603
  */
590
604
  int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode);
591
605
 
592
- /** @brief Copy an MDB environment to the specified path.
606
+ /** @brief Copy an LMDB environment to the specified path.
593
607
  *
594
608
  * This function may be used to make a backup of an existing environment.
595
609
  * No lockfile is created, since it gets recreated at need.
@@ -605,7 +619,7 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t
605
619
  */
606
620
  int mdb_env_copy(MDB_env *env, const char *path);
607
621
 
608
- /** @brief Copy an MDB environment to the specified file descriptor.
622
+ /** @brief Copy an LMDB environment to the specified file descriptor.
609
623
  *
610
624
  * This function may be used to make a backup of an existing environment.
611
625
  * No lockfile is created, since it gets recreated at need.
@@ -620,7 +634,50 @@ int mdb_env_copy(MDB_env *env, const char *path);
620
634
  */
621
635
  int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
622
636
 
623
- /** @brief Return statistics about the MDB environment.
637
+ /** @brief Copy an LMDB environment to the specified path, with options.
638
+ *
639
+ * This function may be used to make a backup of an existing environment.
640
+ * No lockfile is created, since it gets recreated at need.
641
+ * @note This call can trigger significant file size growth if run in
642
+ * parallel with write transactions, because it employs a read-only
643
+ * transaction. See long-lived transactions under @ref caveats_sec.
644
+ * @param[in] env An environment handle returned by #mdb_env_create(). It
645
+ * must have already been opened successfully.
646
+ * @param[in] path The directory in which the copy will reside. This
647
+ * directory must already exist and be writable but must otherwise be
648
+ * empty.
649
+ * @param[in] flags Special options for this operation. This parameter
650
+ * must be set to 0 or by bitwise OR'ing together one or more of the
651
+ * values described here.
652
+ * <ul>
653
+ * <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free
654
+ * pages and sequentially renumber all pages in output. This option
655
+ * consumes more CPU and runs more slowly than the default.
656
+ * </ul>
657
+ * @return A non-zero error value on failure and 0 on success.
658
+ */
659
+ int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags);
660
+
661
+ /** @brief Copy an LMDB environment to the specified file descriptor,
662
+ * with options.
663
+ *
664
+ * This function may be used to make a backup of an existing environment.
665
+ * No lockfile is created, since it gets recreated at need. See
666
+ * #mdb_env_copy2() for further details.
667
+ * @note This call can trigger significant file size growth if run in
668
+ * parallel with write transactions, because it employs a read-only
669
+ * transaction. See long-lived transactions under @ref caveats_sec.
670
+ * @param[in] env An environment handle returned by #mdb_env_create(). It
671
+ * must have already been opened successfully.
672
+ * @param[in] fd The file descriptor to write the copy to. It must
673
+ * have already been opened for Write access.
674
+ * @param[in] flags Special options for this operation.
675
+ * See #mdb_env_copy2() for options.
676
+ * @return A non-zero error value on failure and 0 on success.
677
+ */
678
+ int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags);
679
+
680
+ /** @brief Return statistics about the LMDB environment.
624
681
  *
625
682
  * @param[in] env An environment handle returned by #mdb_env_create()
626
683
  * @param[out] stat The address of an #MDB_stat structure
@@ -628,7 +685,7 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
628
685
  */
629
686
  int mdb_env_stat(MDB_env *env, MDB_stat *stat);
630
687
 
631
- /** @brief Return information about the MDB environment.
688
+ /** @brief Return information about the LMDB environment.
632
689
  *
633
690
  * @param[in] env An environment handle returned by #mdb_env_create()
634
691
  * @param[out] stat The address of an #MDB_envinfo structure
@@ -639,7 +696,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
639
696
  /** @brief Flush the data buffers to disk.
640
697
  *
641
698
  * Data is always written to disk when #mdb_txn_commit() is called,
642
- * but the operating system may keep it buffered. MDB always flushes
699
+ * but the operating system may keep it buffered. LMDB always flushes
643
700
  * the OS buffers upon commit as well, unless the environment was
644
701
  * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
645
702
  * @param[in] env An environment handle returned by #mdb_env_create()
@@ -730,7 +787,13 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
730
787
  * this process. Note that the library does not check for this condition,
731
788
  * the caller must ensure it explicitly.
732
789
  *
733
- * If the mapsize is changed by another process, #mdb_txn_begin() will
790
+ * The new size takes effect immediately for the current process but
791
+ * will not be persisted to any others until a write transaction has been
792
+ * committed by the current process. Also, only mapsize increases are
793
+ * persisted into the environment.
794
+ *
795
+ * If the mapsize is increased by another process, and data has grown
796
+ * beyond the range of the current mapsize, #mdb_txn_begin() will
734
797
  * return #MDB_MAP_RESIZED. This function may be called with a size
735
798
  * of zero to adopt the new size.
736
799
  *
@@ -822,7 +885,7 @@ int mdb_env_set_userctx(MDB_env *env, void *ctx);
822
885
  */
823
886
  void *mdb_env_get_userctx(MDB_env *env);
824
887
 
825
- /** @brief A callback function for most MDB assert() failures,
888
+ /** @brief A callback function for most LMDB assert() failures,
826
889
  * called before printing the message and aborting.
827
890
  *
828
891
  * @param[in] env An environment handle returned by #mdb_env_create().
@@ -1204,7 +1267,7 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
1204
1267
  * reserved space, which the caller can fill in later - before
1205
1268
  * the next update operation or the transaction ends. This saves
1206
1269
  * an extra memcpy if the data is being generated later.
1207
- * MDB does nothing else with this memory, the caller is expected
1270
+ * LMDB does nothing else with this memory, the caller is expected
1208
1271
  * to modify all of the space requested.
1209
1272
  * <li>#MDB_APPEND - append the given key/data pair to the end of the
1210
1273
  * database. No key comparisons are performed. This option allows
@@ -1345,11 +1408,12 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
1345
1408
  * @param[in] flags Options for this operation. This parameter
1346
1409
  * must be set to 0 or one of the values described here.
1347
1410
  * <ul>
1348
- * <li>#MDB_CURRENT - overwrite the data of the key/data pair to which
1349
- * the cursor refers with the specified data item. The \b key
1350
- * parameter is not used for positioning the cursor, but should
1351
- * still be provided. If using sorted duplicates (#MDB_DUPSORT)
1352
- * the data item must still sort into the same place.
1411
+ * <li>#MDB_CURRENT - replace the item at the current cursor position.
1412
+ * The \b key parameter must still be provided, and must match it.
1413
+ * If using sorted duplicates (#MDB_DUPSORT) the data item must still
1414
+ * sort into the same place. This is intended to be used when the
1415
+ * new data is the same size as the old. Otherwise it will simply
1416
+ * perform a delete of the old record followed by an insert.
1353
1417
  * <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not
1354
1418
  * already appear in the database. This flag may only be specified
1355
1419
  * if the database was opened with #MDB_DUPSORT. The function will
@@ -1478,4 +1542,12 @@ int mdb_reader_check(MDB_env *env, int *dead);
1478
1542
  #ifdef __cplusplus
1479
1543
  }
1480
1544
  #endif
1545
+ /** @page tools LMDB Command Line Tools
1546
+ The following describes the command line tools that are available for LMDB.
1547
+ \li \ref mdb_copy_1
1548
+ \li \ref mdb_dump_1
1549
+ \li \ref mdb_load_1
1550
+ \li \ref mdb_stat_1
1551
+ */
1552
+
1481
1553
  #endif /* _LMDB_H_ */
@@ -1,11 +1,11 @@
1
1
  /** @file mdb.c
2
- * @brief memory-mapped database library
2
+ * @brief Lightning memory-mapped database library
3
3
  *
4
4
  * A Btree-based database management library modeled loosely on the
5
5
  * BerkeleyDB API, but much simplified.
6
6
  */
7
7
  /*
8
- * Copyright 2011-2013 Howard Chu, Symas Corp.
8
+ * Copyright 2011-2014 Howard Chu, Symas Corp.
9
9
  * All rights reserved.
10
10
  *
11
11
  * Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,17 @@
35
35
  #ifndef _GNU_SOURCE
36
36
  #define _GNU_SOURCE 1
37
37
  #endif
38
- #include <sys/types.h>
39
- #include <sys/stat.h>
40
38
  #ifdef _WIN32
39
+ #include <malloc.h>
41
40
  #include <windows.h>
42
41
  /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
43
42
  * as int64 which is wrong. MSVC doesn't define it at all, so just
44
43
  * don't use it.
45
44
  */
46
45
  #define MDB_PID_T int
46
+ #define MDB_THR_T DWORD
47
+ #include <sys/types.h>
48
+ #include <sys/stat.h>
47
49
  #ifdef __GNUC__
48
50
  # include <sys/param.h>
49
51
  #else
@@ -55,7 +57,10 @@
55
57
  # endif
56
58
  #endif
57
59
  #else
60
+ #include <sys/types.h>
61
+ #include <sys/stat.h>
58
62
  #define MDB_PID_T pid_t
63
+ #define MDB_THR_T pthread_t
59
64
  #include <sys/param.h>
60
65
  #include <sys/uio.h>
61
66
  #include <sys/mman.h>
@@ -65,6 +70,16 @@
65
70
  #include <fcntl.h>
66
71
  #endif
67
72
 
73
+ #if defined(__mips) && defined(__linux)
74
+ /* MIPS has cache coherency issues, requires explicit cache control */
75
+ #include <asm/cachectl.h>
76
+ extern int cacheflush(char *addr, int nbytes, int cache);
77
+ #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
78
+ #else
79
+ #define CACHEFLUSH(addr, bytes, cache)
80
+ #endif
81
+
82
+
68
83
  #include <errno.h>
69
84
  #include <limits.h>
70
85
  #include <stddef.h>
@@ -75,6 +90,12 @@
75
90
  #include <time.h>
76
91
  #include <unistd.h>
77
92
 
93
+ #if defined(__sun)
94
+ /* Most platforms have posix_memalign, older may only have memalign */
95
+ #define HAVE_MEMALIGN 1
96
+ #include <malloc.h>
97
+ #endif
98
+
78
99
  #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
79
100
  #include <netinet/in.h>
80
101
  #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
@@ -145,7 +166,18 @@
145
166
  # error "Two's complement, reasonably sized integer types, please"
146
167
  #endif
147
168
 
148
- /** @defgroup internal MDB Internals
169
+ #ifdef __GNUC__
170
+ /** Put infrequently used env functions in separate section */
171
+ # ifdef __APPLE__
172
+ # define ESECT __attribute__ ((section("__TEXT,text_env")))
173
+ # else
174
+ # define ESECT __attribute__ ((section("text_env")))
175
+ # endif
176
+ #else
177
+ #define ESECT
178
+ #endif
179
+
180
+ /** @defgroup internal LMDB Internals
149
181
  * @{
150
182
  */
151
183
  /** @defgroup compat Compatibility Macros
@@ -156,6 +188,11 @@
156
188
  * @{
157
189
  */
158
190
 
191
+ /** Features under development */
192
+ #ifndef MDB_DEVEL
193
+ #define MDB_DEVEL 0
194
+ #endif
195
+
159
196
  /** Wrapper around __func__, which is a C99 feature */
160
197
  #if __STDC_VERSION__ >= 199901L
161
198
  # define mdb_func_ __func__
@@ -169,8 +206,10 @@
169
206
  #ifdef _WIN32
170
207
  #define MDB_USE_HASH 1
171
208
  #define MDB_PIDLOCK 0
172
- #define pthread_t DWORD
209
+ #define THREAD_RET DWORD
210
+ #define pthread_t HANDLE
173
211
  #define pthread_mutex_t HANDLE
212
+ #define pthread_cond_t HANDLE
174
213
  #define pthread_key_t DWORD
175
214
  #define pthread_self() GetCurrentThreadId()
176
215
  #define pthread_key_create(x,y) \
@@ -178,12 +217,16 @@
178
217
  #define pthread_key_delete(x) TlsFree(x)
179
218
  #define pthread_getspecific(x) TlsGetValue(x)
180
219
  #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
181
- #define pthread_mutex_unlock(x) ReleaseMutex(x)
182
- #define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
183
- #define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex)
184
- #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex)
185
- #define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex)
186
- #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex)
220
+ #define pthread_mutex_unlock(x) ReleaseMutex(*x)
221
+ #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
222
+ #define pthread_cond_signal(x) SetEvent(*x)
223
+ #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
224
+ #define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL)
225
+ #define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE)
226
+ #define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_rmutex)
227
+ #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_rmutex)
228
+ #define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_wmutex)
229
+ #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_wmutex)
187
230
  #define getpid() GetCurrentProcessId()
188
231
  #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
189
232
  #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
@@ -198,7 +241,9 @@
198
241
  #endif
199
242
  #define Z "I"
200
243
  #else
201
-
244
+ #define THREAD_RET void *
245
+ #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
246
+ #define THREAD_FINISH(thr) pthread_join(thr,NULL)
202
247
  #define Z "z" /**< printf format modifier for size_t */
203
248
 
204
249
  /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
@@ -352,7 +397,8 @@ static txnid_t mdb_debug_start;
352
397
 
353
398
  /** @brief The maximum size of a database page.
354
399
  *
355
- * This is 32k, since it must fit in #MDB_page.%mp_upper.
400
+ * It is 32k or 64k, since value-PAGEBASE must fit in
401
+ * #MDB_page.%mp_upper.
356
402
  *
357
403
  * LMDB will use database pages < OS pages if needed.
358
404
  * That causes more I/O in write transactions: The OS must
@@ -365,7 +411,7 @@ static txnid_t mdb_debug_start;
365
411
  * pressure from other processes is high. So until OSs have
366
412
  * actual paging support for Huge pages, they're not viable.
367
413
  */
368
- #define MAX_PAGESIZE 0x8000
414
+ #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
369
415
 
370
416
  /** The minimum number of keys required in a database page.
371
417
  * Setting this to a larger value will place a smaller bound on the
@@ -381,14 +427,14 @@ static txnid_t mdb_debug_start;
381
427
  */
382
428
  #define MDB_MINKEYS 2
383
429
 
384
- /** A stamp that identifies a file as an MDB file.
430
+ /** A stamp that identifies a file as an LMDB file.
385
431
  * There's nothing special about this value other than that it is easily
386
432
  * recognizable, and it will reflect any byte order mismatches.
387
433
  */
388
434
  #define MDB_MAGIC 0xBEEFC0DE
389
435
 
390
436
  /** The version number for a database's datafile format. */
391
- #define MDB_DATA_VERSION 1
437
+ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
392
438
  /** The version number for a database's lockfile format. */
393
439
  #define MDB_LOCK_VERSION 1
394
440
 
@@ -397,13 +443,14 @@ static txnid_t mdb_debug_start;
397
443
  * Define this as 0 to compute the max from the page size. 511
398
444
  * is default for backwards compat: liblmdb <= 0.9.10 can break
399
445
  * when modifying a DB with keys/dupsort data bigger than its max.
446
+ * #MDB_DEVEL sets the default to 0.
400
447
  *
401
448
  * Data items in an #MDB_DUPSORT database are also limited to
402
449
  * this size, since they're actually keys of a sub-DB. Keys and
403
450
  * #MDB_DUPSORT data items must fit on a node in a regular page.
404
451
  */
405
452
  #ifndef MDB_MAXKEYSIZE
406
- #define MDB_MAXKEYSIZE 511
453
+ #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
407
454
  #endif
408
455
 
409
456
  /** The maximum size of a key we can write to the environment. */
@@ -537,7 +584,7 @@ typedef struct MDB_rxbody {
537
584
  /** The process ID of the process owning this reader txn. */
538
585
  MDB_PID_T mrb_pid;
539
586
  /** The thread ID of the thread owning this txn. */
540
- pthread_t mrb_tid;
587
+ MDB_THR_T mrb_tid;
541
588
  } MDB_rxbody;
542
589
 
543
590
  /** The actual reader record, with cacheline padding. */
@@ -568,7 +615,7 @@ typedef struct MDB_reader {
568
615
  * unlikely. If a collision occurs, the results are unpredictable.
569
616
  */
570
617
  typedef struct MDB_txbody {
571
- /** Stamp identifying this as an MDB file. It must be set
618
+ /** Stamp identifying this as an LMDB file. It must be set
572
619
  * to #MDB_MAGIC. */
573
620
  uint32_t mtb_magic;
574
621
  /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
@@ -635,7 +682,7 @@ typedef struct MDB_page {
635
682
  #define mp_next mp_p.p_next
636
683
  union {
637
684
  pgno_t p_pgno; /**< page number */
638
- void * p_next; /**< for in-memory list of freed structs */
685
+ struct MDB_page *p_next; /**< for in-memory list of freed pages */
639
686
  } mp_p;
640
687
  uint16_t mp_pad;
641
688
  /** @defgroup mdb_page Page Flags
@@ -650,6 +697,7 @@ typedef struct MDB_page {
650
697
  #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
651
698
  #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
652
699
  #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
700
+ #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
653
701
  #define P_KEEP 0x8000 /**< leave this page alone during spill */
654
702
  /** @} */
655
703
  uint16_t mp_flags; /**< @ref mdb_page */
@@ -672,8 +720,11 @@ typedef struct MDB_page {
672
720
  /** Address of first usable data byte in a page, after the header */
673
721
  #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
674
722
 
723
+ /** ITS#7713, change PAGEBASE to handle 65536 byte pages */
724
+ #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)
725
+
675
726
  /** Number of nodes on a page */
676
- #define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1)
727
+ #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
677
728
 
678
729
  /** The amount of space remaining in the page */
679
730
  #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
@@ -700,6 +751,9 @@ typedef struct MDB_page {
700
751
  /** The number of overflow pages needed to store the given size. */
701
752
  #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
702
753
 
754
+ /** Link in #MDB_txn.%mt_loose_pgs list */
755
+ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
756
+
703
757
  /** Header for a single key/data pair within a page.
704
758
  * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
705
759
  * We guarantee 2-byte alignment for 'MDB_node's.
@@ -751,7 +805,7 @@ typedef struct MDB_node {
751
805
  #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
752
806
 
753
807
  /** Address of node \b i in page \b p */
754
- #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
808
+ #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
755
809
 
756
810
  /** Address of the key for the node */
757
811
  #define NODEKEY(node) (void *)((node)->mn_data)
@@ -841,7 +895,7 @@ typedef struct MDB_db {
841
895
  * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
842
896
  */
843
897
  typedef struct MDB_meta {
844
- /** Stamp identifying this as an MDB file. It must be set
898
+ /** Stamp identifying this as an LMDB file. It must be set
845
899
  * to #MDB_MAGIC. */
846
900
  uint32_t mm_magic;
847
901
  /** Version number of this file. Must be set to #MDB_DATA_VERSION. */
@@ -898,6 +952,12 @@ struct MDB_txn {
898
952
  /** The list of pages that became unused during this transaction.
899
953
  */
900
954
  MDB_IDL mt_free_pgs;
955
+ /** The list of loose pages that became unused and may be reused
956
+ * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
957
+ */
958
+ MDB_page *mt_loose_pgs;
959
+ /** Number of loose pages (#mt_loose_pgs) */
960
+ int mt_loose_count;
901
961
  /** The sorted list of dirty pages we temporarily wrote to disk
902
962
  * because the dirty list was full. page numbers in here are
903
963
  * shifted left by 1, deleted slots have the LSB set.
@@ -913,6 +973,8 @@ struct MDB_txn {
913
973
  MDB_dbx *mt_dbxs;
914
974
  /** Array of MDB_db records for each known DB */
915
975
  MDB_db *mt_dbs;
976
+ /** Array of sequence numbers for each DB handle */
977
+ unsigned int *mt_dbiseqs;
916
978
  /** @defgroup mt_dbflag Transaction DB Flags
917
979
  * @ingroup internal
918
980
  * @{
@@ -1048,12 +1110,15 @@ struct MDB_env {
1048
1110
  MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1049
1111
  void *me_pbuf; /**< scratch area for DUPSORT put() */
1050
1112
  MDB_txn *me_txn; /**< current write transaction */
1113
+ MDB_txn *me_txn0; /**< prealloc'd write transaction */
1051
1114
  size_t me_mapsize; /**< size of the data memory map */
1052
1115
  off_t me_size; /**< current file size */
1053
1116
  pgno_t me_maxpg; /**< me_mapsize / me_psize */
1054
1117
  MDB_dbx *me_dbxs; /**< array of static DB info */
1055
1118
  uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
1119
+ unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
1056
1120
  pthread_key_t me_txkey; /**< thread-key for readers */
1121
+ txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
1057
1122
  MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
1058
1123
  # define me_pglast me_pgstate.mf_pglast
1059
1124
  # define me_pghead me_pgstate.mf_pghead
@@ -1102,6 +1167,10 @@ typedef struct MDB_ntxn {
1102
1167
  #define TXN_DBI_EXIST(txn, dbi) \
1103
1168
  ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID))
1104
1169
 
1170
+ /** Check for misused \b dbi handles */
1171
+ #define TXN_DBI_CHANGED(txn, dbi) \
1172
+ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1173
+
1105
1174
  static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1106
1175
  static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1107
1176
  static int mdb_page_touch(MDB_cursor *mc);
@@ -1182,7 +1251,7 @@ mdb_version(int *major, int *minor, int *patch)
1182
1251
  return MDB_VERSION_STRING;
1183
1252
  }
1184
1253
 
1185
- /** Table of descriptions for MDB @ref errors */
1254
+ /** Table of descriptions for LMDB @ref errors */
1186
1255
  static char *const mdb_errstr[] = {
1187
1256
  "MDB_KEYEXIST: Key/data pair already exists",
1188
1257
  "MDB_NOTFOUND: No matching key/data pair found",
@@ -1190,7 +1259,7 @@ static char *const mdb_errstr[] = {
1190
1259
  "MDB_CORRUPTED: Located page was wrong type",
1191
1260
  "MDB_PANIC: Update of meta page failed",
1192
1261
  "MDB_VERSION_MISMATCH: Database environment version mismatch",
1193
- "MDB_INVALID: File is not an MDB file",
1262
+ "MDB_INVALID: File is not an LMDB file",
1194
1263
  "MDB_MAP_FULL: Environment mapsize limit reached",
1195
1264
  "MDB_DBS_FULL: Environment maxdbs limit reached",
1196
1265
  "MDB_READERS_FULL: Environment maxreaders limit reached",
@@ -1203,11 +1272,20 @@ static char *const mdb_errstr[] = {
1203
1272
  "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1204
1273
  "MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
1205
1274
  "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1275
+ "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1206
1276
  };
1207
1277
 
1208
1278
  char *
1209
1279
  mdb_strerror(int err)
1210
1280
  {
1281
+ #ifdef _WIN32
1282
+ /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1283
+ * This works as long as no function between the call to mdb_strerror
1284
+ * and the actual use of the message uses more than 4K of stack.
1285
+ */
1286
+ char pad[4096];
1287
+ char buf[1024], *ptr = buf;
1288
+ #endif
1211
1289
  int i;
1212
1290
  if (!err)
1213
1291
  return ("Successful return: 0");
@@ -1217,7 +1295,32 @@ mdb_strerror(int err)
1217
1295
  return mdb_errstr[i];
1218
1296
  }
1219
1297
 
1298
+ #ifdef _WIN32
1299
+ /* These are the C-runtime error codes we use. The comment indicates
1300
+ * their numeric value, and the Win32 error they would correspond to
1301
+ * if the error actually came from a Win32 API. A major mess, we should
1302
+ * have used LMDB-specific error codes for everything.
1303
+ */
1304
+ switch(err) {
1305
+ case ENOENT: /* 2, FILE_NOT_FOUND */
1306
+ case EIO: /* 5, ACCESS_DENIED */
1307
+ case ENOMEM: /* 12, INVALID_ACCESS */
1308
+ case EACCES: /* 13, INVALID_DATA */
1309
+ case EBUSY: /* 16, CURRENT_DIRECTORY */
1310
+ case EINVAL: /* 22, BAD_COMMAND */
1311
+ case ENOSPC: /* 28, OUT_OF_PAPER */
1312
+ return strerror(err);
1313
+ default:
1314
+ ;
1315
+ }
1316
+ buf[0] = 0;
1317
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM |
1318
+ FORMAT_MESSAGE_IGNORE_INSERTS,
1319
+ NULL, err, 0, ptr, sizeof(buf), pad);
1320
+ return ptr;
1321
+ #else
1220
1322
  return strerror(err);
1323
+ #endif
1221
1324
  }
1222
1325
 
1223
1326
  /** assert(3) variant in cursor context */
@@ -1357,7 +1460,7 @@ mdb_page_list(MDB_page *mp)
1357
1460
  total = EVEN(total);
1358
1461
  }
1359
1462
  fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1360
- IS_LEAF2(mp) ? PAGEHDRSZ : mp->mp_lower, total, SIZELEFT(mp));
1463
+ IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1361
1464
  }
1362
1465
 
1363
1466
  void
@@ -1485,7 +1588,6 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
1485
1588
  }
1486
1589
  return ret;
1487
1590
  }
1488
-
1489
1591
  /** Free a single page.
1490
1592
  * Saves single pages to a list, for future reuse.
1491
1593
  * (This is not used for multi-page overflow pages.)
@@ -1525,6 +1627,62 @@ mdb_dlist_free(MDB_txn *txn)
1525
1627
  dl[0].mid = 0;
1526
1628
  }
1527
1629
 
1630
+ /** Loosen or free a single page.
1631
+ * Saves single pages to a list for future reuse
1632
+ * in this same txn. It has been pulled from the freeDB
1633
+ * and already resides on the dirty list, but has been
1634
+ * deleted. Use these pages first before pulling again
1635
+ * from the freeDB.
1636
+ *
1637
+ * If the page wasn't dirtied in this txn, just add it
1638
+ * to this txn's free list.
1639
+ */
1640
+ static int
1641
+ mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
1642
+ {
1643
+ int loose = 0;
1644
+ pgno_t pgno = mp->mp_pgno;
1645
+ MDB_txn *txn = mc->mc_txn;
1646
+
1647
+ if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
1648
+ if (txn->mt_parent) {
1649
+ MDB_ID2 *dl = txn->mt_u.dirty_list;
1650
+ /* If txn has a parent, make sure the page is in our
1651
+ * dirty list.
1652
+ */
1653
+ if (dl[0].mid) {
1654
+ unsigned x = mdb_mid2l_search(dl, pgno);
1655
+ if (x <= dl[0].mid && dl[x].mid == pgno) {
1656
+ if (mp != dl[x].mptr) { /* bad cursor? */
1657
+ mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
1658
+ txn->mt_flags |= MDB_TXN_ERROR;
1659
+ return MDB_CORRUPTED;
1660
+ }
1661
+ /* ok, it's ours */
1662
+ loose = 1;
1663
+ }
1664
+ }
1665
+ } else {
1666
+ /* no parent txn, so it's just ours */
1667
+ loose = 1;
1668
+ }
1669
+ }
1670
+ if (loose) {
1671
+ DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
1672
+ mp->mp_pgno));
1673
+ NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
1674
+ txn->mt_loose_pgs = mp;
1675
+ txn->mt_loose_count++;
1676
+ mp->mp_flags |= P_LOOSE;
1677
+ } else {
1678
+ int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
1679
+ if (rc)
1680
+ return rc;
1681
+ }
1682
+
1683
+ return MDB_SUCCESS;
1684
+ }
1685
+
1528
1686
  /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1529
1687
  * @param[in] mc A cursor handle for the current operation.
1530
1688
  * @param[in] pflags Flags of the pages to update:
@@ -1535,7 +1693,7 @@ mdb_dlist_free(MDB_txn *txn)
1535
1693
  static int
1536
1694
  mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1537
1695
  {
1538
- enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
1696
+ enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
1539
1697
  MDB_txn *txn = mc->mc_txn;
1540
1698
  MDB_cursor *m3;
1541
1699
  MDB_xcursor *mx;
@@ -1686,7 +1844,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
1686
1844
  for (i=dl[0].mid; i && need; i--) {
1687
1845
  MDB_ID pn = dl[i].mid << 1;
1688
1846
  dp = dl[i].mptr;
1689
- if (dp->mp_flags & P_KEEP)
1847
+ if (dp->mp_flags & (P_LOOSE|P_KEEP))
1690
1848
  continue;
1691
1849
  /* Can't spill twice, make sure it's not already in a parent's
1692
1850
  * spill list.
@@ -1790,15 +1948,27 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1790
1948
  #else
1791
1949
  enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
1792
1950
  #endif
1793
- int rc, retry = Max_retries;
1951
+ int rc, retry = num * 60;
1794
1952
  MDB_txn *txn = mc->mc_txn;
1795
1953
  MDB_env *env = txn->mt_env;
1796
1954
  pgno_t pgno, *mop = env->me_pghead;
1797
- unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1;
1955
+ unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
1798
1956
  MDB_page *np;
1799
1957
  txnid_t oldest = 0, last;
1800
1958
  MDB_cursor_op op;
1801
1959
  MDB_cursor m2;
1960
+ int found_old = 0;
1961
+
1962
+ /* If there are any loose pages, just use them */
1963
+ if (num == 1 && txn->mt_loose_pgs) {
1964
+ np = txn->mt_loose_pgs;
1965
+ txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
1966
+ txn->mt_loose_count--;
1967
+ DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
1968
+ np->mp_pgno));
1969
+ *mp = np;
1970
+ return MDB_SUCCESS;
1971
+ }
1802
1972
 
1803
1973
  *mp = NULL;
1804
1974
 
@@ -1811,7 +1981,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1811
1981
  for (op = MDB_FIRST;; op = MDB_NEXT) {
1812
1982
  MDB_val key, data;
1813
1983
  MDB_node *leaf;
1814
- pgno_t *idl, old_id, new_id;
1984
+ pgno_t *idl;
1815
1985
 
1816
1986
  /* Seek a big enough contiguous page range. Prefer
1817
1987
  * pages at the tail, just truncating the list.
@@ -1823,14 +1993,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1823
1993
  if (mop[i-n2] == pgno+n2)
1824
1994
  goto search_done;
1825
1995
  } while (--i > n2);
1826
- if (Max_retries < INT_MAX && --retry < 0)
1996
+ if (--retry < 0)
1827
1997
  break;
1828
1998
  }
1829
1999
 
1830
2000
  if (op == MDB_FIRST) { /* 1st iteration */
1831
2001
  /* Prepare to fetch more and coalesce */
1832
- oldest = mdb_find_oldest(txn);
1833
2002
  last = env->me_pglast;
2003
+ oldest = env->me_pgoldest;
1834
2004
  mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
1835
2005
  if (last) {
1836
2006
  op = MDB_SET_RANGE;
@@ -1845,8 +2015,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1845
2015
 
1846
2016
  last++;
1847
2017
  /* Do not fetch more if the record will be too recent */
1848
- if (oldest <= last)
1849
- break;
2018
+ if (oldest <= last) {
2019
+ if (!found_old) {
2020
+ oldest = mdb_find_oldest(txn);
2021
+ env->me_pgoldest = oldest;
2022
+ found_old = 1;
2023
+ }
2024
+ if (oldest <= last)
2025
+ break;
2026
+ }
1850
2027
  rc = mdb_cursor_get(&m2, &key, NULL, op);
1851
2028
  if (rc) {
1852
2029
  if (rc == MDB_NOTFOUND)
@@ -1854,8 +2031,15 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1854
2031
  goto fail;
1855
2032
  }
1856
2033
  last = *(txnid_t*)key.mv_data;
1857
- if (oldest <= last)
1858
- break;
2034
+ if (oldest <= last) {
2035
+ if (!found_old) {
2036
+ oldest = mdb_find_oldest(txn);
2037
+ env->me_pgoldest = oldest;
2038
+ found_old = 1;
2039
+ }
2040
+ if (oldest <= last)
2041
+ break;
2042
+ }
1859
2043
  np = m2.mc_pg[m2.mc_top];
1860
2044
  leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
1861
2045
  if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
@@ -1877,21 +2061,12 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1877
2061
  #if (MDB_DEBUG) > 1
1878
2062
  DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
1879
2063
  last, txn->mt_dbs[FREE_DBI].md_root, i));
1880
- for (k = i; k; k--)
1881
- DPRINTF(("IDL %"Z"u", idl[k]));
2064
+ for (j = i; j; j--)
2065
+ DPRINTF(("IDL %"Z"u", idl[j]));
1882
2066
  #endif
1883
2067
  /* Merge in descending sorted order */
1884
- j = mop_len;
1885
- k = mop_len += i;
1886
- mop[0] = (pgno_t)-1;
1887
- old_id = mop[j];
1888
- while (i) {
1889
- new_id = idl[i--];
1890
- for (; old_id < new_id; old_id = mop[--j])
1891
- mop[k--] = old_id;
1892
- mop[k--] = new_id;
1893
- }
1894
- mop[0] = mop_len;
2068
+ mdb_midl_xmerge(mop, idl);
2069
+ mop_len = mop[0];
1895
2070
  }
1896
2071
 
1897
2072
  /* Use new pages from the map when nothing suitable in the freeDB */
@@ -1946,8 +2121,8 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1946
2121
  * alignment so memcpy may copy words instead of bytes.
1947
2122
  */
1948
2123
  if ((unused &= -Align) && !IS_LEAF2(src)) {
1949
- upper &= -Align;
1950
- memcpy(dst, src, (lower + (Align-1)) & -Align);
2124
+ upper = (upper + PAGEBASE) & -Align;
2125
+ memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
1951
2126
  memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
1952
2127
  psize - upper);
1953
2128
  } else {
@@ -2314,7 +2489,7 @@ mdb_txn_renew0(MDB_txn *txn)
2314
2489
  return MDB_BAD_RSLOT;
2315
2490
  } else {
2316
2491
  MDB_PID_T pid = env->me_pid;
2317
- pthread_t tid = pthread_self();
2492
+ MDB_THR_T tid = pthread_self();
2318
2493
 
2319
2494
  if (!env->me_live_reader) {
2320
2495
  rc = mdb_reader_pid(env, Pidset, pid);
@@ -2373,6 +2548,7 @@ mdb_txn_renew0(MDB_txn *txn)
2373
2548
  txn->mt_free_pgs[0] = 0;
2374
2549
  txn->mt_spill_pgs = NULL;
2375
2550
  env->me_txn = txn;
2551
+ memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
2376
2552
  }
2377
2553
 
2378
2554
  /* Copy the DB info and flags */
@@ -2447,23 +2623,39 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2447
2623
  tsize = sizeof(MDB_ntxn);
2448
2624
  }
2449
2625
  size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
2450
- if (!(flags & MDB_RDONLY))
2626
+ if (!(flags & MDB_RDONLY)) {
2627
+ if (!parent) {
2628
+ txn = env->me_txn0;
2629
+ goto ok;
2630
+ }
2451
2631
  size += env->me_maxdbs * sizeof(MDB_cursor *);
2632
+ /* child txns use parent's dbiseqs */
2633
+ if (!parent)
2634
+ size += env->me_maxdbs * sizeof(unsigned int);
2635
+ }
2452
2636
 
2453
2637
  if ((txn = calloc(1, size)) == NULL) {
2454
- DPRINTF(("calloc: %s", strerror(ErrCode())));
2638
+ DPRINTF(("calloc: %s", strerror(errno)));
2455
2639
  return ENOMEM;
2456
2640
  }
2457
2641
  txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2458
2642
  if (flags & MDB_RDONLY) {
2459
2643
  txn->mt_flags |= MDB_TXN_RDONLY;
2460
2644
  txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
2645
+ txn->mt_dbiseqs = env->me_dbiseqs;
2461
2646
  } else {
2462
2647
  txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2463
- txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2648
+ if (parent) {
2649
+ txn->mt_dbiseqs = parent->mt_dbiseqs;
2650
+ txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2651
+ } else {
2652
+ txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
2653
+ txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
2654
+ }
2464
2655
  }
2465
2656
  txn->mt_env = env;
2466
2657
 
2658
+ ok:
2467
2659
  if (parent) {
2468
2660
  unsigned int i;
2469
2661
  txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
@@ -2506,9 +2698,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2506
2698
  } else {
2507
2699
  rc = mdb_txn_renew0(txn);
2508
2700
  }
2509
- if (rc)
2510
- free(txn);
2511
- else {
2701
+ if (rc) {
2702
+ if (txn != env->me_txn0)
2703
+ free(txn);
2704
+ } else {
2512
2705
  *ret = txn;
2513
2706
  DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2514
2707
  txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
@@ -2540,10 +2733,13 @@ mdb_dbis_update(MDB_txn *txn, int keep)
2540
2733
  env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2541
2734
  } else {
2542
2735
  char *ptr = env->me_dbxs[i].md_name.mv_data;
2543
- env->me_dbxs[i].md_name.mv_data = NULL;
2544
- env->me_dbxs[i].md_name.mv_size = 0;
2545
- env->me_dbflags[i] = 0;
2546
- free(ptr);
2736
+ if (ptr) {
2737
+ env->me_dbxs[i].md_name.mv_data = NULL;
2738
+ env->me_dbxs[i].md_name.mv_size = 0;
2739
+ env->me_dbflags[i] = 0;
2740
+ env->me_dbiseqs[i]++;
2741
+ free(ptr);
2742
+ }
2547
2743
  }
2548
2744
  }
2549
2745
  }
@@ -2632,7 +2828,8 @@ mdb_txn_abort(MDB_txn *txn)
2632
2828
  if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
2633
2829
  txn->mt_u.reader->mr_pid = 0;
2634
2830
 
2635
- free(txn);
2831
+ if (txn != txn->mt_env->me_txn0)
2832
+ free(txn);
2636
2833
  }
2637
2834
 
2638
2835
  /** Save the freelist as of this transaction to the freeDB.
@@ -2661,6 +2858,19 @@ mdb_freelist_save(MDB_txn *txn)
2661
2858
  return rc;
2662
2859
  }
2663
2860
 
2861
+ if (!env->me_pghead && txn->mt_loose_pgs) {
2862
+ /* Put loose page numbers in mt_free_pgs, since
2863
+ * we may be unable to return them to me_pghead.
2864
+ */
2865
+ MDB_page *mp = txn->mt_loose_pgs;
2866
+ if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
2867
+ return rc;
2868
+ for (; mp; mp = NEXT_LOOSE_PAGE(mp))
2869
+ mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2870
+ txn->mt_loose_pgs = NULL;
2871
+ txn->mt_loose_count = 0;
2872
+ }
2873
+
2664
2874
  /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2665
2875
  clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2666
2876
  ? SSIZE_MAX : maxfree_1pg;
@@ -2722,7 +2932,7 @@ mdb_freelist_save(MDB_txn *txn)
2722
2932
  }
2723
2933
 
2724
2934
  mop = env->me_pghead;
2725
- mop_len = mop ? mop[0] : 0;
2935
+ mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
2726
2936
 
2727
2937
  /* Reserve records for me_pghead[]. Split it if multi-page,
2728
2938
  * to avoid searching freeDB for a page range. Use keys in
@@ -2762,6 +2972,28 @@ mdb_freelist_save(MDB_txn *txn)
2762
2972
  total_room += head_room;
2763
2973
  }
2764
2974
 
2975
+ /* Return loose page numbers to me_pghead, though usually none are
2976
+ * left at this point. The pages themselves remain in dirty_list.
2977
+ */
2978
+ if (txn->mt_loose_pgs) {
2979
+ MDB_page *mp = txn->mt_loose_pgs;
2980
+ unsigned count = txn->mt_loose_count;
2981
+ MDB_IDL loose;
2982
+ /* Room for loose pages + temp IDL with same */
2983
+ if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
2984
+ return rc;
2985
+ mop = env->me_pghead;
2986
+ loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
2987
+ for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
2988
+ loose[ ++count ] = mp->mp_pgno;
2989
+ loose[0] = count;
2990
+ mdb_midl_sort(loose);
2991
+ mdb_midl_xmerge(mop, loose);
2992
+ txn->mt_loose_pgs = NULL;
2993
+ txn->mt_loose_count = 0;
2994
+ mop_len = mop[0];
2995
+ }
2996
+
2765
2997
  /* Fill in the reserved me_pghead records */
2766
2998
  rc = MDB_SUCCESS;
2767
2999
  if (mop_len) {
@@ -2823,8 +3055,8 @@ mdb_page_flush(MDB_txn *txn, int keep)
2823
3055
  while (++i <= pagecount) {
2824
3056
  dp = dl[i].mptr;
2825
3057
  /* Don't flush this page yet */
2826
- if (dp->mp_flags & P_KEEP) {
2827
- dp->mp_flags ^= P_KEEP;
3058
+ if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3059
+ dp->mp_flags &= ~P_KEEP;
2828
3060
  dl[++j] = dl[i];
2829
3061
  continue;
2830
3062
  }
@@ -2838,8 +3070,8 @@ mdb_page_flush(MDB_txn *txn, int keep)
2838
3070
  if (++i <= pagecount) {
2839
3071
  dp = dl[i].mptr;
2840
3072
  /* Don't flush this page yet */
2841
- if (dp->mp_flags & P_KEEP) {
2842
- dp->mp_flags ^= P_KEEP;
3073
+ if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3074
+ dp->mp_flags &= ~P_KEEP;
2843
3075
  dl[i].mid = 0;
2844
3076
  continue;
2845
3077
  }
@@ -2914,6 +3146,12 @@ mdb_page_flush(MDB_txn *txn, int keep)
2914
3146
  #endif /* _WIN32 */
2915
3147
  }
2916
3148
 
3149
+ /* MIPS has cache coherency issues, this is a no-op everywhere else
3150
+ * Note: for any size >= on-chip cache size, entire on-chip cache is
3151
+ * flushed.
3152
+ */
3153
+ CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
3154
+
2917
3155
  for (i = keep; ++i <= pagecount; ) {
2918
3156
  dp = dl[i].mptr;
2919
3157
  /* This is a page we skipped above */
@@ -2968,6 +3206,7 @@ mdb_txn_commit(MDB_txn *txn)
2968
3206
 
2969
3207
  if (txn->mt_parent) {
2970
3208
  MDB_txn *parent = txn->mt_parent;
3209
+ MDB_page **lp;
2971
3210
  MDB_ID2L dst, src;
2972
3211
  MDB_IDL pspill;
2973
3212
  unsigned x, y, len, ps_len;
@@ -3065,6 +3304,12 @@ mdb_txn_commit(MDB_txn *txn)
3065
3304
  }
3066
3305
  }
3067
3306
 
3307
+ /* Append our loose page list to parent's */
3308
+ for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
3309
+ ;
3310
+ *lp = txn->mt_loose_pgs;
3311
+ parent->mt_loose_count += txn->mt_loose_count;
3312
+
3068
3313
  parent->mt_child = NULL;
3069
3314
  mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3070
3315
  free(txn);
@@ -3096,6 +3341,10 @@ mdb_txn_commit(MDB_txn *txn)
3096
3341
  mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3097
3342
  for (i = 2; i < txn->mt_numdbs; i++) {
3098
3343
  if (txn->mt_dbflags[i] & DB_DIRTY) {
3344
+ if (TXN_DBI_CHANGED(txn, i)) {
3345
+ rc = MDB_BAD_DBI;
3346
+ goto fail;
3347
+ }
3099
3348
  data.mv_data = &txn->mt_dbs[i];
3100
3349
  rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
3101
3350
  if (rc)
@@ -3122,6 +3371,10 @@ mdb_txn_commit(MDB_txn *txn)
3122
3371
  (rc = mdb_env_write_meta(txn)))
3123
3372
  goto fail;
3124
3373
 
3374
+ /* Free P_LOOSE pages left behind in dirty_list */
3375
+ if (!(env->me_flags & MDB_WRITEMAP))
3376
+ mdb_dlist_free(txn);
3377
+
3125
3378
  done:
3126
3379
  env->me_pglast = 0;
3127
3380
  env->me_txn = NULL;
@@ -3129,7 +3382,8 @@ done:
3129
3382
 
3130
3383
  if (env->me_txns)
3131
3384
  UNLOCK_MUTEX_W(env);
3132
- free(txn);
3385
+ if (txn != env->me_txn0)
3386
+ free(txn);
3133
3387
 
3134
3388
  return MDB_SUCCESS;
3135
3389
 
@@ -3144,7 +3398,7 @@ fail:
3144
3398
  * @param[out] meta address of where to store the meta information
3145
3399
  * @return 0 on success, non-zero on failure.
3146
3400
  */
3147
- static int
3401
+ static int ESECT
3148
3402
  mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3149
3403
  {
3150
3404
  MDB_metabuf pbuf;
@@ -3202,12 +3456,26 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3202
3456
  return 0;
3203
3457
  }
3204
3458
 
3459
+ static void ESECT
3460
+ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
3461
+ {
3462
+ meta->mm_magic = MDB_MAGIC;
3463
+ meta->mm_version = MDB_DATA_VERSION;
3464
+ meta->mm_mapsize = env->me_mapsize;
3465
+ meta->mm_psize = env->me_psize;
3466
+ meta->mm_last_pg = 1;
3467
+ meta->mm_flags = env->me_flags & 0xffff;
3468
+ meta->mm_flags |= MDB_INTEGERKEY;
3469
+ meta->mm_dbs[0].md_root = P_INVALID;
3470
+ meta->mm_dbs[1].md_root = P_INVALID;
3471
+ }
3472
+
3205
3473
  /** Write the environment parameters of a freshly created DB environment.
3206
3474
  * @param[in] env the environment handle
3207
3475
  * @param[out] meta address of where to store the meta information
3208
3476
  * @return 0 on success, non-zero on failure.
3209
3477
  */
3210
- static int
3478
+ static int ESECT
3211
3479
  mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3212
3480
  {
3213
3481
  MDB_page *p, *q;
@@ -3231,15 +3499,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3231
3499
 
3232
3500
  psize = env->me_psize;
3233
3501
 
3234
- meta->mm_magic = MDB_MAGIC;
3235
- meta->mm_version = MDB_DATA_VERSION;
3236
- meta->mm_mapsize = env->me_mapsize;
3237
- meta->mm_psize = psize;
3238
- meta->mm_last_pg = 1;
3239
- meta->mm_flags = env->me_flags & 0xffff;
3240
- meta->mm_flags |= MDB_INTEGERKEY;
3241
- meta->mm_dbs[0].md_root = P_INVALID;
3242
- meta->mm_dbs[1].md_root = P_INVALID;
3502
+ mdb_env_init_meta0(env, meta);
3243
3503
 
3244
3504
  p = calloc(2, psize);
3245
3505
  p->mp_pgno = 0;
@@ -3271,6 +3531,7 @@ mdb_env_write_meta(MDB_txn *txn)
3271
3531
  {
3272
3532
  MDB_env *env;
3273
3533
  MDB_meta meta, metab, *mp;
3534
+ size_t mapsize;
3274
3535
  off_t off;
3275
3536
  int rc, len, toggle;
3276
3537
  char *ptr;
@@ -3287,11 +3548,13 @@ mdb_env_write_meta(MDB_txn *txn)
3287
3548
 
3288
3549
  env = txn->mt_env;
3289
3550
  mp = env->me_metas[toggle];
3551
+ mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
3552
+ /* Persist any increases of mapsize config */
3553
+ if (mapsize < env->me_mapsize)
3554
+ mapsize = env->me_mapsize;
3290
3555
 
3291
3556
  if (env->me_flags & MDB_WRITEMAP) {
3292
- /* Persist any increases of mapsize config */
3293
- if (env->me_mapsize > mp->mm_mapsize)
3294
- mp->mm_mapsize = env->me_mapsize;
3557
+ mp->mm_mapsize = mapsize;
3295
3558
  mp->mm_dbs[0] = txn->mt_dbs[0];
3296
3559
  mp->mm_dbs[1] = txn->mt_dbs[1];
3297
3560
  mp->mm_last_pg = txn->mt_next_pgno - 1;
@@ -3318,22 +3581,15 @@ mdb_env_write_meta(MDB_txn *txn)
3318
3581
  metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
3319
3582
  metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
3320
3583
 
3321
- ptr = (char *)&meta;
3322
- if (env->me_mapsize > mp->mm_mapsize) {
3323
- /* Persist any increases of mapsize config */
3324
- meta.mm_mapsize = env->me_mapsize;
3325
- off = offsetof(MDB_meta, mm_mapsize);
3326
- } else {
3327
- off = offsetof(MDB_meta, mm_dbs[0].md_depth);
3328
- }
3329
- len = sizeof(MDB_meta) - off;
3330
-
3331
- ptr += off;
3584
+ meta.mm_mapsize = mapsize;
3332
3585
  meta.mm_dbs[0] = txn->mt_dbs[0];
3333
3586
  meta.mm_dbs[1] = txn->mt_dbs[1];
3334
3587
  meta.mm_last_pg = txn->mt_next_pgno - 1;
3335
3588
  meta.mm_txnid = txn->mt_txnid;
3336
3589
 
3590
+ off = offsetof(MDB_meta, mm_mapsize);
3591
+ ptr = (char *)&meta + off;
3592
+ len = sizeof(MDB_meta) - off;
3337
3593
  if (toggle)
3338
3594
  off += env->me_psize;
3339
3595
  off += PAGEHDRSZ;
@@ -3372,6 +3628,8 @@ fail:
3372
3628
  env->me_flags |= MDB_FATAL_ERROR;
3373
3629
  return rc;
3374
3630
  }
3631
+ /* MIPS has cache coherency issues, this is a no-op everywhere else */
3632
+ CACHEFLUSH(env->me_map + off, len, DCACHE);
3375
3633
  done:
3376
3634
  /* Memory ordering issues are irrelevant; since the entire writer
3377
3635
  * is wrapped by wmutex, all of these changes will become visible
@@ -3395,7 +3653,7 @@ mdb_env_pick_meta(const MDB_env *env)
3395
3653
  return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
3396
3654
  }
3397
3655
 
3398
- int
3656
+ int ESECT
3399
3657
  mdb_env_create(MDB_env **env)
3400
3658
  {
3401
3659
  MDB_env *e;
@@ -3420,8 +3678,8 @@ mdb_env_create(MDB_env **env)
3420
3678
  return MDB_SUCCESS;
3421
3679
  }
3422
3680
 
3423
- static int
3424
- mdb_env_map(MDB_env *env, void *addr, int newsize)
3681
+ static int ESECT
3682
+ mdb_env_map(MDB_env *env, void *addr)
3425
3683
  {
3426
3684
  MDB_page *p;
3427
3685
  unsigned int flags = env->me_flags;
@@ -3429,18 +3687,28 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3429
3687
  int rc;
3430
3688
  HANDLE mh;
3431
3689
  LONG sizelo, sizehi;
3432
- sizelo = env->me_mapsize & 0xffffffff;
3433
- sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */
3690
+ size_t msize;
3434
3691
 
3435
- /* Windows won't create mappings for zero length files.
3436
- * Just allocate the maxsize right now.
3437
- */
3438
- if (newsize) {
3692
+ if (flags & MDB_RDONLY) {
3693
+ /* Don't set explicit map size, use whatever exists */
3694
+ msize = 0;
3695
+ sizelo = 0;
3696
+ sizehi = 0;
3697
+ } else {
3698
+ msize = env->me_mapsize;
3699
+ sizelo = msize & 0xffffffff;
3700
+ sizehi = msize >> 16 >> 16; /* only needed on Win64 */
3701
+
3702
+ /* Windows won't create mappings for zero length files.
3703
+ * It also won't map more than the file size.
3704
+ * Just set the maxsize right now.
3705
+ */
3439
3706
  if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
3440
3707
  || !SetEndOfFile(env->me_fd)
3441
3708
  || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)
3442
3709
  return ErrCode();
3443
3710
  }
3711
+
3444
3712
  mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
3445
3713
  PAGE_READWRITE : PAGE_READONLY,
3446
3714
  sizehi, sizelo, NULL);
@@ -3448,7 +3716,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3448
3716
  return ErrCode();
3449
3717
  env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
3450
3718
  FILE_MAP_WRITE : FILE_MAP_READ,
3451
- 0, 0, env->me_mapsize, addr);
3719
+ 0, 0, msize, addr);
3452
3720
  rc = env->me_map ? 0 : ErrCode();
3453
3721
  CloseHandle(mh);
3454
3722
  if (rc)
@@ -3494,7 +3762,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize)
3494
3762
  return MDB_SUCCESS;
3495
3763
  }
3496
3764
 
3497
- int
3765
+ int ESECT
3498
3766
  mdb_env_set_mapsize(MDB_env *env, size_t size)
3499
3767
  {
3500
3768
  /* If env is already open, caller is responsible for making
@@ -3518,7 +3786,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3518
3786
  munmap(env->me_map, env->me_mapsize);
3519
3787
  env->me_mapsize = size;
3520
3788
  old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
3521
- rc = mdb_env_map(env, old, 1);
3789
+ rc = mdb_env_map(env, old);
3522
3790
  if (rc)
3523
3791
  return rc;
3524
3792
  }
@@ -3528,7 +3796,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size)
3528
3796
  return MDB_SUCCESS;
3529
3797
  }
3530
3798
 
3531
- int
3799
+ int ESECT
3532
3800
  mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3533
3801
  {
3534
3802
  if (env->me_map)
@@ -3537,7 +3805,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3537
3805
  return MDB_SUCCESS;
3538
3806
  }
3539
3807
 
3540
- int
3808
+ int ESECT
3541
3809
  mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3542
3810
  {
3543
3811
  if (env->me_map || readers < 1)
@@ -3546,7 +3814,7 @@ mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3546
3814
  return MDB_SUCCESS;
3547
3815
  }
3548
3816
 
3549
- int
3817
+ int ESECT
3550
3818
  mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3551
3819
  {
3552
3820
  if (!env || !readers)
@@ -3555,9 +3823,9 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3555
3823
  return MDB_SUCCESS;
3556
3824
  }
3557
3825
 
3558
- /** Further setup required for opening an MDB environment
3826
+ /** Further setup required for opening an LMDB environment
3559
3827
  */
3560
- static int
3828
+ static int ESECT
3561
3829
  mdb_env_open2(MDB_env *env)
3562
3830
  {
3563
3831
  unsigned int flags = env->me_flags;
@@ -3602,7 +3870,7 @@ mdb_env_open2(MDB_env *env)
3602
3870
  env->me_mapsize = minsize;
3603
3871
  }
3604
3872
 
3605
- rc = mdb_env_map(env, meta.mm_address, newenv || env->me_mapsize != meta.mm_mapsize);
3873
+ rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
3606
3874
  if (rc)
3607
3875
  return rc;
3608
3876
 
@@ -3714,7 +3982,7 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
3714
3982
  #endif
3715
3983
 
3716
3984
  /** Downgrade the exclusive lock on the region back to shared */
3717
- static int
3985
+ static int ESECT
3718
3986
  mdb_env_share_locks(MDB_env *env, int *excl)
3719
3987
  {
3720
3988
  int rc = 0, toggle = mdb_env_pick_meta(env);
@@ -3756,7 +4024,7 @@ mdb_env_share_locks(MDB_env *env, int *excl)
3756
4024
  /** Try to get exclusive lock, otherwise shared.
3757
4025
  * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
3758
4026
  */
3759
- static int
4027
+ static int ESECT
3760
4028
  mdb_env_excl_lock(MDB_env *env, int *excl)
3761
4029
  {
3762
4030
  int rc = 0;
@@ -3891,14 +4159,14 @@ mdb_hash_enc(MDB_val *val, char *encbuf)
3891
4159
  #endif
3892
4160
 
3893
4161
  /** Open and/or initialize the lock region for the environment.
3894
- * @param[in] env The MDB environment.
4162
+ * @param[in] env The LMDB environment.
3895
4163
  * @param[in] lpath The pathname of the file used for the lock region.
3896
4164
  * @param[in] mode The Unix permissions for the file, if we create it.
3897
4165
  * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive
3898
4166
  * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
3899
4167
  * @return 0 on success, non-zero on failure.
3900
4168
  */
3901
- static int
4169
+ static int ESECT
3902
4170
  mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
3903
4171
  {
3904
4172
  #ifdef _WIN32
@@ -4128,7 +4396,7 @@ fail:
4128
4396
  # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
4129
4397
  #endif
4130
4398
 
4131
- int
4399
+ int ESECT
4132
4400
  mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
4133
4401
  {
4134
4402
  int oflags, rc, len, excl = -1;
@@ -4173,7 +4441,8 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4173
4441
  env->me_path = strdup(path);
4174
4442
  env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
4175
4443
  env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
4176
- if (!(env->me_dbxs && env->me_path && env->me_dbflags)) {
4444
+ env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
4445
+ if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
4177
4446
  rc = ENOMEM;
4178
4447
  goto leave;
4179
4448
  }
@@ -4245,6 +4514,22 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
4245
4514
  if (!((flags & MDB_RDONLY) ||
4246
4515
  (env->me_pbuf = calloc(1, env->me_psize))))
4247
4516
  rc = ENOMEM;
4517
+ if (!(flags & MDB_RDONLY)) {
4518
+ MDB_txn *txn;
4519
+ int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
4520
+ (sizeof(MDB_db)+sizeof(MDB_cursor)+sizeof(unsigned int)+1);
4521
+ txn = calloc(1, size);
4522
+ if (txn) {
4523
+ txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
4524
+ txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
4525
+ txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
4526
+ txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
4527
+ txn->mt_env = env;
4528
+ env->me_txn0 = txn;
4529
+ } else {
4530
+ rc = ENOMEM;
4531
+ }
4532
+ }
4248
4533
  }
4249
4534
 
4250
4535
  leave:
@@ -4256,7 +4541,7 @@ leave:
4256
4541
  }
4257
4542
 
4258
4543
  /** Destroy resources from mdb_env_open(), clear our readers & DBIs */
4259
- static void
4544
+ static void ESECT
4260
4545
  mdb_env_close0(MDB_env *env, int excl)
4261
4546
  {
4262
4547
  int i;
@@ -4269,6 +4554,7 @@ mdb_env_close0(MDB_env *env, int excl)
4269
4554
  free(env->me_dbxs[i].md_name.mv_data);
4270
4555
 
4271
4556
  free(env->me_pbuf);
4557
+ free(env->me_dbiseqs);
4272
4558
  free(env->me_dbflags);
4273
4559
  free(env->me_dbxs);
4274
4560
  free(env->me_path);
@@ -4344,186 +4630,41 @@ mdb_env_close0(MDB_env *env, int excl)
4344
4630
  env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
4345
4631
  }
4346
4632
 
4347
- int
4348
- mdb_env_copyfd(MDB_env *env, HANDLE fd)
4349
- {
4350
- MDB_txn *txn = NULL;
4351
- int rc;
4352
- size_t wsize;
4353
- char *ptr;
4354
- #ifdef _WIN32
4355
- DWORD len, w2;
4356
- #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
4357
- #else
4358
- ssize_t len;
4359
- size_t w2;
4360
- #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
4361
- #endif
4362
-
4363
- /* Do the lock/unlock of the reader mutex before starting the
4364
- * write txn. Otherwise other read txns could block writers.
4365
- */
4366
- rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
4367
- if (rc)
4368
- return rc;
4369
-
4370
- if (env->me_txns) {
4371
- /* We must start the actual read txn after blocking writers */
4372
- mdb_txn_reset0(txn, "reset-stage1");
4373
4633
 
4374
- /* Temporarily block writers until we snapshot the meta pages */
4375
- LOCK_MUTEX_W(env);
4634
+ void ESECT
4635
+ mdb_env_close(MDB_env *env)
4636
+ {
4637
+ MDB_page *dp;
4376
4638
 
4377
- rc = mdb_txn_renew0(txn);
4378
- if (rc) {
4379
- UNLOCK_MUTEX_W(env);
4380
- goto leave;
4381
- }
4382
- }
4639
+ if (env == NULL)
4640
+ return;
4383
4641
 
4384
- wsize = env->me_psize * 2;
4385
- ptr = env->me_map;
4386
- w2 = wsize;
4387
- while (w2 > 0) {
4388
- DO_WRITE(rc, fd, ptr, w2, len);
4389
- if (!rc) {
4390
- rc = ErrCode();
4391
- break;
4392
- } else if (len > 0) {
4393
- rc = MDB_SUCCESS;
4394
- ptr += len;
4395
- w2 -= len;
4396
- continue;
4397
- } else {
4398
- /* Non-blocking or async handles are not supported */
4399
- rc = EIO;
4400
- break;
4401
- }
4642
+ VGMEMP_DESTROY(env);
4643
+ while ((dp = env->me_dpages) != NULL) {
4644
+ VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4645
+ env->me_dpages = dp->mp_next;
4646
+ free(dp);
4402
4647
  }
4403
- if (env->me_txns)
4404
- UNLOCK_MUTEX_W(env);
4405
-
4406
- if (rc)
4407
- goto leave;
4408
4648
 
4409
- wsize = txn->mt_next_pgno * env->me_psize - wsize;
4410
- while (wsize > 0) {
4411
- if (wsize > MAX_WRITE)
4412
- w2 = MAX_WRITE;
4413
- else
4414
- w2 = wsize;
4415
- DO_WRITE(rc, fd, ptr, w2, len);
4416
- if (!rc) {
4417
- rc = ErrCode();
4418
- break;
4419
- } else if (len > 0) {
4420
- rc = MDB_SUCCESS;
4421
- ptr += len;
4422
- wsize -= len;
4423
- continue;
4424
- } else {
4425
- rc = EIO;
4426
- break;
4427
- }
4428
- }
4649
+ mdb_env_close0(env, 0);
4650
+ free(env);
4651
+ }
4429
4652
 
4430
- leave:
4431
- mdb_txn_abort(txn);
4432
- return rc;
4653
+ /** Compare two items pointing at aligned size_t's */
4654
+ static int
4655
+ mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4656
+ {
4657
+ return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4658
+ *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4433
4659
  }
4434
4660
 
4435
- int
4436
- mdb_env_copy(MDB_env *env, const char *path)
4661
+ /** Compare two items pointing at aligned unsigned int's */
4662
+ static int
4663
+ mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4437
4664
  {
4438
- int rc, len;
4439
- char *lpath;
4440
- HANDLE newfd = INVALID_HANDLE_VALUE;
4441
-
4442
- if (env->me_flags & MDB_NOSUBDIR) {
4443
- lpath = (char *)path;
4444
- } else {
4445
- len = strlen(path);
4446
- len += sizeof(DATANAME);
4447
- lpath = malloc(len);
4448
- if (!lpath)
4449
- return ENOMEM;
4450
- sprintf(lpath, "%s" DATANAME, path);
4451
- }
4452
-
4453
- /* The destination path must exist, but the destination file must not.
4454
- * We don't want the OS to cache the writes, since the source data is
4455
- * already in the OS cache.
4456
- */
4457
- #ifdef _WIN32
4458
- newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
4459
- FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
4460
- #else
4461
- newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
4462
- #endif
4463
- if (newfd == INVALID_HANDLE_VALUE) {
4464
- rc = ErrCode();
4465
- goto leave;
4466
- }
4467
-
4468
- #ifdef O_DIRECT
4469
- /* Set O_DIRECT if the file system supports it */
4470
- if ((rc = fcntl(newfd, F_GETFL)) != -1)
4471
- (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
4472
- #endif
4473
- #ifdef F_NOCACHE /* __APPLE__ */
4474
- rc = fcntl(newfd, F_NOCACHE, 1);
4475
- if (rc) {
4476
- rc = ErrCode();
4477
- goto leave;
4478
- }
4479
- #endif
4480
-
4481
- rc = mdb_env_copyfd(env, newfd);
4482
-
4483
- leave:
4484
- if (!(env->me_flags & MDB_NOSUBDIR))
4485
- free(lpath);
4486
- if (newfd != INVALID_HANDLE_VALUE)
4487
- if (close(newfd) < 0 && rc == MDB_SUCCESS)
4488
- rc = ErrCode();
4489
-
4490
- return rc;
4491
- }
4492
-
4493
- void
4494
- mdb_env_close(MDB_env *env)
4495
- {
4496
- MDB_page *dp;
4497
-
4498
- if (env == NULL)
4499
- return;
4500
-
4501
- VGMEMP_DESTROY(env);
4502
- while ((dp = env->me_dpages) != NULL) {
4503
- VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4504
- env->me_dpages = dp->mp_next;
4505
- free(dp);
4506
- }
4507
-
4508
- mdb_env_close0(env, 0);
4509
- free(env);
4510
- }
4511
-
4512
- /** Compare two items pointing at aligned size_t's */
4513
- static int
4514
- mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4515
- {
4516
- return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4517
- *(size_t *)a->mv_data > *(size_t *)b->mv_data;
4518
- }
4519
-
4520
- /** Compare two items pointing at aligned unsigned int's */
4521
- static int
4522
- mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4523
- {
4524
- return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4525
- *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4526
- }
4665
+ return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4666
+ *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4667
+ }
4527
4668
 
4528
4669
  /** Compare two items pointing at unsigned ints of unknown alignment.
4529
4670
  * Nodes and keys are guaranteed to be 2-byte aligned.
@@ -4542,7 +4683,16 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
4542
4683
  } while(!x && u > (unsigned short *)a->mv_data);
4543
4684
  return x;
4544
4685
  #else
4545
- return memcmp(a->mv_data, b->mv_data, a->mv_size);
4686
+ unsigned short *u, *c, *end;
4687
+ int x;
4688
+
4689
+ end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4690
+ u = (unsigned short *)a->mv_data;
4691
+ c = (unsigned short *)b->mv_data;
4692
+ do {
4693
+ x = *u++ - *c++;
4694
+ } while(!x && u < end);
4695
+ return x;
4546
4696
  #endif
4547
4697
  }
4548
4698
 
@@ -4924,6 +5074,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4924
5074
  /* Make sure we're using an up-to-date root */
4925
5075
  if (*mc->mc_dbflag & DB_STALE) {
4926
5076
  MDB_cursor mc2;
5077
+ if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5078
+ return MDB_BAD_DBI;
4927
5079
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4928
5080
  rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
4929
5081
  if (rc)
@@ -5264,8 +5416,10 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5264
5416
  if (op == MDB_PREV || op == MDB_PREV_DUP) {
5265
5417
  rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5266
5418
  if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5267
- if (rc == MDB_SUCCESS)
5419
+ if (rc == MDB_SUCCESS) {
5268
5420
  MDB_GET_KEY(leaf, key);
5421
+ mc->mc_flags &= ~C_EOF;
5422
+ }
5269
5423
  return rc;
5270
5424
  }
5271
5425
  } else {
@@ -5457,8 +5611,10 @@ set1:
5457
5611
  mc->mc_flags &= ~C_EOF;
5458
5612
 
5459
5613
  if (IS_LEAF2(mp)) {
5460
- key->mv_size = mc->mc_db->md_pad;
5461
- key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5614
+ if (op == MDB_SET_RANGE || op == MDB_SET_KEY) {
5615
+ key->mv_size = mc->mc_db->md_pad;
5616
+ key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5617
+ }
5462
5618
  return MDB_SUCCESS;
5463
5619
  }
5464
5620
 
@@ -5740,6 +5896,14 @@ fetchm:
5740
5896
  rc = MDB_INCOMPATIBLE;
5741
5897
  break;
5742
5898
  }
5899
+ {
5900
+ MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5901
+ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5902
+ MDB_GET_KEY(leaf, key);
5903
+ rc = mdb_node_read(mc->mc_txn, leaf, data);
5904
+ break;
5905
+ }
5906
+ }
5743
5907
  if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
5744
5908
  rc = EINVAL;
5745
5909
  break;
@@ -5776,6 +5940,8 @@ mdb_cursor_touch(MDB_cursor *mc)
5776
5940
  if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
5777
5941
  MDB_cursor mc2;
5778
5942
  MDB_xcursor mcx;
5943
+ if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
5944
+ return MDB_BAD_DBI;
5779
5945
  mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
5780
5946
  rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
5781
5947
  if (rc)
@@ -5932,22 +6098,42 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5932
6098
  if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
5933
6099
  LEAFSIZE(key, data) > env->me_nodemax)
5934
6100
  {
5935
- /* Too big for a node, insert in sub-DB */
6101
+ /* Too big for a node, insert in sub-DB. Set up an empty
6102
+ * "old sub-page" for prep_subDB to expand to a full page.
6103
+ */
5936
6104
  fp_flags = P_LEAF|P_DIRTY;
5937
6105
  fp = env->me_pbuf;
5938
6106
  fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
5939
- fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ;
6107
+ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
6108
+ olddata.mv_size = PAGEHDRSZ;
5940
6109
  goto prep_subDB;
5941
6110
  }
5942
6111
  } else {
5943
6112
  /* there's only a key anyway, so this is a no-op */
5944
6113
  if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6114
+ char *ptr;
5945
6115
  unsigned int ksize = mc->mc_db->md_pad;
5946
6116
  if (key->mv_size != ksize)
5947
6117
  return MDB_BAD_VALSIZE;
5948
- if (flags == MDB_CURRENT) {
5949
- char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
5950
- memcpy(ptr, key->mv_data, ksize);
6118
+ ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
6119
+ memcpy(ptr, key->mv_data, ksize);
6120
+ fix_parent:
6121
+ /* if overwriting slot 0 of leaf, need to
6122
+ * update branch key if there is a parent page
6123
+ */
6124
+ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6125
+ unsigned short top = mc->mc_top;
6126
+ mc->mc_top--;
6127
+ /* slot 0 is always an empty key, find real slot */
6128
+ while (mc->mc_top && !mc->mc_ki[mc->mc_top])
6129
+ mc->mc_top--;
6130
+ if (mc->mc_ki[mc->mc_top])
6131
+ rc2 = mdb_update_key(mc, key);
6132
+ else
6133
+ rc2 = MDB_SUCCESS;
6134
+ mc->mc_top = top;
6135
+ if (rc2)
6136
+ return rc2;
5951
6137
  }
5952
6138
  return MDB_SUCCESS;
5953
6139
  }
@@ -5978,12 +6164,12 @@ more:
5978
6164
  if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
5979
6165
  mc->mc_dbx->md_dcmp = mdb_cmp_clong;
5980
6166
  #endif
5981
- /* if data matches, skip it */
6167
+ /* does data match? */
5982
6168
  if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
5983
6169
  if (flags & MDB_NODUPDATA)
5984
6170
  return MDB_KEYEXIST;
5985
- rc = MDB_SUCCESS;
5986
- goto next_sub;
6171
+ /* overwrite it */
6172
+ goto current;
5987
6173
  }
5988
6174
 
5989
6175
  /* Back up original data item */
@@ -5992,7 +6178,7 @@ more:
5992
6178
 
5993
6179
  /* Make sub-page header for the dup items, with dummy body */
5994
6180
  fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
5995
- fp->mp_lower = PAGEHDRSZ;
6181
+ fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
5996
6182
  xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5997
6183
  if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5998
6184
  fp->mp_flags |= P_LEAF2;
@@ -6002,8 +6188,8 @@ more:
6002
6188
  xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
6003
6189
  (dkey.mv_size & 1) + (data->mv_size & 1);
6004
6190
  }
6005
- fp->mp_upper = xdata.mv_size;
6006
- olddata.mv_size = fp->mp_upper; /* pretend olddata is fp */
6191
+ fp->mp_upper = xdata.mv_size - PAGEBASE;
6192
+ olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
6007
6193
  } else if (leaf->mn_flags & F_SUBDATA) {
6008
6194
  /* Data is on sub-DB, just store it */
6009
6195
  flags |= F_DUPDATA|F_SUBDATA;
@@ -6070,8 +6256,8 @@ prep_subDB:
6070
6256
  if (fp_flags & P_LEAF2) {
6071
6257
  memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6072
6258
  } else {
6073
- memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
6074
- olddata.mv_size - fp->mp_upper);
6259
+ memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
6260
+ olddata.mv_size - fp->mp_upper - PAGEBASE);
6075
6261
  for (i=0; i<NUMKEYS(fp); i++)
6076
6262
  mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
6077
6263
  }
@@ -6154,8 +6340,10 @@ current:
6154
6340
  data->mv_data = olddata.mv_data;
6155
6341
  else if (!(mc->mc_flags & C_SUB))
6156
6342
  memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6157
- else
6343
+ else {
6158
6344
  memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6345
+ goto fix_parent;
6346
+ }
6159
6347
  return MDB_SUCCESS;
6160
6348
  }
6161
6349
  mdb_node_del(mc, 0);
@@ -6259,7 +6447,6 @@ put_sub:
6259
6447
  */
6260
6448
  mc->mc_flags |= C_INITIALIZED;
6261
6449
  }
6262
- next_sub:
6263
6450
  if (flags & MDB_MULTIPLE) {
6264
6451
  if (!rc) {
6265
6452
  mcount++;
@@ -6393,8 +6580,8 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
6393
6580
  DPRINTF(("allocated new mpage %"Z"u, page size %u",
6394
6581
  np->mp_pgno, mc->mc_txn->mt_env->me_psize));
6395
6582
  np->mp_flags = flags | P_DIRTY;
6396
- np->mp_lower = PAGEHDRSZ;
6397
- np->mp_upper = mc->mc_txn->mt_env->me_psize;
6583
+ np->mp_lower = (PAGEHDRSZ-PAGEBASE);
6584
+ np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
6398
6585
 
6399
6586
  if (IS_BRANCH(np))
6400
6587
  mc->mc_db->md_branch_pages++;
@@ -6647,7 +6834,7 @@ mdb_node_del(MDB_cursor *mc, int ksize)
6647
6834
  }
6648
6835
  }
6649
6836
 
6650
- base = (char *)mp + mp->mp_upper;
6837
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6651
6838
  memmove(base + sz, base, ptr - mp->mp_upper);
6652
6839
 
6653
6840
  mp->mp_lower -= sizeof(indx_t);
@@ -6701,7 +6888,7 @@ mdb_node_shrink(MDB_page *mp, indx_t indx)
6701
6888
  mp->mp_ptrs[i] += delta;
6702
6889
  }
6703
6890
 
6704
- base = (char *)mp + mp->mp_upper;
6891
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6705
6892
  memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
6706
6893
  mp->mp_upper += delta;
6707
6894
  }
@@ -6877,6 +7064,12 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp)
6877
7064
  if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
6878
7065
  return MDB_BAD_TXN;
6879
7066
 
7067
+ if (!(mc->mc_flags & C_INITIALIZED))
7068
+ return EINVAL;
7069
+
7070
+ if (!mc->mc_snum || (mc->mc_flags & C_EOF))
7071
+ return MDB_NOTFOUND;
7072
+
6880
7073
  leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6881
7074
  if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6882
7075
  *countp = 1;
@@ -6973,7 +7166,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key)
6973
7166
  mp->mp_ptrs[i] -= delta;
6974
7167
  }
6975
7168
 
6976
- base = (char *)mp + mp->mp_upper;
7169
+ base = (char *)mp + mp->mp_upper + PAGEBASE;
6977
7170
  len = ptr - mp->mp_upper + NODESIZE;
6978
7171
  memmove(base - delta, base, len);
6979
7172
  mp->mp_upper -= delta;
@@ -7054,20 +7247,20 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7054
7247
  MDB_node *s2;
7055
7248
  MDB_val bkey;
7056
7249
  /* must find the lowest key below dst */
7057
- rc = mdb_page_search_lowest(cdst);
7250
+ mdb_cursor_copy(cdst, &mn);
7251
+ rc = mdb_page_search_lowest(&mn);
7058
7252
  if (rc)
7059
7253
  return rc;
7060
- if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
7061
- bkey.mv_size = cdst->mc_db->md_pad;
7062
- bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
7254
+ if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7255
+ bkey.mv_size = mn.mc_db->md_pad;
7256
+ bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
7063
7257
  } else {
7064
- s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
7258
+ s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7065
7259
  bkey.mv_size = NODEKSZ(s2);
7066
7260
  bkey.mv_data = NODEKEY(s2);
7067
7261
  }
7068
- cdst->mc_snum = snum--;
7069
- cdst->mc_top = snum;
7070
- mdb_cursor_copy(cdst, &mn);
7262
+ mn.mc_snum = snum--;
7263
+ mn.mc_top = snum;
7071
7264
  mn.mc_ki[snum] = 0;
7072
7265
  rc = mdb_update_key(&mn, &bkey);
7073
7266
  if (rc)
@@ -7183,14 +7376,17 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
7183
7376
  static int
7184
7377
  mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7185
7378
  {
7186
- int rc;
7187
- indx_t i, j;
7188
- MDB_node *srcnode;
7379
+ MDB_page *psrc, *pdst;
7380
+ MDB_node *srcnode;
7189
7381
  MDB_val key, data;
7190
- unsigned nkeys;
7382
+ unsigned nkeys;
7383
+ int rc;
7384
+ indx_t i, j;
7191
7385
 
7192
- DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno,
7193
- cdst->mc_pg[cdst->mc_top]->mp_pgno));
7386
+ psrc = csrc->mc_pg[csrc->mc_top];
7387
+ pdst = cdst->mc_pg[cdst->mc_top];
7388
+
7389
+ DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));
7194
7390
 
7195
7391
  mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
7196
7392
  mdb_cassert(csrc, cdst->mc_snum > 1);
@@ -7201,36 +7397,35 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7201
7397
 
7202
7398
  /* Move all nodes from src to dst.
7203
7399
  */
7204
- j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
7205
- if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7400
+ j = nkeys = NUMKEYS(pdst);
7401
+ if (IS_LEAF2(psrc)) {
7206
7402
  key.mv_size = csrc->mc_db->md_pad;
7207
- key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
7208
- for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7403
+ key.mv_data = METADATA(psrc);
7404
+ for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7209
7405
  rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
7210
7406
  if (rc != MDB_SUCCESS)
7211
7407
  return rc;
7212
7408
  key.mv_data = (char *)key.mv_data + key.mv_size;
7213
7409
  }
7214
7410
  } else {
7215
- for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7216
- srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
7217
- if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7218
- unsigned int snum = csrc->mc_snum;
7411
+ for (i = 0; i < NUMKEYS(psrc); i++, j++) {
7412
+ srcnode = NODEPTR(psrc, i);
7413
+ if (i == 0 && IS_BRANCH(psrc)) {
7414
+ MDB_cursor mn;
7219
7415
  MDB_node *s2;
7416
+ mdb_cursor_copy(csrc, &mn);
7220
7417
  /* must find the lowest key below src */
7221
- rc = mdb_page_search_lowest(csrc);
7418
+ rc = mdb_page_search_lowest(&mn);
7222
7419
  if (rc)
7223
7420
  return rc;
7224
- if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7225
- key.mv_size = csrc->mc_db->md_pad;
7226
- key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7421
+ if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
7422
+ key.mv_size = mn.mc_db->md_pad;
7423
+ key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
7227
7424
  } else {
7228
- s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7425
+ s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
7229
7426
  key.mv_size = NODEKSZ(s2);
7230
7427
  key.mv_data = NODEKEY(s2);
7231
7428
  }
7232
- csrc->mc_snum = snum--;
7233
- csrc->mc_top = snum;
7234
7429
  } else {
7235
7430
  key.mv_size = srcnode->mn_ksize;
7236
7431
  key.mv_data = NODEKEY(srcnode);
@@ -7245,8 +7440,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7245
7440
  }
7246
7441
 
7247
7442
  DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
7248
- cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]),
7249
- (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10));
7443
+ pdst->mp_pgno, NUMKEYS(pdst),
7444
+ (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
7250
7445
 
7251
7446
  /* Unlink the src page from parent and add to free list.
7252
7447
  */
@@ -7262,11 +7457,14 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7262
7457
  }
7263
7458
  csrc->mc_top++;
7264
7459
 
7265
- rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs,
7266
- csrc->mc_pg[csrc->mc_top]->mp_pgno);
7460
+ psrc = csrc->mc_pg[csrc->mc_top];
7461
+ /* If not operating on FreeDB, allow this page to be reused
7462
+ * in this txn. Otherwise just add to free list.
7463
+ */
7464
+ rc = mdb_page_loose(csrc, psrc);
7267
7465
  if (rc)
7268
7466
  return rc;
7269
- if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
7467
+ if (IS_LEAF(psrc))
7270
7468
  csrc->mc_db->md_leaf_pages--;
7271
7469
  else
7272
7470
  csrc->mc_db->md_branch_pages--;
@@ -7274,7 +7472,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7274
7472
  /* Adjust other cursors pointing to mp */
7275
7473
  MDB_cursor *m2, *m3;
7276
7474
  MDB_dbi dbi = csrc->mc_dbi;
7277
- MDB_page *mp = cdst->mc_pg[cdst->mc_top];
7278
7475
 
7279
7476
  for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7280
7477
  if (csrc->mc_flags & C_SUB)
@@ -7283,8 +7480,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7283
7480
  m3 = m2;
7284
7481
  if (m3 == csrc) continue;
7285
7482
  if (m3->mc_snum < csrc->mc_snum) continue;
7286
- if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
7287
- m3->mc_pg[csrc->mc_top] = mp;
7483
+ if (m3->mc_pg[csrc->mc_top] == psrc) {
7484
+ m3->mc_pg[csrc->mc_top] = pdst;
7288
7485
  m3->mc_ki[csrc->mc_top] += nkeys;
7289
7486
  }
7290
7487
  }
@@ -7525,8 +7722,10 @@ mdb_cursor_del0(MDB_cursor *mc)
7525
7722
  /* if mc points past last node in page, find next sibling */
7526
7723
  if (mc->mc_ki[mc->mc_top] >= nkeys) {
7527
7724
  rc = mdb_cursor_sibling(mc, 1);
7528
- if (rc == MDB_NOTFOUND)
7725
+ if (rc == MDB_NOTFOUND) {
7726
+ mc->mc_flags |= C_EOF;
7529
7727
  rc = MDB_SUCCESS;
7728
+ }
7530
7729
  }
7531
7730
 
7532
7731
  /* Adjust other cursors pointing to mp */
@@ -7541,11 +7740,15 @@ mdb_cursor_del0(MDB_cursor *mc)
7541
7740
  m3->mc_flags |= C_DEL;
7542
7741
  if (m3->mc_ki[mc->mc_top] > ki)
7543
7742
  m3->mc_ki[mc->mc_top]--;
7743
+ else if (mc->mc_db->md_flags & MDB_DUPSORT)
7744
+ m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
7544
7745
  }
7545
7746
  if (m3->mc_ki[mc->mc_top] >= nkeys) {
7546
7747
  rc = mdb_cursor_sibling(m3, 1);
7547
- if (rc == MDB_NOTFOUND)
7748
+ if (rc == MDB_NOTFOUND) {
7749
+ m3->mc_flags |= C_EOF;
7548
7750
  rc = MDB_SUCCESS;
7751
+ }
7549
7752
  }
7550
7753
  }
7551
7754
  }
@@ -7760,8 +7963,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7760
7963
  }
7761
7964
  copy->mp_pgno = mp->mp_pgno;
7762
7965
  copy->mp_flags = mp->mp_flags;
7763
- copy->mp_lower = PAGEHDRSZ;
7764
- copy->mp_upper = env->me_psize;
7966
+ copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
7967
+ copy->mp_upper = env->me_psize - PAGEBASE;
7765
7968
 
7766
7969
  /* prepare to insert */
7767
7970
  for (i=0, j=0; i<nkeys; i++) {
@@ -7801,7 +8004,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7801
8004
  psize += nsize;
7802
8005
  node = NULL;
7803
8006
  } else {
7804
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
8007
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
7805
8008
  psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7806
8009
  if (IS_LEAF(mp)) {
7807
8010
  if (F_ISSET(node->mn_flags, F_BIGDATA))
@@ -7821,7 +8024,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7821
8024
  sepkey.mv_size = newkey->mv_size;
7822
8025
  sepkey.mv_data = newkey->mv_data;
7823
8026
  } else {
7824
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
8027
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
7825
8028
  sepkey.mv_size = node->mn_ksize;
7826
8029
  sepkey.mv_data = NODEKEY(node);
7827
8030
  }
@@ -7902,7 +8105,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7902
8105
  /* Update index for the new key. */
7903
8106
  mc->mc_ki[mc->mc_top] = j;
7904
8107
  } else {
7905
- node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
8108
+ node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
7906
8109
  rkey.mv_data = NODEKEY(node);
7907
8110
  rkey.mv_size = node->mn_ksize;
7908
8111
  if (IS_LEAF(mp)) {
@@ -7938,7 +8141,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
7938
8141
  mp->mp_lower = copy->mp_lower;
7939
8142
  mp->mp_upper = copy->mp_upper;
7940
8143
  memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7941
- env->me_psize - copy->mp_upper);
8144
+ env->me_psize - copy->mp_upper - PAGEBASE);
7942
8145
 
7943
8146
  /* reset back to original page */
7944
8147
  if (newindx < split_indx) {
@@ -8037,7 +8240,568 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
8037
8240
  return mdb_cursor_put(&mc, key, data, flags);
8038
8241
  }
8039
8242
 
8040
- int
8243
+ #ifndef MDB_WBUF
8244
+ #define MDB_WBUF (1024*1024)
8245
+ #endif
8246
+
8247
+ /** State needed for a compacting copy. */
8248
+ typedef struct mdb_copy {
8249
+ pthread_mutex_t mc_mutex;
8250
+ pthread_cond_t mc_cond;
8251
+ char *mc_wbuf[2];
8252
+ char *mc_over[2];
8253
+ MDB_env *mc_env;
8254
+ MDB_txn *mc_txn;
8255
+ int mc_wlen[2];
8256
+ int mc_olen[2];
8257
+ pgno_t mc_next_pgno;
8258
+ HANDLE mc_fd;
8259
+ int mc_status;
8260
+ volatile int mc_new;
8261
+ int mc_toggle;
8262
+
8263
+ } mdb_copy;
8264
+
8265
+ /** Dedicated writer thread for compacting copy. */
8266
+ static THREAD_RET ESECT
8267
+ mdb_env_copythr(void *arg)
8268
+ {
8269
+ mdb_copy *my = arg;
8270
+ char *ptr;
8271
+ int toggle = 0, wsize, rc;
8272
+ #ifdef _WIN32
8273
+ DWORD len;
8274
+ #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8275
+ #else
8276
+ int len;
8277
+ #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8278
+ #endif
8279
+
8280
+ pthread_mutex_lock(&my->mc_mutex);
8281
+ my->mc_new = 0;
8282
+ pthread_cond_signal(&my->mc_cond);
8283
+ for(;;) {
8284
+ while (!my->mc_new)
8285
+ pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8286
+ if (my->mc_new < 0) {
8287
+ my->mc_new = 0;
8288
+ break;
8289
+ }
8290
+ my->mc_new = 0;
8291
+ wsize = my->mc_wlen[toggle];
8292
+ ptr = my->mc_wbuf[toggle];
8293
+ again:
8294
+ while (wsize > 0) {
8295
+ DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
8296
+ if (!rc) {
8297
+ rc = ErrCode();
8298
+ break;
8299
+ } else if (len > 0) {
8300
+ rc = MDB_SUCCESS;
8301
+ ptr += len;
8302
+ wsize -= len;
8303
+ continue;
8304
+ } else {
8305
+ rc = EIO;
8306
+ break;
8307
+ }
8308
+ }
8309
+ if (rc) {
8310
+ my->mc_status = rc;
8311
+ break;
8312
+ }
8313
+ /* If there's an overflow page tail, write it too */
8314
+ if (my->mc_olen[toggle]) {
8315
+ wsize = my->mc_olen[toggle];
8316
+ ptr = my->mc_over[toggle];
8317
+ my->mc_olen[toggle] = 0;
8318
+ goto again;
8319
+ }
8320
+ my->mc_wlen[toggle] = 0;
8321
+ toggle ^= 1;
8322
+ pthread_cond_signal(&my->mc_cond);
8323
+ }
8324
+ pthread_cond_signal(&my->mc_cond);
8325
+ pthread_mutex_unlock(&my->mc_mutex);
8326
+ return (THREAD_RET)0;
8327
+ #undef DO_WRITE
8328
+ }
8329
+
8330
+ /** Tell the writer thread there's a buffer ready to write */
8331
+ static int ESECT
8332
+ mdb_env_cthr_toggle(mdb_copy *my, int st)
8333
+ {
8334
+ int toggle = my->mc_toggle ^ 1;
8335
+ pthread_mutex_lock(&my->mc_mutex);
8336
+ if (my->mc_status) {
8337
+ pthread_mutex_unlock(&my->mc_mutex);
8338
+ return my->mc_status;
8339
+ }
8340
+ while (my->mc_new == 1)
8341
+ pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
8342
+ my->mc_new = st;
8343
+ my->mc_toggle = toggle;
8344
+ pthread_cond_signal(&my->mc_cond);
8345
+ pthread_mutex_unlock(&my->mc_mutex);
8346
+ return 0;
8347
+ }
8348
+
8349
+ /** Depth-first tree traversal for compacting copy. */
8350
+ static int ESECT
8351
+ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
8352
+ {
8353
+ MDB_cursor mc;
8354
+ MDB_txn *txn = my->mc_txn;
8355
+ MDB_node *ni;
8356
+ MDB_page *mo, *mp, *leaf;
8357
+ char *buf, *ptr;
8358
+ int rc, toggle;
8359
+ unsigned int i;
8360
+
8361
+ /* Empty DB, nothing to do */
8362
+ if (*pg == P_INVALID)
8363
+ return MDB_SUCCESS;
8364
+
8365
+ mc.mc_snum = 1;
8366
+ mc.mc_top = 0;
8367
+ mc.mc_txn = txn;
8368
+
8369
+ rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL);
8370
+ if (rc)
8371
+ return rc;
8372
+ rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
8373
+ if (rc)
8374
+ return rc;
8375
+
8376
+ /* Make cursor pages writable */
8377
+ buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
8378
+ if (buf == NULL)
8379
+ return ENOMEM;
8380
+
8381
+ for (i=0; i<mc.mc_top; i++) {
8382
+ mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
8383
+ mc.mc_pg[i] = (MDB_page *)ptr;
8384
+ ptr += my->mc_env->me_psize;
8385
+ }
8386
+
8387
+ /* This is writable space for a leaf page. Usually not needed. */
8388
+ leaf = (MDB_page *)ptr;
8389
+
8390
+ toggle = my->mc_toggle;
8391
+ while (mc.mc_snum > 0) {
8392
+ unsigned n;
8393
+ mp = mc.mc_pg[mc.mc_top];
8394
+ n = NUMKEYS(mp);
8395
+
8396
+ if (IS_LEAF(mp)) {
8397
+ if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
8398
+ for (i=0; i<n; i++) {
8399
+ ni = NODEPTR(mp, i);
8400
+ if (ni->mn_flags & F_BIGDATA) {
8401
+ MDB_page *omp;
8402
+ pgno_t pg;
8403
+
8404
+ /* Need writable leaf */
8405
+ if (mp != leaf) {
8406
+ mc.mc_pg[mc.mc_top] = leaf;
8407
+ mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8408
+ mp = leaf;
8409
+ ni = NODEPTR(mp, i);
8410
+ }
8411
+
8412
+ memcpy(&pg, NODEDATA(ni), sizeof(pg));
8413
+ rc = mdb_page_get(txn, pg, &omp, NULL);
8414
+ if (rc)
8415
+ goto done;
8416
+ if (my->mc_wlen[toggle] >= MDB_WBUF) {
8417
+ rc = mdb_env_cthr_toggle(my, 1);
8418
+ if (rc)
8419
+ goto done;
8420
+ toggle = my->mc_toggle;
8421
+ }
8422
+ mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8423
+ memcpy(mo, omp, my->mc_env->me_psize);
8424
+ mo->mp_pgno = my->mc_next_pgno;
8425
+ my->mc_next_pgno += omp->mp_pages;
8426
+ my->mc_wlen[toggle] += my->mc_env->me_psize;
8427
+ if (omp->mp_pages > 1) {
8428
+ my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
8429
+ my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
8430
+ rc = mdb_env_cthr_toggle(my, 1);
8431
+ if (rc)
8432
+ goto done;
8433
+ toggle = my->mc_toggle;
8434
+ }
8435
+ memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
8436
+ } else if (ni->mn_flags & F_SUBDATA) {
8437
+ MDB_db db;
8438
+
8439
+ /* Need writable leaf */
8440
+ if (mp != leaf) {
8441
+ mc.mc_pg[mc.mc_top] = leaf;
8442
+ mdb_page_copy(leaf, mp, my->mc_env->me_psize);
8443
+ mp = leaf;
8444
+ ni = NODEPTR(mp, i);
8445
+ }
8446
+
8447
+ memcpy(&db, NODEDATA(ni), sizeof(db));
8448
+ my->mc_toggle = toggle;
8449
+ rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
8450
+ if (rc)
8451
+ goto done;
8452
+ toggle = my->mc_toggle;
8453
+ memcpy(NODEDATA(ni), &db, sizeof(db));
8454
+ }
8455
+ }
8456
+ }
8457
+ } else {
8458
+ mc.mc_ki[mc.mc_top]++;
8459
+ if (mc.mc_ki[mc.mc_top] < n) {
8460
+ pgno_t pg;
8461
+ again:
8462
+ ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
8463
+ pg = NODEPGNO(ni);
8464
+ rc = mdb_page_get(txn, pg, &mp, NULL);
8465
+ if (rc)
8466
+ goto done;
8467
+ mc.mc_top++;
8468
+ mc.mc_snum++;
8469
+ mc.mc_ki[mc.mc_top] = 0;
8470
+ if (IS_BRANCH(mp)) {
8471
+ /* Whenever we advance to a sibling branch page,
8472
+ * we must proceed all the way down to its first leaf.
8473
+ */
8474
+ mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
8475
+ goto again;
8476
+ } else
8477
+ mc.mc_pg[mc.mc_top] = mp;
8478
+ continue;
8479
+ }
8480
+ }
8481
+ if (my->mc_wlen[toggle] >= MDB_WBUF) {
8482
+ rc = mdb_env_cthr_toggle(my, 1);
8483
+ if (rc)
8484
+ goto done;
8485
+ toggle = my->mc_toggle;
8486
+ }
8487
+ mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
8488
+ mdb_page_copy(mo, mp, my->mc_env->me_psize);
8489
+ mo->mp_pgno = my->mc_next_pgno++;
8490
+ my->mc_wlen[toggle] += my->mc_env->me_psize;
8491
+ if (mc.mc_top) {
8492
+ /* Update parent if there is one */
8493
+ ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
8494
+ SETPGNO(ni, mo->mp_pgno);
8495
+ mdb_cursor_pop(&mc);
8496
+ } else {
8497
+ /* Otherwise we're done */
8498
+ *pg = mo->mp_pgno;
8499
+ break;
8500
+ }
8501
+ }
8502
+ done:
8503
+ free(buf);
8504
+ return rc;
8505
+ }
8506
+
8507
+ /** Copy environment with compaction. */
8508
+ static int ESECT
8509
+ mdb_env_copyfd1(MDB_env *env, HANDLE fd)
8510
+ {
8511
+ MDB_meta *mm;
8512
+ MDB_page *mp;
8513
+ mdb_copy my;
8514
+ MDB_txn *txn = NULL;
8515
+ pthread_t thr;
8516
+ int rc;
8517
+
8518
+ #ifdef _WIN32
8519
+ my.mc_mutex = CreateMutex(NULL, FALSE, NULL);
8520
+ my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL);
8521
+ my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
8522
+ if (my.mc_wbuf[0] == NULL)
8523
+ return errno;
8524
+ #else
8525
+ pthread_mutex_init(&my.mc_mutex, NULL);
8526
+ pthread_cond_init(&my.mc_cond, NULL);
8527
+ #ifdef HAVE_MEMALIGN
8528
+ my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
8529
+ if (my.mc_wbuf[0] == NULL)
8530
+ return errno;
8531
+ #else
8532
+ rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2);
8533
+ if (rc)
8534
+ return rc;
8535
+ #endif
8536
+ #endif
8537
+ memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
8538
+ my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
8539
+ my.mc_wlen[0] = 0;
8540
+ my.mc_wlen[1] = 0;
8541
+ my.mc_olen[0] = 0;
8542
+ my.mc_olen[1] = 0;
8543
+ my.mc_next_pgno = 2;
8544
+ my.mc_status = 0;
8545
+ my.mc_new = 1;
8546
+ my.mc_toggle = 0;
8547
+ my.mc_env = env;
8548
+ my.mc_fd = fd;
8549
+ THREAD_CREATE(thr, mdb_env_copythr, &my);
8550
+
8551
+ rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8552
+ if (rc)
8553
+ return rc;
8554
+
8555
+ mp = (MDB_page *)my.mc_wbuf[0];
8556
+ memset(mp, 0, 2*env->me_psize);
8557
+ mp->mp_pgno = 0;
8558
+ mp->mp_flags = P_META;
8559
+ mm = (MDB_meta *)METADATA(mp);
8560
+ mdb_env_init_meta0(env, mm);
8561
+ mm->mm_address = env->me_metas[0]->mm_address;
8562
+
8563
+ mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
8564
+ mp->mp_pgno = 1;
8565
+ mp->mp_flags = P_META;
8566
+ *(MDB_meta *)METADATA(mp) = *mm;
8567
+ mm = (MDB_meta *)METADATA(mp);
8568
+
8569
+ /* Count the number of free pages, subtract from lastpg to find
8570
+ * number of active pages
8571
+ */
8572
+ {
8573
+ MDB_ID freecount = 0;
8574
+ MDB_cursor mc;
8575
+ MDB_val key, data;
8576
+ mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
8577
+ while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
8578
+ freecount += *(MDB_ID *)data.mv_data;
8579
+ freecount += txn->mt_dbs[0].md_branch_pages +
8580
+ txn->mt_dbs[0].md_leaf_pages +
8581
+ txn->mt_dbs[0].md_overflow_pages;
8582
+
8583
+ /* Set metapage 1 */
8584
+ mm->mm_last_pg = txn->mt_next_pgno - freecount - 1;
8585
+ mm->mm_dbs[1] = txn->mt_dbs[1];
8586
+ mm->mm_dbs[1].md_root = mm->mm_last_pg;
8587
+ mm->mm_txnid = 1;
8588
+ }
8589
+ my.mc_wlen[0] = env->me_psize * 2;
8590
+ my.mc_txn = txn;
8591
+ pthread_mutex_lock(&my.mc_mutex);
8592
+ while(my.mc_new)
8593
+ pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8594
+ pthread_mutex_unlock(&my.mc_mutex);
8595
+ rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0);
8596
+ if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
8597
+ rc = mdb_env_cthr_toggle(&my, 1);
8598
+ mdb_env_cthr_toggle(&my, -1);
8599
+ pthread_mutex_lock(&my.mc_mutex);
8600
+ while(my.mc_new)
8601
+ pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
8602
+ pthread_mutex_unlock(&my.mc_mutex);
8603
+ THREAD_FINISH(thr);
8604
+
8605
+ mdb_txn_abort(txn);
8606
+ #ifdef _WIN32
8607
+ CloseHandle(my.mc_cond);
8608
+ CloseHandle(my.mc_mutex);
8609
+ _aligned_free(my.mc_wbuf[0]);
8610
+ #else
8611
+ pthread_cond_destroy(&my.mc_cond);
8612
+ pthread_mutex_destroy(&my.mc_mutex);
8613
+ free(my.mc_wbuf[0]);
8614
+ #endif
8615
+ return rc;
8616
+ }
8617
+
8618
+ /** Copy environment as-is. */
8619
+ static int ESECT
8620
+ mdb_env_copyfd0(MDB_env *env, HANDLE fd)
8621
+ {
8622
+ MDB_txn *txn = NULL;
8623
+ int rc;
8624
+ size_t wsize;
8625
+ char *ptr;
8626
+ #ifdef _WIN32
8627
+ DWORD len, w2;
8628
+ #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
8629
+ #else
8630
+ ssize_t len;
8631
+ size_t w2;
8632
+ #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
8633
+ #endif
8634
+
8635
+ /* Do the lock/unlock of the reader mutex before starting the
8636
+ * write txn. Otherwise other read txns could block writers.
8637
+ */
8638
+ rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
8639
+ if (rc)
8640
+ return rc;
8641
+
8642
+ if (env->me_txns) {
8643
+ /* We must start the actual read txn after blocking writers */
8644
+ mdb_txn_reset0(txn, "reset-stage1");
8645
+
8646
+ /* Temporarily block writers until we snapshot the meta pages */
8647
+ LOCK_MUTEX_W(env);
8648
+
8649
+ rc = mdb_txn_renew0(txn);
8650
+ if (rc) {
8651
+ UNLOCK_MUTEX_W(env);
8652
+ goto leave;
8653
+ }
8654
+ }
8655
+
8656
+ wsize = env->me_psize * 2;
8657
+ ptr = env->me_map;
8658
+ w2 = wsize;
8659
+ while (w2 > 0) {
8660
+ DO_WRITE(rc, fd, ptr, w2, len);
8661
+ if (!rc) {
8662
+ rc = ErrCode();
8663
+ break;
8664
+ } else if (len > 0) {
8665
+ rc = MDB_SUCCESS;
8666
+ ptr += len;
8667
+ w2 -= len;
8668
+ continue;
8669
+ } else {
8670
+ /* Non-blocking or async handles are not supported */
8671
+ rc = EIO;
8672
+ break;
8673
+ }
8674
+ }
8675
+ if (env->me_txns)
8676
+ UNLOCK_MUTEX_W(env);
8677
+
8678
+ if (rc)
8679
+ goto leave;
8680
+
8681
+ w2 = txn->mt_next_pgno * env->me_psize;
8682
+ #ifdef WIN32
8683
+ {
8684
+ LARGE_INTEGER fsize;
8685
+ GetFileSizeEx(env->me_fd, &fsize);
8686
+ if (w2 > fsize.QuadPart)
8687
+ w2 = fsize.QuadPart;
8688
+ }
8689
+ #else
8690
+ {
8691
+ struct stat st;
8692
+ fstat(env->me_fd, &st);
8693
+ if (w2 > (size_t)st.st_size)
8694
+ w2 = st.st_size;
8695
+ }
8696
+ #endif
8697
+ wsize = w2 - wsize;
8698
+ while (wsize > 0) {
8699
+ if (wsize > MAX_WRITE)
8700
+ w2 = MAX_WRITE;
8701
+ else
8702
+ w2 = wsize;
8703
+ DO_WRITE(rc, fd, ptr, w2, len);
8704
+ if (!rc) {
8705
+ rc = ErrCode();
8706
+ break;
8707
+ } else if (len > 0) {
8708
+ rc = MDB_SUCCESS;
8709
+ ptr += len;
8710
+ wsize -= len;
8711
+ continue;
8712
+ } else {
8713
+ rc = EIO;
8714
+ break;
8715
+ }
8716
+ }
8717
+
8718
+ leave:
8719
+ mdb_txn_abort(txn);
8720
+ return rc;
8721
+ }
8722
+
8723
+ int ESECT
8724
+ mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
8725
+ {
8726
+ if (flags & MDB_CP_COMPACT)
8727
+ return mdb_env_copyfd1(env, fd);
8728
+ else
8729
+ return mdb_env_copyfd0(env, fd);
8730
+ }
8731
+
8732
+ int ESECT
8733
+ mdb_env_copyfd(MDB_env *env, HANDLE fd)
8734
+ {
8735
+ return mdb_env_copyfd2(env, fd, 0);
8736
+ }
8737
+
8738
+ int ESECT
8739
+ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
8740
+ {
8741
+ int rc, len;
8742
+ char *lpath;
8743
+ HANDLE newfd = INVALID_HANDLE_VALUE;
8744
+
8745
+ if (env->me_flags & MDB_NOSUBDIR) {
8746
+ lpath = (char *)path;
8747
+ } else {
8748
+ len = strlen(path);
8749
+ len += sizeof(DATANAME);
8750
+ lpath = malloc(len);
8751
+ if (!lpath)
8752
+ return ENOMEM;
8753
+ sprintf(lpath, "%s" DATANAME, path);
8754
+ }
8755
+
8756
+ /* The destination path must exist, but the destination file must not.
8757
+ * We don't want the OS to cache the writes, since the source data is
8758
+ * already in the OS cache.
8759
+ */
8760
+ #ifdef _WIN32
8761
+ newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
8762
+ FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
8763
+ #else
8764
+ newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
8765
+ #endif
8766
+ if (newfd == INVALID_HANDLE_VALUE) {
8767
+ rc = ErrCode();
8768
+ goto leave;
8769
+ }
8770
+
8771
+ if (env->me_psize >= env->me_os_psize) {
8772
+ #ifdef O_DIRECT
8773
+ /* Set O_DIRECT if the file system supports it */
8774
+ if ((rc = fcntl(newfd, F_GETFL)) != -1)
8775
+ (void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
8776
+ #endif
8777
+ #ifdef F_NOCACHE /* __APPLE__ */
8778
+ rc = fcntl(newfd, F_NOCACHE, 1);
8779
+ if (rc) {
8780
+ rc = ErrCode();
8781
+ goto leave;
8782
+ }
8783
+ #endif
8784
+ }
8785
+
8786
+ rc = mdb_env_copyfd2(env, newfd, flags);
8787
+
8788
+ leave:
8789
+ if (!(env->me_flags & MDB_NOSUBDIR))
8790
+ free(lpath);
8791
+ if (newfd != INVALID_HANDLE_VALUE)
8792
+ if (close(newfd) < 0 && rc == MDB_SUCCESS)
8793
+ rc = ErrCode();
8794
+
8795
+ return rc;
8796
+ }
8797
+
8798
+ int ESECT
8799
+ mdb_env_copy(MDB_env *env, const char *path)
8800
+ {
8801
+ return mdb_env_copy2(env, path, 0);
8802
+ }
8803
+
8804
+ int ESECT
8041
8805
  mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8042
8806
  {
8043
8807
  if ((flag & CHANGEABLE) != flag)
@@ -8049,7 +8813,7 @@ mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
8049
8813
  return MDB_SUCCESS;
8050
8814
  }
8051
8815
 
8052
- int
8816
+ int ESECT
8053
8817
  mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8054
8818
  {
8055
8819
  if (!env || !arg)
@@ -8059,7 +8823,7 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg)
8059
8823
  return MDB_SUCCESS;
8060
8824
  }
8061
8825
 
8062
- int
8826
+ int ESECT
8063
8827
  mdb_env_set_userctx(MDB_env *env, void *ctx)
8064
8828
  {
8065
8829
  if (!env)
@@ -8068,13 +8832,13 @@ mdb_env_set_userctx(MDB_env *env, void *ctx)
8068
8832
  return MDB_SUCCESS;
8069
8833
  }
8070
8834
 
8071
- void *
8835
+ void * ESECT
8072
8836
  mdb_env_get_userctx(MDB_env *env)
8073
8837
  {
8074
8838
  return env ? env->me_userctx : NULL;
8075
8839
  }
8076
8840
 
8077
- int
8841
+ int ESECT
8078
8842
  mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8079
8843
  {
8080
8844
  if (!env)
@@ -8085,7 +8849,7 @@ mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
8085
8849
  return MDB_SUCCESS;
8086
8850
  }
8087
8851
 
8088
- int
8852
+ int ESECT
8089
8853
  mdb_env_get_path(MDB_env *env, const char **arg)
8090
8854
  {
8091
8855
  if (!env || !arg)
@@ -8095,7 +8859,7 @@ mdb_env_get_path(MDB_env *env, const char **arg)
8095
8859
  return MDB_SUCCESS;
8096
8860
  }
8097
8861
 
8098
- int
8862
+ int ESECT
8099
8863
  mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
8100
8864
  {
8101
8865
  if (!env || !arg)
@@ -8111,7 +8875,7 @@ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
8111
8875
  * @param[out] arg the address of an #MDB_stat structure to receive the stats.
8112
8876
  * @return 0, this function always succeeds.
8113
8877
  */
8114
- static int
8878
+ static int ESECT
8115
8879
  mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8116
8880
  {
8117
8881
  arg->ms_psize = env->me_psize;
@@ -8123,7 +8887,8 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8123
8887
 
8124
8888
  return MDB_SUCCESS;
8125
8889
  }
8126
- int
8890
+
8891
+ int ESECT
8127
8892
  mdb_env_stat(MDB_env *env, MDB_stat *arg)
8128
8893
  {
8129
8894
  int toggle;
@@ -8136,7 +8901,7 @@ mdb_env_stat(MDB_env *env, MDB_stat *arg)
8136
8901
  return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
8137
8902
  }
8138
8903
 
8139
- int
8904
+ int ESECT
8140
8905
  mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8141
8906
  {
8142
8907
  int toggle;
@@ -8145,7 +8910,7 @@ mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8145
8910
  return EINVAL;
8146
8911
 
8147
8912
  toggle = mdb_env_pick_meta(env);
8148
- arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0;
8913
+ arg->me_mapaddr = env->me_metas[toggle]->mm_address;
8149
8914
  arg->me_mapsize = env->me_mapsize;
8150
8915
  arg->me_maxreaders = env->me_maxreaders;
8151
8916
 
@@ -8187,8 +8952,9 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8187
8952
  MDB_val key, data;
8188
8953
  MDB_dbi i;
8189
8954
  MDB_cursor mc;
8955
+ MDB_db dummy;
8190
8956
  int rc, dbflag, exact;
8191
- unsigned int unused = 0;
8957
+ unsigned int unused = 0, seq;
8192
8958
  size_t len;
8193
8959
 
8194
8960
  if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
@@ -8256,7 +9022,6 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8256
9022
  return MDB_INCOMPATIBLE;
8257
9023
  } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
8258
9024
  /* Create if requested */
8259
- MDB_db dummy;
8260
9025
  data.mv_size = sizeof(MDB_db);
8261
9026
  data.mv_data = &dummy;
8262
9027
  memset(&dummy, 0, sizeof(dummy));
@@ -8273,6 +9038,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db
8273
9038
  txn->mt_dbxs[slot].md_name.mv_size = len;
8274
9039
  txn->mt_dbxs[slot].md_rel = NULL;
8275
9040
  txn->mt_dbflags[slot] = dbflag;
9041
+ /* txn-> and env-> are the same in read txns, use
9042
+ * tmp variable to avoid undefined assignment
9043
+ */
9044
+ seq = ++txn->mt_env->me_dbiseqs[slot];
9045
+ txn->mt_dbiseqs[slot] = seq;
9046
+
8276
9047
  memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
8277
9048
  *dbi = slot;
8278
9049
  mdb_default_cmp(txn, slot);
@@ -8307,10 +9078,14 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
8307
9078
  if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
8308
9079
  return;
8309
9080
  ptr = env->me_dbxs[dbi].md_name.mv_data;
8310
- env->me_dbxs[dbi].md_name.mv_data = NULL;
8311
- env->me_dbxs[dbi].md_name.mv_size = 0;
8312
- env->me_dbflags[dbi] = 0;
8313
- free(ptr);
9081
+ /* If there was no name, this was already closed */
9082
+ if (ptr) {
9083
+ env->me_dbxs[dbi].md_name.mv_data = NULL;
9084
+ env->me_dbxs[dbi].md_name.mv_size = 0;
9085
+ env->me_dbflags[dbi] = 0;
9086
+ env->me_dbiseqs[dbi]++;
9087
+ free(ptr);
9088
+ }
8314
9089
  }
8315
9090
 
8316
9091
  int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
@@ -8420,6 +9195,9 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
8420
9195
  if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
8421
9196
  return EACCES;
8422
9197
 
9198
+ if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi))
9199
+ return MDB_BAD_DBI;
9200
+
8423
9201
  rc = mdb_cursor_open(txn, dbi, &mc);
8424
9202
  if (rc)
8425
9203
  return rc;
@@ -8493,12 +9271,14 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
8493
9271
  return MDB_SUCCESS;
8494
9272
  }
8495
9273
 
8496
- int mdb_env_get_maxkeysize(MDB_env *env)
9274
+ int ESECT
9275
+ mdb_env_get_maxkeysize(MDB_env *env)
8497
9276
  {
8498
9277
  return ENV_MAXKEY(env);
8499
9278
  }
8500
9279
 
8501
- int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
9280
+ int ESECT
9281
+ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8502
9282
  {
8503
9283
  unsigned int i, rdrs;
8504
9284
  MDB_reader *mr;
@@ -8538,7 +9318,8 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8538
9318
  /** Insert pid into list if not already present.
8539
9319
  * return -1 if already present.
8540
9320
  */
8541
- static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
9321
+ static int ESECT
9322
+ mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8542
9323
  {
8543
9324
  /* binary search of pid in list */
8544
9325
  unsigned base = 0;
@@ -8574,7 +9355,8 @@ static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8574
9355
  return 0;
8575
9356
  }
8576
9357
 
8577
- int mdb_reader_check(MDB_env *env, int *dead)
9358
+ int ESECT
9359
+ mdb_reader_check(MDB_env *env, int *dead)
8578
9360
  {
8579
9361
  unsigned int i, j, rdrs;
8580
9362
  MDB_reader *mr;