@nxtedition/rocksdb 13.5.13 → 15.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/binding.cc +55 -180
  2. package/binding.gyp +2 -2
  3. package/chained-batch.js +9 -16
  4. package/deps/rocksdb/rocksdb/BUCK +18 -1
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  6. package/deps/rocksdb/rocksdb/Makefile +20 -9
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
  17. package/deps/rocksdb/rocksdb/db/c.cc +207 -0
  18. package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
  19. package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
  20. package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
  21. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
  47. package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
  48. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
  49. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
  50. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
  51. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
  52. package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
  53. package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
  54. package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
  55. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
  57. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
  61. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
  63. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
  65. package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
  66. package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
  67. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
  68. package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
  69. package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
  74. package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
  75. package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
  90. package/deps/rocksdb/rocksdb/env/env.cc +12 -0
  91. package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
  92. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
  94. package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
  95. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
  98. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
  100. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
  102. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
  104. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
  110. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
  112. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
  113. package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
  115. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
  116. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
  119. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
  120. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
  122. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  128. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
  129. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
  130. package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
  131. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
  132. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
  133. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
  134. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
  135. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
  136. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  137. package/deps/rocksdb/rocksdb/options/options.cc +2 -0
  138. package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
  139. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
  140. package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
  141. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
  142. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
  143. package/deps/rocksdb/rocksdb/src.mk +8 -2
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
  147. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
  153. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
  154. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
  155. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
  156. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
  157. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
  158. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  160. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
  161. package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
  162. package/deps/rocksdb/rocksdb/table/format.cc +6 -12
  163. package/deps/rocksdb/rocksdb/table/format.h +10 -0
  164. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  165. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
  166. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
  167. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
  168. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
  169. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
  170. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
  171. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  172. package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
  173. package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
  174. package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
  175. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
  176. package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
  177. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
  178. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
  179. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
  180. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
  181. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
  182. package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
  183. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
  184. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
  185. package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
  186. package/deps/rocksdb/rocksdb/util/coding.h +3 -3
  187. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
  188. package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
  189. package/deps/rocksdb/rocksdb/util/compression.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
  191. package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
  192. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
  193. package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
  194. package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
  195. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
  196. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
  197. package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
  198. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  199. package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
  200. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
  201. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  202. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
  203. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
  204. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
  205. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
  206. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
  207. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
  208. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
  209. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
  210. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
  211. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
  212. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
  213. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
  214. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
  217. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
  218. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
  219. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
  220. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
  221. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
  222. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
  223. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
  224. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
  225. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
  226. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
  227. package/index.js +18 -42
  228. package/package.json +1 -1
  229. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  230. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  231. package/util.h +38 -12
  232. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
@@ -13,7 +13,6 @@
13
13
  #include "rocksdb/slice.h"
14
14
  #include "rocksdb/utilities/transaction_db_mutex.h"
15
15
  #include "test_util/sync_point.h"
16
- #include "util/cast_util.h"
17
16
  #include "util/hash.h"
18
17
  #include "util/thread_local.h"
19
18
  #include "utilities/transactions/pessimistic_transaction_db.h"
@@ -21,36 +20,275 @@
21
20
 
22
21
  namespace ROCKSDB_NAMESPACE {
23
22
 
23
+ constexpr bool kDebugLog = false;
24
+
25
+ // KeyLockWaiter represents a waiter for a key lock. It contains a conditional
26
+ // variable to allow waiter to wait for the key lock. It also contains other
27
+ // metadata about the waiter such as transaction id, lock type etc.
28
+ struct KeyLockWaiter {
29
+ KeyLockWaiter(std::shared_ptr<TransactionDBCondVar> c, TransactionID i,
30
+ bool ex)
31
+ : id(i), exclusive(ex), ready(false), cv(std::move(c)) {}
32
+
33
+ // disable copy constructor and assignment operator, move and move
34
+ // assignment
35
+ KeyLockWaiter(const KeyLockWaiter&) = delete;
36
+ KeyLockWaiter& operator=(const KeyLockWaiter&) = delete;
37
+ KeyLockWaiter(KeyLockWaiter&&) = delete;
38
+ KeyLockWaiter& operator=(KeyLockWaiter&&) = delete;
39
+
40
+ ~KeyLockWaiter() = default;
41
+
42
+ // Reset the waiter to be used again
43
+ void Reset(TransactionID i, bool e) {
44
+ id = i;
45
+ exclusive = e;
46
+ ready = false;
47
+ }
48
+
49
+ // Check whether the waiter has been notified that it is its turn to take the
50
+ // lock
51
+ bool IsReady() const { return ready; }
52
+
53
+ // Wait until its turn to take the lock forever
54
+ Status Wait(std::shared_ptr<TransactionDBMutex>& mutex) {
55
+ // Mutex is already locked by caller
56
+ // Check ready flag before wait
57
+ if (ready) {
58
+ return Status::OK();
59
+ }
60
+ return AfterWait(cv->Wait(mutex));
61
+ }
62
+
63
+ // Wait until its turn to take the lock within timeout_us
64
+ Status WaitFor(std::shared_ptr<TransactionDBMutex>& mutex,
65
+ int64_t timeout_us) {
66
+ // Mutex is already locked by caller
67
+ // Check ready flag before wait
68
+ if (ready) {
69
+ return Status::OK();
70
+ }
71
+ return AfterWait(cv->WaitFor(mutex, timeout_us));
72
+ }
73
+
74
+ // Notify the waiter to take the lock
75
+ void Notify() {
76
+ // Mutex is already locked by caller
77
+ ready = true;
78
+ cv->Notify();
79
+ }
80
+
81
+ TransactionID id;
82
+ bool exclusive;
83
+
84
+ private:
85
+ Status AfterWait(Status wait_result) {
86
+ if (wait_result.ok() || wait_result.IsTimedOut()) {
87
+ // check ready again after wake up.
88
+ if (ready) {
89
+ return Status::OK();
90
+ } else {
91
+ return Status::TimedOut(Status::SubCode::kMutexTimeout);
92
+ }
93
+ } else {
94
+ return wait_result;
95
+ }
96
+ }
97
+
98
+ // Track whether the waiter has been woken up explicitly.
99
+ bool ready;
100
+ // TODO(Xingbo), Switch to std::binary_semaphore, once we have c++20
101
+ // semaphore is likely more performant than mutex + cv.
102
+ // Although we will also need to implement TransactionDBSemaphore, which would
103
+ // be required if external system wants to do instrumented lock wait tracking
104
+ std::shared_ptr<TransactionDBCondVar> cv;
105
+ };
106
+
24
107
  struct LockInfo {
108
+ LockInfo(TransactionID id, uint64_t time, bool ex)
109
+ : exclusive(ex), expiration_time(time) {
110
+ txn_ids.push_back(id);
111
+ }
112
+
113
+ DECLARE_DEFAULT_MOVES(LockInfo);
114
+
25
115
  bool exclusive;
26
116
  autovector<TransactionID> txn_ids;
27
117
 
28
118
  // Transaction locks are not valid after this time in us
29
119
  uint64_t expiration_time;
30
120
 
31
- LockInfo(TransactionID id, uint64_t time, bool ex)
32
- : exclusive(ex), expiration_time(time) {
33
- txn_ids.push_back(id);
121
+ // waiter queue for this key
122
+ // TODO xingbo, use intrusive list to avoid extra memory allocation
123
+ std::unique_ptr<std::list<KeyLockWaiter*>> waiter_queue;
124
+ };
125
+
126
+ // Print debug info for lock waiter wake up action.
127
+ void DebugWakeUpWaiter(TransactionID txn_id, TransactionID waiter_id,
128
+ const std::string& key, const std::string& msg) {
129
+ if (kDebugLog) {
130
+ // print which waiter got woken up
131
+ fprintf(stderr,
132
+ "Txn %" PRIu64 ": wake up next waiter on %s Txn %" PRIu64
133
+ " on key %s\n",
134
+ txn_id, msg.c_str(), waiter_id, key.c_str());
135
+ fflush(stderr);
136
+ }
137
+ }
138
+
139
+ // Key lock waiter context, used for free the lock automatically
140
+ struct KeyLockWaiterContext {
141
+ // When a lock waiter is aborted due to dead lock or time out, this function
142
+ // is used to wake up the waiters after it, if they could proceed.
143
+ void TryWakeUpNextWaiters(const LockInfo& lock_info, const std::string& key) {
144
+ if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
145
+ bool wake_up_next_shared_waiters = false;
146
+
147
+ if (lock_waiter == waiter_queue->begin()) {
148
+ // if lock waiter is at the head of the queue, check the current lock
149
+ // status. If it is exclusive lock, no waiter should be woken up. other
150
+ // wise, try to wake up shared lock waiters on the right side of itself.
151
+ wake_up_next_shared_waiters = !lock_info.exclusive;
152
+ } else {
153
+ // if lock waiter is not at the head of the queue, check the previous
154
+ // lock status. If it is active and shared, it should try to wake up the
155
+ // shared lock waiter on the right side of itself.
156
+ auto lock_waiter_prev = lock_waiter;
157
+ lock_waiter_prev--;
158
+ wake_up_next_shared_waiters =
159
+ (*lock_waiter_prev)->IsReady() && !(*lock_waiter_prev)->exclusive;
160
+ }
161
+
162
+ if (wake_up_next_shared_waiters) {
163
+ // Go through all the waiters on the right side of the lock waiter and
164
+ // wake up the shared lock waiter until the end of the queue or
165
+ // encountered an exclusive lock waiter.
166
+ auto lock_waiter_next = lock_waiter;
167
+ lock_waiter_next++;
168
+ while (lock_waiter_next != waiter_queue->end() &&
169
+ !(*lock_waiter_next)->exclusive) {
170
+ (*lock_waiter_next)->Notify();
171
+ DebugWakeUpWaiter((*lock_waiter)->id, (*lock_waiter_next)->id, key,
172
+ "TryWakeUpNextWaiters");
173
+ lock_waiter_next++;
174
+ }
175
+ }
176
+ }
34
177
  }
35
- LockInfo(const LockInfo& lock_info)
36
178
 
37
- = default;
38
- void operator=(const LockInfo& lock_info) {
39
- exclusive = lock_info.exclusive;
40
- txn_ids = lock_info.txn_ids;
41
- expiration_time = lock_info.expiration_time;
179
+ ~KeyLockWaiterContext() {
180
+ if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
181
+ waiter_queue->erase(lock_waiter);
182
+ lock_waiter = waiter_queue->end();
183
+ }
184
+ waiter_queue = nullptr;
42
185
  }
43
- DECLARE_DEFAULT_MOVES(LockInfo);
186
+
187
+ // The waiter queue the lock waiter joined. Used for remove the waiter from
188
+ // the waiter queue.
189
+ std::list<KeyLockWaiter*>* waiter_queue = nullptr;
190
+ // The stable iterator that tracks the position of the waiter in the waiter
191
+ // queue. Used for remove the waiter from the waiter queue.
192
+ std::list<KeyLockWaiter*>::iterator lock_waiter;
44
193
  };
45
194
 
46
195
  struct LockMapStripe {
47
- explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
48
- stripe_mutex = factory->AllocateMutex();
49
- stripe_cv = factory->AllocateCondVar();
196
+ explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory,
197
+ ThreadLocalPtr& key_lock_waiter)
198
+ : mutex_factory_(std::move(factory)), key_lock_waiter_(key_lock_waiter) {
199
+ stripe_mutex = mutex_factory_->AllocateMutex();
200
+ stripe_cv = mutex_factory_->AllocateCondVar();
201
+
50
202
  assert(stripe_mutex);
51
203
  assert(stripe_cv);
52
204
  }
53
205
 
206
+ LockInfo* GetLockInfo(const std::string& key) {
207
+ auto lock_info_iter = keys.find(key);
208
+ if (lock_info_iter != keys.end()) {
209
+ return &lock_info_iter->second;
210
+ } else {
211
+ return nullptr;
212
+ }
213
+ }
214
+
215
+ // Wait until its turn to take the lock of this key within timeout_us.
216
+ // By default timeout_us == 0, which means wait forever
217
+ void JoinWaitQueue(LockInfo& lock_info, TransactionID id, bool exclusive,
218
+ bool isUpgrade, KeyLockWaiterContext& waiter_context) {
219
+ if (lock_info.waiter_queue == nullptr) {
220
+ // no waiter queue yet, create a new one
221
+ lock_info.waiter_queue = std::make_unique<std::list<KeyLockWaiter*>>();
222
+ }
223
+
224
+ auto waiter_queue = lock_info.waiter_queue.get();
225
+
226
+ // by default insert the new lock waiter at the end of the queue.
227
+ auto insert_point = waiter_queue->end();
228
+
229
+ if (isUpgrade) {
230
+ // If transaction is upgrading a shared lock to exclusive lock, prioritize
231
+ // it by moving its lock waiter before the first exclusive lock in the
232
+ // queue if there is one, or end of the queue if not exist. It will be
233
+ // able to acquire the lock after the other shared locks waiters at the
234
+ // front of queue acquired and released locks. This reduces the chance of
235
+ // deadlock, which makes transaction run more efficiently.
236
+
237
+ if (waiter_context.waiter_queue != nullptr) {
238
+ // If waiter_context is already initialized, it means current
239
+ // transaction already joined the lock queue. Don't move the lock
240
+ // position if it is already at the head of the queue or the lock
241
+ // waiters before it are ready to take the lock.
242
+ if (waiter_context.lock_waiter == waiter_queue->begin()) {
243
+ return;
244
+ }
245
+
246
+ auto prev_lock_waiter = waiter_context.lock_waiter;
247
+ prev_lock_waiter--;
248
+ if ((*prev_lock_waiter)->IsReady()) {
249
+ return;
250
+ }
251
+
252
+ // Remove existing lock waiter
253
+ waiter_queue->erase(waiter_context.lock_waiter);
254
+ }
255
+
256
+ // For upgrade, insert waiter either at the end of the queue or before the
257
+ // first exlusive lock waiter.
258
+ insert_point = waiter_queue->begin();
259
+ while ((insert_point != waiter_queue->end()) &&
260
+ (!(*insert_point)->exclusive)) {
261
+ insert_point++;
262
+ }
263
+ }
264
+
265
+ // Insert the new lock waiter
266
+ waiter_context.lock_waiter =
267
+ waiter_queue->insert(insert_point, GetKeyLockWaiter(id, exclusive));
268
+
269
+ waiter_context.waiter_queue = waiter_queue;
270
+ }
271
+
272
+ // Wait on an existing KeyLockWaiter until its turn to take the lock or
273
+ // timeout
274
+ Status WaitOnLock(std::list<KeyLockWaiter*>::iterator& lock_waiter,
275
+ int64_t timeout_us = 0) {
276
+ Status ret;
277
+ if (timeout_us == 0) {
278
+ ret = (*lock_waiter)->Wait(stripe_mutex);
279
+ } else {
280
+ ret = (*lock_waiter)->WaitFor(stripe_mutex, timeout_us);
281
+ }
282
+ return ret;
283
+ }
284
+
285
+ void ReleaseLastLockHolder(
286
+ LockInfo& lock_info,
287
+ UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
288
+ LockMap* lock_map, TransactionID txn_id, const std::string& key,
289
+ const int64_t max_num_locks, autovector<TransactionID>& txns,
290
+ autovector<TransactionID>::iterator& txn_it);
291
+
54
292
  // Mutex must be held before modifying keys map
55
293
  std::shared_ptr<TransactionDBMutex> stripe_mutex;
56
294
 
@@ -60,16 +298,39 @@ struct LockMapStripe {
60
298
  // Locked keys mapped to the info about the transactions that locked them.
61
299
  // TODO(agiardullo): Explore performance of other data structures.
62
300
  UnorderedMap<std::string, LockInfo> keys;
301
+
302
+ private:
303
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
304
+
305
+ // key lock waiter, wrapped in thread local for reusing it across
306
+ // transactions.
307
+ ThreadLocalPtr& key_lock_waiter_;
308
+
309
+ // Return key lock waiter stored in thread local var, create on first use
310
+ KeyLockWaiter* GetKeyLockWaiter(TransactionID id, bool exclusive) {
311
+ KeyLockWaiter* waiter = nullptr;
312
+ if (key_lock_waiter_.Get() == nullptr) {
313
+ // create key lock waiter
314
+ key_lock_waiter_.Reset(
315
+ new KeyLockWaiter(mutex_factory_->AllocateCondVar(), id, exclusive));
316
+ waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
317
+ } else {
318
+ waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
319
+ waiter->Reset(id, exclusive);
320
+ }
321
+ return waiter;
322
+ }
63
323
  };
64
324
 
65
325
  // Map of #num_stripes LockMapStripes
66
326
  struct LockMap {
67
327
  explicit LockMap(size_t num_stripes,
68
- std::shared_ptr<TransactionDBMutexFactory> factory)
69
- : num_stripes_(num_stripes) {
328
+ std::shared_ptr<TransactionDBMutexFactory> factory,
329
+ ThreadLocalPtr& key_lock_waiter)
330
+ : num_stripes_(num_stripes), key_lock_waiter_(key_lock_waiter) {
70
331
  lock_map_stripes_.reserve(num_stripes);
71
332
  for (size_t i = 0; i < num_stripes; i++) {
72
- LockMapStripe* stripe = new LockMapStripe(factory);
333
+ LockMapStripe* stripe = new LockMapStripe(factory, key_lock_waiter_);
73
334
  lock_map_stripes_.push_back(stripe);
74
335
  }
75
336
  }
@@ -78,20 +339,80 @@ struct LockMap {
78
339
  for (auto stripe : lock_map_stripes_) {
79
340
  delete stripe;
80
341
  }
342
+ // Validate total locked key count is 0, when lock map is destructed.
343
+ assert(locked_key_cnt.LoadRelaxed() == 0);
81
344
  }
82
345
 
83
346
  // Number of sepearate LockMapStripes to create, each with their own Mutex
84
347
  const size_t num_stripes_;
348
+ ThreadLocalPtr& key_lock_waiter_;
85
349
 
86
350
  // Count of keys that are currently locked in this column family.
351
+ // Note that multiple shared locks on the same key is counted as 1 lock.
87
352
  // (Only maintained if PointLockManager::max_num_locks_ is positive.)
88
- std::atomic<int64_t> lock_cnt{0};
353
+ RelaxedAtomic<int64_t> locked_key_cnt{0};
89
354
 
90
355
  std::vector<LockMapStripe*> lock_map_stripes_;
91
356
 
92
357
  size_t GetStripe(const std::string& key) const;
93
358
  };
94
359
 
360
+ inline void RemoveTransaction(autovector<TransactionID>& txns,
361
+ autovector<TransactionID>::iterator& txn_it) {
362
+ if (txns.size() > 1) {
363
+ auto last_it = txns.end() - 1;
364
+ if (txn_it != last_it) {
365
+ *txn_it = *last_it;
366
+ }
367
+ }
368
+ txns.pop_back();
369
+ }
370
+
371
+ void LockMapStripe::ReleaseLastLockHolder(
372
+ LockInfo& lock_info,
373
+ UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
374
+ LockMap* lock_map, TransactionID txn_id, const std::string& key,
375
+ const int64_t max_num_locks, autovector<TransactionID>& txns,
376
+ autovector<TransactionID>::iterator& txn_it) {
377
+ // check whether there is other waiting transactions
378
+ if (lock_info.waiter_queue == nullptr || lock_info.waiter_queue->empty()) {
379
+ keys.erase(stripe_iter);
380
+ if (max_num_locks > 0) {
381
+ // Maintain lock count if there is a limit on the number of
382
+ // locks.
383
+ assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
384
+ lock_map->locked_key_cnt.FetchSubRelaxed(1);
385
+ }
386
+ } else {
387
+ // there are waiters in the queue, so we need to wake the next
388
+ // one up
389
+ RemoveTransaction(txns, txn_it);
390
+ // loop through the waiter queue and wake up all the shared lock
391
+ // waiters until the first exclusive lock waiter, or wake up the
392
+ // first waiter, if it is waiting for an exclusive lock.
393
+ bool first_waiter = true;
394
+ for (auto& waiter : *lock_info.waiter_queue) {
395
+ if (waiter->exclusive) {
396
+ if (first_waiter) {
397
+ // the first waiter is an exclusive lock waiter, wake it
398
+ // up Note that they are only notified, but not removed
399
+ // from the waiter queue. This allows new transaction to
400
+ // be aware that there are waiters ahead of them.
401
+ waiter->Notify();
402
+ DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey X waiter");
403
+ }
404
+ // found the first exclusive lock waiter, stop
405
+ break;
406
+ } else {
407
+ // wake up the shared lock waiter
408
+ waiter->Notify();
409
+ DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey S waiter");
410
+ }
411
+ first_waiter = false;
412
+ }
413
+ }
414
+ }
415
+
95
416
  namespace {
96
417
  void UnrefLockMapsCache(void* ptr) {
97
418
  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
@@ -99,6 +420,10 @@ void UnrefLockMapsCache(void* ptr) {
99
420
  static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
100
421
  delete lock_maps_cache;
101
422
  }
423
+ void UnrefKeyLockWaiter(void* ptr) {
424
+ auto key_lock_waiter = static_cast<KeyLockWaiter*>(ptr);
425
+ delete key_lock_waiter;
426
+ }
102
427
  } // anonymous namespace
103
428
 
104
429
  PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
@@ -107,6 +432,7 @@ PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
107
432
  default_num_stripes_(opt.num_stripes),
108
433
  max_num_locks_(opt.max_num_locks),
109
434
  lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
435
+ key_lock_waiter_(&UnrefKeyLockWaiter),
110
436
  dlock_buffer_(opt.max_num_deadlocks),
111
437
  mutex_factory_(opt.custom_mutex_factory
112
438
  ? opt.custom_mutex_factory
@@ -122,7 +448,8 @@ void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) {
122
448
 
123
449
  if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) {
124
450
  lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>(
125
- default_num_stripes_, mutex_factory_));
451
+ default_num_stripes_, mutex_factory_,
452
+ key_lock_waiter_));
126
453
  } else {
127
454
  // column_family already exists in lock map
128
455
  assert(false);
@@ -242,16 +569,18 @@ Status PointLockManager::TryLock(PessimisticTransaction* txn,
242
569
 
243
570
  LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive);
244
571
  int64_t timeout = txn->GetLockTimeout();
572
+ int64_t deadlock_timeout_us = txn->GetDeadlockTimeout();
245
573
 
246
574
  return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env,
247
- timeout, lock_info);
575
+ timeout, deadlock_timeout_us, lock_info);
248
576
  }
249
577
 
250
578
  // Helper function for TryLock().
251
579
  Status PointLockManager::AcquireWithTimeout(
252
580
  PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
253
581
  ColumnFamilyId column_family_id, const std::string& key, Env* env,
254
- int64_t timeout, const LockInfo& lock_info) {
582
+ int64_t timeout, int64_t /*deadlock_timeout_us*/,
583
+ const LockInfo& lock_info) {
255
584
  Status result;
256
585
  uint64_t end_time = 0;
257
586
 
@@ -322,9 +651,6 @@ Status PointLockManager::AcquireWithTimeout(
322
651
  // instead of exiting this while loop below.
323
652
  uint64_t now = env->NowMicros();
324
653
  if (static_cast<uint64_t>(cv_end_time) > now) {
325
- // This may be invoked multiple times since we divide
326
- // the time into smaller intervals.
327
- (void)ROCKSDB_THREAD_YIELD_CHECK_ABORT();
328
654
  result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
329
655
  cv_end_time - now);
330
656
  cv_wait_fail = !result.ok() && !result.IsTimedOut();
@@ -367,6 +693,130 @@ Status PointLockManager::AcquireWithTimeout(
367
693
  return result;
368
694
  }
369
695
 
696
+ // Try to lock this key after we have acquired the mutex.
697
+ // Sets *expire_time to the expiration time in microseconds
698
+ // or 0 if no expiration.
699
+ //
700
+ // Returns Status::TimeOut if the lock cannot be acquired due to it being
701
+ // held by other transactions, `txn_ids` will be populated with the id of
702
+ // transactions that hold the lock, excluding lock_info.txn_ids[0].
703
+ // Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
704
+ // reaching per CF limit on the number of locks.
705
+ //
706
+ // REQUIRED: Stripe mutex must be held. txn_ids must be empty.
707
+ Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
708
+ const std::string& key, Env* env,
709
+ const LockInfo& txn_lock_info,
710
+ uint64_t* expire_time,
711
+ autovector<TransactionID>* txn_ids) {
712
+ assert(txn_lock_info.txn_ids.size() == 1);
713
+ assert(txn_ids && txn_ids->empty());
714
+
715
+ Status result;
716
+ // Check if this key is already locked
717
+ auto stripe_iter = stripe->keys.find(key);
718
+ if (stripe_iter != stripe->keys.end()) {
719
+ // Lock already held
720
+ auto& lock_info = stripe_iter->second;
721
+ assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
722
+
723
+ if (lock_info.exclusive || txn_lock_info.exclusive) {
724
+ if (lock_info.txn_ids.size() == 1 &&
725
+ lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
726
+ // The list contains one txn and we're it, so just take it.
727
+ lock_info.exclusive = txn_lock_info.exclusive;
728
+ lock_info.expiration_time = txn_lock_info.expiration_time;
729
+ } else {
730
+ // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
731
+ // it's there for a shared lock with multiple holders which was not
732
+ // caught in the first case.
733
+ if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
734
+ expire_time)) {
735
+ // lock is expired, can steal it
736
+ lock_info.txn_ids = txn_lock_info.txn_ids;
737
+ lock_info.exclusive = txn_lock_info.exclusive;
738
+ lock_info.expiration_time = txn_lock_info.expiration_time;
739
+ // lock_cnt does not change
740
+ } else {
741
+ result = Status::TimedOut(Status::SubCode::kLockTimeout);
742
+ for (auto id : lock_info.txn_ids) {
743
+ // A transaction is not blocked by itself
744
+ if (id != txn_lock_info.txn_ids[0]) {
745
+ txn_ids->push_back(id);
746
+ }
747
+ }
748
+ }
749
+ }
750
+ } else {
751
+ // We are requesting shared access to a shared lock, so just grant it.
752
+ lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
753
+ // Using std::max means that expiration time never goes down even when
754
+ // a transaction is removed from the list. The correct solution would be
755
+ // to track expiry for every transaction, but this would also work for
756
+ // now.
757
+ lock_info.expiration_time =
758
+ std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
759
+ }
760
+ } else {
761
+ // Lock not held.
762
+ // Check lock limit
763
+ if (max_num_locks_ > 0 &&
764
+ lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
765
+ result = Status::LockLimit();
766
+ } else {
767
+ // acquire lock
768
+ stripe->keys.try_emplace(key, txn_lock_info.txn_ids[0],
769
+ txn_lock_info.expiration_time,
770
+ txn_lock_info.exclusive);
771
+
772
+ // Maintain lock count if there is a limit on the number of locks
773
+ if (max_num_locks_ > 0) {
774
+ lock_map->locked_key_cnt.FetchAddRelaxed(1);
775
+ }
776
+ }
777
+ }
778
+
779
+ return result;
780
+ }
781
+
782
+ void PointLockManager::UnLockKey(PessimisticTransaction* txn,
783
+ const std::string& key, LockMapStripe* stripe,
784
+ LockMap* lock_map, Env* env) {
785
+ #ifdef NDEBUG
786
+ (void)env;
787
+ #endif
788
+ TransactionID txn_id = txn->GetID();
789
+
790
+ auto stripe_iter = stripe->keys.find(key);
791
+ if (stripe_iter != stripe->keys.end()) {
792
+ auto& txns = stripe_iter->second.txn_ids;
793
+ auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
794
+ // Found the key we locked. unlock it.
795
+ if (txn_it != txns.end()) {
796
+ if (txns.size() == 1) {
797
+ stripe->keys.erase(stripe_iter);
798
+ } else {
799
+ auto last_it = txns.end() - 1;
800
+ if (txn_it != last_it) {
801
+ *txn_it = *last_it;
802
+ }
803
+ txns.pop_back();
804
+ }
805
+
806
+ if (max_num_locks_ > 0) {
807
+ // Maintain lock count if there is a limit on the number of locks.
808
+ assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
809
+ lock_map->locked_key_cnt.FetchSubRelaxed(1);
810
+ }
811
+ }
812
+ } else {
813
+ // This key is either not locked or locked by someone else. This should
814
+ // only happen if the unlocking transaction has expired.
815
+ assert(txn->GetExpirationTime() > 0 &&
816
+ txn->GetExpirationTime() < env->NowMicros());
817
+ }
818
+ }
819
+
370
820
  void PointLockManager::DecrementWaiters(
371
821
  const PessimisticTransaction* txn,
372
822
  const autovector<TransactionID>& wait_ids) {
@@ -484,143 +934,22 @@ bool PointLockManager::IncrementWaiters(
484
934
  return true;
485
935
  }
486
936
 
487
- // Try to lock this key after we have acquired the mutex.
488
- // Sets *expire_time to the expiration time in microseconds
489
- // or 0 if no expiration.
490
- //
491
- // Returns Status::TimeOut if the lock cannot be acquired due to it being
492
- // held by other transactions, `txn_ids` will be populated with the id of
493
- // transactions that hold the lock, excluding lock_info.txn_ids[0].
494
- // Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
495
- // reaching per CF limit on the number of locks.
496
- //
497
- // REQUIRED: Stripe mutex must be held. txn_ids must be empty.
498
- Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
499
- const std::string& key, Env* env,
500
- const LockInfo& txn_lock_info,
501
- uint64_t* expire_time,
502
- autovector<TransactionID>* txn_ids) {
503
- assert(txn_lock_info.txn_ids.size() == 1);
504
- assert(txn_ids && txn_ids->empty());
937
+ void PointLockManager::UnLock(PessimisticTransaction* txn,
938
+ ColumnFamilyId column_family_id,
939
+ const std::string& key, Env* env) {
940
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
941
+ LockMap* lock_map = lock_map_ptr.get();
942
+ if (lock_map == nullptr) {
943
+ // Column Family must have been dropped.
944
+ return;
945
+ }
505
946
 
506
- Status result;
507
- // Check if this key is already locked
508
- auto stripe_iter = stripe->keys.find(key);
509
- if (stripe_iter != stripe->keys.end()) {
510
- // Lock already held
511
- LockInfo& lock_info = stripe_iter->second;
512
- assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
947
+ // Lock the mutex for the stripe that this key hashes to
948
+ size_t stripe_num = lock_map->GetStripe(key);
949
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
950
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
513
951
 
514
- if (lock_info.exclusive || txn_lock_info.exclusive) {
515
- if (lock_info.txn_ids.size() == 1 &&
516
- lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
517
- // The list contains one txn and we're it, so just take it.
518
- lock_info.exclusive = txn_lock_info.exclusive;
519
- lock_info.expiration_time = txn_lock_info.expiration_time;
520
- } else {
521
- // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
522
- // it's there for a shared lock with multiple holders which was not
523
- // caught in the first case.
524
- if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
525
- expire_time)) {
526
- // lock is expired, can steal it
527
- lock_info.txn_ids = txn_lock_info.txn_ids;
528
- lock_info.exclusive = txn_lock_info.exclusive;
529
- lock_info.expiration_time = txn_lock_info.expiration_time;
530
- // lock_cnt does not change
531
- } else {
532
- result = Status::TimedOut(Status::SubCode::kLockTimeout);
533
- for (auto id : lock_info.txn_ids) {
534
- // A transaction is not blocked by itself
535
- if (id != txn_lock_info.txn_ids[0]) {
536
- txn_ids->push_back(id);
537
- }
538
- }
539
- }
540
- }
541
- } else {
542
- // We are requesting shared access to a shared lock, so just grant it.
543
- lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
544
- // Using std::max means that expiration time never goes down even when
545
- // a transaction is removed from the list. The correct solution would be
546
- // to track expiry for every transaction, but this would also work for
547
- // now.
548
- lock_info.expiration_time =
549
- std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
550
- }
551
- } else { // Lock not held.
552
- // Check lock limit
553
- if (max_num_locks_ > 0 &&
554
- lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
555
- result = Status::LockLimit();
556
- } else {
557
- // acquire lock
558
- stripe->keys.emplace(key, txn_lock_info);
559
-
560
- // Maintain lock count if there is a limit on the number of locks
561
- if (max_num_locks_) {
562
- lock_map->lock_cnt++;
563
- }
564
- }
565
- }
566
-
567
- return result;
568
- }
569
-
570
- void PointLockManager::UnLockKey(PessimisticTransaction* txn,
571
- const std::string& key, LockMapStripe* stripe,
572
- LockMap* lock_map, Env* env) {
573
- #ifdef NDEBUG
574
- (void)env;
575
- #endif
576
- TransactionID txn_id = txn->GetID();
577
-
578
- auto stripe_iter = stripe->keys.find(key);
579
- if (stripe_iter != stripe->keys.end()) {
580
- auto& txns = stripe_iter->second.txn_ids;
581
- auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
582
- // Found the key we locked. unlock it.
583
- if (txn_it != txns.end()) {
584
- if (txns.size() == 1) {
585
- stripe->keys.erase(stripe_iter);
586
- } else {
587
- auto last_it = txns.end() - 1;
588
- if (txn_it != last_it) {
589
- *txn_it = *last_it;
590
- }
591
- txns.pop_back();
592
- }
593
-
594
- if (max_num_locks_ > 0) {
595
- // Maintain lock count if there is a limit on the number of locks.
596
- assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
597
- lock_map->lock_cnt--;
598
- }
599
- }
600
- } else {
601
- // This key is either not locked or locked by someone else. This should
602
- // only happen if the unlocking transaction has expired.
603
- assert(txn->GetExpirationTime() > 0 &&
604
- txn->GetExpirationTime() < env->NowMicros());
605
- }
606
- }
607
-
608
- void PointLockManager::UnLock(PessimisticTransaction* txn,
609
- ColumnFamilyId column_family_id,
610
- const std::string& key, Env* env) {
611
- std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
612
- LockMap* lock_map = lock_map_ptr.get();
613
- if (lock_map == nullptr) {
614
- // Column Family must have been dropped.
615
- return;
616
- }
617
-
618
- // Lock the mutex for the stripe that this key hashes to
619
- size_t stripe_num = lock_map->GetStripe(key);
620
- assert(lock_map->lock_map_stripes_.size() > stripe_num);
621
- LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
622
-
623
- stripe->stripe_mutex->Lock().PermitUncheckedError();
952
+ stripe->stripe_mutex->Lock().AssertOK();
624
953
  UnLockKey(txn, key, stripe, lock_map, env);
625
954
  stripe->stripe_mutex->UnLock();
626
955
 
@@ -662,7 +991,7 @@ void PointLockManager::UnLock(PessimisticTransaction* txn,
662
991
  assert(lock_map->lock_map_stripes_.size() > stripe_num);
663
992
  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
664
993
 
665
- stripe->stripe_mutex->Lock().PermitUncheckedError();
994
+ stripe->stripe_mutex->Lock().AssertOK();
666
995
 
667
996
  for (const std::string* key : stripe_keys) {
668
997
  UnLockKey(txn, *key, stripe, lock_map, env);
@@ -693,7 +1022,7 @@ PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
693
1022
  const auto& stripes = lock_maps_[i]->lock_map_stripes_;
694
1023
  // Iterate and lock all stripes in ascending order.
695
1024
  for (const auto& j : stripes) {
696
- j->stripe_mutex->Lock().PermitUncheckedError();
1025
+ j->stripe_mutex->Lock().AssertOK();
697
1026
  for (const auto& it : j->keys) {
698
1027
  struct KeyLockInfo info;
699
1028
  info.exclusive = it.second.exclusive;
@@ -745,4 +1074,758 @@ void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
745
1074
  // no-op
746
1075
  }
747
1076
 
1077
+ // PerKeyPointLockManager implementation
1078
+ PerKeyPointLockManager::PerKeyPointLockManager(PessimisticTransactionDB* db,
1079
+ const TransactionDBOptions& opt)
1080
+ : PointLockManager(db, opt) {}
1081
+
1082
+ void DebugLockStatus(TransactionID my_txn_id, const LockInfo& lock_info,
1083
+ const std::string& key,
1084
+ const KeyLockWaiterContext& key_lock_waiter_ctx) {
1085
+ if (kDebugLog) {
1086
+ char msg[512];
1087
+ size_t offset = 0;
1088
+
1089
+ // print lock holders
1090
+ offset += snprintf(msg + offset, sizeof(msg),
1091
+ "Txn %" PRIu64 ": LockStatus key %s: holder [",
1092
+ my_txn_id, key.c_str());
1093
+ for (const auto& txn_id : lock_info.txn_ids) {
1094
+ offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
1095
+ lock_info.exclusive ? "X" : "S", txn_id);
1096
+ }
1097
+
1098
+ // print waiter queue
1099
+ offset += snprintf(msg + offset, sizeof(msg), "], waiter_queue [");
1100
+ for (auto it = key_lock_waiter_ctx.waiter_queue->begin();
1101
+ it != key_lock_waiter_ctx.waiter_queue->end(); it++) {
1102
+ offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
1103
+ (*it)->exclusive ? "X" : "S", (*it)->id);
1104
+ }
1105
+
1106
+ offset += snprintf(msg + offset, sizeof(msg), "]\n");
1107
+ fprintf(stderr, "%s", msg);
1108
+ fflush(stderr);
1109
+ }
1110
+ }
1111
+
1112
+ int64_t PerKeyPointLockManager::CalculateWaitEndTime(int64_t expire_time_hint,
1113
+ int64_t end_time) {
1114
+ int64_t cv_end_time = -1;
1115
+ if (expire_time_hint > 0 && end_time > 0) {
1116
+ cv_end_time = std::min(expire_time_hint, end_time);
1117
+ } else if (expire_time_hint > 0) {
1118
+ cv_end_time = expire_time_hint;
1119
+ } else if (end_time > 0) {
1120
+ cv_end_time = end_time;
1121
+ }
1122
+ return cv_end_time;
1123
+ }
1124
+
1125
+ // Acquire lock within timeout.
1126
+ // This function is similar to PointLockManger::AcquireWithTimeout with
1127
+ // following differences.
1128
+ //
1129
+ // If deadlock_timeout_us is not 0, it first performs a wait without doing dead
1130
+ // lock detection. This wait duration is specified by deadlock_timeout_us.
1131
+ // If this wait times out and it is still not able to acquire the lock, perform
1132
+ // the deadlock detection before wait again.
1133
+ //
1134
+ // It uses a per key lock waiter queue to handle lock waiting and wake up
1135
+ // efficiently. When a transaction is waiting for acquiring a lock on a key, it
1136
+ // joins a wait queue that is dedicated for this key. It will either timeout, or
1137
+ // get woken up when it is its turn to take the lock. This is more efficient
1138
+ // than the PointLockManger implementation where all lock waiters wait on the
1139
+ // same lock stripe cond var.
1140
+ Status PerKeyPointLockManager::AcquireWithTimeout(
1141
+ PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
1142
+ ColumnFamilyId column_family_id, const std::string& key, Env* env,
1143
+ int64_t timeout, int64_t deadlock_timeout_us,
1144
+ const LockInfo& txn_lock_info) {
1145
+ Status result;
1146
+ uint64_t end_time = 0;
1147
+ auto my_txn_id = txn_lock_info.txn_ids[0];
1148
+
1149
+ if (timeout > 0) {
1150
+ uint64_t start_time = env->NowMicros();
1151
+ end_time = start_time + timeout;
1152
+ }
1153
+
1154
+ if (timeout < 0) {
1155
+ // If timeout is negative, we wait indefinitely to acquire the lock
1156
+ result = stripe->stripe_mutex->Lock();
1157
+ } else {
1158
+ result = stripe->stripe_mutex->TryLockFor(timeout);
1159
+ }
1160
+
1161
+ if (!result.ok()) {
1162
+ // failed to acquire mutex
1163
+ return result;
1164
+ }
1165
+
1166
+ // Acquire lock if we are able to
1167
+ uint64_t expire_time_hint = 0;
1168
+ autovector<TransactionID> wait_ids;
1169
+ bool isUpgrade = false;
1170
+
1171
+ auto lock_info = stripe->GetLockInfo(key);
1172
+
1173
+ auto wait_before_deadlock_detection =
1174
+ txn->IsDeadlockDetect() && (deadlock_timeout_us > 0);
1175
+ result = AcquireLocked(
1176
+ lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
1177
+ // If wait before deadlock detection, it executes a fast path to save CPU
1178
+ // cycles, wait ids are not collected.
1179
+ wait_before_deadlock_detection ? nullptr : &wait_ids, &lock_info,
1180
+ &isUpgrade, true);
1181
+ if (!result.ok() && timeout != 0 &&
1182
+ /* No need to retry after reach lock limit or aborted */
1183
+ !result.IsLockLimit() && !result.IsAborted()) {
1184
+ assert(lock_info);
1185
+
1186
+ PERF_TIMER_GUARD(key_lock_wait_time);
1187
+ PERF_COUNTER_ADD(key_lock_wait_count, 1);
1188
+ // If we weren't able to acquire the lock, we will keep retrying as long
1189
+ // as the timeout allows.
1190
+ bool timed_out = false;
1191
+ bool cv_wait_fail = false;
1192
+
1193
+ KeyLockWaiterContext key_lock_waiter_ctx;
1194
+
1195
+ // Decide how long to wait
1196
+ auto cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
1197
+
1198
+ // We will try to wait a little bit before checking deadlock, as
1199
+ // deadlock check is expensive.
1200
+ if (wait_before_deadlock_detection) {
1201
+ int64_t now = env->NowMicros();
1202
+ if (cv_end_time < 0 || cv_end_time > now) {
1203
+ if (kDebugLog) {
1204
+ // print lock status before deadlock detection
1205
+ fprintf(stderr,
1206
+ "Txn %" PRIu64
1207
+ " wait before deadlock detection %s, exclusive lock "
1208
+ "%d\n",
1209
+ my_txn_id, key.c_str(), txn_lock_info.exclusive);
1210
+ fflush(stderr);
1211
+ }
1212
+ stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
1213
+ false, key_lock_waiter_ctx);
1214
+ DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
1215
+
1216
+ TEST_SYNC_POINT(
1217
+ "PerKeyPointLockManager::AcquireWithTimeout:"
1218
+ "WaitingTxnBeforeDeadLockDetection");
1219
+ result = stripe->WaitOnLock(
1220
+ key_lock_waiter_ctx.lock_waiter,
1221
+ std::min(cv_end_time - now, (int64_t)deadlock_timeout_us));
1222
+ assert(result.ok() || result.IsTimedOut());
1223
+ // Refresh lock info pointer, as this pointer is not guaranteed to be
1224
+ // stable in folly
1225
+ lock_info = stripe->GetLockInfo(key);
1226
+ // try to take a lock again to get wait ids after deadlock timeout
1227
+ result = AcquireLocked(lock_map, stripe, key, env, txn_lock_info,
1228
+ &expire_time_hint, &wait_ids, &lock_info,
1229
+ &isUpgrade, !result.ok());
1230
+ } else {
1231
+ // Already timed out
1232
+ timed_out = true;
1233
+ result = Status::TimedOut(Status::SubCode::kLockTimeout);
1234
+ }
1235
+ }
1236
+
1237
+ while (!result.ok() && !timed_out && !result.IsAborted()) {
1238
+ // Refresh wait end time
1239
+ cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
1240
+
1241
+ // We are dependent on a transaction to finish, so perform deadlock
1242
+ // detection.
1243
+ if (!wait_ids.empty()) {
1244
+ if (txn->IsDeadlockDetect()) {
1245
+ if (IncrementWaiters(txn, wait_ids, key, column_family_id,
1246
+ txn_lock_info.exclusive, env)) {
1247
+ result = Status::Busy(Status::SubCode::kDeadlock);
1248
+ break;
1249
+ }
1250
+ }
1251
+ txn->SetWaitingTxn(wait_ids, column_family_id, &key);
1252
+ }
1253
+
1254
+ TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn");
1255
+
1256
+ if (kDebugLog) {
1257
+ // print transaction lock status and wait ids
1258
+ char msg[512];
1259
+ size_t offset = 0;
1260
+ offset += snprintf(msg + offset, sizeof(msg),
1261
+ "Txn %" PRIu64
1262
+ " wait after deadlock detection %s, exclusive lock "
1263
+ "%d, upgrade %d, wait_ids [",
1264
+ my_txn_id, key.c_str(), txn_lock_info.exclusive,
1265
+ isUpgrade);
1266
+
1267
+ for (auto it = wait_ids.begin(); it != wait_ids.end(); it++) {
1268
+ offset += snprintf(msg + offset, sizeof(msg), "%" PRIu64 ",", *it);
1269
+ }
1270
+
1271
+ offset += snprintf(msg + offset, sizeof(msg), "]\n");
1272
+
1273
+ fprintf(stderr, "%s", msg);
1274
+ fflush(stderr);
1275
+ }
1276
+
1277
+ // If it has not joined wait queue, join it now.
1278
+ // If it is a lock upgrade, rejoin it.
1279
+ if (isUpgrade || (key_lock_waiter_ctx.waiter_queue == nullptr)) {
1280
+ stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
1281
+ isUpgrade, key_lock_waiter_ctx);
1282
+
1283
+ DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
1284
+ }
1285
+
1286
+ int64_t now = 0;
1287
+ if (cv_end_time < 0) {
1288
+ // Wait indefinitely
1289
+ result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter);
1290
+ cv_wait_fail = !result.ok();
1291
+ } else {
1292
+ now = env->NowMicros();
1293
+ if (cv_end_time > now) {
1294
+ result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter,
1295
+ cv_end_time - now);
1296
+
1297
+ cv_wait_fail = !result.ok() && !result.IsTimedOut();
1298
+ } else {
1299
+ // now >= cv_end_time, we already timed out
1300
+ result = Status::TimedOut(Status::SubCode::kLockTimeout);
1301
+ }
1302
+ }
1303
+
1304
+ #ifndef NDEBUG
1305
+ stripe->stripe_mutex->UnLock();
1306
+ TEST_SYNC_POINT_CALLBACK(
1307
+ "PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
1308
+ &my_txn_id);
1309
+ TEST_SYNC_POINT(
1310
+ "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock");
1311
+ auto lock_status = stripe->stripe_mutex->Lock();
1312
+ assert(lock_status.ok());
1313
+ #endif
1314
+
1315
+ if (!wait_ids.empty()) {
1316
+ txn->ClearWaitingTxn();
1317
+ if (txn->IsDeadlockDetect()) {
1318
+ DecrementWaiters(txn, wait_ids);
1319
+ }
1320
+ }
1321
+
1322
+ if (cv_wait_fail) {
1323
+ break;
1324
+ }
1325
+
1326
+ if (result.IsTimedOut()) {
1327
+ timed_out = true;
1328
+ // Even though we timed out, we will still make one more attempt to
1329
+ // acquire lock below (it is possible the lock expired and we
1330
+ // were never signaled).
1331
+ }
1332
+ assert(result.ok() || result.IsTimedOut());
1333
+
1334
+ // Refresh lock info pointer, as this pointer is not guaranteed to be
1335
+ // stable in folly
1336
+ lock_info = stripe->GetLockInfo(key);
1337
+
1338
+ // Try to get the lock again.
1339
+ result = AcquireLocked(
1340
+ lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
1341
+ &wait_ids, &lock_info, &isUpgrade,
1342
+ /* If wait is timed out, it means it is not its turn to take the lock.
1343
+ * Therefore, it should still follow FIFO order. */
1344
+ timed_out);
1345
+ auto fail_to_take_lock_on_its_turn = !timed_out && !result.ok();
1346
+ if (fail_to_take_lock_on_its_turn) {
1347
+ // If it is its turn, but it failed to take lock, something is broken.
1348
+ // Assert this should not happen in debug build during testing.
1349
+ // In prod, it simply gives up the attempt.
1350
+ assert(!fail_to_take_lock_on_its_turn);
1351
+ break;
1352
+ }
1353
+
1354
+ if (!result.ok() && cv_end_time >= 0) {
1355
+ if (static_cast<int64_t>(end_time) <= now) {
1356
+ // lock timeout timed out
1357
+ result = Status::TimedOut(Status::SubCode::kLockTimeout);
1358
+ timed_out = true;
1359
+ }
1360
+ }
1361
+ }
1362
+
1363
+ // For any reason that the transaction failed to acquire the lock, it should
1364
+ // try to wake up next waiters, if they are ready to proceed.
1365
+ if (!result.ok()) {
1366
+ key_lock_waiter_ctx.TryWakeUpNextWaiters(*lock_info, key);
1367
+ }
1368
+ }
1369
+
1370
+ stripe->stripe_mutex->UnLock();
1371
+
1372
+ // On timeout, persist the lock information so we can debug the contention
1373
+ if (result.IsTimedOut()) {
1374
+ txn->SetWaitingTxn(wait_ids, column_family_id, &key, true);
1375
+ }
1376
+
1377
+ return result;
1378
+ }
1379
+
1380
+ Status PerKeyPointLockManager::FillWaitIds(LockInfo& lock_info,
1381
+ const LockInfo& txn_lock_info,
1382
+ autovector<TransactionID>* wait_ids,
1383
+ bool& isUpgrade,
1384
+ TransactionID& my_txn_id,
1385
+ const std::string& key) {
1386
+ if (wait_ids != nullptr) {
1387
+ for (auto id : lock_info.txn_ids) {
1388
+ // A transaction is not blocked by itself
1389
+ if (id != my_txn_id) {
1390
+ wait_ids->push_back(id);
1391
+ } else {
1392
+ // Itself is already holding a lock, so it is either an upgrade or
1393
+ // downgrade. Downgrade has already been handled above. Assert it
1394
+ // is an upgrade here.
1395
+ auto is_upgrade = !lock_info.exclusive && txn_lock_info.exclusive;
1396
+ if (!is_upgrade) {
1397
+ if (kDebugLog) {
1398
+ fprintf(stderr,
1399
+ "txn id %" PRIu64 " assert failed on lock upgrade key %s\n",
1400
+ my_txn_id, key.c_str());
1401
+ fflush(stderr);
1402
+ }
1403
+ assert(is_upgrade);
1404
+ return Status::Aborted(Status::SubCode::kNotExpectedCodePath);
1405
+ }
1406
+ isUpgrade = true;
1407
+ }
1408
+ }
1409
+ }
1410
+ return Status::OK();
1411
+ }
1412
+
1413
+ // This function is similar to PointLockManager::AcquireLocked with following
1414
+ // differences.
1415
+ //
1416
+ // It introduces a per key lock waiter queue. When it tries to take the lock, it
1417
+ // will first check whether there are other transactions already in the waiter
1418
+ // queue, if so it will return TimeOut. Caller will join the waiter queue, if
1419
+ // lock timeout is not reached yet. When it is its to take the lock, it will be
1420
+ // woken up and take the lock.
1421
+ //
1422
+ // It introduces a fast path check that will quickly check whether the lock
1423
+ // could be obtained without gathering waiter id information. This allows
1424
+ // transaction to sleep a short time before perform deadlock detection.
1425
+ //
1426
+ // @param lock_info_ptr: pointer to the LockInfo associated with the key. If the
1427
+ // key is already locked, LockInfo will be not null. If not, LockInfo is
1428
+ // null, and a new LockInfo is created and assigned to lock_info_ptr.
1429
+ //
1430
+ // @param wait_ids: When wait_ids is nullptr, it perform a fast path check to
1431
+ // see whether it could take the lock, it does not fill waiter_ids. If
1432
+ // wait_ids is not nullptr, it will fill the wait_ids with the lock holder.
1433
+ //
1434
+ // @param isUpgrade: isUpgrade is set to true, if the transaction tries to
1435
+ // uprade a lock to exclusive, but it needs to wait for other lock holders to
1436
+ // release the shared locks. Note that isUpgrade is not set on fast path
1437
+ // check.
1438
+ //
1439
+ // @param fifo: fifo flag indicates whether it should follow fifo order to check
1440
+ // whether there is already a waiter waiting for the lock or not. If fifo is
1441
+ // true and there is already a lock waiter waiting in the queue and it is not
1442
+ // itself, return TimedOut. If fifo is false, it means it is its turn to take
1443
+ // the lock.
1444
+ Status PerKeyPointLockManager::AcquireLocked(
1445
+ LockMap* lock_map, LockMapStripe* stripe, const std::string& key, Env* env,
1446
+ const LockInfo& txn_lock_info, uint64_t* expire_time,
1447
+ autovector<TransactionID>* wait_ids, LockInfo** lock_info_ptr,
1448
+ bool* isUpgrade, bool fifo) {
1449
+ assert(txn_lock_info.txn_ids.size() == 1);
1450
+
1451
+ if (wait_ids != nullptr) {
1452
+ wait_ids->clear();
1453
+ }
1454
+
1455
+ *isUpgrade = false;
1456
+ auto my_txn_id = txn_lock_info.txn_ids[0];
1457
+
1458
+ if (!*lock_info_ptr) {
1459
+ // No lock nor waiter on this key, so it can try to acquire the lock
1460
+ // directly
1461
+ if (max_num_locks_ > 0 &&
1462
+ lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
1463
+ return Status::LockLimit();
1464
+ } else {
1465
+ // acquire lock
1466
+ auto ret = stripe->keys.try_emplace(key, my_txn_id,
1467
+ txn_lock_info.expiration_time,
1468
+ txn_lock_info.exclusive);
1469
+ assert(ret.second);
1470
+ *lock_info_ptr = &(ret.first->second);
1471
+
1472
+ // Maintain lock count if there is a limit on the number of locks
1473
+ if (max_num_locks_ > 0) {
1474
+ lock_map->locked_key_cnt.FetchAddRelaxed(1);
1475
+ }
1476
+
1477
+ return Status::OK();
1478
+ }
1479
+ }
1480
+
1481
+ auto& lock_info = **lock_info_ptr;
1482
+ auto locked = !lock_info.txn_ids.empty();
1483
+ auto solo_lock_owner =
1484
+ (lock_info.txn_ids.size() == 1) && (lock_info.txn_ids[0] == my_txn_id);
1485
+
1486
+ // Handle lock downgrade and reentrant first, it should always succeed
1487
+ if (locked) {
1488
+ if (solo_lock_owner) {
1489
+ // Lock is already owned by itself.
1490
+ if (lock_info.exclusive && !txn_lock_info.exclusive) {
1491
+ // For downgrade, wake up all the shared lock waiters at the front of
1492
+ // the waiter queue
1493
+ if (lock_info.waiter_queue != nullptr) {
1494
+ for (auto& waiter : *lock_info.waiter_queue) {
1495
+ if (waiter->exclusive) {
1496
+ break;
1497
+ }
1498
+ waiter->Notify();
1499
+ DebugWakeUpWaiter(my_txn_id, waiter->id, key, "Lock Downgrade");
1500
+ }
1501
+ }
1502
+ }
1503
+
1504
+ if (lock_info.exclusive || !txn_lock_info.exclusive) {
1505
+ // If it is lock downgrade or re-entrant, grant it immediately
1506
+ lock_info.exclusive = txn_lock_info.exclusive;
1507
+ lock_info.expiration_time = txn_lock_info.expiration_time;
1508
+ return Status::OK();
1509
+ }
1510
+ } else {
1511
+ // handle read reentrant lock for non solo lock owner case
1512
+ // Check whether the transaction already hold a shared lock and it is
1513
+ // trying to acquire it again.
1514
+ if (!txn_lock_info.exclusive && !lock_info.exclusive) {
1515
+ auto lock_it = std::find(lock_info.txn_ids.begin(),
1516
+ lock_info.txn_ids.end(), my_txn_id);
1517
+ if (lock_it != lock_info.txn_ids.end()) {
1518
+ lock_info.expiration_time = std::max(lock_info.expiration_time,
1519
+ txn_lock_info.expiration_time);
1520
+ return Status::OK();
1521
+ }
1522
+ }
1523
+ }
1524
+ }
1525
+
1526
+ auto has_waiter =
1527
+ (lock_info.waiter_queue != nullptr) && !lock_info.waiter_queue->empty();
1528
+
1529
+ // Update solo lock owner for the rest of the cases
1530
+ if (solo_lock_owner) {
1531
+ // If there is a shared lock waiter that is ready to take the lock, the
1532
+ // current transaction would not be the solo lock owner.
1533
+ auto has_ready_shared_lock_waiter =
1534
+ has_waiter && lock_info.waiter_queue->front()->IsReady() &&
1535
+ (!lock_info.waiter_queue->front()->exclusive);
1536
+ solo_lock_owner = !has_ready_shared_lock_waiter;
1537
+ }
1538
+
1539
+ // If myself is the first waiter in the queue, skip checking waiter queue
1540
+ auto is_first_waiter =
1541
+ has_waiter && (lock_info.waiter_queue->front()->id == my_txn_id);
1542
+
1543
+ if (fifo && has_waiter && !is_first_waiter) {
1544
+ // There are other waiters ahead of myself
1545
+ {
1546
+ // handle shared lock request on a shared lock with only shared lock
1547
+ // waiters
1548
+ if (!txn_lock_info.exclusive &&
1549
+ (!locked || (locked && !lock_info.exclusive))) {
1550
+ bool has_exclusive_waiter = false;
1551
+ // check whether there is exclusive lock waiter
1552
+ for (auto& waiter : *lock_info.waiter_queue) {
1553
+ if (waiter->exclusive) {
1554
+ has_exclusive_waiter = true;
1555
+ break;
1556
+ }
1557
+ }
1558
+ if (!has_exclusive_waiter) {
1559
+ // no X waiter in the queue, so it can acquire the lock without
1560
+ // waiting
1561
+ lock_info.txn_ids.push_back(my_txn_id);
1562
+ lock_info.exclusive = false;
1563
+ lock_info.expiration_time = std::max(lock_info.expiration_time,
1564
+ txn_lock_info.expiration_time);
1565
+ return Status::OK();
1566
+ }
1567
+ }
1568
+ }
1569
+
1570
+ // fast path check for lock upgrade
1571
+ if (solo_lock_owner && !lock_info.exclusive && txn_lock_info.exclusive) {
1572
+ // During lock upgrade, if it is the only transaction owns the lock and no
1573
+ // other shared lock requesting transaction is ready to take the lock,
1574
+ // prioritize the lock grade and grant it now.
1575
+ lock_info.exclusive = txn_lock_info.exclusive;
1576
+ lock_info.expiration_time = txn_lock_info.expiration_time;
1577
+ return Status::OK();
1578
+ }
1579
+
1580
+ if (wait_ids == nullptr) {
1581
+ // If wait_ids is nullptr, it is a fast path check to see whether it is
1582
+ // able to take the lock or not, skip filling the waiting txn ids for
1583
+ // deadlock detection.
1584
+ return Status::TimedOut(Status::SubCode::kLockTimeout);
1585
+ }
1586
+
1587
+ // For other cases with fifo and lock waiter, try to wait in the queue
1588
+ // and fill the waiting txn list
1589
+ auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
1590
+ my_txn_id, key);
1591
+ if (!s.ok()) {
1592
+ // propagate error up
1593
+ return s;
1594
+ }
1595
+
1596
+ // Add the waiter txn ids to the blocking txn id list
1597
+ if (txn_lock_info.exclusive) {
1598
+ // For exclusive lock, it traverse the queue from front to back to
1599
+ // handle upgrade
1600
+ for (auto& waiter : *lock_info.waiter_queue) {
1601
+ // For upgrade locks, it will be placed at the beginning of
1602
+ // the queue. However, for shared lock waiters that are at
1603
+ // the beginning of the queue that got woken up but haven't
1604
+ // taken the lock yet, they should still be added to the
1605
+ // blocking txn id list.
1606
+ if (*isUpgrade && waiter->exclusive) {
1607
+ break;
1608
+ }
1609
+ if (waiter->id != my_txn_id) {
1610
+ wait_ids->push_back(waiter->id);
1611
+ }
1612
+ }
1613
+ } else {
1614
+ // For shared lock, skip the S lock waiters at the end of the queue, as
1615
+ // they will be waked up together. Therefore, it traverses the queue from
1616
+ // from back to front.
1617
+ bool skip_shared_lock_waiter = true;
1618
+ for (auto it = lock_info.waiter_queue->rbegin();
1619
+ it != lock_info.waiter_queue->rend(); ++it) {
1620
+ if ((*it)->exclusive) {
1621
+ skip_shared_lock_waiter = false;
1622
+ } else {
1623
+ if (skip_shared_lock_waiter) {
1624
+ continue;
1625
+ }
1626
+ }
1627
+ if ((*it)->id != my_txn_id) {
1628
+ wait_ids->push_back((*it)->id);
1629
+ }
1630
+ }
1631
+ }
1632
+
1633
+ return Status::TimedOut(Status::SubCode::kLockTimeout);
1634
+ } else {
1635
+ // there is no waiter or it is its turn to take the lock
1636
+ if (!locked) {
1637
+ // no lock on this key, acquire it directly
1638
+ lock_info.txn_ids = txn_lock_info.txn_ids;
1639
+ lock_info.exclusive = txn_lock_info.exclusive;
1640
+ lock_info.expiration_time = txn_lock_info.expiration_time;
1641
+ return Status::OK();
1642
+ }
1643
+
1644
+ if (IsLockExpired(my_txn_id, lock_info, env, expire_time)) {
1645
+ // current lock is expired, steal it.
1646
+ lock_info.txn_ids = txn_lock_info.txn_ids;
1647
+ lock_info.exclusive = txn_lock_info.exclusive;
1648
+ lock_info.expiration_time = txn_lock_info.expiration_time;
1649
+ return Status::OK();
1650
+ }
1651
+
1652
+ // Check lock compatibility
1653
+ if (txn_lock_info.exclusive) {
1654
+ // handle lock upgrade
1655
+ if (solo_lock_owner) {
1656
+ // Lock re-entrant or downgrade has already been handled above.
1657
+ // Assert it is an upgrade here. Acquire the lock directly.
1658
+ assert(!lock_info.exclusive);
1659
+ lock_info.exclusive = txn_lock_info.exclusive;
1660
+ lock_info.expiration_time = txn_lock_info.expiration_time;
1661
+ return Status::OK();
1662
+ } else {
1663
+ // lock is already owned by other transactions
1664
+ auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
1665
+ my_txn_id, key);
1666
+ if (!s.ok()) {
1667
+ // propagate error up
1668
+ return s;
1669
+ }
1670
+ return Status::TimedOut(Status::SubCode::kLockTimeout);
1671
+ }
1672
+ } else {
1673
+ // handle shared lock request
1674
+ if (lock_info.exclusive) {
1675
+ // lock is already owned by other exclusive lock
1676
+ auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
1677
+ my_txn_id, key);
1678
+ if (!s.ok()) {
1679
+ // propagate error up
1680
+ return s;
1681
+ }
1682
+ return Status::TimedOut(Status::SubCode::kLockTimeout);
1683
+ } else {
1684
+ // lock is on shared lock state, acquire it
1685
+ lock_info.txn_ids.push_back(my_txn_id);
1686
+ // update the expiration time
1687
+ lock_info.expiration_time =
1688
+ std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
1689
+ return Status::OK();
1690
+ }
1691
+ }
1692
+ }
1693
+ }
1694
+
1695
// Releases the lock that `txn` holds on `key` within the given stripe, and
// wakes up the appropriate waiter when doing so can let it make progress.
// Must be called with the stripe's mutex held.
//
// `env` is only consulted by the assertion in the not-found path (release
// builds silence the unused-parameter warning below).
void PerKeyPointLockManager::UnLockKey(PessimisticTransaction* txn,
                                       const std::string& key,
                                       LockMapStripe* stripe, LockMap* lock_map,
                                       Env* env) {
#ifdef NDEBUG
  (void)env;
#endif
  TransactionID txn_id = txn->GetID();

  auto stripe_iter = stripe->keys.find(key);
  if (stripe_iter != stripe->keys.end()) {
    auto& lock_info = stripe_iter->second;
    auto& txns = lock_info.txn_ids;
    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);

    if (txn_it != txns.end()) {
      // If the lock was held in exclusive mode, only one transaction should
      // holding it.
      if (lock_info.exclusive) {
        assert(txns.size() == 1);
        stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map, txn_id,
                                      key, max_num_locks_, txns, txn_it);
      } else {
        // In shared mode, it is possible that another transaction is holding
        // a shared lock and is waiting to upgrade the lock to exclusive.
        assert(txns.size() >= 1);
        if (txns.size() > 2) {
          // Including the current transaction, if there are more than 2
          // transactions holding the lock in shared mode, don't wake up any
          // waiter, as the next waiter will not be able to acquire the lock
          // anyway.
          RemoveTransaction(txns, txn_it);
        } else if (txns.size() == 2) {
          // remove the current transaction first.
          RemoveTransaction(txns, txn_it);
          // Check whether the one remained is trying to upgrade the lock by
          // checking whether its id matches.
          // NOTE: an upgrading owner parks itself at the FRONT of the waiter
          // queue, so only the front entry needs to be inspected here.
          auto& waiter_queue = lock_info.waiter_queue;
          if (waiter_queue != nullptr && !waiter_queue->empty() &&
              waiter_queue->front()->id == txns[0]) {
            // There are waiters in the queue and the next one is same as the
            // only one that is still holding the shared lock, wake the waiter
            // up
            waiter_queue->front()->Notify();
            DebugWakeUpWaiter(txn_id, waiter_queue->front()->id, key,
                              "Lock Upgrade");
          }
        } else {
          // Current transaction is the only one holding the shared lock
          stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map,
                                        txn_id, key, max_num_locks_, txns,
                                        txn_it);
        }
      }
    }
  } else {
    // This key is either not locked or locked by someone else. This should
    // only happen if the unlocking transaction has expired.
    assert(txn->GetExpirationTime() > 0 &&
           txn->GetExpirationTime() < env->NowMicros());
  }
}
1757
+
1758
+ void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
1759
+ ColumnFamilyId column_family_id,
1760
+ const std::string& key, Env* env) {
1761
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
1762
+ LockMap* lock_map = lock_map_ptr.get();
1763
+ if (lock_map == nullptr) {
1764
+ // Column Family must have been dropped.
1765
+ return;
1766
+ }
1767
+
1768
+ // Lock the mutex for the stripe that this key hashes to
1769
+ size_t stripe_num = lock_map->GetStripe(key);
1770
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
1771
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
1772
+
1773
+ stripe->stripe_mutex->Lock().AssertOK();
1774
+ UnLockKey(txn, key, stripe, lock_map, env);
1775
+ stripe->stripe_mutex->UnLock();
1776
+ }
1777
+
1778
+ void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
1779
+ const LockTracker& tracker, Env* env) {
1780
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
1781
+ tracker.GetColumnFamilyIterator());
1782
+ assert(cf_it != nullptr);
1783
+ while (cf_it->HasNext()) {
1784
+ ColumnFamilyId cf = cf_it->Next();
1785
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
1786
+ LockMap* lock_map = lock_map_ptr.get();
1787
+ if (!lock_map) {
1788
+ // Column Family must have been dropped.
1789
+ return;
1790
+ }
1791
+
1792
+ // Bucket keys by lock_map_ stripe
1793
+ UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
1794
+ lock_map->num_stripes_);
1795
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
1796
+ tracker.GetKeyIterator(cf));
1797
+ assert(key_it != nullptr);
1798
+ while (key_it->HasNext()) {
1799
+ const std::string& key = key_it->Next();
1800
+ size_t stripe_num = lock_map->GetStripe(key);
1801
+ keys_by_stripe[stripe_num].push_back(&key);
1802
+ }
1803
+
1804
+ // For each stripe, grab the stripe mutex and unlock all keys in this
1805
+ // stripe
1806
+ for (auto& stripe_iter : keys_by_stripe) {
1807
+ size_t stripe_num = stripe_iter.first;
1808
+ auto& stripe_keys = stripe_iter.second;
1809
+
1810
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
1811
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
1812
+
1813
+ stripe->stripe_mutex->Lock().AssertOK();
1814
+
1815
+ for (const std::string* key : stripe_keys) {
1816
+ UnLockKey(txn, *key, stripe, lock_map, env);
1817
+ }
1818
+
1819
+ stripe->stripe_mutex->UnLock();
1820
+ }
1821
+ }
1822
+ }
1823
+
1824
// Range-unlock overload. The per-key point lock manager does not support
// range locks, so this intentionally does nothing; it exists only to
// satisfy the LockManager interface.
void PerKeyPointLockManager::UnLock(PessimisticTransaction* /* txn */,
                                    ColumnFamilyId /* cf_id */,
                                    const Endpoint& /* start */,
                                    const Endpoint& /* end */, Env* /* env */) {
  // no-op
}
1830
+
748
1831
  } // namespace ROCKSDB_NAMESPACE