@nxtedition/rocksdb 7.1.33 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/BUILDING.md +2 -2
  2. package/binding.cc +0 -147
  3. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +402 -345
  4. package/deps/rocksdb/rocksdb/cache/clock_cache.h +121 -64
  5. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +28 -18
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +1 -0
  7. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  8. package/deps/rocksdb/rocksdb/db/builder.cc +2 -1
  9. package/deps/rocksdb/rocksdb/db/c.cc +563 -673
  10. package/deps/rocksdb/rocksdb/db/c_test.c +168 -169
  11. package/deps/rocksdb/rocksdb/db/column_family.cc +16 -15
  12. package/deps/rocksdb/rocksdb/db/column_family.h +7 -7
  13. package/deps/rocksdb/rocksdb/db/column_family_test.cc +17 -28
  14. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -9
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +8 -3
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +114 -0
  17. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +2 -3
  18. package/deps/rocksdb/rocksdb/db/convenience.cc +3 -5
  19. package/deps/rocksdb/rocksdb/db/corruption_test.cc +10 -14
  20. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +9 -13
  21. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +2 -2
  22. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -2
  23. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +14 -16
  24. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +52 -72
  25. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +2 -2
  26. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +12 -12
  27. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -2
  28. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +3 -3
  29. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +1 -12
  30. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +3 -0
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +26 -0
  32. package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +1 -0
  33. package/deps/rocksdb/rocksdb/db/db_iter.cc +12 -6
  34. package/deps/rocksdb/rocksdb/db/db_iter.h +1 -0
  35. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +6 -7
  36. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +10 -8
  37. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +15 -13
  38. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -9
  39. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +4 -4
  40. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +1 -1
  41. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +2 -4
  42. package/deps/rocksdb/rocksdb/db/db_options_test.cc +4 -4
  43. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +7 -4
  44. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +7 -5
  45. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +8 -6
  47. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +18 -23
  48. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +3 -5
  49. package/deps/rocksdb/rocksdb/db/db_test.cc +10 -5
  50. package/deps/rocksdb/rocksdb/db/db_test2.cc +172 -169
  51. package/deps/rocksdb/rocksdb/db/db_test_util.cc +68 -66
  52. package/deps/rocksdb/rocksdb/db/db_test_util.h +1 -3
  53. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +31 -39
  54. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +182 -2
  55. package/deps/rocksdb/rocksdb/db/db_write_test.cc +43 -40
  56. package/deps/rocksdb/rocksdb/db/dbformat.h +15 -0
  57. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +35 -34
  58. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +10 -11
  59. package/deps/rocksdb/rocksdb/db/error_handler.cc +6 -6
  60. package/deps/rocksdb/rocksdb/db/error_handler.h +93 -94
  61. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -1
  62. package/deps/rocksdb/rocksdb/db/event_helpers.h +3 -3
  63. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +16 -17
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +2 -2
  65. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -2
  66. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -0
  67. package/deps/rocksdb/rocksdb/db/file_indexer.h +2 -1
  68. package/deps/rocksdb/rocksdb/db/file_indexer_test.cc +4 -2
  69. package/deps/rocksdb/rocksdb/db/filename_test.cc +27 -29
  70. package/deps/rocksdb/rocksdb/db/flush_job.cc +7 -13
  71. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +2 -2
  72. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +15 -21
  73. package/deps/rocksdb/rocksdb/db/forward_iterator.h +7 -6
  74. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +4 -2
  75. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +2 -2
  76. package/deps/rocksdb/rocksdb/db/internal_stats.cc +59 -14
  77. package/deps/rocksdb/rocksdb/db/internal_stats.h +27 -11
  78. package/deps/rocksdb/rocksdb/db/job_context.h +5 -6
  79. package/deps/rocksdb/rocksdb/db/listener_test.cc +21 -23
  80. package/deps/rocksdb/rocksdb/db/log_reader.cc +7 -11
  81. package/deps/rocksdb/rocksdb/db/log_reader.h +4 -6
  82. package/deps/rocksdb/rocksdb/db/log_test.cc +6 -12
  83. package/deps/rocksdb/rocksdb/db/log_writer.h +1 -1
  84. package/deps/rocksdb/rocksdb/db/logs_with_prep_tracker.h +0 -1
  85. package/deps/rocksdb/rocksdb/db/lookup_key.h +4 -1
  86. package/deps/rocksdb/rocksdb/db/malloc_stats.cc +2 -1
  87. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +3 -5
  88. package/deps/rocksdb/rocksdb/db/memtable.cc +34 -22
  89. package/deps/rocksdb/rocksdb/db/memtable.h +4 -6
  90. package/deps/rocksdb/rocksdb/db/memtable_list.cc +7 -0
  91. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +37 -13
  92. package/deps/rocksdb/rocksdb/db/merge_context.h +1 -0
  93. package/deps/rocksdb/rocksdb/db/merge_helper.cc +128 -14
  94. package/deps/rocksdb/rocksdb/db/merge_helper.h +15 -7
  95. package/deps/rocksdb/rocksdb/db/merge_helper_test.cc +2 -1
  96. package/deps/rocksdb/rocksdb/db/merge_operator.cc +5 -6
  97. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +4 -3
  98. package/deps/rocksdb/rocksdb/db/options_file_test.cc +1 -1
  99. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +55 -43
  100. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +288 -299
  101. package/deps/rocksdb/rocksdb/db/prefix_test.cc +22 -27
  102. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +1 -1
  103. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +1 -1
  104. package/deps/rocksdb/rocksdb/db/repair.cc +7 -8
  105. package/deps/rocksdb/rocksdb/db/repair_test.cc +3 -4
  106. package/deps/rocksdb/rocksdb/db/snapshot_impl.cc +4 -5
  107. package/deps/rocksdb/rocksdb/db/snapshot_impl.h +10 -4
  108. package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -4
  109. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +6 -7
  110. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +22 -22
  111. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +12 -12
  112. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +6 -8
  113. package/deps/rocksdb/rocksdb/db/trim_history_scheduler.h +2 -0
  114. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +3 -3
  115. package/deps/rocksdb/rocksdb/db/version_edit.cc +2 -5
  116. package/deps/rocksdb/rocksdb/db/version_edit.h +8 -12
  117. package/deps/rocksdb/rocksdb/db/version_set.cc +74 -102
  118. package/deps/rocksdb/rocksdb/db/version_set.h +8 -10
  119. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +0 -5
  120. package/deps/rocksdb/rocksdb/db/version_set_test.cc +47 -45
  121. package/deps/rocksdb/rocksdb/db/wal_manager.cc +6 -5
  122. package/deps/rocksdb/rocksdb/db/wal_manager.h +2 -2
  123. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +4 -3
  124. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +144 -61
  125. package/deps/rocksdb/rocksdb/db/write_batch.cc +41 -24
  126. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +2 -7
  127. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +105 -104
  128. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +5 -4
  129. package/deps/rocksdb/rocksdb/db/write_controller.h +1 -0
  130. package/deps/rocksdb/rocksdb/db/write_controller_test.cc +1 -1
  131. package/deps/rocksdb/rocksdb/db/write_thread.cc +8 -6
  132. package/deps/rocksdb/rocksdb/env/io_posix.h +6 -0
  133. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +134 -65
  134. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -0
  135. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +1 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +1 -4
  137. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +4 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +14 -4
  140. package/deps/rocksdb/rocksdb/table/get_context.cc +52 -7
  141. package/deps/rocksdb/rocksdb/table/get_context.h +1 -2
  142. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +13 -0
  143. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +36 -4
  144. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +6 -6
  145. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +23 -28
  146. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +11 -1
  147. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +19 -17
  148. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +10 -7
  149. package/index.js +3 -195
  150. package/package.json +2 -4
  151. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  152. package/prebuilds/linux-x64/node.napi.node +0 -0
  153. package/common.js +0 -7
@@ -225,10 +225,8 @@ void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) {
225
225
  bufs_[second].async_read_in_progress_ = false;
226
226
  }
227
227
 
228
- if (bufs_[curr_].io_handle_ == nullptr &&
229
- bufs_[curr_].async_read_in_progress_) {
228
+ if (bufs_[curr_].io_handle_ == nullptr) {
230
229
  bufs_[curr_].async_read_in_progress_ = false;
231
- curr_ = curr_ ^ 1;
232
230
  }
233
231
  }
234
232
 
@@ -268,16 +266,36 @@ void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) {
268
266
  bufs_[second].buffer_.Clear();
269
267
  }
270
268
 
269
+ {
270
+ // In case buffers do not align, reset second buffer. This can happen in
271
+ // case readahead_size is set.
272
+ if (!bufs_[second].async_read_in_progress_ &&
273
+ !bufs_[curr_].async_read_in_progress_) {
274
+ if (DoesBufferContainData(curr_)) {
275
+ if (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() !=
276
+ bufs_[second].offset_) {
277
+ bufs_[second].buffer_.Clear();
278
+ }
279
+ } else {
280
+ if (!IsOffsetInBuffer(offset, second)) {
281
+ bufs_[second].buffer_.Clear();
282
+ }
283
+ }
284
+ }
285
+ }
286
+
271
287
  // If data starts from second buffer, make it curr_. Second buffer can be
272
- // either partial filled or full.
273
- if (!bufs_[second].async_read_in_progress_ && DoesBufferContainData(second) &&
274
- IsOffsetInBuffer(offset, second)) {
275
- // Clear the curr_ as buffers have been swapped and curr_ contains the
276
- // outdated data and switch the buffers.
277
- if (!bufs_[curr_].async_read_in_progress_) {
278
- bufs_[curr_].buffer_.Clear();
288
+ // either partially filled, full, or have an async read in progress.
289
+ if (bufs_[second].async_read_in_progress_) {
290
+ if (IsOffsetInBufferWithAsyncProgress(offset, second)) {
291
+ curr_ = curr_ ^ 1;
292
+ }
293
+ } else {
294
+ if (DoesBufferContainData(second) && IsOffsetInBuffer(offset, second)) {
295
+ assert(bufs_[curr_].async_read_in_progress_ ||
296
+ bufs_[curr_].buffer_.CurrentSize() == 0);
297
+ curr_ = curr_ ^ 1;
279
298
  }
280
- curr_ = curr_ ^ 1;
281
299
  }
282
300
  }
283
301
 
@@ -300,53 +318,16 @@ void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
300
318
  UpdateBuffersIfNeeded(offset);
301
319
  }
302
320
 
303
- // If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is
304
- // called. When buffers are switched, we clear the curr_ buffer as we assume the
305
- // data has been consumed because of sequential reads.
306
- // Data in buffers will always be sequential with curr_ following second and
307
- // not vice versa.
308
- //
309
- // Scenarios for prefetching asynchronously:
310
- // Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
311
- // synchronously in curr_ and prefetch readahead_size_/2 async in second
312
- // buffer.
313
- // Case2: If second buffer has partial or full data, make it current and
314
- // prefetch readahead_size_/2 async in second buffer. In case of
315
- // partial data, prefetch remaining bytes from size n synchronously to
316
- // fulfill the requested bytes request.
317
- // Case3: If curr_ has partial data, prefetch remaining bytes from size n
318
- // synchronously in curr_ to fulfill the requested bytes request and
319
- // prefetch readahead_size_/2 bytes async in second buffer.
320
- // Case4: (Special case) If data is in both buffers, copy requested data from
321
- // curr_, send async request on curr_, wait for poll to fill second
322
- // buffer (if any), and copy remaining data from second buffer to third
323
- // buffer.
324
- Status FilePrefetchBuffer::PrefetchAsyncInternal(
321
+ Status FilePrefetchBuffer::HandleOverlappingData(
325
322
  const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
326
- size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
327
- bool& copy_to_third_buffer) {
328
- if (!enable_) {
329
- return Status::OK();
330
- }
331
-
332
- TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
333
-
334
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
323
+ size_t length, size_t readahead_size,
324
+ Env::IOPriority /*rate_limiter_priority*/, bool& copy_to_third_buffer,
325
+ uint64_t& tmp_offset, size_t& tmp_length) {
335
326
  Status s;
336
- uint64_t tmp_offset = offset;
337
- size_t tmp_length = length;
338
-
339
- // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
340
- // data.
341
- {
342
- if (!explicit_prefetch_submitted_) {
343
- AbortIOIfNeeded(offset);
344
- }
345
- UpdateBuffersIfNeeded(offset);
346
- }
327
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
347
328
  uint32_t second = curr_ ^ 1;
348
329
 
349
- // 2. If data is overlapping over two buffers, copy the data from curr_ and
330
+ // If data is overlapping over two buffers, copy the data from curr_ and
350
331
  // call ReadAsync on curr_.
351
332
  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
352
333
  IsOffsetInBuffer(offset, curr_) &&
@@ -391,21 +372,80 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
391
372
  }
392
373
  curr_ = curr_ ^ 1;
393
374
  }
375
+ return s;
376
+ }
377
+ // If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is
378
+ // called. When buffers are switched, we clear the curr_ buffer as we assume the
379
+ // data has been consumed because of sequential reads.
380
+ // Data in buffers will always be sequential with curr_ following second and
381
+ // not vice versa.
382
+ //
383
+ // Scenarios for prefetching asynchronously:
384
+ // Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
385
+ // synchronously in curr_ and prefetch readahead_size_/2 async in second
386
+ // buffer.
387
+ // Case2: If second buffer has partial or full data, make it current and
388
+ // prefetch readahead_size_/2 async in second buffer. In case of
389
+ // partial data, prefetch remaining bytes from size n synchronously to
390
+ // fulfill the requested bytes request.
391
+ // Case3: If curr_ has partial data, prefetch remaining bytes from size n
392
+ // synchronously in curr_ to fulfill the requested bytes request and
393
+ // prefetch readahead_size_/2 bytes async in second buffer.
394
+ // Case4: (Special case) If data is in both buffers, copy requested data from
395
+ // curr_, send async request on curr_, wait for poll to fill second
396
+ // buffer (if any), and copy remaining data from second buffer to third
397
+ // buffer.
398
+ Status FilePrefetchBuffer::PrefetchAsyncInternal(
399
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
400
+ size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
401
+ bool& copy_to_third_buffer) {
402
+ if (!enable_) {
403
+ return Status::OK();
404
+ }
405
+
406
+ TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
407
+
408
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
409
+ Status s;
410
+ uint64_t tmp_offset = offset;
411
+ size_t tmp_length = length;
412
+
413
+ // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
414
+ // data.
415
+ if (!explicit_prefetch_submitted_) {
416
+ AbortIOIfNeeded(offset);
417
+ }
418
+ UpdateBuffersIfNeeded(offset);
419
+
420
+ // 2. Handle overlapping data over two buffers. If data is overlapping then
421
+ // during this call:
422
+ // - data from curr_ is copied into third buffer,
423
+ // - curr_ is sent for async prefetching of further data if second buffer
424
+ // contains remaining requested data or in progress for async prefetch,
425
+ // - switch buffers and curr_ now points to second buffer to copy remaining
426
+ // data.
427
+ s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
428
+ rate_limiter_priority, copy_to_third_buffer,
429
+ tmp_offset, tmp_length);
430
+ if (!s.ok()) {
431
+ return s;
432
+ }
394
433
 
395
434
  // 3. Call Poll only if data is needed for the second buffer.
396
- // - Return if whole data is in curr_ and second buffer in progress.
435
+ // - Return if whole data is in curr_ and second buffer is in progress or
436
+ // already full.
397
437
  // - If second buffer is empty, it will go for ReadAsync for second buffer.
398
438
  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
399
439
  IsDataBlockInBuffer(offset, length, curr_)) {
400
440
  // Whole data is in curr_.
401
441
  UpdateBuffersIfNeeded(offset);
402
- second = curr_ ^ 1;
403
- if (bufs_[second].async_read_in_progress_) {
442
+ if (!IsSecondBuffEligibleForPrefetching()) {
404
443
  return s;
405
444
  }
406
445
  } else {
446
+ // After poll request, curr_ might be empty because of IOError in
447
+ // callback while reading or may contain required data.
407
448
  PollAndUpdateBuffersIfNeeded(offset);
408
- second = curr_ ^ 1;
409
449
  }
410
450
 
411
451
  if (copy_to_third_buffer) {
@@ -427,19 +467,42 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
427
467
  if (explicit_prefetch_submitted_) {
428
468
  return s;
429
469
  }
470
+ if (!IsSecondBuffEligibleForPrefetching()) {
471
+ return s;
472
+ }
473
+ }
474
+
475
+ uint32_t second = curr_ ^ 1;
476
+ assert(!bufs_[curr_].async_read_in_progress_);
477
+
478
+ // If curr_ became empty because of an IOError, abort IO for second as
479
+ // well. Otherwise data might not align if more data needs to be read in curr_
480
+ // which might overlap with second buffer.
481
+ if (!DoesBufferContainData(curr_) && bufs_[second].async_read_in_progress_) {
482
+ if (bufs_[second].io_handle_ != nullptr) {
483
+ std::vector<void*> handles;
484
+ handles.emplace_back(bufs_[second].io_handle_);
485
+ {
486
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
487
+ Status status = fs_->AbortIO(handles);
488
+ assert(status.ok());
489
+ }
490
+ }
491
+ DestroyAndClearIOHandle(second);
492
+ bufs_[second].buffer_.Clear();
430
493
  }
431
494
 
432
495
  // 5. Data is overlapping i.e. some of the data has been copied to third
433
- // buffer
434
- // and remaining will be updated below.
435
- if (copy_to_third_buffer) {
496
+ // buffer and remaining will be updated below.
497
+ if (copy_to_third_buffer && DoesBufferContainData(curr_)) {
436
498
  CopyDataToBuffer(curr_, offset, length);
437
499
 
438
500
  // Length == 0: All the requested data has been copied to third buffer and
439
501
  // it has already gone for async prefetching. It can return without doing
440
502
  // anything further.
441
- // Length > 0: More data needs to be consumed so it will continue async and
442
- // sync prefetching and copy the remaining data to third buffer in the end.
503
+ // Length > 0: More data needs to be consumed so it will continue async
504
+ // and sync prefetching and copy the remaining data to third buffer in the
505
+ // end.
443
506
  if (length == 0) {
444
507
  return s;
445
508
  }
@@ -458,6 +521,9 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
458
521
  uint64_t chunk_len1 = 0;
459
522
  uint64_t read_len1 = 0;
460
523
 
524
+ assert(!bufs_[second].async_read_in_progress_ &&
525
+ !DoesBufferContainData(second));
526
+
461
527
  // For length == 0, skip the synchronous prefetching. read_len1 will be 0.
462
528
  if (length > 0) {
463
529
  CalculateOffsetAndLen(alignment, offset, roundup_len1, curr_,
@@ -594,8 +660,11 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
594
660
  }
595
661
 
596
662
  if (explicit_prefetch_submitted_) {
597
- if (prev_offset_ != offset) {
598
- // Random offset called. So abort the IOs.
663
+ // explicit_prefetch_submitted_ is a special case: the request submitted in
664
+ // PrefetchAsync is expected to match this request. Otherwise
665
+ // buffers will be outdated.
666
+ // Random offset called. So abort the IOs.
667
+ if (bufs_[curr_].offset_ != offset) {
599
668
  AbortAllIOs();
600
669
  bufs_[curr_].buffer_.Clear();
601
670
  bufs_[curr_ ^ 1].buffer_.Clear();
@@ -8,6 +8,7 @@
8
8
  // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
9
 
10
10
  #pragma once
11
+
11
12
  #include <algorithm>
12
13
  #include <atomic>
13
14
  #include <sstream>
@@ -363,6 +364,27 @@ class FilePrefetchBuffer {
363
364
  bufs_[index].io_handle_ != nullptr &&
364
365
  offset >= bufs_[index].offset_ + bufs_[index].async_req_len_);
365
366
  }
367
+ bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) {
368
+ return (bufs_[index].async_read_in_progress_ &&
369
+ offset >= bufs_[index].offset_ &&
370
+ offset < bufs_[index].offset_ + bufs_[index].async_req_len_);
371
+ }
372
+
373
+ bool IsSecondBuffEligibleForPrefetching() {
374
+ uint32_t second = curr_ ^ 1;
375
+ if (bufs_[second].async_read_in_progress_) {
376
+ return false;
377
+ }
378
+ assert(!bufs_[curr_].async_read_in_progress_);
379
+
380
+ if (DoesBufferContainData(curr_) && DoesBufferContainData(second) &&
381
+ (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() ==
382
+ bufs_[second].offset_)) {
383
+ return false;
384
+ }
385
+ bufs_[second].buffer_.Clear();
386
+ return true;
387
+ }
366
388
 
367
389
  void DestroyAndClearIOHandle(uint32_t index) {
368
390
  if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) {
@@ -373,6 +395,13 @@ class FilePrefetchBuffer {
373
395
  bufs_[index].async_read_in_progress_ = false;
374
396
  }
375
397
 
398
+ Status HandleOverlappingData(const IOOptions& opts,
399
+ RandomAccessFileReader* reader, uint64_t offset,
400
+ size_t length, size_t readahead_size,
401
+ Env::IOPriority rate_limiter_priority,
402
+ bool& copy_to_third_buffer, uint64_t& tmp_offset,
403
+ size_t& tmp_length);
404
+
376
405
  std::vector<BufferInfo> bufs_;
377
406
  // curr_ represents the index for bufs_ indicating which buffer is being
378
407
  // consumed currently.
@@ -163,6 +163,7 @@ class CompactionFilter : public Customizable {
163
163
  // is a write conflict and may allow a Transaction to Commit that should have
164
164
  // failed. Instead, it is better to implement any Merge filtering inside the
165
165
  // MergeOperator.
166
+ // key includes timestamp if user-defined timestamp is enabled.
166
167
  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
167
168
  const Slice& existing_value, std::string* new_value,
168
169
  std::string* /*skip_until*/) const {
@@ -500,10 +500,7 @@ class DB {
500
500
  virtual Status Merge(const WriteOptions& /*options*/,
501
501
  ColumnFamilyHandle* /*column_family*/,
502
502
  const Slice& /*key*/, const Slice& /*ts*/,
503
- const Slice& /*value*/) {
504
- return Status::NotSupported(
505
- "Merge does not support user-defined timestamp yet");
506
- }
503
+ const Slice& /*value*/);
507
504
 
508
505
  // Apply the specified updates to the database.
509
506
  // If `updates` contains no update, WAL will still be synced if
@@ -82,6 +82,7 @@ class MergeOperator : public Customizable {
82
82
  }
83
83
 
84
84
  struct MergeOperationInput {
85
+ // If user-defined timestamp is enabled, `_key` includes timestamp.
85
86
  explicit MergeOperationInput(const Slice& _key,
86
87
  const Slice* _existing_value,
87
88
  const std::vector<Slice>& _operand_list,
@@ -215,6 +215,10 @@ class StackableDB : public DB {
215
215
  const Slice& value) override {
216
216
  return db_->Merge(options, column_family, key, value);
217
217
  }
218
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
219
+ const Slice& key, const Slice& ts, const Slice& value) override {
220
+ return db_->Merge(options, column_family, key, ts, value);
221
+ }
218
222
 
219
223
  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override {
220
224
  return db_->Write(opts, updates);
@@ -172,10 +172,7 @@ class WriteBatch : public WriteBatchBase {
172
172
  return Merge(nullptr, key, value);
173
173
  }
174
174
  Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
175
- const Slice& /*ts*/, const Slice& /*value*/) override {
176
- return Status::NotSupported(
177
- "Merge does not support user-defined timestamp");
178
- }
175
+ const Slice& /*ts*/, const Slice& /*value*/) override;
179
176
 
180
177
  // variant that takes SliceParts
181
178
  Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
@@ -219,6 +216,7 @@ class WriteBatch : public WriteBatchBase {
219
216
  Status PopSavePoint() override;
220
217
 
221
218
  // Support for iterating over the contents of a batch.
219
+ // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
222
220
  class Handler {
223
221
  public:
224
222
  virtual ~Handler();
@@ -229,6 +227,7 @@ class WriteBatch : public WriteBatchBase {
229
227
  // default implementation will just call Put without column family for
230
228
  // backwards compatibility. If the column family is not default,
231
229
  // the function is noop
230
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
232
231
  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
233
232
  const Slice& value) {
234
233
  if (column_family_id == 0) {
@@ -241,14 +240,17 @@ class WriteBatch : public WriteBatchBase {
241
240
  return Status::InvalidArgument(
242
241
  "non-default column family and PutCF not implemented");
243
242
  }
243
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
244
244
  virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
245
245
 
246
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
246
247
  virtual Status PutEntityCF(uint32_t /* column_family_id */,
247
248
  const Slice& /* key */,
248
249
  const Slice& /* entity */) {
249
250
  return Status::NotSupported("PutEntityCF not implemented");
250
251
  }
251
252
 
253
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
252
254
  virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
253
255
  if (column_family_id == 0) {
254
256
  Delete(key);
@@ -257,8 +259,10 @@ class WriteBatch : public WriteBatchBase {
257
259
  return Status::InvalidArgument(
258
260
  "non-default column family and DeleteCF not implemented");
259
261
  }
262
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
260
263
  virtual void Delete(const Slice& /*key*/) {}
261
264
 
265
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
262
266
  virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
263
267
  if (column_family_id == 0) {
264
268
  SingleDelete(key);
@@ -267,14 +271,18 @@ class WriteBatch : public WriteBatchBase {
267
271
  return Status::InvalidArgument(
268
272
  "non-default column family and SingleDeleteCF not implemented");
269
273
  }
274
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
270
275
  virtual void SingleDelete(const Slice& /*key*/) {}
271
276
 
277
+ // If user-defined timestamp is enabled, then `begin_key` and `end_key`
278
+ // both include timestamp.
272
279
  virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
273
280
  const Slice& /*begin_key*/,
274
281
  const Slice& /*end_key*/) {
275
282
  return Status::InvalidArgument("DeleteRangeCF not implemented");
276
283
  }
277
284
 
285
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
278
286
  virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
279
287
  const Slice& value) {
280
288
  if (column_family_id == 0) {
@@ -284,8 +292,10 @@ class WriteBatch : public WriteBatchBase {
284
292
  return Status::InvalidArgument(
285
293
  "non-default column family and MergeCF not implemented");
286
294
  }
295
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
287
296
  virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
288
297
 
298
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
289
299
  virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
290
300
  const Slice& /*key*/,
291
301
  const Slice& /*value*/) {
@@ -351,9 +351,17 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
351
351
  Slice blob_value(pin_val);
352
352
  push_operand(blob_value, nullptr);
353
353
  } else if (type == kTypeWideColumnEntity) {
354
- // TODO: support wide-column entities
355
- state_ = kUnexpectedWideColumnEntity;
356
- return false;
354
+ Slice value_copy = value;
355
+ Slice value_of_default;
356
+
357
+ if (!WideColumnSerialization::GetValueOfDefaultColumn(
358
+ value_copy, value_of_default)
359
+ .ok()) {
360
+ state_ = kCorrupt;
361
+ return false;
362
+ }
363
+
364
+ push_operand(value_of_default, value_pinner);
357
365
  } else {
358
366
  assert(type == kTypeValue);
359
367
  push_operand(value, value_pinner);
@@ -377,9 +385,26 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
377
385
  push_operand(blob_value, nullptr);
378
386
  }
379
387
  } else if (type == kTypeWideColumnEntity) {
380
- // TODO: support wide-column entities
381
- state_ = kUnexpectedWideColumnEntity;
382
- return false;
388
+ state_ = kFound;
389
+
390
+ if (do_merge_) {
391
+ MergeWithEntity(value);
392
+ } else {
393
+ // It means this function is called as part of DB GetMergeOperands
394
+ // API and the current value should be part of
395
+ // merge_context_->operand_list
396
+ Slice value_copy = value;
397
+ Slice value_of_default;
398
+
399
+ if (!WideColumnSerialization::GetValueOfDefaultColumn(
400
+ value_copy, value_of_default)
401
+ .ok()) {
402
+ state_ = kCorrupt;
403
+ return false;
404
+ }
405
+
406
+ push_operand(value_of_default, value_pinner);
407
+ }
383
408
  } else {
384
409
  assert(type == kTypeValue);
385
410
 
@@ -446,7 +471,26 @@ void GetContext::Merge(const Slice* value) {
446
471
  const Status s = MergeHelper::TimedFullMerge(
447
472
  merge_operator_, user_key_, value, merge_context_->GetOperands(),
448
473
  pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, logger_,
449
- statistics_, clock_);
474
+ statistics_, clock_, /* result_operand */ nullptr,
475
+ /* update_num_ops_stats */ true);
476
+ if (!s.ok()) {
477
+ state_ = kCorrupt;
478
+ return;
479
+ }
480
+
481
+ if (LIKELY(pinnable_val_ != nullptr)) {
482
+ pinnable_val_->PinSelf();
483
+ }
484
+ }
485
+
486
+ void GetContext::MergeWithEntity(Slice entity) {
487
+ assert(do_merge_);
488
+ assert(!pinnable_val_ || !columns_);
489
+
490
+ const Status s = MergeHelper::TimedFullMergeWithEntity(
491
+ merge_operator_, user_key_, entity, merge_context_->GetOperands(),
492
+ pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, logger_,
493
+ statistics_, clock_, /* update_num_ops_stats */ true);
450
494
  if (!s.ok()) {
451
495
  state_ = kCorrupt;
452
496
  return;
@@ -478,6 +522,7 @@ bool GetContext::GetBlobValue(const Slice& blob_index,
478
522
  }
479
523
 
480
524
  void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
525
+ // TODO(yanqin) preserve timestamps information in merge_context
481
526
  if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
482
527
  value_pinner != nullptr) {
483
528
  value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
@@ -75,8 +75,6 @@ class GetContext {
75
75
  kCorrupt,
76
76
  kMerge, // saver contains the current merge result (the operands)
77
77
  kUnexpectedBlobIndex,
78
- // TODO: remove once wide-column entities are supported by Get/MultiGet
79
- kUnexpectedWideColumnEntity,
80
78
  };
81
79
  GetContextStats get_context_stats_;
82
80
 
@@ -185,6 +183,7 @@ class GetContext {
185
183
 
186
184
  private:
187
185
  void Merge(const Slice* value);
186
+ void MergeWithEntity(Slice entity);
188
187
  bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value);
189
188
 
190
189
  const Comparator* ucmp_;
@@ -3498,6 +3498,11 @@ class Benchmark {
3498
3498
  fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3499
3499
  entries_per_batch_);
3500
3500
  method = &Benchmark::MultiReadRandom;
3501
+ } else if (name == "multireadwhilewriting") {
3502
+ fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3503
+ entries_per_batch_);
3504
+ num_threads++;
3505
+ method = &Benchmark::MultiReadWhileWriting;
3501
3506
  } else if (name == "approximatesizerandom") {
3502
3507
  fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3503
3508
  entries_per_batch_);
@@ -6895,6 +6900,14 @@ class Benchmark {
6895
6900
  }
6896
6901
  }
6897
6902
 
6903
+ void MultiReadWhileWriting(ThreadState* thread) {
6904
+ if (thread->tid > 0) {
6905
+ MultiReadRandom(thread);
6906
+ } else {
6907
+ BGWriter(thread, kWrite);
6908
+ }
6909
+ }
6910
+
6898
6911
  void ReadWhileMerging(ThreadState* thread) {
6899
6912
  if (thread->tid > 0) {
6900
6913
  ReadRandom(thread);
@@ -22,6 +22,12 @@
22
22
  #if defined(__APPLE__)
23
23
  #include <sys/sysctl.h>
24
24
  #endif
25
+ #if defined(__OpenBSD__)
26
+ #include <sys/types.h>
27
+ #include <sys/sysctl.h>
28
+ #include <machine/cpu.h>
29
+ #include <machine/armreg.h>
30
+ #endif
25
31
 
26
32
  #ifdef HAVE_ARM64_CRYPTO
27
33
  /* unfolding to compute 8 * 3 = 24 bytes parallelly */
@@ -46,7 +52,7 @@
46
52
  extern bool pmull_runtime_flag;
47
53
 
48
54
  uint32_t crc32c_runtime_check(void) {
49
- #if !defined(__APPLE__)
55
+ #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
50
56
  uint64_t auxv = 0;
51
57
  #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
52
58
  auxv = getauxval(AT_HWCAP);
@@ -54,16 +60,29 @@ uint32_t crc32c_runtime_check(void) {
54
60
  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
55
61
  #endif
56
62
  return (auxv & HWCAP_CRC32) != 0;
57
- #else
63
+ #elif defined(__APPLE__)
58
64
  int r;
59
65
  size_t l = sizeof(r);
60
66
  if (sysctlbyname("hw.optional.armv8_crc32", &r, &l, NULL, 0) == -1) return 0;
61
67
  return r == 1;
68
+ #elif defined(__OpenBSD__)
69
+ int r = 0;
70
+ const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
71
+ uint64_t isar0;
72
+ size_t len = sizeof(isar0);
73
+
74
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
75
+ if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
76
+ r = 1;
77
+ }
78
+ return r;
79
+ #else
80
+ return 0;
62
81
  #endif
63
82
  }
64
83
 
65
84
  bool crc32c_pmull_runtime_check(void) {
66
- #if !defined(__APPLE__)
85
+ #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
67
86
  uint64_t auxv = 0;
68
87
  #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
69
88
  auxv = getauxval(AT_HWCAP);
@@ -71,8 +90,21 @@ bool crc32c_pmull_runtime_check(void) {
71
90
  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
72
91
  #endif
73
92
  return (auxv & HWCAP_PMULL) != 0;
74
- #else
93
+ #elif defined(__APPLE__)
75
94
  return true;
95
+ #elif defined(__OpenBSD__)
96
+ bool r = false;
97
+ const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
98
+ uint64_t isar0;
99
+ size_t len = sizeof(isar0);
100
+
101
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
102
+ if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL)
103
+ r = true;
104
+ }
105
+ return r;
106
+ #else
107
+ return false;
76
108
  #endif
77
109
  }
78
110