leveldb-ruby 0.7 → 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. data/README +1 -1
  2. data/leveldb/Makefile +70 -29
  3. data/leveldb/build_detect_platform +74 -0
  4. data/leveldb/db/builder.cc +2 -4
  5. data/leveldb/db/builder.h +4 -6
  6. data/leveldb/db/c.cc +471 -0
  7. data/leveldb/db/corruption_test.cc +21 -16
  8. data/leveldb/db/db_bench.cc +400 -200
  9. data/leveldb/db/db_impl.cc +276 -131
  10. data/leveldb/db/db_impl.h +22 -10
  11. data/leveldb/db/db_iter.cc +2 -1
  12. data/leveldb/db/db_test.cc +391 -43
  13. data/leveldb/db/dbformat.cc +31 -0
  14. data/leveldb/db/dbformat.h +51 -1
  15. data/leveldb/db/filename.h +1 -1
  16. data/leveldb/db/log_format.h +1 -1
  17. data/leveldb/db/log_reader.cc +16 -11
  18. data/leveldb/db/memtable.cc +37 -0
  19. data/leveldb/db/memtable.h +6 -0
  20. data/leveldb/db/repair.cc +17 -14
  21. data/leveldb/db/skiplist_test.cc +2 -2
  22. data/leveldb/db/version_edit.cc +7 -9
  23. data/leveldb/db/version_edit.h +2 -1
  24. data/leveldb/db/version_set.cc +416 -104
  25. data/leveldb/db/version_set.h +78 -14
  26. data/leveldb/db/version_set_test.cc +179 -0
  27. data/leveldb/db/write_batch_internal.h +2 -0
  28. data/leveldb/include/leveldb/c.h +246 -0
  29. data/leveldb/include/leveldb/db.h +14 -2
  30. data/leveldb/include/leveldb/env.h +31 -10
  31. data/leveldb/include/leveldb/options.h +7 -18
  32. data/leveldb/include/leveldb/slice.h +2 -2
  33. data/leveldb/include/leveldb/status.h +1 -1
  34. data/leveldb/port/atomic_pointer.h +144 -0
  35. data/leveldb/port/port.h +0 -2
  36. data/leveldb/port/port_android.h +7 -1
  37. data/leveldb/port/port_example.h +11 -1
  38. data/leveldb/port/port_posix.h +56 -38
  39. data/leveldb/table/format.cc +12 -8
  40. data/leveldb/table/table_test.cc +16 -7
  41. data/leveldb/util/cache.cc +173 -100
  42. data/leveldb/util/cache_test.cc +28 -11
  43. data/leveldb/util/coding.h +4 -4
  44. data/leveldb/util/comparator.cc +1 -0
  45. data/leveldb/util/env.cc +10 -5
  46. data/leveldb/util/env_posix.cc +48 -87
  47. data/leveldb/util/histogram.cc +11 -0
  48. data/leveldb/util/histogram.h +1 -0
  49. data/leveldb/util/posix_logger.h +98 -0
  50. data/leveldb/util/testharness.cc +12 -0
  51. data/leveldb/util/testharness.h +10 -1
  52. data/lib/leveldb.rb +11 -3
  53. metadata +41 -22
@@ -68,16 +68,6 @@ struct DBImpl::CompactionState {
68
68
  }
69
69
  };
70
70
 
71
- namespace {
72
- class NullWritableFile : public WritableFile {
73
- public:
74
- virtual Status Append(const Slice& data) { return Status::OK(); }
75
- virtual Status Close() { return Status::OK(); }
76
- virtual Status Flush() { return Status::OK(); }
77
- virtual Status Sync() { return Status::OK(); }
78
- };
79
- }
80
-
81
71
  // Fix user-supplied options to be reasonable
82
72
  template <class T,class V>
83
73
  static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
@@ -96,11 +86,10 @@ Options SanitizeOptions(const std::string& dbname,
96
86
  // Open a log file in the same directory as the db
97
87
  src.env->CreateDir(dbname); // In case it does not exist
98
88
  src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
99
- Status s = src.env->NewWritableFile(InfoLogFileName(dbname),
100
- &result.info_log);
89
+ Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
101
90
  if (!s.ok()) {
102
91
  // No place suitable for logging
103
- result.info_log = new NullWritableFile;
92
+ result.info_log = NULL;
104
93
  }
105
94
  }
106
95
  if (result.block_cache == NULL) {
@@ -119,13 +108,15 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
119
108
  db_lock_(NULL),
120
109
  shutting_down_(NULL),
121
110
  bg_cv_(&mutex_),
122
- compacting_cv_(&mutex_),
123
111
  mem_(new MemTable(internal_comparator_)),
124
112
  imm_(NULL),
125
113
  logfile_(NULL),
114
+ logfile_number_(0),
126
115
  log_(NULL),
116
+ logger_(NULL),
117
+ logger_cv_(&mutex_),
127
118
  bg_compaction_scheduled_(false),
128
- compacting_(false) {
119
+ manual_compaction_(NULL) {
129
120
  mem_->Ref();
130
121
  has_imm_.Release_Store(NULL);
131
122
 
@@ -141,10 +132,8 @@ DBImpl::~DBImpl() {
141
132
  // Wait for background work to finish
142
133
  mutex_.Lock();
143
134
  shutting_down_.Release_Store(this); // Any non-NULL value is ok
144
- if (bg_compaction_scheduled_) {
145
- while (bg_compaction_scheduled_) {
146
- bg_cv_.Wait();
147
- }
135
+ while (bg_compaction_scheduled_) {
136
+ bg_cv_.Wait();
148
137
  }
149
138
  mutex_.Unlock();
150
139
 
@@ -203,7 +192,7 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
203
192
  if (s->ok() || options_.paranoid_checks) {
204
193
  // No change needed
205
194
  } else {
206
- Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str());
195
+ Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
207
196
  *s = Status::OK();
208
197
  }
209
198
  }
@@ -222,7 +211,7 @@ void DBImpl::DeleteObsoleteFiles() {
222
211
  bool keep = true;
223
212
  switch (type) {
224
213
  case kLogFile:
225
- keep = ((number == versions_->LogNumber()) ||
214
+ keep = ((number >= versions_->LogNumber()) ||
226
215
  (number == versions_->PrevLogNumber()));
227
216
  break;
228
217
  case kDescriptorFile:
@@ -249,7 +238,7 @@ void DBImpl::DeleteObsoleteFiles() {
249
238
  if (type == kTableFile) {
250
239
  table_cache_->Evict(number);
251
240
  }
252
- Log(env_, options_.info_log, "Delete type=%d #%lld\n",
241
+ Log(options_.info_log, "Delete type=%d #%lld\n",
253
242
  int(type),
254
243
  static_cast<unsigned long long>(number));
255
244
  env_->DeleteFile(dbname_ + "/" + filenames[i]);
@@ -290,14 +279,44 @@ Status DBImpl::Recover(VersionEdit* edit) {
290
279
 
291
280
  s = versions_->Recover();
292
281
  if (s.ok()) {
293
- // Recover from the log files named in the descriptor
294
282
  SequenceNumber max_sequence(0);
295
- if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log
296
- s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence);
283
+
284
+ // Recover from all newer log files than the ones named in the
285
+ // descriptor (new log files may have been added by the previous
286
+ // incarnation without registering them in the descriptor).
287
+ //
288
+ // Note that PrevLogNumber() is no longer used, but we pay
289
+ // attention to it in case we are recovering a database
290
+ // produced by an older version of leveldb.
291
+ const uint64_t min_log = versions_->LogNumber();
292
+ const uint64_t prev_log = versions_->PrevLogNumber();
293
+ std::vector<std::string> filenames;
294
+ s = env_->GetChildren(dbname_, &filenames);
295
+ if (!s.ok()) {
296
+ return s;
297
+ }
298
+ uint64_t number;
299
+ FileType type;
300
+ std::vector<uint64_t> logs;
301
+ for (size_t i = 0; i < filenames.size(); i++) {
302
+ if (ParseFileName(filenames[i], &number, &type)
303
+ && type == kLogFile
304
+ && ((number >= min_log) || (number == prev_log))) {
305
+ logs.push_back(number);
306
+ }
297
307
  }
298
- if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state
299
- s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence);
308
+
309
+ // Recover in the order in which the logs were generated
310
+ std::sort(logs.begin(), logs.end());
311
+ for (size_t i = 0; i < logs.size(); i++) {
312
+ s = RecoverLogFile(logs[i], edit, &max_sequence);
313
+
314
+ // The previous incarnation may not have written any MANIFEST
315
+ // records after allocating this log number. So we manually
316
+ // update the file number allocation counter in VersionSet.
317
+ versions_->MarkFileNumberUsed(logs[i]);
300
318
  }
319
+
301
320
  if (s.ok()) {
302
321
  if (versions_->LastSequence() < max_sequence) {
303
322
  versions_->SetLastSequence(max_sequence);
@@ -313,11 +332,11 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
313
332
  SequenceNumber* max_sequence) {
314
333
  struct LogReporter : public log::Reader::Reporter {
315
334
  Env* env;
316
- WritableFile* info_log;
335
+ Logger* info_log;
317
336
  const char* fname;
318
337
  Status* status; // NULL if options_.paranoid_checks==false
319
338
  virtual void Corruption(size_t bytes, const Status& s) {
320
- Log(env, info_log, "%s%s: dropping %d bytes; %s",
339
+ Log(info_log, "%s%s: dropping %d bytes; %s",
321
340
  (this->status == NULL ? "(ignoring error) " : ""),
322
341
  fname, static_cast<int>(bytes), s.ToString().c_str());
323
342
  if (this->status != NULL && this->status->ok()) *this->status = s;
@@ -347,7 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
347
366
  // large sequence numbers).
348
367
  log::Reader reader(file, &reporter, true/*checksum*/,
349
368
  0/*initial_offset*/);
350
- Log(env_, options_.info_log, "Recovering log #%llu",
369
+ Log(options_.info_log, "Recovering log #%llu",
351
370
  (unsigned long long) log_number);
352
371
 
353
372
  // Read all the records and add to a memtable
@@ -381,7 +400,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
381
400
  }
382
401
 
383
402
  if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
384
- status = WriteLevel0Table(mem, edit);
403
+ status = WriteLevel0Table(mem, edit, NULL);
385
404
  if (!status.ok()) {
386
405
  // Reflect errors immediately so that conditions like full
387
406
  // file-systems cause the DB::Open() to fail.
@@ -393,7 +412,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
393
412
  }
394
413
 
395
414
  if (status.ok() && mem != NULL) {
396
- status = WriteLevel0Table(mem, edit);
415
+ status = WriteLevel0Table(mem, edit, NULL);
397
416
  // Reflect errors immediately so that conditions like full
398
417
  // file-systems cause the DB::Open() to fail.
399
418
  }
@@ -403,50 +422,72 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
403
422
  return status;
404
423
  }
405
424
 
406
- Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) {
425
+ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
426
+ Version* base) {
407
427
  mutex_.AssertHeld();
408
428
  const uint64_t start_micros = env_->NowMicros();
409
429
  FileMetaData meta;
410
430
  meta.number = versions_->NewFileNumber();
411
431
  pending_outputs_.insert(meta.number);
412
432
  Iterator* iter = mem->NewIterator();
413
- Log(env_, options_.info_log, "Level-0 table #%llu: started",
433
+ Log(options_.info_log, "Level-0 table #%llu: started",
414
434
  (unsigned long long) meta.number);
415
435
 
416
436
  Status s;
417
437
  {
418
438
  mutex_.Unlock();
419
- s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit);
439
+ s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
420
440
  mutex_.Lock();
421
441
  }
422
442
 
423
- Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s",
443
+ Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
424
444
  (unsigned long long) meta.number,
425
445
  (unsigned long long) meta.file_size,
426
446
  s.ToString().c_str());
427
447
  delete iter;
428
448
  pending_outputs_.erase(meta.number);
429
449
 
450
+
451
+ // Note that if file_size is zero, the file has been deleted and
452
+ // should not be added to the manifest.
453
+ int level = 0;
454
+ if (s.ok() && meta.file_size > 0) {
455
+ const Slice min_user_key = meta.smallest.user_key();
456
+ const Slice max_user_key = meta.largest.user_key();
457
+ if (base != NULL) {
458
+ level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
459
+ }
460
+ edit->AddFile(level, meta.number, meta.file_size,
461
+ meta.smallest, meta.largest);
462
+ }
463
+
430
464
  CompactionStats stats;
431
465
  stats.micros = env_->NowMicros() - start_micros;
432
466
  stats.bytes_written = meta.file_size;
433
- stats_[0].Add(stats);
467
+ stats_[level].Add(stats);
434
468
  return s;
435
469
  }
436
470
 
437
471
  Status DBImpl::CompactMemTable() {
438
472
  mutex_.AssertHeld();
439
473
  assert(imm_ != NULL);
440
- assert(compacting_);
441
474
 
442
475
  // Save the contents of the memtable as a new Table
443
476
  VersionEdit edit;
444
- Status s = WriteLevel0Table(imm_, &edit);
477
+ Version* base = versions_->current();
478
+ base->Ref();
479
+ Status s = WriteLevel0Table(imm_, &edit, base);
480
+ base->Unref();
481
+
482
+ if (s.ok() && shutting_down_.Acquire_Load()) {
483
+ s = Status::IOError("Deleting DB during memtable compaction");
484
+ }
445
485
 
446
486
  // Replace immutable memtable with the generated Table
447
487
  if (s.ok()) {
448
488
  edit.SetPrevLogNumber(0);
449
- s = versions_->LogAndApply(&edit);
489
+ edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
490
+ s = versions_->LogAndApply(&edit, &mutex_);
450
491
  }
451
492
 
452
493
  if (s.ok()) {
@@ -457,40 +498,71 @@ Status DBImpl::CompactMemTable() {
457
498
  DeleteObsoleteFiles();
458
499
  }
459
500
 
460
- compacting_cv_.SignalAll(); // Wake up waiter even if there was an error
461
501
  return s;
462
502
  }
463
503
 
464
- void DBImpl::TEST_CompactRange(
465
- int level,
466
- const std::string& begin,
467
- const std::string& end) {
468
- MutexLock l(&mutex_);
469
- while (compacting_) {
470
- compacting_cv_.Wait();
504
+ void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
505
+ int max_level_with_files = 1;
506
+ {
507
+ MutexLock l(&mutex_);
508
+ Version* base = versions_->current();
509
+ for (int level = 1; level < config::kNumLevels; level++) {
510
+ if (base->OverlapInLevel(level, begin, end)) {
511
+ max_level_with_files = level;
512
+ }
513
+ }
514
+ }
515
+ TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
516
+ for (int level = 0; level < max_level_with_files; level++) {
517
+ TEST_CompactRange(level, begin, end);
471
518
  }
472
- Compaction* c = versions_->CompactRange(
473
- level,
474
- InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek),
475
- InternalKey(end, 0, static_cast<ValueType>(0)));
519
+ }
476
520
 
477
- if (c != NULL) {
478
- CompactionState* compact = new CompactionState(c);
479
- DoCompactionWork(compact); // Ignore error in test compaction
480
- CleanupCompaction(compact);
521
+ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
522
+ assert(level >= 0);
523
+ assert(level + 1 < config::kNumLevels);
524
+
525
+ InternalKey begin_storage, end_storage;
526
+
527
+ ManualCompaction manual;
528
+ manual.level = level;
529
+ manual.done = false;
530
+ if (begin == NULL) {
531
+ manual.begin = NULL;
532
+ } else {
533
+ begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
534
+ manual.begin = &begin_storage;
535
+ }
536
+ if (end == NULL) {
537
+ manual.end = NULL;
538
+ } else {
539
+ end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
540
+ manual.end = &end_storage;
481
541
  }
482
542
 
483
- // Start any background compaction that may have been delayed by this thread
484
- MaybeScheduleCompaction();
543
+ MutexLock l(&mutex_);
544
+ while (!manual.done) {
545
+ while (manual_compaction_ != NULL) {
546
+ bg_cv_.Wait();
547
+ }
548
+ manual_compaction_ = &manual;
549
+ MaybeScheduleCompaction();
550
+ while (manual_compaction_ == &manual) {
551
+ bg_cv_.Wait();
552
+ }
553
+ }
485
554
  }
486
555
 
487
556
  Status DBImpl::TEST_CompactMemTable() {
488
557
  MutexLock l(&mutex_);
558
+ LoggerId self;
559
+ AcquireLoggingResponsibility(&self);
489
560
  Status s = MakeRoomForWrite(true /* force compaction */);
561
+ ReleaseLoggingResponsibility(&self);
490
562
  if (s.ok()) {
491
563
  // Wait until the compaction completes
492
564
  while (imm_ != NULL && bg_error_.ok()) {
493
- compacting_cv_.Wait();
565
+ bg_cv_.Wait();
494
566
  }
495
567
  if (imm_ != NULL) {
496
568
  s = bg_error_;
@@ -503,11 +575,11 @@ void DBImpl::MaybeScheduleCompaction() {
503
575
  mutex_.AssertHeld();
504
576
  if (bg_compaction_scheduled_) {
505
577
  // Already scheduled
506
- } else if (compacting_) {
507
- // Some other thread is running a compaction. Do not conflict with it.
508
578
  } else if (shutting_down_.Acquire_Load()) {
509
579
  // DB is being deleted; no more background compactions
510
- } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
580
+ } else if (imm_ == NULL &&
581
+ manual_compaction_ == NULL &&
582
+ !versions_->NeedsCompaction()) {
511
583
  // No work to be done
512
584
  } else {
513
585
  bg_compaction_scheduled_ = true;
@@ -522,50 +594,63 @@ void DBImpl::BGWork(void* db) {
522
594
  void DBImpl::BackgroundCall() {
523
595
  MutexLock l(&mutex_);
524
596
  assert(bg_compaction_scheduled_);
525
- if (!shutting_down_.Acquire_Load() &&
526
- !compacting_) {
597
+ if (!shutting_down_.Acquire_Load()) {
527
598
  BackgroundCompaction();
528
599
  }
529
600
  bg_compaction_scheduled_ = false;
530
- bg_cv_.SignalAll();
531
601
 
532
602
  // Previous compaction may have produced too many files in a level,
533
603
  // so reschedule another compaction if needed.
534
604
  MaybeScheduleCompaction();
605
+ bg_cv_.SignalAll();
535
606
  }
536
607
 
537
608
  void DBImpl::BackgroundCompaction() {
538
609
  mutex_.AssertHeld();
539
- assert(!compacting_);
540
610
 
541
611
  if (imm_ != NULL) {
542
- compacting_ = true;
543
612
  CompactMemTable();
544
- compacting_ = false;
545
- compacting_cv_.SignalAll();
546
613
  return;
547
614
  }
548
615
 
549
- Compaction* c = versions_->PickCompaction();
550
- if (c == NULL) {
551
- // Nothing to do
552
- return;
616
+ Compaction* c;
617
+ bool is_manual = (manual_compaction_ != NULL);
618
+ InternalKey manual_end;
619
+ if (is_manual) {
620
+ ManualCompaction* m = manual_compaction_;
621
+ c = versions_->CompactRange(m->level, m->begin, m->end);
622
+ m->done = (c == NULL);
623
+ if (c != NULL) {
624
+ manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
625
+ }
626
+ Log(options_.info_log,
627
+ "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
628
+ m->level,
629
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
630
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
631
+ (m->done ? "(end)" : manual_end.DebugString().c_str()));
632
+ } else {
633
+ c = versions_->PickCompaction();
553
634
  }
554
635
 
555
636
  Status status;
556
- if (c->IsTrivialMove()) {
637
+ if (c == NULL) {
638
+ // Nothing to do
639
+ } else if (!is_manual && c->IsTrivialMove()) {
557
640
  // Move file to next level
558
641
  assert(c->num_input_files(0) == 1);
559
642
  FileMetaData* f = c->input(0, 0);
560
643
  c->edit()->DeleteFile(c->level(), f->number);
561
644
  c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
562
645
  f->smallest, f->largest);
563
- status = versions_->LogAndApply(c->edit());
564
- Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
646
+ status = versions_->LogAndApply(c->edit(), &mutex_);
647
+ VersionSet::LevelSummaryStorage tmp;
648
+ Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
565
649
  static_cast<unsigned long long>(f->number),
566
650
  c->level() + 1,
567
651
  static_cast<unsigned long long>(f->file_size),
568
- status.ToString().c_str());
652
+ status.ToString().c_str(),
653
+ versions_->LevelSummary(&tmp));
569
654
  } else {
570
655
  CompactionState* compact = new CompactionState(c);
571
656
  status = DoCompactionWork(compact);
@@ -578,12 +663,23 @@ void DBImpl::BackgroundCompaction() {
578
663
  } else if (shutting_down_.Acquire_Load()) {
579
664
  // Ignore compaction errors found during shutting down
580
665
  } else {
581
- Log(env_, options_.info_log,
666
+ Log(options_.info_log,
582
667
  "Compaction error: %s", status.ToString().c_str());
583
668
  if (options_.paranoid_checks && bg_error_.ok()) {
584
669
  bg_error_ = status;
585
670
  }
586
671
  }
672
+
673
+ if (is_manual) {
674
+ ManualCompaction* m = manual_compaction_;
675
+ if (!m->done) {
676
+ // We only compacted part of the requested range. Update *m
677
+ // to the range that is left to be compacted.
678
+ m->tmp_storage = manual_end;
679
+ m->begin = &m->tmp_storage;
680
+ }
681
+ manual_compaction_ = NULL;
682
+ }
587
683
  }
588
684
 
589
685
  void DBImpl::CleanupCompaction(CompactionState* compact) {
@@ -669,7 +765,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
669
765
  s = iter->status();
670
766
  delete iter;
671
767
  if (s.ok()) {
672
- Log(env_, options_.info_log,
768
+ Log(options_.info_log,
673
769
  "Generated table #%llu: %lld keys, %lld bytes",
674
770
  (unsigned long long) output_number,
675
771
  (unsigned long long) current_entries,
@@ -682,7 +778,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
682
778
 
683
779
  Status DBImpl::InstallCompactionResults(CompactionState* compact) {
684
780
  mutex_.AssertHeld();
685
- Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
781
+ Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
686
782
  compact->compaction->num_input_files(0),
687
783
  compact->compaction->level(),
688
784
  compact->compaction->num_input_files(1),
@@ -701,7 +797,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
701
797
  }
702
798
  compact->outputs.clear();
703
799
 
704
- Status s = versions_->LogAndApply(compact->compaction->edit());
800
+ Status s = versions_->LogAndApply(compact->compaction->edit(), &mutex_);
705
801
  if (s.ok()) {
706
802
  compact->compaction->ReleaseInputs();
707
803
  DeleteObsoleteFiles();
@@ -718,7 +814,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
718
814
  const uint64_t start_micros = env_->NowMicros();
719
815
  int64_t imm_micros = 0; // Micros spent doing imm_ compactions
720
816
 
721
- Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files",
817
+ Log(options_.info_log, "Compacting %d@%d + %d@%d files",
722
818
  compact->compaction->num_input_files(0),
723
819
  compact->compaction->level(),
724
820
  compact->compaction->num_input_files(1),
@@ -734,7 +830,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
734
830
  }
735
831
 
736
832
  // Release mutex while we're actually doing the compaction work
737
- compacting_ = true;
738
833
  mutex_.Unlock();
739
834
 
740
835
  Iterator* input = versions_->MakeInputIterator(compact->compaction);
@@ -751,7 +846,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
751
846
  mutex_.Lock();
752
847
  if (imm_ != NULL) {
753
848
  CompactMemTable();
754
- compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
849
+ bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
755
850
  }
756
851
  mutex_.Unlock();
757
852
  imm_micros += (env_->NowMicros() - imm_start);
@@ -802,7 +897,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
802
897
  last_sequence_for_key = ikey.sequence;
803
898
  }
804
899
  #if 0
805
- Log(env_, options_.info_log,
900
+ Log(options_.info_log,
806
901
  " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
807
902
  "%d smallest_snapshot: %d",
808
903
  ikey.user_key.ToString().c_str(),
@@ -867,10 +962,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
867
962
  if (status.ok()) {
868
963
  status = InstallCompactionResults(compact);
869
964
  }
870
- compacting_ = false;
871
- compacting_cv_.SignalAll();
872
965
  VersionSet::LevelSummaryStorage tmp;
873
- Log(env_, options_.info_log,
966
+ Log(options_.info_log,
874
967
  "compacted to: %s", versions_->LevelSummary(&tmp));
875
968
  return status;
876
969
  }
@@ -936,22 +1029,48 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
936
1029
  Status DBImpl::Get(const ReadOptions& options,
937
1030
  const Slice& key,
938
1031
  std::string* value) {
939
- // TODO(opt): faster implementation
940
- Iterator* iter = NewIterator(options);
941
- iter->Seek(key);
942
- bool found = false;
943
- if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) {
944
- Slice v = iter->value();
945
- value->assign(v.data(), v.size());
946
- found = true;
947
- }
948
- // Non-OK iterator status trumps everything else
949
- Status result = iter->status();
950
- if (result.ok() && !found) {
951
- result = Status::NotFound(Slice()); // Use an empty error message for speed
1032
+ Status s;
1033
+ MutexLock l(&mutex_);
1034
+ SequenceNumber snapshot;
1035
+ if (options.snapshot != NULL) {
1036
+ snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
1037
+ } else {
1038
+ snapshot = versions_->LastSequence();
952
1039
  }
953
- delete iter;
954
- return result;
1040
+
1041
+ MemTable* mem = mem_;
1042
+ MemTable* imm = imm_;
1043
+ Version* current = versions_->current();
1044
+ mem->Ref();
1045
+ if (imm != NULL) imm->Ref();
1046
+ current->Ref();
1047
+
1048
+ bool have_stat_update = false;
1049
+ Version::GetStats stats;
1050
+
1051
+ // Unlock while reading from files and memtables
1052
+ {
1053
+ mutex_.Unlock();
1054
+ // First look in the memtable, then in the immutable memtable (if any).
1055
+ LookupKey lkey(key, snapshot);
1056
+ if (mem->Get(lkey, value, &s)) {
1057
+ // Done
1058
+ } else if (imm != NULL && imm->Get(lkey, value, &s)) {
1059
+ // Done
1060
+ } else {
1061
+ s = current->Get(options, lkey, value, &stats);
1062
+ have_stat_update = true;
1063
+ }
1064
+ mutex_.Lock();
1065
+ }
1066
+
1067
+ if (have_stat_update && current->UpdateStats(stats)) {
1068
+ MaybeScheduleCompaction();
1069
+ }
1070
+ mem->Unref();
1071
+ if (imm != NULL) imm->Unref();
1072
+ current->Unref();
1073
+ return s;
955
1074
  }
956
1075
 
957
1076
  Iterator* DBImpl::NewIterator(const ReadOptions& options) {
@@ -983,34 +1102,61 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
983
1102
  return DB::Delete(options, key);
984
1103
  }
985
1104
 
1105
+ // There is at most one thread that is the current logger. This call
1106
+ // waits until preceding logger(s) have finished and becomes the
1107
+ // current logger.
1108
+ void DBImpl::AcquireLoggingResponsibility(LoggerId* self) {
1109
+ while (logger_ != NULL) {
1110
+ logger_cv_.Wait();
1111
+ }
1112
+ logger_ = self;
1113
+ }
1114
+
1115
+ void DBImpl::ReleaseLoggingResponsibility(LoggerId* self) {
1116
+ assert(logger_ == self);
1117
+ logger_ = NULL;
1118
+ logger_cv_.SignalAll();
1119
+ }
1120
+
986
1121
  Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
987
1122
  Status status;
988
1123
  MutexLock l(&mutex_);
1124
+ LoggerId self;
1125
+ AcquireLoggingResponsibility(&self);
989
1126
  status = MakeRoomForWrite(false); // May temporarily release lock and wait
990
1127
  uint64_t last_sequence = versions_->LastSequence();
991
1128
  if (status.ok()) {
992
1129
  WriteBatchInternal::SetSequence(updates, last_sequence + 1);
993
1130
  last_sequence += WriteBatchInternal::Count(updates);
994
- versions_->SetLastSequence(last_sequence);
995
1131
 
996
- // Add to log and apply to memtable
997
- status = log_->AddRecord(WriteBatchInternal::Contents(updates));
998
- if (status.ok() && options.sync) {
999
- status = logfile_->Sync();
1000
- }
1001
- if (status.ok()) {
1002
- status = WriteBatchInternal::InsertInto(updates, mem_);
1132
+ // Add to log and apply to memtable. We can release the lock during
1133
+ // this phase since the "logger_" flag protects against concurrent
1134
+ // loggers and concurrent writes into mem_.
1135
+ {
1136
+ assert(logger_ == &self);
1137
+ mutex_.Unlock();
1138
+ status = log_->AddRecord(WriteBatchInternal::Contents(updates));
1139
+ if (status.ok() && options.sync) {
1140
+ status = logfile_->Sync();
1141
+ }
1142
+ if (status.ok()) {
1143
+ status = WriteBatchInternal::InsertInto(updates, mem_);
1144
+ }
1145
+ mutex_.Lock();
1146
+ assert(logger_ == &self);
1003
1147
  }
1148
+
1149
+ versions_->SetLastSequence(last_sequence);
1004
1150
  }
1005
- if (options.post_write_snapshot != NULL) {
1006
- *options.post_write_snapshot =
1007
- status.ok() ? snapshots_.New(last_sequence) : NULL;
1008
- }
1151
+ ReleaseLoggingResponsibility(&self);
1009
1152
  return status;
1010
1153
  }
1011
1154
 
1155
+ // REQUIRES: mutex_ is held
1156
+ // REQUIRES: this thread is the current logger
1012
1157
  Status DBImpl::MakeRoomForWrite(bool force) {
1013
1158
  mutex_.AssertHeld();
1159
+ assert(logger_ != NULL);
1014
1160
  bool allow_delay = !force;
1015
1161
  Status s;
1016
1162
  while (true) {
@@ -1038,10 +1184,11 @@ Status DBImpl::MakeRoomForWrite(bool force) {
1038
1184
  } else if (imm_ != NULL) {
1039
1185
  // We have filled up the current memtable, but the previous
1040
1186
  // one is still being compacted, so we wait.
1041
- compacting_cv_.Wait();
1187
+ bg_cv_.Wait();
1042
1188
  } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
1043
1189
  // There are too many level-0 files.
1044
- compacting_cv_.Wait();
1190
+ Log(options_.info_log, "waiting...\n");
1191
+ bg_cv_.Wait();
1045
1192
  } else {
1046
1193
  // Attempt to switch to a new memtable and trigger compaction of old
1047
1194
  assert(versions_->PrevLogNumber() == 0);
@@ -1051,18 +1198,10 @@ Status DBImpl::MakeRoomForWrite(bool force) {
1051
1198
  if (!s.ok()) {
1052
1199
  break;
1053
1200
  }
1054
- VersionEdit edit;
1055
- edit.SetPrevLogNumber(versions_->LogNumber());
1056
- edit.SetLogNumber(new_log_number);
1057
- s = versions_->LogAndApply(&edit);
1058
- if (!s.ok()) {
1059
- delete lfile;
1060
- env_->DeleteFile(LogFileName(dbname_, new_log_number));
1061
- break;
1062
- }
1063
1201
  delete log_;
1064
1202
  delete logfile_;
1065
1203
  logfile_ = lfile;
1204
+ logfile_number_ = new_log_number;
1066
1205
  log_ = new log::Writer(lfile);
1067
1206
  imm_ = mem_;
1068
1207
  has_imm_.Release_Store(imm_);
@@ -1088,7 +1227,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
1088
1227
  in.remove_prefix(strlen("num-files-at-level"));
1089
1228
  uint64_t level;
1090
1229
  bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
1091
- if (!ok || level < 0 || level >= config::kNumLevels) {
1230
+ if (!ok || level >= config::kNumLevels) {
1092
1231
  return false;
1093
1232
  } else {
1094
1233
  char buf[100];
@@ -1121,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
1121
1260
  }
1122
1261
  }
1123
1262
  return true;
1263
+ } else if (in == "sstables") {
1264
+ *value = versions_->current()->DebugString();
1265
+ return true;
1124
1266
  }
1125
1267
 
1126
1268
  return false;
@@ -1184,8 +1326,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
1184
1326
  if (s.ok()) {
1185
1327
  edit.SetLogNumber(new_log_number);
1186
1328
  impl->logfile_ = lfile;
1329
+ impl->logfile_number_ = new_log_number;
1187
1330
  impl->log_ = new log::Writer(lfile);
1188
- s = impl->versions_->LogAndApply(&edit);
1331
+ s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
1189
1332
  }
1190
1333
  if (s.ok()) {
1191
1334
  impl->DeleteObsoleteFiles();
@@ -1214,12 +1357,14 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
1214
1357
  }
1215
1358
 
1216
1359
  FileLock* lock;
1217
- Status result = env->LockFile(LockFileName(dbname), &lock);
1360
+ const std::string lockname = LockFileName(dbname);
1361
+ Status result = env->LockFile(lockname, &lock);
1218
1362
  if (result.ok()) {
1219
1363
  uint64_t number;
1220
1364
  FileType type;
1221
1365
  for (size_t i = 0; i < filenames.size(); i++) {
1222
- if (ParseFileName(filenames[i], &number, &type)) {
1366
+ if (ParseFileName(filenames[i], &number, &type) &&
1367
+ filenames[i] != lockname) { // Lock file will be deleted at end
1223
1368
  Status del = env->DeleteFile(dbname + "/" + filenames[i]);
1224
1369
  if (result.ok() && !del.ok()) {
1225
1370
  result = del;
@@ -1227,7 +1372,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
1227
1372
  }
1228
1373
  }
1229
1374
  env->UnlockFile(lock); // Ignore error since state is already gone
1230
- env->DeleteFile(LockFileName(dbname));
1375
+ env->DeleteFile(lockname);
1231
1376
  env->DeleteDir(dbname); // Ignore error in case dir contains other files
1232
1377
  }
1233
1378
  return result;