leveldb-ruby 0.7 → 0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/README +1 -1
  2. data/leveldb/Makefile +70 -29
  3. data/leveldb/build_detect_platform +74 -0
  4. data/leveldb/db/builder.cc +2 -4
  5. data/leveldb/db/builder.h +4 -6
  6. data/leveldb/db/c.cc +471 -0
  7. data/leveldb/db/corruption_test.cc +21 -16
  8. data/leveldb/db/db_bench.cc +400 -200
  9. data/leveldb/db/db_impl.cc +276 -131
  10. data/leveldb/db/db_impl.h +22 -10
  11. data/leveldb/db/db_iter.cc +2 -1
  12. data/leveldb/db/db_test.cc +391 -43
  13. data/leveldb/db/dbformat.cc +31 -0
  14. data/leveldb/db/dbformat.h +51 -1
  15. data/leveldb/db/filename.h +1 -1
  16. data/leveldb/db/log_format.h +1 -1
  17. data/leveldb/db/log_reader.cc +16 -11
  18. data/leveldb/db/memtable.cc +37 -0
  19. data/leveldb/db/memtable.h +6 -0
  20. data/leveldb/db/repair.cc +17 -14
  21. data/leveldb/db/skiplist_test.cc +2 -2
  22. data/leveldb/db/version_edit.cc +7 -9
  23. data/leveldb/db/version_edit.h +2 -1
  24. data/leveldb/db/version_set.cc +416 -104
  25. data/leveldb/db/version_set.h +78 -14
  26. data/leveldb/db/version_set_test.cc +179 -0
  27. data/leveldb/db/write_batch_internal.h +2 -0
  28. data/leveldb/include/leveldb/c.h +246 -0
  29. data/leveldb/include/leveldb/db.h +14 -2
  30. data/leveldb/include/leveldb/env.h +31 -10
  31. data/leveldb/include/leveldb/options.h +7 -18
  32. data/leveldb/include/leveldb/slice.h +2 -2
  33. data/leveldb/include/leveldb/status.h +1 -1
  34. data/leveldb/port/atomic_pointer.h +144 -0
  35. data/leveldb/port/port.h +0 -2
  36. data/leveldb/port/port_android.h +7 -1
  37. data/leveldb/port/port_example.h +11 -1
  38. data/leveldb/port/port_posix.h +56 -38
  39. data/leveldb/table/format.cc +12 -8
  40. data/leveldb/table/table_test.cc +16 -7
  41. data/leveldb/util/cache.cc +173 -100
  42. data/leveldb/util/cache_test.cc +28 -11
  43. data/leveldb/util/coding.h +4 -4
  44. data/leveldb/util/comparator.cc +1 -0
  45. data/leveldb/util/env.cc +10 -5
  46. data/leveldb/util/env_posix.cc +48 -87
  47. data/leveldb/util/histogram.cc +11 -0
  48. data/leveldb/util/histogram.h +1 -0
  49. data/leveldb/util/posix_logger.h +98 -0
  50. data/leveldb/util/testharness.cc +12 -0
  51. data/leveldb/util/testharness.h +10 -1
  52. data/lib/leveldb.rb +11 -3
  53. metadata +41 -22
@@ -68,16 +68,6 @@ struct DBImpl::CompactionState {
68
68
  }
69
69
  };
70
70
 
71
- namespace {
72
- class NullWritableFile : public WritableFile {
73
- public:
74
- virtual Status Append(const Slice& data) { return Status::OK(); }
75
- virtual Status Close() { return Status::OK(); }
76
- virtual Status Flush() { return Status::OK(); }
77
- virtual Status Sync() { return Status::OK(); }
78
- };
79
- }
80
-
81
71
  // Fix user-supplied options to be reasonable
82
72
  template <class T,class V>
83
73
  static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
@@ -96,11 +86,10 @@ Options SanitizeOptions(const std::string& dbname,
96
86
  // Open a log file in the same directory as the db
97
87
  src.env->CreateDir(dbname); // In case it does not exist
98
88
  src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
99
- Status s = src.env->NewWritableFile(InfoLogFileName(dbname),
100
- &result.info_log);
89
+ Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
101
90
  if (!s.ok()) {
102
91
  // No place suitable for logging
103
- result.info_log = new NullWritableFile;
92
+ result.info_log = NULL;
104
93
  }
105
94
  }
106
95
  if (result.block_cache == NULL) {
@@ -119,13 +108,15 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
119
108
  db_lock_(NULL),
120
109
  shutting_down_(NULL),
121
110
  bg_cv_(&mutex_),
122
- compacting_cv_(&mutex_),
123
111
  mem_(new MemTable(internal_comparator_)),
124
112
  imm_(NULL),
125
113
  logfile_(NULL),
114
+ logfile_number_(0),
126
115
  log_(NULL),
116
+ logger_(NULL),
117
+ logger_cv_(&mutex_),
127
118
  bg_compaction_scheduled_(false),
128
- compacting_(false) {
119
+ manual_compaction_(NULL) {
129
120
  mem_->Ref();
130
121
  has_imm_.Release_Store(NULL);
131
122
 
@@ -141,10 +132,8 @@ DBImpl::~DBImpl() {
141
132
  // Wait for background work to finish
142
133
  mutex_.Lock();
143
134
  shutting_down_.Release_Store(this); // Any non-NULL value is ok
144
- if (bg_compaction_scheduled_) {
145
- while (bg_compaction_scheduled_) {
146
- bg_cv_.Wait();
147
- }
135
+ while (bg_compaction_scheduled_) {
136
+ bg_cv_.Wait();
148
137
  }
149
138
  mutex_.Unlock();
150
139
 
@@ -203,7 +192,7 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
203
192
  if (s->ok() || options_.paranoid_checks) {
204
193
  // No change needed
205
194
  } else {
206
- Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str());
195
+ Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
207
196
  *s = Status::OK();
208
197
  }
209
198
  }
@@ -222,7 +211,7 @@ void DBImpl::DeleteObsoleteFiles() {
222
211
  bool keep = true;
223
212
  switch (type) {
224
213
  case kLogFile:
225
- keep = ((number == versions_->LogNumber()) ||
214
+ keep = ((number >= versions_->LogNumber()) ||
226
215
  (number == versions_->PrevLogNumber()));
227
216
  break;
228
217
  case kDescriptorFile:
@@ -249,7 +238,7 @@ void DBImpl::DeleteObsoleteFiles() {
249
238
  if (type == kTableFile) {
250
239
  table_cache_->Evict(number);
251
240
  }
252
- Log(env_, options_.info_log, "Delete type=%d #%lld\n",
241
+ Log(options_.info_log, "Delete type=%d #%lld\n",
253
242
  int(type),
254
243
  static_cast<unsigned long long>(number));
255
244
  env_->DeleteFile(dbname_ + "/" + filenames[i]);
@@ -290,14 +279,44 @@ Status DBImpl::Recover(VersionEdit* edit) {
290
279
 
291
280
  s = versions_->Recover();
292
281
  if (s.ok()) {
293
- // Recover from the log files named in the descriptor
294
282
  SequenceNumber max_sequence(0);
295
- if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log
296
- s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence);
283
+
284
+ // Recover from all newer log files than the ones named in the
285
+ // descriptor (new log files may have been added by the previous
286
+ // incarnation without registering them in the descriptor).
287
+ //
288
+ // Note that PrevLogNumber() is no longer used, but we pay
289
+ // attention to it in case we are recovering a database
290
+ // produced by an older version of leveldb.
291
+ const uint64_t min_log = versions_->LogNumber();
292
+ const uint64_t prev_log = versions_->PrevLogNumber();
293
+ std::vector<std::string> filenames;
294
+ s = env_->GetChildren(dbname_, &filenames);
295
+ if (!s.ok()) {
296
+ return s;
297
+ }
298
+ uint64_t number;
299
+ FileType type;
300
+ std::vector<uint64_t> logs;
301
+ for (size_t i = 0; i < filenames.size(); i++) {
302
+ if (ParseFileName(filenames[i], &number, &type)
303
+ && type == kLogFile
304
+ && ((number >= min_log) || (number == prev_log))) {
305
+ logs.push_back(number);
306
+ }
297
307
  }
298
- if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state
299
- s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence);
308
+
309
+ // Recover in the order in which the logs were generated
310
+ std::sort(logs.begin(), logs.end());
311
+ for (size_t i = 0; i < logs.size(); i++) {
312
+ s = RecoverLogFile(logs[i], edit, &max_sequence);
313
+
314
+ // The previous incarnation may not have written any MANIFEST
315
+ // records after allocating this log number. So we manually
316
+ // update the file number allocation counter in VersionSet.
317
+ versions_->MarkFileNumberUsed(logs[i]);
300
318
  }
319
+
301
320
  if (s.ok()) {
302
321
  if (versions_->LastSequence() < max_sequence) {
303
322
  versions_->SetLastSequence(max_sequence);
@@ -313,11 +332,11 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
313
332
  SequenceNumber* max_sequence) {
314
333
  struct LogReporter : public log::Reader::Reporter {
315
334
  Env* env;
316
- WritableFile* info_log;
335
+ Logger* info_log;
317
336
  const char* fname;
318
337
  Status* status; // NULL if options_.paranoid_checks==false
319
338
  virtual void Corruption(size_t bytes, const Status& s) {
320
- Log(env, info_log, "%s%s: dropping %d bytes; %s",
339
+ Log(info_log, "%s%s: dropping %d bytes; %s",
321
340
  (this->status == NULL ? "(ignoring error) " : ""),
322
341
  fname, static_cast<int>(bytes), s.ToString().c_str());
323
342
  if (this->status != NULL && this->status->ok()) *this->status = s;
@@ -347,7 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
347
366
  // large sequence numbers).
348
367
  log::Reader reader(file, &reporter, true/*checksum*/,
349
368
  0/*initial_offset*/);
350
- Log(env_, options_.info_log, "Recovering log #%llu",
369
+ Log(options_.info_log, "Recovering log #%llu",
351
370
  (unsigned long long) log_number);
352
371
 
353
372
  // Read all the records and add to a memtable
@@ -381,7 +400,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
381
400
  }
382
401
 
383
402
  if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
384
- status = WriteLevel0Table(mem, edit);
403
+ status = WriteLevel0Table(mem, edit, NULL);
385
404
  if (!status.ok()) {
386
405
  // Reflect errors immediately so that conditions like full
387
406
  // file-systems cause the DB::Open() to fail.
@@ -393,7 +412,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
393
412
  }
394
413
 
395
414
  if (status.ok() && mem != NULL) {
396
- status = WriteLevel0Table(mem, edit);
415
+ status = WriteLevel0Table(mem, edit, NULL);
397
416
  // Reflect errors immediately so that conditions like full
398
417
  // file-systems cause the DB::Open() to fail.
399
418
  }
@@ -403,50 +422,72 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
403
422
  return status;
404
423
  }
405
424
 
406
- Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) {
425
+ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
426
+ Version* base) {
407
427
  mutex_.AssertHeld();
408
428
  const uint64_t start_micros = env_->NowMicros();
409
429
  FileMetaData meta;
410
430
  meta.number = versions_->NewFileNumber();
411
431
  pending_outputs_.insert(meta.number);
412
432
  Iterator* iter = mem->NewIterator();
413
- Log(env_, options_.info_log, "Level-0 table #%llu: started",
433
+ Log(options_.info_log, "Level-0 table #%llu: started",
414
434
  (unsigned long long) meta.number);
415
435
 
416
436
  Status s;
417
437
  {
418
438
  mutex_.Unlock();
419
- s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit);
439
+ s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
420
440
  mutex_.Lock();
421
441
  }
422
442
 
423
- Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s",
443
+ Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
424
444
  (unsigned long long) meta.number,
425
445
  (unsigned long long) meta.file_size,
426
446
  s.ToString().c_str());
427
447
  delete iter;
428
448
  pending_outputs_.erase(meta.number);
429
449
 
450
+
451
+ // Note that if file_size is zero, the file has been deleted and
452
+ // should not be added to the manifest.
453
+ int level = 0;
454
+ if (s.ok() && meta.file_size > 0) {
455
+ const Slice min_user_key = meta.smallest.user_key();
456
+ const Slice max_user_key = meta.largest.user_key();
457
+ if (base != NULL) {
458
+ level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
459
+ }
460
+ edit->AddFile(level, meta.number, meta.file_size,
461
+ meta.smallest, meta.largest);
462
+ }
463
+
430
464
  CompactionStats stats;
431
465
  stats.micros = env_->NowMicros() - start_micros;
432
466
  stats.bytes_written = meta.file_size;
433
- stats_[0].Add(stats);
467
+ stats_[level].Add(stats);
434
468
  return s;
435
469
  }
436
470
 
437
471
  Status DBImpl::CompactMemTable() {
438
472
  mutex_.AssertHeld();
439
473
  assert(imm_ != NULL);
440
- assert(compacting_);
441
474
 
442
475
  // Save the contents of the memtable as a new Table
443
476
  VersionEdit edit;
444
- Status s = WriteLevel0Table(imm_, &edit);
477
+ Version* base = versions_->current();
478
+ base->Ref();
479
+ Status s = WriteLevel0Table(imm_, &edit, base);
480
+ base->Unref();
481
+
482
+ if (s.ok() && shutting_down_.Acquire_Load()) {
483
+ s = Status::IOError("Deleting DB during memtable compaction");
484
+ }
445
485
 
446
486
  // Replace immutable memtable with the generated Table
447
487
  if (s.ok()) {
448
488
  edit.SetPrevLogNumber(0);
449
- s = versions_->LogAndApply(&edit);
489
+ edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
490
+ s = versions_->LogAndApply(&edit, &mutex_);
450
491
  }
451
492
 
452
493
  if (s.ok()) {
@@ -457,40 +498,71 @@ Status DBImpl::CompactMemTable() {
457
498
  DeleteObsoleteFiles();
458
499
  }
459
500
 
460
- compacting_cv_.SignalAll(); // Wake up waiter even if there was an error
461
501
  return s;
462
502
  }
463
503
 
464
- void DBImpl::TEST_CompactRange(
465
- int level,
466
- const std::string& begin,
467
- const std::string& end) {
468
- MutexLock l(&mutex_);
469
- while (compacting_) {
470
- compacting_cv_.Wait();
504
+ void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
505
+ int max_level_with_files = 1;
506
+ {
507
+ MutexLock l(&mutex_);
508
+ Version* base = versions_->current();
509
+ for (int level = 1; level < config::kNumLevels; level++) {
510
+ if (base->OverlapInLevel(level, begin, end)) {
511
+ max_level_with_files = level;
512
+ }
513
+ }
514
+ }
515
+ TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
516
+ for (int level = 0; level < max_level_with_files; level++) {
517
+ TEST_CompactRange(level, begin, end);
471
518
  }
472
- Compaction* c = versions_->CompactRange(
473
- level,
474
- InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek),
475
- InternalKey(end, 0, static_cast<ValueType>(0)));
519
+ }
476
520
 
477
- if (c != NULL) {
478
- CompactionState* compact = new CompactionState(c);
479
- DoCompactionWork(compact); // Ignore error in test compaction
480
- CleanupCompaction(compact);
521
+ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
522
+ assert(level >= 0);
523
+ assert(level + 1 < config::kNumLevels);
524
+
525
+ InternalKey begin_storage, end_storage;
526
+
527
+ ManualCompaction manual;
528
+ manual.level = level;
529
+ manual.done = false;
530
+ if (begin == NULL) {
531
+ manual.begin = NULL;
532
+ } else {
533
+ begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
534
+ manual.begin = &begin_storage;
535
+ }
536
+ if (end == NULL) {
537
+ manual.end = NULL;
538
+ } else {
539
+ end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
540
+ manual.end = &end_storage;
481
541
  }
482
542
 
483
- // Start any background compaction that may have been delayed by this thread
484
- MaybeScheduleCompaction();
543
+ MutexLock l(&mutex_);
544
+ while (!manual.done) {
545
+ while (manual_compaction_ != NULL) {
546
+ bg_cv_.Wait();
547
+ }
548
+ manual_compaction_ = &manual;
549
+ MaybeScheduleCompaction();
550
+ while (manual_compaction_ == &manual) {
551
+ bg_cv_.Wait();
552
+ }
553
+ }
485
554
  }
486
555
 
487
556
  Status DBImpl::TEST_CompactMemTable() {
488
557
  MutexLock l(&mutex_);
558
+ LoggerId self;
559
+ AcquireLoggingResponsibility(&self);
489
560
  Status s = MakeRoomForWrite(true /* force compaction */);
561
+ ReleaseLoggingResponsibility(&self);
490
562
  if (s.ok()) {
491
563
  // Wait until the compaction completes
492
564
  while (imm_ != NULL && bg_error_.ok()) {
493
- compacting_cv_.Wait();
565
+ bg_cv_.Wait();
494
566
  }
495
567
  if (imm_ != NULL) {
496
568
  s = bg_error_;
@@ -503,11 +575,11 @@ void DBImpl::MaybeScheduleCompaction() {
503
575
  mutex_.AssertHeld();
504
576
  if (bg_compaction_scheduled_) {
505
577
  // Already scheduled
506
- } else if (compacting_) {
507
- // Some other thread is running a compaction. Do not conflict with it.
508
578
  } else if (shutting_down_.Acquire_Load()) {
509
579
  // DB is being deleted; no more background compactions
510
- } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
580
+ } else if (imm_ == NULL &&
581
+ manual_compaction_ == NULL &&
582
+ !versions_->NeedsCompaction()) {
511
583
  // No work to be done
512
584
  } else {
513
585
  bg_compaction_scheduled_ = true;
@@ -522,50 +594,63 @@ void DBImpl::BGWork(void* db) {
522
594
  void DBImpl::BackgroundCall() {
523
595
  MutexLock l(&mutex_);
524
596
  assert(bg_compaction_scheduled_);
525
- if (!shutting_down_.Acquire_Load() &&
526
- !compacting_) {
597
+ if (!shutting_down_.Acquire_Load()) {
527
598
  BackgroundCompaction();
528
599
  }
529
600
  bg_compaction_scheduled_ = false;
530
- bg_cv_.SignalAll();
531
601
 
532
602
  // Previous compaction may have produced too many files in a level,
533
603
  // so reschedule another compaction if needed.
534
604
  MaybeScheduleCompaction();
605
+ bg_cv_.SignalAll();
535
606
  }
536
607
 
537
608
  void DBImpl::BackgroundCompaction() {
538
609
  mutex_.AssertHeld();
539
- assert(!compacting_);
540
610
 
541
611
  if (imm_ != NULL) {
542
- compacting_ = true;
543
612
  CompactMemTable();
544
- compacting_ = false;
545
- compacting_cv_.SignalAll();
546
613
  return;
547
614
  }
548
615
 
549
- Compaction* c = versions_->PickCompaction();
550
- if (c == NULL) {
551
- // Nothing to do
552
- return;
616
+ Compaction* c;
617
+ bool is_manual = (manual_compaction_ != NULL);
618
+ InternalKey manual_end;
619
+ if (is_manual) {
620
+ ManualCompaction* m = manual_compaction_;
621
+ c = versions_->CompactRange(m->level, m->begin, m->end);
622
+ m->done = (c == NULL);
623
+ if (c != NULL) {
624
+ manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
625
+ }
626
+ Log(options_.info_log,
627
+ "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
628
+ m->level,
629
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
630
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
631
+ (m->done ? "(end)" : manual_end.DebugString().c_str()));
632
+ } else {
633
+ c = versions_->PickCompaction();
553
634
  }
554
635
 
555
636
  Status status;
556
- if (c->IsTrivialMove()) {
637
+ if (c == NULL) {
638
+ // Nothing to do
639
+ } else if (!is_manual && c->IsTrivialMove()) {
557
640
  // Move file to next level
558
641
  assert(c->num_input_files(0) == 1);
559
642
  FileMetaData* f = c->input(0, 0);
560
643
  c->edit()->DeleteFile(c->level(), f->number);
561
644
  c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
562
645
  f->smallest, f->largest);
563
- status = versions_->LogAndApply(c->edit());
564
- Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
646
+ status = versions_->LogAndApply(c->edit(), &mutex_);
647
+ VersionSet::LevelSummaryStorage tmp;
648
+ Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
565
649
  static_cast<unsigned long long>(f->number),
566
650
  c->level() + 1,
567
651
  static_cast<unsigned long long>(f->file_size),
568
- status.ToString().c_str());
652
+ status.ToString().c_str(),
653
+ versions_->LevelSummary(&tmp));
569
654
  } else {
570
655
  CompactionState* compact = new CompactionState(c);
571
656
  status = DoCompactionWork(compact);
@@ -578,12 +663,23 @@ void DBImpl::BackgroundCompaction() {
578
663
  } else if (shutting_down_.Acquire_Load()) {
579
664
  // Ignore compaction errors found during shutting down
580
665
  } else {
581
- Log(env_, options_.info_log,
666
+ Log(options_.info_log,
582
667
  "Compaction error: %s", status.ToString().c_str());
583
668
  if (options_.paranoid_checks && bg_error_.ok()) {
584
669
  bg_error_ = status;
585
670
  }
586
671
  }
672
+
673
+ if (is_manual) {
674
+ ManualCompaction* m = manual_compaction_;
675
+ if (!m->done) {
676
+ // We only compacted part of the requested range. Update *m
677
+ // to the range that is left to be compacted.
678
+ m->tmp_storage = manual_end;
679
+ m->begin = &m->tmp_storage;
680
+ }
681
+ manual_compaction_ = NULL;
682
+ }
587
683
  }
588
684
 
589
685
  void DBImpl::CleanupCompaction(CompactionState* compact) {
@@ -669,7 +765,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
669
765
  s = iter->status();
670
766
  delete iter;
671
767
  if (s.ok()) {
672
- Log(env_, options_.info_log,
768
+ Log(options_.info_log,
673
769
  "Generated table #%llu: %lld keys, %lld bytes",
674
770
  (unsigned long long) output_number,
675
771
  (unsigned long long) current_entries,
@@ -682,7 +778,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
682
778
 
683
779
  Status DBImpl::InstallCompactionResults(CompactionState* compact) {
684
780
  mutex_.AssertHeld();
685
- Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
781
+ Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
686
782
  compact->compaction->num_input_files(0),
687
783
  compact->compaction->level(),
688
784
  compact->compaction->num_input_files(1),
@@ -701,7 +797,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
701
797
  }
702
798
  compact->outputs.clear();
703
799
 
704
- Status s = versions_->LogAndApply(compact->compaction->edit());
800
+ Status s = versions_->LogAndApply(compact->compaction->edit(), &mutex_);
705
801
  if (s.ok()) {
706
802
  compact->compaction->ReleaseInputs();
707
803
  DeleteObsoleteFiles();
@@ -718,7 +814,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
718
814
  const uint64_t start_micros = env_->NowMicros();
719
815
  int64_t imm_micros = 0; // Micros spent doing imm_ compactions
720
816
 
721
- Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files",
817
+ Log(options_.info_log, "Compacting %d@%d + %d@%d files",
722
818
  compact->compaction->num_input_files(0),
723
819
  compact->compaction->level(),
724
820
  compact->compaction->num_input_files(1),
@@ -734,7 +830,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
734
830
  }
735
831
 
736
832
  // Release mutex while we're actually doing the compaction work
737
- compacting_ = true;
738
833
  mutex_.Unlock();
739
834
 
740
835
  Iterator* input = versions_->MakeInputIterator(compact->compaction);
@@ -751,7 +846,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
751
846
  mutex_.Lock();
752
847
  if (imm_ != NULL) {
753
848
  CompactMemTable();
754
- compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
849
+ bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
755
850
  }
756
851
  mutex_.Unlock();
757
852
  imm_micros += (env_->NowMicros() - imm_start);
@@ -802,7 +897,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
802
897
  last_sequence_for_key = ikey.sequence;
803
898
  }
804
899
  #if 0
805
- Log(env_, options_.info_log,
900
+ Log(options_.info_log,
806
901
  " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
807
902
  "%d smallest_snapshot: %d",
808
903
  ikey.user_key.ToString().c_str(),
@@ -867,10 +962,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
867
962
  if (status.ok()) {
868
963
  status = InstallCompactionResults(compact);
869
964
  }
870
- compacting_ = false;
871
- compacting_cv_.SignalAll();
872
965
  VersionSet::LevelSummaryStorage tmp;
873
- Log(env_, options_.info_log,
966
+ Log(options_.info_log,
874
967
  "compacted to: %s", versions_->LevelSummary(&tmp));
875
968
  return status;
876
969
  }
@@ -936,22 +1029,48 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
936
1029
  Status DBImpl::Get(const ReadOptions& options,
937
1030
  const Slice& key,
938
1031
  std::string* value) {
939
- // TODO(opt): faster implementation
940
- Iterator* iter = NewIterator(options);
941
- iter->Seek(key);
942
- bool found = false;
943
- if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) {
944
- Slice v = iter->value();
945
- value->assign(v.data(), v.size());
946
- found = true;
947
- }
948
- // Non-OK iterator status trumps everything else
949
- Status result = iter->status();
950
- if (result.ok() && !found) {
951
- result = Status::NotFound(Slice()); // Use an empty error message for speed
1032
+ Status s;
1033
+ MutexLock l(&mutex_);
1034
+ SequenceNumber snapshot;
1035
+ if (options.snapshot != NULL) {
1036
+ snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
1037
+ } else {
1038
+ snapshot = versions_->LastSequence();
952
1039
  }
953
- delete iter;
954
- return result;
1040
+
1041
+ MemTable* mem = mem_;
1042
+ MemTable* imm = imm_;
1043
+ Version* current = versions_->current();
1044
+ mem->Ref();
1045
+ if (imm != NULL) imm->Ref();
1046
+ current->Ref();
1047
+
1048
+ bool have_stat_update = false;
1049
+ Version::GetStats stats;
1050
+
1051
+ // Unlock while reading from files and memtables
1052
+ {
1053
+ mutex_.Unlock();
1054
+ // First look in the memtable, then in the immutable memtable (if any).
1055
+ LookupKey lkey(key, snapshot);
1056
+ if (mem->Get(lkey, value, &s)) {
1057
+ // Done
1058
+ } else if (imm != NULL && imm->Get(lkey, value, &s)) {
1059
+ // Done
1060
+ } else {
1061
+ s = current->Get(options, lkey, value, &stats);
1062
+ have_stat_update = true;
1063
+ }
1064
+ mutex_.Lock();
1065
+ }
1066
+
1067
+ if (have_stat_update && current->UpdateStats(stats)) {
1068
+ MaybeScheduleCompaction();
1069
+ }
1070
+ mem->Unref();
1071
+ if (imm != NULL) imm->Unref();
1072
+ current->Unref();
1073
+ return s;
955
1074
  }
956
1075
 
957
1076
  Iterator* DBImpl::NewIterator(const ReadOptions& options) {
@@ -983,34 +1102,61 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
983
1102
  return DB::Delete(options, key);
984
1103
  }
985
1104
 
1105
+ // There is at most one thread that is the current logger. This call
1106
+ // waits until preceding logger(s) have finished and becomes the
1107
+ // current logger.
1108
+ void DBImpl::AcquireLoggingResponsibility(LoggerId* self) {
1109
+ while (logger_ != NULL) {
1110
+ logger_cv_.Wait();
1111
+ }
1112
+ logger_ = self;
1113
+ }
1114
+
1115
+ void DBImpl::ReleaseLoggingResponsibility(LoggerId* self) {
1116
+ assert(logger_ == self);
1117
+ logger_ = NULL;
1118
+ logger_cv_.SignalAll();
1119
+ }
1120
+
986
1121
  Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
987
1122
  Status status;
988
1123
  MutexLock l(&mutex_);
1124
+ LoggerId self;
1125
+ AcquireLoggingResponsibility(&self);
989
1126
  status = MakeRoomForWrite(false); // May temporarily release lock and wait
990
1127
  uint64_t last_sequence = versions_->LastSequence();
991
1128
  if (status.ok()) {
992
1129
  WriteBatchInternal::SetSequence(updates, last_sequence + 1);
993
1130
  last_sequence += WriteBatchInternal::Count(updates);
994
- versions_->SetLastSequence(last_sequence);
995
1131
 
996
- // Add to log and apply to memtable
997
- status = log_->AddRecord(WriteBatchInternal::Contents(updates));
998
- if (status.ok() && options.sync) {
999
- status = logfile_->Sync();
1000
- }
1001
- if (status.ok()) {
1002
- status = WriteBatchInternal::InsertInto(updates, mem_);
1132
+ // Add to log and apply to memtable. We can release the lock during
1133
+ // this phase since the "logger_" flag protects against concurrent
1134
+ // loggers and concurrent writes into mem_.
1135
+ {
1136
+ assert(logger_ == &self);
1137
+ mutex_.Unlock();
1138
+ status = log_->AddRecord(WriteBatchInternal::Contents(updates));
1139
+ if (status.ok() && options.sync) {
1140
+ status = logfile_->Sync();
1141
+ }
1142
+ if (status.ok()) {
1143
+ status = WriteBatchInternal::InsertInto(updates, mem_);
1144
+ }
1145
+ mutex_.Lock();
1146
+ assert(logger_ == &self);
1003
1147
  }
1148
+
1149
+ versions_->SetLastSequence(last_sequence);
1004
1150
  }
1005
- if (options.post_write_snapshot != NULL) {
1006
- *options.post_write_snapshot =
1007
- status.ok() ? snapshots_.New(last_sequence) : NULL;
1008
- }
1151
+ ReleaseLoggingResponsibility(&self);
1009
1152
  return status;
1010
1153
  }
1011
1154
 
1155
+ // REQUIRES: mutex_ is held
1156
+ // REQUIRES: this thread is the current logger
1012
1157
  Status DBImpl::MakeRoomForWrite(bool force) {
1013
1158
  mutex_.AssertHeld();
1159
+ assert(logger_ != NULL);
1014
1160
  bool allow_delay = !force;
1015
1161
  Status s;
1016
1162
  while (true) {
@@ -1038,10 +1184,11 @@ Status DBImpl::MakeRoomForWrite(bool force) {
1038
1184
  } else if (imm_ != NULL) {
1039
1185
  // We have filled up the current memtable, but the previous
1040
1186
  // one is still being compacted, so we wait.
1041
- compacting_cv_.Wait();
1187
+ bg_cv_.Wait();
1042
1188
  } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
1043
1189
  // There are too many level-0 files.
1044
- compacting_cv_.Wait();
1190
+ Log(options_.info_log, "waiting...\n");
1191
+ bg_cv_.Wait();
1045
1192
  } else {
1046
1193
  // Attempt to switch to a new memtable and trigger compaction of old
1047
1194
  assert(versions_->PrevLogNumber() == 0);
@@ -1051,18 +1198,10 @@ Status DBImpl::MakeRoomForWrite(bool force) {
1051
1198
  if (!s.ok()) {
1052
1199
  break;
1053
1200
  }
1054
- VersionEdit edit;
1055
- edit.SetPrevLogNumber(versions_->LogNumber());
1056
- edit.SetLogNumber(new_log_number);
1057
- s = versions_->LogAndApply(&edit);
1058
- if (!s.ok()) {
1059
- delete lfile;
1060
- env_->DeleteFile(LogFileName(dbname_, new_log_number));
1061
- break;
1062
- }
1063
1201
  delete log_;
1064
1202
  delete logfile_;
1065
1203
  logfile_ = lfile;
1204
+ logfile_number_ = new_log_number;
1066
1205
  log_ = new log::Writer(lfile);
1067
1206
  imm_ = mem_;
1068
1207
  has_imm_.Release_Store(imm_);
@@ -1088,7 +1227,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
1088
1227
  in.remove_prefix(strlen("num-files-at-level"));
1089
1228
  uint64_t level;
1090
1229
  bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
1091
- if (!ok || level < 0 || level >= config::kNumLevels) {
1230
+ if (!ok || level >= config::kNumLevels) {
1092
1231
  return false;
1093
1232
  } else {
1094
1233
  char buf[100];
@@ -1121,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
1121
1260
  }
1122
1261
  }
1123
1262
  return true;
1263
+ } else if (in == "sstables") {
1264
+ *value = versions_->current()->DebugString();
1265
+ return true;
1124
1266
  }
1125
1267
 
1126
1268
  return false;
@@ -1184,8 +1326,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
1184
1326
  if (s.ok()) {
1185
1327
  edit.SetLogNumber(new_log_number);
1186
1328
  impl->logfile_ = lfile;
1329
+ impl->logfile_number_ = new_log_number;
1187
1330
  impl->log_ = new log::Writer(lfile);
1188
- s = impl->versions_->LogAndApply(&edit);
1331
+ s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
1189
1332
  }
1190
1333
  if (s.ok()) {
1191
1334
  impl->DeleteObsoleteFiles();
@@ -1214,12 +1357,14 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
1214
1357
  }
1215
1358
 
1216
1359
  FileLock* lock;
1217
- Status result = env->LockFile(LockFileName(dbname), &lock);
1360
+ const std::string lockname = LockFileName(dbname);
1361
+ Status result = env->LockFile(lockname, &lock);
1218
1362
  if (result.ok()) {
1219
1363
  uint64_t number;
1220
1364
  FileType type;
1221
1365
  for (size_t i = 0; i < filenames.size(); i++) {
1222
- if (ParseFileName(filenames[i], &number, &type)) {
1366
+ if (ParseFileName(filenames[i], &number, &type) &&
1367
+ filenames[i] != lockname) { // Lock file will be deleted at end
1223
1368
  Status del = env->DeleteFile(dbname + "/" + filenames[i]);
1224
1369
  if (result.ok() && !del.ok()) {
1225
1370
  result = del;
@@ -1227,7 +1372,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
1227
1372
  }
1228
1373
  }
1229
1374
  env->UnlockFile(lock); // Ignore error since state is already gone
1230
- env->DeleteFile(LockFileName(dbname));
1375
+ env->DeleteFile(lockname);
1231
1376
  env->DeleteDir(dbname); // Ignore error in case dir contains other files
1232
1377
  }
1233
1378
  return result;