leveldb-ruby 0.7 → 0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/README +1 -1
  2. data/leveldb/Makefile +70 -29
  3. data/leveldb/build_detect_platform +74 -0
  4. data/leveldb/db/builder.cc +2 -4
  5. data/leveldb/db/builder.h +4 -6
  6. data/leveldb/db/c.cc +471 -0
  7. data/leveldb/db/corruption_test.cc +21 -16
  8. data/leveldb/db/db_bench.cc +400 -200
  9. data/leveldb/db/db_impl.cc +276 -131
  10. data/leveldb/db/db_impl.h +22 -10
  11. data/leveldb/db/db_iter.cc +2 -1
  12. data/leveldb/db/db_test.cc +391 -43
  13. data/leveldb/db/dbformat.cc +31 -0
  14. data/leveldb/db/dbformat.h +51 -1
  15. data/leveldb/db/filename.h +1 -1
  16. data/leveldb/db/log_format.h +1 -1
  17. data/leveldb/db/log_reader.cc +16 -11
  18. data/leveldb/db/memtable.cc +37 -0
  19. data/leveldb/db/memtable.h +6 -0
  20. data/leveldb/db/repair.cc +17 -14
  21. data/leveldb/db/skiplist_test.cc +2 -2
  22. data/leveldb/db/version_edit.cc +7 -9
  23. data/leveldb/db/version_edit.h +2 -1
  24. data/leveldb/db/version_set.cc +416 -104
  25. data/leveldb/db/version_set.h +78 -14
  26. data/leveldb/db/version_set_test.cc +179 -0
  27. data/leveldb/db/write_batch_internal.h +2 -0
  28. data/leveldb/include/leveldb/c.h +246 -0
  29. data/leveldb/include/leveldb/db.h +14 -2
  30. data/leveldb/include/leveldb/env.h +31 -10
  31. data/leveldb/include/leveldb/options.h +7 -18
  32. data/leveldb/include/leveldb/slice.h +2 -2
  33. data/leveldb/include/leveldb/status.h +1 -1
  34. data/leveldb/port/atomic_pointer.h +144 -0
  35. data/leveldb/port/port.h +0 -2
  36. data/leveldb/port/port_android.h +7 -1
  37. data/leveldb/port/port_example.h +11 -1
  38. data/leveldb/port/port_posix.h +56 -38
  39. data/leveldb/table/format.cc +12 -8
  40. data/leveldb/table/table_test.cc +16 -7
  41. data/leveldb/util/cache.cc +173 -100
  42. data/leveldb/util/cache_test.cc +28 -11
  43. data/leveldb/util/coding.h +4 -4
  44. data/leveldb/util/comparator.cc +1 -0
  45. data/leveldb/util/env.cc +10 -5
  46. data/leveldb/util/env_posix.cc +48 -87
  47. data/leveldb/util/histogram.cc +11 -0
  48. data/leveldb/util/histogram.h +1 -0
  49. data/leveldb/util/posix_logger.h +98 -0
  50. data/leveldb/util/testharness.cc +12 -0
  51. data/leveldb/util/testharness.h +10 -1
  52. data/lib/leveldb.rb +11 -3
  53. metadata +41 -22
@@ -41,6 +41,14 @@ static uint64_t MaxFileSizeForLevel(int level) {
41
41
  return kTargetFileSize; // We could vary per level to reduce number of files?
42
42
  }
43
43
 
44
+ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
45
+ int64_t sum = 0;
46
+ for (size_t i = 0; i < files.size(); i++) {
47
+ sum += files[i]->file_size;
48
+ }
49
+ return sum;
50
+ }
51
+
44
52
  namespace {
45
53
  std::string IntSetToString(const std::set<uint64_t>& s) {
46
54
  std::string result = "{";
@@ -75,6 +83,78 @@ Version::~Version() {
75
83
  }
76
84
  }
77
85
 
86
+ int FindFile(const InternalKeyComparator& icmp,
87
+ const std::vector<FileMetaData*>& files,
88
+ const Slice& key) {
89
+ uint32_t left = 0;
90
+ uint32_t right = files.size();
91
+ while (left < right) {
92
+ uint32_t mid = (left + right) / 2;
93
+ const FileMetaData* f = files[mid];
94
+ if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
95
+ // Key at "mid.largest" is < "target". Therefore all
96
+ // files at or before "mid" are uninteresting.
97
+ left = mid + 1;
98
+ } else {
99
+ // Key at "mid.largest" is >= "target". Therefore all files
100
+ // after "mid" are uninteresting.
101
+ right = mid;
102
+ }
103
+ }
104
+ return right;
105
+ }
106
+
107
+ static bool AfterFile(const Comparator* ucmp,
108
+ const Slice* user_key, const FileMetaData* f) {
109
+ // NULL user_key occurs before all keys and is therefore never after *f
110
+ return (user_key != NULL &&
111
+ ucmp->Compare(*user_key, f->largest.user_key()) > 0);
112
+ }
113
+
114
+ static bool BeforeFile(const Comparator* ucmp,
115
+ const Slice* user_key, const FileMetaData* f) {
116
+ // NULL user_key occurs after all keys and is therefore never before *f
117
+ return (user_key != NULL &&
118
+ ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
119
+ }
120
+
121
+ bool SomeFileOverlapsRange(
122
+ const InternalKeyComparator& icmp,
123
+ bool disjoint_sorted_files,
124
+ const std::vector<FileMetaData*>& files,
125
+ const Slice* smallest_user_key,
126
+ const Slice* largest_user_key) {
127
+ const Comparator* ucmp = icmp.user_comparator();
128
+ if (!disjoint_sorted_files) {
129
+ // Need to check against all files
130
+ for (int i = 0; i < files.size(); i++) {
131
+ const FileMetaData* f = files[i];
132
+ if (AfterFile(ucmp, smallest_user_key, f) ||
133
+ BeforeFile(ucmp, largest_user_key, f)) {
134
+ // No overlap
135
+ } else {
136
+ return true; // Overlap
137
+ }
138
+ }
139
+ return false;
140
+ }
141
+
142
+ // Binary search over file list
143
+ uint32_t index = 0;
144
+ if (smallest_user_key != NULL) {
145
+ // Find the earliest possible internal key for smallest_user_key
146
+ InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
147
+ index = FindFile(icmp, files, small.Encode());
148
+ }
149
+
150
+ if (index >= files.size()) {
151
+ // beginning of range is after all files, so no overlap.
152
+ return false;
153
+ }
154
+
155
+ return !BeforeFile(ucmp, largest_user_key, files[index]);
156
+ }
157
+
78
158
  // An internal iterator. For a given version/level pair, yields
79
159
  // information about the files in the level. For a given entry, key()
80
160
  // is the largest key that occurs in the file, and value() is an
@@ -92,22 +172,7 @@ class Version::LevelFileNumIterator : public Iterator {
92
172
  return index_ < flist_->size();
93
173
  }
94
174
  virtual void Seek(const Slice& target) {
95
- uint32_t left = 0;
96
- uint32_t right = flist_->size() - 1;
97
- while (left < right) {
98
- uint32_t mid = (left + right) / 2;
99
- int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target);
100
- if (cmp < 0) {
101
- // Key at "mid.largest" is < than "target". Therefore all
102
- // files at or before "mid" are uninteresting.
103
- left = mid + 1;
104
- } else {
105
- // Key at "mid.largest" is >= "target". Therefore all files
106
- // after "mid" are uninteresting.
107
- right = mid;
108
- }
109
- }
110
- index_ = left;
175
+ index_ = FindFile(icmp_, *flist_, target);
111
176
  }
112
177
  virtual void SeekToFirst() { index_ = 0; }
113
178
  virtual void SeekToLast() {
@@ -185,6 +250,146 @@ void Version::AddIterators(const ReadOptions& options,
185
250
  }
186
251
  }
187
252
 
253
+ // If "*iter" points at a value or deletion for user_key, store
254
+ // either the value, or a NotFound error and return true.
255
+ // Else return false.
256
+ static bool GetValue(Iterator* iter, const Slice& user_key,
257
+ std::string* value,
258
+ Status* s) {
259
+ if (!iter->Valid()) {
260
+ return false;
261
+ }
262
+ ParsedInternalKey parsed_key;
263
+ if (!ParseInternalKey(iter->key(), &parsed_key)) {
264
+ *s = Status::Corruption("corrupted key for ", user_key);
265
+ return true;
266
+ }
267
+ if (parsed_key.user_key != user_key) {
268
+ return false;
269
+ }
270
+ switch (parsed_key.type) {
271
+ case kTypeDeletion:
272
+ *s = Status::NotFound(Slice()); // Use an empty error message for speed
273
+ break;
274
+ case kTypeValue: {
275
+ Slice v = iter->value();
276
+ value->assign(v.data(), v.size());
277
+ break;
278
+ }
279
+ }
280
+ return true;
281
+ }
282
+
283
+ static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
284
+ return a->number > b->number;
285
+ }
286
+
287
+ Status Version::Get(const ReadOptions& options,
288
+ const LookupKey& k,
289
+ std::string* value,
290
+ GetStats* stats) {
291
+ Slice ikey = k.internal_key();
292
+ Slice user_key = k.user_key();
293
+ const Comparator* ucmp = vset_->icmp_.user_comparator();
294
+ Status s;
295
+
296
+ stats->seek_file = NULL;
297
+ stats->seek_file_level = -1;
298
+ FileMetaData* last_file_read = NULL;
299
+ int last_file_read_level = -1;
300
+
301
+ // We can search level-by-level since entries never hop across
302
+ // levels. Therefore we are guaranteed that if we find data
303
+ // in a smaller level, later levels are irrelevant.
304
+ std::vector<FileMetaData*> tmp;
305
+ FileMetaData* tmp2;
306
+ for (int level = 0; level < config::kNumLevels; level++) {
307
+ size_t num_files = files_[level].size();
308
+ if (num_files == 0) continue;
309
+
310
+ // Get the list of files to search in this level
311
+ FileMetaData* const* files = &files_[level][0];
312
+ if (level == 0) {
313
+ // Level-0 files may overlap each other. Find all files that
314
+ // overlap user_key and process them in order from newest to oldest.
315
+ tmp.reserve(num_files);
316
+ for (uint32_t i = 0; i < num_files; i++) {
317
+ FileMetaData* f = files[i];
318
+ if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
319
+ ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
320
+ tmp.push_back(f);
321
+ }
322
+ }
323
+ if (tmp.empty()) continue;
324
+
325
+ std::sort(tmp.begin(), tmp.end(), NewestFirst);
326
+ files = &tmp[0];
327
+ num_files = tmp.size();
328
+ } else {
329
+ // Binary search to find earliest index whose largest key >= ikey.
330
+ uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
331
+ if (index >= num_files) {
332
+ files = NULL;
333
+ num_files = 0;
334
+ } else {
335
+ tmp2 = files[index];
336
+ if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
337
+ // All of "tmp2" is past any data for user_key
338
+ files = NULL;
339
+ num_files = 0;
340
+ } else {
341
+ files = &tmp2;
342
+ num_files = 1;
343
+ }
344
+ }
345
+ }
346
+
347
+ for (uint32_t i = 0; i < num_files; ++i) {
348
+ if (last_file_read != NULL && stats->seek_file == NULL) {
349
+ // We have had more than one seek for this read. Charge the 1st file.
350
+ stats->seek_file = last_file_read;
351
+ stats->seek_file_level = last_file_read_level;
352
+ }
353
+
354
+ FileMetaData* f = files[i];
355
+ last_file_read = f;
356
+ last_file_read_level = level;
357
+
358
+ Iterator* iter = vset_->table_cache_->NewIterator(
359
+ options,
360
+ f->number,
361
+ f->file_size);
362
+ iter->Seek(ikey);
363
+ const bool done = GetValue(iter, user_key, value, &s);
364
+ if (!iter->status().ok()) {
365
+ s = iter->status();
366
+ delete iter;
367
+ return s;
368
+ } else {
369
+ delete iter;
370
+ if (done) {
371
+ return s;
372
+ }
373
+ }
374
+ }
375
+ }
376
+
377
+ return Status::NotFound(Slice()); // Use an empty error message for speed
378
+ }
379
+
380
+ bool Version::UpdateStats(const GetStats& stats) {
381
+ FileMetaData* f = stats.seek_file;
382
+ if (f != NULL) {
383
+ f->allowed_seeks--;
384
+ if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) {
385
+ file_to_compact_ = f;
386
+ file_to_compact_level_ = stats.seek_file_level;
387
+ return true;
388
+ }
389
+ }
390
+ return false;
391
+ }
392
+
188
393
  void Version::Ref() {
189
394
  ++refs_;
190
395
  }
@@ -198,26 +403,89 @@ void Version::Unref() {
198
403
  }
199
404
  }
200
405
 
406
+ bool Version::OverlapInLevel(int level,
407
+ const Slice* smallest_user_key,
408
+ const Slice* largest_user_key) {
409
+ return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
410
+ smallest_user_key, largest_user_key);
411
+ }
412
+
413
+ int Version::PickLevelForMemTableOutput(
414
+ const Slice& smallest_user_key,
415
+ const Slice& largest_user_key) {
416
+ int level = 0;
417
+ if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
418
+ // Push to next level if there is no overlap in next level,
419
+ // and the #bytes overlapping in the level after that are limited.
420
+ InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
421
+ InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
422
+ std::vector<FileMetaData*> overlaps;
423
+ while (level < config::kMaxMemCompactLevel) {
424
+ if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
425
+ break;
426
+ }
427
+ GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
428
+ const int64_t sum = TotalFileSize(overlaps);
429
+ if (sum > kMaxGrandParentOverlapBytes) {
430
+ break;
431
+ }
432
+ level++;
433
+ }
434
+ }
435
+ return level;
436
+ }
437
+
438
+ // Store in "*inputs" all files in "level" that overlap [begin,end]
439
+ void Version::GetOverlappingInputs(
440
+ int level,
441
+ const InternalKey* begin,
442
+ const InternalKey* end,
443
+ std::vector<FileMetaData*>* inputs) {
444
+ inputs->clear();
445
+ Slice user_begin, user_end;
446
+ if (begin != NULL) {
447
+ user_begin = begin->user_key();
448
+ }
449
+ if (end != NULL) {
450
+ user_end = end->user_key();
451
+ }
452
+ const Comparator* user_cmp = vset_->icmp_.user_comparator();
453
+ for (size_t i = 0; i < files_[level].size(); i++) {
454
+ FileMetaData* f = files_[level][i];
455
+ if (begin != NULL &&
456
+ user_cmp->Compare(f->largest.user_key(), user_begin) < 0) {
457
+ // "f" is completely before specified range; skip it
458
+ } else if (end != NULL &&
459
+ user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
460
+ // "f" is completely after specified range; skip it
461
+ } else {
462
+ inputs->push_back(f);
463
+ }
464
+ }
465
+ }
466
+
201
467
  std::string Version::DebugString() const {
202
468
  std::string r;
203
469
  for (int level = 0; level < config::kNumLevels; level++) {
204
- // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g']
205
- r.append("level ");
470
+ // E.g.,
471
+ // --- level 1 ---
472
+ // 17:123['a' .. 'd']
473
+ // 20:43['e' .. 'g']
474
+ r.append("--- level ");
206
475
  AppendNumberTo(&r, level);
207
- r.push_back(':');
476
+ r.append(" ---\n");
208
477
  const std::vector<FileMetaData*>& files = files_[level];
209
478
  for (size_t i = 0; i < files.size(); i++) {
210
479
  r.push_back(' ');
211
480
  AppendNumberTo(&r, files[i]->number);
212
481
  r.push_back(':');
213
482
  AppendNumberTo(&r, files[i]->file_size);
214
- r.append("['");
215
- AppendEscapedStringTo(&r, files[i]->smallest.Encode());
216
- r.append("' .. '");
217
- AppendEscapedStringTo(&r, files[i]->largest.Encode());
218
- r.append("']");
483
+ r.append("[");
484
+ r.append(files[i]->smallest.DebugString());
485
+ r.append(" .. ");
486
+ r.append(files[i]->largest.DebugString());
487
+ r.append("]\n");
219
488
  }
220
- r.push_back('\n');
221
489
  }
222
490
  return r;
223
491
  }
@@ -267,10 +535,15 @@ class VersionSet::Builder {
267
535
 
268
536
  ~Builder() {
269
537
  for (int level = 0; level < config::kNumLevels; level++) {
270
- std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(),
271
- levels_[level].added_files->end());
272
- delete levels_[level].added_files;
273
- for (int i = 0; i < to_unref.size(); i++) {
538
+ const FileSet* added = levels_[level].added_files;
539
+ std::vector<FileMetaData*> to_unref;
540
+ to_unref.reserve(added->size());
541
+ for (FileSet::const_iterator it = added->begin();
542
+ it != added->end(); ++it) {
543
+ to_unref.push_back(*it);
544
+ }
545
+ delete added;
546
+ for (uint32_t i = 0; i < to_unref.size(); i++) {
274
547
  FileMetaData* f = to_unref[i];
275
548
  f->refs--;
276
549
  if (f->refs <= 0) {
@@ -305,6 +578,23 @@ class VersionSet::Builder {
305
578
  const int level = edit->new_files_[i].first;
306
579
  FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
307
580
  f->refs = 1;
581
+
582
+ // We arrange to automatically compact this file after
583
+ // a certain number of seeks. Let's assume:
584
+ // (1) One seek costs 10ms
585
+ // (2) Writing or reading 1MB costs 10ms (100MB/s)
586
+ // (3) A compaction of 1MB does 25MB of IO:
587
+ // 1MB read from this level
588
+ // 10-12MB read from next level (boundaries may be misaligned)
589
+ // 10-12MB written to next level
590
+ // This implies that 25 seeks cost the same as the compaction
591
+ // of 1MB of data. I.e., one seek costs approximately the
592
+ // same as the compaction of 40KB of data. We are a little
593
+ // conservative and allow approximately one seek for every 16KB
594
+ // of data before triggering a compaction.
595
+ f->allowed_seeks = (f->file_size / 16384);
596
+ if (f->allowed_seeks < 100) f->allowed_seeks = 100;
597
+
308
598
  levels_[level].deleted_files.erase(f->number);
309
599
  levels_[level].added_files->insert(f);
310
600
  }
@@ -344,13 +634,13 @@ class VersionSet::Builder {
344
634
  #ifndef NDEBUG
345
635
  // Make sure there is no overlap in levels > 0
346
636
  if (level > 0) {
347
- for (int i = 1; i < v->files_[level].size(); i++) {
637
+ for (uint32_t i = 1; i < v->files_[level].size(); i++) {
348
638
  const InternalKey& prev_end = v->files_[level][i-1]->largest;
349
639
  const InternalKey& this_begin = v->files_[level][i]->smallest;
350
640
  if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
351
641
  fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
352
- EscapeString(prev_end.Encode()).c_str(),
353
- EscapeString(this_begin.Encode()).c_str());
642
+ prev_end.DebugString().c_str(),
643
+ this_begin.DebugString().c_str());
354
644
  abort();
355
645
  }
356
646
  }
@@ -363,8 +653,14 @@ class VersionSet::Builder {
363
653
  if (levels_[level].deleted_files.count(f->number) > 0) {
364
654
  // File is deleted: do nothing
365
655
  } else {
656
+ std::vector<FileMetaData*>* files = &v->files_[level];
657
+ if (level > 0 && !files->empty()) {
658
+ // Must not overlap
659
+ assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
660
+ f->smallest) < 0);
661
+ }
366
662
  f->refs++;
367
- v->files_[level].push_back(f);
663
+ files->push_back(f);
368
664
  }
369
665
  }
370
666
  };
@@ -414,7 +710,7 @@ void VersionSet::AppendVersion(Version* v) {
414
710
  v->next_->prev_ = v;
415
711
  }
416
712
 
417
- Status VersionSet::LogAndApply(VersionEdit* edit) {
713
+ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
418
714
  if (edit->has_log_number_) {
419
715
  assert(edit->log_number_ >= log_number_);
420
716
  assert(edit->log_number_ < next_file_number_);
@@ -442,6 +738,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
442
738
  std::string new_manifest_file;
443
739
  Status s;
444
740
  if (descriptor_log_ == NULL) {
741
+ // No reason to unlock *mu here since we only hit this path in the
742
+ // first call to LogAndApply (when opening the database).
445
743
  assert(descriptor_file_ == NULL);
446
744
  new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
447
745
  edit->SetNextFile(next_file_number_);
@@ -452,20 +750,27 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
452
750
  }
453
751
  }
454
752
 
455
- // Write new record to MANIFEST log
456
- if (s.ok()) {
457
- std::string record;
458
- edit->EncodeTo(&record);
459
- s = descriptor_log_->AddRecord(record);
753
+ // Unlock during expensive MANIFEST log write
754
+ {
755
+ mu->Unlock();
756
+
757
+ // Write new record to MANIFEST log
460
758
  if (s.ok()) {
461
- s = descriptor_file_->Sync();
759
+ std::string record;
760
+ edit->EncodeTo(&record);
761
+ s = descriptor_log_->AddRecord(record);
762
+ if (s.ok()) {
763
+ s = descriptor_file_->Sync();
764
+ }
462
765
  }
463
- }
464
766
 
465
- // If we just created a new descriptor file, install it by writing a
466
- // new CURRENT file that points to it.
467
- if (s.ok() && !new_manifest_file.empty()) {
468
- s = SetCurrentFile(env_, dbname_, manifest_file_number_);
767
+ // If we just created a new descriptor file, install it by writing a
768
+ // new CURRENT file that points to it.
769
+ if (s.ok() && !new_manifest_file.empty()) {
770
+ s = SetCurrentFile(env_, dbname_, manifest_file_number_);
771
+ }
772
+
773
+ mu->Lock();
469
774
  }
470
775
 
471
776
  // Install the new version
@@ -581,6 +886,9 @@ Status VersionSet::Recover() {
581
886
  if (!have_prev_log_number) {
582
887
  prev_log_number = 0;
583
888
  }
889
+
890
+ MarkFileNumberUsed(prev_log_number);
891
+ MarkFileNumberUsed(log_number);
584
892
  }
585
893
 
586
894
  if (s.ok()) {
@@ -599,12 +907,10 @@ Status VersionSet::Recover() {
599
907
  return s;
600
908
  }
601
909
 
602
- static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
603
- int64_t sum = 0;
604
- for (size_t i = 0; i < files.size(); i++) {
605
- sum += files[i]->file_size;
910
+ void VersionSet::MarkFileNumberUsed(uint64_t number) {
911
+ if (next_file_number_ <= number) {
912
+ next_file_number_ = number + 1;
606
913
  }
607
- return sum;
608
914
  }
609
915
 
610
916
  void VersionSet::Finalize(Version* v) {
@@ -749,10 +1055,11 @@ int64_t VersionSet::NumLevelBytes(int level) const {
749
1055
  int64_t VersionSet::MaxNextLevelOverlappingBytes() {
750
1056
  int64_t result = 0;
751
1057
  std::vector<FileMetaData*> overlaps;
752
- for (int level = 0; level < config::kNumLevels - 1; level++) {
1058
+ for (int level = 1; level < config::kNumLevels - 1; level++) {
753
1059
  for (size_t i = 0; i < current_->files_[level].size(); i++) {
754
1060
  const FileMetaData* f = current_->files_[level][i];
755
- GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
1061
+ current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
1062
+ &overlaps);
756
1063
  const int64_t sum = TotalFileSize(overlaps);
757
1064
  if (sum > result) {
758
1065
  result = sum;
@@ -762,27 +1069,6 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
762
1069
  return result;
763
1070
  }
764
1071
 
765
- // Store in "*inputs" all files in "level" that overlap [begin,end]
766
- void VersionSet::GetOverlappingInputs(
767
- int level,
768
- const InternalKey& begin,
769
- const InternalKey& end,
770
- std::vector<FileMetaData*>* inputs) {
771
- inputs->clear();
772
- Slice user_begin = begin.user_key();
773
- Slice user_end = end.user_key();
774
- const Comparator* user_cmp = icmp_.user_comparator();
775
- for (size_t i = 0; i < current_->files_[level].size(); i++) {
776
- FileMetaData* f = current_->files_[level][i];
777
- if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 ||
778
- user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
779
- // Either completely before or after range; skip it
780
- } else {
781
- inputs->push_back(f);
782
- }
783
- }
784
- }
785
-
786
1072
  // Stores the minimal range that covers all entries in inputs in
787
1073
  // *smallest, *largest.
788
1074
  // REQUIRES: inputs is not empty
@@ -854,31 +1140,43 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
854
1140
  }
855
1141
 
856
1142
  Compaction* VersionSet::PickCompaction() {
857
- if (!NeedsCompaction()) {
1143
+ Compaction* c;
1144
+ int level;
1145
+
1146
+ // We prefer compactions triggered by too much data in a level over
1147
+ // the compactions triggered by seeks.
1148
+ const bool size_compaction = (current_->compaction_score_ >= 1);
1149
+ const bool seek_compaction = (current_->file_to_compact_ != NULL);
1150
+ if (size_compaction) {
1151
+ level = current_->compaction_level_;
1152
+ assert(level >= 0);
1153
+ assert(level+1 < config::kNumLevels);
1154
+ c = new Compaction(level);
1155
+
1156
+ // Pick the first file that comes after compact_pointer_[level]
1157
+ for (size_t i = 0; i < current_->files_[level].size(); i++) {
1158
+ FileMetaData* f = current_->files_[level][i];
1159
+ if (compact_pointer_[level].empty() ||
1160
+ icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
1161
+ c->inputs_[0].push_back(f);
1162
+ break;
1163
+ }
1164
+ }
1165
+ if (c->inputs_[0].empty()) {
1166
+ // Wrap-around to the beginning of the key space
1167
+ c->inputs_[0].push_back(current_->files_[level][0]);
1168
+ }
1169
+ } else if (seek_compaction) {
1170
+ level = current_->file_to_compact_level_;
1171
+ c = new Compaction(level);
1172
+ c->inputs_[0].push_back(current_->file_to_compact_);
1173
+ } else {
858
1174
  return NULL;
859
1175
  }
860
- const int level = current_->compaction_level_;
861
- assert(level >= 0);
862
- assert(level+1 < config::kNumLevels);
863
1176
 
864
- Compaction* c = new Compaction(level);
865
1177
  c->input_version_ = current_;
866
1178
  c->input_version_->Ref();
867
1179
 
868
- // Pick the first file that comes after compact_pointer_[level]
869
- for (size_t i = 0; i < current_->files_[level].size(); i++) {
870
- FileMetaData* f = current_->files_[level][i];
871
- if (compact_pointer_[level].empty() ||
872
- icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
873
- c->inputs_[0].push_back(f);
874
- break;
875
- }
876
- }
877
- if (c->inputs_[0].empty()) {
878
- // Wrap-around to the beginning of the key space
879
- c->inputs_[0].push_back(current_->files_[level][0]);
880
- }
881
-
882
1180
  // Files in level 0 may overlap each other, so pick up all overlapping ones
883
1181
  if (level == 0) {
884
1182
  InternalKey smallest, largest;
@@ -886,7 +1184,7 @@ Compaction* VersionSet::PickCompaction() {
886
1184
  // Note that the next call will discard the file we placed in
887
1185
  // c->inputs_[0] earlier and replace it with an overlapping set
888
1186
  // which will include the picked file.
889
- GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
1187
+ current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
890
1188
  assert(!c->inputs_[0].empty());
891
1189
  }
892
1190
 
@@ -900,7 +1198,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
900
1198
  InternalKey smallest, largest;
901
1199
  GetRange(c->inputs_[0], &smallest, &largest);
902
1200
 
903
- GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
1201
+ current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
904
1202
 
905
1203
  // Get entire range covered by compaction
906
1204
  InternalKey all_start, all_limit;
@@ -910,14 +1208,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
910
1208
  // changing the number of "level+1" files we pick up.
911
1209
  if (!c->inputs_[1].empty()) {
912
1210
  std::vector<FileMetaData*> expanded0;
913
- GetOverlappingInputs(level, all_start, all_limit, &expanded0);
1211
+ current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
914
1212
  if (expanded0.size() > c->inputs_[0].size()) {
915
1213
  InternalKey new_start, new_limit;
916
1214
  GetRange(expanded0, &new_start, &new_limit);
917
1215
  std::vector<FileMetaData*> expanded1;
918
- GetOverlappingInputs(level+1, new_start, new_limit, &expanded1);
1216
+ current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
1217
+ &expanded1);
919
1218
  if (expanded1.size() == c->inputs_[1].size()) {
920
- Log(env_, options_->info_log,
1219
+ Log(options_->info_log,
921
1220
  "Expanding@%d %d+%d to %d+%d\n",
922
1221
  level,
923
1222
  int(c->inputs_[0].size()),
@@ -936,14 +1235,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
936
1235
  // Compute the set of grandparent files that overlap this compaction
937
1236
  // (parent == level+1; grandparent == level+2)
938
1237
  if (level + 2 < config::kNumLevels) {
939
- GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
1238
+ current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
1239
+ &c->grandparents_);
940
1240
  }
941
1241
 
942
1242
  if (false) {
943
- Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
1243
+ Log(options_->info_log, "Compacting %d '%s' .. '%s'",
944
1244
  level,
945
- EscapeString(smallest.Encode()).c_str(),
946
- EscapeString(largest.Encode()).c_str());
1245
+ smallest.DebugString().c_str(),
1246
+ largest.DebugString().c_str());
947
1247
  }
948
1248
 
949
1249
  // Update the place where we will do the next compaction for this level.
@@ -956,14 +1256,26 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
956
1256
 
957
1257
  Compaction* VersionSet::CompactRange(
958
1258
  int level,
959
- const InternalKey& begin,
960
- const InternalKey& end) {
1259
+ const InternalKey* begin,
1260
+ const InternalKey* end) {
961
1261
  std::vector<FileMetaData*> inputs;
962
- GetOverlappingInputs(level, begin, end, &inputs);
1262
+ current_->GetOverlappingInputs(level, begin, end, &inputs);
963
1263
  if (inputs.empty()) {
964
1264
  return NULL;
965
1265
  }
966
1266
 
1267
+ // Avoid compacting too much in one shot in case the range is large.
1268
+ const uint64_t limit = MaxFileSizeForLevel(level);
1269
+ uint64_t total = 0;
1270
+ for (int i = 0; i < inputs.size(); i++) {
1271
+ uint64_t s = inputs[i]->file_size;
1272
+ total += s;
1273
+ if (total >= limit) {
1274
+ inputs.resize(i + 1);
1275
+ break;
1276
+ }
1277
+ }
1278
+
967
1279
  Compaction* c = new Compaction(level);
968
1280
  c->input_version_ = current_;
969
1281
  c->input_version_->Ref();