leveldb 0.0.1

Files changed (128)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +22 -0
  3. data/README.md +95 -0
  4. data/ext/Rakefile +11 -0
  5. data/ext/leveldb/LICENSE +27 -0
  6. data/ext/leveldb/Makefile +206 -0
  7. data/ext/leveldb/build_config.mk +13 -0
  8. data/ext/leveldb/db/builder.cc +88 -0
  9. data/ext/leveldb/db/builder.h +34 -0
  10. data/ext/leveldb/db/c.cc +595 -0
  11. data/ext/leveldb/db/c_test.c +390 -0
  12. data/ext/leveldb/db/corruption_test.cc +359 -0
  13. data/ext/leveldb/db/db_bench.cc +979 -0
  14. data/ext/leveldb/db/db_impl.cc +1485 -0
  15. data/ext/leveldb/db/db_impl.h +203 -0
  16. data/ext/leveldb/db/db_iter.cc +299 -0
  17. data/ext/leveldb/db/db_iter.h +26 -0
  18. data/ext/leveldb/db/db_test.cc +2092 -0
  19. data/ext/leveldb/db/dbformat.cc +140 -0
  20. data/ext/leveldb/db/dbformat.h +227 -0
  21. data/ext/leveldb/db/dbformat_test.cc +112 -0
  22. data/ext/leveldb/db/filename.cc +139 -0
  23. data/ext/leveldb/db/filename.h +80 -0
  24. data/ext/leveldb/db/filename_test.cc +122 -0
  25. data/ext/leveldb/db/leveldb_main.cc +238 -0
  26. data/ext/leveldb/db/log_format.h +35 -0
  27. data/ext/leveldb/db/log_reader.cc +259 -0
  28. data/ext/leveldb/db/log_reader.h +108 -0
  29. data/ext/leveldb/db/log_test.cc +500 -0
  30. data/ext/leveldb/db/log_writer.cc +103 -0
  31. data/ext/leveldb/db/log_writer.h +48 -0
  32. data/ext/leveldb/db/memtable.cc +145 -0
  33. data/ext/leveldb/db/memtable.h +91 -0
  34. data/ext/leveldb/db/repair.cc +389 -0
  35. data/ext/leveldb/db/skiplist.h +379 -0
  36. data/ext/leveldb/db/skiplist_test.cc +378 -0
  37. data/ext/leveldb/db/snapshot.h +66 -0
  38. data/ext/leveldb/db/table_cache.cc +121 -0
  39. data/ext/leveldb/db/table_cache.h +61 -0
  40. data/ext/leveldb/db/version_edit.cc +266 -0
  41. data/ext/leveldb/db/version_edit.h +107 -0
  42. data/ext/leveldb/db/version_edit_test.cc +46 -0
  43. data/ext/leveldb/db/version_set.cc +1443 -0
  44. data/ext/leveldb/db/version_set.h +383 -0
  45. data/ext/leveldb/db/version_set_test.cc +179 -0
  46. data/ext/leveldb/db/write_batch.cc +147 -0
  47. data/ext/leveldb/db/write_batch_internal.h +49 -0
  48. data/ext/leveldb/db/write_batch_test.cc +120 -0
  49. data/ext/leveldb/doc/bench/db_bench_sqlite3.cc +718 -0
  50. data/ext/leveldb/doc/bench/db_bench_tree_db.cc +528 -0
  51. data/ext/leveldb/helpers/memenv/memenv.cc +384 -0
  52. data/ext/leveldb/helpers/memenv/memenv.h +20 -0
  53. data/ext/leveldb/helpers/memenv/memenv_test.cc +232 -0
  54. data/ext/leveldb/include/leveldb/c.h +291 -0
  55. data/ext/leveldb/include/leveldb/cache.h +99 -0
  56. data/ext/leveldb/include/leveldb/comparator.h +63 -0
  57. data/ext/leveldb/include/leveldb/db.h +161 -0
  58. data/ext/leveldb/include/leveldb/env.h +333 -0
  59. data/ext/leveldb/include/leveldb/filter_policy.h +70 -0
  60. data/ext/leveldb/include/leveldb/iterator.h +100 -0
  61. data/ext/leveldb/include/leveldb/options.h +195 -0
  62. data/ext/leveldb/include/leveldb/slice.h +109 -0
  63. data/ext/leveldb/include/leveldb/status.h +106 -0
  64. data/ext/leveldb/include/leveldb/table.h +85 -0
  65. data/ext/leveldb/include/leveldb/table_builder.h +92 -0
  66. data/ext/leveldb/include/leveldb/write_batch.h +64 -0
  67. data/ext/leveldb/issues/issue178_test.cc +92 -0
  68. data/ext/leveldb/port/atomic_pointer.h +224 -0
  69. data/ext/leveldb/port/port.h +19 -0
  70. data/ext/leveldb/port/port_example.h +135 -0
  71. data/ext/leveldb/port/port_posix.cc +54 -0
  72. data/ext/leveldb/port/port_posix.h +157 -0
  73. data/ext/leveldb/port/thread_annotations.h +59 -0
  74. data/ext/leveldb/port/win/stdint.h +24 -0
  75. data/ext/leveldb/table/block.cc +268 -0
  76. data/ext/leveldb/table/block.h +44 -0
  77. data/ext/leveldb/table/block_builder.cc +109 -0
  78. data/ext/leveldb/table/block_builder.h +57 -0
  79. data/ext/leveldb/table/filter_block.cc +111 -0
  80. data/ext/leveldb/table/filter_block.h +68 -0
  81. data/ext/leveldb/table/filter_block_test.cc +128 -0
  82. data/ext/leveldb/table/format.cc +145 -0
  83. data/ext/leveldb/table/format.h +108 -0
  84. data/ext/leveldb/table/iterator.cc +67 -0
  85. data/ext/leveldb/table/iterator_wrapper.h +63 -0
  86. data/ext/leveldb/table/merger.cc +197 -0
  87. data/ext/leveldb/table/merger.h +26 -0
  88. data/ext/leveldb/table/table.cc +275 -0
  89. data/ext/leveldb/table/table_builder.cc +270 -0
  90. data/ext/leveldb/table/table_test.cc +868 -0
  91. data/ext/leveldb/table/two_level_iterator.cc +182 -0
  92. data/ext/leveldb/table/two_level_iterator.h +34 -0
  93. data/ext/leveldb/util/arena.cc +68 -0
  94. data/ext/leveldb/util/arena.h +68 -0
  95. data/ext/leveldb/util/arena_test.cc +68 -0
  96. data/ext/leveldb/util/bloom.cc +95 -0
  97. data/ext/leveldb/util/bloom_test.cc +160 -0
  98. data/ext/leveldb/util/cache.cc +325 -0
  99. data/ext/leveldb/util/cache_test.cc +186 -0
  100. data/ext/leveldb/util/coding.cc +194 -0
  101. data/ext/leveldb/util/coding.h +104 -0
  102. data/ext/leveldb/util/coding_test.cc +196 -0
  103. data/ext/leveldb/util/comparator.cc +81 -0
  104. data/ext/leveldb/util/crc32c.cc +332 -0
  105. data/ext/leveldb/util/crc32c.h +45 -0
  106. data/ext/leveldb/util/crc32c_test.cc +72 -0
  107. data/ext/leveldb/util/env.cc +96 -0
  108. data/ext/leveldb/util/env_posix.cc +698 -0
  109. data/ext/leveldb/util/env_test.cc +104 -0
  110. data/ext/leveldb/util/filter_policy.cc +11 -0
  111. data/ext/leveldb/util/hash.cc +52 -0
  112. data/ext/leveldb/util/hash.h +19 -0
  113. data/ext/leveldb/util/histogram.cc +139 -0
  114. data/ext/leveldb/util/histogram.h +42 -0
  115. data/ext/leveldb/util/logging.cc +81 -0
  116. data/ext/leveldb/util/logging.h +47 -0
  117. data/ext/leveldb/util/mutexlock.h +41 -0
  118. data/ext/leveldb/util/options.cc +29 -0
  119. data/ext/leveldb/util/posix_logger.h +98 -0
  120. data/ext/leveldb/util/random.h +59 -0
  121. data/ext/leveldb/util/status.cc +75 -0
  122. data/ext/leveldb/util/testharness.cc +77 -0
  123. data/ext/leveldb/util/testharness.h +138 -0
  124. data/ext/leveldb/util/testutil.cc +51 -0
  125. data/ext/leveldb/util/testutil.h +53 -0
  126. data/lib/leveldb/version.rb +3 -0
  127. data/lib/leveldb.rb +1006 -0
  128. metadata +228 -0
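
Among the files above is the vendored upstream C++ implementation file data/ext/leveldb/db/db_impl.cc (+1485), shown in full below. For orientation, here is a minimal sketch of the public API that db_impl.cc implements; it uses only calls declared in the bundled data/ext/leveldb/include/leveldb headers, and the database path is an illustrative placeholder:

  #include <cassert>
  #include <string>
  #include "leveldb/db.h"

  int main() {
    leveldb::DB* db;
    leveldb::Options options;
    options.create_if_missing = true;  // handled by DBImpl::Recover() below
    leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
    assert(s.ok());

    // Put/Delete are the convenience wrappers defined near the end of
    // db_impl.cc; each routes through an atomically applied WriteBatch.
    s = db->Put(leveldb::WriteOptions(), "key", "value");
    std::string value;
    if (s.ok()) s = db->Get(leveldb::ReadOptions(), "key", &value);
    if (s.ok()) s = db->Delete(leveldb::WriteOptions(), "key");

    delete db;  // ~DBImpl waits for background compactions to finish
    return 0;
  }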
data/ext/leveldb/db/db_impl.cc
@@ -0,0 +1,1485 @@
+ // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+ #include "db/db_impl.h"
+
+ #include <algorithm>
+ #include <set>
+ #include <string>
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <vector>
+ #include "db/builder.h"
+ #include "db/db_iter.h"
+ #include "db/dbformat.h"
+ #include "db/filename.h"
+ #include "db/log_reader.h"
+ #include "db/log_writer.h"
+ #include "db/memtable.h"
+ #include "db/table_cache.h"
+ #include "db/version_set.h"
+ #include "db/write_batch_internal.h"
+ #include "leveldb/db.h"
+ #include "leveldb/env.h"
+ #include "leveldb/status.h"
+ #include "leveldb/table.h"
+ #include "leveldb/table_builder.h"
+ #include "port/port.h"
+ #include "table/block.h"
+ #include "table/merger.h"
+ #include "table/two_level_iterator.h"
+ #include "util/coding.h"
+ #include "util/logging.h"
+ #include "util/mutexlock.h"
+
+ namespace leveldb {
+
+ const int kNumNonTableCacheFiles = 10;
+
+ // Information kept for every waiting writer
+ struct DBImpl::Writer {
+   Status status;
+   WriteBatch* batch;
+   bool sync;
+   bool done;
+   port::CondVar cv;
+
+   explicit Writer(port::Mutex* mu) : cv(mu) { }
+ };
+
+ struct DBImpl::CompactionState {
+   Compaction* const compaction;
+
+   // Sequence numbers < smallest_snapshot are not significant since we
+   // will never have to service a snapshot below smallest_snapshot.
+   // Therefore if we have seen a sequence number S <= smallest_snapshot,
+   // we can drop all entries for the same key with sequence numbers < S.
+   SequenceNumber smallest_snapshot;
+
+   // Files produced by compaction
+   struct Output {
+     uint64_t number;
+     uint64_t file_size;
+     InternalKey smallest, largest;
+   };
+   std::vector<Output> outputs;
+
+   // State kept for output being generated
+   WritableFile* outfile;
+   TableBuilder* builder;
+
+   uint64_t total_bytes;
+
+   Output* current_output() { return &outputs[outputs.size()-1]; }
+
+   explicit CompactionState(Compaction* c)
+       : compaction(c),
+         outfile(NULL),
+         builder(NULL),
+         total_bytes(0) {
+   }
+ };
+
+ // Fix user-supplied options to be reasonable
+ template <class T, class V>
+ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+   if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+   if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+ }
+ Options SanitizeOptions(const std::string& dbname,
+                         const InternalKeyComparator* icmp,
+                         const InternalFilterPolicy* ipolicy,
+                         const Options& src) {
+   Options result = src;
+   result.comparator = icmp;
+   result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
+   ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
+   ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
+   ClipToRange(&result.block_size, 1<<10, 4<<20);
+   if (result.info_log == NULL) {
+     // Open a log file in the same directory as the db
+     src.env->CreateDir(dbname);  // In case it does not exist
+     src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+     Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
+     if (!s.ok()) {
+       // No place suitable for logging
+       result.info_log = NULL;
+     }
+   }
+   if (result.block_cache == NULL) {
+     result.block_cache = NewLRUCache(8 << 20);
+   }
+   return result;
+ }
+
+ DBImpl::DBImpl(const Options& options, const std::string& dbname)
+     : env_(options.env),
+       internal_comparator_(options.comparator),
+       internal_filter_policy_(options.filter_policy),
+       options_(SanitizeOptions(
+           dbname, &internal_comparator_, &internal_filter_policy_, options)),
+       owns_info_log_(options_.info_log != options.info_log),
+       owns_cache_(options_.block_cache != options.block_cache),
+       dbname_(dbname),
+       db_lock_(NULL),
+       shutting_down_(NULL),
+       bg_cv_(&mutex_),
+       mem_(new MemTable(internal_comparator_)),
+       imm_(NULL),
+       logfile_(NULL),
+       logfile_number_(0),
+       log_(NULL),
+       tmp_batch_(new WriteBatch),
+       bg_compaction_scheduled_(false),
+       manual_compaction_(NULL),
+       consecutive_compaction_errors_(0) {
+   mem_->Ref();
+   has_imm_.Release_Store(NULL);
+
+   // Reserve ten files or so for other uses and give the rest to TableCache.
+   const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
+   table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+   versions_ = new VersionSet(dbname_, &options_, table_cache_,
+                              &internal_comparator_);
+ }
+
+ DBImpl::~DBImpl() {
+   // Wait for background work to finish
+   mutex_.Lock();
+   shutting_down_.Release_Store(this);  // Any non-NULL value is ok
+   while (bg_compaction_scheduled_) {
+     bg_cv_.Wait();
+   }
+   mutex_.Unlock();
+
+   if (db_lock_ != NULL) {
+     env_->UnlockFile(db_lock_);
+   }
+
+   delete versions_;
+   if (mem_ != NULL) mem_->Unref();
+   if (imm_ != NULL) imm_->Unref();
+   delete tmp_batch_;
+   delete log_;
+   delete logfile_;
+   delete table_cache_;
+
+   if (owns_info_log_) {
+     delete options_.info_log;
+   }
+   if (owns_cache_) {
+     delete options_.block_cache;
+   }
+ }
+
+ Status DBImpl::NewDB() {
+   VersionEdit new_db;
+   new_db.SetComparatorName(user_comparator()->Name());
+   new_db.SetLogNumber(0);
+   new_db.SetNextFile(2);
+   new_db.SetLastSequence(0);
+
+   const std::string manifest = DescriptorFileName(dbname_, 1);
+   WritableFile* file;
+   Status s = env_->NewWritableFile(manifest, &file);
+   if (!s.ok()) {
+     return s;
+   }
+   {
+     log::Writer log(file);
+     std::string record;
+     new_db.EncodeTo(&record);
+     s = log.AddRecord(record);
+     if (s.ok()) {
+       s = file->Close();
+     }
+   }
+   delete file;
+   if (s.ok()) {
+     // Make "CURRENT" file that points to the new manifest file.
+     s = SetCurrentFile(env_, dbname_, 1);
+   } else {
+     env_->DeleteFile(manifest);
+   }
+   return s;
+ }
+
+ void DBImpl::MaybeIgnoreError(Status* s) const {
+   if (s->ok() || options_.paranoid_checks) {
+     // No change needed
+   } else {
+     Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+     *s = Status::OK();
+   }
+ }
+
+ void DBImpl::DeleteObsoleteFiles() {
+   // Make a set of all of the live files
+   std::set<uint64_t> live = pending_outputs_;
+   versions_->AddLiveFiles(&live);
+
+   std::vector<std::string> filenames;
+   env_->GetChildren(dbname_, &filenames);  // Ignoring errors on purpose
+   uint64_t number;
+   FileType type;
+   for (size_t i = 0; i < filenames.size(); i++) {
+     if (ParseFileName(filenames[i], &number, &type)) {
+       bool keep = true;
+       switch (type) {
+         case kLogFile:
+           keep = ((number >= versions_->LogNumber()) ||
+                   (number == versions_->PrevLogNumber()));
+           break;
+         case kDescriptorFile:
+           // Keep my manifest file, and any newer incarnations'
+           // (in case there is a race that allows other incarnations)
+           keep = (number >= versions_->ManifestFileNumber());
+           break;
+         case kTableFile:
+           keep = (live.find(number) != live.end());
+           break;
+         case kTempFile:
+           // Any temp files that are currently being written to must
+           // be recorded in pending_outputs_, which is inserted into "live"
+           keep = (live.find(number) != live.end());
+           break;
+         case kCurrentFile:
+         case kDBLockFile:
+         case kInfoLogFile:
+           keep = true;
+           break;
+       }
+
+       if (!keep) {
+         if (type == kTableFile) {
+           table_cache_->Evict(number);
+         }
+         Log(options_.info_log, "Delete type=%d #%lld\n",
+             int(type),
+             static_cast<unsigned long long>(number));
+         env_->DeleteFile(dbname_ + "/" + filenames[i]);
+       }
+     }
+   }
+ }
+
+ Status DBImpl::Recover(VersionEdit* edit) {
+   mutex_.AssertHeld();
+
+   // Ignore error from CreateDir since the creation of the DB is
+   // committed only when the descriptor is created, and this directory
+   // may already exist from a previous failed creation attempt.
+   env_->CreateDir(dbname_);
+   assert(db_lock_ == NULL);
+   Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+   if (!s.ok()) {
+     return s;
+   }
+
+   if (!env_->FileExists(CurrentFileName(dbname_))) {
+     if (options_.create_if_missing) {
+       s = NewDB();
+       if (!s.ok()) {
+         return s;
+       }
+     } else {
+       return Status::InvalidArgument(
+           dbname_, "does not exist (create_if_missing is false)");
+     }
+   } else {
+     if (options_.error_if_exists) {
+       return Status::InvalidArgument(
+           dbname_, "exists (error_if_exists is true)");
+     }
+   }
+
+   s = versions_->Recover();
+   if (s.ok()) {
+     SequenceNumber max_sequence(0);
+
+     // Recover from all newer log files than the ones named in the
+     // descriptor (new log files may have been added by the previous
+     // incarnation without registering them in the descriptor).
+     //
+     // Note that PrevLogNumber() is no longer used, but we pay
+     // attention to it in case we are recovering a database
+     // produced by an older version of leveldb.
+     const uint64_t min_log = versions_->LogNumber();
+     const uint64_t prev_log = versions_->PrevLogNumber();
+     std::vector<std::string> filenames;
+     s = env_->GetChildren(dbname_, &filenames);
+     if (!s.ok()) {
+       return s;
+     }
+     std::set<uint64_t> expected;
+     versions_->AddLiveFiles(&expected);
+     uint64_t number;
+     FileType type;
+     std::vector<uint64_t> logs;
+     for (size_t i = 0; i < filenames.size(); i++) {
+       if (ParseFileName(filenames[i], &number, &type)) {
+         expected.erase(number);
+         if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
+           logs.push_back(number);
+       }
+     }
+     if (!expected.empty()) {
+       char buf[50];
+       snprintf(buf, sizeof(buf), "%d missing files; e.g.",
+                static_cast<int>(expected.size()));
+       return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
+     }
+
+     // Recover in the order in which the logs were generated
+     std::sort(logs.begin(), logs.end());
+     for (size_t i = 0; i < logs.size(); i++) {
+       s = RecoverLogFile(logs[i], edit, &max_sequence);
+
+       // The previous incarnation may not have written any MANIFEST
+       // records after allocating this log number. So we manually
+       // update the file number allocation counter in VersionSet.
+       versions_->MarkFileNumberUsed(logs[i]);
+     }
+
+     if (s.ok()) {
+       if (versions_->LastSequence() < max_sequence) {
+         versions_->SetLastSequence(max_sequence);
+       }
+     }
+   }
+
+   return s;
+ }
+
+ Status DBImpl::RecoverLogFile(uint64_t log_number,
+                               VersionEdit* edit,
+                               SequenceNumber* max_sequence) {
+   struct LogReporter : public log::Reader::Reporter {
+     Env* env;
+     Logger* info_log;
+     const char* fname;
+     Status* status;  // NULL if options_.paranoid_checks==false
+     virtual void Corruption(size_t bytes, const Status& s) {
+       Log(info_log, "%s%s: dropping %d bytes; %s",
+           (this->status == NULL ? "(ignoring error) " : ""),
+           fname, static_cast<int>(bytes), s.ToString().c_str());
+       if (this->status != NULL && this->status->ok()) *this->status = s;
+     }
+   };
+
+   mutex_.AssertHeld();
+
+   // Open the log file
+   std::string fname = LogFileName(dbname_, log_number);
+   SequentialFile* file;
+   Status status = env_->NewSequentialFile(fname, &file);
+   if (!status.ok()) {
+     MaybeIgnoreError(&status);
+     return status;
+   }
+
+   // Create the log reader.
+   LogReporter reporter;
+   reporter.env = env_;
+   reporter.info_log = options_.info_log;
+   reporter.fname = fname.c_str();
+   reporter.status = (options_.paranoid_checks ? &status : NULL);
+   // We intentionally make log::Reader do checksumming even if
+   // paranoid_checks==false so that corruptions cause entire commits
+   // to be skipped instead of propagating bad information (like overly
+   // large sequence numbers).
+   log::Reader reader(file, &reporter, true/*checksum*/,
+                      0/*initial_offset*/);
+   Log(options_.info_log, "Recovering log #%llu",
+       (unsigned long long) log_number);
+
+   // Read all the records and add to a memtable
+   std::string scratch;
+   Slice record;
+   WriteBatch batch;
+   MemTable* mem = NULL;
+   while (reader.ReadRecord(&record, &scratch) &&
+          status.ok()) {
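+     // A WriteBatch payload is preceded by a 12-byte header (an 8-byte
+     // sequence number followed by a 4-byte count), so shorter records
+     // cannot be valid batches.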
+     if (record.size() < 12) {
+       reporter.Corruption(
+           record.size(), Status::Corruption("log record too small"));
+       continue;
+     }
+     WriteBatchInternal::SetContents(&batch, record);
+
+     if (mem == NULL) {
+       mem = new MemTable(internal_comparator_);
+       mem->Ref();
+     }
+     status = WriteBatchInternal::InsertInto(&batch, mem);
+     MaybeIgnoreError(&status);
+     if (!status.ok()) {
+       break;
+     }
+     const SequenceNumber last_seq =
+         WriteBatchInternal::Sequence(&batch) +
+         WriteBatchInternal::Count(&batch) - 1;
+     if (last_seq > *max_sequence) {
+       *max_sequence = last_seq;
+     }
+
+     if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+       status = WriteLevel0Table(mem, edit, NULL);
+       if (!status.ok()) {
+         // Reflect errors immediately so that conditions like full
+         // file-systems cause the DB::Open() to fail.
+         break;
+       }
+       mem->Unref();
+       mem = NULL;
+     }
+   }
+
+   if (status.ok() && mem != NULL) {
+     status = WriteLevel0Table(mem, edit, NULL);
+     // Reflect errors immediately so that conditions like full
+     // file-systems cause the DB::Open() to fail.
+   }
+
+   if (mem != NULL) mem->Unref();
+   delete file;
+   return status;
+ }
+
+ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
+                                 Version* base) {
+   mutex_.AssertHeld();
+   const uint64_t start_micros = env_->NowMicros();
+   FileMetaData meta;
+   meta.number = versions_->NewFileNumber();
+   pending_outputs_.insert(meta.number);
+   Iterator* iter = mem->NewIterator();
+   Log(options_.info_log, "Level-0 table #%llu: started",
+       (unsigned long long) meta.number);
+
+   Status s;
+   {
+     mutex_.Unlock();
+     s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+     mutex_.Lock();
+   }
+
+   Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
+       (unsigned long long) meta.number,
+       (unsigned long long) meta.file_size,
+       s.ToString().c_str());
+   delete iter;
+   pending_outputs_.erase(meta.number);
+
+
+   // Note that if file_size is zero, the file has been deleted and
+   // should not be added to the manifest.
+   int level = 0;
+   if (s.ok() && meta.file_size > 0) {
+     const Slice min_user_key = meta.smallest.user_key();
+     const Slice max_user_key = meta.largest.user_key();
+     if (base != NULL) {
+       level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+     }
+     edit->AddFile(level, meta.number, meta.file_size,
+                   meta.smallest, meta.largest);
+   }
+
+   CompactionStats stats;
+   stats.micros = env_->NowMicros() - start_micros;
+   stats.bytes_written = meta.file_size;
+   stats_[level].Add(stats);
+   return s;
+ }
+
+ Status DBImpl::CompactMemTable() {
+   mutex_.AssertHeld();
+   assert(imm_ != NULL);
+
+   // Save the contents of the memtable as a new Table
+   VersionEdit edit;
+   Version* base = versions_->current();
+   base->Ref();
+   Status s = WriteLevel0Table(imm_, &edit, base);
+   base->Unref();
+
+   if (s.ok() && shutting_down_.Acquire_Load()) {
+     s = Status::IOError("Deleting DB during memtable compaction");
+   }
+
+   // Replace immutable memtable with the generated Table
+   if (s.ok()) {
+     edit.SetPrevLogNumber(0);
+     edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
+     s = versions_->LogAndApply(&edit, &mutex_);
+   }
+
+   if (s.ok()) {
+     // Commit to the new state
+     imm_->Unref();
+     imm_ = NULL;
+     has_imm_.Release_Store(NULL);
+     DeleteObsoleteFiles();
+   }
+
+   return s;
+ }
+
+ void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
+   int max_level_with_files = 1;
+   {
+     MutexLock l(&mutex_);
+     Version* base = versions_->current();
+     for (int level = 1; level < config::kNumLevels; level++) {
+       if (base->OverlapInLevel(level, begin, end)) {
+         max_level_with_files = level;
+       }
+     }
+   }
+   TEST_CompactMemTable();  // TODO(sanjay): Skip if memtable does not overlap
+   for (int level = 0; level < max_level_with_files; level++) {
+     TEST_CompactRange(level, begin, end);
+   }
+ }
+
+ void DBImpl::TEST_CompactRange(int level, const Slice* begin, const Slice* end) {
+   assert(level >= 0);
+   assert(level + 1 < config::kNumLevels);
+
+   InternalKey begin_storage, end_storage;
+
+   ManualCompaction manual;
+   manual.level = level;
+   manual.done = false;
+   if (begin == NULL) {
+     manual.begin = NULL;
+   } else {
+     begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+     manual.begin = &begin_storage;
+   }
+   if (end == NULL) {
+     manual.end = NULL;
+   } else {
+     end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+     manual.end = &end_storage;
+   }
+
+   MutexLock l(&mutex_);
+   while (!manual.done) {
+     while (manual_compaction_ != NULL) {
+       bg_cv_.Wait();
+     }
+     manual_compaction_ = &manual;
+     MaybeScheduleCompaction();
+     while (manual_compaction_ == &manual) {
+       bg_cv_.Wait();
+     }
+   }
+ }
+
+ Status DBImpl::TEST_CompactMemTable() {
+   // NULL batch means just wait for earlier writes to be done
+   Status s = Write(WriteOptions(), NULL);
+   if (s.ok()) {
+     // Wait until the compaction completes
+     MutexLock l(&mutex_);
+     while (imm_ != NULL && bg_error_.ok()) {
+       bg_cv_.Wait();
+     }
+     if (imm_ != NULL) {
+       s = bg_error_;
+     }
+   }
+   return s;
+ }
+
+ void DBImpl::MaybeScheduleCompaction() {
+   mutex_.AssertHeld();
+   if (bg_compaction_scheduled_) {
+     // Already scheduled
+   } else if (shutting_down_.Acquire_Load()) {
+     // DB is being deleted; no more background compactions
+   } else if (imm_ == NULL &&
+              manual_compaction_ == NULL &&
+              !versions_->NeedsCompaction()) {
+     // No work to be done
+   } else {
+     bg_compaction_scheduled_ = true;
+     env_->Schedule(&DBImpl::BGWork, this);
+   }
+ }
+
+ void DBImpl::BGWork(void* db) {
+   reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+ }
+
+ void DBImpl::BackgroundCall() {
+   MutexLock l(&mutex_);
+   assert(bg_compaction_scheduled_);
+   if (!shutting_down_.Acquire_Load()) {
+     Status s = BackgroundCompaction();
+     if (s.ok()) {
+       // Success
+       consecutive_compaction_errors_ = 0;
+     } else if (shutting_down_.Acquire_Load()) {
+       // Error most likely due to shutdown; do not wait
+     } else {
+       // Wait a little bit before retrying background compaction in
+       // case this is an environmental problem and we do not want to
+       // chew up resources for failed compactions for the duration of
+       // the problem.
+       bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+       Log(options_.info_log, "Waiting after background compaction error: %s",
+           s.ToString().c_str());
+       mutex_.Unlock();
+       ++consecutive_compaction_errors_;
+       int seconds_to_sleep = 1;
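+       // Back off exponentially as errors repeat: 1s, 2s, 4s, capped at 8s.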
+       for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) {
+         seconds_to_sleep *= 2;
+       }
+       env_->SleepForMicroseconds(seconds_to_sleep * 1000000);
+       mutex_.Lock();
+     }
+   }
+
+   bg_compaction_scheduled_ = false;
+
+   // Previous compaction may have produced too many files in a level,
+   // so reschedule another compaction if needed.
+   MaybeScheduleCompaction();
+   bg_cv_.SignalAll();
+ }
+
+ Status DBImpl::BackgroundCompaction() {
+   mutex_.AssertHeld();
+
+   if (imm_ != NULL) {
+     return CompactMemTable();
+   }
+
+   Compaction* c;
+   bool is_manual = (manual_compaction_ != NULL);
+   InternalKey manual_end;
+   if (is_manual) {
+     ManualCompaction* m = manual_compaction_;
+     c = versions_->CompactRange(m->level, m->begin, m->end);
+     m->done = (c == NULL);
+     if (c != NULL) {
+       manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
+     }
+     Log(options_.info_log,
+         "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
+         m->level,
+         (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+         (m->end ? m->end->DebugString().c_str() : "(end)"),
+         (m->done ? "(end)" : manual_end.DebugString().c_str()));
+   } else {
+     c = versions_->PickCompaction();
+   }
+
+   Status status;
+   if (c == NULL) {
+     // Nothing to do
+   } else if (!is_manual && c->IsTrivialMove()) {
+     // Move file to next level
+     assert(c->num_input_files(0) == 1);
+     FileMetaData* f = c->input(0, 0);
+     c->edit()->DeleteFile(c->level(), f->number);
+     c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                        f->smallest, f->largest);
+     status = versions_->LogAndApply(c->edit(), &mutex_);
+     VersionSet::LevelSummaryStorage tmp;
+     Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+         static_cast<unsigned long long>(f->number),
+         c->level() + 1,
+         static_cast<unsigned long long>(f->file_size),
+         status.ToString().c_str(),
+         versions_->LevelSummary(&tmp));
+   } else {
+     CompactionState* compact = new CompactionState(c);
+     status = DoCompactionWork(compact);
+     CleanupCompaction(compact);
+     c->ReleaseInputs();
+     DeleteObsoleteFiles();
+   }
+   delete c;
+
+   if (status.ok()) {
+     // Done
+   } else if (shutting_down_.Acquire_Load()) {
+     // Ignore compaction errors found during shutting down
+   } else {
+     Log(options_.info_log,
+         "Compaction error: %s", status.ToString().c_str());
+     if (options_.paranoid_checks && bg_error_.ok()) {
+       bg_error_ = status;
+     }
+   }
+
+   if (is_manual) {
+     ManualCompaction* m = manual_compaction_;
+     if (!status.ok()) {
+       m->done = true;
+     }
+     if (!m->done) {
+       // We only compacted part of the requested range. Update *m
+       // to the range that is left to be compacted.
+       m->tmp_storage = manual_end;
+       m->begin = &m->tmp_storage;
+     }
+     manual_compaction_ = NULL;
+   }
+   return status;
+ }
+
+ void DBImpl::CleanupCompaction(CompactionState* compact) {
+   mutex_.AssertHeld();
+   if (compact->builder != NULL) {
+     // May happen if we get a shutdown call in the middle of compaction
+     compact->builder->Abandon();
+     delete compact->builder;
+   } else {
+     assert(compact->outfile == NULL);
+   }
+   delete compact->outfile;
+   for (size_t i = 0; i < compact->outputs.size(); i++) {
+     const CompactionState::Output& out = compact->outputs[i];
+     pending_outputs_.erase(out.number);
+   }
+   delete compact;
+ }
+
+ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+   assert(compact != NULL);
+   assert(compact->builder == NULL);
+   uint64_t file_number;
+   {
+     mutex_.Lock();
+     file_number = versions_->NewFileNumber();
+     pending_outputs_.insert(file_number);
+     CompactionState::Output out;
+     out.number = file_number;
+     out.smallest.Clear();
+     out.largest.Clear();
+     compact->outputs.push_back(out);
+     mutex_.Unlock();
+   }
+
+   // Make the output file
+   std::string fname = TableFileName(dbname_, file_number);
+   Status s = env_->NewWritableFile(fname, &compact->outfile);
+   if (s.ok()) {
+     compact->builder = new TableBuilder(options_, compact->outfile);
+   }
+   return s;
+ }
+
+ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                           Iterator* input) {
+   assert(compact != NULL);
+   assert(compact->outfile != NULL);
+   assert(compact->builder != NULL);
+
+   const uint64_t output_number = compact->current_output()->number;
+   assert(output_number != 0);
+
+   // Check for iterator errors
+   Status s = input->status();
+   const uint64_t current_entries = compact->builder->NumEntries();
+   if (s.ok()) {
+     s = compact->builder->Finish();
+   } else {
+     compact->builder->Abandon();
+   }
+   const uint64_t current_bytes = compact->builder->FileSize();
+   compact->current_output()->file_size = current_bytes;
+   compact->total_bytes += current_bytes;
+   delete compact->builder;
+   compact->builder = NULL;
+
+   // Finish and check for file errors
+   if (s.ok()) {
+     s = compact->outfile->Sync();
+   }
+   if (s.ok()) {
+     s = compact->outfile->Close();
+   }
+   delete compact->outfile;
+   compact->outfile = NULL;
+
+   if (s.ok() && current_entries > 0) {
+     // Verify that the table is usable
+     Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                                output_number,
+                                                current_bytes);
+     s = iter->status();
+     delete iter;
+     if (s.ok()) {
+       Log(options_.info_log,
+           "Generated table #%llu: %lld keys, %lld bytes",
+           (unsigned long long) output_number,
+           (unsigned long long) current_entries,
+           (unsigned long long) current_bytes);
+     }
+   }
+   return s;
+ }
+
+
+ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+   mutex_.AssertHeld();
+   Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+       compact->compaction->num_input_files(0),
+       compact->compaction->level(),
+       compact->compaction->num_input_files(1),
+       compact->compaction->level() + 1,
+       static_cast<long long>(compact->total_bytes));
+
+   // Add compaction outputs
+   compact->compaction->AddInputDeletions(compact->compaction->edit());
+   const int level = compact->compaction->level();
+   for (size_t i = 0; i < compact->outputs.size(); i++) {
+     const CompactionState::Output& out = compact->outputs[i];
+     compact->compaction->edit()->AddFile(
+         level + 1,
+         out.number, out.file_size, out.smallest, out.largest);
+   }
+   return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
+ }
+
+ Status DBImpl::DoCompactionWork(CompactionState* compact) {
+   const uint64_t start_micros = env_->NowMicros();
+   int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+
+   Log(options_.info_log, "Compacting %d@%d + %d@%d files",
+       compact->compaction->num_input_files(0),
+       compact->compaction->level(),
+       compact->compaction->num_input_files(1),
+       compact->compaction->level() + 1);
+
+   assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+   assert(compact->builder == NULL);
+   assert(compact->outfile == NULL);
+   if (snapshots_.empty()) {
+     compact->smallest_snapshot = versions_->LastSequence();
+   } else {
+     compact->smallest_snapshot = snapshots_.oldest()->number_;
+   }
+
+   // Release mutex while we're actually doing the compaction work
+   mutex_.Unlock();
+
+   Iterator* input = versions_->MakeInputIterator(compact->compaction);
+   input->SeekToFirst();
+   Status status;
+   ParsedInternalKey ikey;
+   std::string current_user_key;
+   bool has_current_user_key = false;
+   SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+   for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+     // Prioritize immutable compaction work
+     if (has_imm_.NoBarrier_Load() != NULL) {
+       const uint64_t imm_start = env_->NowMicros();
+       mutex_.Lock();
+       if (imm_ != NULL) {
+         CompactMemTable();
+         bg_cv_.SignalAll();  // Wake up MakeRoomForWrite() if necessary
+       }
+       mutex_.Unlock();
+       imm_micros += (env_->NowMicros() - imm_start);
+     }
+
+     Slice key = input->key();
+     if (compact->compaction->ShouldStopBefore(key) &&
+         compact->builder != NULL) {
+       status = FinishCompactionOutputFile(compact, input);
+       if (!status.ok()) {
+         break;
+       }
+     }
+
+     // Handle key/value, add to state, etc.
+     bool drop = false;
+     if (!ParseInternalKey(key, &ikey)) {
+       // Do not hide error keys
+       current_user_key.clear();
+       has_current_user_key = false;
+       last_sequence_for_key = kMaxSequenceNumber;
+     } else {
+       if (!has_current_user_key ||
+           user_comparator()->Compare(ikey.user_key,
+                                      Slice(current_user_key)) != 0) {
+         // First occurrence of this user key
+         current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+         has_current_user_key = true;
+         last_sequence_for_key = kMaxSequenceNumber;
+       }
+
+       if (last_sequence_for_key <= compact->smallest_snapshot) {
+         // Hidden by a newer entry for same user key
+         drop = true;  // (A)
+       } else if (ikey.type == kTypeDeletion &&
+                  ikey.sequence <= compact->smallest_snapshot &&
+                  compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+         // For this user key:
+         // (1) there is no data in higher levels
+         // (2) data in lower levels will have larger sequence numbers
+         // (3) data in layers that are being compacted here and have
+         //     smaller sequence numbers will be dropped in the next
+         //     few iterations of this loop (by rule (A) above).
+         // Therefore this deletion marker is obsolete and can be dropped.
+         drop = true;
+       }
+
+       last_sequence_for_key = ikey.sequence;
+     }
+ #if 0
+     Log(options_.info_log,
+         "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+         "%d smallest_snapshot: %d",
+         ikey.user_key.ToString().c_str(),
+         (int)ikey.sequence, ikey.type, kTypeValue, drop,
+         compact->compaction->IsBaseLevelForKey(ikey.user_key),
+         (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+ #endif
+
+     if (!drop) {
+       // Open output file if necessary
+       if (compact->builder == NULL) {
+         status = OpenCompactionOutputFile(compact);
+         if (!status.ok()) {
+           break;
+         }
+       }
+       if (compact->builder->NumEntries() == 0) {
+         compact->current_output()->smallest.DecodeFrom(key);
+       }
+       compact->current_output()->largest.DecodeFrom(key);
+       compact->builder->Add(key, input->value());
+
+       // Close output file if it is big enough
+       if (compact->builder->FileSize() >=
+           compact->compaction->MaxOutputFileSize()) {
+         status = FinishCompactionOutputFile(compact, input);
+         if (!status.ok()) {
+           break;
+         }
+       }
+     }
+
+     input->Next();
+   }
+
+   if (status.ok() && shutting_down_.Acquire_Load()) {
+     status = Status::IOError("Deleting DB during compaction");
+   }
+   if (status.ok() && compact->builder != NULL) {
+     status = FinishCompactionOutputFile(compact, input);
+   }
+   if (status.ok()) {
+     status = input->status();
+   }
+   delete input;
+   input = NULL;
+
+   CompactionStats stats;
+   stats.micros = env_->NowMicros() - start_micros - imm_micros;
+   for (int which = 0; which < 2; which++) {
+     for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+       stats.bytes_read += compact->compaction->input(which, i)->file_size;
+     }
+   }
+   for (size_t i = 0; i < compact->outputs.size(); i++) {
+     stats.bytes_written += compact->outputs[i].file_size;
+   }
+
+   mutex_.Lock();
+   stats_[compact->compaction->level() + 1].Add(stats);
+
+   if (status.ok()) {
+     status = InstallCompactionResults(compact);
+   }
+   VersionSet::LevelSummaryStorage tmp;
+   Log(options_.info_log,
+       "compacted to: %s", versions_->LevelSummary(&tmp));
+   return status;
+ }
+
+ namespace {
+ struct IterState {
+   port::Mutex* mu;
+   Version* version;
+   MemTable* mem;
+   MemTable* imm;
+ };
+
+ static void CleanupIteratorState(void* arg1, void* arg2) {
+   IterState* state = reinterpret_cast<IterState*>(arg1);
+   state->mu->Lock();
+   state->mem->Unref();
+   if (state->imm != NULL) state->imm->Unref();
+   state->version->Unref();
+   state->mu->Unlock();
+   delete state;
+ }
+ }  // namespace
+
+ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                       SequenceNumber* latest_snapshot) {
+   IterState* cleanup = new IterState;
+   mutex_.Lock();
+   *latest_snapshot = versions_->LastSequence();
+
+   // Collect together all needed child iterators
+   std::vector<Iterator*> list;
+   list.push_back(mem_->NewIterator());
+   mem_->Ref();
+   if (imm_ != NULL) {
+     list.push_back(imm_->NewIterator());
+     imm_->Ref();
+   }
+   versions_->current()->AddIterators(options, &list);
+   Iterator* internal_iter =
+       NewMergingIterator(&internal_comparator_, &list[0], list.size());
+   versions_->current()->Ref();
+
+   cleanup->mu = &mutex_;
+   cleanup->mem = mem_;
+   cleanup->imm = imm_;
+   cleanup->version = versions_->current();
+   internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
+
+   mutex_.Unlock();
+   return internal_iter;
+ }
+
+ Iterator* DBImpl::TEST_NewInternalIterator() {
+   SequenceNumber ignored;
+   return NewInternalIterator(ReadOptions(), &ignored);
+ }
+
+ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+   MutexLock l(&mutex_);
+   return versions_->MaxNextLevelOverlappingBytes();
+ }
+
+ Status DBImpl::Get(const ReadOptions& options,
+                    const Slice& key,
+                    std::string* value) {
+   Status s;
+   MutexLock l(&mutex_);
+   SequenceNumber snapshot;
+   if (options.snapshot != NULL) {
+     snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+   } else {
+     snapshot = versions_->LastSequence();
+   }
+
+   MemTable* mem = mem_;
+   MemTable* imm = imm_;
+   Version* current = versions_->current();
+   mem->Ref();
+   if (imm != NULL) imm->Ref();
+   current->Ref();
+
+   bool have_stat_update = false;
+   Version::GetStats stats;
+
+   // Unlock while reading from files and memtables
+   {
+     mutex_.Unlock();
+     // First look in the memtable, then in the immutable memtable (if any).
+     LookupKey lkey(key, snapshot);
+     if (mem->Get(lkey, value, &s)) {
+       // Done
+     } else if (imm != NULL && imm->Get(lkey, value, &s)) {
+       // Done
+     } else {
+       s = current->Get(options, lkey, value, &stats);
+       have_stat_update = true;
+     }
+     mutex_.Lock();
+   }
+
+   if (have_stat_update && current->UpdateStats(stats)) {
+     MaybeScheduleCompaction();
+   }
+   mem->Unref();
+   if (imm != NULL) imm->Unref();
+   current->Unref();
+   return s;
+ }
+
+ Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+   SequenceNumber latest_snapshot;
+   Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+   return NewDBIterator(
+       &dbname_, env_, user_comparator(), internal_iter,
+       (options.snapshot != NULL
+        ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+        : latest_snapshot));
+ }
+
+ const Snapshot* DBImpl::GetSnapshot() {
+   MutexLock l(&mutex_);
+   return snapshots_.New(versions_->LastSequence());
+ }
+
+ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+   MutexLock l(&mutex_);
+   snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+ }
+
+ // Convenience methods
+ Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+   return DB::Put(o, key, val);
+ }
+
+ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+   return DB::Delete(options, key);
+ }
+
+ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+   Writer w(&mutex_);
+   w.batch = my_batch;
+   w.sync = options.sync;
+   w.done = false;
+
+   MutexLock l(&mutex_);
+   writers_.push_back(&w);
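+   // Wait until either this writer reaches the front of the queue (making it
+   // responsible for the group commit) or an earlier front writer has already
+   // committed our batch on our behalf (w.done, set in the loop below).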
+   while (!w.done && &w != writers_.front()) {
+     w.cv.Wait();
+   }
+   if (w.done) {
+     return w.status;
+   }
+
+   // May temporarily unlock and wait.
+   Status status = MakeRoomForWrite(my_batch == NULL);
+   uint64_t last_sequence = versions_->LastSequence();
+   Writer* last_writer = &w;
+   if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
+     WriteBatch* updates = BuildBatchGroup(&last_writer);
+     WriteBatchInternal::SetSequence(updates, last_sequence + 1);
+     last_sequence += WriteBatchInternal::Count(updates);
+
+     // Add to log and apply to memtable. We can release the lock
+     // during this phase since &w is currently responsible for logging
+     // and protects against concurrent loggers and concurrent writes
+     // into mem_.
+     {
+       mutex_.Unlock();
+       status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+       if (status.ok() && options.sync) {
+         status = logfile_->Sync();
+       }
+       if (status.ok()) {
+         status = WriteBatchInternal::InsertInto(updates, mem_);
+       }
+       mutex_.Lock();
+     }
+     if (updates == tmp_batch_) tmp_batch_->Clear();
+
+     versions_->SetLastSequence(last_sequence);
+   }
+
+   while (true) {
+     Writer* ready = writers_.front();
+     writers_.pop_front();
+     if (ready != &w) {
+       ready->status = status;
+       ready->done = true;
+       ready->cv.Signal();
+     }
+     if (ready == last_writer) break;
+   }
+
+   // Notify new head of write queue
+   if (!writers_.empty()) {
+     writers_.front()->cv.Signal();
+   }
+
+   return status;
+ }
+
+ // REQUIRES: Writer list must be non-empty
+ // REQUIRES: First writer must have a non-NULL batch
+ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+   assert(!writers_.empty());
+   Writer* first = writers_.front();
+   WriteBatch* result = first->batch;
+   assert(result != NULL);
+
+   size_t size = WriteBatchInternal::ByteSize(first->batch);
+
+   // Allow the group to grow up to a maximum size, but if the
+   // original write is small, limit the growth so we do not slow
+   // down the small write too much.
+   size_t max_size = 1 << 20;
+   if (size <= (128<<10)) {
+     max_size = size + (128<<10);
+   }
+
+   *last_writer = first;
+   std::deque<Writer*>::iterator iter = writers_.begin();
+   ++iter;  // Advance past "first"
+   for (; iter != writers_.end(); ++iter) {
+     Writer* w = *iter;
+     if (w->sync && !first->sync) {
+       // Do not include a sync write into a batch handled by a non-sync write.
+       break;
+     }
+
+     if (w->batch != NULL) {
+       size += WriteBatchInternal::ByteSize(w->batch);
+       if (size > max_size) {
+         // Do not make batch too big
+         break;
+       }
+
+       // Append to *result
+       if (result == first->batch) {
+         // Switch to temporary batch instead of disturbing caller's batch
+         result = tmp_batch_;
+         assert(WriteBatchInternal::Count(result) == 0);
+         WriteBatchInternal::Append(result, first->batch);
+       }
+       WriteBatchInternal::Append(result, w->batch);
+     }
+     *last_writer = w;
+   }
+   return result;
+ }
+
+ // REQUIRES: mutex_ is held
+ // REQUIRES: this thread is currently at the front of the writer queue
+ Status DBImpl::MakeRoomForWrite(bool force) {
+   mutex_.AssertHeld();
+   assert(!writers_.empty());
+   bool allow_delay = !force;
+   Status s;
+   while (true) {
+     if (!bg_error_.ok()) {
+       // Yield previous error
+       s = bg_error_;
+       break;
+     } else if (
+         allow_delay &&
+         versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
+       // We are getting close to hitting a hard limit on the number of
+       // L0 files. Rather than delaying a single write by several
+       // seconds when we hit the hard limit, start delaying each
+       // individual write by 1ms to reduce latency variance. Also,
+       // this delay hands over some CPU to the compaction thread in
+       // case it is sharing the same core as the writer.
+       mutex_.Unlock();
+       env_->SleepForMicroseconds(1000);
+       allow_delay = false;  // Do not delay a single write more than once
+       mutex_.Lock();
+     } else if (!force &&
+                (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+       // There is room in current memtable
+       break;
+     } else if (imm_ != NULL) {
+       // We have filled up the current memtable, but the previous
+       // one is still being compacted, so we wait.
+       Log(options_.info_log, "Current memtable full; waiting...\n");
+       bg_cv_.Wait();
+     } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
+       // There are too many level-0 files.
+       Log(options_.info_log, "Too many L0 files; waiting...\n");
+       bg_cv_.Wait();
+     } else {
+       // Attempt to switch to a new memtable and trigger compaction of old
+       assert(versions_->PrevLogNumber() == 0);
+       uint64_t new_log_number = versions_->NewFileNumber();
+       WritableFile* lfile = NULL;
+       s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+       if (!s.ok()) {
+         // Avoid chewing through file number space in a tight loop.
+         versions_->ReuseFileNumber(new_log_number);
+         break;
+       }
+       delete log_;
+       delete logfile_;
+       logfile_ = lfile;
+       logfile_number_ = new_log_number;
+       log_ = new log::Writer(lfile);
+       imm_ = mem_;
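+       // Publish the immutable memtable via the atomic pointer so the
+       // compaction thread can observe it without taking mutex_
+       // (read with NoBarrier_Load in DoCompactionWork).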
+       has_imm_.Release_Store(imm_);
+       mem_ = new MemTable(internal_comparator_);
+       mem_->Ref();
+       force = false;  // Do not force another compaction if have room
+       MaybeScheduleCompaction();
+     }
+   }
+   return s;
+ }
+
+ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+   value->clear();
+
+   MutexLock l(&mutex_);
+   Slice in = property;
+   Slice prefix("leveldb.");
+   if (!in.starts_with(prefix)) return false;
+   in.remove_prefix(prefix.size());
+
+   if (in.starts_with("num-files-at-level")) {
+     in.remove_prefix(strlen("num-files-at-level"));
+     uint64_t level;
+     bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+     if (!ok || level >= config::kNumLevels) {
+       return false;
+     } else {
+       char buf[100];
+       snprintf(buf, sizeof(buf), "%d",
+                versions_->NumLevelFiles(static_cast<int>(level)));
+       *value = buf;
+       return true;
+     }
+   } else if (in == "stats") {
+     char buf[200];
+     snprintf(buf, sizeof(buf),
+              "                               Compactions\n"
+              "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+              "--------------------------------------------------\n"
+              );
+     value->append(buf);
+     for (int level = 0; level < config::kNumLevels; level++) {
+       int files = versions_->NumLevelFiles(level);
+       if (stats_[level].micros > 0 || files > 0) {
+         snprintf(
+             buf, sizeof(buf),
+             "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+             level,
+             files,
+             versions_->NumLevelBytes(level) / 1048576.0,
+             stats_[level].micros / 1e6,
+             stats_[level].bytes_read / 1048576.0,
+             stats_[level].bytes_written / 1048576.0);
+         value->append(buf);
+       }
+     }
+     return true;
+   } else if (in == "sstables") {
+     *value = versions_->current()->DebugString();
+     return true;
+   }
+
+   return false;
+ }
+
+ void DBImpl::GetApproximateSizes(
+     const Range* range, int n,
+     uint64_t* sizes) {
+   // TODO(opt): better implementation
+   Version* v;
+   {
+     MutexLock l(&mutex_);
+     versions_->current()->Ref();
+     v = versions_->current();
+   }
+
+   for (int i = 0; i < n; i++) {
+     // Convert user_key into a corresponding internal key.
+     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+     uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+     uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+     sizes[i] = (limit >= start ? limit - start : 0);
+   }
+
+   {
+     MutexLock l(&mutex_);
+     v->Unref();
+   }
+ }
+
+ // Default implementations of convenience methods that subclasses of DB
+ // can call if they wish
+ Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+   WriteBatch batch;
+   batch.Put(key, value);
+   return Write(opt, &batch);
+ }
+
+ Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+   WriteBatch batch;
+   batch.Delete(key);
+   return Write(opt, &batch);
+ }
+
+ DB::~DB() { }
+
+ Status DB::Open(const Options& options, const std::string& dbname,
+                 DB** dbptr) {
+   *dbptr = NULL;
+
+   DBImpl* impl = new DBImpl(options, dbname);
+   impl->mutex_.Lock();
+   VersionEdit edit;
+   Status s = impl->Recover(&edit);  // Handles create_if_missing, error_if_exists
+   if (s.ok()) {
+     uint64_t new_log_number = impl->versions_->NewFileNumber();
+     WritableFile* lfile;
+     s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
+                                      &lfile);
+     if (s.ok()) {
+       edit.SetLogNumber(new_log_number);
+       impl->logfile_ = lfile;
+       impl->logfile_number_ = new_log_number;
+       impl->log_ = new log::Writer(lfile);
+       s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+     }
+     if (s.ok()) {
+       impl->DeleteObsoleteFiles();
+       impl->MaybeScheduleCompaction();
+     }
+   }
+   impl->mutex_.Unlock();
+   if (s.ok()) {
+     *dbptr = impl;
+   } else {
+     delete impl;
+   }
+   return s;
+ }
+
+ Snapshot::~Snapshot() {
+ }
+
+ Status DestroyDB(const std::string& dbname, const Options& options) {
+   Env* env = options.env;
+   std::vector<std::string> filenames;
+   // Ignore error in case directory does not exist
+   env->GetChildren(dbname, &filenames);
+   if (filenames.empty()) {
+     return Status::OK();
+   }
+
+   FileLock* lock;
+   const std::string lockname = LockFileName(dbname);
+   Status result = env->LockFile(lockname, &lock);
+   if (result.ok()) {
+     uint64_t number;
+     FileType type;
+     for (size_t i = 0; i < filenames.size(); i++) {
+       if (ParseFileName(filenames[i], &number, &type) &&
+           type != kDBLockFile) {  // Lock file will be deleted at end
+         Status del = env->DeleteFile(dbname + "/" + filenames[i]);
+         if (result.ok() && !del.ok()) {
+           result = del;
+         }
+       }
+     }
+     env->UnlockFile(lock);  // Ignore error since state is already gone
+     env->DeleteFile(lockname);
+     env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+   }
+   return result;
+ }
+
+ }  // namespace leveldb
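
As a usage note on the write and snapshot paths above: DBImpl::Write() commits a whole WriteBatch atomically (and BuildBatchGroup may fold several queued writers into one log record), while GetSnapshot()/ReleaseSnapshot() pin a sequence number so that reads ignore later writes. A short illustrative sketch under those assumptions; the db pointer, key names, and helper function names are placeholders:

  #include <string>
  #include "leveldb/db.h"
  #include "leveldb/write_batch.h"

  // Move a value from one key to another; both mutations are applied
  // atomically by DBImpl::Write() from a single batch.
  leveldb::Status MoveValue(leveldb::DB* db, const std::string& from,
                            const std::string& to) {
    std::string value;
    leveldb::Status s = db->Get(leveldb::ReadOptions(), from, &value);
    if (!s.ok()) return s;
    leveldb::WriteBatch batch;
    batch.Delete(from);
    batch.Put(to, value);
    return db->Write(leveldb::WriteOptions(), &batch);
  }

  // Read a key as of a fixed point in time, ignoring concurrent writes.
  leveldb::Status ReadAtSnapshot(leveldb::DB* db, const std::string& key,
                                 std::string* value) {
    const leveldb::Snapshot* snap = db->GetSnapshot();  // pins LastSequence()
    leveldb::ReadOptions options;
    options.snapshot = snap;
    leveldb::Status s = db->Get(options, key, value);
    db->ReleaseSnapshot(snap);  // lets compactions drop older entries again
    return s;
  }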