filiptepper-leveldb-ruby 0.14

Files changed (123)
  1. data/LICENSE +24 -0
  2. data/README +72 -0
  3. data/ext/leveldb/extconf.rb +14 -0
  4. data/ext/leveldb/leveldb.cc +530 -0
  5. data/ext/leveldb/platform.rb +83 -0
  6. data/leveldb/Makefile +191 -0
  7. data/leveldb/build_detect_platform +160 -0
  8. data/leveldb/db/builder.cc +88 -0
  9. data/leveldb/db/builder.h +34 -0
  10. data/leveldb/db/c.cc +581 -0
  11. data/leveldb/db/corruption_test.cc +359 -0
  12. data/leveldb/db/db_bench.cc +970 -0
  13. data/leveldb/db/db_impl.cc +1448 -0
  14. data/leveldb/db/db_impl.h +194 -0
  15. data/leveldb/db/db_iter.cc +299 -0
  16. data/leveldb/db/db_iter.h +26 -0
  17. data/leveldb/db/db_test.cc +1901 -0
  18. data/leveldb/db/dbformat.cc +140 -0
  19. data/leveldb/db/dbformat.h +227 -0
  20. data/leveldb/db/dbformat_test.cc +112 -0
  21. data/leveldb/db/filename.cc +139 -0
  22. data/leveldb/db/filename.h +80 -0
  23. data/leveldb/db/filename_test.cc +122 -0
  24. data/leveldb/db/log_format.h +35 -0
  25. data/leveldb/db/log_reader.cc +259 -0
  26. data/leveldb/db/log_reader.h +108 -0
  27. data/leveldb/db/log_test.cc +500 -0
  28. data/leveldb/db/log_writer.cc +103 -0
  29. data/leveldb/db/log_writer.h +48 -0
  30. data/leveldb/db/memtable.cc +145 -0
  31. data/leveldb/db/memtable.h +91 -0
  32. data/leveldb/db/repair.cc +389 -0
  33. data/leveldb/db/skiplist.h +379 -0
  34. data/leveldb/db/skiplist_test.cc +378 -0
  35. data/leveldb/db/snapshot.h +66 -0
  36. data/leveldb/db/table_cache.cc +121 -0
  37. data/leveldb/db/table_cache.h +61 -0
  38. data/leveldb/db/version_edit.cc +266 -0
  39. data/leveldb/db/version_edit.h +107 -0
  40. data/leveldb/db/version_edit_test.cc +46 -0
  41. data/leveldb/db/version_set.cc +1402 -0
  42. data/leveldb/db/version_set.h +370 -0
  43. data/leveldb/db/version_set_test.cc +179 -0
  44. data/leveldb/db/write_batch.cc +147 -0
  45. data/leveldb/db/write_batch_internal.h +49 -0
  46. data/leveldb/db/write_batch_test.cc +120 -0
  47. data/leveldb/helpers/memenv/memenv.cc +374 -0
  48. data/leveldb/helpers/memenv/memenv.h +20 -0
  49. data/leveldb/helpers/memenv/memenv_test.cc +232 -0
  50. data/leveldb/include/leveldb/c.h +275 -0
  51. data/leveldb/include/leveldb/cache.h +99 -0
  52. data/leveldb/include/leveldb/comparator.h +63 -0
  53. data/leveldb/include/leveldb/db.h +161 -0
  54. data/leveldb/include/leveldb/env.h +323 -0
  55. data/leveldb/include/leveldb/filter_policy.h +70 -0
  56. data/leveldb/include/leveldb/iterator.h +100 -0
  57. data/leveldb/include/leveldb/options.h +195 -0
  58. data/leveldb/include/leveldb/slice.h +109 -0
  59. data/leveldb/include/leveldb/status.h +106 -0
  60. data/leveldb/include/leveldb/table.h +85 -0
  61. data/leveldb/include/leveldb/table_builder.h +92 -0
  62. data/leveldb/include/leveldb/write_batch.h +64 -0
  63. data/leveldb/port/atomic_pointer.h +144 -0
  64. data/leveldb/port/port.h +21 -0
  65. data/leveldb/port/port_android.cc +64 -0
  66. data/leveldb/port/port_android.h +159 -0
  67. data/leveldb/port/port_example.h +125 -0
  68. data/leveldb/port/port_posix.cc +50 -0
  69. data/leveldb/port/port_posix.h +129 -0
  70. data/leveldb/port/win/stdint.h +24 -0
  71. data/leveldb/table/block.cc +267 -0
  72. data/leveldb/table/block.h +44 -0
  73. data/leveldb/table/block_builder.cc +109 -0
  74. data/leveldb/table/block_builder.h +57 -0
  75. data/leveldb/table/filter_block.cc +111 -0
  76. data/leveldb/table/filter_block.h +68 -0
  77. data/leveldb/table/filter_block_test.cc +128 -0
  78. data/leveldb/table/format.cc +145 -0
  79. data/leveldb/table/format.h +108 -0
  80. data/leveldb/table/iterator.cc +67 -0
  81. data/leveldb/table/iterator_wrapper.h +63 -0
  82. data/leveldb/table/merger.cc +197 -0
  83. data/leveldb/table/merger.h +26 -0
  84. data/leveldb/table/table.cc +276 -0
  85. data/leveldb/table/table_builder.cc +270 -0
  86. data/leveldb/table/table_test.cc +838 -0
  87. data/leveldb/table/two_level_iterator.cc +182 -0
  88. data/leveldb/table/two_level_iterator.h +34 -0
  89. data/leveldb/util/arena.cc +68 -0
  90. data/leveldb/util/arena.h +68 -0
  91. data/leveldb/util/arena_test.cc +68 -0
  92. data/leveldb/util/bloom.cc +95 -0
  93. data/leveldb/util/bloom_test.cc +159 -0
  94. data/leveldb/util/cache.cc +328 -0
  95. data/leveldb/util/cache_test.cc +186 -0
  96. data/leveldb/util/coding.cc +194 -0
  97. data/leveldb/util/coding.h +104 -0
  98. data/leveldb/util/coding_test.cc +173 -0
  99. data/leveldb/util/comparator.cc +76 -0
  100. data/leveldb/util/crc32c.cc +332 -0
  101. data/leveldb/util/crc32c.h +45 -0
  102. data/leveldb/util/crc32c_test.cc +72 -0
  103. data/leveldb/util/env.cc +96 -0
  104. data/leveldb/util/env_posix.cc +609 -0
  105. data/leveldb/util/env_test.cc +104 -0
  106. data/leveldb/util/filter_policy.cc +11 -0
  107. data/leveldb/util/hash.cc +45 -0
  108. data/leveldb/util/hash.h +19 -0
  109. data/leveldb/util/histogram.cc +139 -0
  110. data/leveldb/util/histogram.h +42 -0
  111. data/leveldb/util/logging.cc +81 -0
  112. data/leveldb/util/logging.h +47 -0
  113. data/leveldb/util/mutexlock.h +39 -0
  114. data/leveldb/util/options.cc +29 -0
  115. data/leveldb/util/posix_logger.h +98 -0
  116. data/leveldb/util/random.h +59 -0
  117. data/leveldb/util/status.cc +75 -0
  118. data/leveldb/util/testharness.cc +77 -0
  119. data/leveldb/util/testharness.h +138 -0
  120. data/leveldb/util/testutil.cc +51 -0
  121. data/leveldb/util/testutil.h +53 -0
  122. data/lib/leveldb.rb +76 -0
  123. metadata +175 -0
data/leveldb/db/db_impl.cc
@@ -0,0 +1,1448 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+  Status status;
+  WriteBatch* batch;
+  bool sync;
+  bool done;
+  port::CondVar cv;
+
+  explicit Writer(port::Mutex* mu) : cv(mu) { }
+};
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // Sequence numbers < smallest_snapshot are not significant since we
+  // will never have to service a snapshot below smallest_snapshot.
+  // Therefore if we have seen a sequence number S <= smallest_snapshot,
+  // we can drop all entries for the same key with sequence numbers < S.
+  SequenceNumber smallest_snapshot;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+  };
+  std::vector<Output> outputs;
+
+  // State kept for output being generated
+  WritableFile* outfile;
+  TableBuilder* builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        outfile(NULL),
+        builder(NULL),
+        total_bytes(0) {
+  }
+};
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  Options result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
+  ClipToRange(&result.max_open_files,    20,     50000);
+  ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
+  ClipToRange(&result.block_size,        1<<10,  4<<20);
+  if (result.info_log == NULL) {
+    // Open a log file in the same directory as the db
+    src.env->CreateDir(dbname);  // In case it does not exist
+    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+    Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = NULL;
+    }
+  }
+  if (result.block_cache == NULL) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  return result;
+}
+
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : env_(options.env),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(SanitizeOptions(
+          dbname, &internal_comparator_, &internal_filter_policy_, options)),
+      owns_info_log_(options_.info_log != options.info_log),
+      owns_cache_(options_.block_cache != options.block_cache),
+      dbname_(dbname),
+      db_lock_(NULL),
+      shutting_down_(NULL),
+      bg_cv_(&mutex_),
+      mem_(new MemTable(internal_comparator_)),
+      imm_(NULL),
+      logfile_(NULL),
+      logfile_number_(0),
+      log_(NULL),
+      tmp_batch_(new WriteBatch),
+      bg_compaction_scheduled_(false),
+      manual_compaction_(NULL) {
+  mem_->Ref();
+  has_imm_.Release_Store(NULL);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  const int table_cache_size = options.max_open_files - 10;
+  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+  versions_ = new VersionSet(dbname_, &options_, table_cache_,
+                             &internal_comparator_);
+}
+
+DBImpl::~DBImpl() {
+  // Wait for background work to finish
+  mutex_.Lock();
+  shutting_down_.Release_Store(this);  // Any non-NULL value is ok
+  while (bg_compaction_scheduled_) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
+
+  if (db_lock_ != NULL) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  delete versions_;
+  if (mem_ != NULL) mem_->Unref();
+  if (imm_ != NULL) imm_->Unref();
+  delete tmp_batch_;
+  delete log_;
+  delete logfile_;
+  delete table_cache_;
+
+  if (owns_info_log_) {
+    delete options_.info_log;
+  }
+  if (owns_cache_) {
+    delete options_.block_cache;
+  }
+}
+
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetComparatorName(user_comparator()->Name());
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  WritableFile* file;
+  Status s = env_->NewWritableFile(manifest, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  {
+    log::Writer log(file);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = file->Close();
+    }
+  }
+  delete file;
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1);
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // No change needed
+  } else {
+    Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  // Make a set of all of the live files
+  std::set<uint64_t> live = pending_outputs_;
+  versions_->AddLiveFiles(&live);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);  // Ignoring errors on purpose
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    if (ParseFileName(filenames[i], &number, &type)) {
+      bool keep = true;
+      switch (type) {
+        case kLogFile:
+          keep = ((number >= versions_->LogNumber()) ||
+                  (number == versions_->PrevLogNumber()));
+          break;
+        case kDescriptorFile:
+          // Keep my manifest file, and any newer incarnations'
+          // (in case there is a race that allows other incarnations)
+          keep = (number >= versions_->ManifestFileNumber());
+          break;
+        case kTableFile:
+          keep = (live.find(number) != live.end());
+          break;
+        case kTempFile:
+          // Any temp files that are currently being written to must
+          // be recorded in pending_outputs_, which is inserted into "live"
+          keep = (live.find(number) != live.end());
+          break;
+        case kCurrentFile:
+        case kDBLockFile:
+        case kInfoLogFile:
+          keep = true;
+          break;
+      }
+
+      if (!keep) {
+        if (type == kTableFile) {
+          table_cache_->Evict(number);
+        }
+        Log(options_.info_log, "Delete type=%d #%lld\n",
+            int(type),
+            static_cast<unsigned long long>(number));
+        env_->DeleteFile(dbname_ + "/" + filenames[i]);
+      }
+    }
+  }
+}
+
+Status DBImpl::Recover(VersionEdit* edit) {
+  mutex_.AssertHeld();
+
+  // Ignore error from CreateDir since the creation of the DB is
+  // committed only when the descriptor is created, and this directory
+  // may already exist from a previous failed creation attempt.
+  env_->CreateDir(dbname_);
+  assert(db_lock_ == NULL);
+  Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!env_->FileExists(CurrentFileName(dbname_))) {
+    if (options_.create_if_missing) {
+      s = NewDB();
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      return Status::InvalidArgument(
+          dbname_, "does not exist (create_if_missing is false)");
+    }
+  } else {
+    if (options_.error_if_exists) {
+      return Status::InvalidArgument(
+          dbname_, "exists (error_if_exists is true)");
+    }
+  }
+
+  s = versions_->Recover();
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of leveldb.
+    const uint64_t min_log = versions_->LogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(dbname_, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+    uint64_t number;
+    FileType type;
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)
+          && type == kLogFile
+          && ((number >= min_log) || (number == prev_log))) {
+        logs.push_back(number);
+      }
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (size_t i = 0; i < logs.size(); i++) {
+      s = RecoverLogFile(logs[i], edit, &max_sequence);
+
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
+    }
+
+    if (s.ok()) {
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::RecoverLogFile(uint64_t log_number,
+                              VersionEdit* edit,
+                              SequenceNumber* max_sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // NULL if options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->status == NULL ? "(ignoring error) " : ""),
+          fname, static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status != NULL && this->status->ok()) *this->status = s;
+    }
+  };
+
+  mutex_.AssertHeld();
+
+  // Open the log file
+  std::string fname = LogFileName(dbname_, log_number);
+  SequentialFile* file;
+  Status status = env_->NewSequentialFile(fname, &file);
+  if (!status.ok()) {
+    MaybeIgnoreError(&status);
+    return status;
+  }
+
+  // Create the log reader.
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log;
+  reporter.fname = fname.c_str();
+  reporter.status = (options_.paranoid_checks ? &status : NULL);
+  // We intentionally make log::Reader do checksumming even if
+  // paranoid_checks==false so that corruptions cause entire commits
+  // to be skipped instead of propagating bad information (like overly
+  // large sequence numbers).
+  log::Reader reader(file, &reporter, true/*checksum*/,
+                     0/*initial_offset*/);
+  Log(options_.info_log, "Recovering log #%llu",
+      (unsigned long long) log_number);
+
+  // Read all the records and add to a memtable
+  std::string scratch;
+  Slice record;
+  WriteBatch batch;
+  MemTable* mem = NULL;
+  while (reader.ReadRecord(&record, &scratch) &&
+         status.ok()) {
+    if (record.size() < 12) {
+      reporter.Corruption(
+          record.size(), Status::Corruption("log record too small"));
+      continue;
+    }
+    WriteBatchInternal::SetContents(&batch, record);
+
+    if (mem == NULL) {
+      mem = new MemTable(internal_comparator_);
+      mem->Ref();
+    }
+    status = WriteBatchInternal::InsertInto(&batch, mem);
+    MaybeIgnoreError(&status);
+    if (!status.ok()) {
+      break;
+    }
+    const SequenceNumber last_seq =
+        WriteBatchInternal::Sequence(&batch) +
+        WriteBatchInternal::Count(&batch) - 1;
+    if (last_seq > *max_sequence) {
+      *max_sequence = last_seq;
+    }
+
+    if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+      status = WriteLevel0Table(mem, edit, NULL);
+      if (!status.ok()) {
+        // Reflect errors immediately so that conditions like full
+        // file-systems cause the DB::Open() to fail.
+        break;
+      }
+      mem->Unref();
+      mem = NULL;
+    }
+  }
+
+  if (status.ok() && mem != NULL) {
+    status = WriteLevel0Table(mem, edit, NULL);
+    // Reflect errors immediately so that conditions like full
+    // file-systems cause the DB::Open() to fail.
+  }
+
+  if (mem != NULL) mem->Unref();
+  delete file;
+  return status;
+}
+
+Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
+                                Version* base) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  pending_outputs_.insert(meta.number);
+  Iterator* iter = mem->NewIterator();
+  Log(options_.info_log, "Level-0 table #%llu: started",
+      (unsigned long long) meta.number);
+
+  Status s;
+  {
+    mutex_.Unlock();
+    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+    mutex_.Lock();
+  }
+
+  Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
+      (unsigned long long) meta.number,
+      (unsigned long long) meta.file_size,
+      s.ToString().c_str());
+  delete iter;
+  pending_outputs_.erase(meta.number);
+
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    if (base != NULL) {
+      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+    }
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest);
+  }
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  stats_[level].Add(stats);
+  return s;
+}
+
+Status DBImpl::CompactMemTable() {
+  mutex_.AssertHeld();
+  assert(imm_ != NULL);
+
+  // Save the contents of the memtable as a new Table
+  VersionEdit edit;
+  Version* base = versions_->current();
+  base->Ref();
+  Status s = WriteLevel0Table(imm_, &edit, base);
+  base->Unref();
+
+  if (s.ok() && shutting_down_.Acquire_Load()) {
+    s = Status::IOError("Deleting DB during memtable compaction");
+  }
+
+  // Replace immutable memtable with the generated Table
+  if (s.ok()) {
+    edit.SetPrevLogNumber(0);
+    edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
+    s = versions_->LogAndApply(&edit, &mutex_);
+  }
+
+  if (s.ok()) {
+    // Commit to the new state
+    imm_->Unref();
+    imm_ = NULL;
+    has_imm_.Release_Store(NULL);
+    DeleteObsoleteFiles();
+  }
+
+  return s;
+}
+
+void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
+  int max_level_with_files = 1;
+  {
+    MutexLock l(&mutex_);
+    Version* base = versions_->current();
+    for (int level = 1; level < config::kNumLevels; level++) {
+      if (base->OverlapInLevel(level, begin, end)) {
+        max_level_with_files = level;
+      }
+    }
+  }
+  TEST_CompactMemTable();  // TODO(sanjay): Skip if memtable does not overlap
+  for (int level = 0; level < max_level_with_files; level++) {
+    TEST_CompactRange(level, begin, end);
+  }
+}
+
+void DBImpl::TEST_CompactRange(int level, const Slice* begin, const Slice* end) {
+  assert(level >= 0);
+  assert(level + 1 < config::kNumLevels);
+
+  InternalKey begin_storage, end_storage;
+
+  ManualCompaction manual;
+  manual.level = level;
+  manual.done = false;
+  if (begin == NULL) {
+    manual.begin = NULL;
+  } else {
+    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    manual.begin = &begin_storage;
+  }
+  if (end == NULL) {
+    manual.end = NULL;
+  } else {
+    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    manual.end = &end_storage;
+  }
+
+  MutexLock l(&mutex_);
+  while (!manual.done) {
+    while (manual_compaction_ != NULL) {
+      bg_cv_.Wait();
+    }
+    manual_compaction_ = &manual;
+    MaybeScheduleCompaction();
+    while (manual_compaction_ == &manual) {
+      bg_cv_.Wait();
+    }
+  }
+}
+
+Status DBImpl::TEST_CompactMemTable() {
+  // NULL batch means just wait for earlier writes to be done
+  Status s = Write(WriteOptions(), NULL);
+  if (s.ok()) {
+    // Wait until the compaction completes
+    MutexLock l(&mutex_);
+    while (imm_ != NULL && bg_error_.ok()) {
+      bg_cv_.Wait();
+    }
+    if (imm_ != NULL) {
+      s = bg_error_;
+    }
+  }
+  return s;
+}
+
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    // Already scheduled
+  } else if (shutting_down_.Acquire_Load()) {
+    // DB is being deleted; no more background compactions
+  } else if (imm_ == NULL &&
+             manual_compaction_ == NULL &&
+             !versions_->NeedsCompaction()) {
+    // No work to be done
+  } else {
+    bg_compaction_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGWork, this);
+  }
+}
+
+void DBImpl::BGWork(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+void DBImpl::BackgroundCall() {
+  MutexLock l(&mutex_);
+  assert(bg_compaction_scheduled_);
+  if (!shutting_down_.Acquire_Load()) {
+    BackgroundCompaction();
+  }
+  bg_compaction_scheduled_ = false;
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  MaybeScheduleCompaction();
+  bg_cv_.SignalAll();
+}
+
+void DBImpl::BackgroundCompaction() {
+  mutex_.AssertHeld();
+
+  if (imm_ != NULL) {
+    CompactMemTable();
+    return;
+  }
+
+  Compaction* c;
+  bool is_manual = (manual_compaction_ != NULL);
+  InternalKey manual_end;
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    c = versions_->CompactRange(m->level, m->begin, m->end);
+    m->done = (c == NULL);
+    if (c != NULL) {
+      manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
+    }
+    Log(options_.info_log,
+        "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
+        m->level,
+        (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+        (m->end ? m->end->DebugString().c_str() : "(end)"),
+        (m->done ? "(end)" : manual_end.DebugString().c_str()));
+  } else {
+    c = versions_->PickCompaction();
+  }
+
+  Status status;
+  if (c == NULL) {
+    // Nothing to do
+  } else if (!is_manual && c->IsTrivialMove()) {
+    // Move file to next level
+    assert(c->num_input_files(0) == 1);
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest);
+    status = versions_->LogAndApply(c->edit(), &mutex_);
+    VersionSet::LevelSummaryStorage tmp;
+    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+        static_cast<unsigned long long>(f->number),
+        c->level() + 1,
+        static_cast<unsigned long long>(f->file_size),
+        status.ToString().c_str(),
+        versions_->LevelSummary(&tmp));
+  } else {
+    CompactionState* compact = new CompactionState(c);
+    status = DoCompactionWork(compact);
+    CleanupCompaction(compact);
+    c->ReleaseInputs();
+    DeleteObsoleteFiles();
+  }
+  delete c;
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(options_.info_log,
+        "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    if (!status.ok()) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range.  Update *m
+      // to the range that is left to be compacted.
+      m->tmp_storage = manual_end;
+      m->begin = &m->tmp_storage;
+    }
+    manual_compaction_ = NULL;
+  }
+}
+
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder != NULL) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  } else {
+    assert(compact->outfile == NULL);
+  }
+  delete compact->outfile;
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+  }
+  delete compact;
+}
+
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != NULL);
+  assert(compact->builder == NULL);
+  uint64_t file_number;
+  {
+    mutex_.Lock();
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    CompactionState::Output out;
+    out.number = file_number;
+    out.smallest.Clear();
+    out.largest.Clear();
+    compact->outputs.push_back(out);
+    mutex_.Unlock();
+  }
+
+  // Make the output file
+  std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewWritableFile(fname, &compact->outfile);
+  if (s.ok()) {
+    compact->builder = new TableBuilder(options_, compact->outfile);
+  }
+  return s;
+}
+
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                          Iterator* input) {
+  assert(compact != NULL);
+  assert(compact->outfile != NULL);
+  assert(compact->builder != NULL);
+
+  const uint64_t output_number = compact->current_output()->number;
+  assert(output_number != 0);
+
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact->builder->NumEntries();
+  if (s.ok()) {
+    s = compact->builder->Finish();
+  } else {
+    compact->builder->Abandon();
+  }
+  const uint64_t current_bytes = compact->builder->FileSize();
+  compact->current_output()->file_size = current_bytes;
+  compact->total_bytes += current_bytes;
+  delete compact->builder;
+  compact->builder = NULL;
+
+  // Finish and check for file errors
+  if (s.ok()) {
+    s = compact->outfile->Sync();
+  }
+  if (s.ok()) {
+    s = compact->outfile->Close();
+  }
+  delete compact->outfile;
+  compact->outfile = NULL;
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                               output_number,
+                                               current_bytes);
+    s = iter->status();
+    delete iter;
+    if (s.ok()) {
+      Log(options_.info_log,
+          "Generated table #%llu: %lld keys, %lld bytes",
+          (unsigned long long) output_number,
+          (unsigned long long) current_entries,
+          (unsigned long long) current_bytes);
+    }
+  }
+  return s;
+}
+
+
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+  Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  const int level = compact->compaction->level();
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        level + 1,
+        out.number, out.file_size, out.smallest, out.largest);
+  }
+  return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact) {
+  const uint64_t start_micros = env_->NowMicros();
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+
+  Log(options_.info_log, "Compacting %d@%d + %d@%d files",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1);
+
+  assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+  assert(compact->builder == NULL);
+  assert(compact->outfile == NULL);
+  if (snapshots_.empty()) {
+    compact->smallest_snapshot = versions_->LastSequence();
+  } else {
+    compact->smallest_snapshot = snapshots_.oldest()->number_;
+  }
+
+  // Release mutex while we're actually doing the compaction work
+  mutex_.Unlock();
+
+  Iterator* input = versions_->MakeInputIterator(compact->compaction);
+  input->SeekToFirst();
+  Status status;
+  ParsedInternalKey ikey;
+  std::string current_user_key;
+  bool has_current_user_key = false;
+  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+    // Prioritize immutable compaction work
+    if (has_imm_.NoBarrier_Load() != NULL) {
+      const uint64_t imm_start = env_->NowMicros();
+      mutex_.Lock();
+      if (imm_ != NULL) {
+        CompactMemTable();
+        bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+      }
+      mutex_.Unlock();
+      imm_micros += (env_->NowMicros() - imm_start);
+    }
+
+    Slice key = input->key();
+    if (compact->compaction->ShouldStopBefore(key) &&
+        compact->builder != NULL) {
+      status = FinishCompactionOutputFile(compact, input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Do not hide error keys
+      current_user_key.clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          user_comparator()->Compare(ikey.user_key,
+                                     Slice(current_user_key)) != 0) {
+        // First occurrence of this user key
+        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+      }
+
+      if (last_sequence_for_key <= compact->smallest_snapshot) {
+        // Hidden by a newer entry for same user key
+        drop = true;    // (A)
+      } else if (ikey.type == kTypeDeletion &&
+                 ikey.sequence <= compact->smallest_snapshot &&
+                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
+        drop = true;
+      }
+
+      last_sequence_for_key = ikey.sequence;
+    }
+#if 0
+    Log(options_.info_log,
+        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+        "%d smallest_snapshot: %d",
+        ikey.user_key.ToString().c_str(),
+        (int)ikey.sequence, ikey.type, kTypeValue, drop,
+        compact->compaction->IsBaseLevelForKey(ikey.user_key),
+        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+    if (!drop) {
+      // Open output file if necessary
+      if (compact->builder == NULL) {
+        status = OpenCompactionOutputFile(compact);
+        if (!status.ok()) {
+          break;
+        }
+      }
+      if (compact->builder->NumEntries() == 0) {
+        compact->current_output()->smallest.DecodeFrom(key);
+      }
+      compact->current_output()->largest.DecodeFrom(key);
+      compact->builder->Add(key, input->value());
+
+      // Close output file if it is big enough
+      if (compact->builder->FileSize() >=
+          compact->compaction->MaxOutputFileSize()) {
+        status = FinishCompactionOutputFile(compact, input);
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+
+    input->Next();
+  }
+
+  if (status.ok() && shutting_down_.Acquire_Load()) {
+    status = Status::IOError("Deleting DB during compaction");
+  }
+  if (status.ok() && compact->builder != NULL) {
+    status = FinishCompactionOutputFile(compact, input);
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  delete input;
+  input = NULL;
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  for (int which = 0; which < 2; which++) {
+    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+      stats.bytes_read += compact->compaction->input(which, i)->file_size;
+    }
+  }
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+  }
+
+  mutex_.Lock();
+  stats_[compact->compaction->level() + 1].Add(stats);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact);
+  }
+  VersionSet::LevelSummaryStorage tmp;
+  Log(options_.info_log,
+      "compacted to: %s", versions_->LevelSummary(&tmp));
+  return status;
+}
+
+namespace {
+struct IterState {
+  port::Mutex* mu;
+  Version* version;
+  MemTable* mem;
+  MemTable* imm;
+};
+
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* state = reinterpret_cast<IterState*>(arg1);
+  state->mu->Lock();
+  state->mem->Unref();
+  if (state->imm != NULL) state->imm->Unref();
+  state->version->Unref();
+  state->mu->Unlock();
+  delete state;
+}
+}  // namespace
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      SequenceNumber* latest_snapshot) {
+  IterState* cleanup = new IterState;
+  mutex_.Lock();
+  *latest_snapshot = versions_->LastSequence();
+
+  // Collect together all needed child iterators
+  std::vector<Iterator*> list;
+  list.push_back(mem_->NewIterator());
+  mem_->Ref();
+  if (imm_ != NULL) {
+    list.push_back(imm_->NewIterator());
+    imm_->Ref();
+  }
+  versions_->current()->AddIterators(options, &list);
+  Iterator* internal_iter =
+      NewMergingIterator(&internal_comparator_, &list[0], list.size());
+  versions_->current()->Ref();
+
+  cleanup->mu = &mutex_;
+  cleanup->mem = mem_;
+  cleanup->imm = imm_;
+  cleanup->version = versions_->current();
+  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
+
+  mutex_.Unlock();
+  return internal_iter;
+}
+
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber ignored;
+  return NewInternalIterator(ReadOptions(), &ignored);
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock l(&mutex_);
+  return versions_->MaxNextLevelOverlappingBytes();
+}
+
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  Status s;
+  MutexLock l(&mutex_);
+  SequenceNumber snapshot;
+  if (options.snapshot != NULL) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  MemTable* mem = mem_;
+  MemTable* imm = imm_;
+  Version* current = versions_->current();
+  mem->Ref();
+  if (imm != NULL) imm->Ref();
+  current->Ref();
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Unlock while reading from files and memtables
+  {
+    mutex_.Unlock();
+    // First look in the memtable, then in the immutable memtable (if any).
+    LookupKey lkey(key, snapshot);
+    if (mem->Get(lkey, value, &s)) {
+      // Done
+    } else if (imm != NULL && imm->Get(lkey, value, &s)) {
+      // Done
+    } else {
+      s = current->Get(options, lkey, value, &stats);
+      have_stat_update = true;
+    }
+    mutex_.Lock();
+  }
+
+  if (have_stat_update && current->UpdateStats(stats)) {
+    MaybeScheduleCompaction();
+  }
+  mem->Unref();
+  if (imm != NULL) imm->Unref();
+  current->Unref();
+  return s;
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  return NewDBIterator(
+      &dbname_, env_, user_comparator(), internal_iter,
+      (options.snapshot != NULL
+       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+       : latest_snapshot));
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  Writer w(&mutex_);
+  w.batch = my_batch;
+  w.sync = options.sync;
+  w.done = false;
+
+  MutexLock l(&mutex_);
+  writers_.push_back(&w);
+  while (!w.done && &w != writers_.front()) {
+    w.cv.Wait();
+  }
+  if (w.done) {
+    return w.status;
+  }
+
+  // May temporarily unlock and wait.
+  Status status = MakeRoomForWrite(my_batch == NULL);
+  uint64_t last_sequence = versions_->LastSequence();
+  Writer* last_writer = &w;
+  if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
+    WriteBatch* updates = BuildBatchGroup(&last_writer);
+    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
+    last_sequence += WriteBatchInternal::Count(updates);
+
+    // Add to log and apply to memtable.  We can release the lock
+    // during this phase since &w is currently responsible for logging
+    // and protects against concurrent loggers and concurrent writes
+    // into mem_.
+    {
+      mutex_.Unlock();
+      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+      if (status.ok() && options.sync) {
+        status = logfile_->Sync();
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(updates, mem_);
+      }
+      mutex_.Lock();
+    }
+    if (updates == tmp_batch_) tmp_batch_->Clear();
+
+    versions_->SetLastSequence(last_sequence);
+  }
+
+  while (true) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != &w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+
+  return status;
+}
+
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-NULL batch
+WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  WriteBatch* result = first->batch;
+  assert(result != NULL);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (w->batch != NULL) {
+      size += WriteBatchInternal::ByteSize(w->batch);
+      if (size > max_size) {
+        // Do not make batch too big
+        break;
+      }
+
+      // Append to *result
+      if (result == first->batch) {
+        // Switch to temporary batch instead of disturbing caller's batch
+        result = tmp_batch_;
+        assert(WriteBatchInternal::Count(result) == 0);
+        WriteBatchInternal::Append(result, first->batch);
+      }
+      WriteBatchInternal::Append(result, w->batch);
+    }
+    *last_writer = w;
+  }
+  return result;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(bool force) {
+  mutex_.AssertHeld();
+  assert(!writers_.empty());
+  bool allow_delay = !force;
+  Status s;
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (
+        allow_delay &&
+        versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
+      // We are getting close to hitting a hard limit on the number of
+      // L0 files.  Rather than delaying a single write by several
+      // seconds when we hit the hard limit, start delaying each
+      // individual write by 1ms to reduce latency variance.  Also,
+      // this delay hands over some CPU to the compaction thread in
+      // case it is sharing the same core as the writer.
+      mutex_.Unlock();
+      env_->SleepForMicroseconds(1000);
+      allow_delay = false;  // Do not delay a single write more than once
+      mutex_.Lock();
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      break;
+    } else if (imm_ != NULL) {
+      // We have filled up the current memtable, but the previous
+      // one is still being compacted, so we wait.
+      bg_cv_.Wait();
+    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
+      // There are too many level-0 files.
+      Log(options_.info_log, "waiting...\n");
+      bg_cv_.Wait();
+    } else {
+      // Attempt to switch to a new memtable and trigger compaction of old
+      assert(versions_->PrevLogNumber() == 0);
+      uint64_t new_log_number = versions_->NewFileNumber();
+      WritableFile* lfile = NULL;
+      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+      if (!s.ok()) {
+        break;
+      }
+      delete log_;
+      delete logfile_;
+      logfile_ = lfile;
+      logfile_number_ = new_log_number;
+      log_ = new log::Writer(lfile);
+      imm_ = mem_;
+      has_imm_.Release_Store(imm_);
+      mem_ = new MemTable(internal_comparator_);
+      mem_->Ref();
+      force = false;  // Do not force another compaction if have room
+      MaybeScheduleCompaction();
+    }
+  }
+  return s;
+}
+
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
+  MutexLock l(&mutex_);
+  Slice in = property;
+  Slice prefix("leveldb.");
+  if (!in.starts_with(prefix)) return false;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    in.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+    if (!ok || level >= config::kNumLevels) {
+      return false;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d",
+               versions_->NumLevelFiles(static_cast<int>(level)));
+      *value = buf;
+      return true;
+    }
+  } else if (in == "stats") {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "                               Compactions\n"
+             "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+             "--------------------------------------------------\n"
+             );
+    value->append(buf);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int files = versions_->NumLevelFiles(level);
+      if (stats_[level].micros > 0 || files > 0) {
+        snprintf(
+            buf, sizeof(buf),
+            "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+            level,
+            files,
+            versions_->NumLevelBytes(level) / 1048576.0,
+            stats_[level].micros / 1e6,
+            stats_[level].bytes_read / 1048576.0,
+            stats_[level].bytes_written / 1048576.0);
+        value->append(buf);
+      }
+    }
+    return true;
+  } else if (in == "sstables") {
+    *value = versions_->current()->DebugString();
+    return true;
+  }
+
+  return false;
+}
+
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* v;
+  {
+    MutexLock l(&mutex_);
+    versions_->current()->Ref();
+    v = versions_->current();
+  }
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+    sizes[i] = (limit >= start ? limit - start : 0);
+  }
+
+  {
+    MutexLock l(&mutex_);
+    v->Unref();
+  }
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+  WriteBatch batch;
+  batch.Put(key, value);
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(key);
+  return Write(opt, &batch);
+}
+
+DB::~DB() { }
+
+Status DB::Open(const Options& options, const std::string& dbname,
+                DB** dbptr) {
+  *dbptr = NULL;
+
+  DBImpl* impl = new DBImpl(options, dbname);
+  impl->mutex_.Lock();
+  VersionEdit edit;
+  Status s = impl->Recover(&edit);  // Handles create_if_missing, error_if_exists
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    WritableFile* lfile;
+    s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
+                                     &lfile);
+    if (s.ok()) {
+      edit.SetLogNumber(new_log_number);
+      impl->logfile_ = lfile;
+      impl->logfile_number_ = new_log_number;
+      impl->log_ = new log::Writer(lfile);
+      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+    }
+    if (s.ok()) {
+      impl->DeleteObsoleteFiles();
+      impl->MaybeScheduleCompaction();
+    }
+  }
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    delete impl;
+  }
+  return s;
+}
+
+Snapshot::~Snapshot() {
+}
+
+Status DestroyDB(const std::string& dbname, const Options& options) {
+  Env* env = options.env;
+  std::vector<std::string> filenames;
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+  if (filenames.empty()) {
+    return Status::OK();
+  }
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del = env->DeleteFile(dbname + "/" + filenames[i]);
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+    env->UnlockFile(lock);  // Ignore error since state is already gone
+    env->DeleteFile(lockname);
+    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+  }
+  return result;
+}
+
+}  // namespace leveldb