perobs 2.3.1 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,513 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FlatFile.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2016 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'zlib'
29
+
30
+ require 'perobs/Log'
31
+ require 'perobs/IndexTree'
32
+ require 'perobs/FreeSpaceManager'
33
+
34
+ module PEROBS
35
+
36
+ # The FlatFile class manages the storage file of the FlatFileDB. It contains
37
+ # a sequence of blobs. Each blob consists of a 21 byte header and the actual
38
+ # blob data bytes. The header has the following structure:
39
+ #
40
+ # 1 Byte: Mark byte.
41
+ # Bit 0: 0 deleted entry, 1 valid entry
42
+ # Bit 1: 0 unmarked, 1 marked
43
+ # Bit 2 - 7: reserved, must be 0
44
+ # 8 bytes: Length of the data blob in bytes
45
+ # 8 bytes: ID of the value in the data blob
46
+ # 4 bytes: CRC32 checksum of the data blob
47
+ #
48
+ # If the bit 0 of the mark byte is 0, only the length is valid. The blob is
49
+ # empty. Only if bit 0 is set is the entry valid.
50
+ class FlatFile
51
+
52
# Value object holding the four fields of a decoded blob header, in the
# order produced by unpacking with BLOB_HEADER_FORMAT: the mark byte, the
# length of the data in bytes, the object ID and the CRC32 checksum.
# The Struct is assigned directly instead of being subclassed, which avoids
# the extra anonymous superclass created by 'class X < Struct.new(...)'.
Header = Struct.new(:mark, :length, :id, :crc)
55
+
56
# The 'pack()' format of the header: one unsigned 8-bit mark byte ('C'),
# two unsigned 64-bit integers for the length and the ID ('Q') and one
# unsigned 32-bit integer for the CRC ('L'). Frozen so that the shared
# format string cannot be modified by accident.
BLOB_HEADER_FORMAT = 'CQQL'.freeze
# The length of the packed header in bytes (1 + 8 + 8 + 4).
BLOB_HEADER_LENGTH = 21
60
+
61
# Set up a FlatFile for the database that lives in the given directory.
# The blob file itself is not opened here (@f stays nil); only the index
# and the free space manager objects are created for the same directory.
# @param dir [String] Directory path for the data base file
def initialize(dir)
  @f = nil
  @db_dir = dir
  @index = IndexTree.new(dir)
  @space_list = FreeSpaceManager.new(dir)
end
69
+
70
# Open the flat file for reading and writing. An existing 'database.blobs'
# file in the database directory is opened in place, otherwise a new empty
# file is created. Afterwards the index and the free space list are opened.
#
# File.open reports most failures (missing directory, missing permissions,
# ...) as SystemCallError (Errno::*) and not as IOError, so both families
# are routed to the fatal log.
def open
  file_name = File.join(@db_dir, 'database.blobs')
  begin
    if File.exist?(file_name)
      @f = File.open(file_name, 'rb+')
    else
      PEROBS.log.info 'New database.blobs file created'
      @f = File.open(file_name, 'wb+')
    end
  rescue IOError, SystemCallError => e
    PEROBS.log.fatal "Cannot open flat file database #{file_name}: " +
      e.message
  end
  @index.open
  @space_list.open
end
87
+
88
# Close the flat file. The free space list and the index are closed first,
# then the blob file is flushed and closed and the handle is dropped. This
# method must be called to ensure that all data is really written into the
# filesystem.
def close
  [ @space_list, @index ].each(&:close)
  @f.flush
  @f.close
  @f = nil
end
97
+
98
# Force outstanding data to be written to the filesystem by flushing the
# blob file. Write failures surface from the operating system as
# SystemCallError (e. g. Errno::ENOSPC), so they are reported through the
# fatal log just like IOError.
def sync
  @f.flush
rescue IOError, SystemCallError => e
  PEROBS.log.fatal "Cannot sync flat file database: #{e.message}"
end
106
+
107
# Look up the blob for the given ID and delete it if it exists.
# @param id [Integer] ID of the object to be deleted
# @return [Boolean] True if object was deleted, false otherwise
def delete_obj_by_id(id)
  pos = find_obj_addr_by_id(id)
  return false unless pos

  delete_obj_by_address(pos, id)
  true
end
118
+
119
# Delete the blob that is stored at the specified address. The ID is
# removed from the index, the mark byte of the blob header is overwritten
# with 0 and the space of the blob is handed to the free space list.
# @param addr [Integer] Address of the blob to delete
# @param id [Integer] ID of the blob to delete
def delete_obj_by_address(addr, id)
  @index.delete_value(id)
  header = read_blob_header(addr, id)
  # A mark byte of 0 flags the entry as deleted.
  cleared_mark = [ 0 ].pack('C')
  begin
    @f.seek(addr)
    @f.write(cleared_mark)
    @f.flush
    @space_list.add_space(addr, header.length)
  rescue => e
    PEROBS.log.fatal "Cannot erase blob for ID #{header.id}: #{e.message}"
  end
end
134
+
135
# Delete all blobs that are valid (bit 0 set) but not marked (bit 1
# clear) and defragment the file afterwards.
# @return [Array<Integer>] IDs of the deleted blobs
def delete_unmarked_objects
  removed_ids = []
  each_blob_header do |pos, mark, _length, blob_id, _crc|
    # Only the two lowest bits matter: valid and unmarked means exactly 1.
    next unless mark & 3 == 1

    delete_obj_by_address(pos, blob_id)
    removed_ids << blob_id
  end
  defragmentize

  removed_ids
end
148
+
149
# Write the given object into the file. This method assumes that no other
# entry with the given ID exists already in the file.
#
# The blob is stored in a free space obtained from find_free_blob() or, if
# none fits, appended at the end of the file. When a reused space is larger
# than the object, a header for the remaining empty space is written right
# behind the object and that space is registered with the free space list.
#
# All size computations use the size of the object in bytes
# (String#bytesize). String#length counts characters and would produce a
# too small header length for strings with multi-byte characters, which
# breaks the sequence of headers in the file.
# @param id [Integer] ID of the object
# @param raw_obj [String] Raw object as String
# @return [Integer] position of the written blob in the blob file
def write_obj_by_id(id, raw_obj)
  raw_size = raw_obj.bytesize
  addr, length = find_free_blob(raw_size)
  begin
    if length != -1
      # Just a safeguard so we don't overwrite current data.
      header = read_blob_header(addr)
      if header.length != length
        PEROBS.log.fatal "Length in free list (#{length}) and header " +
          "(#{header.length}) don't match."
      end
      if raw_size > header.length
        PEROBS.log.fatal "Object (#{raw_size}) is longer than " +
          "blob space (#{header.length})."
      end
      if header.mark != 0
        PEROBS.log.fatal "Mark (#{header.mark}) is not 0."
      end
    end
    @f.seek(addr)
    @f.write([ 1, raw_size, id, checksum(raw_obj) ].
             pack(BLOB_HEADER_FORMAT))
    @f.write(raw_obj)
    if length != -1 && raw_size < length
      # The new object was not appended and it did not completely fill the
      # free space. So we have to write a new header to mark the remaining
      # empty space.
      unless length - raw_size >= BLOB_HEADER_LENGTH
        PEROBS.log.fatal "Not enough space to append the empty space " +
          "header (space: #{length} bytes, object: #{raw_size} " +
          "bytes)."
      end
      space_address = @f.pos
      space_length = length - BLOB_HEADER_LENGTH - raw_size
      @f.write([ 0, space_length, 0, 0 ].pack(BLOB_HEADER_FORMAT))
      # Register the new space with the space list.
      @space_list.add_space(space_address, space_length) if space_length > 0
    end
    @f.flush
    @index.put_value(id, addr)
  rescue IOError, SystemCallError => e
    # Write errors such as a full disk are raised as SystemCallError.
    PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
      e.message
  end

  addr
end
200
+
201
# Ask the index for the address of the blob that belongs to the given ID.
# @param id [Integer] ID of the object
# @return [Integer] Offset in the flat file or nil if not found
def find_obj_addr_by_id(id)
  @index.get_value(id)
end
207
+
208
# Read the raw data of the object with the given ID.
# @param id [Integer] ID of the object
# @return [String or nil] Raw object data if found, otherwise nil
def read_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  return nil unless addr

  read_obj_by_address(addr, id)
end
218
+
219
# Read the object at the specified address. The header at that address
# must carry the expected ID and the data must match the CRC stored in
# the header; violations are reported via the fatal log.
# @param addr [Integer] Offset in the flat file
# @param id [Integer] ID of the data blob
# @return [String] Raw object data
def read_obj_by_address(addr, id)
  header = read_blob_header(addr, id)
  unless header.id == id
    PEROBS.log.fatal "Database index corrupted: Index for object " \
      "#{id} points to object with ID #{header.id}"
  end
  begin
    # The data starts right behind the header.
    @f.seek(addr + BLOB_HEADER_LENGTH)
    data = @f.read(header.length)
    unless checksum(data) == header.crc
      PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
    end
    data
  rescue => e
    PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
  end
end
240
+
241
# Set the mark bit of the object with the given ID. IDs that are not in
# the index are silently ignored.
# @param id [Integer] ID of the object
def mark_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  mark_obj_by_address(addr, id) if addr
end
248
+
249
# Set the mark bit (bit 1 of the mark byte) of the blob at the specified
# address. All other bits of the mark byte are kept.
# @param addr [Integer] Offset in the file
# @param id [Integer] ID of the object
def mark_obj_by_address(addr, id)
  header = read_blob_header(addr, id)
  begin
    # The mark byte is the very first byte of the header.
    @f.seek(addr)
    marked_byte = [ header.mark | 2 ].pack('C')
    @f.write(marked_byte)
    @f.flush
  rescue => e
    PEROBS.log.fatal "Marking of FlatFile blob with ID #{id} " \
      "failed: #{e.message}"
  end
end
263
+
264
# Tell whether the mark bit (bit 1 of the mark byte) is set for the object
# with the given ID. Unknown IDs count as not marked.
# @param id [Integer] ID of the object
# @return [Boolean] true if the object exists and is marked
def is_marked_by_id?(id)
  addr = find_obj_addr_by_id(id)
  return false unless addr

  (read_blob_header(addr, id).mark & 2) == 2
end
274
+
275
# Clear the mark bit of every valid blob in the file. Deleted entries are
# left untouched.
def clear_all_marks
  each_blob_header do |pos, mark, _length, blob_id, _crc|
    next unless mark & 1 == 1

    begin
      @f.seek(pos)
      # Keep every bit of the mark byte except bit 1, the mark bit.
      unmarked_byte = [ mark & 0b11111101 ].pack('C')
      @f.write(unmarked_byte)
      @f.flush
    rescue => e
      PEROBS.log.fatal "Unmarking of FlatFile blob with ID #{blob_id} " \
        "failed: #{e.message}"
    end
  end
end
290
+
291
# Eliminate all the holes in the file. This is an in-place
# implementation. No additional space will be needed on the file system.
#
# The file is scanned front to back. 'distance' accumulates the size of all
# deleted entries seen so far. Every valid entry found after the first hole
# is moved 'distance' bytes towards the start of the file and the index is
# updated. At the end the file is shortened by 'distance' bytes and the
# free space list is cleared.
def defragmentize
  distance = 0
  started_at = Time.now
  PEROBS.log.debug "Defragmenting FlatFile"
  each_blob_header do |pos, mark, length, blob_id, _crc|
    # Total size of the current entry
    entry_bytes = BLOB_HEADER_LENGTH + length

    unless mark & 1 == 1
      # A deleted entry. Everything behind it moves this much further.
      distance += entry_bytes
      next
    end

    # A valid entry only has to move if there is a hole in front of it.
    next unless distance > 0

    begin
      new_pos = pos - distance
      # Read current entry into a buffer
      @f.seek(pos)
      entry = @f.read(entry_bytes)
      # Write the buffer right after the end of the previous entry.
      @f.seek(new_pos)
      @f.write(entry)
      # Update the index with the new position
      @index.put_value(blob_id, new_pos)
      # Mark the space between the relocated current entry and the
      # next valid entry as deleted space.
      gap_header = [ 0, distance - BLOB_HEADER_LENGTH, 0, 0 ].
                   pack(BLOB_HEADER_FORMAT)
      @f.write(gap_header)
      @f.flush
    rescue => e
      PEROBS.log.fatal "Error while moving blob for ID #{blob_id}: " \
        "#{e.message}"
    end
  end
  PEROBS.log.debug "FlatFile defragmented in #{Time.now - started_at} seconds"
  PEROBS.log.debug "#{distance} bytes or " \
    "#{'%.1f' % (distance.to_f / @f.size * 100.0)}% reclaimed"

  # All holes have been collected at the end of the file. Cut them off.
  @f.flush
  @f.truncate(@f.size - distance)
  @f.flush
  @space_list.clear

  sync
end
338
+
339
# Check the consistency of the blob file, the index and the free space
# list. Nothing is done if the file is not open.
#
# First the data of every valid blob is read and compared against the CRC
# in its header. Then the index and the free space list are checked and
# cross-checked against the blob file.
# NOTE(review): if the index checks raise PEROBS::FatalError, the index and
# the space list are regenerated even when repair is false - confirm that
# this is intended.
# @param repair [Boolean] If true, blobs with a checksum failure are
#        deleted and a broken index or space list is regenerated. If
#        false, a checksum failure is reported via the fatal log.
def check(repair = false)
  return unless @f

  # First check the database blob file. Each entry should be readable and
  # correct.
  each_blob_header do |pos, mark, length, blob_id, crc|
    # Deleted entries carry no data that could be checked.
    next unless mark & 1 == 1

    begin
      @f.seek(pos + BLOB_HEADER_LENGTH)
      buf = @f.read(length)
      if crc && checksum(buf) != crc
        # The messages must use the block variable blob_id. This method has
        # no 'id' variable, so referencing it raised a NameError instead of
        # reporting the failure and the blob was never deleted.
        if repair
          PEROBS.log.error "Checksum failure while checking blob " +
            "with ID #{blob_id}. Deleting object."
          delete_obj_by_address(pos, blob_id)
        else
          PEROBS.log.fatal "Checksum failure while checking blob " +
            "with ID #{blob_id}"
        end
      end
    rescue => e
      PEROBS.log.fatal "Check of blob with ID #{blob_id} failed: " +
        e.message
    end
  end

  # Now we check the index data. It must be correct and the entries must
  # match the blob file. All entries in the index must be in the blob file
  # and vice versa.
  begin
    unless @index.check(self) && @space_list.check(self) &&
           cross_check_entries
      return unless repair

      regenerate_index_and_spaces
    end
  rescue PEROBS::FatalError
    regenerate_index_and_spaces
  end

  sync
end
383
+
384
# Throw away the content of the index tree and the free space list and
# rebuild both from the headers found in the FlatFile. Entries with a mark
# byte of 0 become free spaces (unless they are empty), all other entries
# are put into the index.
def regenerate_index_and_spaces
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
  @index.clear
  @space_list.clear

  each_blob_header do |pos, mark, length, id, _crc|
    if mark != 0
      @index.put_value(id, pos)
    elsif length > 0
      @space_list.add_space(pos, length)
    end
  end
end
399
+
400
# Check if the header at the given address announces a blob of exactly the
# given size. Only the length field of the header is compared.
# @param address [Integer] Offset of the header in the flat file
# @param size [Integer] Expected length of the blob data in bytes
# @return [Boolean] true if the header length equals size
def has_space?(address, size)
  read_blob_header(address).length == size
end
404
+
405
# Check if the header at the given address carries the given ID.
# @param id [Integer] Expected ID
# @param address [Integer] Offset of the header in the flat file
# @return [Boolean] true if the header ID equals id
def has_id_at?(id, address)
  read_blob_header(address).id == id
end
409
+
410
# Render all entries of the flat file as a human readable String. Each
# entry lists the header fields; entries with a non-zero mark byte also
# include their data.
# @return [String] Textual dump of the file
def inspect
  dump = '['
  each_blob_header do |pos, mark, length, blob_id, crc|
    dump << "{ :pos => #{pos}, :mark => #{mark}, " \
            ":length => #{length}, :id => #{blob_id}, :crc => #{crc}"
    # The file position is right behind the header here, so the next
    # 'length' bytes are the data of this entry.
    dump << ", :value => #{@f.read(length)}" if mark != 0
    dump << " }\n"
  end
  dump + ']'
end
422
+
423
+
424
+
425
+ private
426
+
427
# Read and decode the blob header stored at the given address. Read
# errors, incomplete headers and - if an ID is given - a header with a
# different ID are reported via the fatal log.
# @param addr [Integer] Offset of the header in the flat file
# @param id [Integer or nil] ID the header is expected to carry, nil to
#        skip the comparison
# @return [Header] The decoded header
def read_blob_header(addr, id = nil)
  raw = nil
  begin
    @f.seek(addr)
    raw = @f.read(BLOB_HEADER_LENGTH)
  rescue => e
    PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
  end
  unless raw && raw.length == BLOB_HEADER_LENGTH
    id_note = id ? "for ID #{id} " : ''
    PEROBS.log.fatal "Cannot read blob header #{id_note}at address #{addr}"
  end

  header = Header.new(*raw.unpack(BLOB_HEADER_FORMAT))
  if id && header.id != id
    PEROBS.log.fatal "Mismatch between FlatFile index and blob file " \
      "found for entry with ID #{id}/#{header.id}"
  end

  header
end
448
+
449
# Find a place in the file where a blob with the given number of data
# bytes can be stored. A free space is only usable if it matches the
# requested size exactly or if it is large enough to additionally hold
# the header for the remaining gap.
# @param bytes [Integer] Number of data bytes to store
# @return [Array] Address and size of the free space to use. If no
#         suitable space exists, the address is the end of the file and
#         the size is -1.
def find_free_blob(bytes)
  address, size = @space_list.get_space(bytes)
  # We have not found any suitable space. Return the end of the file.
  return [ @f.size, -1 ] unless address

  exact_fit = size == bytes
  room_for_gap_header = size - BLOB_HEADER_LENGTH >= bytes
  return [ address, size ] if exact_fit || room_for_gap_header

  # Return the found space again. It's too small for the new content plus
  # the gap header.
  @space_list.add_space(address, size)

  # We need a space that is large enough to hold the bytes and the gap
  # header.
  @space_list.get_space(bytes + BLOB_HEADER_LENGTH) || [ @f.size, -1 ]
end
467
+
468
# Compute the CRC32 checksum that is stored in the blob header.
# @param raw_obj [String] Raw object data
# @return [Integer] CRC32 of the data, computed from an initial value of 0
def checksum(raw_obj)
  initial_crc = 0
  Zlib.crc32(raw_obj, initial_crc)
end
471
+
472
# Compare every entry of the blob file with the free space list and the
# index. Non-empty entries with a mark byte of 0 must be known to the
# free space list, all other entries must be listed in the index with
# their actual address. The scan stops at the first mismatch.
# @return [Boolean] true if all entries match, false otherwise
def cross_check_entries
  each_blob_header do |pos, mark, length, blob_id, _crc|
    if mark == 0
      # Empty deleted entries are never registered as free space.
      next unless length > 0
      next if @space_list.has_space?(pos, length)

      PEROBS.log.error "FlatFile has free space " \
        "(addr: #{pos}, len: #{length}) that is not in FreeSpaceManager"
      return false
    end

    next if @index.get_value(blob_id) == pos

    PEROBS.log.error "FlatFile blob at address #{pos} is listed " \
      "in index with address #{@index.get_value(blob_id)}"
    return false
  end

  true
end
493
+
494
# Iterate over all blob headers in the file, from the start of the file to
# the end. The position of the next header is computed from the length
# field of the current one.
#
# A file that ends in the middle of a header is reported via the fatal
# log. Without this check the incomplete header would be unpacked into nil
# fields and crash the caller with a TypeError.
# @yield [pos, mark, length, id, crc] Address of the header in the file
#        followed by the four decoded header fields
def each_blob_header(&block)
  pos = 0
  begin
    @f.seek(0)
    while (buf = @f.read(BLOB_HEADER_LENGTH))
      if buf.bytesize < BLOB_HEADER_LENGTH
        PEROBS.log.fatal "Incomplete blob header at address #{pos}: " +
          "expected #{BLOB_HEADER_LENGTH} bytes but found #{buf.bytesize}"
        # Safeguard in case the fatal log does not raise: never hand a
        # partially decoded header to the caller.
        break
      end
      mark, length, id, crc = buf.unpack(BLOB_HEADER_FORMAT)
      yield(pos, mark, length, id, crc)

      # The block may have moved the file position. Always continue right
      # behind the data of the current entry.
      pos += BLOB_HEADER_LENGTH + length
      @f.seek(pos)
    end
  rescue IOError, SystemCallError => e
    # Seek and read failures of the operating system are raised as
    # SystemCallError, not as IOError.
    PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
  end
end
509
+
510
+ end
511
+
512
+ end
513
+