perobs 2.3.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FlatFile.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2016 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'zlib'
29
+
30
+ require 'perobs/Log'
31
+ require 'perobs/IndexTree'
32
+ require 'perobs/FreeSpaceManager'
33
+
34
+ module PEROBS
35
+
36
+ # The FlatFile class manages the storage file of the FlatFileDB. It contains
37
+ # a sequence of blobs. Each blob consists of a 21 byte header and the actual
38
+ # blob data bytes. The header has the following structure:
39
+ #
40
+ # 1 Byte: Mark byte.
41
+ # Bit 0: 0 deleted entry, 1 valid entry
42
+ # Bit 1: 0 unmarked, 1 marked
43
+ # Bit 2 - 7: reserved, must be 0
44
+ # 8 bytes: Length of the data blob in bytes
45
+ # 8 bytes: ID of the value in the data blob
46
+ # 4 bytes: CRC32 checksum of the data blob
47
+ #
48
+ # If the bit 0 of the mark byte is 0, only the length is valid. The blob is
49
+ # empty. Only if bit 0 is set is the entry valid.
50
+ class FlatFile
51
+
52
# Utility class to hold all the data that is stored in a blob header.
# Members, in on-disk order:
#   mark   - the mark byte
#   length - length of the data blob in bytes
#   id     - ID of the value in the data blob
#   crc    - CRC32 checksum of the data blob
# The Struct is assigned directly instead of being subclassed, which avoids
# an extra anonymous class in the ancestor chain.
Header = Struct.new(:mark, :length, :id, :crc)
55
+
56
# The 'pack()' format of the header: one unsigned 8 bit value (mark byte)
# followed by two unsigned 64 bit values (length, ID) and one unsigned 32 bit
# value (CRC32). Frozen so the shared format string cannot be modified by
# accident.
BLOB_HEADER_FORMAT = 'CQQL'.freeze
# The length of the header in bytes (1 + 8 + 8 + 4).
BLOB_HEADER_LENGTH = 21
60
+
61
# Set up a FlatFile for the database that lives in the given directory. The
# blob file itself is not touched here; only the helper objects for the
# index and the free space list are created.
# @param dir [String] Directory path for the data base file
def initialize(dir)
  @db_dir = dir
  # No file handle yet.
  @f = nil
  @index = IndexTree.new(dir)
  @space_list = FreeSpaceManager.new(dir)
end
69
+
70
# Open the flat file for reading and writing. An existing 'database.blobs'
# file in the database directory is opened as is, otherwise a new one is
# created. Afterwards the index and the free space list are opened.
def open
  file_name = File.join(@db_dir, 'database.blobs')
  begin
    if File.exist?(file_name)
      @f = File.open(file_name, 'rb+')
    else
      PEROBS.log.info 'New database.blobs file created'
      @f = File.open(file_name, 'wb+')
    end
  rescue IOError, SystemCallError => e
    # File.open reports failures such as a missing directory or missing
    # permissions as Errno::* errors. These are SystemCallError, not
    # IOError, so both families must be rescued here.
    PEROBS.log.fatal "Cannot open flat file database #{file_name}: " +
      e.message
  end
  @index.open
  @space_list.open
end
87
+
88
# Close the flat file. This method must be called to ensure that all data
# is really written into the filesystem. Calling it while no file is open
# (never opened, or already closed) is a no-op.
def close
  # The index and the space list are only opened after the blob file was
  # opened successfully, so there is nothing to close without a file handle.
  return unless @f

  @space_list.close
  @index.close
  @f.flush
  @f.close
  @f = nil
end
97
+
98
# Force outstanding data to be written to the filesystem by flushing the
# buffered data of the blob file.
def sync
  begin
    @f.flush
  rescue IOError, SystemCallError => e
    # Write failures during a flush (e. g. a full disk) surface as Errno::*
    # errors, which are SystemCallError and not IOError.
    PEROBS.log.fatal "Cannot sync flat file database: #{e.message}"
  end
end
106
+
107
# Remove the blob that belongs to the given ID, if the index knows it.
# @param id [Integer] ID of the object to be deleted
# @return [Boolean] True if object was deleted, false otherwise
def delete_obj_by_id(id)
  pos = find_obj_addr_by_id(id)
  # Unknown ID: nothing to delete.
  return false unless pos

  delete_obj_by_address(pos, id)
  true
end
118
+
119
# Delete the blob that is stored at the specified address. The blob header
# is read and checked against the given ID first. Only then is the entry
# removed from the index, the mark byte of the blob set to 0 and the blob
# space handed to the free space list.
# @param addr [Integer] Address of the blob to delete
# @param id [Integer] ID of the blob to delete
def delete_obj_by_address(addr, id)
  # Validate before mutating anything: read_blob_header reports a fatal
  # error when the header cannot be read or belongs to another ID. Doing
  # this before the index update keeps the index entry in place when the
  # blob turns out not to be deletable.
  header = read_blob_header(addr, id)
  @index.delete_value(id)
  begin
    @f.seek(addr)
    # A mark byte of 0 flags the entry as deleted.
    @f.write([ 0 ].pack('C'))
    @f.flush
    @space_list.add_space(addr, header.length)
  rescue => e
    PEROBS.log.fatal "Cannot erase blob for ID #{header.id}: #{e.message}"
  end
end
134
+
135
# Delete every valid blob that does not carry the mark bit and compact the
# file afterwards.
# @return [Array<Integer>] IDs of the blobs that were deleted
def delete_unmarked_objects
  removed = []
  each_blob_header do |pos, mark, _length, blob_id, _crc|
    # Bit 0 set and bit 1 clear: a valid entry that was not marked.
    next unless (mark & 3) == 1

    delete_obj_by_address(pos, blob_id)
    removed << blob_id
  end
  defragmentize

  removed
end
148
+
149
# Write the given object into the file. This method assumes that no other
# entry with the given ID exists already in the file. The blob is either
# placed into a suitable free space or appended to the end of the file. If
# a reused space is larger than the object, the remainder is turned into a
# new deleted entry.
# @param id [Integer] ID of the object
# @param raw_obj [String] Raw object as String
# @return [Integer] position of the written blob in the blob file
def write_obj_by_id(id, raw_obj)
  # All sizes in the file are byte counts. String#length counts characters
  # and is smaller than the number of bytes written for multi-byte data.
  obj_bytes = raw_obj.bytesize
  addr, length = find_free_blob(obj_bytes)
  begin
    if length != -1
      # Just a safeguard so we don't overwrite current data.
      header = read_blob_header(addr)
      if header.length != length
        PEROBS.log.fatal "Length in free list (#{length}) and header " +
          "(#{header.length}) don't match."
      end
      if obj_bytes > header.length
        PEROBS.log.fatal "Object (#{obj_bytes}) is longer than " +
          "blob space (#{header.length})."
      end
      if header.mark != 0
        PEROBS.log.fatal "Mark (#{header.mark}) is not 0."
      end
    end
    @f.seek(addr)
    @f.write([ 1, obj_bytes, id, checksum(raw_obj) ].
             pack(BLOB_HEADER_FORMAT))
    @f.write(raw_obj)
    if length != -1 && obj_bytes < length
      # The new object was not appended and it did not completely fill the
      # free space. So we have to write a new header to mark the remaining
      # empty space.
      unless length - obj_bytes >= BLOB_HEADER_LENGTH
        PEROBS.log.fatal "Not enough space to append the empty space " +
          "header (space: #{length} bytes, object: #{obj_bytes} " +
          "bytes)."
      end
      space_address = @f.pos
      space_length = length - BLOB_HEADER_LENGTH - obj_bytes
      @f.write([ 0, space_length, 0, 0 ].pack(BLOB_HEADER_FORMAT))
      # Register the new space with the space list.
      @space_list.add_space(space_address, space_length) if space_length > 0
    end
    @f.flush
    @index.put_value(id, addr)
  rescue IOError, SystemCallError => e
    # Low-level write failures surface as Errno::* (SystemCallError).
    PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
      e.message
  end

  addr
end
200
+
201
# Look up the file offset of the blob that holds the object with the given
# ID. The lookup is answered entirely by the index.
# @param id [Integer] ID of the object
# @return [Integer] Offset in the flat file or nil if not found
def find_obj_addr_by_id(id)
  @index.get_value(id)
end
207
+
208
# Fetch the raw data of the object with the given ID.
# @param id [Integer] ID of the object
# @return [String or nil] Raw object data if found, otherwise nil
def read_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  # An address of 0 is a valid position, only nil means "not found".
  addr ? read_obj_by_address(addr, id) : nil
end
218
+
219
# Read the object at the specified address. The blob header is checked
# against the given ID, the data bytes are read and verified against the
# CRC32 checksum stored in the header.
# @param addr [Integer] Offset in the flat file
# @param id [Integer] ID of the data blob
# @return [String] Raw object data
def read_obj_by_address(addr, id)
  header = read_blob_header(addr, id)
  if header.id != id
    PEROBS.log.fatal "Database index corrupted: Index for object " +
      "#{id} points to object with ID #{header.id}"
  end

  buf = nil
  begin
    @f.seek(addr + BLOB_HEADER_LENGTH)
    buf = @f.read(header.length)
  rescue => e
    PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
  end

  # The consistency checks are done outside of the rescue block above so
  # that an error raised by the logger for a failed check is not caught and
  # reported a second time as a read error.
  if buf.nil? || buf.bytesize != header.length
    PEROBS.log.fatal "Cannot read blob for ID #{id}: blob data is truncated"
  end
  if checksum(buf) != header.crc
    PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
  end

  buf
end
240
+
241
# Set the mark bit of the object with the given ID. IDs that are not in the
# index are ignored.
# @param id [Integer] ID of the object
def mark_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  mark_obj_by_address(addr, id) if addr
end
248
+
249
# Set the mark bit (bit 1 of the mark byte) of the blob stored at the given
# address. The other bits of the mark byte are preserved.
# @param addr [Integer] Offset in the file
# @param id [Integer] ID of the object
def mark_obj_by_address(addr, id)
  current = read_blob_header(addr, id)
  begin
    marked_byte = [ current.mark | 2 ].pack('C')
    # The mark byte is the very first byte of the blob header.
    @f.seek(addr)
    @f.write(marked_byte)
    @f.flush
  rescue => e
    PEROBS.log.fatal "Marking of FlatFile blob with ID #{id} " +
      "failed: #{e.message}"
  end
end
263
+
264
# Tell whether the object with the given ID carries the mark bit. Unknown
# IDs are reported as not marked.
# @param id [Integer] ID of the object
# @return [Boolean] true if the mark bit is set, false otherwise
def is_marked_by_id?(id)
  addr = find_obj_addr_by_id(id)
  return false unless addr

  # Bit 1 of the mark byte is the mark flag.
  (read_blob_header(addr, id).mark & 2) != 0
end
274
+
275
# Clear all marks. Only valid entries that actually carry the mark bit are
# rewritten; entries whose mark bit is already clear would be written back
# unchanged, so they are skipped to avoid a needless write and flush.
def clear_all_marks
  each_blob_header do |pos, mark, _length, blob_id, _crc|
    # Bit 0: valid entry, bit 1: marked. Both must be set.
    next unless (mark & 3) == 3

    begin
      @f.seek(pos)
      # Clear bit 1 and keep all other bits of the mark byte.
      @f.write([ mark & 0b11111101 ].pack('C'))
      @f.flush
    rescue => e
      PEROBS.log.fatal "Unmarking of FlatFile blob with ID #{blob_id} " +
        "failed: #{e.message}"
    end
  end
end
290
+
291
# Eliminate all the holes in the file. This is an in-place
# implementation. No additional space will be needed on the file system.
# Valid entries are moved towards the start of the file over the deleted
# entries that precede them; finally the file is truncated by the total
# size of the deleted entries and the free space list is cleared.
def defragmentize
  # Accumulated size of all deleted entries seen so far. This is also the
  # number of bytes each following valid entry has to move down.
  distance = 0
  started_at = Time.now
  PEROBS.log.debug "Defragmenting FlatFile"
  each_blob_header do |pos, mark, length, blob_id, _crc|
    # Total size of the current entry
    entry_bytes = BLOB_HEADER_LENGTH + length
    unless (mark & 1) == 1
      # A deleted entry only widens the hole.
      distance += entry_bytes
      next
    end
    # A valid entry with no hole in front of it stays where it is.
    next unless distance > 0

    begin
      # Read current entry into a buffer
      @f.seek(pos)
      entry = @f.read(entry_bytes)
      # Write the buffer right after the end of the previous entry.
      @f.seek(pos - distance)
      @f.write(entry)
      # Update the index with the new position
      @index.put_value(blob_id, pos - distance)
      # Mark the space between the relocated current entry and the
      # next valid entry as deleted space.
      gap_header = [ 0, distance - BLOB_HEADER_LENGTH, 0, 0 ].
                   pack(BLOB_HEADER_FORMAT)
      @f.write(gap_header)
      @f.flush
    rescue => e
      PEROBS.log.fatal "Error while moving blob for ID #{blob_id}: " +
        e.message
    end
  end
  PEROBS.log.debug "FlatFile defragmented in #{Time.now - started_at} seconds"
  reclaimed_percent = '%.1f' % (distance.to_f / @f.size * 100.0)
  PEROBS.log.debug "#{distance} bytes or #{reclaimed_percent}% reclaimed"

  # Cut off the hole that has accumulated at the end of the file.
  @f.flush
  @f.truncate(@f.size - distance)
  @f.flush
  @space_list.clear

  sync
end
338
+
339
# Check the consistency of the blob file, the index and the free space
# list. Nothing is done when the file is not open.
# @param repair [Boolean] When true, blobs with a wrong checksum are
#        deleted and a broken index or free space list is regenerated from
#        the blob file. When false, a checksum failure is reported as fatal
#        error and a failed index/space check ends the method.
def check(repair = false)
  return unless @f

  # First check the database blob file. Each entry should be readable and
  # correct.
  each_blob_header do |pos, mark, length, blob_id, crc|
    # Only non-deleted entries carry data that can be verified.
    next unless (mark & 1) == 1

    buf = nil
    begin
      @f.seek(pos + BLOB_HEADER_LENGTH)
      buf = @f.read(length)
    rescue => e
      PEROBS.log.fatal "Check of blob with ID #{blob_id} failed: " +
        e.message
    end

    # The checksum is evaluated outside of the rescue block above so that
    # an error raised by the logger is not caught and reported twice.
    next unless crc && checksum(buf) != crc

    if repair
      PEROBS.log.error "Checksum failure while checking blob " +
        "with ID #{blob_id}. Deleting object."
      delete_obj_by_address(pos, blob_id)
    else
      PEROBS.log.fatal "Checksum failure while checking blob " +
        "with ID #{blob_id}"
    end
  end

  # Now we check the index data. It must be correct and the entries must
  # match the blob file. All entries in the index must be in the blob file
  # and vice versa.
  begin
    unless @index.check(self) && @space_list.check(self) &&
           cross_check_entries
      return unless repair

      regenerate_index_and_spaces
    end
  rescue PEROBS::FatalError
    regenerate_index_and_spaces
  end

  sync
end
383
+
384
# Throw away the content of the index tree and the free space list and
# rebuild both by walking over all blob headers of the FlatFile.
def regenerate_index_and_spaces
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
  @index.clear
  @space_list.clear

  each_blob_header do |pos, mark, length, id, _crc|
    if mark != 0
      # Any non-zero mark byte is treated as an entry that holds a value.
      @index.put_value(id, pos)
    elsif length > 0
      # Deleted entries only count as free space if they have a data part.
      @space_list.add_space(pos, length)
    end
  end
end
399
+
400
# Tell whether the blob header stored at the given address records a data
# length that is equal to the given size.
# @param address [Integer] Offset of the blob header in the file
# @param size [Integer] Expected length of the data part in bytes
# @return [Boolean] true if the stored length equals size
def has_space?(address, size)
  read_blob_header(address).length == size
end
404
+
405
# Tell whether the blob header stored at the given address carries the
# given ID.
# @param id [Integer] Expected ID
# @param address [Integer] Offset of the blob header in the file
# @return [Boolean] true if the stored ID equals id
def has_id_at?(id, address)
  read_blob_header(address).id == id
end
409
+
410
+ def inspect
411
+ s = '['
412
+ each_blob_header do |pos, mark, length, blob_id, crc|
413
+ s << "{ :pos => #{pos}, :mark => #{mark}, " +
414
+ ":length => #{length}, :id => #{blob_id}, :crc => #{crc}"
415
+ if mark != 0
416
+ s << ", :value => #{@f.read(length)}"
417
+ end
418
+ s << " }\n"
419
+ end
420
+ s + ']'
421
+ end
422
+
423
+
424
+
425
+ private
426
+
427
# Read and decode the blob header stored at the given address.
# @param addr [Integer] Offset of the header in the file
# @param id [Integer or nil] If given, the ID found in the header must be
#        equal to this ID, otherwise a fatal error is logged.
# @return [Header] The decoded header
def read_blob_header(addr, id = nil)
  raw = nil
  begin
    @f.seek(addr)
    raw = @f.read(BLOB_HEADER_LENGTH)
  rescue => e
    PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
  end
  # A read at or near the end of the file yields nil or too few bytes.
  unless raw && raw.length == BLOB_HEADER_LENGTH
    PEROBS.log.fatal "Cannot read blob header " \
      "#{id ? "for ID #{id} " : ''}at address #{addr}"
  end
  header = Header.new(*raw.unpack(BLOB_HEADER_FORMAT))
  if id && header.id != id
    PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
      "found for entry with ID #{id}/#{header.id}"
  end

  header
end
448
+
449
# Find a place in the file where a blob with the given number of data bytes
# can be stored.
# @param bytes [Integer] Size of the blob data in bytes
# @return [Array] Address and size of a free space taken from the space
#         list, or the current end of the file and -1 if no free space is
#         usable.
def find_free_blob(bytes)
  address, size = @space_list.get_space(bytes)
  # We have not found any suitable space. Return the end of the file.
  return [ @f.size, -1 ] unless address

  # The space is usable if it fits exactly or leaves enough room for the
  # header of the remaining gap.
  if size == bytes || size - BLOB_HEADER_LENGTH >= bytes
    return [ address, size ]
  end

  # Return the found space again. It's too small for the new content plus
  # the gap header.
  @space_list.add_space(address, size)

  # We need a space that is large enough to hold the bytes and the gap
  # header.
  @space_list.get_space(bytes + BLOB_HEADER_LENGTH) || [ @f.size, -1 ]
end
467
+
468
# Compute the CRC32 checksum of the given raw data, starting from the
# initial CRC value.
# @param raw_obj [String] Raw object data
# @return [Integer] CRC32 checksum
def checksum(raw_obj)
  Zlib.crc32(raw_obj)
end
471
+
472
# Compare every entry of the blob file with the index and the free space
# list. Entries with a non-zero mark byte must be listed in the index with
# their address; entries with a zero mark byte and a non-zero length must
# be known to the free space list.
# @return [Boolean] true if all entries match, false at the first mismatch
def cross_check_entries
  each_blob_header do |pos, mark, length, blob_id, _crc|
    if mark != 0
      next if @index.get_value(blob_id) == pos

      PEROBS.log.error "FlatFile blob at address #{pos} is listed " +
        "in index with address #{@index.get_value(blob_id)}"
      return false
    end

    # Deleted entries without a data part are not tracked as free space.
    next unless length > 0
    next if @space_list.has_space?(pos, length)

    PEROBS.log.error "FlatFile has free space " +
      "(addr: #{pos}, len: #{length}) that is not in FreeSpaceManager"
    return false
  end

  true
end
493
+
494
# Iterate over all blob headers of the file, starting at address 0. The
# address of the next header is computed from the length stored in the
# current header.
# @yield [pos, mark, length, id, crc] Address of the header in the file
#        followed by the four decoded header fields
def each_blob_header(&block)
  pos = 0
  begin
    @f.seek(0)
    while (buf = @f.read(BLOB_HEADER_LENGTH))
      # A short read means the file ends in the middle of a header.
      # Unpacking it would yield nil fields and fail later with an
      # unrelated TypeError, so report the real problem instead.
      if buf.bytesize != BLOB_HEADER_LENGTH
        PEROBS.log.fatal "Truncated blob header at address #{pos} " +
          "in flat file DB"
      end
      mark, length, id, crc = buf.unpack(BLOB_HEADER_FORMAT)
      yield(pos, mark, length, id, crc)

      pos += BLOB_HEADER_LENGTH + length
      # The block may have moved the file position; always continue at the
      # computed start of the next header.
      @f.seek(pos)
    end
  rescue IOError, SystemCallError => e
    # Low-level seek/read failures surface as Errno::* (SystemCallError).
    PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
  end
end
509
+
510
+ end
511
+
512
+ end
513
+