perobs 4.2.0 → 4.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/perobs.rb +1 -0
- data/lib/perobs/BTree.rb +2 -2
- data/lib/perobs/BTreeNode.rb +32 -15
- data/lib/perobs/FlatFile.rb +37 -56
- data/lib/perobs/FuzzyStringMatcher.rb +32 -49
- data/lib/perobs/Hash.rb +68 -23
- data/lib/perobs/IDListPageFile.rb +2 -1
- data/lib/perobs/ObjectBase.rb +7 -0
- data/lib/perobs/SpaceTree.rb +1 -1
- data/lib/perobs/Store.rb +1 -0
- data/lib/perobs/version.rb +1 -1
- data/test/FlatFileDB_spec.rb +30 -0
- data/test/FuzzyStringMatcher_spec.rb +94 -4
- data/test/Hash_spec.rb +12 -1
- data/test/Store_spec.rb +14 -0
- metadata +6 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a61fc945e0ef9f5ed6558080931d2acae42cc0401f375275684e4ee32fefe4f7
|
4
|
+
data.tar.gz: 4d864fdc0791aa78d8c180b4686ee825cd25e209284fca1966f144813c063280
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3834a9caae693d82837fb9f75141cb35e85f1a2c1439d1bb898f8578d9ae082f46deb0233d2b8a02d6ae6b0bf66862098b47ff571ff3d9a6b874fadaef6d23a
|
7
|
+
data.tar.gz: 883f1b5e553fae2be0039aa090d89bf6eb44ec1d0dc31488aeaa727ec8bc2844c9b722568cd7dde0b33c507121afd11e720154d4ff27e66aa3ac3812d5603954
|
data/README.md
CHANGED
data/lib/perobs.rb
CHANGED
data/lib/perobs/BTree.rb
CHANGED
@@ -70,7 +70,7 @@ module PEROBS
|
|
70
70
|
@nodes.register_custom_data('first_leaf')
|
71
71
|
@nodes.register_custom_data('last_leaf')
|
72
72
|
@nodes.register_custom_data('btree_size')
|
73
|
-
@node_cache = PersistentObjectCache.new(2**
|
73
|
+
@node_cache = PersistentObjectCache.new(2**13, 2**13, BTreeNode, self)
|
74
74
|
@root = @first_leaf = @last_leaf = nil
|
75
75
|
@size = 0
|
76
76
|
|
@@ -190,7 +190,7 @@ module PEROBS
|
|
190
190
|
"Number of leave nodes: #{stats.leave_nodes}; " +
|
191
191
|
"Number of leaves: #{stats.leaves}"
|
192
192
|
|
193
|
-
|
193
|
+
true
|
194
194
|
end
|
195
195
|
|
196
196
|
# Register a new node as root node of the tree.
|
data/lib/perobs/BTreeNode.rb
CHANGED
@@ -59,7 +59,7 @@ module PEROBS
|
|
59
59
|
# if not
|
60
60
|
def initialize(tree, node_address = nil, parent = nil, is_leaf = true,
|
61
61
|
prev_sibling = nil, next_sibling = nil,
|
62
|
-
keys =
|
62
|
+
keys = nil, values = nil, children = nil)
|
63
63
|
@tree = tree
|
64
64
|
if node_address == 0
|
65
65
|
PEROBS.log.fatal "Node address may not be 0"
|
@@ -68,13 +68,13 @@ module PEROBS
|
|
68
68
|
@parent = link(parent)
|
69
69
|
@prev_sibling = link(prev_sibling)
|
70
70
|
@next_sibling = link(next_sibling)
|
71
|
-
@keys = keys
|
71
|
+
@keys = keys || []
|
72
72
|
if (@is_leaf = is_leaf)
|
73
|
-
@values = values
|
74
|
-
@children =
|
73
|
+
@values = values || []
|
74
|
+
@children = nil
|
75
75
|
else
|
76
|
-
@children = children
|
77
|
-
@values =
|
76
|
+
@children = children || []
|
77
|
+
@values = nil
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -585,11 +585,11 @@ module PEROBS
|
|
585
585
|
end
|
586
586
|
|
587
587
|
def trim(idx)
|
588
|
-
@keys
|
588
|
+
@keys.slice!(idx, @keys.length - idx)
|
589
589
|
if @is_leaf
|
590
|
-
@values
|
590
|
+
@values.slice!(idx, @values.length - idx)
|
591
591
|
else
|
592
|
-
@children
|
592
|
+
@children.slice!(idx + 1, @children.length - idx - 1)
|
593
593
|
end
|
594
594
|
@tree.node_cache.insert(self)
|
595
595
|
end
|
@@ -654,13 +654,18 @@ module PEROBS
|
|
654
654
|
# @yield [key, value]
|
655
655
|
# @return [nil or Hash] nil in case of errors or a hash with some
|
656
656
|
# statistical information about the tree
|
657
|
-
def check
|
657
|
+
def check(&block)
|
658
658
|
stats = Stats.new(nil, 0, 0, 0)
|
659
659
|
|
660
660
|
traverse do |node, position, stack|
|
661
661
|
if position == 0
|
662
662
|
stats.nodes_count += 1
|
663
663
|
if node.parent
|
664
|
+
unless node.parent.is_a?(BTreeNodeLink)
|
665
|
+
node.error "parent is a #{node.parent.class} instead of a " +
|
666
|
+
"BTreeNodeLink"
|
667
|
+
return nil
|
668
|
+
end
|
664
669
|
# After a split the nodes will only have half the maximum keys.
|
665
670
|
# For branch nodes one of the split nodes will have even 1 key
|
666
671
|
# less as this will become the branch key in a parent node.
|
@@ -695,6 +700,16 @@ module PEROBS
|
|
695
700
|
else
|
696
701
|
stats.branch_depth = node.tree_level
|
697
702
|
end
|
703
|
+
if node.prev_sibling && !node.prev_sibling.is_a?(BTreeNodeLink)
|
704
|
+
node.error "prev_sibling is a #{node.prev_sibling.class} " +
|
705
|
+
"instead of a BTreeNodeLink"
|
706
|
+
return nil
|
707
|
+
end
|
708
|
+
if node.next_sibling && !node.next_sibling.is_a?(BTreeNodeLink)
|
709
|
+
node.error "next_sibling is a #{node.next_sibling.class} " +
|
710
|
+
"instead of a BTreeNodeLink"
|
711
|
+
return nil
|
712
|
+
end
|
698
713
|
if node.prev_sibling.nil? && @tree.first_leaf != node
|
699
714
|
node.error "Leaf node #{node.node_address} has no previous " +
|
700
715
|
"sibling but is not the first leaf of the tree"
|
@@ -708,9 +723,9 @@ module PEROBS
|
|
708
723
|
unless node.keys.size == node.values.size
|
709
724
|
node.error "Key count (#{node.keys.size}) and value " +
|
710
725
|
"count (#{node.values.size}) don't match"
|
711
|
-
|
726
|
+
return nil
|
712
727
|
end
|
713
|
-
unless node.children.
|
728
|
+
unless node.children.nil?
|
714
729
|
node.error "@children must be nil for a leaf node"
|
715
730
|
return nil
|
716
731
|
end
|
@@ -718,14 +733,14 @@ module PEROBS
|
|
718
733
|
stats.leave_nodes += 1
|
719
734
|
stats.leaves += node.keys.length
|
720
735
|
else
|
721
|
-
unless node.values.
|
736
|
+
unless node.values.nil?
|
722
737
|
node.error "@values must be nil for a branch node"
|
723
738
|
return nil
|
724
739
|
end
|
725
740
|
unless node.children.size == node.keys.size + 1
|
726
741
|
node.error "Key count (#{node.keys.size}) must be one " +
|
727
742
|
"less than children count (#{node.children.size})"
|
728
|
-
|
743
|
+
return nil
|
729
744
|
end
|
730
745
|
node.children.each_with_index do |child, i|
|
731
746
|
unless child.is_a?(BTreeNodeLink)
|
@@ -789,7 +804,9 @@ module PEROBS
|
|
789
804
|
else
|
790
805
|
if block_given?
|
791
806
|
# If a block was given, call this block with the key and value.
|
792
|
-
|
807
|
+
unless yield(node.keys[index], node.values[index])
|
808
|
+
return nil
|
809
|
+
end
|
793
810
|
end
|
794
811
|
end
|
795
812
|
end
|
data/lib/perobs/FlatFile.rb
CHANGED
@@ -293,7 +293,7 @@ module PEROBS
|
|
293
293
|
header = FlatFileBlobHeader.read(@f, addr, id)
|
294
294
|
if header.id != id
|
295
295
|
PEROBS.log.fatal "Database index corrupted: Index for object " +
|
296
|
-
"#{id} points to object with ID #{header.id}"
|
296
|
+
"#{id} points to object with ID #{header.id} at address #{addr}"
|
297
297
|
end
|
298
298
|
|
299
299
|
buf = nil
|
@@ -302,7 +302,8 @@ module PEROBS
|
|
302
302
|
@f.seek(addr + FlatFileBlobHeader::LENGTH)
|
303
303
|
buf = @f.read(header.length)
|
304
304
|
rescue IOError => e
|
305
|
-
PEROBS.log.fatal "Cannot read blob for ID #{id}
|
305
|
+
PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
|
306
|
+
e.message
|
306
307
|
end
|
307
308
|
|
308
309
|
# Uncompress the data if the compression bit is set in the flags byte.
|
@@ -311,12 +312,13 @@ module PEROBS
|
|
311
312
|
buf = Zlib.inflate(buf)
|
312
313
|
rescue Zlib::BufError, Zlib::DataError
|
313
314
|
PEROBS.log.fatal "Corrupted compressed block with ID " +
|
314
|
-
"#{
|
315
|
+
"#{id} found at address #{addr}."
|
315
316
|
end
|
316
317
|
end
|
317
318
|
|
318
319
|
if checksum(buf) != header.crc
|
319
|
-
PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
|
320
|
+
PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
|
321
|
+
"at address #{addr}"
|
320
322
|
end
|
321
323
|
|
322
324
|
buf
|
@@ -339,7 +341,7 @@ module PEROBS
|
|
339
341
|
if @marks
|
340
342
|
@marks.clear
|
341
343
|
else
|
342
|
-
@marks = IDList.new(@db_dir, 'marks',
|
344
|
+
@marks = IDList.new(@db_dir, 'marks', item_counter)
|
343
345
|
end
|
344
346
|
end
|
345
347
|
|
@@ -452,16 +454,14 @@ module PEROBS
|
|
452
454
|
regenerate_index_and_spaces
|
453
455
|
end
|
454
456
|
|
455
|
-
# Check
|
456
|
-
# @param repair [Boolean] True if errors should be fixed.
|
457
|
+
# Check the FlatFile.
|
457
458
|
# @return [Integer] Number of errors found
|
458
|
-
def check(
|
459
|
+
def check()
|
459
460
|
errors = 0
|
460
461
|
return errors unless @f
|
461
462
|
|
462
463
|
t = Time.now
|
463
|
-
PEROBS.log.info "Checking FlatFile database"
|
464
|
-
"#{repair ? ' in repair mode' : ''}..."
|
464
|
+
PEROBS.log.info "Checking FlatFile database..."
|
465
465
|
|
466
466
|
# First check the database blob file. Each entry should be readable and
|
467
467
|
# correct and all IDs must be unique. We use a shadow index to keep
|
@@ -483,7 +483,6 @@ module PEROBS
|
|
483
483
|
if buf.bytesize != header.length
|
484
484
|
PEROBS.log.error "Premature end of file in blob with ID " +
|
485
485
|
"#{header.id}."
|
486
|
-
discard_damaged_blob(header) if repair
|
487
486
|
errors += 1
|
488
487
|
next
|
489
488
|
end
|
@@ -496,7 +495,6 @@ module PEROBS
|
|
496
495
|
rescue Zlib::BufError, Zlib::DataError
|
497
496
|
PEROBS.log.error "Corrupted compressed block with ID " +
|
498
497
|
"#{header.id} found."
|
499
|
-
discard_damaged_blob(header) if repair
|
500
498
|
errors += 1
|
501
499
|
next
|
502
500
|
end
|
@@ -505,7 +503,6 @@ module PEROBS
|
|
505
503
|
if header.crc && checksum(buf) != header.crc
|
506
504
|
PEROBS.log.error "Checksum failure while checking blob " +
|
507
505
|
"with ID #{header.id}"
|
508
|
-
discard_damaged_blob(header) if repair
|
509
506
|
errors += 1
|
510
507
|
next
|
511
508
|
end
|
@@ -521,22 +518,6 @@ module PEROBS
|
|
521
518
|
errors += 1
|
522
519
|
previous_header = FlatFileBlobHeader.read(@f, previous_address,
|
523
520
|
header.id)
|
524
|
-
if repair
|
525
|
-
# We have two blobs with the same ID and we must discard one of
|
526
|
-
# them.
|
527
|
-
if header.is_outdated?
|
528
|
-
discard_damaged_blob(header)
|
529
|
-
elsif previous_header.is_outdated?
|
530
|
-
discard_damaged_blob(previous_header)
|
531
|
-
else
|
532
|
-
PEROBS.log.error "None of the blobs with same ID have " +
|
533
|
-
"the outdated flag set. Deleting the smaller one."
|
534
|
-
errors += 1
|
535
|
-
discard_damaged_blob(header.length < previous_header.length ?
|
536
|
-
header : previous_header)
|
537
|
-
end
|
538
|
-
next
|
539
|
-
end
|
540
521
|
else
|
541
522
|
# ID is unique so far. Add it to the shadow index.
|
542
523
|
new_index.insert(header.id, header.addr)
|
@@ -553,12 +534,6 @@ module PEROBS
|
|
553
534
|
PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
|
554
535
|
'bytes found at the end of FlatFile.'
|
555
536
|
corrupted_blobs += 1
|
556
|
-
if repair
|
557
|
-
PEROBS.log.error "Truncating FlatFile to " +
|
558
|
-
"#{end_of_last_healthy_blob} bytes by discarding " +
|
559
|
-
"#{@f.size - end_of_last_healthy_blob} bytes"
|
560
|
-
@f.truncate(end_of_last_healthy_blob)
|
561
|
-
end
|
562
537
|
end
|
563
538
|
|
564
539
|
errors += corrupted_blobs
|
@@ -568,17 +543,19 @@ module PEROBS
|
|
568
543
|
new_index.close
|
569
544
|
new_index.erase
|
570
545
|
|
571
|
-
if
|
572
|
-
erase_index_files
|
573
|
-
defragmentize
|
574
|
-
regenerate_index_and_spaces
|
575
|
-
elsif corrupted_blobs == 0
|
546
|
+
if corrupted_blobs == 0
|
576
547
|
# Now we check the index data. It must be correct and the entries must
|
577
548
|
# match the blob file. All entries in the index must be in the blob file
|
578
549
|
# and vice versa.
|
579
550
|
begin
|
580
551
|
index_ok = @index.check do |id, address|
|
581
|
-
has_id_at?(id, address)
|
552
|
+
unless has_id_at?(id, address)
|
553
|
+
PEROBS.log.error "Index contains an entry for " +
|
554
|
+
"ID #{id} at address #{address} that is not in FlatFile"
|
555
|
+
false
|
556
|
+
else
|
557
|
+
true
|
558
|
+
end
|
582
559
|
end
|
583
560
|
x_check_errs = 0
|
584
561
|
space_check_ok = true
|
@@ -586,16 +563,13 @@ module PEROBS
|
|
586
563
|
(x_check_errs = cross_check_entries) == 0
|
587
564
|
errors += 1 unless index_ok && space_check_ok
|
588
565
|
errors += x_check_errs
|
589
|
-
regenerate_index_and_spaces if repair
|
590
566
|
end
|
591
567
|
rescue PEROBS::FatalError
|
592
568
|
errors += 1
|
593
|
-
regenerate_index_and_spaces if repair
|
594
569
|
end
|
595
570
|
end
|
596
571
|
|
597
|
-
|
598
|
-
PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
|
572
|
+
PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
|
599
573
|
"#{errors} errors found."
|
600
574
|
|
601
575
|
errors
|
@@ -687,17 +661,7 @@ module PEROBS
|
|
687
661
|
header.id)
|
688
662
|
# We have two blobs with the same ID and we must discard one of
|
689
663
|
# them.
|
690
|
-
|
691
|
-
discard_damaged_blob(header)
|
692
|
-
elsif previous_header.is_outdated?
|
693
|
-
discard_damaged_blob(previous_header)
|
694
|
-
else
|
695
|
-
PEROBS.log.error "None of the blobs with same ID have " +
|
696
|
-
"the outdated flag set. Deleting the smaller one."
|
697
|
-
errors += 1
|
698
|
-
discard_damaged_blob(header.length < previous_header.length ?
|
699
|
-
header : previous_header)
|
700
|
-
end
|
664
|
+
discard_duplicate_blobs(header, previous_header)
|
701
665
|
else
|
702
666
|
# ID is unique so far. Add it to the shadow index.
|
703
667
|
@index.insert(header.id, header.addr)
|
@@ -927,6 +891,23 @@ module PEROBS
|
|
927
891
|
header.clear_flags
|
928
892
|
end
|
929
893
|
|
894
|
+
def discard_duplicate_blobs(header, previous_header)
|
895
|
+
if header.is_outdated?
|
896
|
+
discard_damaged_blob(header)
|
897
|
+
elsif previous_header.is_outdated?
|
898
|
+
discard_damaged_blob(previous_header)
|
899
|
+
else
|
900
|
+
smaller, larger = header.length < previous_header.length ?
|
901
|
+
[ header, previous_header ] : [ previous_header, header ]
|
902
|
+
PEROBS.log.error "None of the blobs with same ID have " +
|
903
|
+
"the outdated flag set. Deleting the smaller one " +
|
904
|
+
"at address #{smaller.addr}"
|
905
|
+
discard_damaged_blob(smaller)
|
906
|
+
@space_list.add_space(smaller.addr, smaller.length)
|
907
|
+
@index.insert(larger.id, larger.addr)
|
908
|
+
end
|
909
|
+
end
|
910
|
+
|
930
911
|
def open_index_files(abort_on_missing_files = false)
|
931
912
|
begin
|
932
913
|
@index.open(abort_on_missing_files)
|
@@ -26,40 +26,42 @@
|
|
26
26
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
27
|
|
28
28
|
require 'perobs/Log'
|
29
|
-
require 'perobs/
|
29
|
+
require 'perobs/Object'
|
30
30
|
|
31
31
|
module PEROBS
|
32
32
|
|
33
33
|
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
34
|
# against a known set of strings. The dictionary of known strings does not
|
35
|
-
# store the actual strings but references to
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modify already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
40
43
|
|
41
44
|
# Create a new FuzzyStringMatcher.
|
42
|
-
# @param
|
43
|
-
# @param name [String] Unique name of the string matcher
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
44
46
|
# @param case_sensitive [Boolean] True if case matters for matching
|
45
47
|
# @param n [Integer] Determines what kind of n-gramm is used to store the
|
46
48
|
# references in the dictionary. It also determines the minimum word
|
47
|
-
# length that can be used for fuzzy matches.
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
|
52
|
+
super(p)
|
51
53
|
if n < 2 || n > 10
|
52
54
|
raise ArgumentError, 'n must be between 2 and 10'
|
53
55
|
end
|
54
|
-
|
55
|
-
|
56
|
+
self.case_sensitive = case_sensitive
|
57
|
+
self.n = n
|
56
58
|
|
57
|
-
clear unless
|
59
|
+
clear unless @dict
|
58
60
|
end
|
59
61
|
|
60
62
|
# Wipe the dictionary.
|
61
63
|
def clear
|
62
|
-
|
64
|
+
self.dict = @store.new(BigHash)
|
63
65
|
end
|
64
66
|
|
65
67
|
# Add a string with its reference to the dictionary.
|
@@ -79,11 +81,8 @@ module PEROBS
|
|
79
81
|
@dict[n_gramm] = ng_list = @store.new(Hash)
|
80
82
|
end
|
81
83
|
|
82
|
-
|
83
|
-
|
84
|
-
else
|
85
|
-
ng_list[reference] = 0
|
86
|
-
end
|
84
|
+
# We use the Hash as a Set. The value doesn't matter.
|
85
|
+
ng_list[reference] = true unless ng_list.include?(reference)
|
87
86
|
end
|
88
87
|
|
89
88
|
nil
|
@@ -109,22 +108,12 @@ module PEROBS
|
|
109
108
|
|
110
109
|
matches = {}
|
111
110
|
|
112
|
-
# This will be the best possible score for a perfect match.
|
113
|
-
best_possible_score = 0
|
114
111
|
each_n_gramm(string) do |n_gramm|
|
115
|
-
best_possible_score += 1
|
116
112
|
if (ng_list = @dict[n_gramm])
|
117
|
-
ng_list.each do |reference,
|
113
|
+
ng_list.each do |reference, dummy|
|
118
114
|
if matches.include?(reference)
|
119
115
|
matches[reference] += 1
|
120
116
|
else
|
121
|
-
# We use internally a 10 times larger list so that we don't
|
122
|
-
# throw away good matches too early. If the max_count value is
|
123
|
-
# chosen too small there is a risk of not finding the best
|
124
|
-
# matches!
|
125
|
-
if matches.size > 10 * max_count
|
126
|
-
matches = discard_worst_match(matches)
|
127
|
-
end
|
128
117
|
matches[reference] = 1
|
129
118
|
end
|
130
119
|
end
|
@@ -133,19 +122,23 @@ module PEROBS
|
|
133
122
|
|
134
123
|
return [] if matches.empty?
|
135
124
|
|
136
|
-
|
137
|
-
match_list = matches.to_a.sort do |a, b|
|
138
|
-
b[1] <=> a[1]
|
139
|
-
end
|
125
|
+
match_list = matches.to_a
|
140
126
|
|
141
127
|
# Set occurrence counters to scores relative to the best possible score.
|
128
|
+
# This will be the best possible score for a perfect match.
|
129
|
+
best_possible_score = string.length - @n + 1
|
142
130
|
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
|
143
131
|
|
144
|
-
# Delete all matches that
|
145
|
-
# top match.
|
132
|
+
# Delete all matches that don't have the required minimum match score.
|
146
133
|
match_list.delete_if { |a| a[1] < min_score }
|
147
134
|
|
148
|
-
|
135
|
+
# Sort the list best to worst match
|
136
|
+
match_list.sort! do |a, b|
|
137
|
+
b[1] <=> a[1]
|
138
|
+
end
|
139
|
+
|
140
|
+
# Return the top max_count matches.
|
141
|
+
match_list[0..max_count - 1]
|
149
142
|
end
|
150
143
|
|
151
144
|
# Returns some internal stats about the dictionary.
|
@@ -176,16 +169,6 @@ module PEROBS
|
|
176
169
|
end
|
177
170
|
end
|
178
171
|
|
179
|
-
def discard_worst_match(matches)
|
180
|
-
# Sort in the order of occurrence count downwards.
|
181
|
-
match_list = matches.to_a.sort do |a, b|
|
182
|
-
b[1] <=> a[1]
|
183
|
-
end
|
184
|
-
# Discard the lowest half of the matches
|
185
|
-
match_list = match_list[0..match_list.length / 2]
|
186
|
-
match_list.to_h
|
187
|
-
end
|
188
|
-
|
189
172
|
end
|
190
173
|
|
191
174
|
end
|
data/lib/perobs/Hash.rb
CHANGED
@@ -124,9 +124,9 @@ module PEROBS
|
|
124
124
|
|
125
125
|
# Proxy for assignment method.
|
126
126
|
def []=(key, value)
|
127
|
-
unless key.is_a?(String)
|
128
|
-
raise ArgumentError, "PEROBS::Hash[] key must be a String
|
129
|
-
"#{key.class}"
|
127
|
+
unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
|
128
|
+
raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
|
129
|
+
"a PEROBS object but is a #{key.class}"
|
130
130
|
end
|
131
131
|
_check_assignment_value(value)
|
132
132
|
@store.cache.cache_write(self)
|
@@ -143,18 +143,33 @@ module PEROBS
|
|
143
143
|
# is referencing.
|
144
144
|
# @return [Array of Integer] IDs of referenced objects
|
145
145
|
def _referenced_object_ids
|
146
|
-
|
147
|
-
|
146
|
+
ids = []
|
147
|
+
@data.each do |k, v|
|
148
|
+
if k && k.respond_to?(:is_poxreference?)
|
149
|
+
ids << k.id
|
150
|
+
end
|
151
|
+
if v && v.respond_to?(:is_poxreference?)
|
152
|
+
ids << v.id
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
ids
|
148
157
|
end
|
149
158
|
|
150
159
|
# This method should only be used during store repair operations. It will
|
151
160
|
# delete all references to the given object ID.
|
152
161
|
# @param id [Integer] targeted object ID
|
153
162
|
def _delete_reference_to_id(id)
|
163
|
+
original_length = @data.length
|
164
|
+
|
154
165
|
@data.delete_if do |k, v|
|
155
|
-
|
166
|
+
(k && k.respond_to?(:is_poxreference?) && k.id == id) ||
|
167
|
+
(v && v.respond_to?(:is_poxreference?) && v.id == id)
|
168
|
+
end
|
169
|
+
|
170
|
+
if @data.length != original_length
|
171
|
+
@store.cache.cache_write(self)
|
156
172
|
end
|
157
|
-
@store.cache.cache_write(self)
|
158
173
|
end
|
159
174
|
|
160
175
|
# Restore the persistent data from a single data structure.
|
@@ -163,8 +178,18 @@ module PEROBS
|
|
163
178
|
# @private
|
164
179
|
def _deserialize(data)
|
165
180
|
@data = {}
|
166
|
-
|
167
|
-
|
181
|
+
|
182
|
+
data.each do |k, v|
|
183
|
+
# References to other PEROBS Objects are marshalled with our own
|
184
|
+
# format. If we detect such a marshalled String we convert it into a
|
185
|
+
# POXReference object.
|
186
|
+
if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
|
187
|
+
k = POXReference.new(@store, match[1].to_i)
|
188
|
+
end
|
189
|
+
dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
|
190
|
+
@data[k] = dv
|
191
|
+
end
|
192
|
+
|
168
193
|
@data
|
169
194
|
end
|
170
195
|
|
@@ -185,26 +210,46 @@ module PEROBS
|
|
185
210
|
data = {}
|
186
211
|
|
187
212
|
@data.each do |k, v|
|
188
|
-
if
|
189
|
-
|
190
|
-
|
191
|
-
#
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
v.inspect
|
200
|
-
end
|
201
|
-
data[k] = v
|
213
|
+
if k.respond_to?(:is_poxreference?)
|
214
|
+
# JSON only supports Strings as hash keys. Since JSON is the default
|
215
|
+
# internal storage format in the database, we have to marshall
|
216
|
+
# PEROBS::Object references ourselves.
|
217
|
+
k = "#<PEROBS::POReference id=#{k.id}>"
|
218
|
+
elsif k[0..24] == '#<PEROBS::POReference id='
|
219
|
+
# This could obviously result in conflicts with 'normal' String hash
|
220
|
+
# keys. This is extremely unlikely, but we better catch this case
|
221
|
+
# before it causes hard to debug trouble.
|
222
|
+
raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
|
223
|
+
"internal representation of marshalled hash keys!"
|
202
224
|
end
|
225
|
+
data[k] = serialize_helper(v)
|
203
226
|
end
|
204
227
|
|
205
228
|
data
|
206
229
|
end
|
207
230
|
|
231
|
+
def serialize_helper(v)
|
232
|
+
if v.respond_to?(:is_poxreference?)
|
233
|
+
# References to other PEROBS objects (POXReference) are stored as
|
234
|
+
# POReference in the database.
|
235
|
+
return POReference.new(v.id)
|
236
|
+
else
|
237
|
+
# Outside of the PEROBS library all PEROBS::ObjectBase derived
|
238
|
+
# objects should not be used directly. The library only exposes them
|
239
|
+
# via POXReference proxy objects.
|
240
|
+
if v.is_a?(ObjectBase)
|
241
|
+
PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
|
242
|
+
"It is stored in a PEROBS::Hash. " +
|
243
|
+
'Have you used self() instead of myself() to ' +
|
244
|
+
"get the reference of this PEROBS object?\n" +
|
245
|
+
v.inspect
|
246
|
+
end
|
247
|
+
|
248
|
+
# All other objects are serialized by their native methods.
|
249
|
+
return v
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
208
253
|
end
|
209
254
|
|
210
255
|
end
|
@@ -54,7 +54,8 @@ module PEROBS
|
|
54
54
|
@file_name = File.join(dir, name + '.cache')
|
55
55
|
@page_size = page_size
|
56
56
|
open
|
57
|
-
@pages = PersistentObjectCache.new(max_in_memory,
|
57
|
+
@pages = PersistentObjectCache.new(max_in_memory, max_in_memory,
|
58
|
+
IDListPage, self)
|
58
59
|
@page_counter = 0
|
59
60
|
end
|
60
61
|
|
data/lib/perobs/ObjectBase.rb
CHANGED
@@ -102,6 +102,13 @@ module PEROBS
|
|
102
102
|
end
|
103
103
|
end
|
104
104
|
|
105
|
+
# To allow POXReference objects to be used as Hash keys we need to
|
106
|
+
# implement this function. Conveniently, we can just use the PEROBS object
|
107
|
+
# ID since that is unique.
|
108
|
+
def hash
|
109
|
+
@id
|
110
|
+
end
|
111
|
+
|
105
112
|
# Shortcut to access the _id() method of the referenced object.
|
106
113
|
def _id
|
107
114
|
@id
|
data/lib/perobs/SpaceTree.rb
CHANGED
@@ -54,7 +54,7 @@ module PEROBS
|
|
54
54
|
|
55
55
|
# Benchmark runs showed a cache size of 128 to be a good compromise
|
56
56
|
# between read and write performance trade-offs and memory consumption.
|
57
|
-
@cache = PersistentObjectCache.new(256,
|
57
|
+
@cache = PersistentObjectCache.new(256, 256, SpaceTreeNode, self)
|
58
58
|
end
|
59
59
|
|
60
60
|
# Open the SpaceTree file.
|
data/lib/perobs/Store.rb
CHANGED
data/lib/perobs/version.rb
CHANGED
data/test/FlatFileDB_spec.rb
CHANGED
@@ -265,5 +265,35 @@ describe PEROBS::FlatFileDB do
|
|
265
265
|
db.close
|
266
266
|
end
|
267
267
|
|
268
|
+
it 'should handle duplicate entries for the same ID in database.blobs file' do
|
269
|
+
@store.exit
|
270
|
+
|
271
|
+
db = PEROBS::FlatFileDB.new(@db_dir)
|
272
|
+
db_file = File.join(@db_dir, 'database.blobs')
|
273
|
+
db.open
|
274
|
+
0.upto(5) do |i|
|
275
|
+
db.put_object("#{i + 1}:#{'X' * (i + 1) * 30}$", i + 1)
|
276
|
+
end
|
277
|
+
db.close
|
278
|
+
|
279
|
+
# This appends the entry 2 again
|
280
|
+
blob2 = File.read(db_file, 319 - 199, 199)
|
281
|
+
File.write(db_file, blob2, File.size(db_file))
|
282
|
+
|
283
|
+
db.open
|
284
|
+
expect(db.check_db).to eql(2)
|
285
|
+
expect(db.check_db(true)).to eql(1)
|
286
|
+
db.close
|
287
|
+
db = PEROBS::FlatFileDB.new(@db_dir, { :log => $stderr,
|
288
|
+
:log_level => Logger::WARN })
|
289
|
+
db.open
|
290
|
+
expect(db.check_db).to eql(0)
|
291
|
+
|
292
|
+
0.upto(5) do |i|
|
293
|
+
expect(db.get_object(i + 1)).to eql("#{i + 1}:#{'X' * (i + 1) * 30}$")
|
294
|
+
end
|
295
|
+
db.close
|
296
|
+
end
|
297
|
+
|
268
298
|
end
|
269
299
|
|
@@ -29,13 +29,25 @@ require 'perobs/FuzzyStringMatcher'
|
|
29
29
|
|
30
30
|
module PEROBS
|
31
31
|
|
32
|
+
class WordRef < PEROBS::Object
|
33
|
+
|
34
|
+
attr_persist :word, :line
|
35
|
+
|
36
|
+
def initialize(store, word, line)
|
37
|
+
super(store)
|
38
|
+
self.word = word
|
39
|
+
self.line = line
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
32
44
|
describe FuzzyStringMatcher do
|
33
45
|
|
34
46
|
before(:all) do
|
35
47
|
@db_name = generate_db_name(__FILE__)
|
36
48
|
@store = PEROBS::Store.new(@db_name)
|
37
|
-
@fsm =
|
38
|
-
@fsm2 =
|
49
|
+
@store['fsm'] = @fsm = @store.new(FuzzyStringMatcher)
|
50
|
+
@store['fsm2'] = @fsm2 = @store.new(FuzzyStringMatcher, true, 2)
|
39
51
|
end
|
40
52
|
|
41
53
|
after(:all) do
|
@@ -103,6 +115,44 @@ module PEROBS
|
|
103
115
|
expect(@fsm.best_matches('foobar')).to eql([])
|
104
116
|
end
|
105
117
|
|
118
|
+
it 'should find a match' do
|
119
|
+
dut = {
|
120
|
+
[ 'one' ] => [ [ 'one', 1.0 ] ],
|
121
|
+
[ 'three' ] => [ [ 'three', 1.0 ] ],
|
122
|
+
[ 'four' ]=> [ [ 'four', 1.0 ], [ 'fourteen', 0.666 ] ],
|
123
|
+
[ 'four', 1.0 ]=> [ [ 'four', 1.0 ] ],
|
124
|
+
[ 'even' ] => [ [ 'seven', 0.666 ], [ 'eleven', 0.666 ] ],
|
125
|
+
[ 'teen' ] => [ ['thirteen', 0.6666666666666666],
|
126
|
+
['fourteen', 0.6666666666666666],
|
127
|
+
['fifteen', 0.6666666666666666],
|
128
|
+
['sixteen', 0.6666666666666666],
|
129
|
+
['seventeen', 0.6666666666666666],
|
130
|
+
['eighteen', 0.6666666666666666],
|
131
|
+
['nineteen', 0.6666666666666666] ],
|
132
|
+
[ 'aight' ] => [ [ 'eight', 0.5 ] ],
|
133
|
+
[ 'thirdteen' ] => [ [ 'thirteen', 0.5 ] ],
|
134
|
+
[ 'shirt teen', 0.3 ] => [ [ 'thirteen', 0.333 ] ]
|
135
|
+
}
|
136
|
+
check_data_under_test(@fsm, dut)
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should sort best to worst matches' do
|
140
|
+
@fsm.clear
|
141
|
+
%w( xbar xfoox foor bar foobar barfoo foo rab baar fool xbarx
|
142
|
+
foobarx xfoobarx foo_bar ).each do |w|
|
143
|
+
@fsm.learn(w, w)
|
144
|
+
end
|
145
|
+
dut = {
|
146
|
+
[ 'foo' ] => [["foo", 1.0], ["foor", 0.5], ["foobar", 0.5],
|
147
|
+
["fool", 0.5], ["foobarx", 0.5], ["foo_bar", 0.5],
|
148
|
+
["barfoo", 0.5]],
|
149
|
+
[ 'bar' ] => [["bar", 1.0], ["barfoo", 0.5], ["xbar", 0.5],
|
150
|
+
["foobar", 0.5], ["foo_bar", 0.5]],
|
151
|
+
[ 'foobar' ] => [["foobar", 1.0], ["foobarx", 0.8], ["xfoobarx", 0.6]]
|
152
|
+
}
|
153
|
+
check_data_under_test(@fsm, dut)
|
154
|
+
end
|
155
|
+
|
106
156
|
it 'should handle a larger text' do
|
107
157
|
text =<<-EOT
|
108
158
|
MIT License
|
@@ -131,9 +181,9 @@ EOT
|
|
131
181
|
@fsm2.learn(word, word)
|
132
182
|
end
|
133
183
|
stats = @fsm2.stats
|
134
|
-
expect(stats['dictionary_size']).to eql(
|
184
|
+
expect(stats['dictionary_size']).to eql(352)
|
135
185
|
expect(stats['max_list_size']).to eql(22)
|
136
|
-
expect(stats['avg_list_size']).to be_within(0.001).of(2.
|
186
|
+
expect(stats['avg_list_size']).to be_within(0.001).of(2.409)
|
137
187
|
end
|
138
188
|
|
139
189
|
it 'should find case sensitive matches' do
|
@@ -145,6 +195,46 @@ EOT
|
|
145
195
|
check_data_under_test(@fsm2, dut)
|
146
196
|
end
|
147
197
|
|
198
|
+
it 'should support references to PEROBS objects' do
|
199
|
+
text =<<-EOT
|
200
|
+
MIT License
|
201
|
+
|
202
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
203
|
+
a copy of this software and associated documentation files (the
|
204
|
+
"Software"), to deal in the Software without restriction, including
|
205
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
206
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
207
|
+
permit persons to whom the Software is furnished to do so, subject to
|
208
|
+
the following conditions:
|
209
|
+
EOT
|
210
|
+
|
211
|
+
line_no = 1
|
212
|
+
@store['fsm'] = fsm = @store.new(FuzzyStringMatcher)
|
213
|
+
@store['refs'] = refs = @store.new(Array)
|
214
|
+
text.each_line do |line|
|
215
|
+
line.split.each do |word|
|
216
|
+
ref = @store.new(WordRef, word, line_no)
|
217
|
+
refs << ref
|
218
|
+
fsm.learn(word, ref)
|
219
|
+
end
|
220
|
+
line_no += 1
|
221
|
+
end
|
222
|
+
|
223
|
+
found_lines = []
|
224
|
+
fsm.best_matches('SOFTWARE').each do |match|
|
225
|
+
found_lines << match[0].line
|
226
|
+
end
|
227
|
+
expect(found_lines.sort).to eql([ 4, 5, 5, 7, 8 ])
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should with small search words' do
|
231
|
+
@fsm.clear
|
232
|
+
mats = 'Yukihiro Matsumoto'
|
233
|
+
@fsm.learn(mats)
|
234
|
+
expect(@fsm.best_matches('Yukihiro').first.first).to eql(mats)
|
235
|
+
expect(@fsm.best_matches('Mats', 0.3).first.first).to eql(mats)
|
236
|
+
end
|
237
|
+
|
148
238
|
def check_data_under_test(fsm, dut)
|
149
239
|
dut.each do |inputs, reference|
|
150
240
|
key = inputs[0]
|
data/test/Hash_spec.rb
CHANGED
@@ -31,7 +31,6 @@ require 'spec_helper'
|
|
31
31
|
|
32
32
|
require 'perobs'
|
33
33
|
|
34
|
-
|
35
34
|
class PO < PEROBS::Object
|
36
35
|
|
37
36
|
attr_persist :name
|
@@ -68,9 +67,13 @@ describe PEROBS::Hash do
|
|
68
67
|
h['po'] = po = @store.new(PO)
|
69
68
|
po.name = 'foobar'
|
70
69
|
h['b'] = 'B'
|
70
|
+
@store['po_key'] = po_key = @store.new(PO)
|
71
|
+
po_key.name = 'po key'
|
72
|
+
h[po_key] = 'PO Key'
|
71
73
|
|
72
74
|
expect(h['a']).to eq('A')
|
73
75
|
expect(h['b']).to eq('B')
|
76
|
+
expect(h[@store['po_key']]).to eq('PO Key')
|
74
77
|
@store.exit
|
75
78
|
|
76
79
|
@store = PEROBS::Store.new(@db_name)
|
@@ -78,6 +81,14 @@ describe PEROBS::Hash do
|
|
78
81
|
expect(h['a']).to eq('A')
|
79
82
|
expect(h['b']).to eq('B')
|
80
83
|
expect(h['po'].name).to eq('foobar')
|
84
|
+
po_key = @store['po_key']
|
85
|
+
expect(po_key.name).to eq('po key')
|
86
|
+
expect(h[po_key]).to eq('PO Key')
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'should not allow hash keys that conflict with internal notations' do
|
90
|
+
@store['h'] = h = @store.new(PEROBS::Hash)
|
91
|
+
expect { h['#<PEROBS::POReference id=1234>'] = 'foo'; @store.sync }.to raise_error(ArgumentError)
|
81
92
|
end
|
82
93
|
|
83
94
|
it 'should have an each method to iterate' do
|
data/test/Store_spec.rb
CHANGED
@@ -251,6 +251,20 @@ describe PEROBS::Store do
|
|
251
251
|
end
|
252
252
|
expect(i).to eq(6)
|
253
253
|
|
254
|
+
capture_io { store.gc }
|
255
|
+
capture_io { expect { store.check }.to_not raise_error }
|
256
|
+
capture_io { store.exit }
|
257
|
+
|
258
|
+
store = PEROBS::Store.new(@db_file)
|
259
|
+
capture_io { expect { store.check }.to_not raise_error }
|
260
|
+
|
261
|
+
person = store['person1']
|
262
|
+
i = 0
|
263
|
+
while (person = person.related) do
|
264
|
+
i += 1
|
265
|
+
end
|
266
|
+
expect(i).to eq(6)
|
267
|
+
|
254
268
|
capture_io { store.gc }
|
255
269
|
capture_io { expect { store.check }.to_not raise_error }
|
256
270
|
expect { store.delete_store }.to_not raise_error
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perobs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.
|
4
|
+
version: 4.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Schlaeger
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -147,7 +147,7 @@ homepage: https://github.com/scrapper/perobs
|
|
147
147
|
licenses:
|
148
148
|
- MIT
|
149
149
|
metadata: {}
|
150
|
-
post_install_message:
|
150
|
+
post_install_message:
|
151
151
|
rdoc_options: []
|
152
152
|
require_paths:
|
153
153
|
- lib
|
@@ -162,9 +162,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
162
|
- !ruby/object:Gem::Version
|
163
163
|
version: '0'
|
164
164
|
requirements: []
|
165
|
-
|
166
|
-
|
167
|
-
signing_key:
|
165
|
+
rubygems_version: 3.2.3
|
166
|
+
signing_key:
|
168
167
|
specification_version: 4
|
169
168
|
summary: Persistent Ruby Object Store
|
170
169
|
test_files:
|