perobs 4.2.0 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/perobs.rb +1 -0
- data/lib/perobs/BTree.rb +2 -2
- data/lib/perobs/BTreeNode.rb +32 -15
- data/lib/perobs/FlatFile.rb +37 -56
- data/lib/perobs/FuzzyStringMatcher.rb +32 -49
- data/lib/perobs/Hash.rb +68 -23
- data/lib/perobs/IDListPageFile.rb +2 -1
- data/lib/perobs/ObjectBase.rb +7 -0
- data/lib/perobs/SpaceTree.rb +1 -1
- data/lib/perobs/Store.rb +1 -0
- data/lib/perobs/version.rb +1 -1
- data/test/FlatFileDB_spec.rb +30 -0
- data/test/FuzzyStringMatcher_spec.rb +94 -4
- data/test/Hash_spec.rb +12 -1
- data/test/Store_spec.rb +14 -0
- metadata +6 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a61fc945e0ef9f5ed6558080931d2acae42cc0401f375275684e4ee32fefe4f7
|
4
|
+
data.tar.gz: 4d864fdc0791aa78d8c180b4686ee825cd25e209284fca1966f144813c063280
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3834a9caae693d82837fb9f75141cb35e85f1a2c1439d1bb898f8578d9ae082f46deb0233d2b8a02d6ae6b0bf66862098b47ff571ff3d9a6b874fadaef6d23a
|
7
|
+
data.tar.gz: 883f1b5e553fae2be0039aa090d89bf6eb44ec1d0dc31488aeaa727ec8bc2844c9b722568cd7dde0b33c507121afd11e720154d4ff27e66aa3ac3812d5603954
|
data/README.md
CHANGED
data/lib/perobs.rb
CHANGED
data/lib/perobs/BTree.rb
CHANGED
@@ -70,7 +70,7 @@ module PEROBS
|
|
70
70
|
@nodes.register_custom_data('first_leaf')
|
71
71
|
@nodes.register_custom_data('last_leaf')
|
72
72
|
@nodes.register_custom_data('btree_size')
|
73
|
-
@node_cache = PersistentObjectCache.new(2**
|
73
|
+
@node_cache = PersistentObjectCache.new(2**13, 2**13, BTreeNode, self)
|
74
74
|
@root = @first_leaf = @last_leaf = nil
|
75
75
|
@size = 0
|
76
76
|
|
@@ -190,7 +190,7 @@ module PEROBS
|
|
190
190
|
"Number of leave nodes: #{stats.leave_nodes}; " +
|
191
191
|
"Number of leaves: #{stats.leaves}"
|
192
192
|
|
193
|
-
|
193
|
+
true
|
194
194
|
end
|
195
195
|
|
196
196
|
# Register a new node as root node of the tree.
|
data/lib/perobs/BTreeNode.rb
CHANGED
@@ -59,7 +59,7 @@ module PEROBS
|
|
59
59
|
# if not
|
60
60
|
def initialize(tree, node_address = nil, parent = nil, is_leaf = true,
|
61
61
|
prev_sibling = nil, next_sibling = nil,
|
62
|
-
keys =
|
62
|
+
keys = nil, values = nil, children = nil)
|
63
63
|
@tree = tree
|
64
64
|
if node_address == 0
|
65
65
|
PEROBS.log.fatal "Node address may not be 0"
|
@@ -68,13 +68,13 @@ module PEROBS
|
|
68
68
|
@parent = link(parent)
|
69
69
|
@prev_sibling = link(prev_sibling)
|
70
70
|
@next_sibling = link(next_sibling)
|
71
|
-
@keys = keys
|
71
|
+
@keys = keys || []
|
72
72
|
if (@is_leaf = is_leaf)
|
73
|
-
@values = values
|
74
|
-
@children =
|
73
|
+
@values = values || []
|
74
|
+
@children = nil
|
75
75
|
else
|
76
|
-
@children = children
|
77
|
-
@values =
|
76
|
+
@children = children || []
|
77
|
+
@values = nil
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -585,11 +585,11 @@ module PEROBS
|
|
585
585
|
end
|
586
586
|
|
587
587
|
def trim(idx)
|
588
|
-
@keys
|
588
|
+
@keys.slice!(idx, @keys.length - idx)
|
589
589
|
if @is_leaf
|
590
|
-
@values
|
590
|
+
@values.slice!(idx, @values.length - idx)
|
591
591
|
else
|
592
|
-
@children
|
592
|
+
@children.slice!(idx + 1, @children.length - idx - 1)
|
593
593
|
end
|
594
594
|
@tree.node_cache.insert(self)
|
595
595
|
end
|
@@ -654,13 +654,18 @@ module PEROBS
|
|
654
654
|
# @yield [key, value]
|
655
655
|
# @return [nil or Hash] nil in case of errors or a hash with some
|
656
656
|
# statistical information about the tree
|
657
|
-
def check
|
657
|
+
def check(&block)
|
658
658
|
stats = Stats.new(nil, 0, 0, 0)
|
659
659
|
|
660
660
|
traverse do |node, position, stack|
|
661
661
|
if position == 0
|
662
662
|
stats.nodes_count += 1
|
663
663
|
if node.parent
|
664
|
+
unless node.parent.is_a?(BTreeNodeLink)
|
665
|
+
node.error "parent is a #{node.parent.class} instead of a " +
|
666
|
+
"BTreeNodeLink"
|
667
|
+
return nil
|
668
|
+
end
|
664
669
|
# After a split the nodes will only have half the maximum keys.
|
665
670
|
# For branch nodes one of the split nodes will have even 1 key
|
666
671
|
# less as this will become the branch key in a parent node.
|
@@ -695,6 +700,16 @@ module PEROBS
|
|
695
700
|
else
|
696
701
|
stats.branch_depth = node.tree_level
|
697
702
|
end
|
703
|
+
if node.prev_sibling && !node.prev_sibling.is_a?(BTreeNodeLink)
|
704
|
+
node.error "prev_sibling is a #{node.prev_sibling.class} " +
|
705
|
+
"instead of a BTreeNodeLink"
|
706
|
+
return nil
|
707
|
+
end
|
708
|
+
if node.next_sibling && !node.next_sibling.is_a?(BTreeNodeLink)
|
709
|
+
node.error "next_sibling is a #{node.next_sibling.class} " +
|
710
|
+
"instead of a BTreeNodeLink"
|
711
|
+
return nil
|
712
|
+
end
|
698
713
|
if node.prev_sibling.nil? && @tree.first_leaf != node
|
699
714
|
node.error "Leaf node #{node.node_address} has no previous " +
|
700
715
|
"sibling but is not the first leaf of the tree"
|
@@ -708,9 +723,9 @@ module PEROBS
|
|
708
723
|
unless node.keys.size == node.values.size
|
709
724
|
node.error "Key count (#{node.keys.size}) and value " +
|
710
725
|
"count (#{node.values.size}) don't match"
|
711
|
-
|
726
|
+
return nil
|
712
727
|
end
|
713
|
-
unless node.children.
|
728
|
+
unless node.children.nil?
|
714
729
|
node.error "@children must be nil for a leaf node"
|
715
730
|
return nil
|
716
731
|
end
|
@@ -718,14 +733,14 @@ module PEROBS
|
|
718
733
|
stats.leave_nodes += 1
|
719
734
|
stats.leaves += node.keys.length
|
720
735
|
else
|
721
|
-
unless node.values.
|
736
|
+
unless node.values.nil?
|
722
737
|
node.error "@values must be nil for a branch node"
|
723
738
|
return nil
|
724
739
|
end
|
725
740
|
unless node.children.size == node.keys.size + 1
|
726
741
|
node.error "Key count (#{node.keys.size}) must be one " +
|
727
742
|
"less than children count (#{node.children.size})"
|
728
|
-
|
743
|
+
return nil
|
729
744
|
end
|
730
745
|
node.children.each_with_index do |child, i|
|
731
746
|
unless child.is_a?(BTreeNodeLink)
|
@@ -789,7 +804,9 @@ module PEROBS
|
|
789
804
|
else
|
790
805
|
if block_given?
|
791
806
|
# If a block was given, call this block with the key and value.
|
792
|
-
|
807
|
+
unless yield(node.keys[index], node.values[index])
|
808
|
+
return nil
|
809
|
+
end
|
793
810
|
end
|
794
811
|
end
|
795
812
|
end
|
data/lib/perobs/FlatFile.rb
CHANGED
@@ -293,7 +293,7 @@ module PEROBS
|
|
293
293
|
header = FlatFileBlobHeader.read(@f, addr, id)
|
294
294
|
if header.id != id
|
295
295
|
PEROBS.log.fatal "Database index corrupted: Index for object " +
|
296
|
-
"#{id} points to object with ID #{header.id}"
|
296
|
+
"#{id} points to object with ID #{header.id} at address #{addr}"
|
297
297
|
end
|
298
298
|
|
299
299
|
buf = nil
|
@@ -302,7 +302,8 @@ module PEROBS
|
|
302
302
|
@f.seek(addr + FlatFileBlobHeader::LENGTH)
|
303
303
|
buf = @f.read(header.length)
|
304
304
|
rescue IOError => e
|
305
|
-
PEROBS.log.fatal "Cannot read blob for ID #{id}
|
305
|
+
PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
|
306
|
+
e.message
|
306
307
|
end
|
307
308
|
|
308
309
|
# Uncompress the data if the compression bit is set in the flags byte.
|
@@ -311,12 +312,13 @@ module PEROBS
|
|
311
312
|
buf = Zlib.inflate(buf)
|
312
313
|
rescue Zlib::BufError, Zlib::DataError
|
313
314
|
PEROBS.log.fatal "Corrupted compressed block with ID " +
|
314
|
-
"#{
|
315
|
+
"#{id} found at address #{addr}."
|
315
316
|
end
|
316
317
|
end
|
317
318
|
|
318
319
|
if checksum(buf) != header.crc
|
319
|
-
PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
|
320
|
+
PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
|
321
|
+
"at address #{addr}"
|
320
322
|
end
|
321
323
|
|
322
324
|
buf
|
@@ -339,7 +341,7 @@ module PEROBS
|
|
339
341
|
if @marks
|
340
342
|
@marks.clear
|
341
343
|
else
|
342
|
-
@marks = IDList.new(@db_dir, 'marks',
|
344
|
+
@marks = IDList.new(@db_dir, 'marks', item_counter)
|
343
345
|
end
|
344
346
|
end
|
345
347
|
|
@@ -452,16 +454,14 @@ module PEROBS
|
|
452
454
|
regenerate_index_and_spaces
|
453
455
|
end
|
454
456
|
|
455
|
-
# Check
|
456
|
-
# @param repair [Boolean] True if errors should be fixed.
|
457
|
+
# Check the FlatFile.
|
457
458
|
# @return [Integer] Number of errors found
|
458
|
-
def check(
|
459
|
+
def check()
|
459
460
|
errors = 0
|
460
461
|
return errors unless @f
|
461
462
|
|
462
463
|
t = Time.now
|
463
|
-
PEROBS.log.info "Checking FlatFile database"
|
464
|
-
"#{repair ? ' in repair mode' : ''}..."
|
464
|
+
PEROBS.log.info "Checking FlatFile database..."
|
465
465
|
|
466
466
|
# First check the database blob file. Each entry should be readable and
|
467
467
|
# correct and all IDs must be unique. We use a shadow index to keep
|
@@ -483,7 +483,6 @@ module PEROBS
|
|
483
483
|
if buf.bytesize != header.length
|
484
484
|
PEROBS.log.error "Premature end of file in blob with ID " +
|
485
485
|
"#{header.id}."
|
486
|
-
discard_damaged_blob(header) if repair
|
487
486
|
errors += 1
|
488
487
|
next
|
489
488
|
end
|
@@ -496,7 +495,6 @@ module PEROBS
|
|
496
495
|
rescue Zlib::BufError, Zlib::DataError
|
497
496
|
PEROBS.log.error "Corrupted compressed block with ID " +
|
498
497
|
"#{header.id} found."
|
499
|
-
discard_damaged_blob(header) if repair
|
500
498
|
errors += 1
|
501
499
|
next
|
502
500
|
end
|
@@ -505,7 +503,6 @@ module PEROBS
|
|
505
503
|
if header.crc && checksum(buf) != header.crc
|
506
504
|
PEROBS.log.error "Checksum failure while checking blob " +
|
507
505
|
"with ID #{header.id}"
|
508
|
-
discard_damaged_blob(header) if repair
|
509
506
|
errors += 1
|
510
507
|
next
|
511
508
|
end
|
@@ -521,22 +518,6 @@ module PEROBS
|
|
521
518
|
errors += 1
|
522
519
|
previous_header = FlatFileBlobHeader.read(@f, previous_address,
|
523
520
|
header.id)
|
524
|
-
if repair
|
525
|
-
# We have two blobs with the same ID and we must discard one of
|
526
|
-
# them.
|
527
|
-
if header.is_outdated?
|
528
|
-
discard_damaged_blob(header)
|
529
|
-
elsif previous_header.is_outdated?
|
530
|
-
discard_damaged_blob(previous_header)
|
531
|
-
else
|
532
|
-
PEROBS.log.error "None of the blobs with same ID have " +
|
533
|
-
"the outdated flag set. Deleting the smaller one."
|
534
|
-
errors += 1
|
535
|
-
discard_damaged_blob(header.length < previous_header.length ?
|
536
|
-
header : previous_header)
|
537
|
-
end
|
538
|
-
next
|
539
|
-
end
|
540
521
|
else
|
541
522
|
# ID is unique so far. Add it to the shadow index.
|
542
523
|
new_index.insert(header.id, header.addr)
|
@@ -553,12 +534,6 @@ module PEROBS
|
|
553
534
|
PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
|
554
535
|
'bytes found at the end of FlatFile.'
|
555
536
|
corrupted_blobs += 1
|
556
|
-
if repair
|
557
|
-
PEROBS.log.error "Truncating FlatFile to " +
|
558
|
-
"#{end_of_last_healthy_blob} bytes by discarding " +
|
559
|
-
"#{@f.size - end_of_last_healthy_blob} bytes"
|
560
|
-
@f.truncate(end_of_last_healthy_blob)
|
561
|
-
end
|
562
537
|
end
|
563
538
|
|
564
539
|
errors += corrupted_blobs
|
@@ -568,17 +543,19 @@ module PEROBS
|
|
568
543
|
new_index.close
|
569
544
|
new_index.erase
|
570
545
|
|
571
|
-
if
|
572
|
-
erase_index_files
|
573
|
-
defragmentize
|
574
|
-
regenerate_index_and_spaces
|
575
|
-
elsif corrupted_blobs == 0
|
546
|
+
if corrupted_blobs == 0
|
576
547
|
# Now we check the index data. It must be correct and the entries must
|
577
548
|
# match the blob file. All entries in the index must be in the blob file
|
578
549
|
# and vise versa.
|
579
550
|
begin
|
580
551
|
index_ok = @index.check do |id, address|
|
581
|
-
has_id_at?(id, address)
|
552
|
+
unless has_id_at?(id, address)
|
553
|
+
PEROBS.log.error "Index contains an entry for " +
|
554
|
+
"ID #{id} at address #{address} that is not in FlatFile"
|
555
|
+
false
|
556
|
+
else
|
557
|
+
true
|
558
|
+
end
|
582
559
|
end
|
583
560
|
x_check_errs = 0
|
584
561
|
space_check_ok = true
|
@@ -586,16 +563,13 @@ module PEROBS
|
|
586
563
|
(x_check_errs = cross_check_entries) == 0
|
587
564
|
errors += 1 unless index_ok && space_check_ok
|
588
565
|
errors += x_check_errs
|
589
|
-
regenerate_index_and_spaces if repair
|
590
566
|
end
|
591
567
|
rescue PEROBS::FatalError
|
592
568
|
errors += 1
|
593
|
-
regenerate_index_and_spaces if repair
|
594
569
|
end
|
595
570
|
end
|
596
571
|
|
597
|
-
|
598
|
-
PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
|
572
|
+
PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
|
599
573
|
"#{errors} errors found."
|
600
574
|
|
601
575
|
errors
|
@@ -687,17 +661,7 @@ module PEROBS
|
|
687
661
|
header.id)
|
688
662
|
# We have two blobs with the same ID and we must discard one of
|
689
663
|
# them.
|
690
|
-
|
691
|
-
discard_damaged_blob(header)
|
692
|
-
elsif previous_header.is_outdated?
|
693
|
-
discard_damaged_blob(previous_header)
|
694
|
-
else
|
695
|
-
PEROBS.log.error "None of the blobs with same ID have " +
|
696
|
-
"the outdated flag set. Deleting the smaller one."
|
697
|
-
errors += 1
|
698
|
-
discard_damaged_blob(header.length < previous_header.length ?
|
699
|
-
header : previous_header)
|
700
|
-
end
|
664
|
+
discard_duplicate_blobs(header, previous_header)
|
701
665
|
else
|
702
666
|
# ID is unique so far. Add it to the shadow index.
|
703
667
|
@index.insert(header.id, header.addr)
|
@@ -927,6 +891,23 @@ module PEROBS
|
|
927
891
|
header.clear_flags
|
928
892
|
end
|
929
893
|
|
894
|
+
def discard_duplicate_blobs(header, previous_header)
|
895
|
+
if header.is_outdated?
|
896
|
+
discard_damaged_blob(header)
|
897
|
+
elsif previous_header.is_outdated?
|
898
|
+
discard_damaged_blob(previous_header)
|
899
|
+
else
|
900
|
+
smaller, larger = header.length < previous_header.length ?
|
901
|
+
[ header, previous_header ] : [ previous_header, header ]
|
902
|
+
PEROBS.log.error "None of the blobs with same ID have " +
|
903
|
+
"the outdated flag set. Deleting the smaller one " +
|
904
|
+
"at address #{smaller.addr}"
|
905
|
+
discard_damaged_blob(smaller)
|
906
|
+
@space_list.add_space(smaller.addr, smaller.length)
|
907
|
+
@index.insert(larger.id, larger.addr)
|
908
|
+
end
|
909
|
+
end
|
910
|
+
|
930
911
|
def open_index_files(abort_on_missing_files = false)
|
931
912
|
begin
|
932
913
|
@index.open(abort_on_missing_files)
|
@@ -26,40 +26,42 @@
|
|
26
26
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
27
|
|
28
28
|
require 'perobs/Log'
|
29
|
-
require 'perobs/
|
29
|
+
require 'perobs/Object'
|
30
30
|
|
31
31
|
module PEROBS
|
32
32
|
|
33
33
|
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
34
|
# against a known set of strings. The dictionary of known strings does not
|
35
|
-
# store the actual strings but references to
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modified already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
40
43
|
|
41
44
|
# Create a new FuzzyStringMatcher.
|
42
|
-
# @param
|
43
|
-
# @param name [String] Unique name of the string matcher
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
44
46
|
# @param case_sensitive [Boolean] True if case matters for matching
|
45
47
|
# @param n [Integer] Determines what kind of n-gramm is used to store the
|
46
48
|
# references in the dictionary. It also determines the minimum word
|
47
|
-
# length that can be used for fuzzy matches.
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
|
52
|
+
super(p)
|
51
53
|
if n < 2 || n > 10
|
52
54
|
raise ArgumentError, 'n must be between 2 and 10'
|
53
55
|
end
|
54
|
-
|
55
|
-
|
56
|
+
self.case_sensitive = case_sensitive
|
57
|
+
self.n = n
|
56
58
|
|
57
|
-
clear unless
|
59
|
+
clear unless @dict
|
58
60
|
end
|
59
61
|
|
60
62
|
# Wipe the dictionary.
|
61
63
|
def clear
|
62
|
-
|
64
|
+
self.dict = @store.new(BigHash)
|
63
65
|
end
|
64
66
|
|
65
67
|
# Add a string with its reference to the dictionary.
|
@@ -79,11 +81,8 @@ module PEROBS
|
|
79
81
|
@dict[n_gramm] = ng_list = @store.new(Hash)
|
80
82
|
end
|
81
83
|
|
82
|
-
|
83
|
-
|
84
|
-
else
|
85
|
-
ng_list[reference] = 0
|
86
|
-
end
|
84
|
+
# We use the Hash as a Set. The value doesn't matter.
|
85
|
+
ng_list[reference] = true unless ng_list.include?(reference)
|
87
86
|
end
|
88
87
|
|
89
88
|
nil
|
@@ -109,22 +108,12 @@ module PEROBS
|
|
109
108
|
|
110
109
|
matches = {}
|
111
110
|
|
112
|
-
# This will be the best possible score for a perfect match.
|
113
|
-
best_possible_score = 0
|
114
111
|
each_n_gramm(string) do |n_gramm|
|
115
|
-
best_possible_score += 1
|
116
112
|
if (ng_list = @dict[n_gramm])
|
117
|
-
ng_list.each do |reference,
|
113
|
+
ng_list.each do |reference, dummy|
|
118
114
|
if matches.include?(reference)
|
119
115
|
matches[reference] += 1
|
120
116
|
else
|
121
|
-
# We use internally a 10 times larger list so that we don't
|
122
|
-
# throw away good matches too early. If the max_count value is
|
123
|
-
# chosen too small there is a risk of not finding the best
|
124
|
-
# matches!
|
125
|
-
if matches.size > 10 * max_count
|
126
|
-
matches = discard_worst_match(matches)
|
127
|
-
end
|
128
117
|
matches[reference] = 1
|
129
118
|
end
|
130
119
|
end
|
@@ -133,19 +122,23 @@ module PEROBS
|
|
133
122
|
|
134
123
|
return [] if matches.empty?
|
135
124
|
|
136
|
-
|
137
|
-
match_list = matches.to_a.sort do |a, b|
|
138
|
-
b[1] <=> a[1]
|
139
|
-
end
|
125
|
+
match_list = matches.to_a
|
140
126
|
|
141
127
|
# Set occurance counters to scores relative to the best possible score.
|
128
|
+
# This will be the best possible score for a perfect match.
|
129
|
+
best_possible_score = string.length - @n + 1
|
142
130
|
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
|
143
131
|
|
144
|
-
# Delete all matches that
|
145
|
-
# top match.
|
132
|
+
# Delete all matches that don't have the required minimum match score.
|
146
133
|
match_list.delete_if { |a| a[1] < min_score }
|
147
134
|
|
148
|
-
|
135
|
+
# Sort the list best to worst match
|
136
|
+
match_list.sort! do |a, b|
|
137
|
+
b[1] <=> a[1]
|
138
|
+
end
|
139
|
+
|
140
|
+
# Return the top max_count matches.
|
141
|
+
match_list[0..max_count - 1]
|
149
142
|
end
|
150
143
|
|
151
144
|
# Returns some internal stats about the dictionary.
|
@@ -176,16 +169,6 @@ module PEROBS
|
|
176
169
|
end
|
177
170
|
end
|
178
171
|
|
179
|
-
def discard_worst_match(matches)
|
180
|
-
# Sort in the order of occurance count downwards.
|
181
|
-
match_list = matches.to_a.sort do |a, b|
|
182
|
-
b[1] <=> a[1]
|
183
|
-
end
|
184
|
-
# Discard the lowest half of the matches
|
185
|
-
match_list = match_list[0..match_list.length / 2]
|
186
|
-
match_list.to_h
|
187
|
-
end
|
188
|
-
|
189
172
|
end
|
190
173
|
|
191
174
|
end
|
data/lib/perobs/Hash.rb
CHANGED
@@ -124,9 +124,9 @@ module PEROBS
|
|
124
124
|
|
125
125
|
# Proxy for assignment method.
|
126
126
|
def []=(key, value)
|
127
|
-
unless key.is_a?(String)
|
128
|
-
raise ArgumentError, "PEROBS::Hash[] key must be a String
|
129
|
-
"#{key.class}"
|
127
|
+
unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
|
128
|
+
raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
|
129
|
+
"a PEROBS object but is a #{key.class}"
|
130
130
|
end
|
131
131
|
_check_assignment_value(value)
|
132
132
|
@store.cache.cache_write(self)
|
@@ -143,18 +143,33 @@ module PEROBS
|
|
143
143
|
# is referencing.
|
144
144
|
# @return [Array of Integer] IDs of referenced objects
|
145
145
|
def _referenced_object_ids
|
146
|
-
|
147
|
-
|
146
|
+
ids = []
|
147
|
+
@data.each do |k, v|
|
148
|
+
if k && k.respond_to?(:is_poxreference?)
|
149
|
+
ids << k.id
|
150
|
+
end
|
151
|
+
if v && v.respond_to?(:is_poxreference?)
|
152
|
+
ids << v.id
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
ids
|
148
157
|
end
|
149
158
|
|
150
159
|
# This method should only be used during store repair operations. It will
|
151
160
|
# delete all referenced to the given object ID.
|
152
161
|
# @param id [Integer] targeted object ID
|
153
162
|
def _delete_reference_to_id(id)
|
163
|
+
original_length = @data.length
|
164
|
+
|
154
165
|
@data.delete_if do |k, v|
|
155
|
-
|
166
|
+
(k && k.respond_to?(:is_poxreference?) && k.id == id) ||
|
167
|
+
(v && v.respond_to?(:is_poxreference?) && v.id == id)
|
168
|
+
end
|
169
|
+
|
170
|
+
if @data.length != original_length
|
171
|
+
@store.cache.cache_write(self)
|
156
172
|
end
|
157
|
-
@store.cache.cache_write(self)
|
158
173
|
end
|
159
174
|
|
160
175
|
# Restore the persistent data from a single data structure.
|
@@ -163,8 +178,18 @@ module PEROBS
|
|
163
178
|
# @private
|
164
179
|
def _deserialize(data)
|
165
180
|
@data = {}
|
166
|
-
|
167
|
-
|
181
|
+
|
182
|
+
data.each do |k, v|
|
183
|
+
# References to other PEROBS Objects are marshalled with our own
|
184
|
+
# format. If we detect such a marshalled String we convert it into a
|
185
|
+
# POXReference object.
|
186
|
+
if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
|
187
|
+
k = POXReference.new(@store, match[1].to_i)
|
188
|
+
end
|
189
|
+
dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
|
190
|
+
@data[k] = dv
|
191
|
+
end
|
192
|
+
|
168
193
|
@data
|
169
194
|
end
|
170
195
|
|
@@ -185,26 +210,46 @@ module PEROBS
|
|
185
210
|
data = {}
|
186
211
|
|
187
212
|
@data.each do |k, v|
|
188
|
-
if
|
189
|
-
|
190
|
-
|
191
|
-
#
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
v.inspect
|
200
|
-
end
|
201
|
-
data[k] = v
|
213
|
+
if k.respond_to?(:is_poxreference?)
|
214
|
+
# JSON only supports Strings as hash keys. Since JSON is the default
|
215
|
+
# internal storage format in the database, we have to marshall
|
216
|
+
# PEROBS::Object references ourselves.
|
217
|
+
k = "#<PEROBS::POReference id=#{k.id}>"
|
218
|
+
elsif k[0..24] == '#<PEROBS::POReference id='
|
219
|
+
# This could obviously result in conflicts with 'normal' String hash
|
220
|
+
# keys. This is extremely unlikely, but we better catch this case
|
221
|
+
# before it causes hard to debug trouble.
|
222
|
+
raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
|
223
|
+
"internal representation of marshalled hash keys!"
|
202
224
|
end
|
225
|
+
data[k] = serialize_helper(v)
|
203
226
|
end
|
204
227
|
|
205
228
|
data
|
206
229
|
end
|
207
230
|
|
231
|
+
def serialize_helper(v)
|
232
|
+
if v.respond_to?(:is_poxreference?)
|
233
|
+
# References to other PEROBS objects (POXReference) are stored as
|
234
|
+
# POReference in the database.
|
235
|
+
return POReference.new(v.id)
|
236
|
+
else
|
237
|
+
# Outside of the PEROBS library all PEROBS::ObjectBase derived
|
238
|
+
# objects should not be used directly. The library only exposes them
|
239
|
+
# via POXReference proxy objects.
|
240
|
+
if v.is_a?(ObjectBase)
|
241
|
+
PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
|
242
|
+
"It is stored in a PEROBS::Hash. " +
|
243
|
+
'Have you used self() instead of myself() to ' +
|
244
|
+
"get the reference of this PEROBS object?\n" +
|
245
|
+
v.inspect
|
246
|
+
end
|
247
|
+
|
248
|
+
# All other objects are serialized by their native methods.
|
249
|
+
return v
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
208
253
|
end
|
209
254
|
|
210
255
|
end
|
@@ -54,7 +54,8 @@ module PEROBS
|
|
54
54
|
@file_name = File.join(dir, name + '.cache')
|
55
55
|
@page_size = page_size
|
56
56
|
open
|
57
|
-
@pages = PersistentObjectCache.new(max_in_memory,
|
57
|
+
@pages = PersistentObjectCache.new(max_in_memory, max_in_memory,
|
58
|
+
IDListPage, self)
|
58
59
|
@page_counter = 0
|
59
60
|
end
|
60
61
|
|
data/lib/perobs/ObjectBase.rb
CHANGED
@@ -102,6 +102,13 @@ module PEROBS
|
|
102
102
|
end
|
103
103
|
end
|
104
104
|
|
105
|
+
# To allow POXReference objects to be used as Hash keys we need to
|
106
|
+
# implement this function. Conveniently, we can just use the PEROBS object
|
107
|
+
# ID since that is unique.
|
108
|
+
def hash
|
109
|
+
@id
|
110
|
+
end
|
111
|
+
|
105
112
|
# Shortcut to access the _id() method of the referenced object.
|
106
113
|
def _id
|
107
114
|
@id
|
data/lib/perobs/SpaceTree.rb
CHANGED
@@ -54,7 +54,7 @@ module PEROBS
|
|
54
54
|
|
55
55
|
# Benchmark runs showed a cache size of 128 to be a good compromise
|
56
56
|
# between read and write performance trade-offs and memory consumption.
|
57
|
-
@cache = PersistentObjectCache.new(256,
|
57
|
+
@cache = PersistentObjectCache.new(256, 256, SpaceTreeNode, self)
|
58
58
|
end
|
59
59
|
|
60
60
|
# Open the SpaceTree file.
|
data/lib/perobs/Store.rb
CHANGED
data/lib/perobs/version.rb
CHANGED
data/test/FlatFileDB_spec.rb
CHANGED
@@ -265,5 +265,35 @@ describe PEROBS::FlatFileDB do
|
|
265
265
|
db.close
|
266
266
|
end
|
267
267
|
|
268
|
+
it 'should handle duplicate entries for the same ID in database.blobs file' do
|
269
|
+
@store.exit
|
270
|
+
|
271
|
+
db = PEROBS::FlatFileDB.new(@db_dir)
|
272
|
+
db_file = File.join(@db_dir, 'database.blobs')
|
273
|
+
db.open
|
274
|
+
0.upto(5) do |i|
|
275
|
+
db.put_object("#{i + 1}:#{'X' * (i + 1) * 30}$", i + 1)
|
276
|
+
end
|
277
|
+
db.close
|
278
|
+
|
279
|
+
# This appends the entry 2 again
|
280
|
+
blob2 = File.read(db_file, 319 - 199, 199)
|
281
|
+
File.write(db_file, blob2, File.size(db_file))
|
282
|
+
|
283
|
+
db.open
|
284
|
+
expect(db.check_db).to eql(2)
|
285
|
+
expect(db.check_db(true)).to eql(1)
|
286
|
+
db.close
|
287
|
+
db = PEROBS::FlatFileDB.new(@db_dir, { :log => $stderr,
|
288
|
+
:log_level => Logger::WARN })
|
289
|
+
db.open
|
290
|
+
expect(db.check_db).to eql(0)
|
291
|
+
|
292
|
+
0.upto(5) do |i|
|
293
|
+
expect(db.get_object(i + 1)).to eql("#{i + 1}:#{'X' * (i + 1) * 30}$")
|
294
|
+
end
|
295
|
+
db.close
|
296
|
+
end
|
297
|
+
|
268
298
|
end
|
269
299
|
|
@@ -29,13 +29,25 @@ require 'perobs/FuzzyStringMatcher'
|
|
29
29
|
|
30
30
|
module PEROBS
|
31
31
|
|
32
|
+
class WordRef < PEROBS::Object
|
33
|
+
|
34
|
+
attr_persist :word, :line
|
35
|
+
|
36
|
+
def initialize(store, word, line)
|
37
|
+
super(store)
|
38
|
+
self.word = word
|
39
|
+
self.line = line
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
32
44
|
describe FuzzyStringMatcher do
|
33
45
|
|
34
46
|
before(:all) do
|
35
47
|
@db_name = generate_db_name(__FILE__)
|
36
48
|
@store = PEROBS::Store.new(@db_name)
|
37
|
-
@fsm =
|
38
|
-
@fsm2 =
|
49
|
+
@store['fsm'] = @fsm = @store.new(FuzzyStringMatcher)
|
50
|
+
@store['fsm2'] = @fsm2 = @store.new(FuzzyStringMatcher, true, 2)
|
39
51
|
end
|
40
52
|
|
41
53
|
after(:all) do
|
@@ -103,6 +115,44 @@ module PEROBS
|
|
103
115
|
expect(@fsm.best_matches('foobar')).to eql([])
|
104
116
|
end
|
105
117
|
|
118
|
+
it 'should find a match' do
|
119
|
+
dut = {
|
120
|
+
[ 'one' ] => [ [ 'one', 1.0 ] ],
|
121
|
+
[ 'three' ] => [ [ 'three', 1.0 ] ],
|
122
|
+
[ 'four' ]=> [ [ 'four', 1.0 ], [ 'fourteen', 0.666 ] ],
|
123
|
+
[ 'four', 1.0 ]=> [ [ 'four', 1.0 ] ],
|
124
|
+
[ 'even' ] => [ [ 'seven', 0.666 ], [ 'eleven', 0.666 ] ],
|
125
|
+
[ 'teen' ] => [ ['thirteen', 0.6666666666666666],
|
126
|
+
['fourteen', 0.6666666666666666],
|
127
|
+
['fifteen', 0.6666666666666666],
|
128
|
+
['sixteen', 0.6666666666666666],
|
129
|
+
['seventeen', 0.6666666666666666],
|
130
|
+
['eighteen', 0.6666666666666666],
|
131
|
+
['nineteen', 0.6666666666666666] ],
|
132
|
+
[ 'aight' ] => [ [ 'eight', 0.5 ] ],
|
133
|
+
[ 'thirdteen' ] => [ [ 'thirteen', 0.5 ] ],
|
134
|
+
[ 'shirt teen', 0.3 ] => [ [ 'thirteen', 0.333 ] ]
|
135
|
+
}
|
136
|
+
check_data_under_test(@fsm, dut)
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should sort best to worst matches' do
|
140
|
+
@fsm.clear
|
141
|
+
%w( xbar xfoox foor bar foobar barfoo foo rab baar fool xbarx
|
142
|
+
foobarx xfoobarx foo_bar ).each do |w|
|
143
|
+
@fsm.learn(w, w)
|
144
|
+
end
|
145
|
+
dut = {
|
146
|
+
[ 'foo' ] => [["foo", 1.0], ["foor", 0.5], ["foobar", 0.5],
|
147
|
+
["fool", 0.5], ["foobarx", 0.5], ["foo_bar", 0.5],
|
148
|
+
["barfoo", 0.5]],
|
149
|
+
[ 'bar' ] => [["bar", 1.0], ["barfoo", 0.5], ["xbar", 0.5],
|
150
|
+
["foobar", 0.5], ["foo_bar", 0.5]],
|
151
|
+
[ 'foobar' ] => [["foobar", 1.0], ["foobarx", 0.8], ["xfoobarx", 0.6]]
|
152
|
+
}
|
153
|
+
check_data_under_test(@fsm, dut)
|
154
|
+
end
|
155
|
+
|
106
156
|
it 'should handle a larger text' do
|
107
157
|
text =<<-EOT
|
108
158
|
MIT License
|
@@ -131,9 +181,9 @@ EOT
|
|
131
181
|
@fsm2.learn(word, word)
|
132
182
|
end
|
133
183
|
stats = @fsm2.stats
|
134
|
-
expect(stats['dictionary_size']).to eql(
|
184
|
+
expect(stats['dictionary_size']).to eql(352)
|
135
185
|
expect(stats['max_list_size']).to eql(22)
|
136
|
-
expect(stats['avg_list_size']).to be_within(0.001).of(2.
|
186
|
+
expect(stats['avg_list_size']).to be_within(0.001).of(2.409)
|
137
187
|
end
|
138
188
|
|
139
189
|
it 'should find case sensitive matches' do
|
@@ -145,6 +195,46 @@ EOT
|
|
145
195
|
check_data_under_test(@fsm2, dut)
|
146
196
|
end
|
147
197
|
|
198
|
+
it 'should support references to PEROBS objects' do
|
199
|
+
text =<<-EOT
|
200
|
+
MIT License
|
201
|
+
|
202
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
203
|
+
a copy of this software and associated documentation files (the
|
204
|
+
"Software"), to deal in the Software without restriction, including
|
205
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
206
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
207
|
+
permit persons to whom the Software is furnished to do so, subject to
|
208
|
+
the following conditions:
|
209
|
+
EOT
|
210
|
+
|
211
|
+
line_no = 1
|
212
|
+
@store['fsm'] = fsm = @store.new(FuzzyStringMatcher)
|
213
|
+
@store['refs'] = refs = @store.new(Array)
|
214
|
+
text.each_line do |line|
|
215
|
+
line.split.each do |word|
|
216
|
+
ref = @store.new(WordRef, word, line_no)
|
217
|
+
refs << ref
|
218
|
+
fsm.learn(word, ref)
|
219
|
+
end
|
220
|
+
line_no += 1
|
221
|
+
end
|
222
|
+
|
223
|
+
found_lines = []
|
224
|
+
fsm.best_matches('SOFTWARE').each do |match|
|
225
|
+
found_lines << match[0].line
|
226
|
+
end
|
227
|
+
expect(found_lines.sort).to eql([ 4, 5, 5, 7, 8 ])
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should with small search words' do
|
231
|
+
@fsm.clear
|
232
|
+
mats = 'Yukihiro Matsumoto'
|
233
|
+
@fsm.learn(mats)
|
234
|
+
expect(@fsm.best_matches('Yukihiro').first.first).to eql(mats)
|
235
|
+
expect(@fsm.best_matches('Mats', 0.3).first.first).to eql(mats)
|
236
|
+
end
|
237
|
+
|
148
238
|
def check_data_under_test(fsm, dut)
|
149
239
|
dut.each do |inputs, reference|
|
150
240
|
key = inputs[0]
|
data/test/Hash_spec.rb
CHANGED
@@ -31,7 +31,6 @@ require 'spec_helper'
|
|
31
31
|
|
32
32
|
require 'perobs'
|
33
33
|
|
34
|
-
|
35
34
|
class PO < PEROBS::Object
|
36
35
|
|
37
36
|
attr_persist :name
|
@@ -68,9 +67,13 @@ describe PEROBS::Hash do
|
|
68
67
|
h['po'] = po = @store.new(PO)
|
69
68
|
po.name = 'foobar'
|
70
69
|
h['b'] = 'B'
|
70
|
+
@store['po_key'] = po_key = @store.new(PO)
|
71
|
+
po_key.name = 'po key'
|
72
|
+
h[po_key] = 'PO Key'
|
71
73
|
|
72
74
|
expect(h['a']).to eq('A')
|
73
75
|
expect(h['b']).to eq('B')
|
76
|
+
expect(h[@store['po_key']]).to eq('PO Key')
|
74
77
|
@store.exit
|
75
78
|
|
76
79
|
@store = PEROBS::Store.new(@db_name)
|
@@ -78,6 +81,14 @@ describe PEROBS::Hash do
|
|
78
81
|
expect(h['a']).to eq('A')
|
79
82
|
expect(h['b']).to eq('B')
|
80
83
|
expect(h['po'].name).to eq('foobar')
|
84
|
+
po_key = @store['po_key']
|
85
|
+
expect(po_key.name).to eq('po key')
|
86
|
+
expect(h[po_key]).to eq('PO Key')
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'should not allow hash keys that conflict with internal notations' do
|
90
|
+
@store['h'] = h = @store.new(PEROBS::Hash)
|
91
|
+
expect { h['#<PEROBS::POReference id=1234>'] = 'foo'; @store.sync }.to raise_error(ArgumentError)
|
81
92
|
end
|
82
93
|
|
83
94
|
it 'should have an each method to iterate' do
|
data/test/Store_spec.rb
CHANGED
@@ -251,6 +251,20 @@ describe PEROBS::Store do
|
|
251
251
|
end
|
252
252
|
expect(i).to eq(6)
|
253
253
|
|
254
|
+
capture_io { store.gc }
|
255
|
+
capture_io { expect { store.check }.to_not raise_error }
|
256
|
+
capture_io { store.exit }
|
257
|
+
|
258
|
+
store = PEROBS::Store.new(@db_file)
|
259
|
+
capture_io { expect { store.check }.to_not raise_error }
|
260
|
+
|
261
|
+
person = store['person1']
|
262
|
+
i = 0
|
263
|
+
while (person = person.related) do
|
264
|
+
i += 1
|
265
|
+
end
|
266
|
+
expect(i).to eq(6)
|
267
|
+
|
254
268
|
capture_io { store.gc }
|
255
269
|
capture_io { expect { store.check }.to_not raise_error }
|
256
270
|
expect { store.delete_store }.to_not raise_error
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perobs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.
|
4
|
+
version: 4.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Schlaeger
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -147,7 +147,7 @@ homepage: https://github.com/scrapper/perobs
|
|
147
147
|
licenses:
|
148
148
|
- MIT
|
149
149
|
metadata: {}
|
150
|
-
post_install_message:
|
150
|
+
post_install_message:
|
151
151
|
rdoc_options: []
|
152
152
|
require_paths:
|
153
153
|
- lib
|
@@ -162,9 +162,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
162
|
- !ruby/object:Gem::Version
|
163
163
|
version: '0'
|
164
164
|
requirements: []
|
165
|
-
|
166
|
-
|
167
|
-
signing_key:
|
165
|
+
rubygems_version: 3.2.3
|
166
|
+
signing_key:
|
168
167
|
specification_version: 4
|
169
168
|
summary: Persistent Ruby Object Store
|
170
169
|
test_files:
|