perobs 4.2.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -16
- data/lib/perobs/BTree.rb +2 -2
- data/lib/perobs/BTreeNode.rb +46 -29
- data/lib/perobs/BigArrayNode.rb +11 -9
- data/lib/perobs/Cache.rb +32 -6
- data/lib/perobs/EquiBlobsFile.rb +2 -0
- data/lib/perobs/FlatFile.rb +40 -60
- data/lib/perobs/FuzzyStringMatcher.rb +32 -49
- data/lib/perobs/Hash.rb +68 -23
- data/lib/perobs/IDListPageFile.rb +2 -1
- data/lib/perobs/IDListPageRecord.rb +1 -1
- data/lib/perobs/Log.rb +5 -0
- data/lib/perobs/ObjectBase.rb +7 -0
- data/lib/perobs/SpaceTree.rb +1 -1
- data/lib/perobs/Store.rb +177 -125
- data/lib/perobs/version.rb +1 -1
- data/lib/perobs.rb +1 -0
- data/perobs.gemspec +1 -1
- data/test/FlatFileDB_spec.rb +30 -0
- data/test/FuzzyStringMatcher_spec.rb +94 -4
- data/test/Hash_spec.rb +12 -1
- data/test/Store_spec.rb +14 -0
- metadata +8 -10
- data/lib/perobs/BTreeNodeCache.rb +0 -109
data/lib/perobs/Store.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# = Store.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016, 2017, 2018, 2019
|
5
|
+
# Copyright (c) 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022
|
6
6
|
# by Chris Schlaeger <chris@taskjuggler.org>
|
7
7
|
#
|
8
8
|
# MIT License
|
@@ -27,6 +27,7 @@
|
|
27
27
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
28
28
|
|
29
29
|
require 'set'
|
30
|
+
require 'monitor'
|
30
31
|
|
31
32
|
require 'perobs/Log'
|
32
33
|
require 'perobs/Handle'
|
@@ -46,7 +47,7 @@ require 'perobs/ConsoleProgressMeter'
|
|
46
47
|
# PErsistent Ruby OBject Store
|
47
48
|
module PEROBS
|
48
49
|
|
49
|
-
Statistics = Struct.new(:in_memory_objects, :root_objects,
|
50
|
+
Statistics = Struct.new(:in_memory_objects, :root_objects,
|
50
51
|
:marked_objects, :swept_objects,
|
51
52
|
:created_objects, :collected_objects)
|
52
53
|
|
@@ -160,9 +161,6 @@ module PEROBS
|
|
160
161
|
# List of PEROBS objects that are currently available as Ruby objects
|
161
162
|
# hashed by their ID.
|
162
163
|
@in_memory_objects = {}
|
163
|
-
# List of objects that were destroyed already but were still found in
|
164
|
-
# the in_memory_objects list. _collect has not yet been called for them.
|
165
|
-
@zombie_objects = {}
|
166
164
|
|
167
165
|
# This object keeps some counters of interest.
|
168
166
|
@stats = Statistics.new
|
@@ -173,6 +171,9 @@ module PEROBS
|
|
173
171
|
# objects in memory.
|
174
172
|
@cache = Cache.new(options[:cache_bits] || 16)
|
175
173
|
|
174
|
+
# Lock to serialize access to the Store and all stored data.
|
175
|
+
@lock = Monitor.new
|
176
|
+
|
176
177
|
# The named (global) objects IDs hashed by their name
|
177
178
|
unless options[:no_root_objects]
|
178
179
|
unless (@root_objects = object_by_id(0))
|
@@ -243,8 +244,8 @@ module PEROBS
|
|
243
244
|
end
|
244
245
|
end
|
245
246
|
|
246
|
-
@db = @class_map = @in_memory_objects = @
|
247
|
-
@
|
247
|
+
@db = @class_map = @in_memory_objects = @stats = @cache =
|
248
|
+
@root_objects = nil
|
248
249
|
end
|
249
250
|
|
250
251
|
# You need to call this method to create new PEROBS objects that belong to
|
@@ -259,11 +260,13 @@ module PEROBS
|
|
259
260
|
PEROBS.log.fatal "#{klass} is not a BasicObject derivative"
|
260
261
|
end
|
261
262
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
263
|
+
@lock.synchronize do
|
264
|
+
obj = _construct_po(klass, _new_id, *args)
|
265
|
+
# Mark the new object as modified so it gets pushed into the database.
|
266
|
+
@cache.cache_write(obj)
|
267
|
+
# Return a POXReference proxy for the newly created object.
|
268
|
+
obj.myself
|
269
|
+
end
|
267
270
|
end
|
268
271
|
|
269
272
|
# For library internal use only!
|
@@ -280,9 +283,11 @@ module PEROBS
|
|
280
283
|
# method was called. This is an alternative to exit() that additionally
|
281
284
|
# deletes the entire database.
|
282
285
|
def delete_store
|
283
|
-
@
|
284
|
-
|
285
|
-
@
|
286
|
+
@lock.synchronize do
|
287
|
+
@db.delete_database
|
288
|
+
@db = @class_map = @in_memory_objects = @stats = @cache =
|
289
|
+
@root_objects = nil
|
290
|
+
end
|
286
291
|
end
|
287
292
|
|
288
293
|
# Store the provided object under the given name. Use this to make the
|
@@ -294,25 +299,27 @@ module PEROBS
|
|
294
299
|
# @param obj [PEROBS::Object] The object to store
|
295
300
|
# @return [PEROBS::Object] The stored object.
|
296
301
|
def []=(name, obj)
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
+
@lock.synchronize do
|
303
|
+
# If the passed object is nil, we delete the entry if it exists.
|
304
|
+
if obj.nil?
|
305
|
+
@root_objects.delete(name)
|
306
|
+
return nil
|
307
|
+
end
|
302
308
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
+
# We only allow derivatives of PEROBS::Object to be stored in the
|
310
|
+
# store.
|
311
|
+
unless obj.is_a?(ObjectBase)
|
312
|
+
PEROBS.log.fatal 'Object must be of class PEROBS::Object but ' +
|
313
|
+
"is of class #{obj.class}"
|
314
|
+
end
|
309
315
|
|
310
|
-
|
311
|
-
|
312
|
-
|
316
|
+
unless obj.store == self
|
317
|
+
PEROBS.log.fatal 'The object does not belong to this store.'
|
318
|
+
end
|
313
319
|
|
314
|
-
|
315
|
-
|
320
|
+
# Store the name and mark the name list as modified.
|
321
|
+
@root_objects[name] = obj._id
|
322
|
+
end
|
316
323
|
|
317
324
|
obj
|
318
325
|
end
|
@@ -322,28 +329,34 @@ module PEROBS
|
|
322
329
|
# returned.
|
323
330
|
# @return The requested object or nil if it doesn't exist.
|
324
331
|
def [](name)
|
325
|
-
|
326
|
-
|
332
|
+
@lock.synchronize do
|
333
|
+
# Return nil if there is no object with that name.
|
334
|
+
return nil unless (id = @root_objects[name])
|
327
335
|
|
328
|
-
|
336
|
+
POXReference.new(self, id)
|
337
|
+
end
|
329
338
|
end
|
330
339
|
|
331
340
|
# Return a list with all the names of the root objects.
|
332
341
|
# @return [Array of Symbols]
|
333
342
|
def names
|
334
|
-
@
|
343
|
+
@lock.synchronize do
|
344
|
+
@root_objects.keys
|
345
|
+
end
|
335
346
|
end
|
336
347
|
|
337
348
|
# Flush out all modified objects to disk and shrink the in-memory list if
|
338
349
|
# needed.
|
339
350
|
def sync
|
340
|
-
|
341
|
-
@cache.
|
351
|
+
@lock.synchronize do
|
352
|
+
if @cache.in_transaction?
|
353
|
+
@cache.abort_transaction
|
354
|
+
@cache.flush
|
355
|
+
PEROBS.log.fatal "You cannot call sync() during a transaction: \n" +
|
356
|
+
Kernel.caller.join("\n")
|
357
|
+
end
|
342
358
|
@cache.flush
|
343
|
-
PEROBS.log.fatal "You cannot call sync() during a transaction: \n" +
|
344
|
-
Kernel.caller.join("\n")
|
345
359
|
end
|
346
|
-
@cache.flush
|
347
360
|
end
|
348
361
|
|
349
362
|
# Return the number of objects stored in the store. CAVEAT: This method
|
@@ -353,7 +366,9 @@ module PEROBS
|
|
353
366
|
def size
|
354
367
|
# We don't include the Hash that stores the root objects into the object
|
355
368
|
# count.
|
356
|
-
@
|
369
|
+
@lock.synchronize do
|
370
|
+
@db.item_counter - 1
|
371
|
+
end
|
357
372
|
end
|
358
373
|
|
359
374
|
# Discard all objects that are not somehow connected to the root objects
|
@@ -362,51 +377,20 @@ module PEROBS
|
|
362
377
|
# method periodically.
|
363
378
|
# @return [Integer] The number of collected objects
|
364
379
|
def gc
|
365
|
-
|
366
|
-
|
367
|
-
|
380
|
+
@lock.synchronize do
|
381
|
+
sync
|
382
|
+
mark
|
383
|
+
sweep
|
384
|
+
end
|
368
385
|
end
|
369
386
|
|
370
387
|
# Return the object with the provided ID. This method is not part of the
|
371
388
|
# public API and should never be called by outside users. It's purely
|
372
389
|
# intended for internal use.
|
373
390
|
def object_by_id(id)
|
374
|
-
|
375
|
-
|
376
|
-
begin
|
377
|
-
object = ObjectSpace._id2ref(ruby_object_id)
|
378
|
-
# Let's make sure the object is really the object we are looking
|
379
|
-
# for. The GC might have recycled it already and the Ruby object ID
|
380
|
-
# could now be used for another object.
|
381
|
-
if object.is_a?(ObjectBase) && object._id == id
|
382
|
-
return object
|
383
|
-
end
|
384
|
-
rescue RangeError => e
|
385
|
-
# Due to a race condition the object can still be in the
|
386
|
-
# @in_memory_objects list but has been collected already by the Ruby
|
387
|
-
# GC. In that case we need to load it again. The _collect() call
|
388
|
-
# will happen much later, potentially after we have registered a new
|
389
|
-
# object with the same ID.
|
390
|
-
@zombie_objects[id] = @in_memory_objects.delete(id)
|
391
|
-
end
|
391
|
+
@lock.synchronize do
|
392
|
+
object_by_id_internal(id)
|
392
393
|
end
|
393
|
-
|
394
|
-
if (obj = @cache.object_by_id(id))
|
395
|
-
PEROBS.log.fatal "Object #{id} with Ruby #{obj.object_id} is in cache but not in_memory"
|
396
|
-
end
|
397
|
-
|
398
|
-
# We don't have the object in memory. Let's find it in the storage.
|
399
|
-
if @db.include?(id)
|
400
|
-
# Great, object found. Read it into memory and return it.
|
401
|
-
obj = ObjectBase::read(self, id)
|
402
|
-
# Add the object to the in-memory storage list.
|
403
|
-
@cache.cache_read(obj)
|
404
|
-
|
405
|
-
return obj
|
406
|
-
end
|
407
|
-
|
408
|
-
# The requested object does not exist. Return nil.
|
409
|
-
nil
|
410
394
|
end
|
411
395
|
|
412
396
|
# This method can be used to check the database and optionally repair it.
|
@@ -471,38 +455,54 @@ module PEROBS
|
|
471
455
|
# beginning of the transaction. The exception is passed on to the
|
472
456
|
# enclosing scope, so you probably want to handle it accordingly.
|
473
457
|
def transaction
|
474
|
-
|
458
|
+
transaction_not_started = true
|
459
|
+
while transaction_not_started do
|
460
|
+
begin
|
461
|
+
@lock.synchronize do
|
462
|
+
@cache.begin_transaction
|
463
|
+
# If we get to this point, the transaction was successfully
|
464
|
+
# started. We can exit the loop.
|
465
|
+
transaction_not_started = false
|
466
|
+
end
|
467
|
+
rescue TransactionInOtherThread
|
468
|
+
# sleep up to 50ms
|
469
|
+
sleep(rand(50) / 1000.0)
|
470
|
+
end
|
471
|
+
end
|
472
|
+
|
475
473
|
begin
|
476
474
|
yield if block_given?
|
477
475
|
rescue => e
|
478
|
-
@cache.abort_transaction
|
476
|
+
@lock.synchronize { @cache.abort_transaction }
|
479
477
|
raise e
|
480
478
|
end
|
481
|
-
@cache.end_transaction
|
479
|
+
@lock.synchronize { @cache.end_transaction }
|
482
480
|
end
|
483
481
|
|
484
482
|
# Calls the given block once for each object, passing that object as a
|
485
483
|
# parameter.
|
486
484
|
def each
|
487
|
-
@
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
"
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
485
|
+
@lock.synchronize do
|
486
|
+
@db.clear_marks
|
487
|
+
# Start with the object 0 and the indexes of the root objects. Push them
|
488
|
+
# onto the work stack.
|
489
|
+
stack = [ 0 ] + @root_objects.values
|
490
|
+
while !stack.empty?
|
491
|
+
# Get an object index from the stack.
|
492
|
+
id = stack.pop
|
493
|
+
next if @db.is_marked?(id)
|
494
|
+
|
495
|
+
unless (obj = object_by_id_internal(id))
|
496
|
+
PEROBS.log.fatal "Database is corrupted. Object with ID #{id} " +
|
497
|
+
"not found."
|
498
|
+
end
|
499
|
+
# Mark the object so it will never be pushed to the stack again.
|
500
|
+
@db.mark(id)
|
501
|
+
yield(obj.myself) if block_given?
|
502
|
+
# Push the IDs of all unmarked referenced objects onto the stack
|
503
|
+
obj._referenced_object_ids.each do |r_id|
|
504
|
+
stack << r_id unless @db.is_marked?(r_id)
|
505
|
+
end
|
506
506
|
end
|
507
507
|
end
|
508
508
|
end
|
@@ -510,7 +510,7 @@ module PEROBS
|
|
510
510
|
# Rename classes of objects stored in the data base.
|
511
511
|
# @param rename_map [Hash] Hash that maps the old name to the new name
|
512
512
|
def rename_classes(rename_map)
|
513
|
-
@class_map.rename(rename_map)
|
513
|
+
@lock.synchronize { @class_map.rename(rename_map) }
|
514
514
|
end
|
515
515
|
|
516
516
|
# Internal method. Don't use this outside of this library!
|
@@ -518,14 +518,16 @@ module PEROBS
|
|
518
518
|
# random numbers between 0 and 2**64 - 1.
|
519
519
|
# @return [Integer]
|
520
520
|
def _new_id
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
521
|
+
@lock.synchronize do
|
522
|
+
begin
|
523
|
+
# Generate a random number. It's recommended to not store more than
|
524
|
+
# 2**62 objects in the same store.
|
525
|
+
id = rand(2**64)
|
526
|
+
# Ensure that we don't have already another object with this ID.
|
527
|
+
end while @in_memory_objects.include?(id) || @db.include?(id)
|
527
528
|
|
528
|
-
|
529
|
+
id
|
530
|
+
end
|
529
531
|
end
|
530
532
|
|
531
533
|
# Internal method. Don't use this outside of this library!
|
@@ -536,16 +538,18 @@ module PEROBS
|
|
536
538
|
# @param obj [BasicObject] Object to register
|
537
539
|
# @param id [Integer] object ID
|
538
540
|
def _register_in_memory(obj, id)
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
"
|
545
|
-
|
541
|
+
@lock.synchronize do
|
542
|
+
unless obj.is_a?(ObjectBase)
|
543
|
+
PEROBS.log.fatal "You can only register ObjectBase objects"
|
544
|
+
end
|
545
|
+
if @in_memory_objects.include?(id)
|
546
|
+
PEROBS.log.fatal "The Store::_in_memory_objects list already " +
|
547
|
+
"contains an object for ID #{id}"
|
548
|
+
end
|
546
549
|
|
547
|
-
|
548
|
-
|
550
|
+
@in_memory_objects[id] = obj.object_id
|
551
|
+
@stats[:created_objects] += 1
|
552
|
+
end
|
549
553
|
end
|
550
554
|
|
551
555
|
# Remove the object from the in-memory list. This is an internal method
|
@@ -553,26 +557,73 @@ module PEROBS
|
|
553
557
|
# finalizer, so many restrictions apply!
|
554
558
|
# @param id [Integer] Object ID of object to remove from the list
|
555
559
|
def _collect(id, ruby_object_id)
|
556
|
-
|
560
|
+
# This method should only be called from the Ruby garbage collector.
|
561
|
+
# Therefore no locking is needed or even possible. The GC can kick in at
|
562
|
+
# any time and we could be anywhere in the code. So there is a small
|
563
|
+
# risk for a race here, but it should not have any serious consequences.
|
564
|
+
if @in_memory_objects && @in_memory_objects[id] == ruby_object_id
|
557
565
|
@in_memory_objects.delete(id)
|
558
566
|
@stats[:collected_objects] += 1
|
559
|
-
elsif @zombie_objects[id] == ruby_object_id
|
560
|
-
@zombie_objects.delete(id)
|
561
|
-
@stats[:collected_objects] += 1
|
562
567
|
end
|
563
568
|
end
|
564
569
|
|
565
570
|
# This method returns a Hash with some statistics about this store.
|
566
571
|
def statistics
|
567
|
-
@
|
568
|
-
|
569
|
-
|
572
|
+
@lock.synchronize do
|
573
|
+
@stats.in_memory_objects = @in_memory_objects.length
|
574
|
+
@stats.root_objects = @root_objects.length
|
575
|
+
end
|
570
576
|
|
571
577
|
@stats
|
572
578
|
end
|
573
579
|
|
574
580
|
private
|
575
581
|
|
582
|
+
def object_by_id_internal(id)
|
583
|
+
if (ruby_object_id = @in_memory_objects[id])
|
584
|
+
# We have the object in memory so we can just return it.
|
585
|
+
begin
|
586
|
+
object = ObjectSpace._id2ref(ruby_object_id)
|
587
|
+
# Let's make sure the object is really the object we are looking
|
588
|
+
# for. The GC might have recycled it already and the Ruby object ID
|
589
|
+
# could now be used for another object.
|
590
|
+
if object.is_a?(ObjectBase) && object._id == id
|
591
|
+
return object
|
592
|
+
end
|
593
|
+
rescue RangeError => e
|
594
|
+
# Due to a race condition the object can still be in the
|
595
|
+
# @in_memory_objects list but has been collected already by the Ruby
|
596
|
+
# GC. The _collect() call has not been completed yet. We now have to
|
597
|
+
# wait until this has been done. I think the GC lock will prevent a
|
598
|
+
# race on @in_memory_objects.
|
599
|
+
GC.start
|
600
|
+
while @in_memory_objects.include?(id)
|
601
|
+
sleep 0.01
|
602
|
+
end
|
603
|
+
end
|
604
|
+
end
|
605
|
+
|
606
|
+
# This is just a safety check. It has never triggered, so we can disable
|
607
|
+
# it for now.
|
608
|
+
#if (obj = @cache.object_by_id(id))
|
609
|
+
# PEROBS.log.fatal "Object #{id} with Ruby #{obj.object_id} is in " +
|
610
|
+
# "cache but not in_memory"
|
611
|
+
#end
|
612
|
+
|
613
|
+
# We don't have the object in memory. Let's find it in the storage.
|
614
|
+
if @db.include?(id)
|
615
|
+
# Great, object found. Read it into memory and return it.
|
616
|
+
obj = ObjectBase::read(self, id)
|
617
|
+
# Add the object to the in-memory storage list.
|
618
|
+
@cache.cache_read(obj)
|
619
|
+
|
620
|
+
return obj
|
621
|
+
end
|
622
|
+
|
623
|
+
# The requested object does not exist. Return nil.
|
624
|
+
nil
|
625
|
+
end
|
626
|
+
|
576
627
|
# Mark phase of a mark-and-sweep garbage collector. It will mark all
|
577
628
|
# objects that are reachable from the root objects.
|
578
629
|
def mark
|
@@ -599,6 +650,7 @@ module PEROBS
|
|
599
650
|
@stats.swept_objects = @db.delete_unmarked_objects do |id|
|
600
651
|
@cache.evict(id)
|
601
652
|
end
|
653
|
+
@db.clear_marks
|
602
654
|
GC.start
|
603
655
|
PEROBS.log.debug "#{@stats.swept_objects} objects collected"
|
604
656
|
@stats.swept_objects
|
data/lib/perobs/version.rb
CHANGED
data/lib/perobs.rb
CHANGED
data/perobs.gemspec
CHANGED
@@ -20,5 +20,5 @@ GEM_SPEC = Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
22
22
|
spec.add_development_dependency 'yard', '~>0.9.12'
|
23
|
-
spec.add_development_dependency 'rake', '~>
|
23
|
+
spec.add_development_dependency 'rake', '~> 13.0.3'
|
24
24
|
end
|
data/test/FlatFileDB_spec.rb
CHANGED
@@ -265,5 +265,35 @@ describe PEROBS::FlatFileDB do
|
|
265
265
|
db.close
|
266
266
|
end
|
267
267
|
|
268
|
+
it 'should handle duplicate entries for the same ID in database.blobs file' do
|
269
|
+
@store.exit
|
270
|
+
|
271
|
+
db = PEROBS::FlatFileDB.new(@db_dir)
|
272
|
+
db_file = File.join(@db_dir, 'database.blobs')
|
273
|
+
db.open
|
274
|
+
0.upto(5) do |i|
|
275
|
+
db.put_object("#{i + 1}:#{'X' * (i + 1) * 30}$", i + 1)
|
276
|
+
end
|
277
|
+
db.close
|
278
|
+
|
279
|
+
# This appends the entry 2 again
|
280
|
+
blob2 = File.read(db_file, 319 - 199, 199)
|
281
|
+
File.write(db_file, blob2, File.size(db_file))
|
282
|
+
|
283
|
+
db.open
|
284
|
+
expect(db.check_db).to eql(2)
|
285
|
+
expect(db.check_db(true)).to eql(1)
|
286
|
+
db.close
|
287
|
+
db = PEROBS::FlatFileDB.new(@db_dir, { :log => $stderr,
|
288
|
+
:log_level => Logger::WARN })
|
289
|
+
db.open
|
290
|
+
expect(db.check_db).to eql(0)
|
291
|
+
|
292
|
+
0.upto(5) do |i|
|
293
|
+
expect(db.get_object(i + 1)).to eql("#{i + 1}:#{'X' * (i + 1) * 30}$")
|
294
|
+
end
|
295
|
+
db.close
|
296
|
+
end
|
297
|
+
|
268
298
|
end
|
269
299
|
|
@@ -29,13 +29,25 @@ require 'perobs/FuzzyStringMatcher'
|
|
29
29
|
|
30
30
|
module PEROBS
|
31
31
|
|
32
|
+
class WordRef < PEROBS::Object
|
33
|
+
|
34
|
+
attr_persist :word, :line
|
35
|
+
|
36
|
+
def initialize(store, word, line)
|
37
|
+
super(store)
|
38
|
+
self.word = word
|
39
|
+
self.line = line
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
32
44
|
describe FuzzyStringMatcher do
|
33
45
|
|
34
46
|
before(:all) do
|
35
47
|
@db_name = generate_db_name(__FILE__)
|
36
48
|
@store = PEROBS::Store.new(@db_name)
|
37
|
-
@fsm =
|
38
|
-
@fsm2 =
|
49
|
+
@store['fsm'] = @fsm = @store.new(FuzzyStringMatcher)
|
50
|
+
@store['fsm2'] = @fsm2 = @store.new(FuzzyStringMatcher, true, 2)
|
39
51
|
end
|
40
52
|
|
41
53
|
after(:all) do
|
@@ -103,6 +115,44 @@ module PEROBS
|
|
103
115
|
expect(@fsm.best_matches('foobar')).to eql([])
|
104
116
|
end
|
105
117
|
|
118
|
+
it 'should find a match' do
|
119
|
+
dut = {
|
120
|
+
[ 'one' ] => [ [ 'one', 1.0 ] ],
|
121
|
+
[ 'three' ] => [ [ 'three', 1.0 ] ],
|
122
|
+
[ 'four' ]=> [ [ 'four', 1.0 ], [ 'fourteen', 0.666 ] ],
|
123
|
+
[ 'four', 1.0 ]=> [ [ 'four', 1.0 ] ],
|
124
|
+
[ 'even' ] => [ [ 'seven', 0.666 ], [ 'eleven', 0.666 ] ],
|
125
|
+
[ 'teen' ] => [ ['thirteen', 0.6666666666666666],
|
126
|
+
['fourteen', 0.6666666666666666],
|
127
|
+
['fifteen', 0.6666666666666666],
|
128
|
+
['sixteen', 0.6666666666666666],
|
129
|
+
['seventeen', 0.6666666666666666],
|
130
|
+
['eighteen', 0.6666666666666666],
|
131
|
+
['nineteen', 0.6666666666666666] ],
|
132
|
+
[ 'aight' ] => [ [ 'eight', 0.5 ] ],
|
133
|
+
[ 'thirdteen' ] => [ [ 'thirteen', 0.5 ] ],
|
134
|
+
[ 'shirt teen', 0.3 ] => [ [ 'thirteen', 0.333 ] ]
|
135
|
+
}
|
136
|
+
check_data_under_test(@fsm, dut)
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should sort best to worst matches' do
|
140
|
+
@fsm.clear
|
141
|
+
%w( xbar xfoox foor bar foobar barfoo foo rab baar fool xbarx
|
142
|
+
foobarx xfoobarx foo_bar ).each do |w|
|
143
|
+
@fsm.learn(w, w)
|
144
|
+
end
|
145
|
+
dut = {
|
146
|
+
[ 'foo' ] => [["foo", 1.0], ["foor", 0.5], ["foobar", 0.5],
|
147
|
+
["fool", 0.5], ["foobarx", 0.5], ["foo_bar", 0.5],
|
148
|
+
["barfoo", 0.5]],
|
149
|
+
[ 'bar' ] => [["bar", 1.0], ["barfoo", 0.5], ["xbar", 0.5],
|
150
|
+
["foobar", 0.5], ["foo_bar", 0.5]],
|
151
|
+
[ 'foobar' ] => [["foobar", 1.0], ["foobarx", 0.8], ["xfoobarx", 0.6]]
|
152
|
+
}
|
153
|
+
check_data_under_test(@fsm, dut)
|
154
|
+
end
|
155
|
+
|
106
156
|
it 'should handle a larger text' do
|
107
157
|
text =<<-EOT
|
108
158
|
MIT License
|
@@ -131,9 +181,9 @@ EOT
|
|
131
181
|
@fsm2.learn(word, word)
|
132
182
|
end
|
133
183
|
stats = @fsm2.stats
|
134
|
-
expect(stats['dictionary_size']).to eql(
|
184
|
+
expect(stats['dictionary_size']).to eql(352)
|
135
185
|
expect(stats['max_list_size']).to eql(22)
|
136
|
-
expect(stats['avg_list_size']).to be_within(0.001).of(2.
|
186
|
+
expect(stats['avg_list_size']).to be_within(0.001).of(2.409)
|
137
187
|
end
|
138
188
|
|
139
189
|
it 'should find case sensitive matches' do
|
@@ -145,6 +195,46 @@ EOT
|
|
145
195
|
check_data_under_test(@fsm2, dut)
|
146
196
|
end
|
147
197
|
|
198
|
+
it 'should support references to PEROBS objects' do
|
199
|
+
text =<<-EOT
|
200
|
+
MIT License
|
201
|
+
|
202
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
203
|
+
a copy of this software and associated documentation files (the
|
204
|
+
"Software"), to deal in the Software without restriction, including
|
205
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
206
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
207
|
+
permit persons to whom the Software is furnished to do so, subject to
|
208
|
+
the following conditions:
|
209
|
+
EOT
|
210
|
+
|
211
|
+
line_no = 1
|
212
|
+
@store['fsm'] = fsm = @store.new(FuzzyStringMatcher)
|
213
|
+
@store['refs'] = refs = @store.new(Array)
|
214
|
+
text.each_line do |line|
|
215
|
+
line.split.each do |word|
|
216
|
+
ref = @store.new(WordRef, word, line_no)
|
217
|
+
refs << ref
|
218
|
+
fsm.learn(word, ref)
|
219
|
+
end
|
220
|
+
line_no += 1
|
221
|
+
end
|
222
|
+
|
223
|
+
found_lines = []
|
224
|
+
fsm.best_matches('SOFTWARE').each do |match|
|
225
|
+
found_lines << match[0].line
|
226
|
+
end
|
227
|
+
expect(found_lines.sort).to eql([ 4, 5, 5, 7, 8 ])
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should with small search words' do
|
231
|
+
@fsm.clear
|
232
|
+
mats = 'Yukihiro Matsumoto'
|
233
|
+
@fsm.learn(mats)
|
234
|
+
expect(@fsm.best_matches('Yukihiro').first.first).to eql(mats)
|
235
|
+
expect(@fsm.best_matches('Mats', 0.3).first.first).to eql(mats)
|
236
|
+
end
|
237
|
+
|
148
238
|
def check_data_under_test(fsm, dut)
|
149
239
|
dut.each do |inputs, reference|
|
150
240
|
key = inputs[0]
|