perobs 4.2.0 → 4.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +27 -16
- data/lib/perobs/BTree.rb +2 -2
- data/lib/perobs/BTreeNode.rb +46 -29
- data/lib/perobs/BigArrayNode.rb +11 -9
- data/lib/perobs/Cache.rb +32 -6
- data/lib/perobs/EquiBlobsFile.rb +2 -0
- data/lib/perobs/FlatFile.rb +40 -60
- data/lib/perobs/FuzzyStringMatcher.rb +32 -49
- data/lib/perobs/Hash.rb +68 -23
- data/lib/perobs/IDListPageFile.rb +2 -1
- data/lib/perobs/IDListPageRecord.rb +1 -1
- data/lib/perobs/Log.rb +5 -0
- data/lib/perobs/ObjectBase.rb +7 -0
- data/lib/perobs/SpaceTree.rb +1 -1
- data/lib/perobs/Store.rb +177 -125
- data/lib/perobs/version.rb +1 -1
- data/lib/perobs.rb +1 -0
- data/perobs.gemspec +1 -1
- data/test/FlatFileDB_spec.rb +30 -0
- data/test/FuzzyStringMatcher_spec.rb +94 -4
- data/test/Hash_spec.rb +12 -1
- data/test/Store_spec.rb +14 -0
- metadata +8 -10
- data/lib/perobs/BTreeNodeCache.rb +0 -109
data/lib/perobs/Store.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# = Store.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016, 2017, 2018, 2019
|
5
|
+
# Copyright (c) 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022
|
6
6
|
# by Chris Schlaeger <chris@taskjuggler.org>
|
7
7
|
#
|
8
8
|
# MIT License
|
@@ -27,6 +27,7 @@
|
|
27
27
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
28
28
|
|
29
29
|
require 'set'
|
30
|
+
require 'monitor'
|
30
31
|
|
31
32
|
require 'perobs/Log'
|
32
33
|
require 'perobs/Handle'
|
@@ -46,7 +47,7 @@ require 'perobs/ConsoleProgressMeter'
|
|
46
47
|
# PErsistent Ruby OBject Store
|
47
48
|
module PEROBS
|
48
49
|
|
49
|
-
Statistics = Struct.new(:in_memory_objects, :root_objects,
|
50
|
+
Statistics = Struct.new(:in_memory_objects, :root_objects,
|
50
51
|
:marked_objects, :swept_objects,
|
51
52
|
:created_objects, :collected_objects)
|
52
53
|
|
@@ -160,9 +161,6 @@ module PEROBS
|
|
160
161
|
# List of PEROBS objects that are currently available as Ruby objects
|
161
162
|
# hashed by their ID.
|
162
163
|
@in_memory_objects = {}
|
163
|
-
# List of objects that were destroyed already but were still found in
|
164
|
-
# the in_memory_objects list. _collect has not yet been called for them.
|
165
|
-
@zombie_objects = {}
|
166
164
|
|
167
165
|
# This object keeps some counters of interest.
|
168
166
|
@stats = Statistics.new
|
@@ -173,6 +171,9 @@ module PEROBS
|
|
173
171
|
# objects in memory.
|
174
172
|
@cache = Cache.new(options[:cache_bits] || 16)
|
175
173
|
|
174
|
+
# Lock to serialize access to the Store and all stored data.
|
175
|
+
@lock = Monitor.new
|
176
|
+
|
176
177
|
# The named (global) object IDs hashed by their name
|
177
178
|
unless options[:no_root_objects]
|
178
179
|
unless (@root_objects = object_by_id(0))
|
@@ -243,8 +244,8 @@ module PEROBS
|
|
243
244
|
end
|
244
245
|
end
|
245
246
|
|
246
|
-
@db = @class_map = @in_memory_objects = @
|
247
|
-
@
|
247
|
+
@db = @class_map = @in_memory_objects = @stats = @cache =
|
248
|
+
@root_objects = nil
|
248
249
|
end
|
249
250
|
|
250
251
|
# You need to call this method to create new PEROBS objects that belong to
|
@@ -259,11 +260,13 @@ module PEROBS
|
|
259
260
|
PEROBS.log.fatal "#{klass} is not a BasicObject derivative"
|
260
261
|
end
|
261
262
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
263
|
+
@lock.synchronize do
|
264
|
+
obj = _construct_po(klass, _new_id, *args)
|
265
|
+
# Mark the new object as modified so it gets pushed into the database.
|
266
|
+
@cache.cache_write(obj)
|
267
|
+
# Return a POXReference proxy for the newly created object.
|
268
|
+
obj.myself
|
269
|
+
end
|
267
270
|
end
|
268
271
|
|
269
272
|
# For library internal use only!
|
@@ -280,9 +283,11 @@ module PEROBS
|
|
280
283
|
# method was called. This is an alternative to exit() that additionally
|
281
284
|
# deletes the entire database.
|
282
285
|
def delete_store
|
283
|
-
@
|
284
|
-
|
285
|
-
@
|
286
|
+
@lock.synchronize do
|
287
|
+
@db.delete_database
|
288
|
+
@db = @class_map = @in_memory_objects = @stats = @cache =
|
289
|
+
@root_objects = nil
|
290
|
+
end
|
286
291
|
end
|
287
292
|
|
288
293
|
# Store the provided object under the given name. Use this to make the
|
@@ -294,25 +299,27 @@ module PEROBS
|
|
294
299
|
# @param obj [PEROBS::Object] The object to store
|
295
300
|
# @return [PEROBS::Object] The stored object.
|
296
301
|
def []=(name, obj)
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
+
@lock.synchronize do
|
303
|
+
# If the passed object is nil, we delete the entry if it exists.
|
304
|
+
if obj.nil?
|
305
|
+
@root_objects.delete(name)
|
306
|
+
return nil
|
307
|
+
end
|
302
308
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
+
# We only allow derivatives of PEROBS::Object to be stored in the
|
310
|
+
# store.
|
311
|
+
unless obj.is_a?(ObjectBase)
|
312
|
+
PEROBS.log.fatal 'Object must be of class PEROBS::Object but ' +
|
313
|
+
"is of class #{obj.class}"
|
314
|
+
end
|
309
315
|
|
310
|
-
|
311
|
-
|
312
|
-
|
316
|
+
unless obj.store == self
|
317
|
+
PEROBS.log.fatal 'The object does not belong to this store.'
|
318
|
+
end
|
313
319
|
|
314
|
-
|
315
|
-
|
320
|
+
# Store the name and mark the name list as modified.
|
321
|
+
@root_objects[name] = obj._id
|
322
|
+
end
|
316
323
|
|
317
324
|
obj
|
318
325
|
end
|
@@ -322,28 +329,34 @@ module PEROBS
|
|
322
329
|
# returned.
|
323
330
|
# @return The requested object or nil if it doesn't exist.
|
324
331
|
def [](name)
|
325
|
-
|
326
|
-
|
332
|
+
@lock.synchronize do
|
333
|
+
# Return nil if there is no object with that name.
|
334
|
+
return nil unless (id = @root_objects[name])
|
327
335
|
|
328
|
-
|
336
|
+
POXReference.new(self, id)
|
337
|
+
end
|
329
338
|
end
|
330
339
|
|
331
340
|
# Return a list with all the names of the root objects.
|
332
341
|
# @return [Array of Symbols]
|
333
342
|
def names
|
334
|
-
@
|
343
|
+
@lock.synchronize do
|
344
|
+
@root_objects.keys
|
345
|
+
end
|
335
346
|
end
|
336
347
|
|
337
348
|
# Flush out all modified objects to disk and shrink the in-memory list if
|
338
349
|
# needed.
|
339
350
|
def sync
|
340
|
-
|
341
|
-
@cache.
|
351
|
+
@lock.synchronize do
|
352
|
+
if @cache.in_transaction?
|
353
|
+
@cache.abort_transaction
|
354
|
+
@cache.flush
|
355
|
+
PEROBS.log.fatal "You cannot call sync() during a transaction: \n" +
|
356
|
+
Kernel.caller.join("\n")
|
357
|
+
end
|
342
358
|
@cache.flush
|
343
|
-
PEROBS.log.fatal "You cannot call sync() during a transaction: \n" +
|
344
|
-
Kernel.caller.join("\n")
|
345
359
|
end
|
346
|
-
@cache.flush
|
347
360
|
end
|
348
361
|
|
349
362
|
# Return the number of objects stored in the store. CAVEAT: This method
|
@@ -353,7 +366,9 @@ module PEROBS
|
|
353
366
|
def size
|
354
367
|
# We don't include the Hash that stores the root objects into the object
|
355
368
|
# count.
|
356
|
-
@
|
369
|
+
@lock.synchronize do
|
370
|
+
@db.item_counter - 1
|
371
|
+
end
|
357
372
|
end
|
358
373
|
|
359
374
|
# Discard all objects that are not somehow connected to the root objects
|
@@ -362,51 +377,20 @@ module PEROBS
|
|
362
377
|
# method periodically.
|
363
378
|
# @return [Integer] The number of collected objects
|
364
379
|
def gc
|
365
|
-
|
366
|
-
|
367
|
-
|
380
|
+
@lock.synchronize do
|
381
|
+
sync
|
382
|
+
mark
|
383
|
+
sweep
|
384
|
+
end
|
368
385
|
end
|
369
386
|
|
370
387
|
# Return the object with the provided ID. This method is not part of the
|
371
388
|
# public API and should never be called by outside users. It's purely
|
372
389
|
# intended for internal use.
|
373
390
|
def object_by_id(id)
|
374
|
-
|
375
|
-
|
376
|
-
begin
|
377
|
-
object = ObjectSpace._id2ref(ruby_object_id)
|
378
|
-
# Let's make sure the object is really the object we are looking
|
379
|
-
# for. The GC might have recycled it already and the Ruby object ID
|
380
|
-
# could now be used for another object.
|
381
|
-
if object.is_a?(ObjectBase) && object._id == id
|
382
|
-
return object
|
383
|
-
end
|
384
|
-
rescue RangeError => e
|
385
|
-
# Due to a race condition the object can still be in the
|
386
|
-
# @in_memory_objects list but has been collected already by the Ruby
|
387
|
-
# GC. In that case we need to load it again. The _collect() call
|
388
|
-
# will happen much later, potentially after we have registered a new
|
389
|
-
# object with the same ID.
|
390
|
-
@zombie_objects[id] = @in_memory_objects.delete(id)
|
391
|
-
end
|
391
|
+
@lock.synchronize do
|
392
|
+
object_by_id_internal(id)
|
392
393
|
end
|
393
|
-
|
394
|
-
if (obj = @cache.object_by_id(id))
|
395
|
-
PEROBS.log.fatal "Object #{id} with Ruby #{obj.object_id} is in cache but not in_memory"
|
396
|
-
end
|
397
|
-
|
398
|
-
# We don't have the object in memory. Let's find it in the storage.
|
399
|
-
if @db.include?(id)
|
400
|
-
# Great, object found. Read it into memory and return it.
|
401
|
-
obj = ObjectBase::read(self, id)
|
402
|
-
# Add the object to the in-memory storage list.
|
403
|
-
@cache.cache_read(obj)
|
404
|
-
|
405
|
-
return obj
|
406
|
-
end
|
407
|
-
|
408
|
-
# The requested object does not exist. Return nil.
|
409
|
-
nil
|
410
394
|
end
|
411
395
|
|
412
396
|
# This method can be used to check the database and optionally repair it.
|
@@ -471,38 +455,54 @@ module PEROBS
|
|
471
455
|
# beginning of the transaction. The exception is passed on to the
|
472
456
|
# enclosing scope, so you probably want to handle it accordingly.
|
473
457
|
def transaction
|
474
|
-
|
458
|
+
transaction_not_started = true
|
459
|
+
while transaction_not_started do
|
460
|
+
begin
|
461
|
+
@lock.synchronize do
|
462
|
+
@cache.begin_transaction
|
463
|
+
# If we get to this point, the transaction was successfully
|
464
|
+
# started. We can exit the loop.
|
465
|
+
transaction_not_started = false
|
466
|
+
end
|
467
|
+
rescue TransactionInOtherThread
|
468
|
+
# sleep up to 50ms
|
469
|
+
sleep(rand(50) / 1000.0)
|
470
|
+
end
|
471
|
+
end
|
472
|
+
|
475
473
|
begin
|
476
474
|
yield if block_given?
|
477
475
|
rescue => e
|
478
|
-
@cache.abort_transaction
|
476
|
+
@lock.synchronize { @cache.abort_transaction }
|
479
477
|
raise e
|
480
478
|
end
|
481
|
-
@cache.end_transaction
|
479
|
+
@lock.synchronize { @cache.end_transaction }
|
482
480
|
end
|
483
481
|
|
484
482
|
# Calls the given block once for each object, passing that object as a
|
485
483
|
# parameter.
|
486
484
|
def each
|
487
|
-
@
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
"
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
485
|
+
@lock.synchronize do
|
486
|
+
@db.clear_marks
|
487
|
+
# Start with the object 0 and the indexes of the root objects. Push them
|
488
|
+
# onto the work stack.
|
489
|
+
stack = [ 0 ] + @root_objects.values
|
490
|
+
while !stack.empty?
|
491
|
+
# Get an object index from the stack.
|
492
|
+
id = stack.pop
|
493
|
+
next if @db.is_marked?(id)
|
494
|
+
|
495
|
+
unless (obj = object_by_id_internal(id))
|
496
|
+
PEROBS.log.fatal "Database is corrupted. Object with ID #{id} " +
|
497
|
+
"not found."
|
498
|
+
end
|
499
|
+
# Mark the object so it will never be pushed to the stack again.
|
500
|
+
@db.mark(id)
|
501
|
+
yield(obj.myself) if block_given?
|
502
|
+
# Push the IDs of all unmarked referenced objects onto the stack
|
503
|
+
obj._referenced_object_ids.each do |r_id|
|
504
|
+
stack << r_id unless @db.is_marked?(r_id)
|
505
|
+
end
|
506
506
|
end
|
507
507
|
end
|
508
508
|
end
|
@@ -510,7 +510,7 @@ module PEROBS
|
|
510
510
|
# Rename classes of objects stored in the data base.
|
511
511
|
# @param rename_map [Hash] Hash that maps the old name to the new name
|
512
512
|
def rename_classes(rename_map)
|
513
|
-
@class_map.rename(rename_map)
|
513
|
+
@lock.synchronize { @class_map.rename(rename_map) }
|
514
514
|
end
|
515
515
|
|
516
516
|
# Internal method. Don't use this outside of this library!
|
@@ -518,14 +518,16 @@ module PEROBS
|
|
518
518
|
# random numbers between 0 and 2**64 - 1.
|
519
519
|
# @return [Integer]
|
520
520
|
def _new_id
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
521
|
+
@lock.synchronize do
|
522
|
+
begin
|
523
|
+
# Generate a random number. It's recommended to not store more than
|
524
|
+
# 2**62 objects in the same store.
|
525
|
+
id = rand(2**64)
|
526
|
+
# Ensure that we don't have already another object with this ID.
|
527
|
+
end while @in_memory_objects.include?(id) || @db.include?(id)
|
527
528
|
|
528
|
-
|
529
|
+
id
|
530
|
+
end
|
529
531
|
end
|
530
532
|
|
531
533
|
# Internal method. Don't use this outside of this library!
|
@@ -536,16 +538,18 @@ module PEROBS
|
|
536
538
|
# @param obj [BasicObject] Object to register
|
537
539
|
# @param id [Integer] object ID
|
538
540
|
def _register_in_memory(obj, id)
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
"
|
545
|
-
|
541
|
+
@lock.synchronize do
|
542
|
+
unless obj.is_a?(ObjectBase)
|
543
|
+
PEROBS.log.fatal "You can only register ObjectBase objects"
|
544
|
+
end
|
545
|
+
if @in_memory_objects.include?(id)
|
546
|
+
PEROBS.log.fatal "The Store::_in_memory_objects list already " +
|
547
|
+
"contains an object for ID #{id}"
|
548
|
+
end
|
546
549
|
|
547
|
-
|
548
|
-
|
550
|
+
@in_memory_objects[id] = obj.object_id
|
551
|
+
@stats[:created_objects] += 1
|
552
|
+
end
|
549
553
|
end
|
550
554
|
|
551
555
|
# Remove the object from the in-memory list. This is an internal method
|
@@ -553,26 +557,73 @@ module PEROBS
|
|
553
557
|
# finalizer, so many restrictions apply!
|
554
558
|
# @param id [Integer] Object ID of object to remove from the list
|
555
559
|
def _collect(id, ruby_object_id)
|
556
|
-
|
560
|
+
# This method should only be called from the Ruby garbage collector.
|
561
|
+
# Therefore no locking is needed or even possible. The GC can kick in at
|
562
|
+
# any time and we could be anywhere in the code. So there is a small
|
563
|
+
# risk for a race here, but it should not have any serious consequences.
|
564
|
+
if @in_memory_objects && @in_memory_objects[id] == ruby_object_id
|
557
565
|
@in_memory_objects.delete(id)
|
558
566
|
@stats[:collected_objects] += 1
|
559
|
-
elsif @zombie_objects[id] == ruby_object_id
|
560
|
-
@zombie_objects.delete(id)
|
561
|
-
@stats[:collected_objects] += 1
|
562
567
|
end
|
563
568
|
end
|
564
569
|
|
565
570
|
# This method returns a Hash with some statistics about this store.
|
566
571
|
def statistics
|
567
|
-
@
|
568
|
-
|
569
|
-
|
572
|
+
@lock.synchronize do
|
573
|
+
@stats.in_memory_objects = @in_memory_objects.length
|
574
|
+
@stats.root_objects = @root_objects.length
|
575
|
+
end
|
570
576
|
|
571
577
|
@stats
|
572
578
|
end
|
573
579
|
|
574
580
|
private
|
575
581
|
|
582
|
+
def object_by_id_internal(id)
|
583
|
+
if (ruby_object_id = @in_memory_objects[id])
|
584
|
+
# We have the object in memory so we can just return it.
|
585
|
+
begin
|
586
|
+
object = ObjectSpace._id2ref(ruby_object_id)
|
587
|
+
# Let's make sure the object is really the object we are looking
|
588
|
+
# for. The GC might have recycled it already and the Ruby object ID
|
589
|
+
# could now be used for another object.
|
590
|
+
if object.is_a?(ObjectBase) && object._id == id
|
591
|
+
return object
|
592
|
+
end
|
593
|
+
rescue RangeError => e
|
594
|
+
# Due to a race condition the object can still be in the
|
595
|
+
# @in_memory_objects list but has been collected already by the Ruby
|
596
|
+
# GC. The _collect() call has not been completed yet. We now have to
|
597
|
+
# wait until this has been done. I think the GC lock will prevent a
|
598
|
+
# race on @in_memory_objects.
|
599
|
+
GC.start
|
600
|
+
while @in_memory_objects.include?(id)
|
601
|
+
sleep 0.01
|
602
|
+
end
|
603
|
+
end
|
604
|
+
end
|
605
|
+
|
606
|
+
# This is just a safety check. It has never triggered, so we can disable
|
607
|
+
# it for now.
|
608
|
+
#if (obj = @cache.object_by_id(id))
|
609
|
+
# PEROBS.log.fatal "Object #{id} with Ruby #{obj.object_id} is in " +
|
610
|
+
# "cache but not in_memory"
|
611
|
+
#end
|
612
|
+
|
613
|
+
# We don't have the object in memory. Let's find it in the storage.
|
614
|
+
if @db.include?(id)
|
615
|
+
# Great, object found. Read it into memory and return it.
|
616
|
+
obj = ObjectBase::read(self, id)
|
617
|
+
# Add the object to the in-memory storage list.
|
618
|
+
@cache.cache_read(obj)
|
619
|
+
|
620
|
+
return obj
|
621
|
+
end
|
622
|
+
|
623
|
+
# The requested object does not exist. Return nil.
|
624
|
+
nil
|
625
|
+
end
|
626
|
+
|
576
627
|
# Mark phase of a mark-and-sweep garbage collector. It will mark all
|
577
628
|
# objects that are reachable from the root objects.
|
578
629
|
def mark
|
@@ -599,6 +650,7 @@ module PEROBS
|
|
599
650
|
@stats.swept_objects = @db.delete_unmarked_objects do |id|
|
600
651
|
@cache.evict(id)
|
601
652
|
end
|
653
|
+
@db.clear_marks
|
602
654
|
GC.start
|
603
655
|
PEROBS.log.debug "#{@stats.swept_objects} objects collected"
|
604
656
|
@stats.swept_objects
|
data/lib/perobs/version.rb
CHANGED
data/lib/perobs.rb
CHANGED
data/perobs.gemspec
CHANGED
@@ -20,5 +20,5 @@ GEM_SPEC = Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
22
22
|
spec.add_development_dependency 'yard', '~>0.9.12'
|
23
|
-
spec.add_development_dependency 'rake', '~>
|
23
|
+
spec.add_development_dependency 'rake', '~> 13.0.3'
|
24
24
|
end
|
data/test/FlatFileDB_spec.rb
CHANGED
@@ -265,5 +265,35 @@ describe PEROBS::FlatFileDB do
|
|
265
265
|
db.close
|
266
266
|
end
|
267
267
|
|
268
|
+
it 'should handle duplicate entries for the same ID in database.blobs file' do
|
269
|
+
@store.exit
|
270
|
+
|
271
|
+
db = PEROBS::FlatFileDB.new(@db_dir)
|
272
|
+
db_file = File.join(@db_dir, 'database.blobs')
|
273
|
+
db.open
|
274
|
+
0.upto(5) do |i|
|
275
|
+
db.put_object("#{i + 1}:#{'X' * (i + 1) * 30}$", i + 1)
|
276
|
+
end
|
277
|
+
db.close
|
278
|
+
|
279
|
+
# This appends the entry 2 again
|
280
|
+
blob2 = File.read(db_file, 319 - 199, 199)
|
281
|
+
File.write(db_file, blob2, File.size(db_file))
|
282
|
+
|
283
|
+
db.open
|
284
|
+
expect(db.check_db).to eql(2)
|
285
|
+
expect(db.check_db(true)).to eql(1)
|
286
|
+
db.close
|
287
|
+
db = PEROBS::FlatFileDB.new(@db_dir, { :log => $stderr,
|
288
|
+
:log_level => Logger::WARN })
|
289
|
+
db.open
|
290
|
+
expect(db.check_db).to eql(0)
|
291
|
+
|
292
|
+
0.upto(5) do |i|
|
293
|
+
expect(db.get_object(i + 1)).to eql("#{i + 1}:#{'X' * (i + 1) * 30}$")
|
294
|
+
end
|
295
|
+
db.close
|
296
|
+
end
|
297
|
+
|
268
298
|
end
|
269
299
|
|
@@ -29,13 +29,25 @@ require 'perobs/FuzzyStringMatcher'
|
|
29
29
|
|
30
30
|
module PEROBS
|
31
31
|
|
32
|
+
class WordRef < PEROBS::Object
|
33
|
+
|
34
|
+
attr_persist :word, :line
|
35
|
+
|
36
|
+
def initialize(store, word, line)
|
37
|
+
super(store)
|
38
|
+
self.word = word
|
39
|
+
self.line = line
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
32
44
|
describe FuzzyStringMatcher do
|
33
45
|
|
34
46
|
before(:all) do
|
35
47
|
@db_name = generate_db_name(__FILE__)
|
36
48
|
@store = PEROBS::Store.new(@db_name)
|
37
|
-
@fsm =
|
38
|
-
@fsm2 =
|
49
|
+
@store['fsm'] = @fsm = @store.new(FuzzyStringMatcher)
|
50
|
+
@store['fsm2'] = @fsm2 = @store.new(FuzzyStringMatcher, true, 2)
|
39
51
|
end
|
40
52
|
|
41
53
|
after(:all) do
|
@@ -103,6 +115,44 @@ module PEROBS
|
|
103
115
|
expect(@fsm.best_matches('foobar')).to eql([])
|
104
116
|
end
|
105
117
|
|
118
|
+
it 'should find a match' do
|
119
|
+
dut = {
|
120
|
+
[ 'one' ] => [ [ 'one', 1.0 ] ],
|
121
|
+
[ 'three' ] => [ [ 'three', 1.0 ] ],
|
122
|
+
[ 'four' ]=> [ [ 'four', 1.0 ], [ 'fourteen', 0.666 ] ],
|
123
|
+
[ 'four', 1.0 ]=> [ [ 'four', 1.0 ] ],
|
124
|
+
[ 'even' ] => [ [ 'seven', 0.666 ], [ 'eleven', 0.666 ] ],
|
125
|
+
[ 'teen' ] => [ ['thirteen', 0.6666666666666666],
|
126
|
+
['fourteen', 0.6666666666666666],
|
127
|
+
['fifteen', 0.6666666666666666],
|
128
|
+
['sixteen', 0.6666666666666666],
|
129
|
+
['seventeen', 0.6666666666666666],
|
130
|
+
['eighteen', 0.6666666666666666],
|
131
|
+
['nineteen', 0.6666666666666666] ],
|
132
|
+
[ 'aight' ] => [ [ 'eight', 0.5 ] ],
|
133
|
+
[ 'thirdteen' ] => [ [ 'thirteen', 0.5 ] ],
|
134
|
+
[ 'shirt teen', 0.3 ] => [ [ 'thirteen', 0.333 ] ]
|
135
|
+
}
|
136
|
+
check_data_under_test(@fsm, dut)
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should sort best to worst matches' do
|
140
|
+
@fsm.clear
|
141
|
+
%w( xbar xfoox foor bar foobar barfoo foo rab baar fool xbarx
|
142
|
+
foobarx xfoobarx foo_bar ).each do |w|
|
143
|
+
@fsm.learn(w, w)
|
144
|
+
end
|
145
|
+
dut = {
|
146
|
+
[ 'foo' ] => [["foo", 1.0], ["foor", 0.5], ["foobar", 0.5],
|
147
|
+
["fool", 0.5], ["foobarx", 0.5], ["foo_bar", 0.5],
|
148
|
+
["barfoo", 0.5]],
|
149
|
+
[ 'bar' ] => [["bar", 1.0], ["barfoo", 0.5], ["xbar", 0.5],
|
150
|
+
["foobar", 0.5], ["foo_bar", 0.5]],
|
151
|
+
[ 'foobar' ] => [["foobar", 1.0], ["foobarx", 0.8], ["xfoobarx", 0.6]]
|
152
|
+
}
|
153
|
+
check_data_under_test(@fsm, dut)
|
154
|
+
end
|
155
|
+
|
106
156
|
it 'should handle a larger text' do
|
107
157
|
text =<<-EOT
|
108
158
|
MIT License
|
@@ -131,9 +181,9 @@ EOT
|
|
131
181
|
@fsm2.learn(word, word)
|
132
182
|
end
|
133
183
|
stats = @fsm2.stats
|
134
|
-
expect(stats['dictionary_size']).to eql(
|
184
|
+
expect(stats['dictionary_size']).to eql(352)
|
135
185
|
expect(stats['max_list_size']).to eql(22)
|
136
|
-
expect(stats['avg_list_size']).to be_within(0.001).of(2.
|
186
|
+
expect(stats['avg_list_size']).to be_within(0.001).of(2.409)
|
137
187
|
end
|
138
188
|
|
139
189
|
it 'should find case sensitive matches' do
|
@@ -145,6 +195,46 @@ EOT
|
|
145
195
|
check_data_under_test(@fsm2, dut)
|
146
196
|
end
|
147
197
|
|
198
|
+
it 'should support references to PEROBS objects' do
|
199
|
+
text =<<-EOT
|
200
|
+
MIT License
|
201
|
+
|
202
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
203
|
+
a copy of this software and associated documentation files (the
|
204
|
+
"Software"), to deal in the Software without restriction, including
|
205
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
206
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
207
|
+
permit persons to whom the Software is furnished to do so, subject to
|
208
|
+
the following conditions:
|
209
|
+
EOT
|
210
|
+
|
211
|
+
line_no = 1
|
212
|
+
@store['fsm'] = fsm = @store.new(FuzzyStringMatcher)
|
213
|
+
@store['refs'] = refs = @store.new(Array)
|
214
|
+
text.each_line do |line|
|
215
|
+
line.split.each do |word|
|
216
|
+
ref = @store.new(WordRef, word, line_no)
|
217
|
+
refs << ref
|
218
|
+
fsm.learn(word, ref)
|
219
|
+
end
|
220
|
+
line_no += 1
|
221
|
+
end
|
222
|
+
|
223
|
+
found_lines = []
|
224
|
+
fsm.best_matches('SOFTWARE').each do |match|
|
225
|
+
found_lines << match[0].line
|
226
|
+
end
|
227
|
+
expect(found_lines.sort).to eql([ 4, 5, 5, 7, 8 ])
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should with small search words' do
|
231
|
+
@fsm.clear
|
232
|
+
mats = 'Yukihiro Matsumoto'
|
233
|
+
@fsm.learn(mats)
|
234
|
+
expect(@fsm.best_matches('Yukihiro').first.first).to eql(mats)
|
235
|
+
expect(@fsm.best_matches('Mats', 0.3).first.first).to eql(mats)
|
236
|
+
end
|
237
|
+
|
148
238
|
def check_data_under_test(fsm, dut)
|
149
239
|
dut.each do |inputs, reference|
|
150
240
|
key = inputs[0]
|