perobs 4.0.0 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +27 -16
- data/lib/perobs/Array.rb +66 -19
- data/lib/perobs/BTree.rb +106 -15
- data/lib/perobs/BTreeBlob.rb +4 -3
- data/lib/perobs/BTreeDB.rb +5 -4
- data/lib/perobs/BTreeNode.rb +482 -156
- data/lib/perobs/BTreeNodeLink.rb +10 -0
- data/lib/perobs/BigArray.rb +285 -0
- data/lib/perobs/BigArrayNode.rb +1002 -0
- data/lib/perobs/BigHash.rb +246 -0
- data/lib/perobs/BigTree.rb +197 -0
- data/lib/perobs/BigTreeNode.rb +873 -0
- data/lib/perobs/Cache.rb +48 -10
- data/lib/perobs/ConsoleProgressMeter.rb +61 -0
- data/lib/perobs/DataBase.rb +4 -3
- data/lib/perobs/DynamoDB.rb +57 -15
- data/lib/perobs/EquiBlobsFile.rb +155 -50
- data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
- data/lib/perobs/FlatFile.rb +519 -227
- data/lib/perobs/FlatFileBlobHeader.rb +113 -54
- data/lib/perobs/FlatFileDB.rb +49 -23
- data/lib/perobs/FuzzyStringMatcher.rb +175 -0
- data/lib/perobs/Hash.rb +127 -33
- data/lib/perobs/IDList.rb +144 -0
- data/lib/perobs/IDListPage.rb +107 -0
- data/lib/perobs/IDListPageFile.rb +180 -0
- data/lib/perobs/IDListPageRecord.rb +142 -0
- data/lib/perobs/Object.rb +18 -15
- data/lib/perobs/ObjectBase.rb +46 -5
- data/lib/perobs/PersistentObjectCache.rb +57 -68
- data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
- data/lib/perobs/ProgressMeter.rb +97 -0
- data/lib/perobs/SpaceManager.rb +273 -0
- data/lib/perobs/SpaceTree.rb +21 -12
- data/lib/perobs/SpaceTreeNode.rb +53 -61
- data/lib/perobs/Store.rb +264 -145
- data/lib/perobs/version.rb +1 -1
- data/lib/perobs.rb +2 -0
- data/perobs.gemspec +4 -4
- data/test/Array_spec.rb +15 -6
- data/test/BTree_spec.rb +6 -2
- data/test/BigArray_spec.rb +261 -0
- data/test/BigHash_spec.rb +152 -0
- data/test/BigTreeNode_spec.rb +153 -0
- data/test/BigTree_spec.rb +259 -0
- data/test/EquiBlobsFile_spec.rb +105 -1
- data/test/FNV_Hash_1a_64_spec.rb +59 -0
- data/test/FlatFileDB_spec.rb +198 -14
- data/test/FuzzyStringMatcher_spec.rb +261 -0
- data/test/Hash_spec.rb +13 -3
- data/test/IDList_spec.rb +77 -0
- data/test/LegacyDBs/LegacyDB.rb +155 -0
- data/test/LegacyDBs/version_3/class_map.json +1 -0
- data/test/LegacyDBs/version_3/config.json +1 -0
- data/test/LegacyDBs/version_3/database.blobs +0 -0
- data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
- data/test/LegacyDBs/version_3/index.blobs +0 -0
- data/test/LegacyDBs/version_3/version +1 -0
- data/test/LockFile_spec.rb +9 -6
- data/test/SpaceManager_spec.rb +176 -0
- data/test/SpaceTree_spec.rb +4 -1
- data/test/Store_spec.rb +305 -203
- data/test/spec_helper.rb +9 -4
- metadata +57 -16
- data/lib/perobs/BTreeNodeCache.rb +0 -109
- data/lib/perobs/TreeDB.rb +0 -277
@@ -48,12 +48,13 @@ module PEROBS
|
|
48
48
|
# The 'pack()' format of the header.
|
49
49
|
FORMAT = 'CQQL'
|
50
50
|
# The length of the header in bytes.
|
51
|
-
LENGTH =
|
51
|
+
LENGTH = 25
|
52
52
|
VALID_FLAG_BIT = 0
|
53
53
|
COMPRESSED_FLAG_BIT = 2
|
54
54
|
OUTDATED_FLAG_BIT = 3
|
55
55
|
|
56
56
|
attr_reader :addr, :flags, :length, :id, :crc
|
57
|
+
attr_accessor :corruption_start
|
57
58
|
|
58
59
|
# Create a new FlatFileBlobHeader with the given flags, length, id and crc.
|
59
60
|
# @param file [File] the FlatFile that contains the header
|
@@ -69,50 +70,120 @@ module PEROBS
|
|
69
70
|
@length = length
|
70
71
|
@id = id
|
71
72
|
@crc = crc
|
73
|
+
# This is only set if the header is preceded by a corrupted blob.
|
74
|
+
@corruption_start = nil
|
72
75
|
end
|
73
76
|
|
74
77
|
# Read the header from the given File.
|
75
78
|
# @param file [File]
|
76
|
-
# @
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
# @param addr [Integer] address in the file to start reading. If no
|
80
|
+
# address is specified use the current position in the file.
|
81
|
+
# @param id [Integer] Optional ID that the header should have. If no id is
|
82
|
+
# specified there is no check against the actual ID done.
|
83
|
+
# @return FlatFileBlobHeader or nil if there are no more blobs to read in
|
84
|
+
# the file.
|
85
|
+
def FlatFileBlobHeader::read(file, addr = nil, id = nil)
|
86
|
+
# If an address was specified we expect the read to always succeed. If
|
87
|
+
# no address is specified and we can't read the header we generate an
|
88
|
+
# error message but it is not fatal.
|
89
|
+
errors_are_fatal = !addr.nil?
|
90
|
+
|
91
|
+
mode = :searching_next_header
|
92
|
+
addr = file.pos unless addr
|
93
|
+
buf = nil
|
94
|
+
corruption_start = nil
|
95
|
+
|
96
|
+
loop do
|
97
|
+
buf_with_crc = nil
|
98
|
+
begin
|
99
|
+
file.seek(addr)
|
100
|
+
buf_with_crc = file.read(LENGTH)
|
101
|
+
rescue IOError => e
|
102
|
+
if errors_are_fatal
|
103
|
+
PEROBS.log.fatal "Cannot read blob header in flat file DB at " +
|
104
|
+
"address #{addr}: #{e.message}"
|
105
|
+
else
|
106
|
+
PEROBS.log.error "Cannot read blob header in flat file DB: " +
|
107
|
+
e.message
|
108
|
+
return nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
# Did we read anything?
|
113
|
+
if buf_with_crc.nil?
|
114
|
+
if errors_are_fatal
|
115
|
+
PEROBS.log.fatal "Cannot read blob header " +
|
116
|
+
"#{id ? "for ID #{id} " : ''}at address #{addr}"
|
117
|
+
else
|
118
|
+
if corruption_start
|
119
|
+
PEROBS.log.error "Corruption found at end of blob file at " +
|
120
|
+
"address #{addr}"
|
121
|
+
end
|
122
|
+
# We have reached the end of the file.
|
123
|
+
return nil
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Did we get the full header?
|
128
|
+
if buf_with_crc.length != LENGTH
|
129
|
+
msg = "Incomplete FlatFileBlobHeader: Only " +
|
130
|
+
"#{buf_with_crc.length} " +
|
131
|
+
"bytes of #{LENGTH} could be read "
|
132
|
+
"#{id ? "for ID #{id} " : ''}at address #{addr}"
|
133
|
+
if errors_are_fatal
|
134
|
+
PEROBS.log.fatal msg
|
135
|
+
else
|
136
|
+
PEROBS.log.error msg
|
137
|
+
end
|
138
|
+
return nil
|
139
|
+
end
|
140
|
+
|
141
|
+
# Check the CRC of the header
|
142
|
+
buf = buf_with_crc[0..-5]
|
143
|
+
crc = buf_with_crc[-4..-1].unpack('L')[0]
|
144
|
+
|
145
|
+
if (read_crc = Zlib.crc32(buf, 0)) == crc
|
146
|
+
# We have found a valid header.
|
147
|
+
if corruption_start
|
148
|
+
PEROBS.log.error "FlatFile corruption ends at #{addr}. " +
|
149
|
+
"#{addr - corruption_start} bytes skipped. Some data may " +
|
150
|
+
"not be recoverable."
|
151
|
+
end
|
152
|
+
break
|
153
|
+
else
|
154
|
+
if errors_are_fatal
|
155
|
+
PEROBS.log.fatal "FlatFile Header CRC mismatch at address " +
|
156
|
+
"#{addr}. Header CRC is #{'%08x' % read_crc} but should be " +
|
157
|
+
"#{'%08x' % crc}."
|
158
|
+
else
|
159
|
+
if corruption_start.nil?
|
160
|
+
if errors_are_fatal
|
161
|
+
PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
|
162
|
+
"Header CRC mismatch at address #{addr}. Header CRC is " +
|
163
|
+
"#{'%08x' % read_crc} but should be #{'%08x' % crc}."
|
164
|
+
else
|
165
|
+
PEROBS.log.error "FlatFile corruption found. The FlatFile " +
|
166
|
+
"Header CRC mismatch at address #{addr}. Header CRC is " +
|
167
|
+
"#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
|
168
|
+
"Trying to find the next header."
|
169
|
+
end
|
170
|
+
corruption_start = addr
|
171
|
+
end
|
172
|
+
# The blob file is corrupted. There is no valid header at the
|
173
|
+
# current position in the file. We now try to find the next valid
|
174
|
+
# header by iterating over the remainder of the file advancing one
|
175
|
+
# byte with each step until we hit the end of the file or find the
|
176
|
+
# next valid header.
|
177
|
+
addr += 1
|
178
|
+
end
|
179
|
+
end
|
84
180
|
end
|
85
181
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
PEROBS.log.error "Incomplete FlatFileBlobHeader: Only #{buf.length} " +
|
90
|
-
"bytes of #{LENGTH} could be read"
|
91
|
-
return nil
|
182
|
+
header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
183
|
+
if corruption_start
|
184
|
+
header.corruption_start = corruption_start
|
92
185
|
end
|
93
186
|
|
94
|
-
FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
95
|
-
end
|
96
|
-
|
97
|
-
# Read the header from the given File.
|
98
|
-
# @param file [File]
|
99
|
-
# @param addr [Integer] address in the file to start reading
|
100
|
-
# @param id [Integer] Optional ID that the header should have
|
101
|
-
# @return FlatFileBlobHeader
|
102
|
-
def FlatFileBlobHeader::read_at(file, addr, id = nil)
|
103
|
-
buf = nil
|
104
|
-
begin
|
105
|
-
file.seek(addr)
|
106
|
-
buf = file.read(LENGTH)
|
107
|
-
rescue IOError => e
|
108
|
-
PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
|
109
|
-
end
|
110
|
-
if buf.nil? || buf.length != LENGTH
|
111
|
-
PEROBS.log.fatal "Cannot read blob header " +
|
112
|
-
"#{id ? "for ID #{id} " : ''}at address " +
|
113
|
-
"#{addr}"
|
114
|
-
end
|
115
|
-
header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
116
187
|
if id && header.id != id
|
117
188
|
PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
|
118
189
|
"found. FlatFile has entry with ID #{header.id} at address " +
|
@@ -123,11 +194,12 @@ module PEROBS
|
|
123
194
|
end
|
124
195
|
|
125
196
|
# Write the header to a given File.
|
126
|
-
# @param file [File]
|
127
197
|
def write
|
128
198
|
begin
|
199
|
+
buf = [ @flags, @length, @id, @crc].pack(FORMAT)
|
200
|
+
crc = Zlib.crc32(buf, 0)
|
129
201
|
@file.seek(@addr)
|
130
|
-
@file.write(
|
202
|
+
@file.write(buf + [ crc ].pack('L'))
|
131
203
|
rescue IOError => e
|
132
204
|
PEROBS.log.fatal "Cannot write blob header into flat file DB: " +
|
133
205
|
e.message
|
@@ -135,11 +207,9 @@ module PEROBS
|
|
135
207
|
end
|
136
208
|
|
137
209
|
# Reset all the flags bit to 0. This marks the blob as invalid.
|
138
|
-
# @param file [File] The file handle of the blob file.
|
139
|
-
# @param addr [Integer] The address of the header
|
140
210
|
def clear_flags
|
141
211
|
@flags = 0
|
142
|
-
|
212
|
+
write
|
143
213
|
end
|
144
214
|
|
145
215
|
# Return true if the header is for a non-empty blob.
|
@@ -156,7 +226,7 @@ module PEROBS
|
|
156
226
|
# transaction has been completed.
|
157
227
|
def set_outdated_flag
|
158
228
|
set_flag(OUTDATED_FLAG_BIT)
|
159
|
-
|
229
|
+
write
|
160
230
|
end
|
161
231
|
|
162
232
|
# Return true if the blob contains outdated data.
|
@@ -166,17 +236,6 @@ module PEROBS
|
|
166
236
|
|
167
237
|
private
|
168
238
|
|
169
|
-
def write_flags
|
170
|
-
begin
|
171
|
-
@file.seek(@addr)
|
172
|
-
@file.write([ @flags ].pack('C'))
|
173
|
-
@file.flush
|
174
|
-
rescue IOError => e
|
175
|
-
PEROBS.log.fatal "Writing flags of FlatFileBlobHeader with ID #{@id} " +
|
176
|
-
"failed: #{e.message}"
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
239
|
def bit_set?(n)
|
181
240
|
mask = 1 << n
|
182
241
|
@flags & mask == mask
|
data/lib/perobs/FlatFileDB.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
#
|
3
3
|
# = FlatFileDB.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016
|
5
|
+
# Copyright (c) 2015, 2016, 2017, 2018, 2019
|
6
|
+
# by Chris Schlaeger <chris@taskjuggler.org>
|
6
7
|
#
|
7
8
|
# MIT License
|
8
9
|
#
|
@@ -41,7 +42,7 @@ module PEROBS
|
|
41
42
|
|
42
43
|
# This version number increases whenever the on-disk format changes in a
|
43
44
|
# way that requires conversion actions after an update.
|
44
|
-
VERSION =
|
45
|
+
VERSION = 4
|
45
46
|
|
46
47
|
attr_reader :max_blob_size
|
47
48
|
|
@@ -50,13 +51,17 @@ module PEROBS
|
|
50
51
|
# @param options [Hash] options to customize the behavior. Currently only
|
51
52
|
# the following options are supported:
|
52
53
|
# :serializer : Can be :marshal, :json, :yaml
|
54
|
+
# :progressmeter : Reference to a ProgressMeter object
|
55
|
+
# :log : IO that should be used for logging
|
56
|
+
# :log_level : Minimum Logger level to log
|
53
57
|
def initialize(db_name, options = {})
|
54
|
-
super(options
|
58
|
+
super(options)
|
55
59
|
|
56
60
|
@db_dir = db_name
|
57
61
|
# Create the database directory if it doesn't exist yet.
|
58
62
|
ensure_dir_exists(@db_dir)
|
59
|
-
PEROBS.log.
|
63
|
+
PEROBS.log.level = options[:log_level] if options[:log_level]
|
64
|
+
PEROBS.log.open(options[:log] || File.join(@db_dir, 'log'))
|
60
65
|
check_version_and_upgrade
|
61
66
|
|
62
67
|
# Read the existing DB config.
|
@@ -68,7 +73,7 @@ module PEROBS
|
|
68
73
|
|
69
74
|
# Open the FlatFileDB for transactions.
|
70
75
|
def open
|
71
|
-
@flat_file = FlatFile.new(@db_dir)
|
76
|
+
@flat_file = FlatFile.new(@db_dir, @progressmeter)
|
72
77
|
@flat_file.open
|
73
78
|
PEROBS.log.info "FlatFile '#{@db_dir}' opened"
|
74
79
|
end
|
@@ -143,8 +148,9 @@ module PEROBS
|
|
143
148
|
end
|
144
149
|
end
|
145
150
|
|
146
|
-
|
147
|
-
|
151
|
+
# @return [Integer] Number of objects stored in the DB.
|
152
|
+
def item_counter
|
153
|
+
@flat_file.item_counter
|
148
154
|
end
|
149
155
|
|
150
156
|
# This method must be called to initiate the marking process.
|
@@ -154,9 +160,9 @@ module PEROBS
|
|
154
160
|
|
155
161
|
# Permanently delete all objects that have not been marked. Those are
|
156
162
|
# orphaned and are no longer referenced by any actively used object.
|
157
|
-
# @return [
|
158
|
-
def delete_unmarked_objects
|
159
|
-
@flat_file.delete_unmarked_objects
|
163
|
+
# @return [Integer] Number of the removed objects from the DB.
|
164
|
+
def delete_unmarked_objects(&block)
|
165
|
+
@flat_file.delete_unmarked_objects(&block)
|
160
166
|
end
|
161
167
|
|
162
168
|
# Mark an object.
|
@@ -178,7 +184,11 @@ module PEROBS
|
|
178
184
|
# repaired.
|
179
185
|
# @return number of errors found
|
180
186
|
def check_db(repair = false)
|
181
|
-
|
187
|
+
if repair
|
188
|
+
@flat_file.repair
|
189
|
+
else
|
190
|
+
@flat_file.check
|
191
|
+
end
|
182
192
|
end
|
183
193
|
|
184
194
|
# Check if the stored object is syntactically correct.
|
@@ -226,7 +236,8 @@ module PEROBS
|
|
226
236
|
"'#{version_file}': " + e.message
|
227
237
|
end
|
228
238
|
else
|
229
|
-
#
|
239
|
+
# The DB is brand new.
|
240
|
+
version = VERSION
|
230
241
|
write_version_file(version_file)
|
231
242
|
end
|
232
243
|
|
@@ -234,25 +245,40 @@ module PEROBS
|
|
234
245
|
PEROBS.log.fatal "Cannot downgrade the FlatFile database from " +
|
235
246
|
"version #{version} to version #{VERSION}"
|
236
247
|
end
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
open
|
242
|
-
@flat_file.refresh
|
243
|
-
close
|
248
|
+
if version < 3
|
249
|
+
PEROBS.log.fatal "The upgrade of this version of the PEROBS database " +
|
250
|
+
"is not supported by this version of PEROBS. Please try an earlier " +
|
251
|
+
"version of PEROBS to upgrade the database before using this version."
|
244
252
|
end
|
245
253
|
|
246
|
-
#
|
247
|
-
#
|
248
|
-
|
254
|
+
# Version upgrades must be done one version number at a time. If the
|
255
|
+
# existing DB is multiple versions older than what the current PEROBS
|
256
|
+
# version expects then multiple upgrade runs will be needed.
|
257
|
+
while version < VERSION
|
258
|
+
if version == 3
|
259
|
+
PEROBS.log.warn "Updating FlatFileDB #{@db_dir} from version 3 to " +
|
260
|
+
"version 4 ..."
|
261
|
+
# Version 4 adds checksums for blob file headers. We have to convert
|
262
|
+
# the blob file to include the checksums.
|
263
|
+
FlatFile.insert_header_checksums(@db_dir)
|
264
|
+
open
|
265
|
+
@flat_file.regenerate_index_and_spaces
|
266
|
+
close
|
267
|
+
end
|
268
|
+
|
269
|
+
# After a successful upgrade change the version number in the DB as
|
270
|
+
# well.
|
249
271
|
write_version_file(version_file)
|
250
272
|
PEROBS.log.warn "Update of FlatFileDB '#{@db_dir}' from version " +
|
251
|
-
"#{version} to version #{
|
273
|
+
"#{version} to version #{version + 1} completed"
|
274
|
+
|
275
|
+
# Update version variable to new version.
|
276
|
+
version += 1
|
252
277
|
end
|
253
278
|
end
|
254
279
|
|
255
280
|
def write_version_file(version_file)
|
281
|
+
|
256
282
|
begin
|
257
283
|
RobustFile.write(version_file, VERSION)
|
258
284
|
rescue IOError => e
|
@@ -0,0 +1,175 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/Log'
|
29
|
+
require 'perobs/Object'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
|
+
# against a known set of strings. The dictionary of known strings does not
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modify already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
43
|
+
|
44
|
+
# Create a new FuzzyStringMatcher.
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
46
|
+
# @param case_sensitive [Boolean] True if case matters for matching
|
47
|
+
# @param n [Integer] Determines what kind of n-gramm is used to store the
|
48
|
+
# references in the dictionary. It also determines the minimum word
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
|
52
|
+
super(p)
|
53
|
+
if n < 2 || n > 10
|
54
|
+
raise ArgumentError, 'n must be between 2 and 10'
|
55
|
+
end
|
56
|
+
self.case_sensitive = case_sensitive
|
57
|
+
self.n = n
|
58
|
+
|
59
|
+
clear unless @dict
|
60
|
+
end
|
61
|
+
|
62
|
+
# Wipe the dictionary.
|
63
|
+
def clear
|
64
|
+
self.dict = @store.new(BigHash)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Add a string with its reference to the dictionary.
|
68
|
+
# @param string [String] The string to store
|
69
|
+
# @param reference [Object] Any object that is associated with the string
|
70
|
+
def learn(string, reference = string)
|
71
|
+
reference = string if reference.nil?
|
72
|
+
|
73
|
+
unless @case_sensitive
|
74
|
+
string = string.downcase
|
75
|
+
end
|
76
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
77
|
+
string = "\002" + string + "\003"
|
78
|
+
|
79
|
+
each_n_gramm(string) do |n_gramm|
|
80
|
+
unless (ng_list = @dict[n_gramm])
|
81
|
+
@dict[n_gramm] = ng_list = @store.new(Hash)
|
82
|
+
end
|
83
|
+
|
84
|
+
# We use the Hash as a Set. The value doesn't matter.
|
85
|
+
ng_list[reference] = true unless ng_list.include?(reference)
|
86
|
+
end
|
87
|
+
|
88
|
+
nil
|
89
|
+
end
|
90
|
+
|
91
|
+
# Find the references whose string best matches the given string.
|
92
|
+
# @param string [String] string to search for
|
93
|
+
# @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
|
94
|
+
# the matching should be done. The larger the value the closer
|
95
|
+
# the given string needs to be.
|
96
|
+
# @param max_count [Integer] The maximum number of matches that should be
|
97
|
+
# returned.
|
98
|
+
# @return [Array] The result is an Array of Arrays. The nested Arrays only
|
99
|
+
# have 2 entries. The reference and a Float value between 0 and
|
100
|
+
# 1.0 that describes how good the match is. The matches are sorted
|
101
|
+
# in descending order by the match score.
|
102
|
+
def best_matches(string, min_score = 0.5, max_count = 100)
|
103
|
+
unless @case_sensitive
|
104
|
+
string = string.downcase
|
105
|
+
end
|
106
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
107
|
+
string = "\002" + string + "\003"
|
108
|
+
|
109
|
+
matches = {}
|
110
|
+
|
111
|
+
each_n_gramm(string) do |n_gramm|
|
112
|
+
if (ng_list = @dict[n_gramm])
|
113
|
+
ng_list.each do |reference, dummy|
|
114
|
+
if matches.include?(reference)
|
115
|
+
matches[reference] += 1
|
116
|
+
else
|
117
|
+
matches[reference] = 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
return [] if matches.empty?
|
124
|
+
|
125
|
+
match_list = matches.to_a
|
126
|
+
|
127
|
+
# Set occurrence counters to scores relative to the best possible score.
|
128
|
+
# This will be the best possible score for a perfect match.
|
129
|
+
best_possible_score = string.length - @n + 1
|
130
|
+
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
|
131
|
+
|
132
|
+
# Delete all matches that don't have the required minimum match score.
|
133
|
+
match_list.delete_if { |a| a[1] < min_score }
|
134
|
+
|
135
|
+
# Sort the list best to worst match
|
136
|
+
match_list.sort! do |a, b|
|
137
|
+
b[1] <=> a[1]
|
138
|
+
end
|
139
|
+
|
140
|
+
# Return the top max_count matches.
|
141
|
+
match_list[0..max_count - 1]
|
142
|
+
end
|
143
|
+
|
144
|
+
# Returns some internal stats about the dictionary.
|
145
|
+
def stats
|
146
|
+
s = {}
|
147
|
+
s['dictionary_size'] = @dict.size
|
148
|
+
max = total = 0
|
149
|
+
@dict.each do |n_gramm, ng_list|
|
150
|
+
size = ng_list.length
|
151
|
+
max = size if size > max
|
152
|
+
total += size
|
153
|
+
end
|
154
|
+
s['max_list_size'] = max
|
155
|
+
s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
|
156
|
+
|
157
|
+
s
|
158
|
+
end
|
159
|
+
|
160
|
+
private
|
161
|
+
|
162
|
+
def each_n_gramm(string, &block)
|
163
|
+
return if string.length < @n
|
164
|
+
|
165
|
+
0.upto(string.length - @n) do |i|
|
166
|
+
n_gramm = string[i, @n]
|
167
|
+
|
168
|
+
yield(n_gramm)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|