perobs 4.0.0 → 4.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +27 -16
- data/lib/perobs/Array.rb +66 -19
- data/lib/perobs/BTree.rb +106 -15
- data/lib/perobs/BTreeBlob.rb +4 -3
- data/lib/perobs/BTreeDB.rb +5 -4
- data/lib/perobs/BTreeNode.rb +482 -156
- data/lib/perobs/BTreeNodeLink.rb +10 -0
- data/lib/perobs/BigArray.rb +285 -0
- data/lib/perobs/BigArrayNode.rb +1002 -0
- data/lib/perobs/BigHash.rb +246 -0
- data/lib/perobs/BigTree.rb +197 -0
- data/lib/perobs/BigTreeNode.rb +873 -0
- data/lib/perobs/Cache.rb +48 -10
- data/lib/perobs/ConsoleProgressMeter.rb +61 -0
- data/lib/perobs/DataBase.rb +4 -3
- data/lib/perobs/DynamoDB.rb +57 -15
- data/lib/perobs/EquiBlobsFile.rb +155 -50
- data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
- data/lib/perobs/FlatFile.rb +519 -227
- data/lib/perobs/FlatFileBlobHeader.rb +113 -54
- data/lib/perobs/FlatFileDB.rb +49 -23
- data/lib/perobs/FuzzyStringMatcher.rb +175 -0
- data/lib/perobs/Hash.rb +127 -33
- data/lib/perobs/IDList.rb +144 -0
- data/lib/perobs/IDListPage.rb +107 -0
- data/lib/perobs/IDListPageFile.rb +180 -0
- data/lib/perobs/IDListPageRecord.rb +142 -0
- data/lib/perobs/Object.rb +18 -15
- data/lib/perobs/ObjectBase.rb +46 -5
- data/lib/perobs/PersistentObjectCache.rb +57 -68
- data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
- data/lib/perobs/ProgressMeter.rb +97 -0
- data/lib/perobs/SpaceManager.rb +273 -0
- data/lib/perobs/SpaceTree.rb +21 -12
- data/lib/perobs/SpaceTreeNode.rb +53 -61
- data/lib/perobs/Store.rb +264 -145
- data/lib/perobs/version.rb +1 -1
- data/lib/perobs.rb +2 -0
- data/perobs.gemspec +4 -4
- data/test/Array_spec.rb +15 -6
- data/test/BTree_spec.rb +6 -2
- data/test/BigArray_spec.rb +261 -0
- data/test/BigHash_spec.rb +152 -0
- data/test/BigTreeNode_spec.rb +153 -0
- data/test/BigTree_spec.rb +259 -0
- data/test/EquiBlobsFile_spec.rb +105 -1
- data/test/FNV_Hash_1a_64_spec.rb +59 -0
- data/test/FlatFileDB_spec.rb +198 -14
- data/test/FuzzyStringMatcher_spec.rb +261 -0
- data/test/Hash_spec.rb +13 -3
- data/test/IDList_spec.rb +77 -0
- data/test/LegacyDBs/LegacyDB.rb +155 -0
- data/test/LegacyDBs/version_3/class_map.json +1 -0
- data/test/LegacyDBs/version_3/config.json +1 -0
- data/test/LegacyDBs/version_3/database.blobs +0 -0
- data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
- data/test/LegacyDBs/version_3/index.blobs +0 -0
- data/test/LegacyDBs/version_3/version +1 -0
- data/test/LockFile_spec.rb +9 -6
- data/test/SpaceManager_spec.rb +176 -0
- data/test/SpaceTree_spec.rb +4 -1
- data/test/Store_spec.rb +305 -203
- data/test/spec_helper.rb +9 -4
- metadata +57 -16
- data/lib/perobs/BTreeNodeCache.rb +0 -109
- data/lib/perobs/TreeDB.rb +0 -277
@@ -48,12 +48,13 @@ module PEROBS
|
|
48
48
|
# The 'pack()' format of the header.
|
49
49
|
FORMAT = 'CQQL'
|
50
50
|
# The length of the header in bytes.
|
51
|
-
LENGTH =
|
51
|
+
LENGTH = 25
|
52
52
|
VALID_FLAG_BIT = 0
|
53
53
|
COMPRESSED_FLAG_BIT = 2
|
54
54
|
OUTDATED_FLAG_BIT = 3
|
55
55
|
|
56
56
|
attr_reader :addr, :flags, :length, :id, :crc
|
57
|
+
attr_accessor :corruption_start
|
57
58
|
|
58
59
|
# Create a new FlatFileBlobHeader with the given flags, length, id and crc.
|
59
60
|
# @param file [File] the FlatFile that contains the header
|
@@ -69,50 +70,120 @@ module PEROBS
|
|
69
70
|
@length = length
|
70
71
|
@id = id
|
71
72
|
@crc = crc
|
73
|
+
# This is only set if the header is preceded by a corrupted blob.
|
74
|
+
@corruption_start = nil
|
72
75
|
end
|
73
76
|
|
74
77
|
# Read the header from the given File.
|
75
78
|
# @param file [File]
|
76
|
-
# @
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
# @param addr [Integer] address in the file to start reading. If no
|
80
|
+
# address is specified use the current position in the file.
|
81
|
+
# @param id [Integer] Optional ID that the header should have. If no id is
|
82
|
+
# specified there is no check against the actual ID done.
|
83
|
+
# @return FlatFileBlobHeader or nil if there are no more blobs to read in
|
84
|
+
# the file.
|
85
|
+
def FlatFileBlobHeader::read(file, addr = nil, id = nil)
|
86
|
+
# If an address was specified we expect the read to always succeed. If
|
87
|
+
# no address is specified and we can't read the header we generate an
|
88
|
+
# error message but it is not fatal.
|
89
|
+
errors_are_fatal = !addr.nil?
|
90
|
+
|
91
|
+
mode = :searching_next_header
|
92
|
+
addr = file.pos unless addr
|
93
|
+
buf = nil
|
94
|
+
corruption_start = nil
|
95
|
+
|
96
|
+
loop do
|
97
|
+
buf_with_crc = nil
|
98
|
+
begin
|
99
|
+
file.seek(addr)
|
100
|
+
buf_with_crc = file.read(LENGTH)
|
101
|
+
rescue IOError => e
|
102
|
+
if errors_are_fatal
|
103
|
+
PEROBS.log.fatal "Cannot read blob header in flat file DB at " +
|
104
|
+
"address #{addr}: #{e.message}"
|
105
|
+
else
|
106
|
+
PEROBS.log.error "Cannot read blob header in flat file DB: " +
|
107
|
+
e.message
|
108
|
+
return nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
# Did we read anything?
|
113
|
+
if buf_with_crc.nil?
|
114
|
+
if errors_are_fatal
|
115
|
+
PEROBS.log.fatal "Cannot read blob header " +
|
116
|
+
"#{id ? "for ID #{id} " : ''}at address #{addr}"
|
117
|
+
else
|
118
|
+
if corruption_start
|
119
|
+
PEROBS.log.error "Corruption found at end of blob file at " +
|
120
|
+
"address #{addr}"
|
121
|
+
end
|
122
|
+
# We have reached the end of the file.
|
123
|
+
return nil
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Did we get the full header?
|
128
|
+
if buf_with_crc.length != LENGTH
|
129
|
+
msg = "Incomplete FlatFileBlobHeader: Only " +
|
130
|
+
"#{buf_with_crc.length} " +
|
131
|
+
"bytes of #{LENGTH} could be read "
|
132
|
+
"#{id ? "for ID #{id} " : ''}at address #{addr}"
|
133
|
+
if errors_are_fatal
|
134
|
+
PEROBS.log.fatal msg
|
135
|
+
else
|
136
|
+
PEROBS.log.error msg
|
137
|
+
end
|
138
|
+
return nil
|
139
|
+
end
|
140
|
+
|
141
|
+
# Check the CRC of the header
|
142
|
+
buf = buf_with_crc[0..-5]
|
143
|
+
crc = buf_with_crc[-4..-1].unpack('L')[0]
|
144
|
+
|
145
|
+
if (read_crc = Zlib.crc32(buf, 0)) == crc
|
146
|
+
# We have found a valid header.
|
147
|
+
if corruption_start
|
148
|
+
PEROBS.log.error "FlatFile corruption ends at #{addr}. " +
|
149
|
+
"#{addr - corruption_start} bytes skipped. Some data may " +
|
150
|
+
"not be recoverable."
|
151
|
+
end
|
152
|
+
break
|
153
|
+
else
|
154
|
+
if errors_are_fatal
|
155
|
+
PEROBS.log.fatal "FlatFile Header CRC mismatch at address " +
|
156
|
+
"#{addr}. Header CRC is #{'%08x' % read_crc} but should be " +
|
157
|
+
"#{'%08x' % crc}."
|
158
|
+
else
|
159
|
+
if corruption_start.nil?
|
160
|
+
if errors_are_fatal
|
161
|
+
PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
|
162
|
+
"Header CRC mismatch at address #{addr}. Header CRC is " +
|
163
|
+
"#{'%08x' % read_crc} but should be #{'%08x' % crc}."
|
164
|
+
else
|
165
|
+
PEROBS.log.error "FlatFile corruption found. The FlatFile " +
|
166
|
+
"Header CRC mismatch at address #{addr}. Header CRC is " +
|
167
|
+
"#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
|
168
|
+
"Trying to find the next header."
|
169
|
+
end
|
170
|
+
corruption_start = addr
|
171
|
+
end
|
172
|
+
# The blob file is corrupted. There is no valid header at the
|
173
|
+
# current position in the file. We now try to find the next valid
|
174
|
+
# header by iterating over the remainder of the file advancing one
|
175
|
+
# byte with each step until we hit the end of the file or find the
|
176
|
+
# next valid header.
|
177
|
+
addr += 1
|
178
|
+
end
|
179
|
+
end
|
84
180
|
end
|
85
181
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
PEROBS.log.error "Incomplete FlatFileBlobHeader: Only #{buf.length} " +
|
90
|
-
"bytes of #{LENGTH} could be read"
|
91
|
-
return nil
|
182
|
+
header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
183
|
+
if corruption_start
|
184
|
+
header.corruption_start = corruption_start
|
92
185
|
end
|
93
186
|
|
94
|
-
FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
95
|
-
end
|
96
|
-
|
97
|
-
# Read the header from the given File.
|
98
|
-
# @param file [File]
|
99
|
-
# @param addr [Integer] address in the file to start reading
|
100
|
-
# @param id [Integer] Optional ID that the header should have
|
101
|
-
# @return FlatFileBlobHeader
|
102
|
-
def FlatFileBlobHeader::read_at(file, addr, id = nil)
|
103
|
-
buf = nil
|
104
|
-
begin
|
105
|
-
file.seek(addr)
|
106
|
-
buf = file.read(LENGTH)
|
107
|
-
rescue IOError => e
|
108
|
-
PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
|
109
|
-
end
|
110
|
-
if buf.nil? || buf.length != LENGTH
|
111
|
-
PEROBS.log.fatal "Cannot read blob header " +
|
112
|
-
"#{id ? "for ID #{id} " : ''}at address " +
|
113
|
-
"#{addr}"
|
114
|
-
end
|
115
|
-
header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
|
116
187
|
if id && header.id != id
|
117
188
|
PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
|
118
189
|
"found. FlatFile has entry with ID #{header.id} at address " +
|
@@ -123,11 +194,12 @@ module PEROBS
|
|
123
194
|
end
|
124
195
|
|
125
196
|
# Write the header to a given File.
|
126
|
-
# @param file [File]
|
127
197
|
def write
|
128
198
|
begin
|
199
|
+
buf = [ @flags, @length, @id, @crc].pack(FORMAT)
|
200
|
+
crc = Zlib.crc32(buf, 0)
|
129
201
|
@file.seek(@addr)
|
130
|
-
@file.write(
|
202
|
+
@file.write(buf + [ crc ].pack('L'))
|
131
203
|
rescue IOError => e
|
132
204
|
PEROBS.log.fatal "Cannot write blob header into flat file DB: " +
|
133
205
|
e.message
|
@@ -135,11 +207,9 @@ module PEROBS
|
|
135
207
|
end
|
136
208
|
|
137
209
|
# Reset all the flag bits to 0. This marks the blob as invalid.
|
138
|
-
# @param file [File] The file handle of the blob file.
|
139
|
-
# @param addr [Integer] The address of the header
|
140
210
|
def clear_flags
|
141
211
|
@flags = 0
|
142
|
-
|
212
|
+
write
|
143
213
|
end
|
144
214
|
|
145
215
|
# Return true if the header is for a non-empty blob.
|
@@ -156,7 +226,7 @@ module PEROBS
|
|
156
226
|
# transaction has been completed.
|
157
227
|
def set_outdated_flag
|
158
228
|
set_flag(OUTDATED_FLAG_BIT)
|
159
|
-
|
229
|
+
write
|
160
230
|
end
|
161
231
|
|
162
232
|
# Return true if the blob contains outdated data.
|
@@ -166,17 +236,6 @@ module PEROBS
|
|
166
236
|
|
167
237
|
private
|
168
238
|
|
169
|
-
def write_flags
|
170
|
-
begin
|
171
|
-
@file.seek(@addr)
|
172
|
-
@file.write([ @flags ].pack('C'))
|
173
|
-
@file.flush
|
174
|
-
rescue IOError => e
|
175
|
-
PEROBS.log.fatal "Writing flags of FlatFileBlobHeader with ID #{@id} " +
|
176
|
-
"failed: #{e.message}"
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
239
|
def bit_set?(n)
|
181
240
|
mask = 1 << n
|
182
241
|
@flags & mask == mask
|
data/lib/perobs/FlatFileDB.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
#
|
3
3
|
# = FlatFileDB.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016
|
5
|
+
# Copyright (c) 2015, 2016, 2017, 2018, 2019
|
6
|
+
# by Chris Schlaeger <chris@taskjuggler.org>
|
6
7
|
#
|
7
8
|
# MIT License
|
8
9
|
#
|
@@ -41,7 +42,7 @@ module PEROBS
|
|
41
42
|
|
42
43
|
# This version number increases whenever the on-disk format changes in a
|
43
44
|
# way that requires conversion actions after an update.
|
44
|
-
VERSION =
|
45
|
+
VERSION = 4
|
45
46
|
|
46
47
|
attr_reader :max_blob_size
|
47
48
|
|
@@ -50,13 +51,17 @@ module PEROBS
|
|
50
51
|
# @param options [Hash] options to customize the behavior. Currently only
|
51
52
|
# the following options are supported:
|
52
53
|
# :serializer : Can be :marshal, :json, :yaml
|
54
|
+
# :progressmeter : Reference to a ProgressMeter object
|
55
|
+
# :log : IO that should be used for logging
|
56
|
+
# :log_level : Minimum Logger level to log
|
53
57
|
def initialize(db_name, options = {})
|
54
|
-
super(options
|
58
|
+
super(options)
|
55
59
|
|
56
60
|
@db_dir = db_name
|
57
61
|
# Create the database directory if it doesn't exist yet.
|
58
62
|
ensure_dir_exists(@db_dir)
|
59
|
-
PEROBS.log.
|
63
|
+
PEROBS.log.level = options[:log_level] if options[:log_level]
|
64
|
+
PEROBS.log.open(options[:log] || File.join(@db_dir, 'log'))
|
60
65
|
check_version_and_upgrade
|
61
66
|
|
62
67
|
# Read the existing DB config.
|
@@ -68,7 +73,7 @@ module PEROBS
|
|
68
73
|
|
69
74
|
# Open the FlatFileDB for transactions.
|
70
75
|
def open
|
71
|
-
@flat_file = FlatFile.new(@db_dir)
|
76
|
+
@flat_file = FlatFile.new(@db_dir, @progressmeter)
|
72
77
|
@flat_file.open
|
73
78
|
PEROBS.log.info "FlatFile '#{@db_dir}' opened"
|
74
79
|
end
|
@@ -143,8 +148,9 @@ module PEROBS
|
|
143
148
|
end
|
144
149
|
end
|
145
150
|
|
146
|
-
|
147
|
-
|
151
|
+
# @return [Integer] Number of objects stored in the DB.
|
152
|
+
def item_counter
|
153
|
+
@flat_file.item_counter
|
148
154
|
end
|
149
155
|
|
150
156
|
# This method must be called to initiate the marking process.
|
@@ -154,9 +160,9 @@ module PEROBS
|
|
154
160
|
|
155
161
|
# Permanently delete all objects that have not been marked. Those are
|
156
162
|
# orphaned and are no longer referenced by any actively used object.
|
157
|
-
# @return [
|
158
|
-
def delete_unmarked_objects
|
159
|
-
@flat_file.delete_unmarked_objects
|
163
|
+
# @return [Integer] Number of objects removed from the DB.
|
164
|
+
def delete_unmarked_objects(&block)
|
165
|
+
@flat_file.delete_unmarked_objects(&block)
|
160
166
|
end
|
161
167
|
|
162
168
|
# Mark an object.
|
@@ -178,7 +184,11 @@ module PEROBS
|
|
178
184
|
# repaired.
|
179
185
|
# @return number of errors found
|
180
186
|
def check_db(repair = false)
|
181
|
-
|
187
|
+
if repair
|
188
|
+
@flat_file.repair
|
189
|
+
else
|
190
|
+
@flat_file.check
|
191
|
+
end
|
182
192
|
end
|
183
193
|
|
184
194
|
# Check if the stored object is syntactically correct.
|
@@ -226,7 +236,8 @@ module PEROBS
|
|
226
236
|
"'#{version_file}': " + e.message
|
227
237
|
end
|
228
238
|
else
|
229
|
-
#
|
239
|
+
# The DB is brand new.
|
240
|
+
version = VERSION
|
230
241
|
write_version_file(version_file)
|
231
242
|
end
|
232
243
|
|
@@ -234,25 +245,40 @@ module PEROBS
|
|
234
245
|
PEROBS.log.fatal "Cannot downgrade the FlatFile database from " +
|
235
246
|
"version #{version} to version #{VERSION}"
|
236
247
|
end
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
open
|
242
|
-
@flat_file.refresh
|
243
|
-
close
|
248
|
+
if version < 3
|
249
|
+
PEROBS.log.fatal "The upgrade of this version of the PEROBS database " +
|
250
|
+
"is not supported by this version of PEROBS. Please try an earlier " +
|
251
|
+
"version of PEROBS to upgrade the database before using this version."
|
244
252
|
end
|
245
253
|
|
246
|
-
#
|
247
|
-
#
|
248
|
-
|
254
|
+
# Version upgrades must be done one version number at a time. If the
|
255
|
+
# existing DB is multiple versions older than what the current PEROBS
|
256
|
+
# version expects then multiple upgrade runs will be needed.
|
257
|
+
while version < VERSION
|
258
|
+
if version == 3
|
259
|
+
PEROBS.log.warn "Updating FlatFileDB #{@db_dir} from version 3 to " +
|
260
|
+
"version 4 ..."
|
261
|
+
# Version 4 adds checksums for blob file headers. We have to convert
|
262
|
+
# the blob file to include the checksums.
|
263
|
+
FlatFile.insert_header_checksums(@db_dir)
|
264
|
+
open
|
265
|
+
@flat_file.regenerate_index_and_spaces
|
266
|
+
close
|
267
|
+
end
|
268
|
+
|
269
|
+
# After a successful upgrade change the version number in the DB as
|
270
|
+
# well.
|
249
271
|
write_version_file(version_file)
|
250
272
|
PEROBS.log.warn "Update of FlatFileDB '#{@db_dir}' from version " +
|
251
|
-
"#{version} to version #{
|
273
|
+
"#{version} to version #{version + 1} completed"
|
274
|
+
|
275
|
+
# Update version variable to new version.
|
276
|
+
version += 1
|
252
277
|
end
|
253
278
|
end
|
254
279
|
|
255
280
|
def write_version_file(version_file)
|
281
|
+
|
256
282
|
begin
|
257
283
|
RobustFile.write(version_file, VERSION)
|
258
284
|
rescue IOError => e
|
@@ -0,0 +1,175 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/Log'
|
29
|
+
require 'perobs/Object'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
|
+
# against a known set of strings. The dictionary of known strings does not
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modify already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
43
|
+
|
44
|
+
# Create a new FuzzyStringMatcher.
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
46
|
+
# @param case_sensitive [Boolean] True if case matters for matching
|
47
|
+
# @param n [Integer] Determines what kind of n-gramm is used to store the
|
48
|
+
# references in the dictionary. It also determines the minimum word
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
|
52
|
+
super(p)
|
53
|
+
if n < 2 || n > 10
|
54
|
+
raise ArgumentError, 'n must be between 2 and 10'
|
55
|
+
end
|
56
|
+
self.case_sensitive = case_sensitive
|
57
|
+
self.n = n
|
58
|
+
|
59
|
+
clear unless @dict
|
60
|
+
end
|
61
|
+
|
62
|
+
# Wipe the dictionary.
|
63
|
+
def clear
|
64
|
+
self.dict = @store.new(BigHash)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Add a string with its reference to the dictionary.
|
68
|
+
# @param string [String] The string to store
|
69
|
+
# @param reference [Object] Any object that is associated with the string
|
70
|
+
def learn(string, reference = string)
|
71
|
+
reference = string if reference.nil?
|
72
|
+
|
73
|
+
unless @case_sensitive
|
74
|
+
string = string.downcase
|
75
|
+
end
|
76
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
77
|
+
string = "\002" + string + "\003"
|
78
|
+
|
79
|
+
each_n_gramm(string) do |n_gramm|
|
80
|
+
unless (ng_list = @dict[n_gramm])
|
81
|
+
@dict[n_gramm] = ng_list = @store.new(Hash)
|
82
|
+
end
|
83
|
+
|
84
|
+
# We use the Hash as a Set. The value doesn't matter.
|
85
|
+
ng_list[reference] = true unless ng_list.include?(reference)
|
86
|
+
end
|
87
|
+
|
88
|
+
nil
|
89
|
+
end
|
90
|
+
|
91
|
+
# Find the references whose string best matches the given string.
|
92
|
+
# @param string [String] string to search for
|
93
|
+
# @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
|
94
|
+
# the matching should be done. The larger the value the closer
|
95
|
+
# the given string needs to be.
|
96
|
+
# @param max_count [Integer] The maximum number of matches that should be
|
97
|
+
# returned.
|
98
|
+
# @return [Array] The result is an Array of Arrays. The nested Arrays only
|
99
|
+
# have 2 entries. The reference and a Float value between 0 and
|
100
|
+
# 1.0 that describes how good the match is. The matches are sorted
|
101
|
+
# in descending order by the match score.
|
102
|
+
def best_matches(string, min_score = 0.5, max_count = 100)
|
103
|
+
unless @case_sensitive
|
104
|
+
string = string.downcase
|
105
|
+
end
|
106
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
107
|
+
string = "\002" + string + "\003"
|
108
|
+
|
109
|
+
matches = {}
|
110
|
+
|
111
|
+
each_n_gramm(string) do |n_gramm|
|
112
|
+
if (ng_list = @dict[n_gramm])
|
113
|
+
ng_list.each do |reference, dummy|
|
114
|
+
if matches.include?(reference)
|
115
|
+
matches[reference] += 1
|
116
|
+
else
|
117
|
+
matches[reference] = 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
return [] if matches.empty?
|
124
|
+
|
125
|
+
match_list = matches.to_a
|
126
|
+
|
127
|
+
# Set occurrence counters to scores relative to the best possible score.
|
128
|
+
# This will be the best possible score for a perfect match.
|
129
|
+
best_possible_score = string.length - @n + 1
|
130
|
+
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
|
131
|
+
|
132
|
+
# Delete all matches that don't have the required minimum match score.
|
133
|
+
match_list.delete_if { |a| a[1] < min_score }
|
134
|
+
|
135
|
+
# Sort the list best to worst match
|
136
|
+
match_list.sort! do |a, b|
|
137
|
+
b[1] <=> a[1]
|
138
|
+
end
|
139
|
+
|
140
|
+
# Return the top max_count matches.
|
141
|
+
match_list[0..max_count - 1]
|
142
|
+
end
|
143
|
+
|
144
|
+
# Returns some internal stats about the dictionary.
|
145
|
+
def stats
|
146
|
+
s = {}
|
147
|
+
s['dictionary_size'] = @dict.size
|
148
|
+
max = total = 0
|
149
|
+
@dict.each do |n_gramm, ng_list|
|
150
|
+
size = ng_list.length
|
151
|
+
max = size if size > max
|
152
|
+
total += size
|
153
|
+
end
|
154
|
+
s['max_list_size'] = max
|
155
|
+
s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
|
156
|
+
|
157
|
+
s
|
158
|
+
end
|
159
|
+
|
160
|
+
private
|
161
|
+
|
162
|
+
def each_n_gramm(string, &block)
|
163
|
+
return if string.length < @n
|
164
|
+
|
165
|
+
0.upto(string.length - @n) do |i|
|
166
|
+
n_gramm = string[i, @n]
|
167
|
+
|
168
|
+
yield(n_gramm)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|