perobs 4.0.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +27 -16
  3. data/lib/perobs/Array.rb +66 -19
  4. data/lib/perobs/BTree.rb +106 -15
  5. data/lib/perobs/BTreeBlob.rb +4 -3
  6. data/lib/perobs/BTreeDB.rb +5 -4
  7. data/lib/perobs/BTreeNode.rb +482 -156
  8. data/lib/perobs/BTreeNodeLink.rb +10 -0
  9. data/lib/perobs/BigArray.rb +285 -0
  10. data/lib/perobs/BigArrayNode.rb +1002 -0
  11. data/lib/perobs/BigHash.rb +246 -0
  12. data/lib/perobs/BigTree.rb +197 -0
  13. data/lib/perobs/BigTreeNode.rb +873 -0
  14. data/lib/perobs/Cache.rb +48 -10
  15. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  16. data/lib/perobs/DataBase.rb +4 -3
  17. data/lib/perobs/DynamoDB.rb +57 -15
  18. data/lib/perobs/EquiBlobsFile.rb +155 -50
  19. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  20. data/lib/perobs/FlatFile.rb +519 -227
  21. data/lib/perobs/FlatFileBlobHeader.rb +113 -54
  22. data/lib/perobs/FlatFileDB.rb +49 -23
  23. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  24. data/lib/perobs/Hash.rb +127 -33
  25. data/lib/perobs/IDList.rb +144 -0
  26. data/lib/perobs/IDListPage.rb +107 -0
  27. data/lib/perobs/IDListPageFile.rb +180 -0
  28. data/lib/perobs/IDListPageRecord.rb +142 -0
  29. data/lib/perobs/Object.rb +18 -15
  30. data/lib/perobs/ObjectBase.rb +46 -5
  31. data/lib/perobs/PersistentObjectCache.rb +57 -68
  32. data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
  33. data/lib/perobs/ProgressMeter.rb +97 -0
  34. data/lib/perobs/SpaceManager.rb +273 -0
  35. data/lib/perobs/SpaceTree.rb +21 -12
  36. data/lib/perobs/SpaceTreeNode.rb +53 -61
  37. data/lib/perobs/Store.rb +264 -145
  38. data/lib/perobs/version.rb +1 -1
  39. data/lib/perobs.rb +2 -0
  40. data/perobs.gemspec +4 -4
  41. data/test/Array_spec.rb +15 -6
  42. data/test/BTree_spec.rb +6 -2
  43. data/test/BigArray_spec.rb +261 -0
  44. data/test/BigHash_spec.rb +152 -0
  45. data/test/BigTreeNode_spec.rb +153 -0
  46. data/test/BigTree_spec.rb +259 -0
  47. data/test/EquiBlobsFile_spec.rb +105 -1
  48. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  49. data/test/FlatFileDB_spec.rb +198 -14
  50. data/test/FuzzyStringMatcher_spec.rb +261 -0
  51. data/test/Hash_spec.rb +13 -3
  52. data/test/IDList_spec.rb +77 -0
  53. data/test/LegacyDBs/LegacyDB.rb +155 -0
  54. data/test/LegacyDBs/version_3/class_map.json +1 -0
  55. data/test/LegacyDBs/version_3/config.json +1 -0
  56. data/test/LegacyDBs/version_3/database.blobs +0 -0
  57. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  58. data/test/LegacyDBs/version_3/index.blobs +0 -0
  59. data/test/LegacyDBs/version_3/version +1 -0
  60. data/test/LockFile_spec.rb +9 -6
  61. data/test/SpaceManager_spec.rb +176 -0
  62. data/test/SpaceTree_spec.rb +4 -1
  63. data/test/Store_spec.rb +305 -203
  64. data/test/spec_helper.rb +9 -4
  65. metadata +57 -16
  66. data/lib/perobs/BTreeNodeCache.rb +0 -109
  67. data/lib/perobs/TreeDB.rb +0 -277
@@ -48,12 +48,13 @@ module PEROBS
48
48
  # The 'pack()' format of the header.
49
49
  FORMAT = 'CQQL'
50
50
  # The length of the header in bytes.
51
- LENGTH = 21
51
+ LENGTH = 25
52
52
  VALID_FLAG_BIT = 0
53
53
  COMPRESSED_FLAG_BIT = 2
54
54
  OUTDATED_FLAG_BIT = 3
55
55
 
56
56
  attr_reader :addr, :flags, :length, :id, :crc
57
+ attr_accessor :corruption_start
57
58
 
58
59
  # Create a new FlatFileBlobHeader with the given flags, length, id and crc.
59
60
  # @param file [File] the FlatFile that contains the header
@@ -69,50 +70,120 @@ module PEROBS
69
70
  @length = length
70
71
  @id = id
71
72
  @crc = crc
73
+ # This is only set if the header is preceded by a corrupted blob.
74
+ @corruption_start = nil
72
75
  end
73
76
 
74
77
  # Read the header from the given File.
75
78
  # @param file [File]
76
- # @return FlatFileBlobHeader
77
- def FlatFileBlobHeader::read(file)
78
- begin
79
- addr = file.pos
80
- buf = file.read(LENGTH)
81
- rescue IOError => e
82
- PEROBS.log.error "Cannot read blob header in flat file DB: #{e.message}"
83
- return nil
79
+ # @param addr [Integer] address in the file to start reading. If no
80
+ # address is specified use the current position in the file.
81
+ # @param id [Integer] Optional ID that the header should have. If no id is
82
+ # specified there is no check against the actual ID done.
83
+ # @return FlatFileBlobHeader or nil if there are no more blobs to read in
84
+ # the file.
85
+ def FlatFileBlobHeader::read(file, addr = nil, id = nil)
86
+ # If an address was specified we expect the read to always succeed. If
87
+ # no address is specified and we can't read the header we generate an
88
+ # error message but it is not fatal.
89
+ errors_are_fatal = !addr.nil?
90
+
91
+ mode = :searching_next_header
92
+ addr = file.pos unless addr
93
+ buf = nil
94
+ corruption_start = nil
95
+
96
+ loop do
97
+ buf_with_crc = nil
98
+ begin
99
+ file.seek(addr)
100
+ buf_with_crc = file.read(LENGTH)
101
+ rescue IOError => e
102
+ if errors_are_fatal
103
+ PEROBS.log.fatal "Cannot read blob header in flat file DB at " +
104
+ "address #{addr}: #{e.message}"
105
+ else
106
+ PEROBS.log.error "Cannot read blob header in flat file DB: " +
107
+ e.message
108
+ return nil
109
+ end
110
+ end
111
+
112
+ # Did we read anything?
113
+ if buf_with_crc.nil?
114
+ if errors_are_fatal
115
+ PEROBS.log.fatal "Cannot read blob header " +
116
+ "#{id ? "for ID #{id} " : ''}at address #{addr}"
117
+ else
118
+ if corruption_start
119
+ PEROBS.log.error "Corruption found at end of blob file at " +
120
+ "address #{addr}"
121
+ end
122
+ # We have reached the end of the file.
123
+ return nil
124
+ end
125
+ end
126
+
127
+ # Did we get the full header?
128
+ if buf_with_crc.length != LENGTH
129
+ msg = "Incomplete FlatFileBlobHeader: Only " +
130
+ "#{buf_with_crc.length} " +
131
+ "bytes of #{LENGTH} could be read "
132
+ "#{id ? "for ID #{id} " : ''}at address #{addr}"
133
+ if errors_are_fatal
134
+ PEROBS.log.fatal msg
135
+ else
136
+ PEROBS.log.error msg
137
+ end
138
+ return nil
139
+ end
140
+
141
+ # Check the CRC of the header
142
+ buf = buf_with_crc[0..-5]
143
+ crc = buf_with_crc[-4..-1].unpack('L')[0]
144
+
145
+ if (read_crc = Zlib.crc32(buf, 0)) == crc
146
+ # We have found a valid header.
147
+ if corruption_start
148
+ PEROBS.log.error "FlatFile corruption ends at #{addr}. " +
149
+ "#{addr - corruption_start} bytes skipped. Some data may " +
150
+ "not be recoverable."
151
+ end
152
+ break
153
+ else
154
+ if errors_are_fatal
155
+ PEROBS.log.fatal "FlatFile Header CRC mismatch at address " +
156
+ "#{addr}. Header CRC is #{'%08x' % read_crc} but should be " +
157
+ "#{'%08x' % crc}."
158
+ else
159
+ if corruption_start.nil?
160
+ if errors_are_fatal
161
+ PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
162
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
163
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}."
164
+ else
165
+ PEROBS.log.error "FlatFile corruption found. The FlatFile " +
166
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
167
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
168
+ "Trying to find the next header."
169
+ end
170
+ corruption_start = addr
171
+ end
172
+ # The blob file is corrupted. There is no valid header at the
173
+ # current position in the file. We now try to find the next valid
174
+ # header by iterating over the remainder of the file advancing one
175
+ # byte with each step until we hit the end of the file or find the
176
+ # next valid header.
177
+ addr += 1
178
+ end
179
+ end
84
180
  end
85
181
 
86
- return nil unless buf
87
-
88
- if buf.length != LENGTH
89
- PEROBS.log.error "Incomplete FlatFileBlobHeader: Only #{buf.length} " +
90
- "bytes of #{LENGTH} could be read"
91
- return nil
182
+ header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
183
+ if corruption_start
184
+ header.corruption_start = corruption_start
92
185
  end
93
186
 
94
- FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
95
- end
96
-
97
- # Read the header from the given File.
98
- # @param file [File]
99
- # @param addr [Integer] address in the file to start reading
100
- # @param id [Integer] Optional ID that the header should have
101
- # @return FlatFileBlobHeader
102
- def FlatFileBlobHeader::read_at(file, addr, id = nil)
103
- buf = nil
104
- begin
105
- file.seek(addr)
106
- buf = file.read(LENGTH)
107
- rescue IOError => e
108
- PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
109
- end
110
- if buf.nil? || buf.length != LENGTH
111
- PEROBS.log.fatal "Cannot read blob header " +
112
- "#{id ? "for ID #{id} " : ''}at address " +
113
- "#{addr}"
114
- end
115
- header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
116
187
  if id && header.id != id
117
188
  PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
118
189
  "found. FlatFile has entry with ID #{header.id} at address " +
@@ -123,11 +194,12 @@ module PEROBS
123
194
  end
124
195
 
125
196
  # Write the header to a given File.
126
- # @param file [File]
127
197
  def write
128
198
  begin
199
+ buf = [ @flags, @length, @id, @crc].pack(FORMAT)
200
+ crc = Zlib.crc32(buf, 0)
129
201
  @file.seek(@addr)
130
- @file.write([ @flags, @length, @id, @crc].pack(FORMAT))
202
+ @file.write(buf + [ crc ].pack('L'))
131
203
  rescue IOError => e
132
204
  PEROBS.log.fatal "Cannot write blob header into flat file DB: " +
133
205
  e.message
@@ -135,11 +207,9 @@ module PEROBS
135
207
  end
136
208
 
137
209
  # Reset all the flags bit to 0. This marks the blob as invalid.
138
- # @param file [File] The file handle of the blob file.
139
- # @param addr [Integer] The address of the header
140
210
  def clear_flags
141
211
  @flags = 0
142
- write_flags
212
+ write
143
213
  end
144
214
 
145
215
  # Return true if the header is for a non-empty blob.
@@ -156,7 +226,7 @@ module PEROBS
156
226
  # transaction has been completed.
157
227
  def set_outdated_flag
158
228
  set_flag(OUTDATED_FLAG_BIT)
159
- write_flags
229
+ write
160
230
  end
161
231
 
162
232
  # Return true if the blob contains outdated data.
@@ -166,17 +236,6 @@ module PEROBS
166
236
 
167
237
  private
168
238
 
169
- def write_flags
170
- begin
171
- @file.seek(@addr)
172
- @file.write([ @flags ].pack('C'))
173
- @file.flush
174
- rescue IOError => e
175
- PEROBS.log.fatal "Writing flags of FlatFileBlobHeader with ID #{@id} " +
176
- "failed: #{e.message}"
177
- end
178
- end
179
-
180
239
  def bit_set?(n)
181
240
  mask = 1 << n
182
241
  @flags & mask == mask
@@ -2,7 +2,8 @@
2
2
  #
3
3
  # = FlatFileDB.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
5
+ # Copyright (c) 2015, 2016, 2017, 2018, 2019
6
+ # by Chris Schlaeger <chris@taskjuggler.org>
6
7
  #
7
8
  # MIT License
8
9
  #
@@ -41,7 +42,7 @@ module PEROBS
41
42
 
42
43
  # This version number increases whenever the on-disk format changes in a
43
44
  # way that requires conversion actions after an update.
44
- VERSION = 2
45
+ VERSION = 4
45
46
 
46
47
  attr_reader :max_blob_size
47
48
 
@@ -50,13 +51,17 @@ module PEROBS
50
51
  # @param options [Hash] options to customize the behavior. Currently only
51
52
  # the following options are supported:
52
53
  # :serializer : Can be :marshal, :json, :yaml
54
+ # :progressmeter : Reference to a ProgressMeter object
55
+ # :log : IO that should be used for logging
56
+ # :log_level : Minimum Logger level to log
53
57
  def initialize(db_name, options = {})
54
- super(options[:serializer] || :json)
58
+ super(options)
55
59
 
56
60
  @db_dir = db_name
57
61
  # Create the database directory if it doesn't exist yet.
58
62
  ensure_dir_exists(@db_dir)
59
- PEROBS.log.open(File.join(@db_dir, 'log'))
63
+ PEROBS.log.level = options[:log_level] if options[:log_level]
64
+ PEROBS.log.open(options[:log] || File.join(@db_dir, 'log'))
60
65
  check_version_and_upgrade
61
66
 
62
67
  # Read the existing DB config.
@@ -68,7 +73,7 @@ module PEROBS
68
73
 
69
74
  # Open the FlatFileDB for transactions.
70
75
  def open
71
- @flat_file = FlatFile.new(@db_dir)
76
+ @flat_file = FlatFile.new(@db_dir, @progressmeter)
72
77
  @flat_file.open
73
78
  PEROBS.log.info "FlatFile '#{@db_dir}' opened"
74
79
  end
@@ -143,8 +148,9 @@ module PEROBS
143
148
  end
144
149
  end
145
150
 
146
- def search_object(id)
147
- @flat_file.search_object(id)
151
+ # @return [Integer] Number of objects stored in the DB.
152
+ def item_counter
153
+ @flat_file.item_counter
148
154
  end
149
155
 
150
156
  # This method must be called to initiate the marking process.
@@ -154,9 +160,9 @@ module PEROBS
154
160
 
155
161
  # Permanently delete all objects that have not been marked. Those are
156
162
  # orphaned and are no longer referenced by any actively used object.
157
- # @return [Array] List of IDs that have been removed from the DB.
158
- def delete_unmarked_objects
159
- @flat_file.delete_unmarked_objects
163
+ # @return [Integer] Number of the removed objects from the DB.
164
+ def delete_unmarked_objects(&block)
165
+ @flat_file.delete_unmarked_objects(&block)
160
166
  end
161
167
 
162
168
  # Mark an object.
@@ -178,7 +184,11 @@ module PEROBS
178
184
  # repaired.
179
185
  # @return number of errors found
180
186
  def check_db(repair = false)
181
- @flat_file.check(repair)
187
+ if repair
188
+ @flat_file.repair
189
+ else
190
+ @flat_file.check
191
+ end
182
192
  end
183
193
 
184
194
  # Check if the stored object is syntactically correct.
@@ -226,7 +236,8 @@ module PEROBS
226
236
  "'#{version_file}': " + e.message
227
237
  end
228
238
  else
229
- # Early versions of PEROBS did not have a version file.
239
+ # The DB is brand new.
240
+ version = VERSION
230
241
  write_version_file(version_file)
231
242
  end
232
243
 
@@ -234,25 +245,40 @@ module PEROBS
234
245
  PEROBS.log.fatal "Cannot downgrade the FlatFile database from " +
235
246
  "version #{version} to version #{VERSION}"
236
247
  end
237
-
238
- if version == 1
239
- # Version 1 had no support for data compression. Make sure all entries
240
- # are compressed to save space.
241
- open
242
- @flat_file.refresh
243
- close
248
+ if version < 3
249
+ PEROBS.log.fatal "The upgrade of this version of the PEROBS database " +
250
+ "is not supported by this version of PEROBS. Please try an earlier " +
251
+ "version of PEROBS to upgrade the database before using this version."
244
252
  end
245
253
 
246
- # After a successful upgrade change the version number in the DB as
247
- # well.
248
- if version < VERSION
254
+ # Version upgrades must be done one version number at a time. If the
255
+ # existing DB is multiple versions older than what the current PEROBS
256
+ # version expects, then multiple upgrade runs will be needed.
257
+ while version < VERSION
258
+ if version == 3
259
+ PEROBS.log.warn "Updating FlatFileDB #{@db_dir} from version 3 to " +
260
+ "version 4 ..."
261
+ # Version 4 adds checksums for blob file headers. We have to convert
262
+ # the blob file to include the checksums.
263
+ FlatFile.insert_header_checksums(@db_dir)
264
+ open
265
+ @flat_file.regenerate_index_and_spaces
266
+ close
267
+ end
268
+
269
+ # After a successful upgrade change the version number in the DB as
270
+ # well.
249
271
  write_version_file(version_file)
250
272
  PEROBS.log.warn "Update of FlatFileDB '#{@db_dir}' from version " +
251
- "#{version} to version #{VERSION} completed"
273
+ "#{version} to version #{version + 1} completed"
274
+
275
+ # Update version variable to new version.
276
+ version += 1
252
277
  end
253
278
  end
254
279
 
255
280
  def write_version_file(version_file)
281
+
256
282
  begin
257
283
  RobustFile.write(version_file, VERSION)
258
284
  rescue IOError => e
@@ -0,0 +1,175 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/Object'
30
+
31
+ module PEROBS
32
+
33
+ # The fuzzy string matcher can be used to perform a fuzzy string search
34
+ # against a known set of strings. The dictionary of known strings does not
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
43
+
44
+ # Create a new FuzzyStringMatcher.
45
+ # @param p [PEROBS::Store] place to store the dictionary
46
+ # @param case_sensitive [Boolean] True if case matters for matching
47
+ # @param n [Integer] Determines what kind of n-gramm is used to store the
48
+ # references in the dictionary. It also determines the minimum word
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
53
+ if n < 2 || n > 10
54
+ raise ArgumentError, 'n must be between 2 and 10'
55
+ end
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
58
+
59
+ clear unless @dict
60
+ end
61
+
62
+ # Wipe the dictionary.
63
+ def clear
64
+ self.dict = @store.new(BigHash)
65
+ end
66
+
67
+ # Add a string with its reference to the dictionary.
68
+ # @param string [String] The string to store
69
+ # @param reference [Object] Any object that is associated with the string
70
+ def learn(string, reference = string)
71
+ reference = string if reference.nil?
72
+
73
+ unless @case_sensitive
74
+ string = string.downcase
75
+ end
76
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
77
+ string = "\002" + string + "\003"
78
+
79
+ each_n_gramm(string) do |n_gramm|
80
+ unless (ng_list = @dict[n_gramm])
81
+ @dict[n_gramm] = ng_list = @store.new(Hash)
82
+ end
83
+
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
86
+ end
87
+
88
+ nil
89
+ end
90
+
91
+ # Find the references whose string best matches the given string.
92
+ # @param string [String] string to search for
93
+ # @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
94
+ # the matching should be done. The larger the value, the closer
95
+ # the given string needs to be.
96
+ # @param max_count [Integer] The maximum number of matches that should be
97
+ # returned.
98
+ # @return [Array] The result is an Array of Arrays. The nested Arrays only
99
+ # have 2 entries. The reference and a Float value between 0 and
100
+ # 1.0 that describes how good the match is. The matches are sorted
101
+ # in descending order by the match score.
102
+ def best_matches(string, min_score = 0.5, max_count = 100)
103
+ unless @case_sensitive
104
+ string = string.downcase
105
+ end
106
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
107
+ string = "\002" + string + "\003"
108
+
109
+ matches = {}
110
+
111
+ each_n_gramm(string) do |n_gramm|
112
+ if (ng_list = @dict[n_gramm])
113
+ ng_list.each do |reference, dummy|
114
+ if matches.include?(reference)
115
+ matches[reference] += 1
116
+ else
117
+ matches[reference] = 1
118
+ end
119
+ end
120
+ end
121
+ end
122
+
123
+ return [] if matches.empty?
124
+
125
+ match_list = matches.to_a
126
+
127
+ # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
130
+ match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
131
+
132
+ # Delete all matches that don't have the required minimum match score.
133
+ match_list.delete_if { |a| a[1] < min_score }
134
+
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
142
+ end
143
+
144
+ # Returns some internal stats about the dictionary.
145
+ def stats
146
+ s = {}
147
+ s['dictionary_size'] = @dict.size
148
+ max = total = 0
149
+ @dict.each do |n_gramm, ng_list|
150
+ size = ng_list.length
151
+ max = size if size > max
152
+ total += size
153
+ end
154
+ s['max_list_size'] = max
155
+ s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
156
+
157
+ s
158
+ end
159
+
160
+ private
161
+
162
+ def each_n_gramm(string, &block)
163
+ return if string.length < @n
164
+
165
+ 0.upto(string.length - @n) do |i|
166
+ n_gramm = string[i, @n]
167
+
168
+ yield(n_gramm)
169
+ end
170
+ end
171
+
172
+ end
173
+
174
+ end
175
+