perobs 4.0.0 → 4.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +27 -16
  3. data/lib/perobs/Array.rb +66 -19
  4. data/lib/perobs/BTree.rb +106 -15
  5. data/lib/perobs/BTreeBlob.rb +4 -3
  6. data/lib/perobs/BTreeDB.rb +5 -4
  7. data/lib/perobs/BTreeNode.rb +482 -156
  8. data/lib/perobs/BTreeNodeLink.rb +10 -0
  9. data/lib/perobs/BigArray.rb +285 -0
  10. data/lib/perobs/BigArrayNode.rb +1002 -0
  11. data/lib/perobs/BigHash.rb +246 -0
  12. data/lib/perobs/BigTree.rb +197 -0
  13. data/lib/perobs/BigTreeNode.rb +873 -0
  14. data/lib/perobs/Cache.rb +48 -10
  15. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  16. data/lib/perobs/DataBase.rb +4 -3
  17. data/lib/perobs/DynamoDB.rb +57 -15
  18. data/lib/perobs/EquiBlobsFile.rb +155 -50
  19. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  20. data/lib/perobs/FlatFile.rb +519 -227
  21. data/lib/perobs/FlatFileBlobHeader.rb +113 -54
  22. data/lib/perobs/FlatFileDB.rb +49 -23
  23. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  24. data/lib/perobs/Hash.rb +127 -33
  25. data/lib/perobs/IDList.rb +144 -0
  26. data/lib/perobs/IDListPage.rb +107 -0
  27. data/lib/perobs/IDListPageFile.rb +180 -0
  28. data/lib/perobs/IDListPageRecord.rb +142 -0
  29. data/lib/perobs/Object.rb +18 -15
  30. data/lib/perobs/ObjectBase.rb +46 -5
  31. data/lib/perobs/PersistentObjectCache.rb +57 -68
  32. data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
  33. data/lib/perobs/ProgressMeter.rb +97 -0
  34. data/lib/perobs/SpaceManager.rb +273 -0
  35. data/lib/perobs/SpaceTree.rb +21 -12
  36. data/lib/perobs/SpaceTreeNode.rb +53 -61
  37. data/lib/perobs/Store.rb +264 -145
  38. data/lib/perobs/version.rb +1 -1
  39. data/lib/perobs.rb +2 -0
  40. data/perobs.gemspec +4 -4
  41. data/test/Array_spec.rb +15 -6
  42. data/test/BTree_spec.rb +6 -2
  43. data/test/BigArray_spec.rb +261 -0
  44. data/test/BigHash_spec.rb +152 -0
  45. data/test/BigTreeNode_spec.rb +153 -0
  46. data/test/BigTree_spec.rb +259 -0
  47. data/test/EquiBlobsFile_spec.rb +105 -1
  48. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  49. data/test/FlatFileDB_spec.rb +198 -14
  50. data/test/FuzzyStringMatcher_spec.rb +261 -0
  51. data/test/Hash_spec.rb +13 -3
  52. data/test/IDList_spec.rb +77 -0
  53. data/test/LegacyDBs/LegacyDB.rb +155 -0
  54. data/test/LegacyDBs/version_3/class_map.json +1 -0
  55. data/test/LegacyDBs/version_3/config.json +1 -0
  56. data/test/LegacyDBs/version_3/database.blobs +0 -0
  57. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  58. data/test/LegacyDBs/version_3/index.blobs +0 -0
  59. data/test/LegacyDBs/version_3/version +1 -0
  60. data/test/LockFile_spec.rb +9 -6
  61. data/test/SpaceManager_spec.rb +176 -0
  62. data/test/SpaceTree_spec.rb +4 -1
  63. data/test/Store_spec.rb +305 -203
  64. data/test/spec_helper.rb +9 -4
  65. metadata +57 -16
  66. data/lib/perobs/BTreeNodeCache.rb +0 -109
  67. data/lib/perobs/TreeDB.rb +0 -277
@@ -48,12 +48,13 @@ module PEROBS
48
48
  # The 'pack()' format of the header.
49
49
  FORMAT = 'CQQL'
50
50
  # The length of the header in bytes.
51
- LENGTH = 21
51
+ LENGTH = 25
52
52
  VALID_FLAG_BIT = 0
53
53
  COMPRESSED_FLAG_BIT = 2
54
54
  OUTDATED_FLAG_BIT = 3
55
55
 
56
56
  attr_reader :addr, :flags, :length, :id, :crc
57
+ attr_accessor :corruption_start
57
58
 
58
59
  # Create a new FlatFileBlobHeader with the given flags, length, id and crc.
59
60
  # @param file [File] the FlatFile that contains the header
@@ -69,50 +70,120 @@ module PEROBS
69
70
  @length = length
70
71
  @id = id
71
72
  @crc = crc
73
+ # This is only set if the header is preceded by a corrupted blob.
74
+ @corruption_start = nil
72
75
  end
73
76
 
74
77
  # Read the header from the given File.
75
78
  # @param file [File]
76
- # @return FlatFileBlobHeader
77
- def FlatFileBlobHeader::read(file)
78
- begin
79
- addr = file.pos
80
- buf = file.read(LENGTH)
81
- rescue IOError => e
82
- PEROBS.log.error "Cannot read blob header in flat file DB: #{e.message}"
83
- return nil
79
+ # @param addr [Integer] address in the file to start reading. If no
80
+ # address is specified use the current position in the file.
81
+ # @param id [Integer] Optional ID that the header should have. If no id is
82
+ # specified there is no check against the actual ID done.
83
+ # @return FlatFileBlobHeader or nil if there are no more blobs to read in
84
+ # the file.
85
+ def FlatFileBlobHeader::read(file, addr = nil, id = nil)
86
+ # If an address was specified we expect the read to always succeed. If
87
+ # no address is specified and we can't read the header we generate an
88
+ # error message but it is not fatal.
89
+ errors_are_fatal = !addr.nil?
90
+
91
+ mode = :searching_next_header
92
+ addr = file.pos unless addr
93
+ buf = nil
94
+ corruption_start = nil
95
+
96
+ loop do
97
+ buf_with_crc = nil
98
+ begin
99
+ file.seek(addr)
100
+ buf_with_crc = file.read(LENGTH)
101
+ rescue IOError => e
102
+ if errors_are_fatal
103
+ PEROBS.log.fatal "Cannot read blob header in flat file DB at " +
104
+ "address #{addr}: #{e.message}"
105
+ else
106
+ PEROBS.log.error "Cannot read blob header in flat file DB: " +
107
+ e.message
108
+ return nil
109
+ end
110
+ end
111
+
112
+ # Did we read anything?
113
+ if buf_with_crc.nil?
114
+ if errors_are_fatal
115
+ PEROBS.log.fatal "Cannot read blob header " +
116
+ "#{id ? "for ID #{id} " : ''}at address #{addr}"
117
+ else
118
+ if corruption_start
119
+ PEROBS.log.error "Corruption found at end of blob file at " +
120
+ "address #{addr}"
121
+ end
122
+ # We have reached the end of the file.
123
+ return nil
124
+ end
125
+ end
126
+
127
+ # Did we get the full header?
128
+ if buf_with_crc.length != LENGTH
129
+ msg = "Incomplete FlatFileBlobHeader: Only " +
130
+ "#{buf_with_crc.length} " +
131
+ "bytes of #{LENGTH} could be read "
132
+ "#{id ? "for ID #{id} " : ''}at address #{addr}"
133
+ if errors_are_fatal
134
+ PEROBS.log.fatal msg
135
+ else
136
+ PEROBS.log.error msg
137
+ end
138
+ return nil
139
+ end
140
+
141
+ # Check the CRC of the header
142
+ buf = buf_with_crc[0..-5]
143
+ crc = buf_with_crc[-4..-1].unpack('L')[0]
144
+
145
+ if (read_crc = Zlib.crc32(buf, 0)) == crc
146
+ # We have found a valid header.
147
+ if corruption_start
148
+ PEROBS.log.error "FlatFile corruption ends at #{addr}. " +
149
+ "#{addr - corruption_start} bytes skipped. Some data may " +
150
+ "not be recoverable."
151
+ end
152
+ break
153
+ else
154
+ if errors_are_fatal
155
+ PEROBS.log.fatal "FlatFile Header CRC mismatch at address " +
156
+ "#{addr}. Header CRC is #{'%08x' % read_crc} but should be " +
157
+ "#{'%08x' % crc}."
158
+ else
159
+ if corruption_start.nil?
160
+ if errors_are_fatal
161
+ PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
162
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
163
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}."
164
+ else
165
+ PEROBS.log.error "FlatFile corruption found. The FlatFile " +
166
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
167
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
168
+ "Trying to find the next header."
169
+ end
170
+ corruption_start = addr
171
+ end
172
+ # The blob file is corrupted. There is no valid header at the
173
+ # current position in the file. We now try to find the next valid
174
+ # header by iterating over the remainder of the file advancing one
175
+ # byte with each step until we hit the end of the file or find the
176
+ # next valid header.
177
+ addr += 1
178
+ end
179
+ end
84
180
  end
85
181
 
86
- return nil unless buf
87
-
88
- if buf.length != LENGTH
89
- PEROBS.log.error "Incomplete FlatFileBlobHeader: Only #{buf.length} " +
90
- "bytes of #{LENGTH} could be read"
91
- return nil
182
+ header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
183
+ if corruption_start
184
+ header.corruption_start = corruption_start
92
185
  end
93
186
 
94
- FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
95
- end
96
-
97
- # Read the header from the given File.
98
- # @param file [File]
99
- # @param addr [Integer] address in the file to start reading
100
- # @param id [Integer] Optional ID that the header should have
101
- # @return FlatFileBlobHeader
102
- def FlatFileBlobHeader::read_at(file, addr, id = nil)
103
- buf = nil
104
- begin
105
- file.seek(addr)
106
- buf = file.read(LENGTH)
107
- rescue IOError => e
108
- PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
109
- end
110
- if buf.nil? || buf.length != LENGTH
111
- PEROBS.log.fatal "Cannot read blob header " +
112
- "#{id ? "for ID #{id} " : ''}at address " +
113
- "#{addr}"
114
- end
115
- header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
116
187
  if id && header.id != id
117
188
  PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
118
189
  "found. FlatFile has entry with ID #{header.id} at address " +
@@ -123,11 +194,12 @@ module PEROBS
123
194
  end
124
195
 
125
196
  # Write the header to a given File.
126
- # @param file [File]
127
197
  def write
128
198
  begin
199
+ buf = [ @flags, @length, @id, @crc].pack(FORMAT)
200
+ crc = Zlib.crc32(buf, 0)
129
201
  @file.seek(@addr)
130
- @file.write([ @flags, @length, @id, @crc].pack(FORMAT))
202
+ @file.write(buf + [ crc ].pack('L'))
131
203
  rescue IOError => e
132
204
  PEROBS.log.fatal "Cannot write blob header into flat file DB: " +
133
205
  e.message
@@ -135,11 +207,9 @@ module PEROBS
135
207
  end
136
208
 
137
209
  # Reset all flag bits to 0. This marks the blob as invalid.
138
- # @param file [File] The file handle of the blob file.
139
- # @param addr [Integer] The address of the header
140
210
  def clear_flags
141
211
  @flags = 0
142
- write_flags
212
+ write
143
213
  end
144
214
 
145
215
  # Return true if the header is for a non-empty blob.
@@ -156,7 +226,7 @@ module PEROBS
156
226
  # transaction has been completed.
157
227
  def set_outdated_flag
158
228
  set_flag(OUTDATED_FLAG_BIT)
159
- write_flags
229
+ write
160
230
  end
161
231
 
162
232
  # Return true if the blob contains outdated data.
@@ -166,17 +236,6 @@ module PEROBS
166
236
 
167
237
  private
168
238
 
169
- def write_flags
170
- begin
171
- @file.seek(@addr)
172
- @file.write([ @flags ].pack('C'))
173
- @file.flush
174
- rescue IOError => e
175
- PEROBS.log.fatal "Writing flags of FlatFileBlobHeader with ID #{@id} " +
176
- "failed: #{e.message}"
177
- end
178
- end
179
-
180
239
  def bit_set?(n)
181
240
  mask = 1 << n
182
241
  @flags & mask == mask
@@ -2,7 +2,8 @@
2
2
  #
3
3
  # = FlatFileDB.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
5
+ # Copyright (c) 2015, 2016, 2017, 2018, 2019
6
+ # by Chris Schlaeger <chris@taskjuggler.org>
6
7
  #
7
8
  # MIT License
8
9
  #
@@ -41,7 +42,7 @@ module PEROBS
41
42
 
42
43
  # This version number increases whenever the on-disk format changes in a
43
44
  # way that requires conversion actions after an update.
44
- VERSION = 2
45
+ VERSION = 4
45
46
 
46
47
  attr_reader :max_blob_size
47
48
 
@@ -50,13 +51,17 @@ module PEROBS
50
51
  # @param options [Hash] options to customize the behavior. Currently only
51
52
  # the following options are supported:
52
53
  # :serializer : Can be :marshal, :json, :yaml
54
+ # :progressmeter : Reference to a ProgressMeter object
55
+ # :log : IO that should be used for logging
56
+ # :log_level : Minimum Logger level to log
53
57
  def initialize(db_name, options = {})
54
- super(options[:serializer] || :json)
58
+ super(options)
55
59
 
56
60
  @db_dir = db_name
57
61
  # Create the database directory if it doesn't exist yet.
58
62
  ensure_dir_exists(@db_dir)
59
- PEROBS.log.open(File.join(@db_dir, 'log'))
63
+ PEROBS.log.level = options[:log_level] if options[:log_level]
64
+ PEROBS.log.open(options[:log] || File.join(@db_dir, 'log'))
60
65
  check_version_and_upgrade
61
66
 
62
67
  # Read the existing DB config.
@@ -68,7 +73,7 @@ module PEROBS
68
73
 
69
74
  # Open the FlatFileDB for transactions.
70
75
  def open
71
- @flat_file = FlatFile.new(@db_dir)
76
+ @flat_file = FlatFile.new(@db_dir, @progressmeter)
72
77
  @flat_file.open
73
78
  PEROBS.log.info "FlatFile '#{@db_dir}' opened"
74
79
  end
@@ -143,8 +148,9 @@ module PEROBS
143
148
  end
144
149
  end
145
150
 
146
- def search_object(id)
147
- @flat_file.search_object(id)
151
+ # @return [Integer] Number of objects stored in the DB.
152
+ def item_counter
153
+ @flat_file.item_counter
148
154
  end
149
155
 
150
156
  # This method must be called to initiate the marking process.
@@ -154,9 +160,9 @@ module PEROBS
154
160
 
155
161
  # Permanently delete all objects that have not been marked. Those are
156
162
  # orphaned and are no longer referenced by any actively used object.
157
- # @return [Array] List of IDs that have been removed from the DB.
158
- def delete_unmarked_objects
159
- @flat_file.delete_unmarked_objects
163
+ # @return [Integer] Number of objects removed from the DB.
164
+ def delete_unmarked_objects(&block)
165
+ @flat_file.delete_unmarked_objects(&block)
160
166
  end
161
167
 
162
168
  # Mark an object.
@@ -178,7 +184,11 @@ module PEROBS
178
184
  # repaired.
179
185
  # @return number of errors found
180
186
  def check_db(repair = false)
181
- @flat_file.check(repair)
187
+ if repair
188
+ @flat_file.repair
189
+ else
190
+ @flat_file.check
191
+ end
182
192
  end
183
193
 
184
194
  # Check if the stored object is syntactically correct.
@@ -226,7 +236,8 @@ module PEROBS
226
236
  "'#{version_file}': " + e.message
227
237
  end
228
238
  else
229
- # Early versions of PEROBS did not have a version file.
239
+ # The DB is brand new.
240
+ version = VERSION
230
241
  write_version_file(version_file)
231
242
  end
232
243
 
@@ -234,25 +245,40 @@ module PEROBS
234
245
  PEROBS.log.fatal "Cannot downgrade the FlatFile database from " +
235
246
  "version #{version} to version #{VERSION}"
236
247
  end
237
-
238
- if version == 1
239
- # Version 1 had no support for data compression. Make sure all entries
240
- # are compressed to save space.
241
- open
242
- @flat_file.refresh
243
- close
248
+ if version < 3
249
+ PEROBS.log.fatal "The upgrade of this version of the PEROBS database " +
250
+ "is not supported by this version of PEROBS. Please try an earlier " +
251
+ "version of PEROBS to upgrade the database before using this version."
244
252
  end
245
253
 
246
- # After a successful upgrade change the version number in the DB as
247
- # well.
248
- if version < VERSION
254
+ # Version upgrades must be done one version number at a time. If the
255
+ # existing DB is multiple versions older than what the current PEROBS
256
+ # version expects, then multiple upgrade runs will be needed.
257
+ while version < VERSION
258
+ if version == 3
259
+ PEROBS.log.warn "Updating FlatFileDB #{@db_dir} from version 3 to " +
260
+ "version 4 ..."
261
+ # Version 4 adds checksums for blob file headers. We have to convert
262
+ # the blob file to include the checksums.
263
+ FlatFile.insert_header_checksums(@db_dir)
264
+ open
265
+ @flat_file.regenerate_index_and_spaces
266
+ close
267
+ end
268
+
269
+ # After a successful upgrade change the version number in the DB as
270
+ # well.
249
271
  write_version_file(version_file)
250
272
  PEROBS.log.warn "Update of FlatFileDB '#{@db_dir}' from version " +
251
- "#{version} to version #{VERSION} completed"
273
+ "#{version} to version #{version + 1} completed"
274
+
275
+ # Update version variable to new version.
276
+ version += 1
252
277
  end
253
278
  end
254
279
 
255
280
  def write_version_file(version_file)
281
+
256
282
  begin
257
283
  RobustFile.write(version_file, VERSION)
258
284
  rescue IOError => e
@@ -0,0 +1,175 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/Object'
30
+
31
+ module PEROBS
32
+
33
+ # The fuzzy string matcher can be used to perform a fuzzy string search
34
+ # against a known set of strings. The dictionary of known strings does not
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
43
+
44
+ # Create a new FuzzyStringMatcher.
45
+ # @param p [PEROBS::Store] place to store the dictionary
46
+ # @param case_sensitive [Boolean] True if case matters for matching
47
+ # @param n [Integer] Determines what kind of n-gram is used to store the
48
+ # references in the dictionary. It also determines the minimum word
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
53
+ if n < 2 || n > 10
54
+ raise ArgumentError, 'n must be between 2 and 10'
55
+ end
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
58
+
59
+ clear unless @dict
60
+ end
61
+
62
+ # Wipe the dictionary.
63
+ def clear
64
+ self.dict = @store.new(BigHash)
65
+ end
66
+
67
+ # Add a string with its reference to the dictionary.
68
+ # @param string [String] The string to store
69
+ # @param reference [Object] Any object that is associated with the string
70
+ def learn(string, reference = string)
71
+ reference = string if reference.nil?
72
+
73
+ unless @case_sensitive
74
+ string = string.downcase
75
+ end
76
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
77
+ string = "\002" + string + "\003"
78
+
79
+ each_n_gramm(string) do |n_gramm|
80
+ unless (ng_list = @dict[n_gramm])
81
+ @dict[n_gramm] = ng_list = @store.new(Hash)
82
+ end
83
+
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
86
+ end
87
+
88
+ nil
89
+ end
90
+
91
+ # Find the references whose string best matches the given string.
92
+ # @param string [String] string to search for
93
+ # @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
94
+ # the matching should be. The larger the value, the closer
95
+ # the given string needs to be.
96
+ # @param max_count [Integer] The maximum number of matches that should be
97
+ # returned.
98
+ # @return [Array] The result is an Array of Arrays. The nested Arrays only
99
+ # have 2 entries. The reference and a Float value between 0 and
100
+ # 1.0 that describes how good the match is. The matches are sorted
101
+ # in descending order by the match score.
102
+ def best_matches(string, min_score = 0.5, max_count = 100)
103
+ unless @case_sensitive
104
+ string = string.downcase
105
+ end
106
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
107
+ string = "\002" + string + "\003"
108
+
109
+ matches = {}
110
+
111
+ each_n_gramm(string) do |n_gramm|
112
+ if (ng_list = @dict[n_gramm])
113
+ ng_list.each do |reference, dummy|
114
+ if matches.include?(reference)
115
+ matches[reference] += 1
116
+ else
117
+ matches[reference] = 1
118
+ end
119
+ end
120
+ end
121
+ end
122
+
123
+ return [] if matches.empty?
124
+
125
+ match_list = matches.to_a
126
+
127
+ # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
130
+ match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
131
+
132
+ # Delete all matches that don't have the required minimum match score.
133
+ match_list.delete_if { |a| a[1] < min_score }
134
+
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
142
+ end
143
+
144
+ # Returns some internal stats about the dictionary.
145
+ def stats
146
+ s = {}
147
+ s['dictionary_size'] = @dict.size
148
+ max = total = 0
149
+ @dict.each do |n_gramm, ng_list|
150
+ size = ng_list.length
151
+ max = size if size > max
152
+ total += size
153
+ end
154
+ s['max_list_size'] = max
155
+ s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
156
+
157
+ s
158
+ end
159
+
160
+ private
161
+
162
+ def each_n_gramm(string, &block)
163
+ return if string.length < @n
164
+
165
+ 0.upto(string.length - @n) do |i|
166
+ n_gramm = string[i, @n]
167
+
168
+ yield(n_gramm)
169
+ end
170
+ end
171
+
172
+ end
173
+
174
+ end
175
+