batch_jaro_winkler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+
2
+
3
# Build script for the native extension: mkmf provides create_makefile, which
# writes the Makefile for the 'batch_jaro_winkler/batch_jaro_winkler' target.
require 'mkmf'

create_makefile('batch_jaro_winkler/batch_jaro_winkler')
@@ -0,0 +1,242 @@
1
+
2
+
3
+ require 'ffi'
4
+ require 'batch_jaro_winkler/batch_jaro_winkler'
5
+ require 'batch_jaro_winkler/version'
6
+
7
+ # Memory leak with older MRI versions which you can reproduce with the following program (at least on macOS 10.15.3 19D76):
8
+ # while true do
9
+ # 1000.times do
10
+ # # random 10 characters string
11
+ # str = (0...10).map{ (65 + rand(26)).chr }.join
12
+ # str.encode('utf-32le')
13
+ # end
14
+ # GC.start(full_mark: true, immediate_sweep: true)
15
+ # GC.start
16
+ # end
17
+ # Change utf-32le to utf-32 to watch the memory leak vanish
18
+ # This is the fix that was deployed: https://github.com/ruby/ruby/compare/v2_6_4...v2_6_5#diff-7a2f2c7dfe0bf61d38272aeaf68ac768R2117
19
# Tells whether the given Ruby version string is one for which the utf-32le
# encoding workaround should be used.
#
# Versions considered fixed: 2.5.8 and later 2.5.x, 2.6.5 and later 2.6.x,
# and everything from 2.7 onwards. Every older version is reported as leaking,
# including all 1.x releases (the workaround produces the same result, so being
# conservative there is harmless).
#
# @param version [String] dotted version, e.g. RUBY_VERSION ("2.6.4")
# @return [Boolean] true when the workaround should be used; false when the
#   version is fixed or when it has no major/minor component to decide on
def version_with_memory_leak?(version)
  major, minor, patch = version.split('.')
  # Without at least "major.minor" there is nothing to compare against.
  return false if major.nil? || minor.nil?

  # A missing patch component becomes 0 (nil.to_i).
  major, minor, patch = major.to_i, minor.to_i, patch.to_i
  return false if major > 2 || (major == 2 && minor > 6)
  return patch < 5 if major == 2 && minor == 6
  return patch < 8 if major == 2 && minor == 5

  true
end
33
+
34
# Encodes a string as UTF-32LE without calling String#encode('utf-32le').
#
# The string is first encoded as 'utf-32', the leading 4-byte BOM is dropped
# and the bytes of every 4-byte unit are reversed to obtain little-endian order.
# The argument itself is not modified.
#
# @param str [String] string to convert
# @return [String] a new string tagged with the utf-32le encoding
def encode_utf32_le_without_memory_leak(str)
  with_bom = str.encode('utf-32')
  # Skip the BOM; byteslice returns nil when there are fewer than 4 bytes (empty input).
  payload = with_bom.byteslice(4, with_bom.bytesize) || ''
  # 'N' reads 32-bit big-endian units, 'V' writes them back as little-endian.
  payload.unpack('N*').pack('V*').force_encoding('utf-32le')
end
41
+
42
# FFI bindings to the bjw_* C functions, looked up in the current process
# (the native extension is required before this module is defined).
module BatchJaroWinkler
  extend FFI::Library
  ffi_lib FFI::CURRENT_PROCESS

  # Layout of one element of the results array returned by bjw_jaro_winkler_distance.
  class BjwResult < FFI::Struct
    layout :candidate, :pointer,
           :score, :float,
           :candidate_length, :uint32
  end

  # Owns a native runtime model built from an exportable model.
  class RuntimeModel
    # @return [FFI::Pointer] pointer to the native runtime model
    attr_reader :model

    # @param exportable_model [String] bytes returned by build_exportable_model(_bytes)
    # @raise [RuntimeError] when the native call returns a NULL pointer
    def initialize(exportable_model)
      # We keep a reference because we use the candidates strings in the runtime model and
      # so we must guarantee that the exportable model is not garbage collected before the runtime model.
      @exportable_model = exportable_model
      @model = BatchJaroWinkler.bjw_build_runtime_model(exportable_model)
      # A C NULL is returned as a non-nil FFI::Pointer, so nil? alone cannot detect failure.
      raise 'batch_jaro_winkler.build_runtime_model failed' if @model.nil? || @model.null?
      # Makes a call to bjw_free_runtime_model when the runtime model is GC'd
      @_gced_model = FFI::AutoPointer.new(@model, BatchJaroWinkler.method(:bjw_free_runtime_model))
    end
  end

  attach_function :bjw_build_exportable_model, [:pointer, :uint32, :pointer, :uint32, :pointer, :uint32, :pointer], :pointer
  attach_function :bjw_build_runtime_model, [:buffer_in], :pointer
  attach_function :bjw_free_runtime_model, [:pointer], :void
  attach_function :bjw_jaro_winkler_distance, [:pointer, :buffer_in, :uint32, :float, :float, :float, :uint32, :pointer], :pointer
  # Alias to 'free'
  attach_function :_bjw_free, [:pointer], :void

  # Allocates the C buffers needed to build an exportable model and yields them
  # as a single array: [exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores].
  # c_min_scores is nil when with_min_scores is false.
  # Automatically freed when the block closes
  #
  # @param nb_candidates [Integer] number of slots in each per-candidate buffer
  # @param with_min_scores [Boolean] whether to also allocate the min scores buffer
  def self.allocate_c_data(nb_candidates, with_min_scores)
    FFI::MemoryPointer.new(:uint32, 1, false) do |exportable_model_size|
      FFI::MemoryPointer.new(:pointer, nb_candidates, false) do |c_candidates|
        FFI::MemoryPointer.new(:uint32, nb_candidates, false) do |c_candidates_lengths|
          return yield([exportable_model_size, c_candidates, c_candidates_lengths, nil]) unless with_min_scores
          FFI::MemoryPointer.new(:float, nb_candidates, false) do |c_min_scores|
            yield([exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores])
          end
        end
      end
    end
  end

  # Builds an exportable model and returns it as a Ruby string of raw bytes.
  #
  # candidates must follow one of these formats:
  # - ['hi', 'hello']
  # - [{ candidate: 'hi', min_score: 0.5 }, { candidate: 'hello', min_score: 0.8 }]
  # The format is detected from the first element only.
  #
  # @param char_width [Integer] 0 to let this method encode the candidates as
  #   utf-32le, or 1, 2 or 4 when the candidates are already encoded with
  #   characters of that many bytes
  # @param candidates [Array<String>, Array<Hash>] see the formats above; the
  #   given strings are not modified
  # @param opts [Hash] :nb_runtime_threads (defaults to 1, must be >= 1)
  # @return [String] the exportable model bytes
  # @raise [ArgumentError] on invalid nb_runtime_threads or char_width
  # @raise [RuntimeError] on an out of range min_score or when the native call fails
  def self.build_exportable_model_bytes(char_width, candidates, opts = {})
    current_version_has_memory_leak = version_with_memory_leak?(RUBY_VERSION)
    nb_runtime_threads = opts[:nb_runtime_threads] || 1
    raise ArgumentError, 'nb_runtime_threads must be > 0' if nb_runtime_threads < 1

    candidates_encoded = char_width != 0
    char_width = 4 unless candidates_encoded
    raise ArgumentError, 'char_width must be 1, 2 or 4' unless [1, 2, 4].include?(char_width)

    # float size is platform dependent, so don't rely on it being 4
    float_size = FFI.type_size(:float)
    uint32_size = FFI.type_size(:uint32)
    nb_candidates = candidates.size
    with_min_scores = nb_candidates > 0 &&
                      candidates[0].respond_to?(:each_pair) &&
                      candidates[0].key?(:min_score)

    exportable_model = nil
    allocate_c_data(nb_candidates, with_min_scores) do |(exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores)|
      # Keep in ruby array also to guarantee that encoded strings are not garbage collected.
      _stored_candidates = Array.new(nb_candidates)
      candidates.each_with_index do |cand, i_cand|
        cand_string = cand
        if with_min_scores
          cand_string = cand[:candidate]
          min_score = cand[:min_score]
          if min_score < 0.0 || min_score > 1.0
            raise 'min_score must be >= 0.0 and <= 1.0'
          end
          c_min_scores.put(:float, i_cand * float_size, min_score)
        end
        cand_string =
          if candidates_encoded
            # Work on a copy: force_encoding below must not alter the caller's
            # string (and would raise on a frozen one).
            cand_string.dup
          elsif current_version_has_memory_leak
            encode_utf32_le_without_memory_leak(cand_string)
          else
            cand_string.encode('utf-32le')
          end
        # Treated as single-byte characters so that size counts bytes
        cand_string.force_encoding('ascii')
        cand_length = cand_string.size / char_width
        cand_pointer = FFI::MemoryPointer.from_string(cand_string)
        _stored_candidates[i_cand] = cand_pointer
        c_candidates.put(:pointer, i_cand * FFI::Pointer.size, cand_pointer)
        c_candidates_lengths.put(:uint32, i_cand * uint32_size, cand_length)
      end

      model_pointer = BatchJaroWinkler.bjw_build_exportable_model(c_candidates, char_width, c_candidates_lengths, nb_candidates, c_min_scores, nb_runtime_threads, exportable_model_size)
      # A C NULL is returned as a non-nil FFI::Pointer: leave exportable_model
      # nil so that the failure is reported below.
      next if model_pointer.nil? || model_pointer.null?

      # Will free the raw C exportable model when GC'd
      _gced_exportable_model = FFI::AutoPointer.new(model_pointer, BatchJaroWinkler.method(:_bjw_free))
      exportable_model = model_pointer.read_string(exportable_model_size.get(:uint32, 0))
    end

    raise 'batch_jaro_winkler.build_exportable_model failed' unless exportable_model
    exportable_model
  end

  # Same as build_exportable_model_bytes with char_width 0: the candidates are
  # encoded as utf-32le by this library.
  def self.build_exportable_model(candidates, opts = {})
    BatchJaroWinkler.build_exportable_model_bytes(0, candidates, opts)
  end

  # Wraps the exportable model bytes in a RuntimeModel.
  #
  # @param exportable_model [String] bytes returned by build_exportable_model(_bytes)
  # @return [RuntimeModel]
  def self.build_runtime_model(exportable_model)
    RuntimeModel.new(exportable_model)
  end

  # Runs the native jaro winkler computation of inp against a runtime model.
  #
  # @param char_width [Integer] 0 to let this method encode inp as utf-32le,
  #   or 1, 2 or 4 when inp is already encoded with characters of that many bytes
  # @param runtime_model [RuntimeModel] model built with build_runtime_model
  # @param inp [String] the input string; it is not modified
  # @param opts [Hash] :min_score (0.0..1.0), :weight (0.0..0.25, defaults to 0.1),
  #   :threshold (0.0..1.0, defaults to 0.7), :n_best_results (>= 0, defaults to 0;
  #   an explicit 0 returns [] without calling the native code). nil values are
  #   passed to the native function as -1.0. The hash is not modified.
  # @return [Array] one [candidate, score] pair per result (when the native
  #   conversion helper is available the array is filled by that helper —
  #   presumably with the same shape, TODO confirm against the C extension)
  # @raise [ArgumentError] when an option or char_width is out of range
  # @raise [RuntimeError] when the native call fails
  def self.jaro_winkler_distance_bytes(char_width, runtime_model, inp, opts = {})
    return [] if opts[:n_best_results] == 0
    current_version_has_memory_leak = version_with_memory_leak?(RUBY_VERSION)
    # Work on a private copy: writing defaults and -1.0 placeholders into the
    # caller's hash would make a second call with the same hash fail validation.
    opts = opts.dup
    opts[:weight] = 0.1 unless opts.key?(:weight)
    opts[:threshold] = 0.7 unless opts.key?(:threshold)
    opts[:n_best_results] = 0 unless opts[:n_best_results]

    if !opts[:min_score].nil? && (opts[:min_score] < 0.0 || opts[:min_score] > 1.0)
      raise ArgumentError, 'min_score must be >= 0.0 and <= 1.0'
    end
    if !opts[:weight].nil? && (opts[:weight] < 0.0 || opts[:weight] > 0.25)
      raise ArgumentError, 'weight must be >= 0.0 and <= 0.25'
    end
    if !opts[:threshold].nil? && (opts[:threshold] < 0.0 || opts[:threshold] > 1.0)
      raise ArgumentError, 'threshold must be >= 0.0 and <= 1.0'
    end
    raise ArgumentError, 'n_best_results must be >= 0' if opts[:n_best_results] < 0

    opts[:min_score] = -1.0 if opts[:min_score].nil?
    opts[:weight] = -1.0 if opts[:weight].nil?
    opts[:threshold] = -1.0 if opts[:threshold].nil?

    inp_encoded = char_width != 0
    char_width = 4 unless inp_encoded
    raise ArgumentError, 'char_width must be 1, 2 or 4' unless [1, 2, 4].include?(char_width)

    inp =
      if inp_encoded
        # Copy so that force_encoding below does not alter the caller's string
        # (and does not raise on a frozen one).
        inp.dup
      elsif current_version_has_memory_leak
        encode_utf32_le_without_memory_leak(inp)
      else
        inp.encode('utf-32le')
      end
    # Treated as single-byte characters so that size counts bytes
    inp.force_encoding('ascii')

    c_results = nil
    nb_results = nil
    FFI::MemoryPointer.new(:uint32, 1, false) do |c_nb_results|
      c_results = BatchJaroWinkler.bjw_jaro_winkler_distance(runtime_model.model, inp, inp.size / char_width, opts[:min_score], opts[:weight], opts[:threshold], opts[:n_best_results], c_nb_results)
      nb_results = c_nb_results.get(:uint32, 0)
    end
    # A C NULL is returned as a non-nil FFI::Pointer, so also check null?
    raise 'batch_jaro_winkler.jaro_winkler_distance failed' if c_results.nil? || c_results.null?

    # Will free the raw C results when GC'd
    _gced_results = FFI::AutoPointer.new(c_results, BatchJaroWinkler.method(:_bjw_free))
    c_results_address = c_results.address
    # Typed pointer, so that indexing moves by one BjwResult at a time
    c_results = FFI::Pointer.new(BjwResult, c_results)

    # The conversion helper is not defined in this file; use it only when present.
    if BatchJaroWinkler.respond_to?(:rb_bjw_build_runtime_result)
      res = []
      ok = BatchJaroWinkler.rb_bjw_build_runtime_result([], res, c_results_address, nb_results, inp_encoded, char_width)
      raise 'rb_bjw_build_runtime_result failed' unless ok
      res
    else
      # standard slow ffi version
      Array.new(nb_results) do |i_result|
        result = BjwResult.new(c_results[i_result])
        candidate = result[:candidate].read_string(result[:candidate_length] * char_width)
        unless inp_encoded
          # Candidates were stored as utf-32le by this library: hand back utf-8
          candidate.force_encoding('utf-32le')
          candidate = candidate.encode('utf-8')
        end
        [candidate, result[:score]]
      end
    end
  end

  # Same as jaro_winkler_distance_bytes with char_width 0: inp is encoded as
  # utf-32le by this library.
  def self.jaro_winkler_distance(runtime_model, inp, opts = {})
    BatchJaroWinkler.jaro_winkler_distance_bytes(0, runtime_model, inp, opts)
  end

  # Same as jaro_winkler_distance_bytes with :weight and :threshold forced to
  # nil. The caller's opts hash is left untouched.
  def self.jaro_distance_bytes(char_width, runtime_model, inp, opts = {})
    jaro_opts = opts.merge(weight: nil, threshold: nil)
    BatchJaroWinkler.jaro_winkler_distance_bytes(char_width, runtime_model, inp, jaro_opts)
  end

  # Same as jaro_distance_bytes with char_width 0: inp is encoded as utf-32le
  # by this library.
  def self.jaro_distance(runtime_model, inp, opts = {})
    BatchJaroWinkler.jaro_distance_bytes(0, runtime_model, inp, opts)
  end
end
@@ -0,0 +1,3 @@
1
module BatchJaroWinkler
  # Version of the gem. Frozen so that the constant cannot be modified in place.
  VERSION = '0.0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: batch_jaro_winkler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Dominik Bousquet
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-04-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ffi
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.12'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.12.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.12'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.12.2
33
+ description: This project aims to perform jaro and jaro winkler distance calculations
34
+ as fast as possible. It does that by using an optimized model built in advance of
35
+ the actual runtime calculations. Supports any encoding. Built with maximum performance
36
+ in mind.
37
+ email: bousquet.dominik@gmail.com
38
+ executables: []
39
+ extensions:
40
+ - ext/batch_jaro_winkler/extconf.rb
41
+ extra_rdoc_files: []
42
+ files:
43
+ - ext/batch_jaro_winkler/batch_jaro_winkler.c
44
+ - ext/batch_jaro_winkler/ext/LICENSE.uthash.txt
45
+ - ext/batch_jaro_winkler/ext/batch_jaro_winkler.c
46
+ - ext/batch_jaro_winkler/ext/batch_jaro_winkler.h
47
+ - ext/batch_jaro_winkler/ext/batch_jaro_winkler_internal.h
48
+ - ext/batch_jaro_winkler/ext/batch_jaro_winkler_runtime.h
49
+ - ext/batch_jaro_winkler/ext/uthash.h
50
+ - ext/batch_jaro_winkler/extconf.rb
51
+ - lib/batch_jaro_winkler.rb
52
+ - lib/batch_jaro_winkler/version.rb
53
+ homepage: https://github.com/dbousque/batch_jaro_winkler
54
+ licenses:
55
+ - MIT
56
+ metadata:
57
+ source_code_uri: https://github.com/dbousque
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 3.1.2
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: Fast batch jaro winkler distance implementation in C99.
77
+ test_files: []