batch_jaro_winkler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/batch_jaro_winkler/batch_jaro_winkler.c +104 -0
- data/ext/batch_jaro_winkler/ext/LICENSE.uthash.txt +20 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.c +890 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.h +50 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_internal.h +98 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_runtime.h +578 -0
- data/ext/batch_jaro_winkler/ext/uthash.h +1230 -0
- data/ext/batch_jaro_winkler/extconf.rb +5 -0
- data/lib/batch_jaro_winkler.rb +242 -0
- data/lib/batch_jaro_winkler/version.rb +3 -0
- metadata +77 -0
@@ -0,0 +1,242 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'ffi'
|
4
|
+
require 'batch_jaro_winkler/batch_jaro_winkler'
|
5
|
+
require 'batch_jaro_winkler/version'
|
6
|
+
|
7
|
+
# Memory leak with older MRI versions which you can reproduce with the following program (at least on macOS 10.15.3 19D76):
|
8
|
+
# while true do
|
9
|
+
# 1000.times do
|
10
|
+
# # random 10 characters string
|
11
|
+
# str = (0...10).map{ (65 + rand(26)).chr }.join
|
12
|
+
# str.encode('utf-32le')
|
13
|
+
# end
|
14
|
+
# GC.start(full_mark: true, immediate_sweep: true)
|
15
|
+
# GC.start
|
16
|
+
# end
|
17
|
+
# Change utf-32le to utf-32 to watch the memory leak vanish
|
18
|
+
# This is the fix that was deployed: https://github.com/ruby/ruby/compare/v2_6_4...v2_6_5#diff-7a2f2c7dfe0bf61d38272aeaf68ac768R2117
|
19
|
+
# Tells whether the given MRI version string predates the fix for the
# `String#encode('utf-32le')` memory leak described above.
#
# The fix shipped in 2.5.8 (2.5 series) and 2.6.5 (2.6 series); every release
# that sorts before those is treated as affected, and everything from 2.7
# onwards as fixed. Versions are compared component by component
# (major, then minor, then patch) so that releases such as 1.9.3 are not
# mistaken for fixed ones merely because their minor number is greater than 6.
#
# @param version [String] dotted version string such as RUBY_VERSION ("2.6.4")
# @return [Boolean] true when the workaround encoder should be used; false
#   when the version is fixed or has no major/minor component to compare
def version_with_memory_leak?(version)
  major, minor, patch = version.split('.')
  # Without both a major and a minor component there is nothing to compare.
  return false if major.nil? || minor.nil?

  # A missing patch component is nil, and nil.to_i is 0.
  major, minor, patch = major.to_i, minor.to_i, patch.to_i
  return true if major < 2
  return false if major > 2

  case minor
  when 5 then patch < 8
  when 6 then patch < 5
  else minor < 5
  end
end
|
33
|
+
|
34
|
+
# Encodes +str+ as UTF-32LE without calling `encode('utf-32le')`, which leaks
# memory on the MRI versions flagged by version_with_memory_leak?.
#
# The string is transcoded to 'utf-32' instead, whose output is a 4-byte BOM
# followed by big-endian 32-bit code units. The BOM is dropped and every code
# unit is re-packed in little-endian byte order.
#
# @param str [String] text in any encoding Ruby can transcode to UTF-32
# @return [String] a new string tagged UTF-32LE, without BOM
def encode_utf32_le_without_memory_leak(str)
  # 'N*' reads the whole buffer as big-endian 32-bit unsigned integers.
  code_units = str.encode('utf-32').unpack('N*')
  # The first unit (if any) is the BOM; 'V*' writes the rest little-endian.
  code_units.drop(1).pack('V*').force_encoding('utf-32le')
end
|
41
|
+
|
42
|
+
# Ruby front-end for the batch_jaro_winkler C library.
#
# The C functions are looked up in the current process (the native extension
# is loaded by the `require` at the top of the file) and bound through FFI.
# Typical flow: build an exportable model from a list of candidates, turn it
# into a runtime model, then query the runtime model with input strings.
module BatchJaroWinkler
  extend FFI::Library
  ffi_lib FFI::CURRENT_PROCESS

  # Layout of one result record in the buffer returned by
  # bjw_jaro_winkler_distance.
  class BjwResult < FFI::Struct
    layout :candidate, :pointer,
           :score, :float,
           :candidate_length, :uint32
  end

  # Owns a native runtime model built from an exportable model.
  class RuntimeModel
    # @return [FFI::Pointer] raw pointer to the native runtime model
    attr_reader :model

    # @param exportable_model [String] bytes returned by build_exportable_model(_bytes)
    # @raise [RuntimeError] when the native call returns NULL
    def initialize(exportable_model)
      # We keep a reference because we use the candidates strings in the runtime model and
      # so we must guarantee that the exportable model is not garbage collected before the runtime model.
      @exportable_model = exportable_model
      @model = BatchJaroWinkler.bjw_build_runtime_model(exportable_model)
      # FFI hands back a Pointer object even for NULL, so a nil check alone never fires.
      raise 'batch_jaro_winkler.build_runtime_model failed' if @model.nil? || @model.null?
      # Makes a call to bjw_free_runtime_model when the runtime model is GC'd
      @_gced_model = FFI::AutoPointer.new(@model, BatchJaroWinkler.method(:bjw_free_runtime_model))
    end
  end

  # (candidates, char_width, candidates_lengths, nb_candidates, min_scores, nb_runtime_threads, out_model_size)
  attach_function :bjw_build_exportable_model, [:pointer, :uint32, :pointer, :uint32, :pointer, :uint32, :pointer], :pointer
  attach_function :bjw_build_runtime_model, [:buffer_in], :pointer
  attach_function :bjw_free_runtime_model, [:pointer], :void
  # (runtime_model, input, input_length, min_score, weight, threshold, n_best_results, out_nb_results)
  attach_function :bjw_jaro_winkler_distance, [:pointer, :buffer_in, :uint32, :float, :float, :float, :uint32, :pointer], :pointer
  # Alias to 'free'
  attach_function :_bjw_free, [:pointer], :void

  # Allocates the scratch buffers passed to bjw_build_exportable_model and
  # yields them as a single array:
  # [exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores].
  # c_min_scores is nil when with_min_scores is false.
  # The buffers are automatically freed when the block closes.
  #
  # @param nb_candidates [Integer] number of slots in each per-candidate buffer
  # @param with_min_scores [Boolean] whether to allocate the min-score buffer
  def self.allocate_c_data(nb_candidates, with_min_scores)
    FFI::MemoryPointer.new(:uint32, 1, false) do |exportable_model_size|
      FFI::MemoryPointer.new(:pointer, nb_candidates, false) do |c_candidates|
        FFI::MemoryPointer.new(:uint32, nb_candidates, false) do |c_candidates_lengths|
          return yield([exportable_model_size, c_candidates, c_candidates_lengths, nil]) unless with_min_scores
          FFI::MemoryPointer.new(:float, nb_candidates, false) do |c_min_scores|
            yield([exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores])
          end
        end
      end
    end
  end

  # Builds the serialized ("exportable") model for a list of candidates.
  #
  # candidates must follow one of these formats:
  # - ['hi', 'hello']
  # - [{ candidate: 'hi', min_score: 0.5 }, { candidate: 'hello', min_score: 0.8 }]
  # The format is detected from the first element only.
  #
  # The caller's candidate strings are never modified: pre-encoded strings are
  # copied before being retagged as raw bytes, so frozen strings are accepted.
  #
  # @param char_width [Integer] 0 when candidates are regular strings (they are
  #   then encoded to UTF-32LE and a width of 4 is used); otherwise 1, 2 or 4,
  #   the number of bytes per character of the already-encoded candidates
  # @param candidates [Array<String>, Array<Hash>] see formats above
  # @param opts [Hash] :nb_runtime_threads (Integer, default 1)
  # @return [String] the exportable model bytes
  # @raise [ArgumentError] on invalid nb_runtime_threads or char_width
  # @raise [RuntimeError] on an out-of-range min_score or when the native call fails
  def self.build_exportable_model_bytes(char_width, candidates, opts = {})
    current_version_has_memory_leak = version_with_memory_leak?(RUBY_VERSION)
    nb_runtime_threads = opts[:nb_runtime_threads] || 1
    if nb_runtime_threads < 1
      raise ArgumentError.new('nb_runtime_threads must be > 0')
    end
    candidates_encoded = char_width != 0
    char_width = 4 unless candidates_encoded
    if char_width != 1 && char_width != 2 && char_width != 4
      raise ArgumentError.new('char_width must be 1, 2 or 4')
    end
    # float size is platform dependent, so don't rely on it being 4
    float_size = 0
    FFI::MemoryPointer.new(:float, 1, false) do |one_float|
      float_size = one_float.size
    end
    nb_candidates = candidates.size
    with_min_scores = false
    if nb_candidates > 0 && candidates[0].respond_to?(:each_pair) && candidates[0].key?(:min_score)
      with_min_scores = true
    end

    exportable_model = nil
    allocate_c_data(nb_candidates, with_min_scores) do |(exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores)|
      # Keep in ruby array also to guarantee that encoded strings are not garbage collected.
      _stored_candidates = Array.new(nb_candidates)
      candidates.each_with_index do |cand, i_cand|
        cand_string = cand
        if with_min_scores
          cand_string = cand[:candidate]
          if cand[:min_score] < 0.0 or cand[:min_score] > 1.0
            raise 'min_score must be >= 0.0 and <= 1.0'
          end
          c_min_scores.put(:float, i_cand * float_size, cand[:min_score])
        end
        if candidates_encoded
          # Work on a copy: force_encoding below must not retag the caller's string.
          cand_string = cand_string.dup
        else
          cand_string = current_version_has_memory_leak ? encode_utf32_le_without_memory_leak(cand_string) : cand_string.encode('utf-32le')
        end
        # Tagged as single-byte so that size counts bytes.
        cand_string.force_encoding('ascii')
        cand_length = cand_string.size / char_width
        cand_string = FFI::MemoryPointer.from_string(cand_string)
        _stored_candidates[i_cand] = cand_string
        c_candidates.put(:pointer, i_cand * FFI::Pointer.size, cand_string)
        # sizeof(uint32_t) = 4
        c_candidates_lengths.put(:uint32, i_cand * 4, cand_length)
      end

      model_pointer = BatchJaroWinkler.bjw_build_exportable_model(c_candidates, char_width, c_candidates_lengths, nb_candidates, c_min_scores, nb_runtime_threads, exportable_model_size)
      # FFI returns a Pointer object even for NULL, so test null? explicitly.
      next if model_pointer.nil? || model_pointer.null?

      # Will free the raw C exportable model when GC'd
      _gced_exportable_model = FFI::AutoPointer.new(model_pointer, BatchJaroWinkler.method(:_bjw_free))
      # Copy the native buffer into a Ruby string of the reported size.
      exportable_model = model_pointer.read_string(exportable_model_size.get(:uint32, 0))
    end

    raise 'batch_jaro_winkler.build_exportable_model failed' unless exportable_model
    exportable_model
  end

  # Same as build_exportable_model_bytes for regular (not pre-encoded) strings.
  #
  # @param candidates [Array<String>, Array<Hash>]
  # @param opts [Hash] see build_exportable_model_bytes
  # @return [String] the exportable model bytes
  def self.build_exportable_model(candidates, opts = {})
    BatchJaroWinkler.build_exportable_model_bytes(0, candidates, opts)
  end

  # Wraps an exportable model into a RuntimeModel usable for queries.
  #
  # @param exportable_model [String] bytes returned by build_exportable_model(_bytes)
  # @return [RuntimeModel]
  def self.build_runtime_model(exportable_model)
    RuntimeModel.new(exportable_model)
  end

  # Scores +inp+ against the candidates of +runtime_model+.
  #
  # Neither +opts+ nor +inp+ is modified: defaults and the sentinel values
  # handed to the C function are written into a private copy of the hash, so
  # the same options hash can safely be reused across calls, and a pre-encoded
  # input string is copied before being retagged as raw bytes.
  #
  # @param char_width [Integer] 0 when +inp+ is a regular string (it is then
  #   encoded to UTF-32LE and a width of 4 is used); otherwise 1, 2 or 4
  # @param runtime_model [RuntimeModel]
  # @param inp [String] the input to compare with the candidates
  # @param opts [Hash] :min_score (0.0..1.0), :weight (0.0..0.25, default 0.1),
  #   :threshold (0.0..1.0, default 0.7), :n_best_results (>= 0). A nil
  #   min_score/weight/threshold is sent to the C function as -1.0; an explicit
  #   n_best_results of 0 returns [] without calling it.
  # @return [Array] results; on the pure-FFI path each entry is
  #   [candidate, score], with candidate converted back to UTF-8 when
  #   char_width was 0. The native conversion path fills the array in C
  #   (presumably with the same shape - not visible from this file).
  # @raise [ArgumentError] on out-of-range options or invalid char_width
  # @raise [RuntimeError] when a native call fails
  def self.jaro_winkler_distance_bytes(char_width, runtime_model, inp, opts = {})
    return [] if opts[:n_best_results] == 0
    current_version_has_memory_leak = version_with_memory_leak?(RUBY_VERSION)
    # Private copy: everything below writes defaults/sentinels into the hash.
    opts = opts.dup
    opts[:weight] = 0.1 unless opts.key?(:weight)
    opts[:threshold] = 0.7 unless opts.key?(:threshold)
    opts[:n_best_results] = 0 unless opts[:n_best_results]

    if !(opts[:min_score].nil?) && (opts[:min_score] < 0.0 || opts[:min_score] > 1.0)
      raise ArgumentError.new('min_score must be >= 0.0 and <= 1.0')
    end
    if !(opts[:weight].nil?) && (opts[:weight] < 0.0 || opts[:weight] > 0.25)
      raise ArgumentError.new('weight must be >= 0.0 and <= 0.25')
    end
    if !(opts[:threshold].nil?) && (opts[:threshold] < 0.0 || opts[:threshold] > 1.0)
      raise ArgumentError.new('threshold must be >= 0.0 and <= 1.0')
    end
    if opts[:n_best_results] < 0
      raise ArgumentError.new('n_best_results must be >= 0')
    end
    # nil options are passed to the C function as -1.0.
    opts[:min_score] = -1.0 if opts[:min_score].nil?
    opts[:weight] = -1.0 if opts[:weight].nil?
    opts[:threshold] = -1.0 if opts[:threshold].nil?

    inp_encoded = char_width != 0
    char_width = 4 unless inp_encoded
    if char_width != 1 && char_width != 2 && char_width != 4
      raise ArgumentError.new('char_width must be 1, 2 or 4')
    end

    if inp_encoded
      # Work on a copy: force_encoding below must not retag the caller's string.
      inp = inp.dup
    else
      inp = current_version_has_memory_leak ? encode_utf32_le_without_memory_leak(inp) : inp.encode('utf-32le')
    end
    # Tagged as single-byte so that size counts bytes.
    inp.force_encoding('ascii')
    c_results = nil
    nb_results = nil
    FFI::MemoryPointer.new(:uint32, 1, false) do |c_nb_results|
      c_results = BatchJaroWinkler.bjw_jaro_winkler_distance(runtime_model.model, inp, inp.size / char_width, opts[:min_score], opts[:weight], opts[:threshold], opts[:n_best_results], c_nb_results)
      nb_results = c_nb_results.get(:uint32, 0)
    end
    # FFI returns a Pointer object even for NULL, so test null? explicitly.
    raise 'batch_jaro_winkler.jaro_winkler_distance failed' if c_results.nil? || c_results.null?

    # Will free the raw C results when GC'd
    _gced_results = FFI::AutoPointer.new(c_results, BatchJaroWinkler.method(:_bjw_free))
    c_results_address = c_results.address
    # Re-type the pointer so that indexing steps by one BjwResult at a time.
    c_results = FFI::Pointer.new(BjwResult, c_results)

    # The converter is only used when the module exposes it.
    if BatchJaroWinkler.respond_to?(:rb_bjw_build_runtime_result)
      res = []
      ok = BatchJaroWinkler.rb_bjw_build_runtime_result([], res, c_results_address, nb_results, inp_encoded, char_width)
      raise 'rb_bjw_build_runtime_result failed' unless ok
      res
    else
      # standard slow ffi version
      Array.new(nb_results) do |i_result|
        res = BjwResult.new(c_results[i_result])
        candidate = res[:candidate].read_string(res[:candidate_length] * char_width)
        unless inp_encoded
          candidate.force_encoding('utf-32le')
          candidate = candidate.encode('utf-8')
        end
        [candidate, res[:score]]
      end
    end
  end

  # Same as jaro_winkler_distance_bytes for a regular (not pre-encoded) input.
  #
  # @param runtime_model [RuntimeModel]
  # @param inp [String]
  # @param opts [Hash] see jaro_winkler_distance_bytes
  # @return [Array] see jaro_winkler_distance_bytes
  def self.jaro_winkler_distance(runtime_model, inp, opts = {})
    BatchJaroWinkler.jaro_winkler_distance_bytes(0, runtime_model, inp, opts)
  end

  # Same as jaro_winkler_distance_bytes with :weight and :threshold forced to
  # nil. The caller's +opts+ hash is left untouched.
  #
  # @param char_width [Integer] see jaro_winkler_distance_bytes
  # @param runtime_model [RuntimeModel]
  # @param inp [String]
  # @param opts [Hash] :min_score and :n_best_results are honoured; any
  #   :weight / :threshold entries are overridden
  # @return [Array] see jaro_winkler_distance_bytes
  def self.jaro_distance_bytes(char_width, runtime_model, inp, opts = {})
    jaro_opts = opts.merge(weight: nil, threshold: nil)
    BatchJaroWinkler.jaro_winkler_distance_bytes(char_width, runtime_model, inp, jaro_opts)
  end

  # Same as jaro_distance_bytes for a regular (not pre-encoded) input.
  #
  # @param runtime_model [RuntimeModel]
  # @param inp [String]
  # @param opts [Hash] see jaro_distance_bytes
  # @return [Array] see jaro_winkler_distance_bytes
  def self.jaro_distance(runtime_model, inp, opts = {})
    BatchJaroWinkler.jaro_distance_bytes(0, runtime_model, inp, opts)
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: batch_jaro_winkler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dominik Bousquet
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-04-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.12'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.12.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.12'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.12.2
|
33
|
+
description: This project aims to perform jaro and jaro winkler distance calculations
|
34
|
+
as fast as possible. It does that by using an optimized model built in advance of
|
35
|
+
the actual runtime calculations. Supports any encoding. Built with maximum performance
|
36
|
+
in mind.
|
37
|
+
email: bousquet.dominik@gmail.com
|
38
|
+
executables: []
|
39
|
+
extensions:
|
40
|
+
- ext/batch_jaro_winkler/extconf.rb
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- ext/batch_jaro_winkler/batch_jaro_winkler.c
|
44
|
+
- ext/batch_jaro_winkler/ext/LICENSE.uthash.txt
|
45
|
+
- ext/batch_jaro_winkler/ext/batch_jaro_winkler.c
|
46
|
+
- ext/batch_jaro_winkler/ext/batch_jaro_winkler.h
|
47
|
+
- ext/batch_jaro_winkler/ext/batch_jaro_winkler_internal.h
|
48
|
+
- ext/batch_jaro_winkler/ext/batch_jaro_winkler_runtime.h
|
49
|
+
- ext/batch_jaro_winkler/ext/uthash.h
|
50
|
+
- ext/batch_jaro_winkler/extconf.rb
|
51
|
+
- lib/batch_jaro_winkler.rb
|
52
|
+
- lib/batch_jaro_winkler/version.rb
|
53
|
+
homepage: https://github.com/dbousque/batch_jaro_winkler
|
54
|
+
licenses:
|
55
|
+
- MIT
|
56
|
+
metadata:
|
57
|
+
source_code_uri: https://github.com/dbousque
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubygems_version: 3.1.2
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: Fast batch jaro winkler distance implementation in C99.
|
77
|
+
test_files: []
|