batch_jaro_winkler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fa81343451beff7427878758a54f8e1af1b8cc2ee1905ab437bad420af0450c7
4
+ data.tar.gz: a13aac63be06874621637a2f94c7256c4e5a5ffbd6f12316cc226aa05bad87f1
5
+ SHA512:
6
+ metadata.gz: c0fa50b077e593d2d69e89d5695c337f2bcbe755e390519af23d67e4d7893222f7576703ba0e1b5a09e2344d5f925762d5492bfe49a9925992e1ff588b841b27
7
+ data.tar.gz: 1c0e9765a11ded4a92d0fb9cf675674b73e145c04bf41aab438510159879f2ccce56da064f6dd4170ed0da0c9b80171dd88020f48ce4cdb0d1cae94014b31788
@@ -0,0 +1,104 @@
1
+
2
+
3
+ /*
4
+ For this file to work with other ruby implementations than MRI, replace everything with:
5
+
6
+ #include "ext/batch_jaro_winkler.c"
7
+ void Init_batch_jaro_winkler(void){}
8
+ */
9
+
10
+ #include "ext/batch_jaro_winkler.c"
11
+ #include "ruby.h"
12
+ #include "ruby/encoding.h"
13
+
14
// Converts an array of C results (a bjw_result* passed as an integer in
// rb_c_results) into ruby [candidate, score] pairs appended to rb_results.
// When rb_inp_encoded is falsy, candidate text is raw UTF-32LE (char_width
// bytes per character) and is converted to UTF-8 before being handed to ruby.
// Returns Qtrue on success, Qfalse if a temporary buffer cannot be allocated.
VALUE rb_bjw_build_runtime_result(VALUE self, VALUE tmp_store, VALUE rb_results, VALUE rb_c_results, VALUE rb_nb_results, VALUE rb_inp_encoded, VALUE rb_char_width)
{
	bjw_result *results;
	uint32_t nb_results;
	uint32_t i_result;
	VALUE tmp_candidate;
	rb_encoding *utf32le_encoding;
	rb_encoding *utf8_encoding;
	VALUE rb_utf8_encoding;
	uint32_t char_width;
	int inp_encoded;
	char *all_candidates;
	VALUE rb_all_candidates;
	uint64_t total_nb_bytes;
	uint64_t decal;
	uint64_t bytes_len;
	uint64_t candidate_length_in_bytes;
	uint64_t i_char;

	// The C results pointer is smuggled through ruby as an unsigned integer.
	nb_results = (uint32_t)(NUM2ULL(rb_nb_results));
	results = (bjw_result*)(NUM2ULL(rb_c_results));
	char_width = (uint32_t)(NUM2ULL(rb_char_width));
	inp_encoded = RTEST(rb_inp_encoded);

	utf32le_encoding = rb_enc_find("UTF-32LE");
	utf8_encoding = rb_enc_find("UTF-8");
	rb_utf8_encoding = rb_enc_from_encoding(utf8_encoding);
	// We use tmp_store so that local ruby objects are marked by the GC
	rb_ary_push(tmp_store, rb_utf8_encoding);

	if (!inp_encoded)
	{
		// Concatenate every candidate into one UTF-32LE buffer so a single
		// rb_str_encode call converts all of them to UTF-8 at once.
		total_nb_bytes = 0;
		for (i_result = 0; i_result < nb_results; i_result++)
			total_nb_bytes += results[i_result].candidate_length;
		total_nb_bytes *= char_width;
		all_candidates = malloc(total_nb_bytes);
		if (!all_candidates)
			return (Qfalse);
		decal = 0;
		for (i_result = 0; i_result < nb_results; i_result++)
		{
			bytes_len = results[i_result].candidate_length * char_width;
			for (i_char = 0; i_char < bytes_len; i_char++)
				all_candidates[decal + i_char] = ((char*)results[i_result].candidate)[i_char];
			decal += bytes_len;
		}
		// rb_enc_str_new copies the buffer, so the malloc'd copy can be freed.
		rb_all_candidates = rb_enc_str_new(all_candidates, total_nb_bytes, utf32le_encoding);
		// We use tmp_store so that local ruby objects are marked by the GC
		rb_ary_push(tmp_store, rb_all_candidates);
		free(all_candidates);
		rb_all_candidates = rb_str_encode(rb_all_candidates, rb_utf8_encoding, 0, Qnil);
		// We use tmp_store so that local ruby objects are marked by the GC
		rb_ary_push(tmp_store, rb_all_candidates);
		// Points into the ruby string; the string is kept alive via tmp_store.
		all_candidates = RSTRING_PTR(rb_all_candidates);
	}

	decal = 0;
	for (i_result = 0; i_result < nb_results; i_result++)
	{
		if (!inp_encoded)
		{
			// Walk candidate_length UTF-8 codepoints by inspecting each lead
			// byte (0xf0 prefix -> 4 bytes, 0xe0 -> 3, 0xc0 -> 2, else 1) to
			// recover this candidate's byte length inside the shared buffer.
			candidate_length_in_bytes = 0;
			for (i_char = 0; i_char < results[i_result].candidate_length; i_char++)
			{
				if ((all_candidates[decal + candidate_length_in_bytes] & 0xf8) == 0xf0)
					candidate_length_in_bytes += 4;
				else if ((all_candidates[decal + candidate_length_in_bytes] & 0xf0) == 0xe0)
					candidate_length_in_bytes += 3;
				else if ((all_candidates[decal + candidate_length_in_bytes] & 0xe0) == 0xc0)
					candidate_length_in_bytes += 2;
				else
					candidate_length_in_bytes += 1;
			}
			tmp_candidate = rb_enc_str_new(all_candidates + decal, candidate_length_in_bytes, utf8_encoding);
			decal += candidate_length_in_bytes;
		}
		else
			// Caller provided pre-encoded bytes: expose them as a binary string.
			tmp_candidate = rb_str_new(results[i_result].candidate, results[i_result].candidate_length * char_width);
		rb_ary_push(rb_results, rb_ary_new_from_args(2, tmp_candidate, rb_float_new(results[i_result].score)));
	}
	return (Qtrue);
}
97
+
98
+ void Init_batch_jaro_winkler(void)
99
+ {
100
+ VALUE cBatchJaroWinkler;
101
+
102
+ cBatchJaroWinkler = rb_define_module("BatchJaroWinkler");
103
+ rb_define_singleton_method(cBatchJaroWinkler, "rb_bjw_build_runtime_result", rb_bjw_build_runtime_result, 6);
104
+ }
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2005-2018, Troy D. Hanson http://troydhanson.github.com/uthash/
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+
10
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
11
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
12
+ TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
13
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
14
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
15
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
17
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
18
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
19
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
20
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,890 @@
1
+ /*
2
+ MIT License
3
+
4
+ Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ */
24
+
25
+ #include "batch_jaro_winkler_internal.h"
26
+
27
+ #define BJW_CHAR_TYPE uint32_t
28
+ #define BJW_CHAR_ACCESS_TYPE uint32_t
29
+ #include "batch_jaro_winkler_runtime.h"
30
+
31
+ #undef BJW_CHAR_ACCESS_TYPE
32
+ #define BJW_CHAR_ACCESS_TYPE uint16_t
33
+ #include "batch_jaro_winkler_runtime.h"
34
+
35
+ #undef BJW_CHAR_ACCESS_TYPE
36
+ #define BJW_CHAR_ACCESS_TYPE uint8_t
37
+ #include "batch_jaro_winkler_runtime.h"
38
+
39
+ #undef BJW_CHAR_TYPE
40
+ #undef BJW_CHAR_ACCESS_TYPE
41
+ #define BJW_CHAR_TYPE uint16_t
42
+ #define BJW_CHAR_ACCESS_TYPE uint32_t
43
+ #include "batch_jaro_winkler_runtime.h"
44
+
45
+ #undef BJW_CHAR_ACCESS_TYPE
46
+ #define BJW_CHAR_ACCESS_TYPE uint16_t
47
+ #include "batch_jaro_winkler_runtime.h"
48
+
49
+ #undef BJW_CHAR_ACCESS_TYPE
50
+ #define BJW_CHAR_ACCESS_TYPE uint8_t
51
+ #include "batch_jaro_winkler_runtime.h"
52
+
53
+ #undef BJW_CHAR_TYPE
54
+ #undef BJW_CHAR_ACCESS_TYPE
55
+ #define BJW_CHAR_TYPE uint8_t
56
+ #define BJW_CHAR_ACCESS_TYPE uint32_t
57
+ #include "batch_jaro_winkler_runtime.h"
58
+
59
+ #undef BJW_CHAR_ACCESS_TYPE
60
+ #define BJW_CHAR_ACCESS_TYPE uint16_t
61
+ #include "batch_jaro_winkler_runtime.h"
62
+
63
+ #undef BJW_CHAR_ACCESS_TYPE
64
+ #define BJW_CHAR_ACCESS_TYPE uint8_t
65
+ #include "batch_jaro_winkler_runtime.h"
66
+
67
+ #undef BJW_CHAR_TYPE
68
+ #undef BJW_CHAR_ACCESS_TYPE
69
+
70
+ static inline uint32_t sorted_candidate_char_at(t_sorted_candidate *sorted_candidate, uint32_t i)
71
+ {
72
+ uint32_t res;
73
+
74
+ if (sorted_candidate->char_width == 4)
75
+ res = ((uint32_t*)sorted_candidate->candidate)[i];
76
+ else if (sorted_candidate->char_width == 2)
77
+ res = ((uint16_t*)sorted_candidate->candidate)[i];
78
+ else
79
+ res = ((uint8_t*)sorted_candidate->candidate)[i];
80
+ return (res);
81
+ }
82
+
83
+ static int sort_by_length_and_alphabetical_order(const void *void_cand1, const void *void_cand2)
84
+ {
85
+ t_sorted_candidate *cand1;
86
+ t_sorted_candidate *cand2;
87
+ uint32_t i;
88
+
89
+ cand1 = (t_sorted_candidate*)void_cand1;
90
+ cand2 = (t_sorted_candidate*)void_cand2;
91
+ if (cand1->candidate_length < cand2->candidate_length)
92
+ return (-1);
93
+ if (cand1->candidate_length > cand2->candidate_length)
94
+ return (1);
95
+ for (i = 0; i < cand1->candidate_length && i < cand2->candidate_length && sorted_candidate_char_at(cand1, i) == sorted_candidate_char_at(cand2, i); i++){}
96
+ return (
97
+ i >= cand1->candidate_length && i >= cand2->candidate_length ? 0 :
98
+ i >= cand1->candidate_length ? -1 :
99
+ i >= cand2->candidate_length ? 1 :
100
+ sorted_candidate_char_at(cand1, i) < sorted_candidate_char_at(cand2, i) ? -1 :
101
+ 1
102
+ );
103
+ }
104
+
105
// Frees an entire t_char_occurrences uthash table, including each entry's
// nested candidates_occurrences table and the per-candidate occ_indexes
// buffers. Safe to call with NULL (HASH_ITER iterates zero times).
static void free_char_occurrences(t_char_occurrences *char_occurrences)
{
	t_char_occurrences *tmp_char_occurrence;
	t_char_occurrences *tmp1;
	t_tmp_candidate_occurrences *candidate_occurrences;
	t_tmp_candidate_occurrences *tmp_candidate_occurrences;
	t_tmp_candidate_occurrences *tmp2;

	HASH_ITER(hh, char_occurrences, tmp_char_occurrence, tmp1)
	{
		// Delete from the hash before freeing so uthash bookkeeping stays valid.
		HASH_DEL(char_occurrences, tmp_char_occurrence);
		candidate_occurrences = tmp_char_occurrence->candidates_occurrences;
		HASH_ITER(hh, candidate_occurrences, tmp_candidate_occurrences, tmp2)
		{
			HASH_DEL(candidate_occurrences, tmp_candidate_occurrences);
			free(tmp_candidate_occurrences->occ_indexes);
			free(tmp_candidate_occurrences);
		}
		free(tmp_char_occurrence);
	}
}
126
+
127
+ static void *exit_build_exportable_model_for_thread_error(t_sorted_candidate *sorted_candidates, t_char_occurrences *char_occurrences)
128
+ {
129
+ free(sorted_candidates);
130
+ free_char_occurrences(char_occurrences);
131
+ return (NULL);
132
+ }
133
+
134
+ static uint8_t *build_exportable_model_for_thread(
135
+ void **original_candidates, uint32_t original_char_width, void **compressed_candidates, uint32_t compressed_char_width,
136
+ uint32_t char_access_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t *res_model_size
137
+ )
138
+ {
139
+ uint32_t i_candidate;
140
+ uint32_t i_char;
141
+ uint32_t i_occurrence;
142
+ uint32_t i_candidate_occurrrence;
143
+ // important to set to NULL for uthash
144
+ t_char_occurrences *char_occurrences = NULL;
145
+ t_char_occurrences *char_occurrence;
146
+ t_tmp_candidate_occurrences *candidate_occurrences;
147
+ uint32_t key;
148
+ uint32_t total_candidates_lengths;
149
+ uint32_t nb_char_matches;
150
+ uint32_t nb_candidate_occurrences;
151
+ uint8_t *model;
152
+ t_sorted_candidate *sorted_candidates;
153
+ uint32_t store_original_candidates;
154
+
155
+ store_original_candidates = original_candidates != compressed_candidates ? 1 : 0;
156
+ sorted_candidates = malloc(sizeof(t_sorted_candidate) * nb_candidates);
157
+ if (!sorted_candidates)
158
+ return (NULL);
159
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
160
+ {
161
+ sorted_candidates[i_candidate] = (t_sorted_candidate){
162
+ .original_ind = i_candidate,
163
+ .candidate = compressed_candidates[i_candidate],
164
+ .char_width = compressed_char_width,
165
+ .min_score = min_scores ? min_scores[i_candidate] : -1.0f,
166
+ .candidate_length = candidates_lengths[i_candidate]
167
+ };
168
+ }
169
+
170
+ // we sort to improve the runtime memory access pattern
171
+ qsort(sorted_candidates, nb_candidates, sizeof(t_sorted_candidate), &sort_by_length_and_alphabetical_order);
172
+
173
+ nb_char_matches = 0;
174
+ nb_candidate_occurrences = 0;
175
+ total_candidates_lengths = 0;
176
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
177
+ {
178
+ for (i_char = 0; i_char < sorted_candidates[i_candidate].candidate_length; i_char++)
179
+ {
180
+ // Find character matches
181
+ if (compressed_char_width == 4)
182
+ key = (uint32_t)(((uint32_t*)sorted_candidates[i_candidate].candidate)[i_char]);
183
+ else if (compressed_char_width == 2)
184
+ key = (uint32_t)(((uint16_t*)sorted_candidates[i_candidate].candidate)[i_char]);
185
+ else
186
+ key = (uint32_t)(((uint8_t*)sorted_candidates[i_candidate].candidate)[i_char]);
187
+ char_occurrence = NULL;
188
+ HASH_FIND(hh, char_occurrences, &key, sizeof(uint32_t), char_occurrence);
189
+ if (!char_occurrence) {
190
+ nb_char_matches++;
191
+ if (!(char_occurrence = malloc(sizeof(t_char_occurrences))))
192
+ return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
193
+ char_occurrence->id = key;
194
+ // important to set to NULL for uthash
195
+ char_occurrence->candidates_occurrences = NULL;
196
+ if (store_original_candidates)
197
+ {
198
+ if (original_char_width == 4)
199
+ char_occurrence->original_representation = (uint32_t)(((uint32_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
200
+ else if (original_char_width == 2)
201
+ char_occurrence->original_representation = (uint32_t)(((uint16_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
202
+ else
203
+ char_occurrence->original_representation = (uint32_t)(((uint8_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
204
+ }
205
+ HASH_ADD(hh, char_occurrences, id, sizeof(uint32_t), char_occurrence);
206
+ }
207
+
208
+ // Find character occurences for this candidate
209
+ key = i_candidate;
210
+ candidate_occurrences = NULL;
211
+ HASH_FIND(hh, char_occurrence->candidates_occurrences, &key, sizeof(uint32_t), candidate_occurrences);
212
+ if (!candidate_occurrences)
213
+ {
214
+ nb_candidate_occurrences++;
215
+ if (!(candidate_occurrences = malloc(sizeof(t_tmp_candidate_occurrences))))
216
+ return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
217
+ candidate_occurrences->id = key;
218
+ candidate_occurrences->occ_indexes_len = 0;
219
+ candidate_occurrences->occ_indexes_size = 32;
220
+ candidate_occurrences->occ_indexes = malloc(char_access_width * candidate_occurrences->occ_indexes_size);
221
+ if (!candidate_occurrences->occ_indexes)
222
+ {
223
+ free(candidate_occurrences);
224
+ return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
225
+ }
226
+ HASH_ADD(hh, char_occurrence->candidates_occurrences, id, sizeof(uint32_t), candidate_occurrences);
227
+ }
228
+
229
+ // Not big enough, increase size
230
+ if (candidate_occurrences->occ_indexes_len == candidate_occurrences->occ_indexes_size)
231
+ {
232
+ void *new_occ_indexes = malloc(char_access_width * candidate_occurrences->occ_indexes_size * 2);
233
+ if (!new_occ_indexes)
234
+ return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
235
+ memcpy(new_occ_indexes, candidate_occurrences->occ_indexes, char_access_width * candidate_occurrences->occ_indexes_size);
236
+ candidate_occurrences->occ_indexes_size *= 2;
237
+ free(candidate_occurrences->occ_indexes);
238
+ candidate_occurrences->occ_indexes = new_occ_indexes;
239
+ }
240
+
241
+ if (char_access_width == 4)
242
+ ((uint32_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
243
+ if (char_access_width == 2)
244
+ ((uint16_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
245
+ else
246
+ ((uint8_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
247
+ candidate_occurrences->occ_indexes_len++;
248
+ }
249
+ total_candidates_lengths += sorted_candidates[i_candidate].candidate_length;
250
+ }
251
+
252
+ // candidate_ind + nb_occurrences
253
+ uint32_t metadata_size = (sizeof(uint32_t) + char_access_width) * nb_candidate_occurrences;
254
+ uint32_t indexes_size = char_access_width * total_candidates_lengths;
255
+ uint32_t occurrences_size = metadata_size + indexes_size;
256
+
257
+ uint32_t total_size =
258
+ sizeof(uint32_t) + // nb_candidates
259
+ sizeof(uint32_t) + // total_candidates_lengths
260
+ sizeof(uint32_t) + // min_scores present or not (uint32_t used to keep 4 bytes alignment)
261
+ sizeof(uint32_t) + // nb_char_matches
262
+ sizeof(uint32_t) + // nb_candidate_occurrences
263
+ sizeof(uint32_t) + // store_original_candidates
264
+ sizeof(uint32_t) * nb_candidates * (min_scores ? 1 : 0) + // min_scores - can go from 0.0 to 1.0 -> convert to uint32_t for cross-platform support
265
+ sizeof(uint32_t) * nb_char_matches + // chars_occurrences_decals
266
+ sizeof(uint32_t) * nb_char_matches + // nb_candidates_per_char_match
267
+ sizeof(uint32_t) * (nb_candidates + 1) + // candidates_decal
268
+ original_char_width * total_candidates_lengths * store_original_candidates + // original_candidates (if store_original_candidates)
269
+ compressed_char_width * total_candidates_lengths + // candidates (compressed)
270
+ original_char_width * nb_char_matches * store_original_candidates + // original_chars (if store_original_candidates)
271
+ compressed_char_width * nb_char_matches + // chars
272
+ occurrences_size; // occurrences
273
+
274
+ if (!(model = malloc(total_size)))
275
+ return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
276
+ uint8_t *res_buffer_head = model;
277
+ *((uint32_t*)res_buffer_head) = nb_candidates;
278
+ res_buffer_head += sizeof(uint32_t);
279
+ *((uint32_t*)res_buffer_head) = total_candidates_lengths;
280
+ res_buffer_head += sizeof(uint32_t);
281
+ *((uint32_t*)res_buffer_head) = min_scores ? 1 : 0;
282
+ res_buffer_head += sizeof(uint32_t);
283
+ *((uint32_t*)res_buffer_head) = nb_char_matches;
284
+ res_buffer_head += sizeof(uint32_t);
285
+ *((uint32_t*)res_buffer_head) = nb_candidate_occurrences;
286
+ res_buffer_head += sizeof(uint32_t);
287
+ *((uint32_t*)res_buffer_head) = store_original_candidates;
288
+ res_buffer_head += sizeof(uint32_t);
289
+
290
+ if (min_scores)
291
+ {
292
+ uint32_t *res_min_scores = (uint32_t*)res_buffer_head;
293
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
294
+ {
295
+ // To prevent rounding errors when min_score == 1.0f
296
+ if (sorted_candidates[i_candidate].min_score >= 1.0f)
297
+ res_min_scores[i_candidate] = UINT32_MAX;
298
+ else
299
+ res_min_scores[i_candidate] = (uint32_t)(sorted_candidates[i_candidate].min_score * UINT32_MAX);
300
+ }
301
+ res_buffer_head += sizeof(uint32_t) * nb_candidates;
302
+ }
303
+
304
+ uint32_t *chars_occurrences_decals = (uint32_t*)res_buffer_head;
305
+ res_buffer_head += sizeof(uint32_t) * nb_char_matches;
306
+ uint32_t *nb_candidates_per_char_match = (uint32_t*)res_buffer_head;
307
+ res_buffer_head += sizeof(uint32_t) * nb_char_matches;
308
+
309
+ uint32_t *candidates_decal = (uint32_t*)res_buffer_head;
310
+ uint32_t decal = 0;
311
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
312
+ {
313
+ candidates_decal[i_candidate] = decal;
314
+ decal += sorted_candidates[i_candidate].candidate_length;
315
+ }
316
+ candidates_decal[i_candidate] = decal;
317
+ res_buffer_head += sizeof(uint32_t) * (nb_candidates + 1);
318
+
319
+ void *res_original_candidates = res_buffer_head;
320
+ res_buffer_head += original_char_width * total_candidates_lengths * store_original_candidates;
321
+ void *res_compressed_candidates = res_buffer_head;
322
+ res_buffer_head += compressed_char_width * total_candidates_lengths;
323
+ uint32_t candidates_char_decal = 0;
324
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
325
+ {
326
+ if (store_original_candidates)
327
+ {
328
+ memcpy(
329
+ res_original_candidates + (candidates_char_decal * original_char_width),
330
+ original_candidates[sorted_candidates[i_candidate].original_ind],
331
+ sorted_candidates[i_candidate].candidate_length * original_char_width
332
+ );
333
+ }
334
+ memcpy(
335
+ res_compressed_candidates + (candidates_char_decal * compressed_char_width),
336
+ sorted_candidates[i_candidate].candidate,
337
+ sorted_candidates[i_candidate].candidate_length * compressed_char_width
338
+ );
339
+ candidates_char_decal += sorted_candidates[i_candidate].candidate_length;
340
+ }
341
+
342
+ void *original_chars = res_buffer_head;
343
+ res_buffer_head += original_char_width * nb_char_matches * store_original_candidates;
344
+ void *chars = res_buffer_head;
345
+ res_buffer_head += compressed_char_width * nb_char_matches;
346
+
347
+ uint8_t *occurrences = (uint8_t*)res_buffer_head;
348
+ uint8_t *occurrences_head = occurrences;
349
+
350
+ i_char = 0;
351
+ for (char_occurrence = char_occurrences; char_occurrence; char_occurrence = char_occurrence->hh.next)
352
+ {
353
+ if (store_original_candidates)
354
+ {
355
+ if (original_char_width == 4)
356
+ ((uint32_t*)original_chars)[i_char] = char_occurrence->original_representation;
357
+ else if (original_char_width == 2)
358
+ ((uint16_t*)original_chars)[i_char] = char_occurrence->original_representation;
359
+ else
360
+ ((uint8_t*)original_chars)[i_char] = char_occurrence->original_representation;
361
+ }
362
+ if (compressed_char_width == 4)
363
+ ((uint32_t*)chars)[i_char] = (uint32_t)char_occurrence->id;
364
+ else if (compressed_char_width == 2)
365
+ ((uint16_t*)chars)[i_char] = (uint16_t)char_occurrence->id;
366
+ else
367
+ ((uint8_t*)chars)[i_char] = (uint8_t)char_occurrence->id;
368
+ chars_occurrences_decals[i_char] = occurrences_head - occurrences;
369
+
370
+ i_candidate_occurrrence = 0;
371
+ for (candidate_occurrences = char_occurrence->candidates_occurrences; candidate_occurrences; candidate_occurrences = candidate_occurrences->hh.next)
372
+ {
373
+ // 1 uint32_t for the candidate's index
374
+ // + 1 BJW_CHAR_ACCESS_TYPE for the number of occurrences
375
+ // + N BJW_CHAR_ACCESS_TYPE for the occurrences indexes
376
+ *((uint32_t*)occurrences_head) = (uint32_t)candidate_occurrences->id;
377
+ occurrences_head += sizeof(uint32_t);
378
+ if (char_access_width == 4)
379
+ *((uint32_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
380
+ else if (char_access_width == 2)
381
+ *((uint16_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
382
+ else
383
+ *((uint8_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
384
+ occurrences_head += char_access_width;
385
+ for (i_occurrence = 0; i_occurrence < candidate_occurrences->occ_indexes_len; i_occurrence++)
386
+ {
387
+ if (char_access_width == 4)
388
+ *((uint32_t*)occurrences_head) = ((uint32_t*)candidate_occurrences->occ_indexes)[i_occurrence];
389
+ if (char_access_width == 2)
390
+ *((uint16_t*)occurrences_head) = ((uint16_t*)candidate_occurrences->occ_indexes)[i_occurrence];
391
+ else
392
+ *((uint8_t*)occurrences_head) = ((uint8_t*)candidate_occurrences->occ_indexes)[i_occurrence];
393
+ occurrences_head += char_access_width;
394
+ }
395
+
396
+ i_candidate_occurrrence++;
397
+ }
398
+
399
+ nb_candidates_per_char_match[i_char] = i_candidate_occurrrence;
400
+ i_char++;
401
+ }
402
+
403
+ *res_model_size = total_size;
404
+ free_char_occurrences(char_occurrences);
405
+ free(sorted_candidates);
406
+
407
+ return (model);
408
+ }
409
+
410
// pack all result data in single buffer
// Splits the candidate set into nb_runtime_threads contiguous slices, builds
// one sub-model per slice (build_exportable_model_for_thread), then
// concatenates them behind a header of uint32 fields:
//   [0] nb_runtime_threads  [1] nb_candidates  [2] compressed_char_width
//   [3] char_access_width   [4] original_char_width
//   [5 .. 5+nb_threads-1]   aligned byte size of each thread's sub-model.
// Returns a malloc'd buffer (ownership passes to the caller) or NULL on
// allocation failure; *res_model_size receives the total size in bytes.
static void *build_exportable_model(
	void **original_candidates, uint32_t original_char_width, void **compressed_candidates, uint32_t compressed_char_width,
	uint32_t char_access_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size
)
{
	uint32_t i_thread;
	uint32_t i;
	// NOTE(review): VLAs sized by nb_runtime_threads — assumes callers keep
	// the thread count small; confirm an upper bound is enforced upstream.
	uint8_t *model_per_thread[nb_runtime_threads];
	uint32_t model_size_per_thread[nb_runtime_threads];
	uint8_t *res_buffer;
	uint8_t *res_buffer_head;
	void **original_candidates_for_thread;
	void **compressed_candidates_for_thread;
	uint32_t *candidates_lengths_for_thread;
	uint32_t nb_candidates_for_thread;
	float *min_scores_for_thread;
	uint32_t nb_taken_candidates;
	uint32_t aligned_model_size;

	nb_taken_candidates = 0;
	*res_model_size = 0;
	for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
	{
		// Even split; the last thread absorbs the division remainder.
		nb_candidates_for_thread = i_thread == nb_runtime_threads - 1 ? nb_candidates - nb_taken_candidates : (nb_candidates / nb_runtime_threads);
		original_candidates_for_thread = original_candidates + nb_taken_candidates;
		compressed_candidates_for_thread = compressed_candidates + nb_taken_candidates;
		candidates_lengths_for_thread = candidates_lengths + nb_taken_candidates;
		min_scores_for_thread = min_scores ? min_scores + nb_taken_candidates : NULL;

		model_per_thread[i_thread] = build_exportable_model_for_thread(
			original_candidates_for_thread, original_char_width, compressed_candidates_for_thread, compressed_char_width,
			char_access_width, candidates_lengths_for_thread, nb_candidates_for_thread, min_scores_for_thread, &(model_size_per_thread[i_thread])
		);
		if (!model_per_thread[i_thread])
		{
			for (i = 0; i < i_thread; i++)
				free(model_per_thread[i]);
			return (NULL);
		}
		// align on next 4 byte boundary
		// NOTE(review): this adds 4 padding bytes even when the size is
		// already 4-aligned — wasteful, but the same formula is used in the
		// writer loop below, so offsets remain mutually consistent.
		*res_model_size += model_size_per_thread[i_thread] + (4 - (model_size_per_thread[i_thread] % 4));
		nb_taken_candidates += nb_candidates_for_thread;
	}

	// we put the number of threads + nb candidates + char_width + char_access_width + original_char_width and the models per thread sizes at the start of the model
	*res_model_size += sizeof(uint32_t) * (nb_runtime_threads + 5);
	res_buffer = malloc(*res_model_size);
	if (!res_buffer)
	{
		for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
			free(model_per_thread[i_thread]);
		return (NULL);
	}
	*((uint32_t*)(res_buffer + sizeof(uint32_t) * 0)) = nb_runtime_threads;
	*((uint32_t*)(res_buffer + sizeof(uint32_t) * 1)) = nb_candidates;
	*((uint32_t*)(res_buffer + sizeof(uint32_t) * 2)) = compressed_char_width;
	*((uint32_t*)(res_buffer + sizeof(uint32_t) * 3)) = char_access_width;
	*((uint32_t*)(res_buffer + sizeof(uint32_t) * 4)) = original_char_width;
	res_buffer_head = res_buffer + sizeof(uint32_t) * (nb_runtime_threads + 5);
	for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
	{
		// align on next 4 byte boundary
		aligned_model_size = model_size_per_thread[i_thread] + (4 - (model_size_per_thread[i_thread] % 4));
		*((uint32_t*)(res_buffer + sizeof(uint32_t) * (i_thread + 5))) = aligned_model_size;
		memcpy(res_buffer_head, model_per_thread[i_thread], model_size_per_thread[i_thread]);
		// align on next 4 byte boundary
		res_buffer_head += aligned_model_size;
		free(model_per_thread[i_thread]);
	}

	return (res_buffer);
}
483
+
484
// Frees every entry of a t_char uthash table (the distinct-character map
// built by bjw_build_exportable_model). Safe with NULL.
static void free_chars(t_char *chars)
{
	t_char *tmp_char;
	t_char *tmp;

	HASH_ITER(hh, chars, tmp_char, tmp)
	{
		// Remove from the hash before freeing to keep uthash state valid.
		HASH_DEL(chars, tmp_char);
		free(tmp_char);
	}
}
495
+
496
// Used by the ruby library
// Thin exported wrapper over free() — presumably so buffers allocated by this
// library are released with the same allocator across the FFI boundary
// (TODO confirm against the gem's ruby-side code).
void _bjw_free(void *ptr)
{
	free(ptr);
}
501
+
502
// Pack all result data in single buffer.
// Public entry point: builds an exportable model from nb_candidates strings.
// candidates[i] points to candidates_lengths[i] characters, each char_width
// (1, 2 or 4) bytes wide. min_scores may be NULL. The character alphabet is
// remapped to a dense id space so candidate text can be stored with the
// smallest workable char width, and per-character index widths are chosen
// from the longest candidate. Returns a malloc'd model (caller frees) or
// NULL on allocation failure; *res_model_size receives its size.
void *bjw_build_exportable_model(void **candidates, uint32_t char_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size)
{
	uint32_t i_candidate;
	uint32_t i_char;
	uint32_t i;
	uint32_t longest_candidate;
	// important to set to NULL for uthash
	t_char *chars = NULL;
	t_char *chr;
	uint32_t key;
	uint32_t nb_chars;
	uint32_t compressed_char_width;
	uint32_t char_access_width;
	void **compressed_candidates;
	void *exportable_model;

	// First pass: count distinct characters and find the longest candidate.
	nb_chars = 0;
	longest_candidate = 0;
	for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
	{
		if (candidates_lengths[i_candidate] > longest_candidate)
			longest_candidate = candidates_lengths[i_candidate];
		for (i_char = 0; i_char < candidates_lengths[i_candidate]; i_char++)
		{
			if (char_width == 4)
				key = (uint32_t)(((uint32_t**)candidates)[i_candidate][i_char]);
			else if (char_width == 2)
				key = (uint32_t)(((uint16_t**)candidates)[i_candidate][i_char]);
			else
				key = (uint32_t)(((uint8_t**)candidates)[i_candidate][i_char]);
			chr = NULL;
			HASH_FIND(hh, chars, &key, sizeof(uint32_t), chr);
			if (!chr) {
				nb_chars++;
				if (!(chr = malloc(sizeof(t_char))))
				{
					free_chars(chars);
					return (NULL);
				}
				chr->id = key;
				// Ids start at 1; 0 stays reserved (see comment below).
				chr->new_representation = nb_chars;
				HASH_ADD(hh, chars, id, sizeof(uint32_t), chr);
			}
		}
	}

	compressed_char_width = char_width;
	// We keep one available char (0) to represent an unknown character in the input at runtime.
	if (nb_chars < 256 - 1)
		compressed_char_width = 1;
	else if (nb_chars < 256 * 256 - 1)
		compressed_char_width = 2;
	char_access_width = 4;
	// We can't go up to 256, since we need to be able to send inputs of arbitrary lengths at runtime
	// and characters up to longest_candidate * 2 can be considered for the score.
	if (longest_candidate < 128)
		char_access_width = 1;
	else if (longest_candidate < 256 * 128)
		char_access_width = 2;

	compressed_candidates = candidates;

	// Rewrite candidates with smallest possible char_width
	if (compressed_char_width < char_width)
	{
		if (!(compressed_candidates = malloc(sizeof(void*) * nb_candidates)))
		{
			free_chars(chars);
			return (NULL);
		}
		for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
		{
			if (!(compressed_candidates[i_candidate] = malloc(compressed_char_width * candidates_lengths[i_candidate])))
			{
				free_chars(chars);
				for (i = 0; i < i_candidate; i++)
					free(compressed_candidates[i]);
				free(compressed_candidates);
				return (NULL);
			}
			// Second pass: translate each original character to its dense id.
			for (i_char = 0; i_char < candidates_lengths[i_candidate]; i_char++)
			{
				if (char_width == 4)
					key = (uint32_t)(((uint32_t**)candidates)[i_candidate][i_char]);
				else if (char_width == 2)
					key = (uint32_t)(((uint16_t**)candidates)[i_candidate][i_char]);
				else
					key = (uint32_t)(((uint8_t**)candidates)[i_candidate][i_char]);
				// Lookup cannot miss: every character was added in the first pass.
				HASH_FIND(hh, chars, &key, sizeof(uint32_t), chr);
				if (compressed_char_width == 4)
					((uint32_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
				else if (compressed_char_width == 2)
					((uint16_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
				else
					((uint8_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
			}
		}
	}

	free_chars(chars);

	exportable_model = build_exportable_model(
		candidates, char_width, compressed_candidates, compressed_char_width, char_access_width,
		candidates_lengths, nb_candidates, min_scores, nb_runtime_threads, res_model_size
	);

	// Only free the compressed copies when compression actually allocated them.
	if (compressed_candidates != candidates)
	{
		for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
			free(compressed_candidates[i_candidate]);
		free(compressed_candidates);
	}

	return (exportable_model);
}
618
+
619
+ void bjw_free_runtime_model(void *runtime_model)
620
+ {
621
+ uint32_t char_width;
622
+ uint32_t char_access_width;
623
+
624
+ char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 2));
625
+ char_access_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 3));
626
+
627
+ void (*free_function)(void*) = NULL;
628
+ if (char_width == 4 && char_access_width == 4)
629
+ free_function = free_runtime_model_uint32_t_uint32_t;
630
+ else if (char_width == 4 && char_access_width == 2)
631
+ free_function = free_runtime_model_uint32_t_uint16_t;
632
+ else if (char_width == 4 && char_access_width == 1)
633
+ free_function = free_runtime_model_uint32_t_uint8_t;
634
+ else if (char_width == 2 && char_access_width == 4)
635
+ free_function = free_runtime_model_uint16_t_uint32_t;
636
+ else if (char_width == 2 && char_access_width == 2)
637
+ free_function = free_runtime_model_uint16_t_uint16_t;
638
+ else if (char_width == 2 && char_access_width == 1)
639
+ free_function = free_runtime_model_uint16_t_uint8_t;
640
+ else if (char_width == 1 && char_access_width == 4)
641
+ free_function = free_runtime_model_uint8_t_uint32_t;
642
+ else if (char_width == 1 && char_access_width == 2)
643
+ free_function = free_runtime_model_uint8_t_uint16_t;
644
+ else if (char_width == 1 && char_access_width == 1)
645
+ free_function = free_runtime_model_uint8_t_uint8_t;
646
+
647
+ free_function(runtime_model);
648
+ }
649
+
650
+ void *bjw_build_runtime_model(void *exportable_model)
651
+ {
652
+ uint32_t nb_runtime_threads;
653
+ uint32_t nb_candidates;
654
+ uint32_t *model_size_per_thread;
655
+ uint32_t char_width;
656
+ uint32_t char_access_width;
657
+ uint32_t original_char_width;
658
+
659
+ uint8_t *exportable_model_head = (uint8_t*)exportable_model;
660
+ nb_runtime_threads = *((uint32_t*)exportable_model_head);
661
+ exportable_model_head += sizeof(uint32_t);
662
+ nb_candidates = *((uint32_t*)exportable_model_head);
663
+ exportable_model_head += sizeof(uint32_t);
664
+ char_width = *((uint32_t*)exportable_model_head);
665
+ exportable_model_head += sizeof(uint32_t);
666
+ char_access_width = *((uint32_t*)exportable_model_head);
667
+ exportable_model_head += sizeof(uint32_t);
668
+ original_char_width = *((uint32_t*)exportable_model_head);
669
+ exportable_model_head += sizeof(uint32_t);
670
+ model_size_per_thread = (uint32_t*)exportable_model_head;
671
+ exportable_model_head += sizeof(uint32_t) * nb_runtime_threads;
672
+
673
+ void *(*build_function)(uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t*) = NULL;
674
+ if (char_width == 4 && char_access_width == 4)
675
+ build_function = build_runtime_model_uint32_t_uint32_t;
676
+ else if (char_width == 4 && char_access_width == 2)
677
+ build_function = build_runtime_model_uint32_t_uint16_t;
678
+ else if (char_width == 4 && char_access_width == 1)
679
+ build_function = build_runtime_model_uint32_t_uint8_t;
680
+ else if (char_width == 2 && char_access_width == 4)
681
+ build_function = build_runtime_model_uint16_t_uint32_t;
682
+ else if (char_width == 2 && char_access_width == 2)
683
+ build_function = build_runtime_model_uint16_t_uint16_t;
684
+ else if (char_width == 2 && char_access_width == 1)
685
+ build_function = build_runtime_model_uint16_t_uint8_t;
686
+ else if (char_width == 1 && char_access_width == 4)
687
+ build_function = build_runtime_model_uint8_t_uint32_t;
688
+ else if (char_width == 1 && char_access_width == 2)
689
+ build_function = build_runtime_model_uint8_t_uint16_t;
690
+ else if (char_width == 1 && char_access_width == 1)
691
+ build_function = build_runtime_model_uint8_t_uint8_t;
692
+
693
+ return (build_function(exportable_model_head, nb_runtime_threads, nb_candidates, original_char_width, model_size_per_thread));
694
+ }
695
+
696
+ static int sort_results_by_score(const void *void_res1, const void *void_res2)
697
+ {
698
+ bjw_result *res1;
699
+ bjw_result *res2;
700
+
701
+ res1 = (bjw_result*)void_res1;
702
+ res2 = (bjw_result*)void_res2;
703
+ return (res1->score < res2->score ? 1 : res1->score == res2->score ? 0 : -1);
704
+ }
705
+
706
+ bjw_result *bjw_jaro_winkler_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, float weight, float threshold, uint32_t n_best_results, uint32_t *nb_results)
707
+ {
708
+ uint32_t i_thread;
709
+ uint32_t nb_runtime_threads;
710
+ uint32_t nb_candidates;
711
+ uint32_t char_width;
712
+ uint32_t char_access_width;
713
+ uint32_t original_char_width;
714
+ char both_min_score_and_min_scores;
715
+ uint32_t n_best_i_try;
716
+ uint32_t n_best_nb_tries;
717
+ float n_best_tries[3];
718
+ bjw_result *results;
719
+ bjw_result *tmp_results;
720
+ uint32_t results_decal;
721
+ t_thread_data *threads_data;
722
+ #if BJW_USE_THREADS
723
+ # ifdef _WIN32
724
+ HANDLE *threads;
725
+ # else
726
+ pthread_t *threads;
727
+ # endif
728
+ #endif
729
+
730
+ nb_runtime_threads = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 0));
731
+ nb_candidates = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 1));
732
+ char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 2));
733
+ char_access_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 3));
734
+ original_char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 4));
735
+
736
+ // Characters after 256 won't be taken into consideration for score calculation anyway, and uint8_t won't be able to represent the indices.
737
+ if (char_access_width == 1 && input_length >= 256)
738
+ input_length = 256 - 1;
739
+ else if (char_access_width == 2 && input_length >= 256 * 256)
740
+ input_length = (256 * 256) - 1;
741
+ both_min_score_and_min_scores = min_score < 0.0f && n_best_results != 0;
742
+ if (n_best_results > nb_candidates)
743
+ n_best_results = nb_candidates;
744
+
745
+ #if BJW_USE_THREADS
746
+ # ifdef _WIN32
747
+ if (!(threads = malloc(sizeof(HANDLE) * nb_runtime_threads)))
748
+ return (NULL);
749
+ # else
750
+ if (!(threads = malloc(sizeof(pthread_t) * nb_runtime_threads)))
751
+ return (NULL);
752
+ # endif
753
+ #endif
754
+
755
+ if (!(threads_data = malloc(sizeof(t_thread_data) * nb_runtime_threads)))
756
+ return (NULL);
757
+
758
+ void* (*runtime_function)(void*) = NULL;
759
+ if (char_width == 4 && char_access_width == 4)
760
+ runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint32_t;
761
+ else if (char_width == 4 && char_access_width == 2)
762
+ runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint16_t;
763
+ else if (char_width == 4 && char_access_width == 1)
764
+ runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint8_t;
765
+ else if (char_width == 2 && char_access_width == 4)
766
+ runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint32_t;
767
+ else if (char_width == 2 && char_access_width == 2)
768
+ runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint16_t;
769
+ else if (char_width == 2 && char_access_width == 1)
770
+ runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint8_t;
771
+ else if (char_width == 1 && char_access_width == 4)
772
+ runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint32_t;
773
+ else if (char_width == 1 && char_access_width == 2)
774
+ runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint16_t;
775
+ else if (char_width == 1 && char_access_width == 1)
776
+ runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint8_t;
777
+
778
+ if (n_best_results != 0)
779
+ {
780
+ if (nb_candidates > 0)
781
+ n_best_tries[0] = 1.0f - (((float)n_best_results) / nb_candidates);
782
+ else
783
+ n_best_tries[0] = -1.0f;
784
+ if (n_best_tries[0] > 0.8f)
785
+ n_best_tries[0] = 0.8f;
786
+ n_best_tries[1] = n_best_tries[0] - 0.2f;
787
+ n_best_tries[1] = n_best_tries[1] < 0.0f ? -1.0f : n_best_tries[1];
788
+ n_best_tries[2] = min_score;
789
+ n_best_nb_tries = 3;
790
+
791
+ if (n_best_tries[1] <= min_score)
792
+ {
793
+ n_best_nb_tries--;
794
+ n_best_tries[1] = min_score;
795
+ }
796
+ if (n_best_tries[0] <= min_score)
797
+ {
798
+ n_best_nb_tries--;
799
+ n_best_tries[0] = min_score;
800
+ }
801
+ }
802
+ else
803
+ {
804
+ n_best_tries[0] = min_score;
805
+ n_best_nb_tries = 1;
806
+ }
807
+
808
+ for (n_best_i_try = 0; n_best_i_try < n_best_nb_tries; n_best_i_try++)
809
+ {
810
+ for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
811
+ {
812
+ threads_data[i_thread] = (t_thread_data){
813
+ .runtime_models = runtime_model + sizeof(uint32_t) * 5,
814
+ .i_thread = i_thread,
815
+ .original_char_width = original_char_width,
816
+ .input = input,
817
+ .input_length = input_length,
818
+ .min_score = n_best_tries[n_best_i_try],
819
+ .weight = weight,
820
+ .threshold = threshold,
821
+ .both_min_score_and_min_scores = both_min_score_and_min_scores,
822
+ .results = NULL,
823
+ .nb_results = 0
824
+ };
825
+
826
+ #if BJW_USE_THREADS
827
+ # ifdef _WIN32
828
+ threads[i_thread] = CreateThread(NULL, 0, runtime_function, &(threads_data[i_thread]), 0, NULL);
829
+ # else
830
+ pthread_create(&(threads[i_thread]), NULL, runtime_function, &(threads_data[i_thread]));
831
+ # endif
832
+ #else
833
+ runtime_function(&(threads_data[i_thread]));
834
+ #endif
835
+ }
836
+
837
+ *nb_results = 0;
838
+ for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
839
+ {
840
+ #if BJW_USE_THREADS
841
+ # ifdef _WIN32
842
+ WaitForSingleObject(threads[i_thread], INFINITE);
843
+ CloseHandle(threads[i_thread]);
844
+ # else
845
+ pthread_join(threads[i_thread], NULL);
846
+ # endif
847
+ #endif
848
+ *nb_results += threads_data[i_thread].nb_results;
849
+ }
850
+
851
+ if (n_best_results == 0 || *nb_results >= n_best_results)
852
+ break ;
853
+ }
854
+
855
+ if (!(results = malloc(sizeof(bjw_result) * (*nb_results))))
856
+ return (NULL);
857
+ results_decal = 0;
858
+ for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
859
+ {
860
+ memcpy(&(results[results_decal]), threads_data[i_thread].results, sizeof(bjw_result) * threads_data[i_thread].nb_results);
861
+ free(threads_data[i_thread].results);
862
+ results_decal += threads_data[i_thread].nb_results;
863
+ }
864
+
865
+ if (n_best_results != 0)
866
+ {
867
+ qsort(results, *nb_results, sizeof(bjw_result), &sort_results_by_score);
868
+ if (*nb_results > n_best_results)
869
+ {
870
+ if (!(tmp_results = malloc(sizeof(bjw_result) * n_best_results)))
871
+ return (NULL);
872
+ memcpy(tmp_results, results, sizeof(bjw_result) * n_best_results);
873
+ free(results);
874
+ results = tmp_results;
875
+ *nb_results = n_best_results;
876
+ }
877
+ }
878
+
879
+ #if BJW_USE_THREADS
880
+ free(threads);
881
+ #endif
882
+ free(threads_data);
883
+
884
+ return (results);
885
+ }
886
+
887
+ bjw_result *bjw_jaro_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, uint32_t n_best_results, uint32_t *nb_results)
888
+ {
889
+ return (bjw_jaro_winkler_distance(runtime_model, input, input_length, min_score, -1.0f, -1.0f, n_best_results, nb_results));
890
+ }