batch_jaro_winkler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ /*
2
+ MIT License
3
+
4
+ Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ */
24
+
25
+ #ifndef BATCH_JARO_WINKLER_H
26
+ # define BATCH_JARO_WINKLER_H
27
+
28
+ # include <stdint.h>
29
+
30
+ // Set to 0 if you don't have access to POSIX threads or Windows threads
31
+ # define BJW_USE_THREADS 1
32
+
33
+ typedef struct // one scored candidate, as written by the distance computation
34
+ {
35
+ void *candidate; // points at the candidate's characters inside the model's own storage (not a separate copy)
36
+ float score; // score computed for this candidate (0.0f when no character matched)
37
+ uint32_t candidate_length; // length of the candidate, in characters
38
+ } bjw_result;
39
+
40
+ // Builds an exportable model from the candidates. You can pass the resulting buffer around, you are responsible for freeing it.
41
+ void *bjw_build_exportable_model(void **candidates, uint32_t char_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size); // NOTE(review): *res_model_size presumably receives the size of the returned buffer - confirm against the implementation
42
+
43
+ // Allocates buffers required for the runtime. You can use a runtime model multiple times.
44
+ void *bjw_build_runtime_model(void *exportable_model); // NOTE(review): the per-thread builder keeps pointers into the exportable model, so that buffer presumably has to outlive the runtime model - confirm
45
+ void bjw_free_runtime_model(void *runtime_model); // releases a runtime model built by bjw_build_runtime_model
46
+
47
+ bjw_result *bjw_jaro_winkler_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, float weight, float threshold, uint32_t n_best_results, uint32_t *nb_results); // NOTE(review): *nb_results presumably receives the number of entries in the returned array - confirm
48
+ bjw_result *bjw_jaro_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, uint32_t n_best_results, uint32_t *nb_results); // same as above without the Winkler prefix parameters
49
+
50
+ #endif
@@ -0,0 +1,98 @@
1
+ /*
2
+ MIT License
3
+
4
+ Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ */
24
+
25
+ #ifndef BATCH_JARO_WINKLER_INTERNAL_H
26
+ # define BATCH_JARO_WINKLER_INTERNAL_H
27
+
28
+ # include "batch_jaro_winkler.h"
29
+
30
+ // used for malloc, free
31
+ # include <stdlib.h>
32
+ // used for uint8_t, int16_t etc.
33
+ # include <stdint.h>
34
+ // used for ceil
35
+ # include <math.h>
+ // used for memset
+ # include <string.h>
36
+
37
+ # if BJW_USE_THREADS
38
+ # ifdef _WIN32
39
+ # include <windows.h>
40
+ # else
41
+ # include <pthread.h>
42
+ # endif
43
+ # endif
44
+
45
+ # include "uthash.h"
46
+
47
+ typedef struct s_char // hash entry mapping an original character to the model's internal representation
48
+ {
49
+ uint32_t id; // hash key: the original character widened to uint32_t
50
+ uint32_t new_representation; // value the model uses internally for that character
51
+ UT_hash_handle hh; // internal data used by uthash
52
+ } t_char;
53
+
54
+ typedef struct s_tmp_candidate_occurrences // NOTE(review): not used in the visible code; presumably a temporary per-candidate record while building a model - confirm
55
+ {
56
+ uint32_t id; // hash key (the hh handle below makes this a uthash entry); presumably a candidate index - confirm
57
+ void *occ_indexes; // NOTE(review): presumably a growable array of occurrence positions - confirm
58
+ uint32_t occ_indexes_len; // NOTE(review): presumably the number of entries in use - confirm
59
+ uint32_t occ_indexes_size; // NOTE(review): presumably the allocated capacity - confirm
60
+ // internal data used by uthash
61
+ UT_hash_handle hh;
62
+ } t_tmp_candidate_occurrences;
63
+
64
+ typedef struct s_char_occurrences // NOTE(review): not used in the visible code; presumably groups candidate occurrences per character while building a model - confirm
65
+ {
66
+ // character represented as a uint32_t so it can serve as the uthash key
67
+ uint32_t id;
68
+ uint32_t original_representation; // NOTE(review): presumably the character as it appeared in the caller's strings - confirm
69
+ t_tmp_candidate_occurrences *candidates_occurrences; // head of a uthash table of per-candidate records (the element type carries its own hh handle)
70
+ // internal data used by uthash
71
+ UT_hash_handle hh;
72
+ } t_char_occurrences;
73
+
74
+ typedef struct s_sorted_candidate // NOTE(review): not used in the visible code; presumably one candidate as handled while sorting candidates at model build time - confirm
75
+ {
76
+ uint32_t original_ind; // NOTE(review): presumably the candidate's index in the caller's array - confirm
77
+ void *candidate; // pointer to the candidate's characters
78
+ uint32_t char_width; // NOTE(review): presumably the size in bytes of one character of candidate - confirm
79
+ float min_score; // NOTE(review): presumably the candidate-specific minimum score - confirm
80
+ uint32_t candidate_length; // length of the candidate
81
+ } t_sorted_candidate;
82
+
83
+ typedef struct s_thread_data // inputs and outputs of one per-thread scoring call
84
+ {
85
+ void *runtime_models; // array of per-thread runtime models; the scoring function casts it to its own model type
86
+ uint32_t i_thread; // index of the model to use inside runtime_models
87
+ uint32_t original_char_width; // size in bytes (1, 2 or 4) of one character of input
88
+ void *input; // string to score the candidates against
89
+ uint32_t input_length; // number of characters in input
90
+ float min_score; // minimum score to report; a negative value means "use the per-candidate minimum scores"
91
+ float weight; // Winkler prefix weight; a negative value selects the plain Jaro distance
92
+ float threshold; // the prefix bonus is only applied to scores that reach this value
93
+ char both_min_score_and_min_scores; // non-zero: use the larger of min_score and the per-candidate minimum
94
+ bjw_result *results; // output: array allocated by the scoring function (NULL on allocation failure)
95
+ uint32_t nb_results; // output: number of entries written to results
96
+ } t_thread_data;
97
+
98
+ #endif
@@ -0,0 +1,578 @@
1
+ /*
2
+ MIT License
3
+
4
+ Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ */
24
+
25
+ #define BJW_SUFFIX_PASTER(name, type1, type2) name ## _ ## type1 ## _ ## type2 // pastes the three tokens into name_type1_type2
26
+ #define BJW_SUFFIX_HANDLER(name, type1, type2) BJW_SUFFIX_PASTER(name, type1, type2) // extra indirection so the type macros are expanded before being pasted
27
+ #define BJW_SUFFIX(name) BJW_SUFFIX_HANDLER(name, BJW_CHAR_TYPE, BJW_CHAR_ACCESS_TYPE) // suffixes an identifier with the two type macros, which must be defined before this point
28
+
29
+ // per-candidate working data used while finding matches; recomputed for every input
30
+ typedef struct
31
+ {
32
+ BJW_CHAR_ACCESS_TYPE candidate_length; // length of the candidate, in characters
33
+ BJW_CHAR_ACCESS_TYPE nb_matches; // number of matching characters found so far for the current input
34
+ BJW_CHAR_ACCESS_TYPE required_nb_matches; // candidates that cannot reach this many matches are skipped
35
+ // the search_range cannot be higher than max(runtime_input_len, max_candidate_len) / 2 - 1.
36
+ // NOTE(review): this comment used to say "so 1 byte is enough", but the field is declared as BJW_CHAR_ACCESS_TYPE - confirm which is intended
37
+ BJW_CHAR_ACCESS_TYPE search_range;
38
+ } BJW_SUFFIX(t_candidate_data);
39
+
40
+ // holds the match flags needed to calculate the jaro distance
41
+ typedef struct
42
+ {
43
+ uint8_t *input_flags; // input_length flags per candidate, indexed by candidate * input_length + position in the input; allocated for each scoring call
44
+ uint8_t *candidates_flags; // one flag per candidate character, indexed by candidates_decal[candidate] + position in the candidate
45
+ uint32_t *candidates_decal; // start offset of every candidate (nb_candidates + 1 entries); points into the exportable model
46
+ } BJW_SUFFIX(t_occurrences_matches);
47
+
48
+ // hash entry: maps a character to the occurrences of that character in the candidates
49
+ typedef struct
50
+ {
51
+ // character represented as an uint32 for uthash
52
+ uint32_t id;
53
+ // packed records, one per matching candidate: a uint32_t candidate index, a BJW_CHAR_ACCESS_TYPE count, then that many BJW_CHAR_ACCESS_TYPE positions
54
+ uint8_t *occurrences;
55
+ // number of occurrences to skip per candidate (one counter per record in occurrences)
56
+ BJW_CHAR_ACCESS_TYPE *nb_already_considered;
57
+ uint32_t nb_matching_candidates; // number of records stored in occurrences
58
+ // internal data used by uthash
59
+ UT_hash_handle hh;
60
+ } BJW_SUFFIX(t_char_matches);
61
+
62
+ typedef struct // backing storage for the t_char_matches hash entries of one runtime model
63
+ {
64
+ BJW_SUFFIX(t_char_matches) *all_char_matches; // array holding every hash entry, one per distinct character
65
+ BJW_CHAR_ACCESS_TYPE *all_nb_already_considered; // single array that every entry's nb_already_considered points into
66
+ uint32_t all_nb_candidate_occurrences; // number of elements in all_nb_already_considered
67
+ } BJW_SUFFIX(t_char_matches_data);
68
+
69
+ typedef struct // everything one thread needs to score an input against its share of the candidates
70
+ {
71
+ uint32_t nb_candidates; // number of candidates handled by this model
72
+ uint32_t total_candidates_lengths; // sum of the candidates' lengths, in characters
73
+ BJW_SUFFIX(t_candidate_data) *candidates_data; // owned: per-candidate working data, nb_candidates entries
74
+ void *original_candidates; // candidates in their original character width (same buffer as candidates when no originals are stored); points into the exportable model
75
+ BJW_CHAR_TYPE *candidates; // candidates in the model's character type; points into the exportable model
76
+ uint32_t *min_scores; // per-candidate minimum scores scaled to the uint32_t range, or NULL when the model has none; points into the exportable model
77
+ BJW_SUFFIX(t_occurrences_matches) occurrences_matches; // match flags and candidate offsets
78
+ t_char *original_chars_to_new; // uthash head: original character -> model character (stays NULL when no originals are stored)
79
+ t_char *all_original_chars_to_new; // owned: storage for the entries of original_chars_to_new
80
+ BJW_SUFFIX(t_char_matches) *char_matches; // uthash head: model character -> occurrences in the candidates
81
+ BJW_SUFFIX(t_char_matches_data) char_matches_data; // owned: storage for the entries of char_matches
82
+ } BJW_SUFFIX(t_runtime_model);
83
+
84
+ static void BJW_SUFFIX(free_runtime_model_for_thread)
85
+ (BJW_SUFFIX(t_runtime_model) *runtime_model)
86
+ {
87
+ HASH_CLEAR(hh, runtime_model->char_matches);
88
+ HASH_CLEAR(hh, runtime_model->original_chars_to_new);
89
+ free(runtime_model->candidates_data);
90
+ runtime_model->candidates_data = NULL;
91
+ free(runtime_model->occurrences_matches.candidates_flags);
92
+ runtime_model->occurrences_matches.candidates_flags = NULL;
93
+ free(runtime_model->char_matches_data.all_char_matches);
94
+ runtime_model->char_matches_data.all_char_matches = NULL;
95
+ free(runtime_model->char_matches_data.all_nb_already_considered);
96
+ runtime_model->char_matches_data.all_nb_already_considered = NULL;
97
+ free(runtime_model->all_original_chars_to_new);
98
+ runtime_model->all_original_chars_to_new = NULL;
99
+ }
100
+
101
+ static char BJW_SUFFIX(build_runtime_model_for_thread)
102
+ (uint8_t *exportable_model_head, uint32_t original_char_width, BJW_SUFFIX(t_runtime_model) *runtime_model)
103
+ {
104
+ uint32_t nb_candidates;
105
+ uint32_t total_candidates_lengths;
106
+ char min_scores_present;
107
+ uint32_t nb_char_matches;
108
+ uint32_t nb_candidate_occurrences;
109
+ uint32_t store_original_candidates;
110
+ void *original_candidates;
111
+ BJW_CHAR_TYPE *candidates;
112
+ uint32_t *min_scores;
113
+ void *original_chars;
114
+ t_char *original_char_match;
115
+ BJW_CHAR_TYPE *chars;
116
+ uint32_t *chars_occurrences_decals;
117
+ uint32_t *nb_candidates_per_char_match;
118
+ uint32_t *candidates_decal;
119
+ uint8_t *occurrences;
120
+ uint32_t i_char;
121
+ uint32_t nb_already_considered_decal;
122
+ BJW_SUFFIX(t_char_matches) *match;
123
+
124
+ nb_candidates = *((uint32_t*)exportable_model_head);
125
+ exportable_model_head += sizeof(uint32_t);
126
+ total_candidates_lengths = *((uint32_t*)exportable_model_head);
127
+ exportable_model_head += sizeof(uint32_t);
128
+ min_scores_present = *((uint32_t*)exportable_model_head) ? 1 : 0;
129
+ exportable_model_head += sizeof(uint32_t);
130
+ nb_char_matches = *((uint32_t*)exportable_model_head);
131
+ exportable_model_head += sizeof(uint32_t);
132
+ nb_candidate_occurrences = *((uint32_t*)exportable_model_head);
133
+ exportable_model_head += sizeof(uint32_t);
134
+ store_original_candidates = *((uint32_t*)exportable_model_head);
135
+ exportable_model_head += sizeof(uint32_t);
136
+ min_scores = min_scores_present ? (uint32_t*)exportable_model_head : NULL;
137
+ exportable_model_head += sizeof(uint32_t) * nb_candidates * min_scores_present;
138
+ chars_occurrences_decals = (uint32_t*)exportable_model_head;
139
+ exportable_model_head += sizeof(uint32_t) * nb_char_matches;
140
+ nb_candidates_per_char_match = (uint32_t*)exportable_model_head;
141
+ exportable_model_head += sizeof(uint32_t) * nb_char_matches;
142
+ candidates_decal = (uint32_t*)exportable_model_head;
143
+ exportable_model_head += sizeof(uint32_t) * (nb_candidates + 1);
144
+ original_candidates = exportable_model_head;
145
+ exportable_model_head += original_char_width * total_candidates_lengths * store_original_candidates;
146
+ candidates = (BJW_CHAR_TYPE*)exportable_model_head;
147
+ exportable_model_head += sizeof(BJW_CHAR_TYPE) * total_candidates_lengths;
148
+ original_chars = exportable_model_head;
149
+ exportable_model_head += original_char_width * nb_char_matches * store_original_candidates;
150
+ chars = (BJW_CHAR_TYPE*)exportable_model_head;
151
+ exportable_model_head += sizeof(BJW_CHAR_TYPE) * nb_char_matches;
152
+ occurrences = (uint8_t*)exportable_model_head;
153
+
154
+ runtime_model->nb_candidates = nb_candidates;
155
+ runtime_model->total_candidates_lengths = total_candidates_lengths;
156
+ runtime_model->candidates_data = malloc(sizeof(BJW_SUFFIX(t_candidate_data)) * nb_candidates);
157
+ runtime_model->original_candidates = store_original_candidates ? original_candidates : candidates;
158
+ runtime_model->candidates = candidates;
159
+ runtime_model->min_scores = min_scores;
160
+ runtime_model->occurrences_matches.candidates_flags = malloc(sizeof(uint8_t) * total_candidates_lengths);
161
+ runtime_model->occurrences_matches.candidates_decal = candidates_decal;
162
+ // important to set to NULL for uthash
163
+ runtime_model->original_chars_to_new = NULL;
164
+ runtime_model->all_original_chars_to_new = store_original_candidates ? malloc(sizeof(t_char) * nb_char_matches) : NULL;
165
+ // important to set to NULL for uthash
166
+ runtime_model->char_matches = NULL;
167
+ runtime_model->char_matches_data.all_char_matches = malloc(sizeof(BJW_SUFFIX(t_char_matches)) * nb_char_matches);
168
+ runtime_model->char_matches_data.all_nb_already_considered = malloc(sizeof(BJW_CHAR_ACCESS_TYPE) * nb_candidate_occurrences);
169
+ runtime_model->char_matches_data.all_nb_candidate_occurrences = nb_candidate_occurrences;
170
+
171
+ if (!runtime_model->candidates_data || !runtime_model->occurrences_matches.candidates_flags
172
+ || !runtime_model->char_matches_data.all_char_matches || !runtime_model->char_matches_data.all_nb_already_considered
173
+ || (store_original_candidates && !runtime_model->all_original_chars_to_new)
174
+ )
175
+ {
176
+ BJW_SUFFIX(free_runtime_model_for_thread)(runtime_model);
177
+ return (0);
178
+ }
179
+
180
+ nb_already_considered_decal = 0;
181
+ for (i_char = 0; i_char < nb_char_matches; i_char++)
182
+ {
183
+ if (store_original_candidates)
184
+ {
185
+ original_char_match = &(runtime_model->all_original_chars_to_new[i_char]);
186
+ if (original_char_width == 4)
187
+ original_char_match->id = (uint32_t)(((uint32_t*)original_chars)[i_char]);
188
+ else if (original_char_width == 2)
189
+ original_char_match->id = (uint32_t)(((uint16_t*)original_chars)[i_char]);
190
+ else
191
+ original_char_match->id = (uint32_t)(((uint8_t*)original_chars)[i_char]);
192
+ original_char_match->new_representation = chars[i_char];
193
+ HASH_ADD(hh, runtime_model->original_chars_to_new, id, sizeof(uint32_t), original_char_match);
194
+ }
195
+
196
+ match = &(runtime_model->char_matches_data.all_char_matches[i_char]);
197
+ match->id = chars[i_char];
198
+ match->occurrences = occurrences + chars_occurrences_decals[i_char];
199
+ match->nb_already_considered = runtime_model->char_matches_data.all_nb_already_considered + nb_already_considered_decal;
200
+ match->nb_matching_candidates = nb_candidates_per_char_match[i_char];
201
+ HASH_ADD(hh, runtime_model->char_matches, id, sizeof(uint32_t), match);
202
+ nb_already_considered_decal += nb_candidates_per_char_match[i_char];
203
+ }
204
+
205
+ return (1);
206
+ }
207
+
208
+ static void BJW_SUFFIX(free_runtime_model)
209
+ (void *runtime_model)
210
+ {
211
+ BJW_SUFFIX(t_runtime_model) *runtime_models;
212
+ uint32_t i_thread;
213
+ uint32_t nb_runtime_threads;
214
+
215
+ nb_runtime_threads = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 0));
216
+ runtime_models = (BJW_SUFFIX(t_runtime_model)*)(runtime_model + sizeof(uint32_t) * 5);
217
+ for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
218
+ BJW_SUFFIX(free_runtime_model_for_thread)(&(runtime_models[i_thread]));
219
+ free(runtime_model);
220
+ }
221
+
222
+ static void *BJW_SUFFIX(build_runtime_model)
223
+ (uint8_t *exportable_model_head, uint32_t nb_runtime_threads, uint32_t nb_candidates, uint32_t original_char_width, uint32_t *model_size_per_thread)
224
+ {
225
+ uint32_t i_thread;
226
+ uint32_t i;
227
+ void *res;
228
+ BJW_SUFFIX(t_runtime_model) *runtime_models;
229
+
230
+ res = malloc(sizeof(BJW_SUFFIX(t_runtime_model)) * nb_runtime_threads + sizeof(uint32_t) * 5);
231
+ if (!res)
232
+ return (NULL);
233
+ *((uint32_t*)(res + sizeof(uint32_t) * 0)) = nb_runtime_threads;
234
+ *((uint32_t*)(res + sizeof(uint32_t) * 1)) = nb_candidates;
235
+ *((uint32_t*)(res + sizeof(uint32_t) * 2)) = sizeof(BJW_CHAR_TYPE);
236
+ *((uint32_t*)(res + sizeof(uint32_t) * 3)) = sizeof(BJW_CHAR_ACCESS_TYPE);
237
+ *((uint32_t*)(res + sizeof(uint32_t) * 4)) = original_char_width;
238
+ runtime_models = res + sizeof(uint32_t) * 5;
239
+
240
+ for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
241
+ {
242
+ if (!BJW_SUFFIX(build_runtime_model_for_thread)(exportable_model_head, original_char_width, &(runtime_models[i_thread])))
243
+ {
244
+ for (i = 0; i < i_thread; i++)
245
+ BJW_SUFFIX(free_runtime_model_for_thread)(&(runtime_models[i]));
246
+ free(res);
247
+ return (NULL);
248
+ }
249
+ exportable_model_head += model_size_per_thread[i_thread];
250
+ }
251
+
252
+ return (res);
253
+ }
254
+
255
+ static void BJW_SUFFIX(populate_candidates_data)
256
+ (BJW_SUFFIX(t_candidate_data) *candidates_data, uint32_t *candidates_decal, uint32_t nb_candidates, float min_score, uint32_t *min_scores, uint32_t input_length, float weight, char both_min_score_and_min_scores)
257
+ {
258
+ uint32_t i_candidate;
259
+ float candidate_min_score;
260
+ BJW_CHAR_ACCESS_TYPE candidate_length;
261
+ BJW_CHAR_ACCESS_TYPE required_nb_matches;
262
+ BJW_CHAR_ACCESS_TYPE search_range;
263
+ float float_required_nb_matches;
264
+ float bottom_part;
265
+
266
+ for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
267
+ {
268
+ candidate_min_score = min_score;
269
+ if (min_score < 0.0f || (both_min_score_and_min_scores && ((float)min_scores[i_candidate]) / UINT32_MAX > candidate_min_score))
270
+ candidate_min_score = ((float)min_scores[i_candidate]) / UINT32_MAX;
271
+
272
+ candidate_length = candidates_decal[i_candidate + 1] - candidates_decal[i_candidate];
273
+ if (candidate_length < 1)
274
+ {
275
+ candidates_data[i_candidate] = (BJW_SUFFIX(t_candidate_data)){
276
+ .candidate_length = candidate_length,
277
+ .nb_matches = 0,
278
+ .required_nb_matches = 0,
279
+ .search_range = 1
280
+ };
281
+ continue ;
282
+ }
283
+
284
+ // equations solved using: https://www.mathpapa.com/equation-solver
285
+ // jaro distance
286
+ if (weight < 0.0f)
287
+ float_required_nb_matches = (3.0f * candidate_min_score * candidate_length * input_length - (candidate_length * input_length)) / (candidate_length + input_length);
288
+ else
289
+ {
290
+ // jaro winkler distance. Assume all prefixing characters match
291
+ bottom_part = -(4.0f * candidate_length * weight) - (4.0f * input_length * weight) + candidate_length + input_length;
292
+ if (bottom_part == 0.0f || bottom_part == -0.0f)
293
+ float_required_nb_matches = candidate_length > input_length ? candidate_length + 1 : input_length + 1;
294
+ else
295
+ {
296
+ float_required_nb_matches = (
297
+ (3.0f * candidate_min_score * candidate_length * input_length) -
298
+ (8.0f * weight * candidate_length * input_length) -
299
+ (candidate_length * input_length)
300
+ ) / bottom_part;
301
+ }
302
+ }
303
+ if (float_required_nb_matches < 0.0f)
304
+ float_required_nb_matches = 0.0f;
305
+ required_nb_matches = (BJW_CHAR_ACCESS_TYPE)(ceil(float_required_nb_matches));
306
+
307
+ search_range = (input_length > candidate_length ? input_length : candidate_length) / 2;
308
+ search_range = search_range <= 1 ? 0 : search_range - 1;
309
+
310
+ candidates_data[i_candidate] = (BJW_SUFFIX(t_candidate_data)){
311
+ .candidate_length = candidate_length,
312
+ .nb_matches = 0,
313
+ .required_nb_matches = required_nb_matches,
314
+ .search_range = search_range
315
+ };
316
+ }
317
+ }
318
+
319
+ static void BJW_SUFFIX(find_occurrences_matches)
320
+ (BJW_SUFFIX(t_runtime_model) *runtime_model, BJW_SUFFIX(t_char_matches) *match, uint32_t remaining_chars, uint32_t i_char, uint32_t input_length)
321
+ {
322
+ uint8_t *occurrences_head;
323
+ uint32_t i_candidate;
324
+ uint32_t candidate_ind;
325
+ BJW_CHAR_ACCESS_TYPE nb_occurrences;
326
+ BJW_CHAR_ACCESS_TYPE *occurrences;
327
+ BJW_SUFFIX(t_candidate_data) *candidate_data;
328
+ uint32_t i_occurrence;
329
+ uint32_t low_search_range;
330
+ uint32_t high_search_range;
331
+ uint32_t candidate_decal;
332
+
333
+ occurrences_head = match->occurrences;
334
+ for (i_candidate = 0; i_candidate < match->nb_matching_candidates; i_candidate++)
335
+ {
336
+ candidate_ind = *((uint32_t*)occurrences_head);
337
+ candidate_data = &(runtime_model->candidates_data[candidate_ind]);
338
+
339
+ nb_occurrences = *((BJW_CHAR_ACCESS_TYPE*)(occurrences_head + sizeof(uint32_t)));
340
+ occurrences = (BJW_CHAR_ACCESS_TYPE*)(occurrences_head + sizeof(uint32_t) + sizeof(BJW_CHAR_ACCESS_TYPE));
341
+ occurrences_head += sizeof(uint32_t) + sizeof(BJW_CHAR_ACCESS_TYPE) + sizeof(BJW_CHAR_ACCESS_TYPE) * nb_occurrences;
342
+
343
+ // assuming all remaining chars match, is it enough to get enough matches?
344
+ if (candidate_data->nb_matches + remaining_chars < candidate_data->required_nb_matches)
345
+ continue ;
346
+
347
+ i_occurrence = match->nb_already_considered[i_candidate];
348
+ // if we already tried all occurrences
349
+ if (i_occurrence >= nb_occurrences)
350
+ continue ;
351
+
352
+ low_search_range = i_char < candidate_data->search_range ? 0 : i_char - candidate_data->search_range;
353
+ high_search_range = i_char + candidate_data->search_range;
354
+ while (i_occurrence < nb_occurrences && occurrences[i_occurrence] < low_search_range)
355
+ i_occurrence++;
356
+ // don't increment when i_occurrence >= nb_occurrences, to prevent overflow
357
+ if (i_occurrence >= nb_occurrences || occurrences[i_occurrence] <= high_search_range)
358
+ match->nb_already_considered[i_candidate] = i_occurrence + (i_occurrence < nb_occurrences);
359
+ if (i_occurrence < nb_occurrences && occurrences[i_occurrence] <= high_search_range)
360
+ {
361
+ candidate_data->nb_matches++;
362
+ runtime_model->occurrences_matches.input_flags[candidate_ind * input_length + i_char] = 1;
363
+ candidate_decal = runtime_model->occurrences_matches.candidates_decal[candidate_ind];
364
+ runtime_model->occurrences_matches.candidates_flags[candidate_decal + occurrences[i_occurrence]] = 1;
365
+ }
366
+ }
367
+ }
368
+
369
+ static uint32_t BJW_SUFFIX(get_nb_transpositions)
370
+ (BJW_CHAR_TYPE *input, uint8_t *input_flags, BJW_CHAR_TYPE *candidate, uint8_t *candidate_flags, uint32_t nb_matches)
371
+ {
372
+ uint32_t trans_count;
373
+ uint32_t input_ind;
374
+ uint32_t candidate_ind;
375
+ uint32_t i_match;
376
+
377
+ trans_count = 0;
378
+ input_ind = 0;
379
+ candidate_ind = 0;
380
+ for (i_match = 0; i_match < nb_matches; i_match++)
381
+ {
382
+ // Go to next ok input flag
383
+ while (!input_flags[input_ind])
384
+ input_ind++;
385
+ // Go to next ok candidate flag
386
+ while (!candidate_flags[candidate_ind])
387
+ candidate_ind++;
388
+ if (input[input_ind] != candidate[candidate_ind])
389
+ trans_count++;
390
+ input_ind++;
391
+ candidate_ind++;
392
+ }
393
+ return (trans_count);
394
+ }
395
+
396
+ static uint32_t BJW_SUFFIX(jaro_winkler_distance_from_flags)
397
+ (BJW_SUFFIX(t_runtime_model) *runtime_model, uint32_t original_char_width, BJW_CHAR_TYPE *input, uint32_t input_length, float min_score, float weight, float threshold, char both_min_score_and_min_scores, bjw_result *results)
398
+ {
399
+ uint32_t i_candidate;
400
+ BJW_CHAR_TYPE *candidate;
401
+ BJW_SUFFIX(t_candidate_data) *candidate_data;
402
+ uint32_t candidate_decal;
403
+ uint32_t nb_transpositions;
404
+ float score;
405
+ float candidate_min_score;
406
+ uint32_t nb_results;
407
+ uint32_t i_char;
408
+ uint32_t prefix_length;
409
+
410
+ nb_results = 0;
411
+ for (i_candidate = 0; i_candidate < runtime_model->nb_candidates; i_candidate++)
412
+ {
413
+ candidate_data = &(runtime_model->candidates_data[i_candidate]);
414
+
415
+ if (candidate_data->nb_matches < candidate_data->required_nb_matches)
416
+ continue ;
417
+
418
+ candidate_min_score = min_score;
419
+ if (min_score < 0.0f || (both_min_score_and_min_scores && ((float)runtime_model->min_scores[i_candidate]) / UINT32_MAX > candidate_min_score))
420
+ candidate_min_score = ((float)runtime_model->min_scores[i_candidate]) / UINT32_MAX;
421
+
422
+ candidate_decal = runtime_model->occurrences_matches.candidates_decal[i_candidate];
423
+ candidate = &(runtime_model->candidates[candidate_decal]);
424
+
425
+ if (candidate_data->nb_matches == 0)
426
+ {
427
+ if (candidate_min_score <= 0.0f)
428
+ {
429
+ results[nb_results].candidate = runtime_model->original_candidates + original_char_width * candidate_decal;
430
+ results[nb_results].score = 0.0f;
431
+ results[nb_results].candidate_length = candidate_data->candidate_length;
432
+ nb_results++;
433
+ }
434
+ continue ;
435
+ }
436
+
437
+ nb_transpositions = BJW_SUFFIX(get_nb_transpositions)(
438
+ input, &(runtime_model->occurrences_matches.input_flags[i_candidate * input_length]),
439
+ candidate, &(runtime_model->occurrences_matches.candidates_flags[candidate_decal]),
440
+ candidate_data->nb_matches
441
+ );
442
+ nb_transpositions /= 2;
443
+
444
+ score =
445
+ candidate_data->nb_matches / ((float)input_length) +
446
+ candidate_data->nb_matches / ((float)candidate_data->candidate_length) +
447
+ ((float)(candidate_data->nb_matches - nb_transpositions)) / ((float)candidate_data->nb_matches);
448
+ score /= 3.0f;
449
+
450
+ if (weight >= 0.0f && score >= threshold)
451
+ {
452
+ prefix_length = candidate_data->candidate_length < input_length ? candidate_data->candidate_length : input_length;
453
+ prefix_length = prefix_length > 4 ? 4 : prefix_length;
454
+ for (i_char = 0; i_char < prefix_length && input[i_char] == candidate[i_char]; i_char++){}
455
+ score = score + (i_char * weight * (1.0f - score));
456
+ }
457
+
458
+ if (score < candidate_min_score)
459
+ continue ;
460
+
461
+ results[nb_results].candidate = runtime_model->original_candidates + original_char_width * candidate_decal;
462
+ results[nb_results].score = score;
463
+ results[nb_results].candidate_length = candidate_data->candidate_length;
464
+ nb_results++;
465
+ }
466
+ return (nb_results);
467
+ }
468
+
469
+ static void *BJW_SUFFIX(build_compressed_input)
470
+ (BJW_SUFFIX(t_runtime_model) *runtime_model, void *input, uint32_t input_length, uint32_t original_char_width)
471
+ {
472
+ BJW_CHAR_TYPE *compressed_input;
473
+ uint32_t i_char;
474
+ t_char *original_char_match;
475
+ uint32_t key;
476
+
477
+ if (sizeof(BJW_CHAR_TYPE) == original_char_width)
478
+ return (input);
479
+ if (!(compressed_input = malloc(sizeof(BJW_CHAR_TYPE) * input_length)))
480
+ return (NULL);
481
+ for (i_char = 0; i_char < input_length; i_char++)
482
+ {
483
+ if (original_char_width == 4)
484
+ key = (uint32_t)(((uint32_t*)input)[i_char]);
485
+ else if (original_char_width == 2)
486
+ key = (uint32_t)(((uint16_t*)input)[i_char]);
487
+ else
488
+ key = (uint32_t)(((uint8_t*)input)[i_char]);
489
+ HASH_FIND(hh, runtime_model->original_chars_to_new, &key, sizeof(uint32_t), original_char_match);
490
+ if (!original_char_match)
491
+ compressed_input[i_char] = 0;
492
+ else
493
+ compressed_input[i_char] = original_char_match->new_representation;
494
+ }
495
+ return (compressed_input);
496
+ }
497
+
498
+ static void *BJW_SUFFIX(jaro_winkler_distance_for_thread)
499
+ (void *thread_data_raw)
500
+ {
501
+ t_thread_data *thread_data;
502
+ BJW_SUFFIX(t_runtime_model) *runtime_model;
503
+ void *input;
504
+ uint32_t input_length;
505
+ float min_score;
506
+ float weight;
507
+ float threshold;
508
+ char both_min_score_and_min_scores;
509
+ uint32_t i_char;
510
+ uint32_t key;
511
+ BJW_SUFFIX(t_char_matches) *match;
512
+ BJW_CHAR_TYPE *compressed_input;
513
+
514
+ thread_data = (t_thread_data*)thread_data_raw;
515
+
516
+ runtime_model = &(((BJW_SUFFIX(t_runtime_model)*)thread_data->runtime_models)[thread_data->i_thread]);
517
+ input = thread_data->input;
518
+ input_length = thread_data->input_length;
519
+ min_score = thread_data->min_score;
520
+ weight = thread_data->weight;
521
+ threshold = thread_data->threshold;
522
+ both_min_score_and_min_scores = thread_data->both_min_score_and_min_scores;
523
+
524
+ compressed_input = BJW_SUFFIX(build_compressed_input)(runtime_model, input, input_length, thread_data->original_char_width);
525
+ if (!compressed_input)
526
+ return (NULL);
527
+
528
+ if (!runtime_model->min_scores && min_score < 0.0f)
529
+ min_score = 0.0f;
530
+ both_min_score_and_min_scores = both_min_score_and_min_scores && runtime_model->min_scores;
531
+ if (!(thread_data->results = malloc(sizeof(bjw_result) * runtime_model->nb_candidates)))
532
+ {
533
+ if (compressed_input != input)
534
+ free(compressed_input);
535
+ return (NULL);
536
+ }
537
+ runtime_model->occurrences_matches.input_flags = malloc(sizeof(uint8_t) * input_length * runtime_model->nb_candidates);
538
+ if (!runtime_model->occurrences_matches.input_flags)
539
+ {
540
+ free(thread_data->results);
541
+ thread_data->results = NULL;
542
+ if (compressed_input != input)
543
+ free(compressed_input);
544
+ return (NULL);
545
+ }
546
+ bzero(runtime_model->occurrences_matches.input_flags, sizeof(uint8_t) * input_length * runtime_model->nb_candidates);
547
+ bzero(runtime_model->occurrences_matches.candidates_flags, sizeof(uint8_t) * runtime_model->total_candidates_lengths);
548
+ bzero(runtime_model->char_matches_data.all_nb_already_considered, sizeof(BJW_CHAR_ACCESS_TYPE) * runtime_model->char_matches_data.all_nb_candidate_occurrences);
549
+
550
+ BJW_SUFFIX(populate_candidates_data)(
551
+ runtime_model->candidates_data, runtime_model->occurrences_matches.candidates_decal, runtime_model->nb_candidates,
552
+ min_score, runtime_model->min_scores, input_length, weight, both_min_score_and_min_scores
553
+ );
554
+
555
+ // we populate the flags
556
+ for (i_char = 0; i_char < input_length; i_char++)
557
+ {
558
+ key = compressed_input[i_char];
559
+ HASH_FIND(hh, runtime_model->char_matches, &key, sizeof(uint32_t), match);
560
+ // no occurrences for this character
561
+ if (!match)
562
+ continue ;
563
+ BJW_SUFFIX(find_occurrences_matches)(runtime_model, match, input_length - i_char, i_char, input_length);
564
+ }
565
+
566
+ // we use the flags to calculate the rest of the jaro winkler distance
567
+ thread_data->nb_results = BJW_SUFFIX(jaro_winkler_distance_from_flags)(
568
+ runtime_model, thread_data->original_char_width, compressed_input, input_length, min_score, weight, threshold, both_min_score_and_min_scores, thread_data->results
569
+ );
570
+
571
+ free(runtime_model->occurrences_matches.input_flags);
572
+ runtime_model->occurrences_matches.input_flags = NULL;
573
+
574
+ if (compressed_input != input)
575
+ free(compressed_input);
576
+
577
+ return (NULL);
578
+ }