batch_jaro_winkler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/batch_jaro_winkler/batch_jaro_winkler.c +104 -0
- data/ext/batch_jaro_winkler/ext/LICENSE.uthash.txt +20 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.c +890 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.h +50 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_internal.h +98 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_runtime.h +578 -0
- data/ext/batch_jaro_winkler/ext/uthash.h +1230 -0
- data/ext/batch_jaro_winkler/extconf.rb +5 -0
- data/lib/batch_jaro_winkler.rb +242 -0
- data/lib/batch_jaro_winkler/version.rb +3 -0
- metadata +77 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
/*
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#ifndef BATCH_JARO_WINKLER_H
|
26
|
+
# define BATCH_JARO_WINKLER_H
|
27
|
+
|
28
|
+
# include <stdint.h>
|
29
|
+
|
30
|
+
// Set to 0 if you don't have access to POSIX threads or Windows threads
|
31
|
+
# define BJW_USE_THREADS 1
|
32
|
+
|
33
|
+
// One scored candidate, as written by the distance functions.
typedef struct
{
	// start of the candidate's characters
	void		*candidate;
	// score computed for this candidate
	float		score;
	// length of the candidate, in characters
	uint32_t	candidate_length;
}				bjw_result;
|
39
|
+
|
40
|
+
// You can pass the resulting buffer around, you are responsible for freeing it.
|
41
|
+
void *bjw_build_exportable_model(void **candidates, uint32_t char_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size);
|
42
|
+
|
43
|
+
// Allocates buffers required for the runtime. You can use a runtime model multiple times.
|
44
|
+
void *bjw_build_runtime_model(void *exportable_model);
|
45
|
+
void bjw_free_runtime_model(void *runtime_model);
|
46
|
+
|
47
|
+
bjw_result *bjw_jaro_winkler_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, float weight, float threshold, uint32_t n_best_results, uint32_t *nb_results);
|
48
|
+
bjw_result *bjw_jaro_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, uint32_t n_best_results, uint32_t *nb_results);
|
49
|
+
|
50
|
+
#endif
|
@@ -0,0 +1,98 @@
|
|
1
|
+
/*
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#ifndef BATCH_JARO_WINKLER_INTERNAL_H
|
26
|
+
# define BATCH_JARO_WINKLER_INTERNAL_H
|
27
|
+
|
28
|
+
# include "batch_jaro_winkler.h"
|
29
|
+
|
30
|
+
// used for malloc, calloc, free
# include <stdlib.h>
// used for memset
# include <string.h>
// used for uint8_t, int16_t etc.
# include <stdint.h>
// used for ceil
# include <math.h>
|
36
|
+
|
37
|
+
# if BJW_USE_THREADS
|
38
|
+
# ifdef _WIN32
|
39
|
+
# include <windows.h>
|
40
|
+
# else
|
41
|
+
# include <pthread.h>
|
42
|
+
# endif
|
43
|
+
# endif
|
44
|
+
|
45
|
+
# include "uthash.h"
|
46
|
+
|
47
|
+
typedef struct s_char
|
48
|
+
{
|
49
|
+
uint32_t id;
|
50
|
+
uint32_t new_representation;
|
51
|
+
UT_hash_handle hh;
|
52
|
+
} t_char;
|
53
|
+
|
54
|
+
typedef struct s_tmp_candidate_occurrences
|
55
|
+
{
|
56
|
+
uint32_t id;
|
57
|
+
void *occ_indexes;
|
58
|
+
uint32_t occ_indexes_len;
|
59
|
+
uint32_t occ_indexes_size;
|
60
|
+
// internal data used by uthash
|
61
|
+
UT_hash_handle hh;
|
62
|
+
} t_tmp_candidate_occurrences;
|
63
|
+
|
64
|
+
typedef struct s_char_occurrences
|
65
|
+
{
|
66
|
+
// character represented as an int for uthash
|
67
|
+
uint32_t id;
|
68
|
+
uint32_t original_representation;
|
69
|
+
t_tmp_candidate_occurrences *candidates_occurrences;
|
70
|
+
// internal data used by uthash
|
71
|
+
UT_hash_handle hh;
|
72
|
+
} t_char_occurrences;
|
73
|
+
|
74
|
+
// A candidate string together with the metadata that travels with it.
// NOTE(review): the name suggests it is used while sorting candidates at build
// time; the sorting code is not visible here.
typedef struct	s_sorted_candidate
{
	// position of the candidate in the caller's input array
	uint32_t	original_ind;
	// the candidate's characters
	void		*candidate;
	// width of one character, in bytes
	uint32_t	char_width;
	// minimum score attached to this candidate
	float		min_score;
	// length of the candidate, in characters
	uint32_t	candidate_length;
}				t_sorted_candidate;
|
82
|
+
|
83
|
+
typedef struct s_thread_data
|
84
|
+
{
|
85
|
+
void *runtime_models;
|
86
|
+
uint32_t i_thread;
|
87
|
+
uint32_t original_char_width;
|
88
|
+
void *input;
|
89
|
+
uint32_t input_length;
|
90
|
+
float min_score;
|
91
|
+
float weight;
|
92
|
+
float threshold;
|
93
|
+
char both_min_score_and_min_scores;
|
94
|
+
bjw_result *results;
|
95
|
+
uint32_t nb_results;
|
96
|
+
} t_thread_data;
|
97
|
+
|
98
|
+
#endif
|
@@ -0,0 +1,578 @@
|
|
1
|
+
/*
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
23
|
+
*/
|
24
|
+
|
25
|
+
// Name mangling for the templated runtime: BJW_SUFFIX(name) expands to
// name_<BJW_CHAR_TYPE>_<BJW_CHAR_ACCESS_TYPE>.
// The intermediate HANDLER level exists so that BJW_CHAR_TYPE and
// BJW_CHAR_ACCESS_TYPE are macro-expanded before the ## pasting takes place.
#define BJW_SUFFIX_PASTER(name, type1, type2) name ## _ ## type1 ## _ ## type2
#define BJW_SUFFIX_HANDLER(name, type1, type2) BJW_SUFFIX_PASTER(name, type1, type2)
#define BJW_SUFFIX(name) BJW_SUFFIX_HANDLER(name, BJW_CHAR_TYPE, BJW_CHAR_ACCESS_TYPE)
|
28
|
+
|
29
|
+
// this represents the data needed for a candidate when finding matches
|
30
|
+
typedef struct
|
31
|
+
{
|
32
|
+
BJW_CHAR_ACCESS_TYPE candidate_length;
|
33
|
+
BJW_CHAR_ACCESS_TYPE nb_matches;
|
34
|
+
BJW_CHAR_ACCESS_TYPE required_nb_matches;
|
35
|
+
// the search_range cannot be higher than max(runtime_input_len, max_candidate_len) / 2 - 1,
|
36
|
+
// so 1 byte is enough
|
37
|
+
BJW_CHAR_ACCESS_TYPE search_range;
|
38
|
+
} BJW_SUFFIX(t_candidate_data);
|
39
|
+
|
40
|
+
// Match flags needed to compute the jaro distance.
typedef struct
{
	// one flag per (candidate, input character), indexed
	// [candidate_ind * input_length + i_char]; allocated for each query
	uint8_t		*input_flags;
	// one flag per candidate character, indexed [candidates_decal[c] + position]
	uint8_t		*candidates_flags;
	// start offset of each candidate in the concatenated candidates
	// (nb_candidates + 1 entries, so consecutive entries give the lengths)
	uint32_t	*candidates_decal;
}				BJW_SUFFIX(t_occurrences_matches);
|
47
|
+
|
48
|
+
// map from character to matching candidate occurrences of the character
|
49
|
+
typedef struct
|
50
|
+
{
|
51
|
+
// character represented as an uint32 for uthash
|
52
|
+
uint32_t id;
|
53
|
+
// candidate character occurrences as explained above
|
54
|
+
uint8_t *occurrences;
|
55
|
+
// number of occurrences to skip per candidate
|
56
|
+
BJW_CHAR_ACCESS_TYPE *nb_already_considered;
|
57
|
+
uint32_t nb_matching_candidates;
|
58
|
+
// internal data used by uthash
|
59
|
+
UT_hash_handle hh;
|
60
|
+
} BJW_SUFFIX(t_char_matches);
|
61
|
+
|
62
|
+
typedef struct
|
63
|
+
{
|
64
|
+
BJW_SUFFIX(t_char_matches) *all_char_matches;
|
65
|
+
BJW_CHAR_ACCESS_TYPE *all_nb_already_considered;
|
66
|
+
uint32_t all_nb_candidate_occurrences;
|
67
|
+
} BJW_SUFFIX(t_char_matches_data);
|
68
|
+
|
69
|
+
typedef struct
|
70
|
+
{
|
71
|
+
uint32_t nb_candidates;
|
72
|
+
uint32_t total_candidates_lengths;
|
73
|
+
BJW_SUFFIX(t_candidate_data) *candidates_data;
|
74
|
+
void *original_candidates;
|
75
|
+
BJW_CHAR_TYPE *candidates;
|
76
|
+
uint32_t *min_scores;
|
77
|
+
BJW_SUFFIX(t_occurrences_matches) occurrences_matches;
|
78
|
+
t_char *original_chars_to_new;
|
79
|
+
t_char *all_original_chars_to_new;
|
80
|
+
BJW_SUFFIX(t_char_matches) *char_matches;
|
81
|
+
BJW_SUFFIX(t_char_matches_data) char_matches_data;
|
82
|
+
} BJW_SUFFIX(t_runtime_model);
|
83
|
+
|
84
|
+
static void BJW_SUFFIX(free_runtime_model_for_thread)
|
85
|
+
(BJW_SUFFIX(t_runtime_model) *runtime_model)
|
86
|
+
{
|
87
|
+
HASH_CLEAR(hh, runtime_model->char_matches);
|
88
|
+
HASH_CLEAR(hh, runtime_model->original_chars_to_new);
|
89
|
+
free(runtime_model->candidates_data);
|
90
|
+
runtime_model->candidates_data = NULL;
|
91
|
+
free(runtime_model->occurrences_matches.candidates_flags);
|
92
|
+
runtime_model->occurrences_matches.candidates_flags = NULL;
|
93
|
+
free(runtime_model->char_matches_data.all_char_matches);
|
94
|
+
runtime_model->char_matches_data.all_char_matches = NULL;
|
95
|
+
free(runtime_model->char_matches_data.all_nb_already_considered);
|
96
|
+
runtime_model->char_matches_data.all_nb_already_considered = NULL;
|
97
|
+
free(runtime_model->all_original_chars_to_new);
|
98
|
+
runtime_model->all_original_chars_to_new = NULL;
|
99
|
+
}
|
100
|
+
|
101
|
+
static char BJW_SUFFIX(build_runtime_model_for_thread)
|
102
|
+
(uint8_t *exportable_model_head, uint32_t original_char_width, BJW_SUFFIX(t_runtime_model) *runtime_model)
|
103
|
+
{
|
104
|
+
uint32_t nb_candidates;
|
105
|
+
uint32_t total_candidates_lengths;
|
106
|
+
char min_scores_present;
|
107
|
+
uint32_t nb_char_matches;
|
108
|
+
uint32_t nb_candidate_occurrences;
|
109
|
+
uint32_t store_original_candidates;
|
110
|
+
void *original_candidates;
|
111
|
+
BJW_CHAR_TYPE *candidates;
|
112
|
+
uint32_t *min_scores;
|
113
|
+
void *original_chars;
|
114
|
+
t_char *original_char_match;
|
115
|
+
BJW_CHAR_TYPE *chars;
|
116
|
+
uint32_t *chars_occurrences_decals;
|
117
|
+
uint32_t *nb_candidates_per_char_match;
|
118
|
+
uint32_t *candidates_decal;
|
119
|
+
uint8_t *occurrences;
|
120
|
+
uint32_t i_char;
|
121
|
+
uint32_t nb_already_considered_decal;
|
122
|
+
BJW_SUFFIX(t_char_matches) *match;
|
123
|
+
|
124
|
+
nb_candidates = *((uint32_t*)exportable_model_head);
|
125
|
+
exportable_model_head += sizeof(uint32_t);
|
126
|
+
total_candidates_lengths = *((uint32_t*)exportable_model_head);
|
127
|
+
exportable_model_head += sizeof(uint32_t);
|
128
|
+
min_scores_present = *((uint32_t*)exportable_model_head) ? 1 : 0;
|
129
|
+
exportable_model_head += sizeof(uint32_t);
|
130
|
+
nb_char_matches = *((uint32_t*)exportable_model_head);
|
131
|
+
exportable_model_head += sizeof(uint32_t);
|
132
|
+
nb_candidate_occurrences = *((uint32_t*)exportable_model_head);
|
133
|
+
exportable_model_head += sizeof(uint32_t);
|
134
|
+
store_original_candidates = *((uint32_t*)exportable_model_head);
|
135
|
+
exportable_model_head += sizeof(uint32_t);
|
136
|
+
min_scores = min_scores_present ? (uint32_t*)exportable_model_head : NULL;
|
137
|
+
exportable_model_head += sizeof(uint32_t) * nb_candidates * min_scores_present;
|
138
|
+
chars_occurrences_decals = (uint32_t*)exportable_model_head;
|
139
|
+
exportable_model_head += sizeof(uint32_t) * nb_char_matches;
|
140
|
+
nb_candidates_per_char_match = (uint32_t*)exportable_model_head;
|
141
|
+
exportable_model_head += sizeof(uint32_t) * nb_char_matches;
|
142
|
+
candidates_decal = (uint32_t*)exportable_model_head;
|
143
|
+
exportable_model_head += sizeof(uint32_t) * (nb_candidates + 1);
|
144
|
+
original_candidates = exportable_model_head;
|
145
|
+
exportable_model_head += original_char_width * total_candidates_lengths * store_original_candidates;
|
146
|
+
candidates = (BJW_CHAR_TYPE*)exportable_model_head;
|
147
|
+
exportable_model_head += sizeof(BJW_CHAR_TYPE) * total_candidates_lengths;
|
148
|
+
original_chars = exportable_model_head;
|
149
|
+
exportable_model_head += original_char_width * nb_char_matches * store_original_candidates;
|
150
|
+
chars = (BJW_CHAR_TYPE*)exportable_model_head;
|
151
|
+
exportable_model_head += sizeof(BJW_CHAR_TYPE) * nb_char_matches;
|
152
|
+
occurrences = (uint8_t*)exportable_model_head;
|
153
|
+
|
154
|
+
runtime_model->nb_candidates = nb_candidates;
|
155
|
+
runtime_model->total_candidates_lengths = total_candidates_lengths;
|
156
|
+
runtime_model->candidates_data = malloc(sizeof(BJW_SUFFIX(t_candidate_data)) * nb_candidates);
|
157
|
+
runtime_model->original_candidates = store_original_candidates ? original_candidates : candidates;
|
158
|
+
runtime_model->candidates = candidates;
|
159
|
+
runtime_model->min_scores = min_scores;
|
160
|
+
runtime_model->occurrences_matches.candidates_flags = malloc(sizeof(uint8_t) * total_candidates_lengths);
|
161
|
+
runtime_model->occurrences_matches.candidates_decal = candidates_decal;
|
162
|
+
// important to set to NULL for uthash
|
163
|
+
runtime_model->original_chars_to_new = NULL;
|
164
|
+
runtime_model->all_original_chars_to_new = store_original_candidates ? malloc(sizeof(t_char) * nb_char_matches) : NULL;
|
165
|
+
// important to set to NULL for uthash
|
166
|
+
runtime_model->char_matches = NULL;
|
167
|
+
runtime_model->char_matches_data.all_char_matches = malloc(sizeof(BJW_SUFFIX(t_char_matches)) * nb_char_matches);
|
168
|
+
runtime_model->char_matches_data.all_nb_already_considered = malloc(sizeof(BJW_CHAR_ACCESS_TYPE) * nb_candidate_occurrences);
|
169
|
+
runtime_model->char_matches_data.all_nb_candidate_occurrences = nb_candidate_occurrences;
|
170
|
+
|
171
|
+
if (!runtime_model->candidates_data || !runtime_model->occurrences_matches.candidates_flags
|
172
|
+
|| !runtime_model->char_matches_data.all_char_matches || !runtime_model->char_matches_data.all_nb_already_considered
|
173
|
+
|| (store_original_candidates && !runtime_model->all_original_chars_to_new)
|
174
|
+
)
|
175
|
+
{
|
176
|
+
BJW_SUFFIX(free_runtime_model_for_thread)(runtime_model);
|
177
|
+
return (0);
|
178
|
+
}
|
179
|
+
|
180
|
+
nb_already_considered_decal = 0;
|
181
|
+
for (i_char = 0; i_char < nb_char_matches; i_char++)
|
182
|
+
{
|
183
|
+
if (store_original_candidates)
|
184
|
+
{
|
185
|
+
original_char_match = &(runtime_model->all_original_chars_to_new[i_char]);
|
186
|
+
if (original_char_width == 4)
|
187
|
+
original_char_match->id = (uint32_t)(((uint32_t*)original_chars)[i_char]);
|
188
|
+
else if (original_char_width == 2)
|
189
|
+
original_char_match->id = (uint32_t)(((uint16_t*)original_chars)[i_char]);
|
190
|
+
else
|
191
|
+
original_char_match->id = (uint32_t)(((uint8_t*)original_chars)[i_char]);
|
192
|
+
original_char_match->new_representation = chars[i_char];
|
193
|
+
HASH_ADD(hh, runtime_model->original_chars_to_new, id, sizeof(uint32_t), original_char_match);
|
194
|
+
}
|
195
|
+
|
196
|
+
match = &(runtime_model->char_matches_data.all_char_matches[i_char]);
|
197
|
+
match->id = chars[i_char];
|
198
|
+
match->occurrences = occurrences + chars_occurrences_decals[i_char];
|
199
|
+
match->nb_already_considered = runtime_model->char_matches_data.all_nb_already_considered + nb_already_considered_decal;
|
200
|
+
match->nb_matching_candidates = nb_candidates_per_char_match[i_char];
|
201
|
+
HASH_ADD(hh, runtime_model->char_matches, id, sizeof(uint32_t), match);
|
202
|
+
nb_already_considered_decal += nb_candidates_per_char_match[i_char];
|
203
|
+
}
|
204
|
+
|
205
|
+
return (1);
|
206
|
+
}
|
207
|
+
|
208
|
+
static void BJW_SUFFIX(free_runtime_model)
|
209
|
+
(void *runtime_model)
|
210
|
+
{
|
211
|
+
BJW_SUFFIX(t_runtime_model) *runtime_models;
|
212
|
+
uint32_t i_thread;
|
213
|
+
uint32_t nb_runtime_threads;
|
214
|
+
|
215
|
+
nb_runtime_threads = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 0));
|
216
|
+
runtime_models = (BJW_SUFFIX(t_runtime_model)*)(runtime_model + sizeof(uint32_t) * 5);
|
217
|
+
for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
|
218
|
+
BJW_SUFFIX(free_runtime_model_for_thread)(&(runtime_models[i_thread]));
|
219
|
+
free(runtime_model);
|
220
|
+
}
|
221
|
+
|
222
|
+
static void *BJW_SUFFIX(build_runtime_model)
|
223
|
+
(uint8_t *exportable_model_head, uint32_t nb_runtime_threads, uint32_t nb_candidates, uint32_t original_char_width, uint32_t *model_size_per_thread)
|
224
|
+
{
|
225
|
+
uint32_t i_thread;
|
226
|
+
uint32_t i;
|
227
|
+
void *res;
|
228
|
+
BJW_SUFFIX(t_runtime_model) *runtime_models;
|
229
|
+
|
230
|
+
res = malloc(sizeof(BJW_SUFFIX(t_runtime_model)) * nb_runtime_threads + sizeof(uint32_t) * 5);
|
231
|
+
if (!res)
|
232
|
+
return (NULL);
|
233
|
+
*((uint32_t*)(res + sizeof(uint32_t) * 0)) = nb_runtime_threads;
|
234
|
+
*((uint32_t*)(res + sizeof(uint32_t) * 1)) = nb_candidates;
|
235
|
+
*((uint32_t*)(res + sizeof(uint32_t) * 2)) = sizeof(BJW_CHAR_TYPE);
|
236
|
+
*((uint32_t*)(res + sizeof(uint32_t) * 3)) = sizeof(BJW_CHAR_ACCESS_TYPE);
|
237
|
+
*((uint32_t*)(res + sizeof(uint32_t) * 4)) = original_char_width;
|
238
|
+
runtime_models = res + sizeof(uint32_t) * 5;
|
239
|
+
|
240
|
+
for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
|
241
|
+
{
|
242
|
+
if (!BJW_SUFFIX(build_runtime_model_for_thread)(exportable_model_head, original_char_width, &(runtime_models[i_thread])))
|
243
|
+
{
|
244
|
+
for (i = 0; i < i_thread; i++)
|
245
|
+
BJW_SUFFIX(free_runtime_model_for_thread)(&(runtime_models[i]));
|
246
|
+
free(res);
|
247
|
+
return (NULL);
|
248
|
+
}
|
249
|
+
exportable_model_head += model_size_per_thread[i_thread];
|
250
|
+
}
|
251
|
+
|
252
|
+
return (res);
|
253
|
+
}
|
254
|
+
|
255
|
+
static void BJW_SUFFIX(populate_candidates_data)
|
256
|
+
(BJW_SUFFIX(t_candidate_data) *candidates_data, uint32_t *candidates_decal, uint32_t nb_candidates, float min_score, uint32_t *min_scores, uint32_t input_length, float weight, char both_min_score_and_min_scores)
|
257
|
+
{
|
258
|
+
uint32_t i_candidate;
|
259
|
+
float candidate_min_score;
|
260
|
+
BJW_CHAR_ACCESS_TYPE candidate_length;
|
261
|
+
BJW_CHAR_ACCESS_TYPE required_nb_matches;
|
262
|
+
BJW_CHAR_ACCESS_TYPE search_range;
|
263
|
+
float float_required_nb_matches;
|
264
|
+
float bottom_part;
|
265
|
+
|
266
|
+
for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
|
267
|
+
{
|
268
|
+
candidate_min_score = min_score;
|
269
|
+
if (min_score < 0.0f || (both_min_score_and_min_scores && ((float)min_scores[i_candidate]) / UINT32_MAX > candidate_min_score))
|
270
|
+
candidate_min_score = ((float)min_scores[i_candidate]) / UINT32_MAX;
|
271
|
+
|
272
|
+
candidate_length = candidates_decal[i_candidate + 1] - candidates_decal[i_candidate];
|
273
|
+
if (candidate_length < 1)
|
274
|
+
{
|
275
|
+
candidates_data[i_candidate] = (BJW_SUFFIX(t_candidate_data)){
|
276
|
+
.candidate_length = candidate_length,
|
277
|
+
.nb_matches = 0,
|
278
|
+
.required_nb_matches = 0,
|
279
|
+
.search_range = 1
|
280
|
+
};
|
281
|
+
continue ;
|
282
|
+
}
|
283
|
+
|
284
|
+
// equations solved using: https://www.mathpapa.com/equation-solver
|
285
|
+
// jaro distance
|
286
|
+
if (weight < 0.0f)
|
287
|
+
float_required_nb_matches = (3.0f * candidate_min_score * candidate_length * input_length - (candidate_length * input_length)) / (candidate_length + input_length);
|
288
|
+
else
|
289
|
+
{
|
290
|
+
// jaro winkler distance. Assume all prefixing characters match
|
291
|
+
bottom_part = -(4.0f * candidate_length * weight) - (4.0f * input_length * weight) + candidate_length + input_length;
|
292
|
+
if (bottom_part == 0.0f || bottom_part == -0.0f)
|
293
|
+
float_required_nb_matches = candidate_length > input_length ? candidate_length + 1 : input_length + 1;
|
294
|
+
else
|
295
|
+
{
|
296
|
+
float_required_nb_matches = (
|
297
|
+
(3.0f * candidate_min_score * candidate_length * input_length) -
|
298
|
+
(8.0f * weight * candidate_length * input_length) -
|
299
|
+
(candidate_length * input_length)
|
300
|
+
) / bottom_part;
|
301
|
+
}
|
302
|
+
}
|
303
|
+
if (float_required_nb_matches < 0.0f)
|
304
|
+
float_required_nb_matches = 0.0f;
|
305
|
+
required_nb_matches = (BJW_CHAR_ACCESS_TYPE)(ceil(float_required_nb_matches));
|
306
|
+
|
307
|
+
search_range = (input_length > candidate_length ? input_length : candidate_length) / 2;
|
308
|
+
search_range = search_range <= 1 ? 0 : search_range - 1;
|
309
|
+
|
310
|
+
candidates_data[i_candidate] = (BJW_SUFFIX(t_candidate_data)){
|
311
|
+
.candidate_length = candidate_length,
|
312
|
+
.nb_matches = 0,
|
313
|
+
.required_nb_matches = required_nb_matches,
|
314
|
+
.search_range = search_range
|
315
|
+
};
|
316
|
+
}
|
317
|
+
}
|
318
|
+
|
319
|
+
static void BJW_SUFFIX(find_occurrences_matches)
|
320
|
+
(BJW_SUFFIX(t_runtime_model) *runtime_model, BJW_SUFFIX(t_char_matches) *match, uint32_t remaining_chars, uint32_t i_char, uint32_t input_length)
|
321
|
+
{
|
322
|
+
uint8_t *occurrences_head;
|
323
|
+
uint32_t i_candidate;
|
324
|
+
uint32_t candidate_ind;
|
325
|
+
BJW_CHAR_ACCESS_TYPE nb_occurrences;
|
326
|
+
BJW_CHAR_ACCESS_TYPE *occurrences;
|
327
|
+
BJW_SUFFIX(t_candidate_data) *candidate_data;
|
328
|
+
uint32_t i_occurrence;
|
329
|
+
uint32_t low_search_range;
|
330
|
+
uint32_t high_search_range;
|
331
|
+
uint32_t candidate_decal;
|
332
|
+
|
333
|
+
occurrences_head = match->occurrences;
|
334
|
+
for (i_candidate = 0; i_candidate < match->nb_matching_candidates; i_candidate++)
|
335
|
+
{
|
336
|
+
candidate_ind = *((uint32_t*)occurrences_head);
|
337
|
+
candidate_data = &(runtime_model->candidates_data[candidate_ind]);
|
338
|
+
|
339
|
+
nb_occurrences = *((BJW_CHAR_ACCESS_TYPE*)(occurrences_head + sizeof(uint32_t)));
|
340
|
+
occurrences = (BJW_CHAR_ACCESS_TYPE*)(occurrences_head + sizeof(uint32_t) + sizeof(BJW_CHAR_ACCESS_TYPE));
|
341
|
+
occurrences_head += sizeof(uint32_t) + sizeof(BJW_CHAR_ACCESS_TYPE) + sizeof(BJW_CHAR_ACCESS_TYPE) * nb_occurrences;
|
342
|
+
|
343
|
+
// assuming all remaining chars match, is it enough to get enough matches?
|
344
|
+
if (candidate_data->nb_matches + remaining_chars < candidate_data->required_nb_matches)
|
345
|
+
continue ;
|
346
|
+
|
347
|
+
i_occurrence = match->nb_already_considered[i_candidate];
|
348
|
+
// if we already tried all occurrences
|
349
|
+
if (i_occurrence >= nb_occurrences)
|
350
|
+
continue ;
|
351
|
+
|
352
|
+
low_search_range = i_char < candidate_data->search_range ? 0 : i_char - candidate_data->search_range;
|
353
|
+
high_search_range = i_char + candidate_data->search_range;
|
354
|
+
while (i_occurrence < nb_occurrences && occurrences[i_occurrence] < low_search_range)
|
355
|
+
i_occurrence++;
|
356
|
+
// don't increment when i_occurrence >= nb_occurrences, to prevent overflow
|
357
|
+
if (i_occurrence >= nb_occurrences || occurrences[i_occurrence] <= high_search_range)
|
358
|
+
match->nb_already_considered[i_candidate] = i_occurrence + (i_occurrence < nb_occurrences);
|
359
|
+
if (i_occurrence < nb_occurrences && occurrences[i_occurrence] <= high_search_range)
|
360
|
+
{
|
361
|
+
candidate_data->nb_matches++;
|
362
|
+
runtime_model->occurrences_matches.input_flags[candidate_ind * input_length + i_char] = 1;
|
363
|
+
candidate_decal = runtime_model->occurrences_matches.candidates_decal[candidate_ind];
|
364
|
+
runtime_model->occurrences_matches.candidates_flags[candidate_decal + occurrences[i_occurrence]] = 1;
|
365
|
+
}
|
366
|
+
}
|
367
|
+
}
|
368
|
+
|
369
|
+
static uint32_t BJW_SUFFIX(get_nb_transpositions)
|
370
|
+
(BJW_CHAR_TYPE *input, uint8_t *input_flags, BJW_CHAR_TYPE *candidate, uint8_t *candidate_flags, uint32_t nb_matches)
|
371
|
+
{
|
372
|
+
uint32_t trans_count;
|
373
|
+
uint32_t input_ind;
|
374
|
+
uint32_t candidate_ind;
|
375
|
+
uint32_t i_match;
|
376
|
+
|
377
|
+
trans_count = 0;
|
378
|
+
input_ind = 0;
|
379
|
+
candidate_ind = 0;
|
380
|
+
for (i_match = 0; i_match < nb_matches; i_match++)
|
381
|
+
{
|
382
|
+
// Go to next ok input flag
|
383
|
+
while (!input_flags[input_ind])
|
384
|
+
input_ind++;
|
385
|
+
// Go to next ok candidate flag
|
386
|
+
while (!candidate_flags[candidate_ind])
|
387
|
+
candidate_ind++;
|
388
|
+
if (input[input_ind] != candidate[candidate_ind])
|
389
|
+
trans_count++;
|
390
|
+
input_ind++;
|
391
|
+
candidate_ind++;
|
392
|
+
}
|
393
|
+
return (trans_count);
|
394
|
+
}
|
395
|
+
|
396
|
+
static uint32_t BJW_SUFFIX(jaro_winkler_distance_from_flags)
|
397
|
+
(BJW_SUFFIX(t_runtime_model) *runtime_model, uint32_t original_char_width, BJW_CHAR_TYPE *input, uint32_t input_length, float min_score, float weight, float threshold, char both_min_score_and_min_scores, bjw_result *results)
|
398
|
+
{
|
399
|
+
uint32_t i_candidate;
|
400
|
+
BJW_CHAR_TYPE *candidate;
|
401
|
+
BJW_SUFFIX(t_candidate_data) *candidate_data;
|
402
|
+
uint32_t candidate_decal;
|
403
|
+
uint32_t nb_transpositions;
|
404
|
+
float score;
|
405
|
+
float candidate_min_score;
|
406
|
+
uint32_t nb_results;
|
407
|
+
uint32_t i_char;
|
408
|
+
uint32_t prefix_length;
|
409
|
+
|
410
|
+
nb_results = 0;
|
411
|
+
for (i_candidate = 0; i_candidate < runtime_model->nb_candidates; i_candidate++)
|
412
|
+
{
|
413
|
+
candidate_data = &(runtime_model->candidates_data[i_candidate]);
|
414
|
+
|
415
|
+
if (candidate_data->nb_matches < candidate_data->required_nb_matches)
|
416
|
+
continue ;
|
417
|
+
|
418
|
+
candidate_min_score = min_score;
|
419
|
+
if (min_score < 0.0f || (both_min_score_and_min_scores && ((float)runtime_model->min_scores[i_candidate]) / UINT32_MAX > candidate_min_score))
|
420
|
+
candidate_min_score = ((float)runtime_model->min_scores[i_candidate]) / UINT32_MAX;
|
421
|
+
|
422
|
+
candidate_decal = runtime_model->occurrences_matches.candidates_decal[i_candidate];
|
423
|
+
candidate = &(runtime_model->candidates[candidate_decal]);
|
424
|
+
|
425
|
+
if (candidate_data->nb_matches == 0)
|
426
|
+
{
|
427
|
+
if (candidate_min_score <= 0.0f)
|
428
|
+
{
|
429
|
+
results[nb_results].candidate = runtime_model->original_candidates + original_char_width * candidate_decal;
|
430
|
+
results[nb_results].score = 0.0f;
|
431
|
+
results[nb_results].candidate_length = candidate_data->candidate_length;
|
432
|
+
nb_results++;
|
433
|
+
}
|
434
|
+
continue ;
|
435
|
+
}
|
436
|
+
|
437
|
+
nb_transpositions = BJW_SUFFIX(get_nb_transpositions)(
|
438
|
+
input, &(runtime_model->occurrences_matches.input_flags[i_candidate * input_length]),
|
439
|
+
candidate, &(runtime_model->occurrences_matches.candidates_flags[candidate_decal]),
|
440
|
+
candidate_data->nb_matches
|
441
|
+
);
|
442
|
+
nb_transpositions /= 2;
|
443
|
+
|
444
|
+
score =
|
445
|
+
candidate_data->nb_matches / ((float)input_length) +
|
446
|
+
candidate_data->nb_matches / ((float)candidate_data->candidate_length) +
|
447
|
+
((float)(candidate_data->nb_matches - nb_transpositions)) / ((float)candidate_data->nb_matches);
|
448
|
+
score /= 3.0f;
|
449
|
+
|
450
|
+
if (weight >= 0.0f && score >= threshold)
|
451
|
+
{
|
452
|
+
prefix_length = candidate_data->candidate_length < input_length ? candidate_data->candidate_length : input_length;
|
453
|
+
prefix_length = prefix_length > 4 ? 4 : prefix_length;
|
454
|
+
for (i_char = 0; i_char < prefix_length && input[i_char] == candidate[i_char]; i_char++){}
|
455
|
+
score = score + (i_char * weight * (1.0f - score));
|
456
|
+
}
|
457
|
+
|
458
|
+
if (score < candidate_min_score)
|
459
|
+
continue ;
|
460
|
+
|
461
|
+
results[nb_results].candidate = runtime_model->original_candidates + original_char_width * candidate_decal;
|
462
|
+
results[nb_results].score = score;
|
463
|
+
results[nb_results].candidate_length = candidate_data->candidate_length;
|
464
|
+
nb_results++;
|
465
|
+
}
|
466
|
+
return (nb_results);
|
467
|
+
}
|
468
|
+
|
469
|
+
static void *BJW_SUFFIX(build_compressed_input)
|
470
|
+
(BJW_SUFFIX(t_runtime_model) *runtime_model, void *input, uint32_t input_length, uint32_t original_char_width)
|
471
|
+
{
|
472
|
+
BJW_CHAR_TYPE *compressed_input;
|
473
|
+
uint32_t i_char;
|
474
|
+
t_char *original_char_match;
|
475
|
+
uint32_t key;
|
476
|
+
|
477
|
+
if (sizeof(BJW_CHAR_TYPE) == original_char_width)
|
478
|
+
return (input);
|
479
|
+
if (!(compressed_input = malloc(sizeof(BJW_CHAR_TYPE) * input_length)))
|
480
|
+
return (NULL);
|
481
|
+
for (i_char = 0; i_char < input_length; i_char++)
|
482
|
+
{
|
483
|
+
if (original_char_width == 4)
|
484
|
+
key = (uint32_t)(((uint32_t*)input)[i_char]);
|
485
|
+
else if (original_char_width == 2)
|
486
|
+
key = (uint32_t)(((uint16_t*)input)[i_char]);
|
487
|
+
else
|
488
|
+
key = (uint32_t)(((uint8_t*)input)[i_char]);
|
489
|
+
HASH_FIND(hh, runtime_model->original_chars_to_new, &key, sizeof(uint32_t), original_char_match);
|
490
|
+
if (!original_char_match)
|
491
|
+
compressed_input[i_char] = 0;
|
492
|
+
else
|
493
|
+
compressed_input[i_char] = original_char_match->new_representation;
|
494
|
+
}
|
495
|
+
return (compressed_input);
|
496
|
+
}
|
497
|
+
|
498
|
+
static void *BJW_SUFFIX(jaro_winkler_distance_for_thread)
|
499
|
+
(void *thread_data_raw)
|
500
|
+
{
|
501
|
+
t_thread_data *thread_data;
|
502
|
+
BJW_SUFFIX(t_runtime_model) *runtime_model;
|
503
|
+
void *input;
|
504
|
+
uint32_t input_length;
|
505
|
+
float min_score;
|
506
|
+
float weight;
|
507
|
+
float threshold;
|
508
|
+
char both_min_score_and_min_scores;
|
509
|
+
uint32_t i_char;
|
510
|
+
uint32_t key;
|
511
|
+
BJW_SUFFIX(t_char_matches) *match;
|
512
|
+
BJW_CHAR_TYPE *compressed_input;
|
513
|
+
|
514
|
+
thread_data = (t_thread_data*)thread_data_raw;
|
515
|
+
|
516
|
+
runtime_model = &(((BJW_SUFFIX(t_runtime_model)*)thread_data->runtime_models)[thread_data->i_thread]);
|
517
|
+
input = thread_data->input;
|
518
|
+
input_length = thread_data->input_length;
|
519
|
+
min_score = thread_data->min_score;
|
520
|
+
weight = thread_data->weight;
|
521
|
+
threshold = thread_data->threshold;
|
522
|
+
both_min_score_and_min_scores = thread_data->both_min_score_and_min_scores;
|
523
|
+
|
524
|
+
compressed_input = BJW_SUFFIX(build_compressed_input)(runtime_model, input, input_length, thread_data->original_char_width);
|
525
|
+
if (!compressed_input)
|
526
|
+
return (NULL);
|
527
|
+
|
528
|
+
if (!runtime_model->min_scores && min_score < 0.0f)
|
529
|
+
min_score = 0.0f;
|
530
|
+
both_min_score_and_min_scores = both_min_score_and_min_scores && runtime_model->min_scores;
|
531
|
+
if (!(thread_data->results = malloc(sizeof(bjw_result) * runtime_model->nb_candidates)))
|
532
|
+
{
|
533
|
+
if (compressed_input != input)
|
534
|
+
free(compressed_input);
|
535
|
+
return (NULL);
|
536
|
+
}
|
537
|
+
runtime_model->occurrences_matches.input_flags = malloc(sizeof(uint8_t) * input_length * runtime_model->nb_candidates);
|
538
|
+
if (!runtime_model->occurrences_matches.input_flags)
|
539
|
+
{
|
540
|
+
free(thread_data->results);
|
541
|
+
thread_data->results = NULL;
|
542
|
+
if (compressed_input != input)
|
543
|
+
free(compressed_input);
|
544
|
+
return (NULL);
|
545
|
+
}
|
546
|
+
bzero(runtime_model->occurrences_matches.input_flags, sizeof(uint8_t) * input_length * runtime_model->nb_candidates);
|
547
|
+
bzero(runtime_model->occurrences_matches.candidates_flags, sizeof(uint8_t) * runtime_model->total_candidates_lengths);
|
548
|
+
bzero(runtime_model->char_matches_data.all_nb_already_considered, sizeof(BJW_CHAR_ACCESS_TYPE) * runtime_model->char_matches_data.all_nb_candidate_occurrences);
|
549
|
+
|
550
|
+
BJW_SUFFIX(populate_candidates_data)(
|
551
|
+
runtime_model->candidates_data, runtime_model->occurrences_matches.candidates_decal, runtime_model->nb_candidates,
|
552
|
+
min_score, runtime_model->min_scores, input_length, weight, both_min_score_and_min_scores
|
553
|
+
);
|
554
|
+
|
555
|
+
// we populate the flags
|
556
|
+
for (i_char = 0; i_char < input_length; i_char++)
|
557
|
+
{
|
558
|
+
key = compressed_input[i_char];
|
559
|
+
HASH_FIND(hh, runtime_model->char_matches, &key, sizeof(uint32_t), match);
|
560
|
+
// no occurrences for this character
|
561
|
+
if (!match)
|
562
|
+
continue ;
|
563
|
+
BJW_SUFFIX(find_occurrences_matches)(runtime_model, match, input_length - i_char, i_char, input_length);
|
564
|
+
}
|
565
|
+
|
566
|
+
// we use the flags to calculate the rest of the jaro winkler distance
|
567
|
+
thread_data->nb_results = BJW_SUFFIX(jaro_winkler_distance_from_flags)(
|
568
|
+
runtime_model, thread_data->original_char_width, compressed_input, input_length, min_score, weight, threshold, both_min_score_and_min_scores, thread_data->results
|
569
|
+
);
|
570
|
+
|
571
|
+
free(runtime_model->occurrences_matches.input_flags);
|
572
|
+
runtime_model->occurrences_matches.input_flags = NULL;
|
573
|
+
|
574
|
+
if (compressed_input != input)
|
575
|
+
free(compressed_input);
|
576
|
+
|
577
|
+
return (NULL);
|
578
|
+
}
|