batch_jaro_winkler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/batch_jaro_winkler/batch_jaro_winkler.c +104 -0
- data/ext/batch_jaro_winkler/ext/LICENSE.uthash.txt +20 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.c +890 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.h +50 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_internal.h +98 -0
- data/ext/batch_jaro_winkler/ext/batch_jaro_winkler_runtime.h +578 -0
- data/ext/batch_jaro_winkler/ext/uthash.h +1230 -0
- data/ext/batch_jaro_winkler/extconf.rb +5 -0
- data/lib/batch_jaro_winkler.rb +242 -0
- data/lib/batch_jaro_winkler/version.rb +3 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: fa81343451beff7427878758a54f8e1af1b8cc2ee1905ab437bad420af0450c7
  data.tar.gz: a13aac63be06874621637a2f94c7256c4e5a5ffbd6f12316cc226aa05bad87f1
SHA512:
  metadata.gz: c0fa50b077e593d2d69e89d5695c337f2bcbe755e390519af23d67e4d7893222f7576703ba0e1b5a09e2344d5f925762d5492bfe49a9925992e1ff588b841b27
  data.tar.gz: 1c0e9765a11ded4a92d0fb9cf675674b73e145c04bf41aab438510159879f2ccce56da064f6dd4170ed0da0c9b80171dd88020f48ce4cdb0d1cae94014b31788
data/ext/batch_jaro_winkler/batch_jaro_winkler.c
ADDED
@@ -0,0 +1,104 @@


/*
For this file to work with other ruby implementations than MRI, replace everything with:

#include "ext/batch_jaro_winkler.c"
void Init_batch_jaro_winkler(void){}
*/

#include "ext/batch_jaro_winkler.c"
#include "ruby.h"
#include "ruby/encoding.h"

VALUE rb_bjw_build_runtime_result(VALUE self, VALUE tmp_store, VALUE rb_results, VALUE rb_c_results, VALUE rb_nb_results, VALUE rb_inp_encoded, VALUE rb_char_width)
{
  bjw_result *results;
  uint32_t nb_results;
  uint32_t i_result;
  VALUE tmp_candidate;
  rb_encoding *utf32le_encoding;
  rb_encoding *utf8_encoding;
  VALUE rb_utf8_encoding;
  uint32_t char_width;
  int inp_encoded;
  char *all_candidates;
  VALUE rb_all_candidates;
  uint64_t total_nb_bytes;
  uint64_t decal;
  uint64_t bytes_len;
  uint64_t candidate_length_in_bytes;
  uint64_t i_char;

  nb_results = (uint32_t)(NUM2ULL(rb_nb_results));
  results = (bjw_result*)(NUM2ULL(rb_c_results));
  char_width = (uint32_t)(NUM2ULL(rb_char_width));
  inp_encoded = RTEST(rb_inp_encoded);

  utf32le_encoding = rb_enc_find("UTF-32LE");
  utf8_encoding = rb_enc_find("UTF-8");
  rb_utf8_encoding = rb_enc_from_encoding(utf8_encoding);
  // We use tmp_store so that local ruby objects are marked by the GC
  rb_ary_push(tmp_store, rb_utf8_encoding);

  if (!inp_encoded)
  {
    total_nb_bytes = 0;
    for (i_result = 0; i_result < nb_results; i_result++)
      total_nb_bytes += results[i_result].candidate_length;
    total_nb_bytes *= char_width;
    all_candidates = malloc(total_nb_bytes);
    if (!all_candidates)
      return (Qfalse);
    decal = 0;
    for (i_result = 0; i_result < nb_results; i_result++)
    {
      bytes_len = results[i_result].candidate_length * char_width;
      for (i_char = 0; i_char < bytes_len; i_char++)
        all_candidates[decal + i_char] = ((char*)results[i_result].candidate)[i_char];
      decal += bytes_len;
    }
    rb_all_candidates = rb_enc_str_new(all_candidates, total_nb_bytes, utf32le_encoding);
    // We use tmp_store so that local ruby objects are marked by the GC
    rb_ary_push(tmp_store, rb_all_candidates);
    free(all_candidates);
    rb_all_candidates = rb_str_encode(rb_all_candidates, rb_utf8_encoding, 0, Qnil);
    // We use tmp_store so that local ruby objects are marked by the GC
    rb_ary_push(tmp_store, rb_all_candidates);
    all_candidates = RSTRING_PTR(rb_all_candidates);
  }

  decal = 0;
  for (i_result = 0; i_result < nb_results; i_result++)
  {
    if (!inp_encoded)
    {
      candidate_length_in_bytes = 0;
      for (i_char = 0; i_char < results[i_result].candidate_length; i_char++)
      {
        if ((all_candidates[decal + candidate_length_in_bytes] & 0xf8) == 0xf0)
          candidate_length_in_bytes += 4;
        else if ((all_candidates[decal + candidate_length_in_bytes] & 0xf0) == 0xe0)
          candidate_length_in_bytes += 3;
        else if ((all_candidates[decal + candidate_length_in_bytes] & 0xe0) == 0xc0)
          candidate_length_in_bytes += 2;
        else
          candidate_length_in_bytes += 1;
      }
      tmp_candidate = rb_enc_str_new(all_candidates + decal, candidate_length_in_bytes, utf8_encoding);
      decal += candidate_length_in_bytes;
    }
    else
      tmp_candidate = rb_str_new(results[i_result].candidate, results[i_result].candidate_length * char_width);
    rb_ary_push(rb_results, rb_ary_new_from_args(2, tmp_candidate, rb_float_new(results[i_result].score)));
  }
  return (Qtrue);
}

void Init_batch_jaro_winkler(void)
{
  VALUE cBatchJaroWinkler;

  cBatchJaroWinkler = rb_define_module("BatchJaroWinkler");
  rb_define_singleton_method(cBatchJaroWinkler, "rb_bjw_build_runtime_result", rb_bjw_build_runtime_result, 6);
}
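The leading comment in the wrapper above describes a portability escape hatch: on Ruby implementations other than MRI, the whole file can be reduced to compiling the C core plus an empty extension entry point. Spelled out, that variant would look like the sketch below; it only restates what the comment itself says and is not an additional file shipped in this release.

/* Non-MRI variant of batch_jaro_winkler.c, as sketched in the comment at the
** top of the wrapper above: pull in the C core and expose an empty init hook
** so the extension still loads. */
#include "ext/batch_jaro_winkler.c"

void Init_batch_jaro_winkler(void) {}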
data/ext/batch_jaro_winkler/ext/LICENSE.uthash.txt
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2005-2018, Troy D. Hanson http://troydhanson.github.com/uthash/
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/ext/batch_jaro_winkler/ext/batch_jaro_winkler.c
ADDED
@@ -0,0 +1,890 @@
/*
MIT License

Copyright (c) 2020 Dominik Bousquet https://github.com/dbousque/batch_jaro_winkler

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "batch_jaro_winkler_internal.h"

#define BJW_CHAR_TYPE uint32_t
#define BJW_CHAR_ACCESS_TYPE uint32_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint16_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint8_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_TYPE
#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_TYPE uint16_t
#define BJW_CHAR_ACCESS_TYPE uint32_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint16_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint8_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_TYPE
#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_TYPE uint8_t
#define BJW_CHAR_ACCESS_TYPE uint32_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint16_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_ACCESS_TYPE
#define BJW_CHAR_ACCESS_TYPE uint8_t
#include "batch_jaro_winkler_runtime.h"

#undef BJW_CHAR_TYPE
#undef BJW_CHAR_ACCESS_TYPE

static inline uint32_t sorted_candidate_char_at(t_sorted_candidate *sorted_candidate, uint32_t i)
{
  uint32_t res;

  if (sorted_candidate->char_width == 4)
    res = ((uint32_t*)sorted_candidate->candidate)[i];
  else if (sorted_candidate->char_width == 2)
    res = ((uint16_t*)sorted_candidate->candidate)[i];
  else
    res = ((uint8_t*)sorted_candidate->candidate)[i];
  return (res);
}

static int sort_by_length_and_alphabetical_order(const void *void_cand1, const void *void_cand2)
{
  t_sorted_candidate *cand1;
  t_sorted_candidate *cand2;
  uint32_t i;

  cand1 = (t_sorted_candidate*)void_cand1;
  cand2 = (t_sorted_candidate*)void_cand2;
  if (cand1->candidate_length < cand2->candidate_length)
    return (-1);
  if (cand1->candidate_length > cand2->candidate_length)
    return (1);
  for (i = 0; i < cand1->candidate_length && i < cand2->candidate_length && sorted_candidate_char_at(cand1, i) == sorted_candidate_char_at(cand2, i); i++){}
  return (
    i >= cand1->candidate_length && i >= cand2->candidate_length ? 0 :
    i >= cand1->candidate_length ? -1 :
    i >= cand2->candidate_length ? 1 :
    sorted_candidate_char_at(cand1, i) < sorted_candidate_char_at(cand2, i) ? -1 :
    1
  );
}

static void free_char_occurrences(t_char_occurrences *char_occurrences)
{
  t_char_occurrences *tmp_char_occurrence;
  t_char_occurrences *tmp1;
  t_tmp_candidate_occurrences *candidate_occurrences;
  t_tmp_candidate_occurrences *tmp_candidate_occurrences;
  t_tmp_candidate_occurrences *tmp2;

  HASH_ITER(hh, char_occurrences, tmp_char_occurrence, tmp1)
  {
    HASH_DEL(char_occurrences, tmp_char_occurrence);
    candidate_occurrences = tmp_char_occurrence->candidates_occurrences;
    HASH_ITER(hh, candidate_occurrences, tmp_candidate_occurrences, tmp2)
    {
      HASH_DEL(candidate_occurrences, tmp_candidate_occurrences);
      free(tmp_candidate_occurrences->occ_indexes);
      free(tmp_candidate_occurrences);
    }
    free(tmp_char_occurrence);
  }
}

static void *exit_build_exportable_model_for_thread_error(t_sorted_candidate *sorted_candidates, t_char_occurrences *char_occurrences)
{
  free(sorted_candidates);
  free_char_occurrences(char_occurrences);
  return (NULL);
}

static uint8_t *build_exportable_model_for_thread(
  void **original_candidates, uint32_t original_char_width, void **compressed_candidates, uint32_t compressed_char_width,
  uint32_t char_access_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t *res_model_size
)
{
  uint32_t i_candidate;
  uint32_t i_char;
  uint32_t i_occurrence;
  uint32_t i_candidate_occurrrence;
  // important to set to NULL for uthash
  t_char_occurrences *char_occurrences = NULL;
  t_char_occurrences *char_occurrence;
  t_tmp_candidate_occurrences *candidate_occurrences;
  uint32_t key;
  uint32_t total_candidates_lengths;
  uint32_t nb_char_matches;
  uint32_t nb_candidate_occurrences;
  uint8_t *model;
  t_sorted_candidate *sorted_candidates;
  uint32_t store_original_candidates;

  store_original_candidates = original_candidates != compressed_candidates ? 1 : 0;
  sorted_candidates = malloc(sizeof(t_sorted_candidate) * nb_candidates);
  if (!sorted_candidates)
    return (NULL);
  for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
  {
    sorted_candidates[i_candidate] = (t_sorted_candidate){
      .original_ind = i_candidate,
      .candidate = compressed_candidates[i_candidate],
      .char_width = compressed_char_width,
      .min_score = min_scores ? min_scores[i_candidate] : -1.0f,
      .candidate_length = candidates_lengths[i_candidate]
    };
  }

  // we sort to improve the runtime memory access pattern
  qsort(sorted_candidates, nb_candidates, sizeof(t_sorted_candidate), &sort_by_length_and_alphabetical_order);

  nb_char_matches = 0;
  nb_candidate_occurrences = 0;
  total_candidates_lengths = 0;
  for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
  {
    for (i_char = 0; i_char < sorted_candidates[i_candidate].candidate_length; i_char++)
    {
      // Find character matches
      if (compressed_char_width == 4)
        key = (uint32_t)(((uint32_t*)sorted_candidates[i_candidate].candidate)[i_char]);
      else if (compressed_char_width == 2)
        key = (uint32_t)(((uint16_t*)sorted_candidates[i_candidate].candidate)[i_char]);
      else
        key = (uint32_t)(((uint8_t*)sorted_candidates[i_candidate].candidate)[i_char]);
      char_occurrence = NULL;
      HASH_FIND(hh, char_occurrences, &key, sizeof(uint32_t), char_occurrence);
      if (!char_occurrence) {
        nb_char_matches++;
        if (!(char_occurrence = malloc(sizeof(t_char_occurrences))))
          return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
        char_occurrence->id = key;
        // important to set to NULL for uthash
        char_occurrence->candidates_occurrences = NULL;
        if (store_original_candidates)
        {
          if (original_char_width == 4)
            char_occurrence->original_representation = (uint32_t)(((uint32_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
          else if (original_char_width == 2)
            char_occurrence->original_representation = (uint32_t)(((uint16_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
          else
            char_occurrence->original_representation = (uint32_t)(((uint8_t*)original_candidates[sorted_candidates[i_candidate].original_ind])[i_char]);
        }
        HASH_ADD(hh, char_occurrences, id, sizeof(uint32_t), char_occurrence);
      }

      // Find character occurences for this candidate
      key = i_candidate;
      candidate_occurrences = NULL;
      HASH_FIND(hh, char_occurrence->candidates_occurrences, &key, sizeof(uint32_t), candidate_occurrences);
      if (!candidate_occurrences)
      {
        nb_candidate_occurrences++;
        if (!(candidate_occurrences = malloc(sizeof(t_tmp_candidate_occurrences))))
          return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
        candidate_occurrences->id = key;
        candidate_occurrences->occ_indexes_len = 0;
        candidate_occurrences->occ_indexes_size = 32;
        candidate_occurrences->occ_indexes = malloc(char_access_width * candidate_occurrences->occ_indexes_size);
        if (!candidate_occurrences->occ_indexes)
        {
          free(candidate_occurrences);
          return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
        }
        HASH_ADD(hh, char_occurrence->candidates_occurrences, id, sizeof(uint32_t), candidate_occurrences);
      }

      // Not big enough, increase size
      if (candidate_occurrences->occ_indexes_len == candidate_occurrences->occ_indexes_size)
      {
        void *new_occ_indexes = malloc(char_access_width * candidate_occurrences->occ_indexes_size * 2);
        if (!new_occ_indexes)
          return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
        memcpy(new_occ_indexes, candidate_occurrences->occ_indexes, char_access_width * candidate_occurrences->occ_indexes_size);
        candidate_occurrences->occ_indexes_size *= 2;
        free(candidate_occurrences->occ_indexes);
        candidate_occurrences->occ_indexes = new_occ_indexes;
      }

      if (char_access_width == 4)
        ((uint32_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
      if (char_access_width == 2)
        ((uint16_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
      else
        ((uint8_t*)candidate_occurrences->occ_indexes)[candidate_occurrences->occ_indexes_len] = i_char;
      candidate_occurrences->occ_indexes_len++;
    }
    total_candidates_lengths += sorted_candidates[i_candidate].candidate_length;
  }

  // candidate_ind + nb_occurrences
  uint32_t metadata_size = (sizeof(uint32_t) + char_access_width) * nb_candidate_occurrences;
  uint32_t indexes_size = char_access_width * total_candidates_lengths;
  uint32_t occurrences_size = metadata_size + indexes_size;

  uint32_t total_size =
    sizeof(uint32_t) + // nb_candidates
    sizeof(uint32_t) + // total_candidates_lengths
    sizeof(uint32_t) + // min_scores present or not (uint32_t used to keep 4 bytes alignment)
    sizeof(uint32_t) + // nb_char_matches
    sizeof(uint32_t) + // nb_candidate_occurrences
    sizeof(uint32_t) + // store_original_candidates
    sizeof(uint32_t) * nb_candidates * (min_scores ? 1 : 0) + // min_scores - can go from 0.0 to 1.0 -> convert to uint32_t for cross-platform support
    sizeof(uint32_t) * nb_char_matches + // chars_occurrences_decals
    sizeof(uint32_t) * nb_char_matches + // nb_candidates_per_char_match
    sizeof(uint32_t) * (nb_candidates + 1) + // candidates_decal
    original_char_width * total_candidates_lengths * store_original_candidates + // original_candidates (if store_original_candidates)
    compressed_char_width * total_candidates_lengths + // candidates (compressed)
    original_char_width * nb_char_matches * store_original_candidates + // original_chars (if store_original_candidates)
    compressed_char_width * nb_char_matches + // chars
    occurrences_size; // occurrences

  if (!(model = malloc(total_size)))
    return (exit_build_exportable_model_for_thread_error(sorted_candidates, char_occurrences));
  uint8_t *res_buffer_head = model;
  *((uint32_t*)res_buffer_head) = nb_candidates;
  res_buffer_head += sizeof(uint32_t);
  *((uint32_t*)res_buffer_head) = total_candidates_lengths;
  res_buffer_head += sizeof(uint32_t);
  *((uint32_t*)res_buffer_head) = min_scores ? 1 : 0;
  res_buffer_head += sizeof(uint32_t);
  *((uint32_t*)res_buffer_head) = nb_char_matches;
  res_buffer_head += sizeof(uint32_t);
  *((uint32_t*)res_buffer_head) = nb_candidate_occurrences;
  res_buffer_head += sizeof(uint32_t);
  *((uint32_t*)res_buffer_head) = store_original_candidates;
  res_buffer_head += sizeof(uint32_t);

  if (min_scores)
  {
    uint32_t *res_min_scores = (uint32_t*)res_buffer_head;
    for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
    {
      // To prevent rounding errors when min_score == 1.0f
      if (sorted_candidates[i_candidate].min_score >= 1.0f)
        res_min_scores[i_candidate] = UINT32_MAX;
      else
        res_min_scores[i_candidate] = (uint32_t)(sorted_candidates[i_candidate].min_score * UINT32_MAX);
    }
    res_buffer_head += sizeof(uint32_t) * nb_candidates;
  }

  uint32_t *chars_occurrences_decals = (uint32_t*)res_buffer_head;
  res_buffer_head += sizeof(uint32_t) * nb_char_matches;
  uint32_t *nb_candidates_per_char_match = (uint32_t*)res_buffer_head;
  res_buffer_head += sizeof(uint32_t) * nb_char_matches;

  uint32_t *candidates_decal = (uint32_t*)res_buffer_head;
  uint32_t decal = 0;
  for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
  {
    candidates_decal[i_candidate] = decal;
    decal += sorted_candidates[i_candidate].candidate_length;
  }
  candidates_decal[i_candidate] = decal;
  res_buffer_head += sizeof(uint32_t) * (nb_candidates + 1);

  void *res_original_candidates = res_buffer_head;
  res_buffer_head += original_char_width * total_candidates_lengths * store_original_candidates;
  void *res_compressed_candidates = res_buffer_head;
  res_buffer_head += compressed_char_width * total_candidates_lengths;
  uint32_t candidates_char_decal = 0;
  for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
  {
    if (store_original_candidates)
    {
      memcpy(
        res_original_candidates + (candidates_char_decal * original_char_width),
        original_candidates[sorted_candidates[i_candidate].original_ind],
        sorted_candidates[i_candidate].candidate_length * original_char_width
      );
    }
    memcpy(
      res_compressed_candidates + (candidates_char_decal * compressed_char_width),
      sorted_candidates[i_candidate].candidate,
      sorted_candidates[i_candidate].candidate_length * compressed_char_width
    );
    candidates_char_decal += sorted_candidates[i_candidate].candidate_length;
  }

  void *original_chars = res_buffer_head;
  res_buffer_head += original_char_width * nb_char_matches * store_original_candidates;
  void *chars = res_buffer_head;
  res_buffer_head += compressed_char_width * nb_char_matches;

  uint8_t *occurrences = (uint8_t*)res_buffer_head;
  uint8_t *occurrences_head = occurrences;

  i_char = 0;
  for (char_occurrence = char_occurrences; char_occurrence; char_occurrence = char_occurrence->hh.next)
  {
    if (store_original_candidates)
    {
      if (original_char_width == 4)
        ((uint32_t*)original_chars)[i_char] = char_occurrence->original_representation;
      else if (original_char_width == 2)
        ((uint16_t*)original_chars)[i_char] = char_occurrence->original_representation;
      else
        ((uint8_t*)original_chars)[i_char] = char_occurrence->original_representation;
    }
    if (compressed_char_width == 4)
      ((uint32_t*)chars)[i_char] = (uint32_t)char_occurrence->id;
    else if (compressed_char_width == 2)
      ((uint16_t*)chars)[i_char] = (uint16_t)char_occurrence->id;
    else
      ((uint8_t*)chars)[i_char] = (uint8_t)char_occurrence->id;
    chars_occurrences_decals[i_char] = occurrences_head - occurrences;

    i_candidate_occurrrence = 0;
    for (candidate_occurrences = char_occurrence->candidates_occurrences; candidate_occurrences; candidate_occurrences = candidate_occurrences->hh.next)
    {
      // 1 uint32_t for the candidate's index
      // + 1 BJW_CHAR_ACCESS_TYPE for the number of occurrences
      // + N BJW_CHAR_ACCESS_TYPE for the occurrences indexes
      *((uint32_t*)occurrences_head) = (uint32_t)candidate_occurrences->id;
      occurrences_head += sizeof(uint32_t);
      if (char_access_width == 4)
        *((uint32_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
      else if (char_access_width == 2)
        *((uint16_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
      else
        *((uint8_t*)occurrences_head) = candidate_occurrences->occ_indexes_len;
      occurrences_head += char_access_width;
      for (i_occurrence = 0; i_occurrence < candidate_occurrences->occ_indexes_len; i_occurrence++)
      {
        if (char_access_width == 4)
          *((uint32_t*)occurrences_head) = ((uint32_t*)candidate_occurrences->occ_indexes)[i_occurrence];
        if (char_access_width == 2)
          *((uint16_t*)occurrences_head) = ((uint16_t*)candidate_occurrences->occ_indexes)[i_occurrence];
        else
          *((uint8_t*)occurrences_head) = ((uint8_t*)candidate_occurrences->occ_indexes)[i_occurrence];
        occurrences_head += char_access_width;
      }

      i_candidate_occurrrence++;
    }

    nb_candidates_per_char_match[i_char] = i_candidate_occurrrence;
    i_char++;
  }

  *res_model_size = total_size;
  free_char_occurrences(char_occurrences);
  free(sorted_candidates);

  return (model);
}

// pack all result data in single buffer
static void *build_exportable_model(
  void **original_candidates, uint32_t original_char_width, void **compressed_candidates, uint32_t compressed_char_width,
  uint32_t char_access_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size
)
{
  uint32_t i_thread;
  uint32_t i;
  uint8_t *model_per_thread[nb_runtime_threads];
  uint32_t model_size_per_thread[nb_runtime_threads];
  uint8_t *res_buffer;
  uint8_t *res_buffer_head;
  void **original_candidates_for_thread;
  void **compressed_candidates_for_thread;
  uint32_t *candidates_lengths_for_thread;
  uint32_t nb_candidates_for_thread;
  float *min_scores_for_thread;
  uint32_t nb_taken_candidates;
  uint32_t aligned_model_size;

  nb_taken_candidates = 0;
  *res_model_size = 0;
  for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
  {
    nb_candidates_for_thread = i_thread == nb_runtime_threads - 1 ? nb_candidates - nb_taken_candidates : (nb_candidates / nb_runtime_threads);
    original_candidates_for_thread = original_candidates + nb_taken_candidates;
    compressed_candidates_for_thread = compressed_candidates + nb_taken_candidates;
    candidates_lengths_for_thread = candidates_lengths + nb_taken_candidates;
    min_scores_for_thread = min_scores ? min_scores + nb_taken_candidates : NULL;

    model_per_thread[i_thread] = build_exportable_model_for_thread(
      original_candidates_for_thread, original_char_width, compressed_candidates_for_thread, compressed_char_width,
      char_access_width, candidates_lengths_for_thread, nb_candidates_for_thread, min_scores_for_thread, &(model_size_per_thread[i_thread])
    );
    if (!model_per_thread[i_thread])
    {
      for (i = 0; i < i_thread; i++)
        free(model_per_thread[i]);
      return (NULL);
    }
    // align on next 4 byte boundary
    *res_model_size += model_size_per_thread[i_thread] + (4 - (model_size_per_thread[i_thread] % 4));
    nb_taken_candidates += nb_candidates_for_thread;
  }

  // we put the number of threads + nb candidates + char_width + char_access_width + original_char_width and the models per thread sizes at the start of the model
  *res_model_size += sizeof(uint32_t) * (nb_runtime_threads + 5);
  res_buffer = malloc(*res_model_size);
  if (!res_buffer)
  {
    for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
      free(model_per_thread[i_thread]);
    return (NULL);
  }
  *((uint32_t*)(res_buffer + sizeof(uint32_t) * 0)) = nb_runtime_threads;
  *((uint32_t*)(res_buffer + sizeof(uint32_t) * 1)) = nb_candidates;
  *((uint32_t*)(res_buffer + sizeof(uint32_t) * 2)) = compressed_char_width;
  *((uint32_t*)(res_buffer + sizeof(uint32_t) * 3)) = char_access_width;
  *((uint32_t*)(res_buffer + sizeof(uint32_t) * 4)) = original_char_width;
  res_buffer_head = res_buffer + sizeof(uint32_t) * (nb_runtime_threads + 5);
  for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
  {
    // align on next 4 byte boundary
    aligned_model_size = model_size_per_thread[i_thread] + (4 - (model_size_per_thread[i_thread] % 4));
    *((uint32_t*)(res_buffer + sizeof(uint32_t) * (i_thread + 5))) = aligned_model_size;
    memcpy(res_buffer_head, model_per_thread[i_thread], model_size_per_thread[i_thread]);
    // align on next 4 byte boundary
    res_buffer_head += aligned_model_size;
    free(model_per_thread[i_thread]);
  }

  return (res_buffer);
}

static void free_chars(t_char *chars)
{
  t_char *tmp_char;
  t_char *tmp;

  HASH_ITER(hh, chars, tmp_char, tmp)
  {
    HASH_DEL(chars, tmp_char);
    free(tmp_char);
  }
}

// Used by the ruby library
void _bjw_free(void *ptr)
{
  free(ptr);
}

// Pack all result data in single buffer.
void *bjw_build_exportable_model(void **candidates, uint32_t char_width, uint32_t *candidates_lengths, uint32_t nb_candidates, float *min_scores, uint32_t nb_runtime_threads, uint32_t *res_model_size)
{
  uint32_t i_candidate;
  uint32_t i_char;
  uint32_t i;
  uint32_t longest_candidate;
  // important to set to NULL for uthash
  t_char *chars = NULL;
  t_char *chr;
  uint32_t key;
  uint32_t nb_chars;
  uint32_t compressed_char_width;
  uint32_t char_access_width;
  void **compressed_candidates;
  void *exportable_model;

  nb_chars = 0;
  longest_candidate = 0;
  for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
  {
    if (candidates_lengths[i_candidate] > longest_candidate)
      longest_candidate = candidates_lengths[i_candidate];
    for (i_char = 0; i_char < candidates_lengths[i_candidate]; i_char++)
    {
      if (char_width == 4)
        key = (uint32_t)(((uint32_t**)candidates)[i_candidate][i_char]);
      else if (char_width == 2)
        key = (uint32_t)(((uint16_t**)candidates)[i_candidate][i_char]);
      else
        key = (uint32_t)(((uint8_t**)candidates)[i_candidate][i_char]);
      chr = NULL;
      HASH_FIND(hh, chars, &key, sizeof(uint32_t), chr);
      if (!chr) {
        nb_chars++;
        if (!(chr = malloc(sizeof(t_char))))
        {
          free_chars(chars);
          return (NULL);
        }
        chr->id = key;
        chr->new_representation = nb_chars;
        HASH_ADD(hh, chars, id, sizeof(uint32_t), chr);
      }
    }
  }

  compressed_char_width = char_width;
  // We keep one available char (0) to represent an unknown character in the input at runtime.
  if (nb_chars < 256 - 1)
    compressed_char_width = 1;
  else if (nb_chars < 256 * 256 - 1)
    compressed_char_width = 2;
  char_access_width = 4;
  // We can't go up to 256, since we need to be able to send inputs of arbitrary lengths at runtime
  // and characters up to longest_candidate * 2 can be considered for the score.
  if (longest_candidate < 128)
    char_access_width = 1;
  else if (longest_candidate < 256 * 128)
    char_access_width = 2;

  compressed_candidates = candidates;

  // Rewrite candidates with smallest possible char_width
  if (compressed_char_width < char_width)
  {
    if (!(compressed_candidates = malloc(sizeof(void*) * nb_candidates)))
    {
      free_chars(chars);
      return (NULL);
    }
    for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
    {
      if (!(compressed_candidates[i_candidate] = malloc(compressed_char_width * candidates_lengths[i_candidate])))
      {
        free_chars(chars);
        for (i = 0; i < i_candidate; i++)
          free(compressed_candidates[i]);
        free(compressed_candidates);
        return (NULL);
      }
      for (i_char = 0; i_char < candidates_lengths[i_candidate]; i_char++)
      {
        if (char_width == 4)
          key = (uint32_t)(((uint32_t**)candidates)[i_candidate][i_char]);
        else if (char_width == 2)
          key = (uint32_t)(((uint16_t**)candidates)[i_candidate][i_char]);
        else
          key = (uint32_t)(((uint8_t**)candidates)[i_candidate][i_char]);
        HASH_FIND(hh, chars, &key, sizeof(uint32_t), chr);
        if (compressed_char_width == 4)
          ((uint32_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
        else if (compressed_char_width == 2)
          ((uint16_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
        else
          ((uint8_t**)compressed_candidates)[i_candidate][i_char] = chr->new_representation;
      }
    }
  }

  free_chars(chars);

  exportable_model = build_exportable_model(
    candidates, char_width, compressed_candidates, compressed_char_width, char_access_width,
    candidates_lengths, nb_candidates, min_scores, nb_runtime_threads, res_model_size
  );

  if (compressed_candidates != candidates)
  {
    for (i_candidate = 0; i_candidate < nb_candidates; i_candidate++)
      free(compressed_candidates[i_candidate]);
    free(compressed_candidates);
  }

  return (exportable_model);
}

void bjw_free_runtime_model(void *runtime_model)
{
  uint32_t char_width;
  uint32_t char_access_width;

  char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 2));
  char_access_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 3));

  void (*free_function)(void*) = NULL;
  if (char_width == 4 && char_access_width == 4)
    free_function = free_runtime_model_uint32_t_uint32_t;
  else if (char_width == 4 && char_access_width == 2)
    free_function = free_runtime_model_uint32_t_uint16_t;
  else if (char_width == 4 && char_access_width == 1)
    free_function = free_runtime_model_uint32_t_uint8_t;
  else if (char_width == 2 && char_access_width == 4)
    free_function = free_runtime_model_uint16_t_uint32_t;
  else if (char_width == 2 && char_access_width == 2)
    free_function = free_runtime_model_uint16_t_uint16_t;
  else if (char_width == 2 && char_access_width == 1)
    free_function = free_runtime_model_uint16_t_uint8_t;
  else if (char_width == 1 && char_access_width == 4)
    free_function = free_runtime_model_uint8_t_uint32_t;
  else if (char_width == 1 && char_access_width == 2)
    free_function = free_runtime_model_uint8_t_uint16_t;
  else if (char_width == 1 && char_access_width == 1)
    free_function = free_runtime_model_uint8_t_uint8_t;

  free_function(runtime_model);
}

void *bjw_build_runtime_model(void *exportable_model)
{
  uint32_t nb_runtime_threads;
  uint32_t nb_candidates;
  uint32_t *model_size_per_thread;
  uint32_t char_width;
  uint32_t char_access_width;
  uint32_t original_char_width;

  uint8_t *exportable_model_head = (uint8_t*)exportable_model;
  nb_runtime_threads = *((uint32_t*)exportable_model_head);
  exportable_model_head += sizeof(uint32_t);
  nb_candidates = *((uint32_t*)exportable_model_head);
  exportable_model_head += sizeof(uint32_t);
  char_width = *((uint32_t*)exportable_model_head);
  exportable_model_head += sizeof(uint32_t);
  char_access_width = *((uint32_t*)exportable_model_head);
  exportable_model_head += sizeof(uint32_t);
  original_char_width = *((uint32_t*)exportable_model_head);
  exportable_model_head += sizeof(uint32_t);
  model_size_per_thread = (uint32_t*)exportable_model_head;
  exportable_model_head += sizeof(uint32_t) * nb_runtime_threads;

  void *(*build_function)(uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t*) = NULL;
  if (char_width == 4 && char_access_width == 4)
    build_function = build_runtime_model_uint32_t_uint32_t;
  else if (char_width == 4 && char_access_width == 2)
    build_function = build_runtime_model_uint32_t_uint16_t;
  else if (char_width == 4 && char_access_width == 1)
    build_function = build_runtime_model_uint32_t_uint8_t;
  else if (char_width == 2 && char_access_width == 4)
    build_function = build_runtime_model_uint16_t_uint32_t;
  else if (char_width == 2 && char_access_width == 2)
    build_function = build_runtime_model_uint16_t_uint16_t;
  else if (char_width == 2 && char_access_width == 1)
    build_function = build_runtime_model_uint16_t_uint8_t;
  else if (char_width == 1 && char_access_width == 4)
    build_function = build_runtime_model_uint8_t_uint32_t;
  else if (char_width == 1 && char_access_width == 2)
    build_function = build_runtime_model_uint8_t_uint16_t;
  else if (char_width == 1 && char_access_width == 1)
    build_function = build_runtime_model_uint8_t_uint8_t;

  return (build_function(exportable_model_head, nb_runtime_threads, nb_candidates, original_char_width, model_size_per_thread));
}

static int sort_results_by_score(const void *void_res1, const void *void_res2)
{
  bjw_result *res1;
  bjw_result *res2;

  res1 = (bjw_result*)void_res1;
  res2 = (bjw_result*)void_res2;
  return (res1->score < res2->score ? 1 : res1->score == res2->score ? 0 : -1);
}

bjw_result *bjw_jaro_winkler_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, float weight, float threshold, uint32_t n_best_results, uint32_t *nb_results)
{
  uint32_t i_thread;
  uint32_t nb_runtime_threads;
  uint32_t nb_candidates;
  uint32_t char_width;
  uint32_t char_access_width;
  uint32_t original_char_width;
  char both_min_score_and_min_scores;
  uint32_t n_best_i_try;
  uint32_t n_best_nb_tries;
  float n_best_tries[3];
  bjw_result *results;
  bjw_result *tmp_results;
  uint32_t results_decal;
  t_thread_data *threads_data;
#if BJW_USE_THREADS
# ifdef _WIN32
  HANDLE *threads;
# else
  pthread_t *threads;
# endif
#endif

  nb_runtime_threads = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 0));
  nb_candidates = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 1));
  char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 2));
  char_access_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 3));
  original_char_width = *((uint32_t*)(runtime_model + sizeof(uint32_t) * 4));

  // Characters after 256 won't be taken into consideration for score calculation anyway, and uint8_t won't be able to represent the indices.
  if (char_access_width == 1 && input_length >= 256)
    input_length = 256 - 1;
  else if (char_access_width == 2 && input_length >= 256 * 256)
    input_length = (256 * 256) - 1;
  both_min_score_and_min_scores = min_score < 0.0f && n_best_results != 0;
  if (n_best_results > nb_candidates)
    n_best_results = nb_candidates;

#if BJW_USE_THREADS
# ifdef _WIN32
  if (!(threads = malloc(sizeof(HANDLE) * nb_runtime_threads)))
    return (NULL);
# else
  if (!(threads = malloc(sizeof(pthread_t) * nb_runtime_threads)))
    return (NULL);
# endif
#endif

  if (!(threads_data = malloc(sizeof(t_thread_data) * nb_runtime_threads)))
    return (NULL);

  void* (*runtime_function)(void*) = NULL;
  if (char_width == 4 && char_access_width == 4)
    runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint32_t;
  else if (char_width == 4 && char_access_width == 2)
    runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint16_t;
  else if (char_width == 4 && char_access_width == 1)
    runtime_function = jaro_winkler_distance_for_thread_uint32_t_uint8_t;
  else if (char_width == 2 && char_access_width == 4)
    runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint32_t;
  else if (char_width == 2 && char_access_width == 2)
    runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint16_t;
  else if (char_width == 2 && char_access_width == 1)
    runtime_function = jaro_winkler_distance_for_thread_uint16_t_uint8_t;
  else if (char_width == 1 && char_access_width == 4)
    runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint32_t;
  else if (char_width == 1 && char_access_width == 2)
    runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint16_t;
  else if (char_width == 1 && char_access_width == 1)
    runtime_function = jaro_winkler_distance_for_thread_uint8_t_uint8_t;

  if (n_best_results != 0)
  {
    if (nb_candidates > 0)
      n_best_tries[0] = 1.0f - (((float)n_best_results) / nb_candidates);
    else
      n_best_tries[0] = -1.0f;
    if (n_best_tries[0] > 0.8f)
      n_best_tries[0] = 0.8f;
    n_best_tries[1] = n_best_tries[0] - 0.2f;
    n_best_tries[1] = n_best_tries[1] < 0.0f ? -1.0f : n_best_tries[1];
    n_best_tries[2] = min_score;
    n_best_nb_tries = 3;

    if (n_best_tries[1] <= min_score)
    {
      n_best_nb_tries--;
      n_best_tries[1] = min_score;
    }
    if (n_best_tries[0] <= min_score)
    {
      n_best_nb_tries--;
      n_best_tries[0] = min_score;
    }
  }
  else
  {
    n_best_tries[0] = min_score;
    n_best_nb_tries = 1;
  }

  for (n_best_i_try = 0; n_best_i_try < n_best_nb_tries; n_best_i_try++)
  {
    for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
    {
      threads_data[i_thread] = (t_thread_data){
        .runtime_models = runtime_model + sizeof(uint32_t) * 5,
        .i_thread = i_thread,
        .original_char_width = original_char_width,
        .input = input,
        .input_length = input_length,
        .min_score = n_best_tries[n_best_i_try],
        .weight = weight,
        .threshold = threshold,
        .both_min_score_and_min_scores = both_min_score_and_min_scores,
        .results = NULL,
        .nb_results = 0
      };

#if BJW_USE_THREADS
# ifdef _WIN32
      threads[i_thread] = CreateThread(NULL, 0, runtime_function, &(threads_data[i_thread]), 0, NULL);
# else
      pthread_create(&(threads[i_thread]), NULL, runtime_function, &(threads_data[i_thread]));
# endif
#else
      runtime_function(&(threads_data[i_thread]));
#endif
    }

    *nb_results = 0;
    for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
    {
#if BJW_USE_THREADS
# ifdef _WIN32
      WaitForSingleObject(threads[i_thread], INFINITE);
      CloseHandle(threads[i_thread]);
# else
      pthread_join(threads[i_thread], NULL);
# endif
#endif
      *nb_results += threads_data[i_thread].nb_results;
    }

    if (n_best_results == 0 || *nb_results >= n_best_results)
      break ;
  }

  if (!(results = malloc(sizeof(bjw_result) * (*nb_results))))
    return (NULL);
  results_decal = 0;
  for (i_thread = 0; i_thread < nb_runtime_threads; i_thread++)
  {
    memcpy(&(results[results_decal]), threads_data[i_thread].results, sizeof(bjw_result) * threads_data[i_thread].nb_results);
    free(threads_data[i_thread].results);
    results_decal += threads_data[i_thread].nb_results;
  }

  if (n_best_results != 0)
  {
    qsort(results, *nb_results, sizeof(bjw_result), &sort_results_by_score);
    if (*nb_results > n_best_results)
    {
      if (!(tmp_results = malloc(sizeof(bjw_result) * n_best_results)))
        return (NULL);
      memcpy(tmp_results, results, sizeof(bjw_result) * n_best_results);
      free(results);
      results = tmp_results;
      *nb_results = n_best_results;
    }
  }

#if BJW_USE_THREADS
  free(threads);
#endif
  free(threads_data);

  return (results);
}

bjw_result *bjw_jaro_distance(void *runtime_model, void *input, uint32_t input_length, float min_score, uint32_t n_best_results, uint32_t *nb_results)
{
  return (bjw_jaro_winkler_distance(runtime_model, input, input_length, min_score, -1.0f, -1.0f, n_best_results, nb_results));
}
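Taken together, the functions above form a small public API: bjw_build_exportable_model packs a candidate set into a relocatable buffer, bjw_build_runtime_model turns that buffer into the thread-partitioned runtime representation, bjw_jaro_winkler_distance runs one input against every candidate, and bjw_free_runtime_model plus plain free release the memory. The following minimal sketch shows one way to drive that API from C. It relies only on the signatures visible in this file; the batch_jaro_winkler.h include, the single-thread setup, the parameter choices (weight 0.1, threshold 0.7, min_score 0.5), and the assumption that the exportable buffer can be freed after the runtime model are illustrative, not something this diff guarantees.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "batch_jaro_winkler.h" /* assumed to declare the bjw_* functions defined above */

int main(void)
{
  /* Three ASCII candidates, one byte per character (char_width = 1). */
  void *candidates[] = { "hello", "world", "help" };
  uint32_t lengths[] = { 5, 5, 4 };
  uint32_t model_size;

  /* No per-candidate min_scores, single runtime thread. */
  void *exportable = bjw_build_exportable_model(candidates, 1, lengths, 3, NULL, 1, &model_size);
  if (!exportable)
    return (1);
  void *runtime = bjw_build_runtime_model(exportable);
  if (!runtime)
    return (1);

  /* One query: prefix weight 0.1, boost threshold 0.7, keep every candidate
  ** scoring at least 0.5, no n-best truncation (n_best_results = 0). */
  uint32_t nb_results;
  bjw_result *results = bjw_jaro_winkler_distance(runtime, "hel", 3, 0.5f, 0.1f, 0.7f, 0, &nb_results);
  for (uint32_t i = 0; results && i < nb_results; i++)
    printf("%.*s -> %f\n", (int)results[i].candidate_length, (char*)results[i].candidate, results[i].score);

  free(results);
  bjw_free_runtime_model(runtime);
  free(exportable); /* assumption: the runtime model does not alias the exportable buffer */
  return (0);
}

The Ruby layer in data/lib/batch_jaro_winkler.rb presumably drives these same entry points, but its contents are not shown in this excerpt.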