jaro_winkler 1.4.0-java → 1.5.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/jaro_winkler/adj_matrix.c +52 -44
- data/ext/jaro_winkler/adj_matrix.h +13 -13
- data/ext/jaro_winkler/codepoints.c +61 -0
- data/ext/jaro_winkler/codepoints.h +13 -0
- data/ext/jaro_winkler/jaro.c +84 -85
- data/ext/jaro_winkler/jaro.h +11 -11
- data/ext/jaro_winkler/jaro_winkler.c +52 -27
- data/lib/jaro_winkler.rb +5 -5
- data/lib/jaro_winkler/adjusting_table.rb +9 -14
- data/lib/jaro_winkler/jaro_winkler_pure.rb +95 -91
- data/lib/jaro_winkler/version.rb +3 -1
- metadata +41 -36
- data/ext/jaro_winkler/code.c +0 -29
- data/ext/jaro_winkler/code.h +0 -7
- data/ext/jaro_winkler/murmur_hash2.c +0 -64
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
|
4
|
+
data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
|
7
|
+
data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
|
data/ext/jaro_winkler/adj_matrix.c
CHANGED
@@ -1,66 +1,75 @@
|
|
1
1
|
#include "adj_matrix.h"
|
2
|
-
#include "
|
3
|
-
|
4
|
-
#include <stdlib.h>
|
2
|
+
#include "codepoints.h"
|
3
|
+
#include "ruby.h"
|
5
4
|
|
6
5
|
const char *DEFAULT_ADJ_TABLE[] = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
"A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
|
7
|
+
"O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
|
8
|
+
"C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
|
9
|
+
"S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
|
10
|
+
"I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
|
11
|
+
"O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
|
12
12
|
|
13
|
-
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
14
13
|
void node_free(Node *head);
|
15
14
|
|
16
|
-
AdjMatrix*
|
15
|
+
AdjMatrix *adj_matrix_new(uint32_t length) {
|
17
16
|
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
18
17
|
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
19
|
-
matrix->table = malloc(matrix->length * sizeof(Node**));
|
20
|
-
for(
|
21
|
-
matrix->table[i] = malloc(matrix->length * sizeof(Node*));
|
22
|
-
for (
|
18
|
+
matrix->table = malloc(matrix->length * sizeof(Node **));
|
19
|
+
for (size_t i = 0; i < matrix->length; i++) {
|
20
|
+
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
|
21
|
+
for (size_t j = 0; j < matrix->length; j++)
|
23
22
|
matrix->table[i][j] = NULL;
|
24
23
|
}
|
25
24
|
return matrix;
|
26
25
|
}
|
27
26
|
|
28
|
-
void adj_matrix_add(AdjMatrix *matrix,
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
28
|
+
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
29
|
+
ADJ_MATRIX_DEFAULT_LENGTH,
|
30
|
+
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
31
|
+
ADJ_MATRIX_DEFAULT_LENGTH;
|
32
|
+
Node *new_node = malloc(sizeof(Node));
|
33
|
+
new_node->x = h1;
|
34
|
+
new_node->y = h2;
|
35
|
+
new_node->next = NULL;
|
36
|
+
if (matrix->table[h1][h2] == NULL) {
|
33
37
|
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
|
34
|
-
}
|
35
|
-
else{
|
38
|
+
} else {
|
36
39
|
Node *previous = NULL;
|
37
|
-
for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
|
40
|
+
for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
|
41
|
+
previous = i;
|
38
42
|
previous->next = new_node;
|
39
43
|
}
|
40
44
|
}
|
41
45
|
|
42
|
-
char adj_matrix_find(AdjMatrix *matrix,
|
43
|
-
|
44
|
-
|
46
|
+
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
47
|
+
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
48
|
+
ADJ_MATRIX_DEFAULT_LENGTH,
|
49
|
+
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
50
|
+
ADJ_MATRIX_DEFAULT_LENGTH;
|
45
51
|
Node *node = matrix->table[h1][h2];
|
46
|
-
if(node == NULL)
|
47
|
-
|
48
|
-
|
49
|
-
|
52
|
+
if (node == NULL)
|
53
|
+
return 0;
|
54
|
+
else {
|
55
|
+
for (Node *i = node; i != NULL; i = i->next)
|
56
|
+
if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
|
57
|
+
return 1;
|
50
58
|
return 0;
|
51
59
|
}
|
52
60
|
}
|
53
61
|
|
54
|
-
void node_free(Node *head){
|
55
|
-
if(head == NULL)
|
62
|
+
void node_free(Node *head) {
|
63
|
+
if (head == NULL)
|
64
|
+
return;
|
56
65
|
node_free(head->next);
|
57
66
|
free(head);
|
58
67
|
}
|
59
68
|
|
60
|
-
void adj_matrix_free(AdjMatrix *matrix){
|
61
|
-
for(
|
62
|
-
for(
|
63
|
-
if(matrix->table[i][j] != NULL){
|
69
|
+
void adj_matrix_free(AdjMatrix *matrix) {
|
70
|
+
for (size_t i = 0; i < matrix->length; i++) {
|
71
|
+
for (size_t j = 0; j < matrix->length; j++)
|
72
|
+
if (matrix->table[i][j] != NULL) {
|
64
73
|
node_free(matrix->table[i][j]);
|
65
74
|
matrix->table[i][j] = matrix->table[j][i] = NULL;
|
66
75
|
}
|
@@ -70,20 +79,19 @@ void adj_matrix_free(AdjMatrix *matrix){
|
|
70
79
|
free(matrix);
|
71
80
|
}
|
72
81
|
|
73
|
-
AdjMatrix*
|
82
|
+
AdjMatrix *adj_matrix_default() {
|
74
83
|
static char first_time = 1;
|
75
84
|
static AdjMatrix *ret_matrix;
|
76
|
-
if(first_time){
|
85
|
+
if (first_time) {
|
77
86
|
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
78
|
-
|
79
|
-
for(
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
87
|
+
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
|
88
|
+
for (size_t i = 0; i < length; i += 2) {
|
89
|
+
uint64_t code_1, code_2;
|
90
|
+
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
|
91
|
+
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
|
84
92
|
adj_matrix_add(ret_matrix, code_1, code_2);
|
85
93
|
}
|
86
94
|
first_time = 0;
|
87
95
|
}
|
88
96
|
return ret_matrix;
|
89
|
-
}
|
97
|
+
}
|
data/ext/jaro_winkler/adj_matrix.h
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "stdint.h"
|
4
|
+
|
3
5
|
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
4
6
|
#define ADJ_MATRIX_SEED 9527
|
5
7
|
|
6
|
-
typedef struct _node{
|
8
|
+
typedef struct _node {
|
7
9
|
struct _node *next;
|
8
|
-
|
10
|
+
uint64_t x, y;
|
9
11
|
} Node;
|
10
12
|
|
11
|
-
typedef struct{
|
13
|
+
typedef struct {
|
12
14
|
Node ***table;
|
13
|
-
|
15
|
+
uint32_t length;
|
14
16
|
} AdjMatrix;
|
15
17
|
|
16
|
-
AdjMatrix*
|
17
|
-
void
|
18
|
-
char
|
19
|
-
void
|
20
|
-
AdjMatrix*
|
21
|
-
|
22
|
-
#endif
|
18
|
+
AdjMatrix *adj_matrix_new(uint32_t length);
|
19
|
+
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
20
|
+
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
21
|
+
void adj_matrix_free(AdjMatrix *matrix);
|
22
|
+
AdjMatrix *adj_matrix_default();
|
data/ext/jaro_winkler/codepoints.c
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
#include "codepoints.h"
|
2
|
+
#include "ruby.h"
|
3
|
+
#include "ruby/encoding.h"
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <string.h>
|
7
|
+
|
8
|
+
// this function is copied from string.c
|
9
|
+
static inline int single_byte_optimizable(VALUE str) {
|
10
|
+
rb_encoding *enc;
|
11
|
+
|
12
|
+
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
|
13
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
14
|
+
return 1;
|
15
|
+
|
16
|
+
enc = rb_enc_get(str);
|
17
|
+
if (rb_enc_mbmaxlen(enc) == 1)
|
18
|
+
return 1;
|
19
|
+
|
20
|
+
/* Conservative. Possibly single byte.
|
21
|
+
* "\xa1" in Shift_JIS for example. */
|
22
|
+
return 0;
|
23
|
+
}
|
24
|
+
|
25
|
+
void codepoints_init(CodePoints *codepoints, VALUE str) {
|
26
|
+
size_t i, length;
|
27
|
+
int32_t n;
|
28
|
+
uint32_t c;
|
29
|
+
const char *ptr, *end;
|
30
|
+
rb_encoding *enc;
|
31
|
+
|
32
|
+
if (single_byte_optimizable(str)) {
|
33
|
+
length = RSTRING_LEN(str);
|
34
|
+
ptr = RSTRING_PTR(str);
|
35
|
+
codepoints->data = malloc(length * sizeof(*codepoints->data));
|
36
|
+
for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
|
37
|
+
codepoints->data[i] = ptr[i] & 0xff;
|
38
|
+
} else {
|
39
|
+
codepoints->length = 0;
|
40
|
+
codepoints->size = 32;
|
41
|
+
codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
|
42
|
+
str = rb_str_new_frozen(str);
|
43
|
+
ptr = RSTRING_PTR(str);
|
44
|
+
end = RSTRING_END(str);
|
45
|
+
enc = rb_enc_get(str);
|
46
|
+
|
47
|
+
while (ptr < end) {
|
48
|
+
c = rb_enc_codepoint_len(ptr, end, &n, enc);
|
49
|
+
if (codepoints->length == codepoints->size) {
|
50
|
+
codepoints->size *= 2;
|
51
|
+
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
|
52
|
+
codepoints->size);
|
53
|
+
}
|
54
|
+
codepoints->data[codepoints->length++] = c;
|
55
|
+
ptr += n;
|
56
|
+
}
|
57
|
+
RB_GC_GUARD(str);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
|
data/ext/jaro_winkler/codepoints.h
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include "ruby.h"
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
typedef struct {
|
7
|
+
uint32_t *data;
|
8
|
+
size_t length;
|
9
|
+
size_t size;
|
10
|
+
} CodePoints;
|
11
|
+
|
12
|
+
void codepoints_init(CodePoints *, VALUE str);
|
13
|
+
void codepoints_free(CodePoints *);
|
data/ext/jaro_winkler/jaro.c
CHANGED
@@ -1,73 +1,62 @@
|
|
1
1
|
#include "jaro.h"
|
2
|
-
#include "code.h"
|
3
2
|
#include "adj_matrix.h"
|
3
|
+
#include "codepoints.h"
|
4
4
|
|
5
|
-
#include <string.h>
|
6
|
-
#include <stdlib.h>
|
7
5
|
#include <ctype.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <string.h>
|
8
8
|
|
9
|
-
#define
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
if(
|
30
|
-
|
31
|
-
|
32
|
-
int short_codes_len, long_codes_len;
|
33
|
-
string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
|
34
|
-
string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
|
35
|
-
|
36
|
-
double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
|
37
|
-
|
38
|
-
free(short_codes); free(long_codes);
|
39
|
-
return ret;
|
40
|
-
}
|
41
|
-
|
42
|
-
double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
|
43
|
-
if(!short_codes_len || !long_codes_len) return 0.0;
|
44
|
-
|
45
|
-
if(short_codes_len > long_codes_len){
|
46
|
-
SWAP(short_codes, long_codes);
|
47
|
-
SWAP(short_codes_len, long_codes_len);
|
9
|
+
#define DEFAULT_WEIGHT 0.1
|
10
|
+
#define DEFAULT_THRESHOLD 0.7
|
11
|
+
#define SWAP(x, y) \
|
12
|
+
do { \
|
13
|
+
__typeof__(x) SWAP = x; \
|
14
|
+
x = y; \
|
15
|
+
y = SWAP; \
|
16
|
+
} while (0)
|
17
|
+
|
18
|
+
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
|
19
|
+
.threshold = DEFAULT_THRESHOLD,
|
20
|
+
.ignore_case = 0,
|
21
|
+
.adj_table = 0};
|
22
|
+
|
23
|
+
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
24
|
+
uint32_t *codepoints2, size_t len2,
|
25
|
+
Options *opt) {
|
26
|
+
if (!len1 || !len2)
|
27
|
+
return 0.0;
|
28
|
+
|
29
|
+
if (len1 > len2) {
|
30
|
+
SWAP(codepoints1, codepoints2);
|
31
|
+
SWAP(len1, len2);
|
48
32
|
}
|
49
33
|
|
50
|
-
if(opt->ignore_case){
|
51
|
-
for(
|
52
|
-
|
34
|
+
if (opt->ignore_case) {
|
35
|
+
for (size_t i = 0; i < len1; i++)
|
36
|
+
codepoints1[i] = tolower(codepoints1[i]);
|
37
|
+
for (size_t i = 0; i < len2; i++)
|
38
|
+
codepoints2[i] = tolower(codepoints2[i]);
|
53
39
|
}
|
54
40
|
|
55
|
-
|
56
|
-
if(window_size < 0)
|
41
|
+
int32_t window_size = (int32_t)len2 / 2 - 1;
|
42
|
+
if (window_size < 0)
|
43
|
+
window_size = 0;
|
57
44
|
|
58
|
-
char short_codes_flag[
|
59
|
-
char long_codes_flag[
|
60
|
-
memset(short_codes_flag, 0,
|
61
|
-
memset(long_codes_flag, 0,
|
45
|
+
char short_codes_flag[len1];
|
46
|
+
char long_codes_flag[len2];
|
47
|
+
memset(short_codes_flag, 0, len1);
|
48
|
+
memset(long_codes_flag, 0, len2);
|
62
49
|
|
63
50
|
// count number of matching characters
|
64
|
-
|
65
|
-
for(
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
51
|
+
size_t match_count = 0;
|
52
|
+
for (size_t i = 0; i < len1; i++) {
|
53
|
+
size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
|
54
|
+
size_t right =
|
55
|
+
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
|
56
|
+
if (right > len2 - 1)
|
57
|
+
right = len2 - 1;
|
58
|
+
for (size_t j = left; j <= right; j++) {
|
59
|
+
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
|
71
60
|
short_codes_flag[i] = long_codes_flag[j] = 1;
|
72
61
|
match_count++;
|
73
62
|
break;
|
@@ -75,48 +64,58 @@ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes
|
|
75
64
|
}
|
76
65
|
}
|
77
66
|
|
78
|
-
if(!match_count)
|
67
|
+
if (!match_count)
|
68
|
+
return 0.0;
|
79
69
|
|
80
70
|
// count number of transpositions
|
81
|
-
|
82
|
-
for(
|
83
|
-
if(short_codes_flag[i]){
|
84
|
-
for(j = k; j <
|
85
|
-
if(long_codes_flag[j]){
|
71
|
+
size_t transposition_count = 0, j = 0, k = 0;
|
72
|
+
for (size_t i = 0; i < len1; i++) {
|
73
|
+
if (short_codes_flag[i]) {
|
74
|
+
for (j = k; j < len2; j++) {
|
75
|
+
if (long_codes_flag[j]) {
|
86
76
|
k = j + 1;
|
87
77
|
break;
|
88
78
|
}
|
89
79
|
}
|
90
|
-
if(
|
80
|
+
if (codepoints1[i] != codepoints2[j])
|
81
|
+
transposition_count++;
|
91
82
|
}
|
92
83
|
}
|
93
84
|
|
94
85
|
// count similarities in nonmatched characters
|
95
|
-
|
96
|
-
if(opt->adj_table &&
|
97
|
-
for(
|
98
|
-
if(!short_codes_flag[i])
|
99
|
-
for(
|
100
|
-
if(!long_codes_flag[j])
|
101
|
-
if(adj_matrix_find(adj_matrix_default(),
|
86
|
+
size_t similar_count = 0;
|
87
|
+
if (opt->adj_table && len1 > match_count)
|
88
|
+
for (size_t i = 0; i < len1; i++)
|
89
|
+
if (!short_codes_flag[i])
|
90
|
+
for (size_t j = 0; j < len2; j++)
|
91
|
+
if (!long_codes_flag[j])
|
92
|
+
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
|
93
|
+
codepoints2[j])) {
|
102
94
|
similar_count += 3;
|
103
95
|
break;
|
104
96
|
}
|
105
97
|
|
106
98
|
double m = (double)match_count;
|
107
|
-
double t = (double)(transposition_count/2);
|
108
|
-
if(opt->adj_table)
|
109
|
-
|
99
|
+
double t = (double)(transposition_count / 2);
|
100
|
+
if (opt->adj_table)
|
101
|
+
m = similar_count / 10.0 + m;
|
102
|
+
return (m / len1 + m / len2 + (m - t) / m) / 3;
|
110
103
|
}
|
111
104
|
|
112
|
-
double jaro_winkler_distance_from_codes(
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
105
|
+
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
106
|
+
uint32_t *codepoints2, size_t len2,
|
107
|
+
Options *opt) {
|
108
|
+
double jaro_distance =
|
109
|
+
jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
|
110
|
+
|
111
|
+
if (jaro_distance < opt->threshold)
|
112
|
+
return jaro_distance;
|
113
|
+
else {
|
114
|
+
size_t prefix = 0;
|
115
|
+
size_t max_4 = len1 > 4 ? 4 : len1;
|
116
|
+
for (prefix = 0;
|
117
|
+
prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
|
118
|
+
;
|
119
|
+
return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
|
121
120
|
}
|
122
|
-
}
|
121
|
+
}
|
data/ext/jaro_winkler/jaro.h
CHANGED
@@ -1,17 +1,17 @@
|
|
1
|
-
#
|
2
|
-
#define LIBJARO_JARO_H
|
1
|
+
#pragma once
|
3
2
|
|
4
|
-
#
|
5
|
-
#
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
6
5
|
|
7
|
-
typedef struct
|
6
|
+
typedef struct {
|
8
7
|
double weight, threshold;
|
9
8
|
char ignore_case, adj_table;
|
10
|
-
}
|
9
|
+
} Options;
|
11
10
|
|
11
|
+
extern const Options DEFAULT_OPTIONS;
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
double
|
16
|
-
|
17
|
-
|
13
|
+
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
14
|
+
uint32_t *codepoints2, size_t len2, Options *);
|
15
|
+
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
16
|
+
uint32_t *codepoints2, size_t len2,
|
17
|
+
Options *);
|
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -1,45 +1,70 @@
|
|
1
|
-
#include "
|
1
|
+
#include "codepoints.h"
|
2
2
|
#include "jaro.h"
|
3
|
+
#include "ruby.h"
|
3
4
|
|
4
|
-
VALUE rb_mJaroWinkler,
|
5
|
-
rb_eError,
|
6
|
-
rb_eInvalidWeightError;
|
5
|
+
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
|
7
6
|
|
8
|
-
VALUE rb_jaro_winkler_distance(
|
9
|
-
VALUE rb_jaro_distance(
|
10
|
-
VALUE distance(
|
7
|
+
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
|
8
|
+
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
|
9
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
10
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
11
|
+
uint32_t *codepoints2, size_t len2,
|
12
|
+
Options *));
|
11
13
|
|
12
|
-
void Init_jaro_winkler_ext(void){
|
14
|
+
void Init_jaro_winkler_ext(void) {
|
13
15
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
14
16
|
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
15
|
-
rb_eInvalidWeightError =
|
16
|
-
|
17
|
-
|
17
|
+
rb_eInvalidWeightError =
|
18
|
+
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
19
|
+
rb_define_singleton_method(rb_mJaroWinkler, "distance",
|
20
|
+
rb_jaro_winkler_distance, -1);
|
21
|
+
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
|
22
|
+
-1);
|
18
23
|
}
|
19
24
|
|
20
|
-
|
21
|
-
|
25
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
26
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
27
|
+
uint32_t *codepoints2, size_t len2,
|
28
|
+
Options *)) {
|
22
29
|
VALUE s1, s2, opt;
|
23
|
-
|
24
|
-
|
25
|
-
|
30
|
+
CodePoints cp1, cp2;
|
31
|
+
|
32
|
+
rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
|
33
|
+
codepoints_init(&cp1, s1);
|
34
|
+
codepoints_init(&cp2, s2);
|
35
|
+
|
36
|
+
Options c_opt = DEFAULT_OPTIONS;
|
37
|
+
if (TYPE(opt) == T_HASH) {
|
26
38
|
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
27
39
|
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
28
40
|
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
29
41
|
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
30
|
-
if(!NIL_P(weight))
|
31
|
-
|
32
|
-
if(
|
33
|
-
|
34
|
-
|
42
|
+
if (!NIL_P(weight))
|
43
|
+
c_opt.weight = NUM2DBL(weight);
|
44
|
+
if (c_opt.weight > 0.25)
|
45
|
+
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
|
46
|
+
"otherwise the distance can become "
|
47
|
+
"larger than 1.");
|
48
|
+
if (!NIL_P(threshold))
|
49
|
+
c_opt.threshold = NUM2DBL(threshold);
|
50
|
+
if (!NIL_P(ignore_case))
|
51
|
+
c_opt.ignore_case =
|
52
|
+
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
53
|
+
if (!NIL_P(adj_table))
|
54
|
+
c_opt.adj_table =
|
55
|
+
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
35
56
|
}
|
36
|
-
|
57
|
+
VALUE ret = rb_float_new(
|
58
|
+
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
|
59
|
+
codepoints_free(&cp1);
|
60
|
+
codepoints_free(&cp2);
|
61
|
+
return ret;
|
37
62
|
}
|
38
63
|
|
39
|
-
VALUE rb_jaro_distance(
|
40
|
-
return distance(argc, argv, self,
|
64
|
+
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
|
65
|
+
return distance(argc, argv, self, jaro_distance_from_codes);
|
41
66
|
}
|
42
67
|
|
43
|
-
VALUE rb_jaro_winkler_distance(
|
44
|
-
return distance(argc, argv, self,
|
45
|
-
}
|
68
|
+
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
|
69
|
+
return distance(argc, argv, self, jaro_winkler_distance_from_codes);
|
70
|
+
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'jaro_winkler/version'
|
2
4
|
|
3
|
-
|
4
|
-
when 'java'
|
5
|
-
require 'jaro_winkler/jaro_winkler_pure'
|
6
|
-
else
|
5
|
+
if RUBY_ENGINE == 'ruby'
|
7
6
|
require 'jaro_winkler/jaro_winkler_ext'
|
7
|
+
else
|
8
|
+
require 'jaro_winkler/jaro_winkler_pure'
|
8
9
|
end
|
9
|
-
|
data/lib/jaro_winkler/adjusting_table.rb
CHANGED
@@ -1,19 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module JaroWinkler
|
2
|
-
DEFAULT_ADJ_TABLE = Hash.new
|
4
|
+
DEFAULT_ADJ_TABLE = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
|
3
5
|
[
|
4
|
-
[
|
5
|
-
[
|
6
|
-
[
|
7
|
-
[
|
6
|
+
%w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
|
7
|
+
%w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
|
8
|
+
%w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
|
9
|
+
%w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
|
8
10
|
['E', ' '], ['Y', ' '], ['S', ' ']
|
9
|
-
].each
|
10
|
-
if not DEFAULT_ADJ_TABLE.has_key?(s1)
|
11
|
-
DEFAULT_ADJ_TABLE[s1] = Hash.new
|
12
|
-
end
|
13
|
-
if not DEFAULT_ADJ_TABLE.has_key?(s2)
|
14
|
-
DEFAULT_ADJ_TABLE[s2] = Hash.new
|
15
|
-
end
|
11
|
+
].each do |s1, s2|
|
16
12
|
DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
|
17
|
-
|
18
|
-
DEFAULT_ADJ_TABLE.default = Hash.new
|
13
|
+
end
|
19
14
|
end
|
data/lib/jaro_winkler/jaro_winkler_pure.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'jaro_winkler/adjusting_table'
|
2
4
|
module JaroWinkler
|
3
5
|
class Error < RuntimeError; end
|
@@ -6,120 +8,122 @@ module JaroWinkler
|
|
6
8
|
DEFAULT_WEIGHT = 0.1
|
7
9
|
DEFAULT_THRESHOLD = 0.7
|
8
10
|
DEFAULT_OPTIONS = {
|
9
|
-
jaro: {adj_table: false, ignore_case: false},
|
10
|
-
jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
|
11
|
-
}
|
12
|
-
|
13
|
-
module_function
|
14
|
-
|
15
|
-
def distance str1, str2, options={}
|
16
|
-
_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
17
|
-
end
|
11
|
+
jaro: { adj_table: false, ignore_case: false },
|
12
|
+
jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
|
13
|
+
}.freeze
|
18
14
|
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
class << self
|
16
|
+
def distance(str1, str2, options = {})
|
17
|
+
_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
18
|
+
end
|
22
19
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
jaro_distance = _jaro_distance(codes1, codes2, options);
|
20
|
+
def jaro_distance(str1, str2, options = {})
|
21
|
+
_jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
22
|
+
end
|
27
23
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
24
|
+
private
|
25
|
+
|
26
|
+
def _distance(codes1, codes2, options = {})
|
27
|
+
options = DEFAULT_OPTIONS[:jaro_winkler].merge options
|
28
|
+
raise InvalidWeightError if options[:weight] > 0.25
|
29
|
+
jaro_distance = _jaro_distance(codes1, codes2, options)
|
30
|
+
|
31
|
+
if jaro_distance < options[:threshold]
|
32
|
+
jaro_distance
|
33
|
+
else
|
34
|
+
codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
|
35
|
+
len1 = codes1.length
|
36
|
+
len2 = codes2.length
|
37
|
+
max_4 = len1 > 4 ? 4 : len1
|
38
|
+
prefix = 0
|
39
|
+
prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
|
40
|
+
jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
|
37
41
|
end
|
38
|
-
jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
|
39
42
|
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def _jaro_distance codes1, codes2, options={}
|
43
|
-
options = DEFAULT_OPTIONS[:jaro].merge options
|
44
43
|
|
45
|
-
codes1, codes2
|
46
|
-
|
47
|
-
return 0.0 if len1 == 0 || len2 == 0
|
44
|
+
def _jaro_distance(codes1, codes2, options = {})
|
45
|
+
options = DEFAULT_OPTIONS[:jaro].merge options
|
48
46
|
|
49
|
-
|
50
|
-
codes1.
|
51
|
-
codes2.
|
52
|
-
|
47
|
+
codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
|
48
|
+
len1 = codes1.length
|
49
|
+
len2 = codes2.length
|
50
|
+
return 0.0 if len1 == 0 || len2 == 0
|
53
51
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
# // count number of matching characters
|
59
|
-
match_count = 0;
|
60
|
-
i = 0
|
61
|
-
while i < len1
|
62
|
-
left = (i >= window) ? i - window : 0
|
63
|
-
right = (i + window <= len2 - 1) ? (i + window) : (len2 - 1)
|
64
|
-
right = len2 - 1 if right > len2 - 1
|
65
|
-
j = left
|
66
|
-
while j <= right
|
67
|
-
if flags2[j] == 0 && codes1[i] == codes2[j]
|
68
|
-
flags1 |= (1 << i)
|
69
|
-
flags2 |= (1 << j)
|
70
|
-
match_count += 1
|
71
|
-
break
|
72
|
-
end
|
73
|
-
j +=1
|
52
|
+
if options[:ignore_case]
|
53
|
+
codes1.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
|
54
|
+
codes2.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
|
74
55
|
end
|
75
|
-
i += 1
|
76
|
-
end
|
77
56
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
57
|
+
window = len2 / 2 - 1
|
58
|
+
window = 0 if window < 0
|
59
|
+
flags1 = 0
|
60
|
+
flags2 = 0
|
61
|
+
|
62
|
+
# // count number of matching characters
|
63
|
+
match_count = 0
|
64
|
+
i = 0
|
65
|
+
while i < len1
|
66
|
+
left = i >= window ? i - window : 0
|
67
|
+
right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
|
68
|
+
right = len2 - 1 if right > len2 - 1
|
69
|
+
j = left
|
70
|
+
while j <= right
|
71
|
+
if flags2[j] == 0 && codes1[i] == codes2[j]
|
72
|
+
flags1 |= (1 << i)
|
73
|
+
flags2 |= (1 << j)
|
74
|
+
match_count += 1
|
75
|
+
break
|
90
76
|
end
|
91
77
|
j += 1
|
92
78
|
end
|
93
|
-
|
79
|
+
i += 1
|
94
80
|
end
|
95
|
-
i += 1
|
96
|
-
end
|
97
81
|
|
98
|
-
|
99
|
-
|
100
|
-
|
82
|
+
return 0.0 if match_count == 0
|
83
|
+
|
84
|
+
# // count number of transpositions
|
85
|
+
transposition_count = j = k = 0
|
101
86
|
i = 0
|
102
87
|
while i < len1
|
103
|
-
if flags1[i] ==
|
104
|
-
j =
|
88
|
+
if flags1[i] == 1
|
89
|
+
j = k
|
105
90
|
while j < len2
|
106
|
-
if flags2[j] ==
|
107
|
-
|
108
|
-
|
109
|
-
break
|
110
|
-
end
|
91
|
+
if flags2[j] == 1
|
92
|
+
k = j + 1
|
93
|
+
break
|
111
94
|
end
|
112
95
|
j += 1
|
113
96
|
end
|
97
|
+
transposition_count += 1 if codes1[i] != codes2[j]
|
114
98
|
end
|
115
99
|
i += 1
|
116
100
|
end
|
117
|
-
end
|
118
101
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
102
|
+
# // count similarities in nonmatched characters
|
103
|
+
similar_count = 0
|
104
|
+
if options[:adj_table] && len1 > match_count
|
105
|
+
i = 0
|
106
|
+
while i < len1
|
107
|
+
if flags1[i] == 0
|
108
|
+
j = 0
|
109
|
+
while j < len2
|
110
|
+
if flags2[j] == 0
|
111
|
+
if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
|
112
|
+
similar_count += 3
|
113
|
+
break
|
114
|
+
end
|
115
|
+
end
|
116
|
+
j += 1
|
117
|
+
end
|
118
|
+
end
|
119
|
+
i += 1
|
120
|
+
end
|
121
|
+
end
|
124
122
|
|
125
|
-
|
123
|
+
m = match_count.to_f
|
124
|
+
t = transposition_count / 2
|
125
|
+
m = similar_count / 10.0 + m if options[:adj_table]
|
126
|
+
(m / len1 + m / len2 + (m - t) / m) / 3
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,72 +1,77 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.1
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.7'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
|
-
- - ~>
|
16
|
+
- - "~>"
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '1.7'
|
25
|
-
|
19
|
+
name: bundler
|
26
20
|
type: :development
|
27
|
-
|
28
|
-
name: rake
|
21
|
+
prerelease: false
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- - ~>
|
24
|
+
- - "~>"
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
|
-
- - ~>
|
30
|
+
- - "~>"
|
37
31
|
- !ruby/object:Gem::Version
|
38
|
-
version: '
|
39
|
-
|
32
|
+
version: '12.0'
|
33
|
+
name: rake
|
40
34
|
type: :development
|
41
|
-
|
42
|
-
name: rake-compiler
|
35
|
+
prerelease: false
|
43
36
|
version_requirements: !ruby/object:Gem::Requirement
|
44
37
|
requirements:
|
45
|
-
- -
|
38
|
+
- - "~>"
|
46
39
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
40
|
+
version: '12.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
48
42
|
requirement: !ruby/object:Gem::Requirement
|
49
43
|
requirements:
|
50
|
-
- -
|
44
|
+
- - ">="
|
51
45
|
- !ruby/object:Gem::Version
|
52
46
|
version: '0'
|
53
|
-
|
47
|
+
name: rake-compiler
|
54
48
|
type: :development
|
55
|
-
|
56
|
-
name: minitest
|
49
|
+
prerelease: false
|
57
50
|
version_requirements: !ruby/object:Gem::Requirement
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
62
56
|
requirement: !ruby/object:Gem::Requirement
|
63
57
|
requirements:
|
64
|
-
- -
|
58
|
+
- - ">="
|
65
59
|
- !ruby/object:Gem::Version
|
66
60
|
version: '0'
|
67
|
-
|
61
|
+
name: minitest
|
68
62
|
type: :development
|
69
|
-
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: |-
|
70
|
+
jaro_winkler is an implementation of Jaro-Winkler \
|
71
|
+
distance algorithm which is written in C extension and will fallback to pure \
|
72
|
+
Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
|
73
|
+
C and Ruby implementation support any kind of string encoding, such as \
|
74
|
+
UTF-8, EUC-JP, Big5, etc.
|
70
75
|
email: tonytonyjan@gmail.com
|
71
76
|
executables: []
|
72
77
|
extensions: []
|
@@ -74,12 +79,11 @@ extra_rdoc_files: []
|
|
74
79
|
files:
|
75
80
|
- ext/jaro_winkler/adj_matrix.c
|
76
81
|
- ext/jaro_winkler/adj_matrix.h
|
77
|
-
- ext/jaro_winkler/
|
78
|
-
- ext/jaro_winkler/
|
82
|
+
- ext/jaro_winkler/codepoints.c
|
83
|
+
- ext/jaro_winkler/codepoints.h
|
79
84
|
- ext/jaro_winkler/jaro.c
|
80
85
|
- ext/jaro_winkler/jaro.h
|
81
86
|
- ext/jaro_winkler/jaro_winkler.c
|
82
|
-
- ext/jaro_winkler/murmur_hash2.c
|
83
87
|
- lib/jaro_winkler.rb
|
84
88
|
- lib/jaro_winkler/adjusting_table.rb
|
85
89
|
- lib/jaro_winkler/jaro_winkler_pure.rb
|
@@ -94,18 +98,19 @@ require_paths:
|
|
94
98
|
- lib
|
95
99
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
100
|
requirements:
|
97
|
-
- -
|
101
|
+
- - ">="
|
98
102
|
- !ruby/object:Gem::Version
|
99
103
|
version: '0'
|
100
104
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
105
|
requirements:
|
102
|
-
- -
|
106
|
+
- - ">="
|
103
107
|
- !ruby/object:Gem::Version
|
104
108
|
version: '0'
|
105
109
|
requirements: []
|
106
110
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.
|
111
|
+
rubygems_version: 2.6.14.1
|
108
112
|
signing_key:
|
109
113
|
specification_version: 4
|
110
|
-
summary:
|
114
|
+
summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
|
115
|
+
which supports any kind of string encoding.
|
111
116
|
test_files: []
|
data/ext/jaro_winkler/code.c
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
#include <stdlib.h>
|
2
|
-
#include <string.h>
|
3
|
-
|
4
|
-
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
|
5
|
-
unsigned char first_char = str[0];
|
6
|
-
if(first_char >= 252) *ret_byte_length = 6; // 1111110x
|
7
|
-
else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
|
8
|
-
else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
|
9
|
-
else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
|
10
|
-
else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
|
11
|
-
else *ret_byte_length = 1;
|
12
|
-
*ret_code = 0;
|
13
|
-
memcpy(ret_code, str, *ret_byte_length);
|
14
|
-
}
|
15
|
-
|
16
|
-
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
|
17
|
-
unsigned int code;
|
18
|
-
char byte_length;
|
19
|
-
|
20
|
-
*ret_codes = calloc(length, sizeof(long long));
|
21
|
-
*ret_length = 0;
|
22
|
-
|
23
|
-
for(int i = 0; i < length;){
|
24
|
-
int byte_length;
|
25
|
-
utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
|
26
|
-
*ret_length += 1;
|
27
|
-
i += byte_length;
|
28
|
-
}
|
29
|
-
}
|
data/ext/jaro_winkler/code.h
DELETED
data/ext/jaro_winkler/murmur_hash2.c
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
//-----------------------------------------------------------------------------
|
2
|
-
// MurmurHash2, by Austin Appleby
|
3
|
-
|
4
|
-
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
-
|
6
|
-
// 1. We can read a 4-byte value from any address without crashing
|
7
|
-
// 2. sizeof(int) == 4
|
8
|
-
|
9
|
-
// And it has a few limitations -
|
10
|
-
|
11
|
-
// 1. It will not work incrementally.
|
12
|
-
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
-
// machines.
|
14
|
-
|
15
|
-
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
-
{
|
17
|
-
// 'm' and 'r' are mixing constants generated offline.
|
18
|
-
// They're not really 'magic', they just happen to work well.
|
19
|
-
|
20
|
-
const unsigned int m = 0x5bd1e995;
|
21
|
-
const int r = 24;
|
22
|
-
|
23
|
-
// Initialize the hash to a 'random' value
|
24
|
-
|
25
|
-
unsigned int h = seed ^ len;
|
26
|
-
|
27
|
-
// Mix 4 bytes at a time into the hash
|
28
|
-
|
29
|
-
const unsigned char * data = (const unsigned char *)key;
|
30
|
-
|
31
|
-
while(len >= 4)
|
32
|
-
{
|
33
|
-
unsigned int k = *(unsigned int *)data;
|
34
|
-
|
35
|
-
k *= m;
|
36
|
-
k ^= k >> r;
|
37
|
-
k *= m;
|
38
|
-
|
39
|
-
h *= m;
|
40
|
-
h ^= k;
|
41
|
-
|
42
|
-
data += 4;
|
43
|
-
len -= 4;
|
44
|
-
}
|
45
|
-
|
46
|
-
// Handle the last few bytes of the input array
|
47
|
-
|
48
|
-
switch(len)
|
49
|
-
{
|
50
|
-
case 3: h ^= data[2] << 16;
|
51
|
-
case 2: h ^= data[1] << 8;
|
52
|
-
case 1: h ^= data[0];
|
53
|
-
h *= m;
|
54
|
-
};
|
55
|
-
|
56
|
-
// Do a few final mixes of the hash to ensure the last few
|
57
|
-
// bytes are well-incorporated.
|
58
|
-
|
59
|
-
h ^= h >> 13;
|
60
|
-
h *= m;
|
61
|
-
h ^= h >> 15;
|
62
|
-
|
63
|
-
return h;
|
64
|
-
}
|