jaro_winkler 1.4.0-java → 1.5.1-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/jaro_winkler/adj_matrix.c +52 -44
- data/ext/jaro_winkler/adj_matrix.h +13 -13
- data/ext/jaro_winkler/codepoints.c +61 -0
- data/ext/jaro_winkler/codepoints.h +13 -0
- data/ext/jaro_winkler/jaro.c +84 -85
- data/ext/jaro_winkler/jaro.h +11 -11
- data/ext/jaro_winkler/jaro_winkler.c +52 -27
- data/lib/jaro_winkler.rb +5 -5
- data/lib/jaro_winkler/adjusting_table.rb +9 -14
- data/lib/jaro_winkler/jaro_winkler_pure.rb +95 -91
- data/lib/jaro_winkler/version.rb +3 -1
- metadata +41 -36
- data/ext/jaro_winkler/code.c +0 -29
- data/ext/jaro_winkler/code.h +0 -7
- data/ext/jaro_winkler/murmur_hash2.c +0 -64
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
|
4
|
+
data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
|
7
|
+
data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
|
data/ext/jaro_winkler/adj_matrix.c
CHANGED
@@ -1,66 +1,75 @@
|
|
1
1
|
#include "adj_matrix.h"
|
2
|
-
#include "
|
3
|
-
|
4
|
-
#include <stdlib.h>
|
2
|
+
#include "codepoints.h"
|
3
|
+
#include "ruby.h"
|
5
4
|
|
6
5
|
const char *DEFAULT_ADJ_TABLE[] = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
"A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
|
7
|
+
"O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
|
8
|
+
"C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
|
9
|
+
"S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
|
10
|
+
"I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
|
11
|
+
"O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
|
12
12
|
|
13
|
-
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
14
13
|
void node_free(Node *head);
|
15
14
|
|
16
|
-
AdjMatrix*
|
15
|
+
AdjMatrix *adj_matrix_new(uint32_t length) {
|
17
16
|
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
18
17
|
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
19
|
-
matrix->table = malloc(matrix->length * sizeof(Node**));
|
20
|
-
for(
|
21
|
-
matrix->table[i] = malloc(matrix->length * sizeof(Node*));
|
22
|
-
for (
|
18
|
+
matrix->table = malloc(matrix->length * sizeof(Node **));
|
19
|
+
for (size_t i = 0; i < matrix->length; i++) {
|
20
|
+
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
|
21
|
+
for (size_t j = 0; j < matrix->length; j++)
|
23
22
|
matrix->table[i][j] = NULL;
|
24
23
|
}
|
25
24
|
return matrix;
|
26
25
|
}
|
27
26
|
|
28
|
-
void adj_matrix_add(AdjMatrix *matrix,
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
28
|
+
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
29
|
+
ADJ_MATRIX_DEFAULT_LENGTH,
|
30
|
+
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
31
|
+
ADJ_MATRIX_DEFAULT_LENGTH;
|
32
|
+
Node *new_node = malloc(sizeof(Node));
|
33
|
+
new_node->x = h1;
|
34
|
+
new_node->y = h2;
|
35
|
+
new_node->next = NULL;
|
36
|
+
if (matrix->table[h1][h2] == NULL) {
|
33
37
|
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
|
34
|
-
}
|
35
|
-
else{
|
38
|
+
} else {
|
36
39
|
Node *previous = NULL;
|
37
|
-
for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
|
40
|
+
for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
|
41
|
+
previous = i;
|
38
42
|
previous->next = new_node;
|
39
43
|
}
|
40
44
|
}
|
41
45
|
|
42
|
-
char adj_matrix_find(AdjMatrix *matrix,
|
43
|
-
|
44
|
-
|
46
|
+
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
47
|
+
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
48
|
+
ADJ_MATRIX_DEFAULT_LENGTH,
|
49
|
+
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
50
|
+
ADJ_MATRIX_DEFAULT_LENGTH;
|
45
51
|
Node *node = matrix->table[h1][h2];
|
46
|
-
if(node == NULL)
|
47
|
-
|
48
|
-
|
49
|
-
|
52
|
+
if (node == NULL)
|
53
|
+
return 0;
|
54
|
+
else {
|
55
|
+
for (Node *i = node; i != NULL; i = i->next)
|
56
|
+
if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
|
57
|
+
return 1;
|
50
58
|
return 0;
|
51
59
|
}
|
52
60
|
}
|
53
61
|
|
54
|
-
void node_free(Node *head){
|
55
|
-
if(head == NULL)
|
62
|
+
void node_free(Node *head) {
|
63
|
+
if (head == NULL)
|
64
|
+
return;
|
56
65
|
node_free(head->next);
|
57
66
|
free(head);
|
58
67
|
}
|
59
68
|
|
60
|
-
void adj_matrix_free(AdjMatrix *matrix){
|
61
|
-
for(
|
62
|
-
for(
|
63
|
-
if(matrix->table[i][j] != NULL){
|
69
|
+
void adj_matrix_free(AdjMatrix *matrix) {
|
70
|
+
for (size_t i = 0; i < matrix->length; i++) {
|
71
|
+
for (size_t j = 0; j < matrix->length; j++)
|
72
|
+
if (matrix->table[i][j] != NULL) {
|
64
73
|
node_free(matrix->table[i][j]);
|
65
74
|
matrix->table[i][j] = matrix->table[j][i] = NULL;
|
66
75
|
}
|
@@ -70,20 +79,19 @@ void adj_matrix_free(AdjMatrix *matrix){
|
|
70
79
|
free(matrix);
|
71
80
|
}
|
72
81
|
|
73
|
-
AdjMatrix*
|
82
|
+
AdjMatrix *adj_matrix_default() {
|
74
83
|
static char first_time = 1;
|
75
84
|
static AdjMatrix *ret_matrix;
|
76
|
-
if(first_time){
|
85
|
+
if (first_time) {
|
77
86
|
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
78
|
-
|
79
|
-
for(
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
87
|
+
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
|
88
|
+
for (size_t i = 0; i < length; i += 2) {
|
89
|
+
uint64_t code_1, code_2;
|
90
|
+
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
|
91
|
+
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
|
84
92
|
adj_matrix_add(ret_matrix, code_1, code_2);
|
85
93
|
}
|
86
94
|
first_time = 0;
|
87
95
|
}
|
88
96
|
return ret_matrix;
|
89
|
-
}
|
97
|
+
}
|
data/ext/jaro_winkler/adj_matrix.h
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "stdint.h"
|
4
|
+
|
3
5
|
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
4
6
|
#define ADJ_MATRIX_SEED 9527
|
5
7
|
|
6
|
-
typedef struct _node{
|
8
|
+
typedef struct _node {
|
7
9
|
struct _node *next;
|
8
|
-
|
10
|
+
uint64_t x, y;
|
9
11
|
} Node;
|
10
12
|
|
11
|
-
typedef struct{
|
13
|
+
typedef struct {
|
12
14
|
Node ***table;
|
13
|
-
|
15
|
+
uint32_t length;
|
14
16
|
} AdjMatrix;
|
15
17
|
|
16
|
-
AdjMatrix*
|
17
|
-
void
|
18
|
-
char
|
19
|
-
void
|
20
|
-
AdjMatrix*
|
21
|
-
|
22
|
-
#endif
|
18
|
+
AdjMatrix *adj_matrix_new(uint32_t length);
|
19
|
+
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
20
|
+
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
21
|
+
void adj_matrix_free(AdjMatrix *matrix);
|
22
|
+
AdjMatrix *adj_matrix_default();
|
data/ext/jaro_winkler/codepoints.c
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
#include "codepoints.h"
|
2
|
+
#include "ruby.h"
|
3
|
+
#include "ruby/encoding.h"
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <string.h>
|
7
|
+
|
8
|
+
// this function is copied from string.c
|
9
|
+
static inline int single_byte_optimizable(VALUE str) {
|
10
|
+
rb_encoding *enc;
|
11
|
+
|
12
|
+
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
|
13
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
14
|
+
return 1;
|
15
|
+
|
16
|
+
enc = rb_enc_get(str);
|
17
|
+
if (rb_enc_mbmaxlen(enc) == 1)
|
18
|
+
return 1;
|
19
|
+
|
20
|
+
/* Conservative. Possibly single byte.
|
21
|
+
* "\xa1" in Shift_JIS for example. */
|
22
|
+
return 0;
|
23
|
+
}
|
24
|
+
|
25
|
+
void codepoints_init(CodePoints *codepoints, VALUE str) {
|
26
|
+
size_t i, length;
|
27
|
+
int32_t n;
|
28
|
+
uint32_t c;
|
29
|
+
const char *ptr, *end;
|
30
|
+
rb_encoding *enc;
|
31
|
+
|
32
|
+
if (single_byte_optimizable(str)) {
|
33
|
+
length = RSTRING_LEN(str);
|
34
|
+
ptr = RSTRING_PTR(str);
|
35
|
+
codepoints->data = malloc(length * sizeof(*codepoints->data));
|
36
|
+
for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
|
37
|
+
codepoints->data[i] = ptr[i] & 0xff;
|
38
|
+
} else {
|
39
|
+
codepoints->length = 0;
|
40
|
+
codepoints->size = 32;
|
41
|
+
codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
|
42
|
+
str = rb_str_new_frozen(str);
|
43
|
+
ptr = RSTRING_PTR(str);
|
44
|
+
end = RSTRING_END(str);
|
45
|
+
enc = rb_enc_get(str);
|
46
|
+
|
47
|
+
while (ptr < end) {
|
48
|
+
c = rb_enc_codepoint_len(ptr, end, &n, enc);
|
49
|
+
if (codepoints->length == codepoints->size) {
|
50
|
+
codepoints->size *= 2;
|
51
|
+
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
|
52
|
+
codepoints->size);
|
53
|
+
}
|
54
|
+
codepoints->data[codepoints->length++] = c;
|
55
|
+
ptr += n;
|
56
|
+
}
|
57
|
+
RB_GC_GUARD(str);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
|
data/ext/jaro_winkler/codepoints.h
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include "ruby.h"
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
typedef struct {
|
7
|
+
uint32_t *data;
|
8
|
+
size_t length;
|
9
|
+
size_t size;
|
10
|
+
} CodePoints;
|
11
|
+
|
12
|
+
void codepoints_init(CodePoints *, VALUE str);
|
13
|
+
void codepoints_free(CodePoints *);
|
data/ext/jaro_winkler/jaro.c
CHANGED
@@ -1,73 +1,62 @@
|
|
1
1
|
#include "jaro.h"
|
2
|
-
#include "code.h"
|
3
2
|
#include "adj_matrix.h"
|
3
|
+
#include "codepoints.h"
|
4
4
|
|
5
|
-
#include <string.h>
|
6
|
-
#include <stdlib.h>
|
7
5
|
#include <ctype.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <string.h>
|
8
8
|
|
9
|
-
#define
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
if(
|
30
|
-
|
31
|
-
|
32
|
-
int short_codes_len, long_codes_len;
|
33
|
-
string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
|
34
|
-
string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
|
35
|
-
|
36
|
-
double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
|
37
|
-
|
38
|
-
free(short_codes); free(long_codes);
|
39
|
-
return ret;
|
40
|
-
}
|
41
|
-
|
42
|
-
double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
|
43
|
-
if(!short_codes_len || !long_codes_len) return 0.0;
|
44
|
-
|
45
|
-
if(short_codes_len > long_codes_len){
|
46
|
-
SWAP(short_codes, long_codes);
|
47
|
-
SWAP(short_codes_len, long_codes_len);
|
9
|
+
#define DEFAULT_WEIGHT 0.1
|
10
|
+
#define DEFAULT_THRESHOLD 0.7
|
11
|
+
#define SWAP(x, y) \
|
12
|
+
do { \
|
13
|
+
__typeof__(x) SWAP = x; \
|
14
|
+
x = y; \
|
15
|
+
y = SWAP; \
|
16
|
+
} while (0)
|
17
|
+
|
18
|
+
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
|
19
|
+
.threshold = DEFAULT_THRESHOLD,
|
20
|
+
.ignore_case = 0,
|
21
|
+
.adj_table = 0};
|
22
|
+
|
23
|
+
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
24
|
+
uint32_t *codepoints2, size_t len2,
|
25
|
+
Options *opt) {
|
26
|
+
if (!len1 || !len2)
|
27
|
+
return 0.0;
|
28
|
+
|
29
|
+
if (len1 > len2) {
|
30
|
+
SWAP(codepoints1, codepoints2);
|
31
|
+
SWAP(len1, len2);
|
48
32
|
}
|
49
33
|
|
50
|
-
if(opt->ignore_case){
|
51
|
-
for(
|
52
|
-
|
34
|
+
if (opt->ignore_case) {
|
35
|
+
for (size_t i = 0; i < len1; i++)
|
36
|
+
codepoints1[i] = tolower(codepoints1[i]);
|
37
|
+
for (size_t i = 0; i < len2; i++)
|
38
|
+
codepoints2[i] = tolower(codepoints2[i]);
|
53
39
|
}
|
54
40
|
|
55
|
-
|
56
|
-
if(window_size < 0)
|
41
|
+
int32_t window_size = (int32_t)len2 / 2 - 1;
|
42
|
+
if (window_size < 0)
|
43
|
+
window_size = 0;
|
57
44
|
|
58
|
-
char short_codes_flag[
|
59
|
-
char long_codes_flag[
|
60
|
-
memset(short_codes_flag, 0,
|
61
|
-
memset(long_codes_flag, 0,
|
45
|
+
char short_codes_flag[len1];
|
46
|
+
char long_codes_flag[len2];
|
47
|
+
memset(short_codes_flag, 0, len1);
|
48
|
+
memset(long_codes_flag, 0, len2);
|
62
49
|
|
63
50
|
// count number of matching characters
|
64
|
-
|
65
|
-
for(
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
51
|
+
size_t match_count = 0;
|
52
|
+
for (size_t i = 0; i < len1; i++) {
|
53
|
+
size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
|
54
|
+
size_t right =
|
55
|
+
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
|
56
|
+
if (right > len2 - 1)
|
57
|
+
right = len2 - 1;
|
58
|
+
for (size_t j = left; j <= right; j++) {
|
59
|
+
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
|
71
60
|
short_codes_flag[i] = long_codes_flag[j] = 1;
|
72
61
|
match_count++;
|
73
62
|
break;
|
@@ -75,48 +64,58 @@ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes
|
|
75
64
|
}
|
76
65
|
}
|
77
66
|
|
78
|
-
if(!match_count)
|
67
|
+
if (!match_count)
|
68
|
+
return 0.0;
|
79
69
|
|
80
70
|
// count number of transpositions
|
81
|
-
|
82
|
-
for(
|
83
|
-
if(short_codes_flag[i]){
|
84
|
-
for(j = k; j <
|
85
|
-
if(long_codes_flag[j]){
|
71
|
+
size_t transposition_count = 0, j = 0, k = 0;
|
72
|
+
for (size_t i = 0; i < len1; i++) {
|
73
|
+
if (short_codes_flag[i]) {
|
74
|
+
for (j = k; j < len2; j++) {
|
75
|
+
if (long_codes_flag[j]) {
|
86
76
|
k = j + 1;
|
87
77
|
break;
|
88
78
|
}
|
89
79
|
}
|
90
|
-
if(
|
80
|
+
if (codepoints1[i] != codepoints2[j])
|
81
|
+
transposition_count++;
|
91
82
|
}
|
92
83
|
}
|
93
84
|
|
94
85
|
// count similarities in nonmatched characters
|
95
|
-
|
96
|
-
if(opt->adj_table &&
|
97
|
-
for(
|
98
|
-
if(!short_codes_flag[i])
|
99
|
-
for(
|
100
|
-
if(!long_codes_flag[j])
|
101
|
-
if(adj_matrix_find(adj_matrix_default(),
|
86
|
+
size_t similar_count = 0;
|
87
|
+
if (opt->adj_table && len1 > match_count)
|
88
|
+
for (size_t i = 0; i < len1; i++)
|
89
|
+
if (!short_codes_flag[i])
|
90
|
+
for (size_t j = 0; j < len2; j++)
|
91
|
+
if (!long_codes_flag[j])
|
92
|
+
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
|
93
|
+
codepoints2[j])) {
|
102
94
|
similar_count += 3;
|
103
95
|
break;
|
104
96
|
}
|
105
97
|
|
106
98
|
double m = (double)match_count;
|
107
|
-
double t = (double)(transposition_count/2);
|
108
|
-
if(opt->adj_table)
|
109
|
-
|
99
|
+
double t = (double)(transposition_count / 2);
|
100
|
+
if (opt->adj_table)
|
101
|
+
m = similar_count / 10.0 + m;
|
102
|
+
return (m / len1 + m / len2 + (m - t) / m) / 3;
|
110
103
|
}
|
111
104
|
|
112
|
-
double jaro_winkler_distance_from_codes(
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
105
|
+
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
106
|
+
uint32_t *codepoints2, size_t len2,
|
107
|
+
Options *opt) {
|
108
|
+
double jaro_distance =
|
109
|
+
jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
|
110
|
+
|
111
|
+
if (jaro_distance < opt->threshold)
|
112
|
+
return jaro_distance;
|
113
|
+
else {
|
114
|
+
size_t prefix = 0;
|
115
|
+
size_t max_4 = len1 > 4 ? 4 : len1;
|
116
|
+
for (prefix = 0;
|
117
|
+
prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
|
118
|
+
;
|
119
|
+
return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
|
121
120
|
}
|
122
|
-
}
|
121
|
+
}
|
data/ext/jaro_winkler/jaro.h
CHANGED
@@ -1,17 +1,17 @@
|
|
1
|
-
#
|
2
|
-
#define LIBJARO_JARO_H
|
1
|
+
#pragma once
|
3
2
|
|
4
|
-
#
|
5
|
-
#
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
6
5
|
|
7
|
-
typedef struct
|
6
|
+
typedef struct {
|
8
7
|
double weight, threshold;
|
9
8
|
char ignore_case, adj_table;
|
10
|
-
}
|
9
|
+
} Options;
|
11
10
|
|
11
|
+
extern const Options DEFAULT_OPTIONS;
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
double
|
16
|
-
|
17
|
-
|
13
|
+
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
14
|
+
uint32_t *codepoints2, size_t len2, Options *);
|
15
|
+
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
16
|
+
uint32_t *codepoints2, size_t len2,
|
17
|
+
Options *);
|
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -1,45 +1,70 @@
|
|
1
|
-
#include "
|
1
|
+
#include "codepoints.h"
|
2
2
|
#include "jaro.h"
|
3
|
+
#include "ruby.h"
|
3
4
|
|
4
|
-
VALUE rb_mJaroWinkler,
|
5
|
-
rb_eError,
|
6
|
-
rb_eInvalidWeightError;
|
5
|
+
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
|
7
6
|
|
8
|
-
VALUE rb_jaro_winkler_distance(
|
9
|
-
VALUE rb_jaro_distance(
|
10
|
-
VALUE distance(
|
7
|
+
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
|
8
|
+
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
|
9
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
10
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
11
|
+
uint32_t *codepoints2, size_t len2,
|
12
|
+
Options *));
|
11
13
|
|
12
|
-
void Init_jaro_winkler_ext(void){
|
14
|
+
void Init_jaro_winkler_ext(void) {
|
13
15
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
14
16
|
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
15
|
-
rb_eInvalidWeightError =
|
16
|
-
|
17
|
-
|
17
|
+
rb_eInvalidWeightError =
|
18
|
+
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
19
|
+
rb_define_singleton_method(rb_mJaroWinkler, "distance",
|
20
|
+
rb_jaro_winkler_distance, -1);
|
21
|
+
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
|
22
|
+
-1);
|
18
23
|
}
|
19
24
|
|
20
|
-
|
21
|
-
|
25
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
26
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
27
|
+
uint32_t *codepoints2, size_t len2,
|
28
|
+
Options *)) {
|
22
29
|
VALUE s1, s2, opt;
|
23
|
-
|
24
|
-
|
25
|
-
|
30
|
+
CodePoints cp1, cp2;
|
31
|
+
|
32
|
+
rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
|
33
|
+
codepoints_init(&cp1, s1);
|
34
|
+
codepoints_init(&cp2, s2);
|
35
|
+
|
36
|
+
Options c_opt = DEFAULT_OPTIONS;
|
37
|
+
if (TYPE(opt) == T_HASH) {
|
26
38
|
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
27
39
|
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
28
40
|
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
29
41
|
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
30
|
-
if(!NIL_P(weight))
|
31
|
-
|
32
|
-
if(
|
33
|
-
|
34
|
-
|
42
|
+
if (!NIL_P(weight))
|
43
|
+
c_opt.weight = NUM2DBL(weight);
|
44
|
+
if (c_opt.weight > 0.25)
|
45
|
+
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
|
46
|
+
"otherwise the distance can become "
|
47
|
+
"larger than 1.");
|
48
|
+
if (!NIL_P(threshold))
|
49
|
+
c_opt.threshold = NUM2DBL(threshold);
|
50
|
+
if (!NIL_P(ignore_case))
|
51
|
+
c_opt.ignore_case =
|
52
|
+
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
53
|
+
if (!NIL_P(adj_table))
|
54
|
+
c_opt.adj_table =
|
55
|
+
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
35
56
|
}
|
36
|
-
|
57
|
+
VALUE ret = rb_float_new(
|
58
|
+
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
|
59
|
+
codepoints_free(&cp1);
|
60
|
+
codepoints_free(&cp2);
|
61
|
+
return ret;
|
37
62
|
}
|
38
63
|
|
39
|
-
VALUE rb_jaro_distance(
|
40
|
-
return distance(argc, argv, self,
|
64
|
+
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
|
65
|
+
return distance(argc, argv, self, jaro_distance_from_codes);
|
41
66
|
}
|
42
67
|
|
43
|
-
VALUE rb_jaro_winkler_distance(
|
44
|
-
return distance(argc, argv, self,
|
45
|
-
}
|
68
|
+
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
|
69
|
+
return distance(argc, argv, self, jaro_winkler_distance_from_codes);
|
70
|
+
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'jaro_winkler/version'
|
2
4
|
|
3
|
-
|
4
|
-
when 'java'
|
5
|
-
require 'jaro_winkler/jaro_winkler_pure'
|
6
|
-
else
|
5
|
+
if RUBY_ENGINE == 'ruby'
|
7
6
|
require 'jaro_winkler/jaro_winkler_ext'
|
7
|
+
else
|
8
|
+
require 'jaro_winkler/jaro_winkler_pure'
|
8
9
|
end
|
9
|
-
|
data/lib/jaro_winkler/adjusting_table.rb
CHANGED
@@ -1,19 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module JaroWinkler
|
2
|
-
DEFAULT_ADJ_TABLE = Hash.new
|
4
|
+
DEFAULT_ADJ_TABLE = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
|
3
5
|
[
|
4
|
-
[
|
5
|
-
[
|
6
|
-
[
|
7
|
-
[
|
6
|
+
%w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
|
7
|
+
%w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
|
8
|
+
%w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
|
9
|
+
%w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
|
8
10
|
['E', ' '], ['Y', ' '], ['S', ' ']
|
9
|
-
].each
|
10
|
-
if not DEFAULT_ADJ_TABLE.has_key?(s1)
|
11
|
-
DEFAULT_ADJ_TABLE[s1] = Hash.new
|
12
|
-
end
|
13
|
-
if not DEFAULT_ADJ_TABLE.has_key?(s2)
|
14
|
-
DEFAULT_ADJ_TABLE[s2] = Hash.new
|
15
|
-
end
|
11
|
+
].each do |s1, s2|
|
16
12
|
DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
|
17
|
-
|
18
|
-
DEFAULT_ADJ_TABLE.default = Hash.new
|
13
|
+
end
|
19
14
|
end
|
data/lib/jaro_winkler/jaro_winkler_pure.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'jaro_winkler/adjusting_table'
|
2
4
|
module JaroWinkler
|
3
5
|
class Error < RuntimeError; end
|
@@ -6,120 +8,122 @@ module JaroWinkler
|
|
6
8
|
DEFAULT_WEIGHT = 0.1
|
7
9
|
DEFAULT_THRESHOLD = 0.7
|
8
10
|
DEFAULT_OPTIONS = {
|
9
|
-
jaro: {adj_table: false, ignore_case: false},
|
10
|
-
jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
|
11
|
-
}
|
12
|
-
|
13
|
-
module_function
|
14
|
-
|
15
|
-
def distance str1, str2, options={}
|
16
|
-
_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
17
|
-
end
|
11
|
+
jaro: { adj_table: false, ignore_case: false },
|
12
|
+
jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
|
13
|
+
}.freeze
|
18
14
|
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
class << self
|
16
|
+
def distance(str1, str2, options = {})
|
17
|
+
_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
18
|
+
end
|
22
19
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
jaro_distance = _jaro_distance(codes1, codes2, options);
|
20
|
+
def jaro_distance(str1, str2, options = {})
|
21
|
+
_jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
22
|
+
end
|
27
23
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
24
|
+
private
|
25
|
+
|
26
|
+
def _distance(codes1, codes2, options = {})
|
27
|
+
options = DEFAULT_OPTIONS[:jaro_winkler].merge options
|
28
|
+
raise InvalidWeightError if options[:weight] > 0.25
|
29
|
+
jaro_distance = _jaro_distance(codes1, codes2, options)
|
30
|
+
|
31
|
+
if jaro_distance < options[:threshold]
|
32
|
+
jaro_distance
|
33
|
+
else
|
34
|
+
codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
|
35
|
+
len1 = codes1.length
|
36
|
+
len2 = codes2.length
|
37
|
+
max_4 = len1 > 4 ? 4 : len1
|
38
|
+
prefix = 0
|
39
|
+
prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
|
40
|
+
jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
|
37
41
|
end
|
38
|
-
jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
|
39
42
|
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def _jaro_distance codes1, codes2, options={}
|
43
|
-
options = DEFAULT_OPTIONS[:jaro].merge options
|
44
43
|
|
45
|
-
codes1, codes2
|
46
|
-
|
47
|
-
return 0.0 if len1 == 0 || len2 == 0
|
44
|
+
def _jaro_distance(codes1, codes2, options = {})
|
45
|
+
options = DEFAULT_OPTIONS[:jaro].merge options
|
48
46
|
|
49
|
-
|
50
|
-
codes1.
|
51
|
-
codes2.
|
52
|
-
|
47
|
+
codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
|
48
|
+
len1 = codes1.length
|
49
|
+
len2 = codes2.length
|
50
|
+
return 0.0 if len1 == 0 || len2 == 0
|
53
51
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
# // count number of matching characters
|
59
|
-
match_count = 0;
|
60
|
-
i = 0
|
61
|
-
while i < len1
|
62
|
-
left = (i >= window) ? i - window : 0
|
63
|
-
right = (i + window <= len2 - 1) ? (i + window) : (len2 - 1)
|
64
|
-
right = len2 - 1 if right > len2 - 1
|
65
|
-
j = left
|
66
|
-
while j <= right
|
67
|
-
if flags2[j] == 0 && codes1[i] == codes2[j]
|
68
|
-
flags1 |= (1 << i)
|
69
|
-
flags2 |= (1 << j)
|
70
|
-
match_count += 1
|
71
|
-
break
|
72
|
-
end
|
73
|
-
j +=1
|
52
|
+
if options[:ignore_case]
|
53
|
+
codes1.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
|
54
|
+
codes2.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
|
74
55
|
end
|
75
|
-
i += 1
|
76
|
-
end
|
77
56
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
57
|
+
window = len2 / 2 - 1
|
58
|
+
window = 0 if window < 0
|
59
|
+
flags1 = 0
|
60
|
+
flags2 = 0
|
61
|
+
|
62
|
+
# // count number of matching characters
|
63
|
+
match_count = 0
|
64
|
+
i = 0
|
65
|
+
while i < len1
|
66
|
+
left = i >= window ? i - window : 0
|
67
|
+
right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
|
68
|
+
right = len2 - 1 if right > len2 - 1
|
69
|
+
j = left
|
70
|
+
while j <= right
|
71
|
+
if flags2[j] == 0 && codes1[i] == codes2[j]
|
72
|
+
flags1 |= (1 << i)
|
73
|
+
flags2 |= (1 << j)
|
74
|
+
match_count += 1
|
75
|
+
break
|
90
76
|
end
|
91
77
|
j += 1
|
92
78
|
end
|
93
|
-
|
79
|
+
i += 1
|
94
80
|
end
|
95
|
-
i += 1
|
96
|
-
end
|
97
81
|
|
98
|
-
|
99
|
-
|
100
|
-
|
82
|
+
return 0.0 if match_count == 0
|
83
|
+
|
84
|
+
# // count number of transpositions
|
85
|
+
transposition_count = j = k = 0
|
101
86
|
i = 0
|
102
87
|
while i < len1
|
103
|
-
if flags1[i] ==
|
104
|
-
j =
|
88
|
+
if flags1[i] == 1
|
89
|
+
j = k
|
105
90
|
while j < len2
|
106
|
-
if flags2[j] ==
|
107
|
-
|
108
|
-
|
109
|
-
break
|
110
|
-
end
|
91
|
+
if flags2[j] == 1
|
92
|
+
k = j + 1
|
93
|
+
break
|
111
94
|
end
|
112
95
|
j += 1
|
113
96
|
end
|
97
|
+
transposition_count += 1 if codes1[i] != codes2[j]
|
114
98
|
end
|
115
99
|
i += 1
|
116
100
|
end
|
117
|
-
end
|
118
101
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
102
|
+
# // count similarities in nonmatched characters
|
103
|
+
similar_count = 0
|
104
|
+
if options[:adj_table] && len1 > match_count
|
105
|
+
i = 0
|
106
|
+
while i < len1
|
107
|
+
if flags1[i] == 0
|
108
|
+
j = 0
|
109
|
+
while j < len2
|
110
|
+
if flags2[j] == 0
|
111
|
+
if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
|
112
|
+
similar_count += 3
|
113
|
+
break
|
114
|
+
end
|
115
|
+
end
|
116
|
+
j += 1
|
117
|
+
end
|
118
|
+
end
|
119
|
+
i += 1
|
120
|
+
end
|
121
|
+
end
|
124
122
|
|
125
|
-
|
123
|
+
m = match_count.to_f
|
124
|
+
t = transposition_count / 2
|
125
|
+
m = similar_count / 10.0 + m if options[:adj_table]
|
126
|
+
(m / len1 + m / len2 + (m - t) / m) / 3
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,72 +1,77 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.1
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.7'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
|
-
- - ~>
|
16
|
+
- - "~>"
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '1.7'
|
25
|
-
|
19
|
+
name: bundler
|
26
20
|
type: :development
|
27
|
-
|
28
|
-
name: rake
|
21
|
+
prerelease: false
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- - ~>
|
24
|
+
- - "~>"
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
|
-
- - ~>
|
30
|
+
- - "~>"
|
37
31
|
- !ruby/object:Gem::Version
|
38
|
-
version: '
|
39
|
-
|
32
|
+
version: '12.0'
|
33
|
+
name: rake
|
40
34
|
type: :development
|
41
|
-
|
42
|
-
name: rake-compiler
|
35
|
+
prerelease: false
|
43
36
|
version_requirements: !ruby/object:Gem::Requirement
|
44
37
|
requirements:
|
45
|
-
- -
|
38
|
+
- - "~>"
|
46
39
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
40
|
+
version: '12.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
48
42
|
requirement: !ruby/object:Gem::Requirement
|
49
43
|
requirements:
|
50
|
-
- -
|
44
|
+
- - ">="
|
51
45
|
- !ruby/object:Gem::Version
|
52
46
|
version: '0'
|
53
|
-
|
47
|
+
name: rake-compiler
|
54
48
|
type: :development
|
55
|
-
|
56
|
-
name: minitest
|
49
|
+
prerelease: false
|
57
50
|
version_requirements: !ruby/object:Gem::Requirement
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
62
56
|
requirement: !ruby/object:Gem::Requirement
|
63
57
|
requirements:
|
64
|
-
- -
|
58
|
+
- - ">="
|
65
59
|
- !ruby/object:Gem::Version
|
66
60
|
version: '0'
|
67
|
-
|
61
|
+
name: minitest
|
68
62
|
type: :development
|
69
|
-
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: |-
|
70
|
+
jaro_winkler is an implementation of Jaro-Winkler \
|
71
|
+
distance algorithm which is written in C extension and will fallback to pure \
|
72
|
+
Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
|
73
|
+
C and Ruby implementation support any kind of string encoding, such as \
|
74
|
+
UTF-8, EUC-JP, Big5, etc.
|
70
75
|
email: tonytonyjan@gmail.com
|
71
76
|
executables: []
|
72
77
|
extensions: []
|
@@ -74,12 +79,11 @@ extra_rdoc_files: []
|
|
74
79
|
files:
|
75
80
|
- ext/jaro_winkler/adj_matrix.c
|
76
81
|
- ext/jaro_winkler/adj_matrix.h
|
77
|
-
- ext/jaro_winkler/
|
78
|
-
- ext/jaro_winkler/
|
82
|
+
- ext/jaro_winkler/codepoints.c
|
83
|
+
- ext/jaro_winkler/codepoints.h
|
79
84
|
- ext/jaro_winkler/jaro.c
|
80
85
|
- ext/jaro_winkler/jaro.h
|
81
86
|
- ext/jaro_winkler/jaro_winkler.c
|
82
|
-
- ext/jaro_winkler/murmur_hash2.c
|
83
87
|
- lib/jaro_winkler.rb
|
84
88
|
- lib/jaro_winkler/adjusting_table.rb
|
85
89
|
- lib/jaro_winkler/jaro_winkler_pure.rb
|
@@ -94,18 +98,19 @@ require_paths:
|
|
94
98
|
- lib
|
95
99
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
100
|
requirements:
|
97
|
-
- -
|
101
|
+
- - ">="
|
98
102
|
- !ruby/object:Gem::Version
|
99
103
|
version: '0'
|
100
104
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
105
|
requirements:
|
102
|
-
- -
|
106
|
+
- - ">="
|
103
107
|
- !ruby/object:Gem::Version
|
104
108
|
version: '0'
|
105
109
|
requirements: []
|
106
110
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.
|
111
|
+
rubygems_version: 2.6.14.1
|
108
112
|
signing_key:
|
109
113
|
specification_version: 4
|
110
|
-
summary:
|
114
|
+
summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
|
115
|
+
which supports any kind of string encoding.
|
111
116
|
test_files: []
|
data/ext/jaro_winkler/code.c
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
#include <stdlib.h>
|
2
|
-
#include <string.h>
|
3
|
-
|
4
|
-
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
|
5
|
-
unsigned char first_char = str[0];
|
6
|
-
if(first_char >= 252) *ret_byte_length = 6; // 1111110x
|
7
|
-
else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
|
8
|
-
else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
|
9
|
-
else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
|
10
|
-
else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
|
11
|
-
else *ret_byte_length = 1;
|
12
|
-
*ret_code = 0;
|
13
|
-
memcpy(ret_code, str, *ret_byte_length);
|
14
|
-
}
|
15
|
-
|
16
|
-
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
|
17
|
-
unsigned int code;
|
18
|
-
char byte_length;
|
19
|
-
|
20
|
-
*ret_codes = calloc(length, sizeof(long long));
|
21
|
-
*ret_length = 0;
|
22
|
-
|
23
|
-
for(int i = 0; i < length;){
|
24
|
-
int byte_length;
|
25
|
-
utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
|
26
|
-
*ret_length += 1;
|
27
|
-
i += byte_length;
|
28
|
-
}
|
29
|
-
}
|
data/ext/jaro_winkler/code.h
DELETED
data/ext/jaro_winkler/murmur_hash2.c
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
//-----------------------------------------------------------------------------
|
2
|
-
// MurmurHash2, by Austin Appleby
|
3
|
-
|
4
|
-
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
-
|
6
|
-
// 1. We can read a 4-byte value from any address without crashing
|
7
|
-
// 2. sizeof(int) == 4
|
8
|
-
|
9
|
-
// And it has a few limitations -
|
10
|
-
|
11
|
-
// 1. It will not work incrementally.
|
12
|
-
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
-
// machines.
|
14
|
-
|
15
|
-
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
-
{
|
17
|
-
// 'm' and 'r' are mixing constants generated offline.
|
18
|
-
// They're not really 'magic', they just happen to work well.
|
19
|
-
|
20
|
-
const unsigned int m = 0x5bd1e995;
|
21
|
-
const int r = 24;
|
22
|
-
|
23
|
-
// Initialize the hash to a 'random' value
|
24
|
-
|
25
|
-
unsigned int h = seed ^ len;
|
26
|
-
|
27
|
-
// Mix 4 bytes at a time into the hash
|
28
|
-
|
29
|
-
const unsigned char * data = (const unsigned char *)key;
|
30
|
-
|
31
|
-
while(len >= 4)
|
32
|
-
{
|
33
|
-
unsigned int k = *(unsigned int *)data;
|
34
|
-
|
35
|
-
k *= m;
|
36
|
-
k ^= k >> r;
|
37
|
-
k *= m;
|
38
|
-
|
39
|
-
h *= m;
|
40
|
-
h ^= k;
|
41
|
-
|
42
|
-
data += 4;
|
43
|
-
len -= 4;
|
44
|
-
}
|
45
|
-
|
46
|
-
// Handle the last few bytes of the input array
|
47
|
-
|
48
|
-
switch(len)
|
49
|
-
{
|
50
|
-
case 3: h ^= data[2] << 16;
|
51
|
-
case 2: h ^= data[1] << 8;
|
52
|
-
case 1: h ^= data[0];
|
53
|
-
h *= m;
|
54
|
-
};
|
55
|
-
|
56
|
-
// Do a few final mixes of the hash to ensure the last few
|
57
|
-
// bytes are well-incorporated.
|
58
|
-
|
59
|
-
h ^= h >> 13;
|
60
|
-
h *= m;
|
61
|
-
h ^= h >> 15;
|
62
|
-
|
63
|
-
return h;
|
64
|
-
}
|