jaro_winkler 1.5.1-universal-java-10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/jaro_winkler/adj_matrix.c +97 -0
- data/ext/jaro_winkler/adj_matrix.h +22 -0
- data/ext/jaro_winkler/codepoints.c +61 -0
- data/ext/jaro_winkler/codepoints.h +13 -0
- data/ext/jaro_winkler/jaro.c +121 -0
- data/ext/jaro_winkler/jaro.h +17 -0
- data/ext/jaro_winkler/jaro_winkler.c +70 -0
- data/lib/jaro_winkler.rb +9 -0
- data/lib/jaro_winkler/adjusting_table.rb +14 -0
- data/lib/jaro_winkler/jaro_winkler_pure.rb +129 -0
- data/lib/jaro_winkler/version.rb +5 -0
- metadata +116 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2f783ac8d8355443adfe51a5d3f32cf7b8ddef5f
|
4
|
+
data.tar.gz: 96f863de9c8e879104fdb78ded7834691ad6ca66
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 354d4337b57d40de31960a6fd050d56ca7dd852031f5ad0525e7f2147ef8345ab6c4b4d53ff93a0e6011b55dc342b2ab05147db277c392f31988109757219685
|
7
|
+
data.tar.gz: 8db99ee1730ac2d84d95ec509fd9d2395ae812d6169bf1ce9f60c44495bafa541b4fbe39a3688c521b5f38707d635164c7ef5b8b7b062390ff2be6a017e6ce8c
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#include "adj_matrix.h"
|
2
|
+
#include "codepoints.h"
|
3
|
+
#include "ruby.h"
|
4
|
+
|
5
|
+
/*
 * Built-in adjacency pairs, stored flat: entries 2k and 2k+1 form one pair.
 * adj_matrix_default() consumes the array two entries at a time and uses
 * only the first byte of each string.
 */
const char *DEFAULT_ADJ_TABLE[] = {
    "A", "E",   "A", "I",   "A", "O",   "A", "U",
    "B", "V",   "E", "I",   "E", "O",   "E", "U",
    "I", "O",   "I", "U",   "O", "U",   "I", "Y",
    "E", "Y",   "C", "G",   "E", "F",   "W", "U",
    "W", "V",   "X", "K",   "S", "Z",   "X", "S",
    "Q", "C",   "U", "V",   "M", "N",   "L", "I",
    "Q", "O",   "P", "R",   "I", "J",   "2", "Z",
    "5", "S",   "8", "B",   "1", "I",   "1", "L",
    "0", "O",   "0", "Q",   "C", "K",   "G", "J",
    "E", " ",   "Y", " ",   "S", " "};
|
12
|
+
|
13
|
+
void node_free(Node *head);
|
14
|
+
|
15
|
+
AdjMatrix *adj_matrix_new(uint32_t length) {
|
16
|
+
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
17
|
+
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
18
|
+
matrix->table = malloc(matrix->length * sizeof(Node **));
|
19
|
+
for (size_t i = 0; i < matrix->length; i++) {
|
20
|
+
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
|
21
|
+
for (size_t j = 0; j < matrix->length; j++)
|
22
|
+
matrix->table[i][j] = NULL;
|
23
|
+
}
|
24
|
+
return matrix;
|
25
|
+
}
|
26
|
+
|
27
|
+
/*
 * Record the pair (x, y) in the matrix.
 *
 * Both values are hashed with st_hash() and reduced modulo
 * ADJ_MATRIX_DEFAULT_LENGTH. The new node stores those bucket indices, not
 * the original values. When the bucket is empty the node becomes the head of
 * both mirrored slots ([h1][h2] and [h2][h1]); otherwise it is appended to
 * the end of the existing chain.
 *
 * Raises NoMemoryError through rb_memerror() when the node cannot be
 * allocated. If the matrix was created with a length smaller than
 * ADJ_MATRIX_DEFAULT_LENGTH and an index falls outside it, the pair is
 * dropped rather than written out of bounds.
 */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
  uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
                ADJ_MATRIX_DEFAULT_LENGTH;
  uint32_t h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
                ADJ_MATRIX_DEFAULT_LENGTH;

  /* The indices are bounded by the default length, not by this table. */
  if (h1 >= matrix->length || h2 >= matrix->length)
    return;

  Node *new_node = malloc(sizeof *new_node);
  if (new_node == NULL)
    rb_memerror();
  new_node->x = h1;
  new_node->y = h2;
  new_node->next = NULL;

  Node *tail = matrix->table[h1][h2];
  if (tail == NULL) {
    matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
    return;
  }

  while (tail->next != NULL)
    tail = tail->next;
  tail->next = new_node;
}
|
45
|
+
|
46
|
+
/*
 * Report whether the pair (x, y) was recorded with adj_matrix_add().
 *
 * The values are hashed exactly as adj_matrix_add() does, and the chain in
 * bucket [h1][h2] is scanned for a node carrying the same two bucket
 * indices in either order. Nodes hold bucket indices rather than the
 * original values, so two different pairs that hash to the same bucket
 * cannot be told apart.
 *
 * Returns 1 when a matching node exists, 0 otherwise. An index outside a
 * table smaller than ADJ_MATRIX_DEFAULT_LENGTH yields 0 instead of an
 * out-of-bounds read.
 */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
  uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
                ADJ_MATRIX_DEFAULT_LENGTH;
  uint32_t h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
                ADJ_MATRIX_DEFAULT_LENGTH;

  /* The indices are bounded by the default length, not by this table. */
  if (h1 >= matrix->length || h2 >= matrix->length)
    return 0;

  for (Node *node = matrix->table[h1][h2]; node != NULL; node = node->next) {
    if ((node->x == h1 && node->y == h2) || (node->x == h2 && node->y == h1))
      return 1;
  }
  return 0;
}
|
61
|
+
|
62
|
+
/* Release every node of the chain that starts at head; NULL is accepted. */
void node_free(Node *head) {
  while (head != NULL) {
    Node *following = head->next;
    free(head);
    head = following;
  }
}
|
68
|
+
|
69
|
+
/*
 * Release a matrix created by adj_matrix_new(): every bucket chain, every
 * row, the row array and the matrix itself.
 *
 * A chain is shared by the mirrored slots [row][col] and [col][row], so both
 * are cleared once the chain has been freed and the mirror is skipped later.
 */
void adj_matrix_free(AdjMatrix *matrix) {
  const size_t side = matrix->length;

  for (size_t row = 0; row < side; row++) {
    for (size_t col = 0; col < side; col++) {
      Node *chain = matrix->table[row][col];
      if (chain == NULL)
        continue;
      node_free(chain);
      matrix->table[col][row] = NULL;
      matrix->table[row][col] = NULL;
    }
    free(matrix->table[row]);
  }

  free(matrix->table);
  free(matrix);
}
|
81
|
+
|
82
|
+
AdjMatrix *adj_matrix_default() {
|
83
|
+
static char first_time = 1;
|
84
|
+
static AdjMatrix *ret_matrix;
|
85
|
+
if (first_time) {
|
86
|
+
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
87
|
+
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
|
88
|
+
for (size_t i = 0; i < length; i += 2) {
|
89
|
+
uint64_t code_1, code_2;
|
90
|
+
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
|
91
|
+
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
|
92
|
+
adj_matrix_add(ret_matrix, code_1, code_2);
|
93
|
+
}
|
94
|
+
first_time = 0;
|
95
|
+
}
|
96
|
+
return ret_matrix;
|
97
|
+
}
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "stdint.h"
|
4
|
+
|
5
|
+
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
6
|
+
#define ADJ_MATRIX_SEED 9527
|
7
|
+
|
8
|
+
/* One entry in the singly linked chain hanging off a matrix bucket. */
typedef struct _node {
  struct _node *next; /* following entry in the same chain, or NULL */
  uint64_t x;         /* first value recorded by adj_matrix_add() */
  uint64_t y;         /* second value recorded by adj_matrix_add() */
} Node;
|
12
|
+
|
13
|
+
typedef struct {
|
14
|
+
Node ***table;
|
15
|
+
uint32_t length;
|
16
|
+
} AdjMatrix;
|
17
|
+
|
18
|
+
/* Allocate a matrix; a length of 0 selects ADJ_MATRIX_DEFAULT_LENGTH. */
AdjMatrix *adj_matrix_new(uint32_t length);

/* Record the pair (x, y) in the matrix. */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);

/* Return 1 when the pair (x, y) is found in the matrix, 0 otherwise. */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);

/* Release the matrix together with every chain it holds. */
void adj_matrix_free(AdjMatrix *matrix);

/* Return the shared matrix built from the default adjacency table. */
AdjMatrix *adj_matrix_default();
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#include "codepoints.h"
|
2
|
+
#include "ruby.h"
|
3
|
+
#include "ruby/encoding.h"
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <string.h>
|
7
|
+
|
8
|
+
// this function is copied from string.c
|
9
|
+
static inline int single_byte_optimizable(VALUE str) {
|
10
|
+
rb_encoding *enc;
|
11
|
+
|
12
|
+
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
|
13
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
14
|
+
return 1;
|
15
|
+
|
16
|
+
enc = rb_enc_get(str);
|
17
|
+
if (rb_enc_mbmaxlen(enc) == 1)
|
18
|
+
return 1;
|
19
|
+
|
20
|
+
/* Conservative. Possibly single byte.
|
21
|
+
* "\xa1" in Shift_JIS for example. */
|
22
|
+
return 0;
|
23
|
+
}
|
24
|
+
|
25
|
+
/*
 * Fill codepoints with the code points of the Ruby string str.
 *
 * When single_byte_optimizable() accepts the string, each byte becomes one
 * code point. Otherwise the string is decoded with rb_enc_codepoint_len()
 * into a buffer that starts at 32 entries and doubles whenever it is full.
 *
 * On return data, length and size are all set; the caller releases data
 * with codepoints_free(). Raises NoMemoryError through rb_memerror() when
 * the buffer cannot be allocated or grown.
 *
 * NOTE(review): rb_enc_codepoint_len() is documented to raise on invalid
 * byte sequences; if it does, the buffer allocated here is not released --
 * confirm whether callers need protection against that.
 */
void codepoints_init(CodePoints *codepoints, VALUE str) {
  if (single_byte_optimizable(str)) {
    size_t length = RSTRING_LEN(str);
    const char *bytes = RSTRING_PTR(str);

    /* Request at least one element: malloc(0) is allowed to return NULL. */
    codepoints->size = length ? length : 1;
    codepoints->length = 0;
    codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
    if (codepoints->data == NULL)
      rb_memerror();

    for (size_t i = 0; i < length; i++)
      codepoints->data[i] = bytes[i] & 0xff;
    codepoints->length = length;
    return;
  }

  /* Take the frozen copy before allocating so a failure there leaks nothing. */
  str = rb_str_new_frozen(str);
  const char *ptr = RSTRING_PTR(str);
  const char *end = RSTRING_END(str);
  rb_encoding *enc = rb_enc_get(str);

  codepoints->length = 0;
  codepoints->size = 32;
  codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
  if (codepoints->data == NULL)
    rb_memerror();

  while (ptr < end) {
    int consumed = 0;
    uint32_t c = rb_enc_codepoint_len(ptr, end, &consumed, enc);

    if (codepoints->length == codepoints->size) {
      size_t grown_size = codepoints->size * 2;
      /* Keep the old pointer until realloc succeeds so it is never lost. */
      uint32_t *grown =
          realloc(codepoints->data, grown_size * sizeof(*codepoints->data));
      if (grown == NULL) {
        free(codepoints->data);
        codepoints->data = NULL;
        rb_memerror();
      }
      codepoints->data = grown;
      codepoints->size = grown_size;
    }

    codepoints->data[codepoints->length++] = c;
    ptr += consumed;
  }
  RB_GC_GUARD(str);
}
|
60
|
+
|
61
|
+
/* Release the buffer owned by codepoints; the struct itself is not freed. */
void codepoints_free(CodePoints *codepoints) {
  uint32_t *buffer = codepoints->data;
  free(buffer);
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include "ruby.h"
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
/* Growable array of code points filled in by codepoints_init(). */
typedef struct {
  uint32_t *data; /* heap buffer released by codepoints_free() */
  size_t length;  /* number of code points stored in data */
  size_t size;    /* capacity bookkeeping used while data is grown */
} CodePoints;
|
11
|
+
|
12
|
+
/* Fill the struct with the code points of the Ruby string str. */
void codepoints_init(CodePoints *, VALUE str);

/* Release the buffer that codepoints_init() allocated. */
void codepoints_free(CodePoints *);
|
@@ -0,0 +1,121 @@
|
|
1
|
+
#include "jaro.h"
|
2
|
+
#include "adj_matrix.h"
|
3
|
+
#include "codepoints.h"
|
4
|
+
|
5
|
+
#include <ctype.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <string.h>
|
8
|
+
|
9
|
+
#define DEFAULT_WEIGHT 0.1
|
10
|
+
#define DEFAULT_THRESHOLD 0.7
|
11
|
+
/* Exchange the values of two lvalues of the same type. */
#define SWAP(x, y)                                                             \
  do {                                                                         \
    __typeof__(x) swap_tmp_ = (x);                                             \
    (x) = (y);                                                                 \
    (y) = swap_tmp_;                                                           \
  } while (0)
|
17
|
+
|
18
|
+
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
|
19
|
+
.threshold = DEFAULT_THRESHOLD,
|
20
|
+
.ignore_case = 0,
|
21
|
+
.adj_table = 0};
|
22
|
+
|
23
|
+
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
24
|
+
uint32_t *codepoints2, size_t len2,
|
25
|
+
Options *opt) {
|
26
|
+
if (!len1 || !len2)
|
27
|
+
return 0.0;
|
28
|
+
|
29
|
+
if (len1 > len2) {
|
30
|
+
SWAP(codepoints1, codepoints2);
|
31
|
+
SWAP(len1, len2);
|
32
|
+
}
|
33
|
+
|
34
|
+
if (opt->ignore_case) {
|
35
|
+
for (size_t i = 0; i < len1; i++)
|
36
|
+
codepoints1[i] = tolower(codepoints1[i]);
|
37
|
+
for (size_t i = 0; i < len2; i++)
|
38
|
+
codepoints2[i] = tolower(codepoints2[i]);
|
39
|
+
}
|
40
|
+
|
41
|
+
int32_t window_size = (int32_t)len2 / 2 - 1;
|
42
|
+
if (window_size < 0)
|
43
|
+
window_size = 0;
|
44
|
+
|
45
|
+
char short_codes_flag[len1];
|
46
|
+
char long_codes_flag[len2];
|
47
|
+
memset(short_codes_flag, 0, len1);
|
48
|
+
memset(long_codes_flag, 0, len2);
|
49
|
+
|
50
|
+
// count number of matching characters
|
51
|
+
size_t match_count = 0;
|
52
|
+
for (size_t i = 0; i < len1; i++) {
|
53
|
+
size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
|
54
|
+
size_t right =
|
55
|
+
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
|
56
|
+
if (right > len2 - 1)
|
57
|
+
right = len2 - 1;
|
58
|
+
for (size_t j = left; j <= right; j++) {
|
59
|
+
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
|
60
|
+
short_codes_flag[i] = long_codes_flag[j] = 1;
|
61
|
+
match_count++;
|
62
|
+
break;
|
63
|
+
}
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
if (!match_count)
|
68
|
+
return 0.0;
|
69
|
+
|
70
|
+
// count number of transpositions
|
71
|
+
size_t transposition_count = 0, j = 0, k = 0;
|
72
|
+
for (size_t i = 0; i < len1; i++) {
|
73
|
+
if (short_codes_flag[i]) {
|
74
|
+
for (j = k; j < len2; j++) {
|
75
|
+
if (long_codes_flag[j]) {
|
76
|
+
k = j + 1;
|
77
|
+
break;
|
78
|
+
}
|
79
|
+
}
|
80
|
+
if (codepoints1[i] != codepoints2[j])
|
81
|
+
transposition_count++;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
// count similarities in nonmatched characters
|
86
|
+
size_t similar_count = 0;
|
87
|
+
if (opt->adj_table && len1 > match_count)
|
88
|
+
for (size_t i = 0; i < len1; i++)
|
89
|
+
if (!short_codes_flag[i])
|
90
|
+
for (size_t j = 0; j < len2; j++)
|
91
|
+
if (!long_codes_flag[j])
|
92
|
+
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
|
93
|
+
codepoints2[j])) {
|
94
|
+
similar_count += 3;
|
95
|
+
break;
|
96
|
+
}
|
97
|
+
|
98
|
+
double m = (double)match_count;
|
99
|
+
double t = (double)(transposition_count / 2);
|
100
|
+
if (opt->adj_table)
|
101
|
+
m = similar_count / 10.0 + m;
|
102
|
+
return (m / len1 + m / len2 + (m - t) / m) / 3;
|
103
|
+
}
|
104
|
+
|
105
|
+
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
106
|
+
uint32_t *codepoints2, size_t len2,
|
107
|
+
Options *opt) {
|
108
|
+
double jaro_distance =
|
109
|
+
jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
|
110
|
+
|
111
|
+
if (jaro_distance < opt->threshold)
|
112
|
+
return jaro_distance;
|
113
|
+
else {
|
114
|
+
size_t prefix = 0;
|
115
|
+
size_t max_4 = len1 > 4 ? 4 : len1;
|
116
|
+
for (prefix = 0;
|
117
|
+
prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
|
118
|
+
;
|
119
|
+
return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
|
120
|
+
}
|
121
|
+
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
/* Tunable parameters read by the scoring functions. */
typedef struct {
  double weight;    /* multiplier applied per shared prefix character */
  double threshold; /* scores below this get no prefix adjustment */
  char ignore_case; /* non-zero: lower-case both inputs before comparing */
  char adj_table;   /* non-zero: credit similar unmatched characters */
} Options;
|
10
|
+
|
11
|
+
/* Option values callers copy before applying their own overrides. */
extern const Options DEFAULT_OPTIONS;

/*
 * Jaro score of two code point arrays; 0.0 when either one is empty.
 * Both arrays may be modified in place when ignore_case is set.
 */
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                uint32_t *codepoints2, size_t len2, Options *);

/*
 * Jaro score with the common-prefix adjustment applied once the score
 * reaches the configured threshold.
 */
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *);
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#include "codepoints.h"
|
2
|
+
#include "jaro.h"
|
3
|
+
#include "ruby.h"
|
4
|
+
|
5
|
+
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
|
6
|
+
|
7
|
+
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
|
8
|
+
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
|
9
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
10
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
11
|
+
uint32_t *codepoints2, size_t len2,
|
12
|
+
Options *));
|
13
|
+
|
14
|
+
void Init_jaro_winkler_ext(void) {
|
15
|
+
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
16
|
+
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
17
|
+
rb_eInvalidWeightError =
|
18
|
+
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
19
|
+
rb_define_singleton_method(rb_mJaroWinkler, "distance",
|
20
|
+
rb_jaro_winkler_distance, -1);
|
21
|
+
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
|
22
|
+
-1);
|
23
|
+
}
|
24
|
+
|
25
|
+
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
26
|
+
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
27
|
+
uint32_t *codepoints2, size_t len2,
|
28
|
+
Options *)) {
|
29
|
+
VALUE s1, s2, opt;
|
30
|
+
CodePoints cp1, cp2;
|
31
|
+
|
32
|
+
rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
|
33
|
+
codepoints_init(&cp1, s1);
|
34
|
+
codepoints_init(&cp2, s2);
|
35
|
+
|
36
|
+
Options c_opt = DEFAULT_OPTIONS;
|
37
|
+
if (TYPE(opt) == T_HASH) {
|
38
|
+
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
39
|
+
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
40
|
+
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
41
|
+
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
42
|
+
if (!NIL_P(weight))
|
43
|
+
c_opt.weight = NUM2DBL(weight);
|
44
|
+
if (c_opt.weight > 0.25)
|
45
|
+
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
|
46
|
+
"otherwise the distance can become "
|
47
|
+
"larger than 1.");
|
48
|
+
if (!NIL_P(threshold))
|
49
|
+
c_opt.threshold = NUM2DBL(threshold);
|
50
|
+
if (!NIL_P(ignore_case))
|
51
|
+
c_opt.ignore_case =
|
52
|
+
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
53
|
+
if (!NIL_P(adj_table))
|
54
|
+
c_opt.adj_table =
|
55
|
+
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
56
|
+
}
|
57
|
+
VALUE ret = rb_float_new(
|
58
|
+
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
|
59
|
+
codepoints_free(&cp1);
|
60
|
+
codepoints_free(&cp2);
|
61
|
+
return ret;
|
62
|
+
}
|
63
|
+
|
64
|
+
/* JaroWinkler.jaro_distance: forwards to distance() with the Jaro scorer. */
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_distance_from_codes);
  return score;
}
|
67
|
+
|
68
|
+
/* JaroWinkler.distance: forwards to distance() with the Jaro-Winkler scorer. */
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_winkler_distance_from_codes);
  return score;
}
|
data/lib/jaro_winkler.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module JaroWinkler
  # Symmetric lookup of character pairs that count as "similar":
  # DEFAULT_ADJ_TABLE[a][b] is true when a and b are listed together below.
  #
  # Only the outer level creates entries on demand. The inner hashes are
  # plain hashes, so looking up a pair that is not listed yields nil. With a
  # default proc on the inner hashes too, every unlisted pair would produce a
  # new empty Hash, which is truthy and makes all pairs look adjacent.
  DEFAULT_ADJ_TABLE = Hash.new { |table, char| table[char] = {} }
  [
    %w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
    %w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
    %w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
    %w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |s1, s2|
    # Store both directions so lookups do not depend on argument order.
    DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'jaro_winkler/adjusting_table'
|
4
|
+
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false },
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
  }.freeze

  class << self
    # Jaro-Winkler score of two strings.
    #
    # options may contain :weight, :threshold, :ignore_case and :adj_table.
    # Raises InvalidWeightError when :weight is greater than 0.25.
    def distance(str1, str2, options = {})
      _distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    # Jaro score of two strings; honours :ignore_case and :adj_table.
    def jaro_distance(str1, str2, options = {})
      _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    private

    # Applies the common-prefix adjustment on top of the Jaro score once the
    # score reaches options[:threshold]; at most 4 prefix characters count.
    def _distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro_winkler].merge options
      raise InvalidWeightError if options[:weight] > 0.25
      jaro_distance = _jaro_distance(codes1, codes2, options)

      if jaro_distance < options[:threshold]
        jaro_distance
      else
        # Make codes1 the shorter array so the prefix scan stays in range.
        codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
        len1 = codes1.length
        max_4 = len1 > 4 ? 4 : len1
        prefix = 0
        prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
        jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
      end
    end

    # True only when the two code points are listed as a pair in
    # DEFAULT_ADJ_TABLE. Hash#fetch is used so the table's default proc is
    # never triggered: the lookup neither inserts entries nor mistakes an
    # auto-created (truthy) placeholder for a listed pair.
    def _adjacent?(code1, code2)
      row = DEFAULT_ADJ_TABLE.fetch(code1.chr(Encoding::UTF_8), nil)
      !row.nil? && row.fetch(code2.chr(Encoding::UTF_8), nil) == true
    end

    # Core Jaro computation on arrays of code points. With :ignore_case the
    # arrays are upper-cased in place (ASCII a-z only). Returns 0.0 when
    # either array is empty or nothing matches.
    def _jaro_distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro].merge options

      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      len1 = codes1.length
      len2 = codes2.length
      return 0.0 if len1 == 0 || len2 == 0

      if options[:ignore_case]
        codes1.map! { |c| c >= 97 && c <= 122 ? c - 32 : c }
        codes2.map! { |c| c >= 97 && c <= 122 ? c - 32 : c }
      end

      window = len2 / 2 - 1
      window = 0 if window < 0
      # Match flags are kept as Integer bit fields: bit i set means that
      # position i of the corresponding array has been matched.
      flags1 = 0
      flags2 = 0

      # count number of matching characters
      match_count = 0
      i = 0
      while i < len1
        left = i >= window ? i - window : 0
        right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
        j = left
        while j <= right
          if flags2[j] == 0 && codes1[i] == codes2[j]
            flags1 |= (1 << i)
            flags2 |= (1 << j)
            match_count += 1
            break
          end
          j += 1
        end
        i += 1
      end

      return 0.0 if match_count == 0

      # count number of transpositions
      transposition_count = j = k = 0
      i = 0
      while i < len1
        if flags1[i] == 1
          j = k
          while j < len2
            if flags2[j] == 1
              k = j + 1
              break
            end
            j += 1
          end
          transposition_count += 1 if codes1[i] != codes2[j]
        end
        i += 1
      end

      # count similarities in nonmatched characters
      similar_count = 0
      if options[:adj_table] && len1 > match_count
        i = 0
        while i < len1
          if flags1[i] == 0
            j = 0
            while j < len2
              if flags2[j] == 0 && _adjacent?(codes1[i], codes2[j])
                similar_count += 3
                break
              end
              j += 1
            end
          end
          i += 1
        end
      end

      m = match_count.to_f
      t = transposition_count / 2
      m = similar_count / 10.0 + m if options[:adj_table]
      (m / len1 + m / len2 + (m - t) / m) / 3
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jaro_winkler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.5.1
|
5
|
+
platform: universal-java-10
|
6
|
+
authors:
|
7
|
+
- Jian Weihang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-06-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '1.7'
|
19
|
+
name: bundler
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - "~>"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '12.0'
|
33
|
+
name: rake
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '12.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
name: rake-compiler
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
name: minitest
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: |-
|
70
|
+
jaro_winkler is an implementation of Jaro-Winkler \
|
71
|
+
distance algorithm which is written in C extension and will fallback to pure \
|
72
|
+
Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
|
73
|
+
C and Ruby implementation support any kind of string encoding, such as \
|
74
|
+
UTF-8, EUC-JP, Big5, etc.
|
75
|
+
email: tonytonyjan@gmail.com
|
76
|
+
executables: []
|
77
|
+
extensions: []
|
78
|
+
extra_rdoc_files: []
|
79
|
+
files:
|
80
|
+
- ext/jaro_winkler/adj_matrix.c
|
81
|
+
- ext/jaro_winkler/adj_matrix.h
|
82
|
+
- ext/jaro_winkler/codepoints.c
|
83
|
+
- ext/jaro_winkler/codepoints.h
|
84
|
+
- ext/jaro_winkler/jaro.c
|
85
|
+
- ext/jaro_winkler/jaro.h
|
86
|
+
- ext/jaro_winkler/jaro_winkler.c
|
87
|
+
- lib/jaro_winkler.rb
|
88
|
+
- lib/jaro_winkler/adjusting_table.rb
|
89
|
+
- lib/jaro_winkler/jaro_winkler_pure.rb
|
90
|
+
- lib/jaro_winkler/version.rb
|
91
|
+
homepage: https://github.com/tonytonyjan/jaro_winkler
|
92
|
+
licenses:
|
93
|
+
- MIT
|
94
|
+
metadata: {}
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
requirements: []
|
110
|
+
rubyforge_project:
|
111
|
+
rubygems_version: 2.6.14.1
|
112
|
+
signing_key:
|
113
|
+
specification_version: 4
|
114
|
+
summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
|
115
|
+
which supports any kind of string encoding.
|
116
|
+
test_files: []
|