jaro_winkler 1.5.1-java → 1.5.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/jaro_winkler/jaro_winkler_pure.rb +6 -0
- data/lib/jaro_winkler/version.rb +1 -1
- metadata +11 -18
- data/ext/jaro_winkler/adj_matrix.c +0 -97
- data/ext/jaro_winkler/adj_matrix.h +0 -22
- data/ext/jaro_winkler/codepoints.c +0 -61
- data/ext/jaro_winkler/codepoints.h +0 -13
- data/ext/jaro_winkler/jaro.c +0 -121
- data/ext/jaro_winkler/jaro.h +0 -17
- data/ext/jaro_winkler/jaro_winkler.c +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 47ffec43f4a902a16038fa817a68df9f5caea07ad68c4afe43c87b934b2ea1c8
|
4
|
+
data.tar.gz: 6a6cfd3195c5c03de0204fa25426a20ab0882cb8f012540022ac205f7c2cefad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cb9917bb131d2d5b51f99c1733dfb6ae6c695b1707edf8ff6f688d6a959c31536d5531cb31e020e6fe85df516ba1351f1f87d178d2f9aefa43beb7e99a916b5
|
7
|
+
data.tar.gz: 92973324ff6da1ba02bddd4d9a7a69b9badbc8c2556d2176b753e263e185ed0de84dd74310548f4ece54e47f558d581a1ab738e80a60c985438315c3a8a830d6
|
data/lib/jaro_winkler/jaro_winkler_pure.rb
CHANGED
@@ -14,10 +14,12 @@ module JaroWinkler
|
|
14
14
|
|
15
15
|
class << self
|
16
16
|
def distance(str1, str2, options = {})
|
17
|
+
validate!(str1, str2)
|
17
18
|
_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
18
19
|
end
|
19
20
|
|
20
21
|
def jaro_distance(str1, str2, options = {})
|
22
|
+
validate!(str1, str2)
|
21
23
|
_jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
|
22
24
|
end
|
23
25
|
|
@@ -125,5 +127,9 @@ module JaroWinkler
|
|
125
127
|
m = similar_count / 10.0 + m if options[:adj_table]
|
126
128
|
(m / len1 + m / len2 + (m - t) / m) / 3
|
127
129
|
end
|
130
|
+
|
131
|
+
def validate!(str1, str2)
|
132
|
+
raise TypeError unless str1.is_a?(String) && str2.is_a?(String)
|
133
|
+
end
|
128
134
|
end
|
129
135
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.1
|
4
|
+
version: 1.5.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
15
16
|
requirements:
|
16
17
|
- - "~>"
|
17
18
|
- !ruby/object:Gem::Version
|
18
19
|
version: '1.7'
|
19
|
-
name: bundler
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -25,12 +25,12 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
28
29
|
requirement: !ruby/object:Gem::Requirement
|
29
30
|
requirements:
|
30
31
|
- - "~>"
|
31
32
|
- !ruby/object:Gem::Version
|
32
33
|
version: '12.0'
|
33
|
-
name: rake
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -39,12 +39,12 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '12.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
42
43
|
requirement: !ruby/object:Gem::Requirement
|
43
44
|
requirements:
|
44
45
|
- - ">="
|
45
46
|
- !ruby/object:Gem::Version
|
46
47
|
version: '0'
|
47
|
-
name: rake-compiler
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -53,12 +53,12 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
56
57
|
requirement: !ruby/object:Gem::Requirement
|
57
58
|
requirements:
|
58
59
|
- - ">="
|
59
60
|
- !ruby/object:Gem::Version
|
60
61
|
version: '0'
|
61
|
-
name: minitest
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -77,13 +77,6 @@ executables: []
|
|
77
77
|
extensions: []
|
78
78
|
extra_rdoc_files: []
|
79
79
|
files:
|
80
|
-
- ext/jaro_winkler/adj_matrix.c
|
81
|
-
- ext/jaro_winkler/adj_matrix.h
|
82
|
-
- ext/jaro_winkler/codepoints.c
|
83
|
-
- ext/jaro_winkler/codepoints.h
|
84
|
-
- ext/jaro_winkler/jaro.c
|
85
|
-
- ext/jaro_winkler/jaro.h
|
86
|
-
- ext/jaro_winkler/jaro_winkler.c
|
87
80
|
- lib/jaro_winkler.rb
|
88
81
|
- lib/jaro_winkler/adjusting_table.rb
|
89
82
|
- lib/jaro_winkler/jaro_winkler_pure.rb
|
@@ -92,7 +85,7 @@ homepage: https://github.com/tonytonyjan/jaro_winkler
|
|
92
85
|
licenses:
|
93
86
|
- MIT
|
94
87
|
metadata: {}
|
95
|
-
post_install_message:
|
88
|
+
post_install_message:
|
96
89
|
rdoc_options: []
|
97
90
|
require_paths:
|
98
91
|
- lib
|
@@ -107,9 +100,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
100
|
- !ruby/object:Gem::Version
|
108
101
|
version: '0'
|
109
102
|
requirements: []
|
110
|
-
rubyforge_project:
|
111
|
-
rubygems_version: 2.
|
112
|
-
signing_key:
|
103
|
+
rubyforge_project:
|
104
|
+
rubygems_version: 2.7.3
|
105
|
+
signing_key:
|
113
106
|
specification_version: 4
|
114
107
|
summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
|
115
108
|
which supports any kind of string encoding.
|
data/ext/jaro_winkler/adj_matrix.c
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
#include "adj_matrix.h"
|
2
|
-
#include "codepoints.h"
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
const char *DEFAULT_ADJ_TABLE[] = {
|
6
|
-
"A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
|
7
|
-
"O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
|
8
|
-
"C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
|
9
|
-
"S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
|
10
|
-
"I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
|
11
|
-
"O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
|
12
|
-
|
13
|
-
void node_free(Node *head);
|
14
|
-
|
15
|
-
AdjMatrix *adj_matrix_new(uint32_t length) {
|
16
|
-
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
17
|
-
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
18
|
-
matrix->table = malloc(matrix->length * sizeof(Node **));
|
19
|
-
for (size_t i = 0; i < matrix->length; i++) {
|
20
|
-
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
|
21
|
-
for (size_t j = 0; j < matrix->length; j++)
|
22
|
-
matrix->table[i][j] = NULL;
|
23
|
-
}
|
24
|
-
return matrix;
|
25
|
-
}
|
26
|
-
|
27
|
-
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
28
|
-
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
29
|
-
ADJ_MATRIX_DEFAULT_LENGTH,
|
30
|
-
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
31
|
-
ADJ_MATRIX_DEFAULT_LENGTH;
|
32
|
-
Node *new_node = malloc(sizeof(Node));
|
33
|
-
new_node->x = h1;
|
34
|
-
new_node->y = h2;
|
35
|
-
new_node->next = NULL;
|
36
|
-
if (matrix->table[h1][h2] == NULL) {
|
37
|
-
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
|
38
|
-
} else {
|
39
|
-
Node *previous = NULL;
|
40
|
-
for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
|
41
|
-
previous = i;
|
42
|
-
previous->next = new_node;
|
43
|
-
}
|
44
|
-
}
|
45
|
-
|
46
|
-
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
|
47
|
-
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
|
48
|
-
ADJ_MATRIX_DEFAULT_LENGTH,
|
49
|
-
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
|
50
|
-
ADJ_MATRIX_DEFAULT_LENGTH;
|
51
|
-
Node *node = matrix->table[h1][h2];
|
52
|
-
if (node == NULL)
|
53
|
-
return 0;
|
54
|
-
else {
|
55
|
-
for (Node *i = node; i != NULL; i = i->next)
|
56
|
-
if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
|
57
|
-
return 1;
|
58
|
-
return 0;
|
59
|
-
}
|
60
|
-
}
|
61
|
-
|
62
|
-
void node_free(Node *head) {
|
63
|
-
if (head == NULL)
|
64
|
-
return;
|
65
|
-
node_free(head->next);
|
66
|
-
free(head);
|
67
|
-
}
|
68
|
-
|
69
|
-
void adj_matrix_free(AdjMatrix *matrix) {
|
70
|
-
for (size_t i = 0; i < matrix->length; i++) {
|
71
|
-
for (size_t j = 0; j < matrix->length; j++)
|
72
|
-
if (matrix->table[i][j] != NULL) {
|
73
|
-
node_free(matrix->table[i][j]);
|
74
|
-
matrix->table[i][j] = matrix->table[j][i] = NULL;
|
75
|
-
}
|
76
|
-
free(matrix->table[i]);
|
77
|
-
}
|
78
|
-
free(matrix->table);
|
79
|
-
free(matrix);
|
80
|
-
}
|
81
|
-
|
82
|
-
AdjMatrix *adj_matrix_default() {
|
83
|
-
static char first_time = 1;
|
84
|
-
static AdjMatrix *ret_matrix;
|
85
|
-
if (first_time) {
|
86
|
-
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
87
|
-
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
|
88
|
-
for (size_t i = 0; i < length; i += 2) {
|
89
|
-
uint64_t code_1, code_2;
|
90
|
-
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
|
91
|
-
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
|
92
|
-
adj_matrix_add(ret_matrix, code_1, code_2);
|
93
|
-
}
|
94
|
-
first_time = 0;
|
95
|
-
}
|
96
|
-
return ret_matrix;
|
97
|
-
}
|
data/ext/jaro_winkler/adj_matrix.h
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
|
3
|
-
#include "stdint.h"
|
4
|
-
|
5
|
-
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
6
|
-
#define ADJ_MATRIX_SEED 9527
|
7
|
-
|
8
|
-
typedef struct _node {
|
9
|
-
struct _node *next;
|
10
|
-
uint64_t x, y;
|
11
|
-
} Node;
|
12
|
-
|
13
|
-
typedef struct {
|
14
|
-
Node ***table;
|
15
|
-
uint32_t length;
|
16
|
-
} AdjMatrix;
|
17
|
-
|
18
|
-
AdjMatrix *adj_matrix_new(uint32_t length);
|
19
|
-
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
20
|
-
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
|
21
|
-
void adj_matrix_free(AdjMatrix *matrix);
|
22
|
-
AdjMatrix *adj_matrix_default();
|
data/ext/jaro_winkler/codepoints.c
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#include "codepoints.h"
|
2
|
-
#include "ruby.h"
|
3
|
-
#include "ruby/encoding.h"
|
4
|
-
#include <stdint.h>
|
5
|
-
#include <stdlib.h>
|
6
|
-
#include <string.h>
|
7
|
-
|
8
|
-
// this function is copied from string.c
|
9
|
-
static inline int single_byte_optimizable(VALUE str) {
|
10
|
-
rb_encoding *enc;
|
11
|
-
|
12
|
-
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
|
13
|
-
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
14
|
-
return 1;
|
15
|
-
|
16
|
-
enc = rb_enc_get(str);
|
17
|
-
if (rb_enc_mbmaxlen(enc) == 1)
|
18
|
-
return 1;
|
19
|
-
|
20
|
-
/* Conservative. Possibly single byte.
|
21
|
-
* "\xa1" in Shift_JIS for example. */
|
22
|
-
return 0;
|
23
|
-
}
|
24
|
-
|
25
|
-
void codepoints_init(CodePoints *codepoints, VALUE str) {
|
26
|
-
size_t i, length;
|
27
|
-
int32_t n;
|
28
|
-
uint32_t c;
|
29
|
-
const char *ptr, *end;
|
30
|
-
rb_encoding *enc;
|
31
|
-
|
32
|
-
if (single_byte_optimizable(str)) {
|
33
|
-
length = RSTRING_LEN(str);
|
34
|
-
ptr = RSTRING_PTR(str);
|
35
|
-
codepoints->data = malloc(length * sizeof(*codepoints->data));
|
36
|
-
for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
|
37
|
-
codepoints->data[i] = ptr[i] & 0xff;
|
38
|
-
} else {
|
39
|
-
codepoints->length = 0;
|
40
|
-
codepoints->size = 32;
|
41
|
-
codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
|
42
|
-
str = rb_str_new_frozen(str);
|
43
|
-
ptr = RSTRING_PTR(str);
|
44
|
-
end = RSTRING_END(str);
|
45
|
-
enc = rb_enc_get(str);
|
46
|
-
|
47
|
-
while (ptr < end) {
|
48
|
-
c = rb_enc_codepoint_len(ptr, end, &n, enc);
|
49
|
-
if (codepoints->length == codepoints->size) {
|
50
|
-
codepoints->size *= 2;
|
51
|
-
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
|
52
|
-
codepoints->size);
|
53
|
-
}
|
54
|
-
codepoints->data[codepoints->length++] = c;
|
55
|
-
ptr += n;
|
56
|
-
}
|
57
|
-
RB_GC_GUARD(str);
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
|
data/ext/jaro_winkler/codepoints.h
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
#include "ruby.h"
|
3
|
-
#include <stddef.h>
|
4
|
-
#include <stdint.h>
|
5
|
-
|
6
|
-
typedef struct {
|
7
|
-
uint32_t *data;
|
8
|
-
size_t length;
|
9
|
-
size_t size;
|
10
|
-
} CodePoints;
|
11
|
-
|
12
|
-
void codepoints_init(CodePoints *, VALUE str);
|
13
|
-
void codepoints_free(CodePoints *);
|
data/ext/jaro_winkler/jaro.c
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
#include "jaro.h"
|
2
|
-
#include "adj_matrix.h"
|
3
|
-
#include "codepoints.h"
|
4
|
-
|
5
|
-
#include <ctype.h>
|
6
|
-
#include <stdlib.h>
|
7
|
-
#include <string.h>
|
8
|
-
|
9
|
-
#define DEFAULT_WEIGHT 0.1
|
10
|
-
#define DEFAULT_THRESHOLD 0.7
|
11
|
-
#define SWAP(x, y) \
|
12
|
-
do { \
|
13
|
-
__typeof__(x) SWAP = x; \
|
14
|
-
x = y; \
|
15
|
-
y = SWAP; \
|
16
|
-
} while (0)
|
17
|
-
|
18
|
-
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
|
19
|
-
.threshold = DEFAULT_THRESHOLD,
|
20
|
-
.ignore_case = 0,
|
21
|
-
.adj_table = 0};
|
22
|
-
|
23
|
-
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
24
|
-
uint32_t *codepoints2, size_t len2,
|
25
|
-
Options *opt) {
|
26
|
-
if (!len1 || !len2)
|
27
|
-
return 0.0;
|
28
|
-
|
29
|
-
if (len1 > len2) {
|
30
|
-
SWAP(codepoints1, codepoints2);
|
31
|
-
SWAP(len1, len2);
|
32
|
-
}
|
33
|
-
|
34
|
-
if (opt->ignore_case) {
|
35
|
-
for (size_t i = 0; i < len1; i++)
|
36
|
-
codepoints1[i] = tolower(codepoints1[i]);
|
37
|
-
for (size_t i = 0; i < len2; i++)
|
38
|
-
codepoints2[i] = tolower(codepoints2[i]);
|
39
|
-
}
|
40
|
-
|
41
|
-
int32_t window_size = (int32_t)len2 / 2 - 1;
|
42
|
-
if (window_size < 0)
|
43
|
-
window_size = 0;
|
44
|
-
|
45
|
-
char short_codes_flag[len1];
|
46
|
-
char long_codes_flag[len2];
|
47
|
-
memset(short_codes_flag, 0, len1);
|
48
|
-
memset(long_codes_flag, 0, len2);
|
49
|
-
|
50
|
-
// count number of matching characters
|
51
|
-
size_t match_count = 0;
|
52
|
-
for (size_t i = 0; i < len1; i++) {
|
53
|
-
size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
|
54
|
-
size_t right =
|
55
|
-
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
|
56
|
-
if (right > len2 - 1)
|
57
|
-
right = len2 - 1;
|
58
|
-
for (size_t j = left; j <= right; j++) {
|
59
|
-
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
|
60
|
-
short_codes_flag[i] = long_codes_flag[j] = 1;
|
61
|
-
match_count++;
|
62
|
-
break;
|
63
|
-
}
|
64
|
-
}
|
65
|
-
}
|
66
|
-
|
67
|
-
if (!match_count)
|
68
|
-
return 0.0;
|
69
|
-
|
70
|
-
// count number of transpositions
|
71
|
-
size_t transposition_count = 0, j = 0, k = 0;
|
72
|
-
for (size_t i = 0; i < len1; i++) {
|
73
|
-
if (short_codes_flag[i]) {
|
74
|
-
for (j = k; j < len2; j++) {
|
75
|
-
if (long_codes_flag[j]) {
|
76
|
-
k = j + 1;
|
77
|
-
break;
|
78
|
-
}
|
79
|
-
}
|
80
|
-
if (codepoints1[i] != codepoints2[j])
|
81
|
-
transposition_count++;
|
82
|
-
}
|
83
|
-
}
|
84
|
-
|
85
|
-
// count similarities in nonmatched characters
|
86
|
-
size_t similar_count = 0;
|
87
|
-
if (opt->adj_table && len1 > match_count)
|
88
|
-
for (size_t i = 0; i < len1; i++)
|
89
|
-
if (!short_codes_flag[i])
|
90
|
-
for (size_t j = 0; j < len2; j++)
|
91
|
-
if (!long_codes_flag[j])
|
92
|
-
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
|
93
|
-
codepoints2[j])) {
|
94
|
-
similar_count += 3;
|
95
|
-
break;
|
96
|
-
}
|
97
|
-
|
98
|
-
double m = (double)match_count;
|
99
|
-
double t = (double)(transposition_count / 2);
|
100
|
-
if (opt->adj_table)
|
101
|
-
m = similar_count / 10.0 + m;
|
102
|
-
return (m / len1 + m / len2 + (m - t) / m) / 3;
|
103
|
-
}
|
104
|
-
|
105
|
-
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
106
|
-
uint32_t *codepoints2, size_t len2,
|
107
|
-
Options *opt) {
|
108
|
-
double jaro_distance =
|
109
|
-
jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
|
110
|
-
|
111
|
-
if (jaro_distance < opt->threshold)
|
112
|
-
return jaro_distance;
|
113
|
-
else {
|
114
|
-
size_t prefix = 0;
|
115
|
-
size_t max_4 = len1 > 4 ? 4 : len1;
|
116
|
-
for (prefix = 0;
|
117
|
-
prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
|
118
|
-
;
|
119
|
-
return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
|
120
|
-
}
|
121
|
-
}
|
data/ext/jaro_winkler/jaro.h
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
|
3
|
-
#include <stddef.h>
|
4
|
-
#include <stdint.h>
|
5
|
-
|
6
|
-
typedef struct {
|
7
|
-
double weight, threshold;
|
8
|
-
char ignore_case, adj_table;
|
9
|
-
} Options;
|
10
|
-
|
11
|
-
extern const Options DEFAULT_OPTIONS;
|
12
|
-
|
13
|
-
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
14
|
-
uint32_t *codepoints2, size_t len2, Options *);
|
15
|
-
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
16
|
-
uint32_t *codepoints2, size_t len2,
|
17
|
-
Options *);
|
data/ext/jaro_winkler/jaro_winkler.c
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
#include "codepoints.h"
|
2
|
-
#include "jaro.h"
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
|
6
|
-
|
7
|
-
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
|
8
|
-
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
|
9
|
-
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
10
|
-
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
11
|
-
uint32_t *codepoints2, size_t len2,
|
12
|
-
Options *));
|
13
|
-
|
14
|
-
void Init_jaro_winkler_ext(void) {
|
15
|
-
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
16
|
-
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
17
|
-
rb_eInvalidWeightError =
|
18
|
-
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
19
|
-
rb_define_singleton_method(rb_mJaroWinkler, "distance",
|
20
|
-
rb_jaro_winkler_distance, -1);
|
21
|
-
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
|
22
|
-
-1);
|
23
|
-
}
|
24
|
-
|
25
|
-
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
26
|
-
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
27
|
-
uint32_t *codepoints2, size_t len2,
|
28
|
-
Options *)) {
|
29
|
-
VALUE s1, s2, opt;
|
30
|
-
CodePoints cp1, cp2;
|
31
|
-
|
32
|
-
rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
|
33
|
-
codepoints_init(&cp1, s1);
|
34
|
-
codepoints_init(&cp2, s2);
|
35
|
-
|
36
|
-
Options c_opt = DEFAULT_OPTIONS;
|
37
|
-
if (TYPE(opt) == T_HASH) {
|
38
|
-
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
39
|
-
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
40
|
-
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
41
|
-
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
42
|
-
if (!NIL_P(weight))
|
43
|
-
c_opt.weight = NUM2DBL(weight);
|
44
|
-
if (c_opt.weight > 0.25)
|
45
|
-
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
|
46
|
-
"otherwise the distance can become "
|
47
|
-
"larger than 1.");
|
48
|
-
if (!NIL_P(threshold))
|
49
|
-
c_opt.threshold = NUM2DBL(threshold);
|
50
|
-
if (!NIL_P(ignore_case))
|
51
|
-
c_opt.ignore_case =
|
52
|
-
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
53
|
-
if (!NIL_P(adj_table))
|
54
|
-
c_opt.adj_table =
|
55
|
-
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
56
|
-
}
|
57
|
-
VALUE ret = rb_float_new(
|
58
|
-
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
|
59
|
-
codepoints_free(&cp1);
|
60
|
-
codepoints_free(&cp2);
|
61
|
-
return ret;
|
62
|
-
}
|
63
|
-
|
64
|
-
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
|
65
|
-
return distance(argc, argv, self, jaro_distance_from_codes);
|
66
|
-
}
|
67
|
-
|
68
|
-
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
|
69
|
-
return distance(argc, argv, self, jaro_winkler_distance_from_codes);
|
70
|
-
}
|