jaro_winkler 1.5.1-java → 1.5.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/jaro_winkler/jaro_winkler_pure.rb +6 -0
- data/lib/jaro_winkler/version.rb +1 -1
- metadata +11 -18
- data/ext/jaro_winkler/adj_matrix.c +0 -97
- data/ext/jaro_winkler/adj_matrix.h +0 -22
- data/ext/jaro_winkler/codepoints.c +0 -61
- data/ext/jaro_winkler/codepoints.h +0 -13
- data/ext/jaro_winkler/jaro.c +0 -121
- data/ext/jaro_winkler/jaro.h +0 -17
- data/ext/jaro_winkler/jaro_winkler.c +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 47ffec43f4a902a16038fa817a68df9f5caea07ad68c4afe43c87b934b2ea1c8
|
4
|
+
data.tar.gz: 6a6cfd3195c5c03de0204fa25426a20ab0882cb8f012540022ac205f7c2cefad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cb9917bb131d2d5b51f99c1733dfb6ae6c695b1707edf8ff6f688d6a959c31536d5531cb31e020e6fe85df516ba1351f1f87d178d2f9aefa43beb7e99a916b5
|
7
|
+
data.tar.gz: 92973324ff6da1ba02bddd4d9a7a69b9badbc8c2556d2176b753e263e185ed0de84dd74310548f4ece54e47f558d581a1ab738e80a60c985438315c3a8a830d6
|
@@ -14,10 +14,12 @@ module JaroWinkler
|
|
14
14
|
|
15
15
|
class << self
|
16
16
|
# Validates both arguments, converts each string to an array of code points
# and delegates the computation to _distance with the given options.
def distance(str1, str2, options = {})
  validate!(str1, str2)
  codes1 = str1.codepoints.to_a
  codes2 = str2.codepoints.to_a
  _distance(codes1, codes2, options)
end
|
19
20
|
|
20
21
|
# Validates both arguments, converts each string to an array of code points
# and delegates the computation to _jaro_distance with the given options.
def jaro_distance(str1, str2, options = {})
  validate!(str1, str2)
  codes1 = str1.codepoints.to_a
  codes2 = str2.codepoints.to_a
  _jaro_distance(codes1, codes2, options)
end
|
23
25
|
|
@@ -125,5 +127,9 @@ module JaroWinkler
|
|
125
127
|
m = similar_count / 10.0 + m if options[:adj_table]
|
126
128
|
(m / len1 + m / len2 + (m - t) / m) / 3
|
127
129
|
end
|
130
|
+
|
131
|
+
# Ensures both arguments are Strings.
#
# Raises TypeError for the first argument that is not a String; the message
# names the class that was actually received so the caller can see what went
# wrong. Returns nil when both arguments are valid.
def validate!(str1, str2)
  [str1, str2].each do |arg|
    next if arg.is_a?(String)

    raise TypeError, "wrong argument type #{arg.class} (expected String)"
  end
  nil
end
|
128
134
|
end
|
129
135
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.1
|
4
|
+
version: 1.5.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
15
16
|
requirements:
|
16
17
|
- - "~>"
|
17
18
|
- !ruby/object:Gem::Version
|
18
19
|
version: '1.7'
|
19
|
-
name: bundler
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -25,12 +25,12 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
28
29
|
requirement: !ruby/object:Gem::Requirement
|
29
30
|
requirements:
|
30
31
|
- - "~>"
|
31
32
|
- !ruby/object:Gem::Version
|
32
33
|
version: '12.0'
|
33
|
-
name: rake
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -39,12 +39,12 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '12.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
42
43
|
requirement: !ruby/object:Gem::Requirement
|
43
44
|
requirements:
|
44
45
|
- - ">="
|
45
46
|
- !ruby/object:Gem::Version
|
46
47
|
version: '0'
|
47
|
-
name: rake-compiler
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -53,12 +53,12 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
56
57
|
requirement: !ruby/object:Gem::Requirement
|
57
58
|
requirements:
|
58
59
|
- - ">="
|
59
60
|
- !ruby/object:Gem::Version
|
60
61
|
version: '0'
|
61
|
-
name: minitest
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -77,13 +77,6 @@ executables: []
|
|
77
77
|
extensions: []
|
78
78
|
extra_rdoc_files: []
|
79
79
|
files:
|
80
|
-
- ext/jaro_winkler/adj_matrix.c
|
81
|
-
- ext/jaro_winkler/adj_matrix.h
|
82
|
-
- ext/jaro_winkler/codepoints.c
|
83
|
-
- ext/jaro_winkler/codepoints.h
|
84
|
-
- ext/jaro_winkler/jaro.c
|
85
|
-
- ext/jaro_winkler/jaro.h
|
86
|
-
- ext/jaro_winkler/jaro_winkler.c
|
87
80
|
- lib/jaro_winkler.rb
|
88
81
|
- lib/jaro_winkler/adjusting_table.rb
|
89
82
|
- lib/jaro_winkler/jaro_winkler_pure.rb
|
@@ -92,7 +85,7 @@ homepage: https://github.com/tonytonyjan/jaro_winkler
|
|
92
85
|
licenses:
|
93
86
|
- MIT
|
94
87
|
metadata: {}
|
95
|
-
post_install_message:
|
88
|
+
post_install_message:
|
96
89
|
rdoc_options: []
|
97
90
|
require_paths:
|
98
91
|
- lib
|
@@ -107,9 +100,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
100
|
- !ruby/object:Gem::Version
|
108
101
|
version: '0'
|
109
102
|
requirements: []
|
110
|
-
rubyforge_project:
|
111
|
-
rubygems_version: 2.
|
112
|
-
signing_key:
|
103
|
+
rubyforge_project:
|
104
|
+
rubygems_version: 2.7.3
|
105
|
+
signing_key:
|
113
106
|
specification_version: 4
|
114
107
|
summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
|
115
108
|
which supports any kind of string encoding.
|
@@ -1,97 +0,0 @@
|
|
1
|
-
#include "adj_matrix.h"
|
2
|
-
#include "codepoints.h"
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
/*
 * Default adjustment table, stored flat: entries 2k and 2k+1 form one pair.
 * Only the first byte of each string is used when the table is loaded into
 * the default matrix.
 */
const char *DEFAULT_ADJ_TABLE[] = {
    "A", "E", /**/ "A", "I", /**/ "A", "O", /**/ "A", "U",
    "B", "V", /**/ "E", "I", /**/ "E", "O", /**/ "E", "U",
    "I", "O", /**/ "I", "U", /**/ "O", "U", /**/ "I", "Y",
    "E", "Y", /**/ "C", "G", /**/ "E", "F", /**/ "W", "U",
    "W", "V", /**/ "X", "K", /**/ "S", "Z", /**/ "X", "S",
    "Q", "C", /**/ "U", "V", /**/ "M", "N", /**/ "L", "I",
    "Q", "O", /**/ "P", "R", /**/ "I", "J", /**/ "2", "Z",
    "5", "S", /**/ "8", "B", /**/ "1", "I", /**/ "1", "L",
    "0", "O", /**/ "0", "Q", /**/ "C", "K", /**/ "G", "J",
    "E", " ", /**/ "Y", " ", /**/ "S", " "};
|
12
|
-
|
13
|
-
void node_free(Node *head);
|
14
|
-
|
15
|
-
AdjMatrix *adj_matrix_new(uint32_t length) {
|
16
|
-
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
17
|
-
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
18
|
-
matrix->table = malloc(matrix->length * sizeof(Node **));
|
19
|
-
for (size_t i = 0; i < matrix->length; i++) {
|
20
|
-
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
|
21
|
-
for (size_t j = 0; j < matrix->length; j++)
|
22
|
-
matrix->table[i][j] = NULL;
|
23
|
-
}
|
24
|
-
return matrix;
|
25
|
-
}
|
26
|
-
|
27
|
-
/**
 * Records the unordered pair (x, y) in the matrix.
 *
 * Both codes are hashed into bucket indices modulo the matrix's own length,
 * so matrices of any size stay in bounds. The node stores the original
 * codes, not their hashes, so adj_matrix_find can tell colliding pairs
 * apart. The cells [h1][h2] and [h2][h1] share one chain head.
 *
 * Raises Ruby's NoMemoryError (via rb_memerror) if the node cannot be
 * allocated.
 */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
  size_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) % matrix->length;
  size_t h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) % matrix->length;

  Node *new_node = malloc(sizeof(*new_node));
  if (new_node == NULL)
    rb_memerror();
  new_node->x = x;
  new_node->y = y;
  new_node->next = NULL;

  Node *head = matrix->table[h1][h2];
  if (head == NULL) {
    matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
    return;
  }

  /* Bucket already in use: append to the end of its chain. */
  Node *tail = head;
  while (tail->next != NULL)
    tail = tail->next;
  tail->next = new_node;
}

/**
 * Reports whether the unordered pair (x, y) was added to the matrix.
 *
 * @return 1 when a node in the pair's bucket holds exactly these two codes
 *         (in either order), 0 otherwise.
 */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
  size_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) % matrix->length;
  size_t h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) % matrix->length;

  for (Node *node = matrix->table[h1][h2]; node != NULL; node = node->next) {
    if ((node->x == x && node->y == y) || (node->x == y && node->y == x))
      return 1;
  }
  return 0;
}
|
61
|
-
|
62
|
-
/**
 * Frees every node of the singly linked chain starting at head.
 *
 * Walks the chain iteratively, so stack use does not grow with chain length.
 * A NULL head is a no-op.
 */
void node_free(Node *head) {
  while (head != NULL) {
    Node *next = head->next; /* read the link before the node is released */
    free(head);
    head = next;
  }
}
|
68
|
-
|
69
|
-
/**
 * Releases a matrix together with all of its chains and rows.
 *
 * Like free(), passing NULL is a no-op.
 */
void adj_matrix_free(AdjMatrix *matrix) {
  if (matrix == NULL)
    return;

  for (size_t row = 0; row < matrix->length; row++) {
    for (size_t col = 0; col < matrix->length; col++) {
      if (matrix->table[row][col] == NULL)
        continue;
      node_free(matrix->table[row][col]);
      matrix->table[row][col] = NULL;
      /*
       * The mirrored cell shares the same chain head, so clear it to avoid a
       * second free. Only rows below this one are still allocated; rows
       * above were already released and must not be written to.
       */
      if (col > row)
        matrix->table[col][row] = NULL;
    }
    free(matrix->table[row]);
  }
  free(matrix->table);
  free(matrix);
}
|
81
|
-
|
82
|
-
AdjMatrix *adj_matrix_default() {
|
83
|
-
static char first_time = 1;
|
84
|
-
static AdjMatrix *ret_matrix;
|
85
|
-
if (first_time) {
|
86
|
-
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
87
|
-
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
|
88
|
-
for (size_t i = 0; i < length; i += 2) {
|
89
|
-
uint64_t code_1, code_2;
|
90
|
-
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
|
91
|
-
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
|
92
|
-
adj_matrix_add(ret_matrix, code_1, code_2);
|
93
|
-
}
|
94
|
-
first_time = 0;
|
95
|
-
}
|
96
|
-
return ret_matrix;
|
97
|
-
}
|
@@ -1,22 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
|
3
|
-
#include "stdint.h"
|
4
|
-
|
5
|
-
/* Side length used when adj_matrix_new() is given 0. */
#define ADJ_MATRIX_DEFAULT_LENGTH 958
/* Seed passed to the hash function that maps codes to bucket indices. */
#define ADJ_MATRIX_SEED 9527

/* One entry of a bucket's singly linked chain. */
typedef struct _node {
  struct _node *next; /* next entry in the same bucket, or NULL */
  uint64_t x, y;      /* the two values identifying the recorded pair */
} Node;

/* Square table of chain heads: table[i][j] for 0 <= i, j < length. */
typedef struct {
  Node ***table;
  uint32_t length;
} AdjMatrix;

/* Creates a length x length matrix; 0 selects ADJ_MATRIX_DEFAULT_LENGTH. */
AdjMatrix *adj_matrix_new(uint32_t length);
/* Records the pair (x, y). */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
/* Returns 1 if the pair (x, y) is found in the matrix, 0 otherwise. */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
/* Releases a matrix created by adj_matrix_new(). */
void adj_matrix_free(AdjMatrix *matrix);
/* Returns the shared default matrix, building it on first use. */
AdjMatrix *adj_matrix_default(void);
|
@@ -1,61 +0,0 @@
|
|
1
|
-
#include "codepoints.h"
|
2
|
-
#include "ruby.h"
|
3
|
-
#include "ruby/encoding.h"
|
4
|
-
#include <stdint.h>
|
5
|
-
#include <stdlib.h>
|
6
|
-
#include <string.h>
|
7
|
-
|
8
|
-
// this function is copied from string.c
|
9
|
-
static inline int single_byte_optimizable(VALUE str) {
|
10
|
-
rb_encoding *enc;
|
11
|
-
|
12
|
-
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
|
13
|
-
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
14
|
-
return 1;
|
15
|
-
|
16
|
-
enc = rb_enc_get(str);
|
17
|
-
if (rb_enc_mbmaxlen(enc) == 1)
|
18
|
-
return 1;
|
19
|
-
|
20
|
-
/* Conservative. Possibly single byte.
|
21
|
-
* "\xa1" in Shift_JIS for example. */
|
22
|
-
return 0;
|
23
|
-
}
|
24
|
-
|
25
|
-
/**
 * Fills codepoints with the code points of the Ruby string str.
 *
 * On return, data holds `length` code points in a malloc'ed buffer of `size`
 * elements; release it with codepoints_free(). At least one element is
 * always allocated so an empty string does not depend on malloc(0).
 *
 * Raises Ruby's NoMemoryError (via rb_memerror) when the buffer cannot be
 * allocated. Errors raised by rb_enc_codepoint_len() can only occur during
 * the counting pass, before any buffer exists, so nothing leaks.
 */
void codepoints_init(CodePoints *codepoints, VALUE str) {
  uint32_t *data;

  if (single_byte_optimizable(str)) {
    /* One byte per character: copy the bytes straight across. */
    size_t length = RSTRING_LEN(str);
    const char *bytes = RSTRING_PTR(str);
    size_t capacity = length ? length : 1;

    data = malloc(capacity * sizeof(*data));
    if (data == NULL)
      rb_memerror();
    for (size_t i = 0; i < length; i++)
      data[i] = bytes[i] & 0xff;

    codepoints->data = data;
    codepoints->length = length;
    codepoints->size = capacity;
    return;
  }

  str = rb_str_new_frozen(str);
  const char *begin = RSTRING_PTR(str);
  const char *end = RSTRING_END(str);
  rb_encoding *enc = rb_enc_get(str);
  int n = 0;

  /* Pass 1: count the code points while nothing is allocated yet. */
  size_t count = 0;
  for (const char *ptr = begin; ptr < end; ptr += n) {
    rb_enc_codepoint_len(ptr, end, &n, enc);
    count++;
  }

  size_t capacity = count ? count : 1;
  data = malloc(capacity * sizeof(*data));
  if (data == NULL)
    rb_memerror();

  /* Pass 2: decode into the exactly sized buffer. */
  size_t filled = 0;
  for (const char *ptr = begin; ptr < end; ptr += n)
    data[filled++] = rb_enc_codepoint_len(ptr, end, &n, enc);
  RB_GC_GUARD(str);

  codepoints->data = data;
  codepoints->length = filled;
  codepoints->size = capacity;
}
|
60
|
-
|
61
|
-
/**
 * Releases the buffer owned by codepoints and resets the structure.
 *
 * Passing NULL is a no-op, and freeing the same structure twice is harmless
 * because data is cleared after the first call.
 */
void codepoints_free(CodePoints *codepoints) {
  if (codepoints == NULL)
    return;
  free(codepoints->data);
  codepoints->data = NULL;
  codepoints->length = 0;
  codepoints->size = 0;
}
|
@@ -1,13 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
#include "ruby.h"
|
3
|
-
#include <stddef.h>
|
4
|
-
#include <stdint.h>
|
5
|
-
|
6
|
-
typedef struct {
|
7
|
-
uint32_t *data;
|
8
|
-
size_t length;
|
9
|
-
size_t size;
|
10
|
-
} CodePoints;
|
11
|
-
|
12
|
-
void codepoints_init(CodePoints *, VALUE str);
|
13
|
-
void codepoints_free(CodePoints *);
|
data/ext/jaro_winkler/jaro.c
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
#include "jaro.h"
|
2
|
-
#include "adj_matrix.h"
|
3
|
-
#include "codepoints.h"
|
4
|
-
|
5
|
-
#include <ctype.h>
|
6
|
-
#include <stdlib.h>
|
7
|
-
#include <string.h>
|
8
|
-
|
9
|
-
/* Values used to initialise DEFAULT_OPTIONS. */
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7

/*
 * Exchanges two lvalues of the same type. Arguments are parenthesised so
 * that arbitrary lvalue expressions expand correctly, and the temporary has
 * its own name instead of shadowing the macro.
 */
#define SWAP(x, y)                                                             \
  do {                                                                         \
    __typeof__(x) swap_tmp_ = (x);                                             \
    (x) = (y);                                                                 \
    (y) = swap_tmp_;                                                           \
  } while (0)
|
17
|
-
|
18
|
-
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
|
19
|
-
.threshold = DEFAULT_THRESHOLD,
|
20
|
-
.ignore_case = 0,
|
21
|
-
.adj_table = 0};
|
22
|
-
|
23
|
-
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
24
|
-
uint32_t *codepoints2, size_t len2,
|
25
|
-
Options *opt) {
|
26
|
-
if (!len1 || !len2)
|
27
|
-
return 0.0;
|
28
|
-
|
29
|
-
if (len1 > len2) {
|
30
|
-
SWAP(codepoints1, codepoints2);
|
31
|
-
SWAP(len1, len2);
|
32
|
-
}
|
33
|
-
|
34
|
-
if (opt->ignore_case) {
|
35
|
-
for (size_t i = 0; i < len1; i++)
|
36
|
-
codepoints1[i] = tolower(codepoints1[i]);
|
37
|
-
for (size_t i = 0; i < len2; i++)
|
38
|
-
codepoints2[i] = tolower(codepoints2[i]);
|
39
|
-
}
|
40
|
-
|
41
|
-
int32_t window_size = (int32_t)len2 / 2 - 1;
|
42
|
-
if (window_size < 0)
|
43
|
-
window_size = 0;
|
44
|
-
|
45
|
-
char short_codes_flag[len1];
|
46
|
-
char long_codes_flag[len2];
|
47
|
-
memset(short_codes_flag, 0, len1);
|
48
|
-
memset(long_codes_flag, 0, len2);
|
49
|
-
|
50
|
-
// count number of matching characters
|
51
|
-
size_t match_count = 0;
|
52
|
-
for (size_t i = 0; i < len1; i++) {
|
53
|
-
size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
|
54
|
-
size_t right =
|
55
|
-
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
|
56
|
-
if (right > len2 - 1)
|
57
|
-
right = len2 - 1;
|
58
|
-
for (size_t j = left; j <= right; j++) {
|
59
|
-
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
|
60
|
-
short_codes_flag[i] = long_codes_flag[j] = 1;
|
61
|
-
match_count++;
|
62
|
-
break;
|
63
|
-
}
|
64
|
-
}
|
65
|
-
}
|
66
|
-
|
67
|
-
if (!match_count)
|
68
|
-
return 0.0;
|
69
|
-
|
70
|
-
// count number of transpositions
|
71
|
-
size_t transposition_count = 0, j = 0, k = 0;
|
72
|
-
for (size_t i = 0; i < len1; i++) {
|
73
|
-
if (short_codes_flag[i]) {
|
74
|
-
for (j = k; j < len2; j++) {
|
75
|
-
if (long_codes_flag[j]) {
|
76
|
-
k = j + 1;
|
77
|
-
break;
|
78
|
-
}
|
79
|
-
}
|
80
|
-
if (codepoints1[i] != codepoints2[j])
|
81
|
-
transposition_count++;
|
82
|
-
}
|
83
|
-
}
|
84
|
-
|
85
|
-
// count similarities in nonmatched characters
|
86
|
-
size_t similar_count = 0;
|
87
|
-
if (opt->adj_table && len1 > match_count)
|
88
|
-
for (size_t i = 0; i < len1; i++)
|
89
|
-
if (!short_codes_flag[i])
|
90
|
-
for (size_t j = 0; j < len2; j++)
|
91
|
-
if (!long_codes_flag[j])
|
92
|
-
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
|
93
|
-
codepoints2[j])) {
|
94
|
-
similar_count += 3;
|
95
|
-
break;
|
96
|
-
}
|
97
|
-
|
98
|
-
double m = (double)match_count;
|
99
|
-
double t = (double)(transposition_count / 2);
|
100
|
-
if (opt->adj_table)
|
101
|
-
m = similar_count / 10.0 + m;
|
102
|
-
return (m / len1 + m / len2 + (m - t) / m) / 3;
|
103
|
-
}
|
104
|
-
|
105
|
-
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
|
106
|
-
uint32_t *codepoints2, size_t len2,
|
107
|
-
Options *opt) {
|
108
|
-
double jaro_distance =
|
109
|
-
jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
|
110
|
-
|
111
|
-
if (jaro_distance < opt->threshold)
|
112
|
-
return jaro_distance;
|
113
|
-
else {
|
114
|
-
size_t prefix = 0;
|
115
|
-
size_t max_4 = len1 > 4 ? 4 : len1;
|
116
|
-
for (prefix = 0;
|
117
|
-
prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
|
118
|
-
;
|
119
|
-
return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
|
120
|
-
}
|
121
|
-
}
|
data/ext/jaro_winkler/jaro.h
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#pragma once
|
2
|
-
|
3
|
-
#include <stddef.h>
|
4
|
-
#include <stdint.h>
|
5
|
-
|
6
|
-
/* Tunable parameters accepted by the distance functions. */
typedef struct {
  double weight;    /* multiplier applied to the common-prefix bonus */
  double threshold; /* Jaro value below which no prefix bonus is added */
  char ignore_case; /* non-zero: lower-case the code points before comparing */
  char adj_table;   /* non-zero: consult the adjustment table */
} Options;

/* Defaults defined in jaro.c. */
extern const Options DEFAULT_OPTIONS;

/* Jaro value of two code point sequences. */
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                uint32_t *codepoints2, size_t len2,
                                Options *opt);
/* Jaro value plus the common-prefix bonus. */
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *opt);
|
@@ -1,70 +0,0 @@
|
|
1
|
-
#include "codepoints.h"
|
2
|
-
#include "jaro.h"
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
|
6
|
-
|
7
|
-
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
|
8
|
-
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
|
9
|
-
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
10
|
-
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
11
|
-
uint32_t *codepoints2, size_t len2,
|
12
|
-
Options *));
|
13
|
-
|
14
|
-
void Init_jaro_winkler_ext(void) {
|
15
|
-
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
16
|
-
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
17
|
-
rb_eInvalidWeightError =
|
18
|
-
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
19
|
-
rb_define_singleton_method(rb_mJaroWinkler, "distance",
|
20
|
-
rb_jaro_winkler_distance, -1);
|
21
|
-
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
|
22
|
-
-1);
|
23
|
-
}
|
24
|
-
|
25
|
-
VALUE distance(size_t argc, VALUE *argv, VALUE self,
|
26
|
-
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
|
27
|
-
uint32_t *codepoints2, size_t len2,
|
28
|
-
Options *)) {
|
29
|
-
VALUE s1, s2, opt;
|
30
|
-
CodePoints cp1, cp2;
|
31
|
-
|
32
|
-
rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
|
33
|
-
codepoints_init(&cp1, s1);
|
34
|
-
codepoints_init(&cp2, s2);
|
35
|
-
|
36
|
-
Options c_opt = DEFAULT_OPTIONS;
|
37
|
-
if (TYPE(opt) == T_HASH) {
|
38
|
-
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
39
|
-
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
40
|
-
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
41
|
-
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
42
|
-
if (!NIL_P(weight))
|
43
|
-
c_opt.weight = NUM2DBL(weight);
|
44
|
-
if (c_opt.weight > 0.25)
|
45
|
-
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
|
46
|
-
"otherwise the distance can become "
|
47
|
-
"larger than 1.");
|
48
|
-
if (!NIL_P(threshold))
|
49
|
-
c_opt.threshold = NUM2DBL(threshold);
|
50
|
-
if (!NIL_P(ignore_case))
|
51
|
-
c_opt.ignore_case =
|
52
|
-
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
53
|
-
if (!NIL_P(adj_table))
|
54
|
-
c_opt.adj_table =
|
55
|
-
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
56
|
-
}
|
57
|
-
VALUE ret = rb_float_new(
|
58
|
-
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
|
59
|
-
codepoints_free(&cp1);
|
60
|
-
codepoints_free(&cp2);
|
61
|
-
return ret;
|
62
|
-
}
|
63
|
-
|
64
|
-
/* JaroWinkler.jaro_distance: runs distance() with the plain Jaro function. */
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
  VALUE result = distance(argc, argv, self, jaro_distance_from_codes);
  return result;
}
|
67
|
-
|
68
|
-
/* JaroWinkler.distance: runs distance() with the Jaro-Winkler function. */
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
  VALUE result = distance(argc, argv, self, jaro_winkler_distance_from_codes);
  return result;
}
|