jaro_winkler 1.5.1-universal-java-10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2f783ac8d8355443adfe51a5d3f32cf7b8ddef5f
4
+ data.tar.gz: 96f863de9c8e879104fdb78ded7834691ad6ca66
5
+ SHA512:
6
+ metadata.gz: 354d4337b57d40de31960a6fd050d56ca7dd852031f5ad0525e7f2147ef8345ab6c4b4d53ff93a0e6011b55dc342b2ab05147db277c392f31988109757219685
7
+ data.tar.gz: 8db99ee1730ac2d84d95ec509fd9d2395ae812d6169bf1ce9f60c44495bafa541b4fbe39a3688c521b5f38707d635164c7ef5b8b7b062390ff2be6a017e6ce8c
@@ -0,0 +1,97 @@
1
+ #include "adj_matrix.h"
2
+ #include "codepoints.h"
3
+ #include "ruby.h"
4
+
5
/*
 * Default adjusting table, stored flat: entries 2k and 2k+1 together form one
 * pair of single-character strings. Each row below holds four such pairs.
 */
const char *DEFAULT_ADJ_TABLE[] = {
    "A", "E", "A", "I", "A", "O", "A", "U",
    "B", "V", "E", "I", "E", "O", "E", "U",
    "I", "O", "I", "U", "O", "U", "I", "Y",
    "E", "Y", "C", "G", "E", "F", "W", "U",
    "W", "V", "X", "K", "S", "Z", "X", "S",
    "Q", "C", "U", "V", "M", "N", "L", "I",
    "Q", "O", "P", "R", "I", "J", "2", "Z",
    "5", "S", "8", "B", "1", "I", "1", "L",
    "0", "O", "0", "Q", "C", "K", "G", "J",
    "E", " ", "Y", " ", "S", " ",
};
12
+
13
+ void node_free(Node *head);
14
+
15
+ AdjMatrix *adj_matrix_new(uint32_t length) {
16
+ AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
17
+ matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
18
+ matrix->table = malloc(matrix->length * sizeof(Node **));
19
+ for (size_t i = 0; i < matrix->length; i++) {
20
+ matrix->table[i] = malloc(matrix->length * sizeof(Node *));
21
+ for (size_t j = 0; j < matrix->length; j++)
22
+ matrix->table[i][j] = NULL;
23
+ }
24
+ return matrix;
25
+ }
26
+
27
+ void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
28
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
29
+ ADJ_MATRIX_DEFAULT_LENGTH,
30
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
31
+ ADJ_MATRIX_DEFAULT_LENGTH;
32
+ Node *new_node = malloc(sizeof(Node));
33
+ new_node->x = h1;
34
+ new_node->y = h2;
35
+ new_node->next = NULL;
36
+ if (matrix->table[h1][h2] == NULL) {
37
+ matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
38
+ } else {
39
+ Node *previous = NULL;
40
+ for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
41
+ previous = i;
42
+ previous->next = new_node;
43
+ }
44
+ }
45
+
46
+ char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
47
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
48
+ ADJ_MATRIX_DEFAULT_LENGTH,
49
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
50
+ ADJ_MATRIX_DEFAULT_LENGTH;
51
+ Node *node = matrix->table[h1][h2];
52
+ if (node == NULL)
53
+ return 0;
54
+ else {
55
+ for (Node *i = node; i != NULL; i = i->next)
56
+ if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
57
+ return 1;
58
+ return 0;
59
+ }
60
+ }
61
+
62
+ void node_free(Node *head) {
63
+ if (head == NULL)
64
+ return;
65
+ node_free(head->next);
66
+ free(head);
67
+ }
68
+
69
+ void adj_matrix_free(AdjMatrix *matrix) {
70
+ for (size_t i = 0; i < matrix->length; i++) {
71
+ for (size_t j = 0; j < matrix->length; j++)
72
+ if (matrix->table[i][j] != NULL) {
73
+ node_free(matrix->table[i][j]);
74
+ matrix->table[i][j] = matrix->table[j][i] = NULL;
75
+ }
76
+ free(matrix->table[i]);
77
+ }
78
+ free(matrix->table);
79
+ free(matrix);
80
+ }
81
+
82
+ AdjMatrix *adj_matrix_default() {
83
+ static char first_time = 1;
84
+ static AdjMatrix *ret_matrix;
85
+ if (first_time) {
86
+ ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
87
+ size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
88
+ for (size_t i = 0; i < length; i += 2) {
89
+ uint64_t code_1, code_2;
90
+ code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
91
+ code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
92
+ adj_matrix_add(ret_matrix, code_1, code_2);
93
+ }
94
+ first_time = 0;
95
+ }
96
+ return ret_matrix;
97
+ }
@@ -0,0 +1,22 @@
1
+ #pragma once
2
+
3
+ #include "stdint.h"
4
+
5
+ #define ADJ_MATRIX_DEFAULT_LENGTH 958
6
+ #define ADJ_MATRIX_SEED 9527
7
+
8
/* One entry of a singly linked list hanging off a matrix cell. */
typedef struct _node {
  struct _node *next; /* following entry in the same list, or NULL */
  uint64_t x, y;      /* the two values stored by adj_matrix_add() */
} Node;

/* Square table whose cells each hold the head of a Node list (or NULL). */
typedef struct {
  Node ***table;   /* table[i][j]: list head for row i, column j */
  uint32_t length; /* number of rows, and of columns, in table */
} AdjMatrix;

/* Creates a matrix; a length of 0 selects ADJ_MATRIX_DEFAULT_LENGTH. */
AdjMatrix *adj_matrix_new(uint32_t length);

/* Records the pair (x, y) in the matrix. */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);

/* Returns 1 when a matching entry for (x, y) is found, 0 otherwise. */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);

/* Releases the matrix together with every list it holds. */
void adj_matrix_free(AdjMatrix *matrix);

/*
 * Returns the shared default matrix. Declared with (void) so that the
 * compiler rejects calls that pass arguments; an empty parameter list would
 * leave the arguments unchecked.
 */
AdjMatrix *adj_matrix_default(void);
@@ -0,0 +1,61 @@
1
+ #include "codepoints.h"
2
+ #include "ruby.h"
3
+ #include "ruby/encoding.h"
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ // this function is copied from string.c
9
+ static inline int single_byte_optimizable(VALUE str) {
10
+ rb_encoding *enc;
11
+
12
+ /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
13
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
14
+ return 1;
15
+
16
+ enc = rb_enc_get(str);
17
+ if (rb_enc_mbmaxlen(enc) == 1)
18
+ return 1;
19
+
20
+ /* Conservative. Possibly single byte.
21
+ * "\xa1" in Shift_JIS for example. */
22
+ return 0;
23
+ }
24
+
25
+ void codepoints_init(CodePoints *codepoints, VALUE str) {
26
+ size_t i, length;
27
+ int32_t n;
28
+ uint32_t c;
29
+ const char *ptr, *end;
30
+ rb_encoding *enc;
31
+
32
+ if (single_byte_optimizable(str)) {
33
+ length = RSTRING_LEN(str);
34
+ ptr = RSTRING_PTR(str);
35
+ codepoints->data = malloc(length * sizeof(*codepoints->data));
36
+ for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
37
+ codepoints->data[i] = ptr[i] & 0xff;
38
+ } else {
39
+ codepoints->length = 0;
40
+ codepoints->size = 32;
41
+ codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
42
+ str = rb_str_new_frozen(str);
43
+ ptr = RSTRING_PTR(str);
44
+ end = RSTRING_END(str);
45
+ enc = rb_enc_get(str);
46
+
47
+ while (ptr < end) {
48
+ c = rb_enc_codepoint_len(ptr, end, &n, enc);
49
+ if (codepoints->length == codepoints->size) {
50
+ codepoints->size *= 2;
51
+ codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
52
+ codepoints->size);
53
+ }
54
+ codepoints->data[codepoints->length++] = c;
55
+ ptr += n;
56
+ }
57
+ RB_GC_GUARD(str);
58
+ }
59
+ }
60
+
61
+ void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
@@ -0,0 +1,13 @@
1
+ #pragma once
2
+ #include "ruby.h"
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+
6
+ typedef struct {
7
+ uint32_t *data;
8
+ size_t length;
9
+ size_t size;
10
+ } CodePoints;
11
+
12
+ void codepoints_init(CodePoints *, VALUE str);
13
+ void codepoints_free(CodePoints *);
@@ -0,0 +1,121 @@
1
+ #include "jaro.h"
2
+ #include "adj_matrix.h"
3
+ #include "codepoints.h"
4
+
5
+ #include <ctype.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+
9
+ #define DEFAULT_WEIGHT 0.1
10
+ #define DEFAULT_THRESHOLD 0.7
11
+ #define SWAP(x, y) \
12
+ do { \
13
+ __typeof__(x) SWAP = x; \
14
+ x = y; \
15
+ y = SWAP; \
16
+ } while (0)
17
+
18
+ const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
19
+ .threshold = DEFAULT_THRESHOLD,
20
+ .ignore_case = 0,
21
+ .adj_table = 0};
22
+
23
+ double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
24
+ uint32_t *codepoints2, size_t len2,
25
+ Options *opt) {
26
+ if (!len1 || !len2)
27
+ return 0.0;
28
+
29
+ if (len1 > len2) {
30
+ SWAP(codepoints1, codepoints2);
31
+ SWAP(len1, len2);
32
+ }
33
+
34
+ if (opt->ignore_case) {
35
+ for (size_t i = 0; i < len1; i++)
36
+ codepoints1[i] = tolower(codepoints1[i]);
37
+ for (size_t i = 0; i < len2; i++)
38
+ codepoints2[i] = tolower(codepoints2[i]);
39
+ }
40
+
41
+ int32_t window_size = (int32_t)len2 / 2 - 1;
42
+ if (window_size < 0)
43
+ window_size = 0;
44
+
45
+ char short_codes_flag[len1];
46
+ char long_codes_flag[len2];
47
+ memset(short_codes_flag, 0, len1);
48
+ memset(long_codes_flag, 0, len2);
49
+
50
+ // count number of matching characters
51
+ size_t match_count = 0;
52
+ for (size_t i = 0; i < len1; i++) {
53
+ size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
54
+ size_t right =
55
+ (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
56
+ if (right > len2 - 1)
57
+ right = len2 - 1;
58
+ for (size_t j = left; j <= right; j++) {
59
+ if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
60
+ short_codes_flag[i] = long_codes_flag[j] = 1;
61
+ match_count++;
62
+ break;
63
+ }
64
+ }
65
+ }
66
+
67
+ if (!match_count)
68
+ return 0.0;
69
+
70
+ // count number of transpositions
71
+ size_t transposition_count = 0, j = 0, k = 0;
72
+ for (size_t i = 0; i < len1; i++) {
73
+ if (short_codes_flag[i]) {
74
+ for (j = k; j < len2; j++) {
75
+ if (long_codes_flag[j]) {
76
+ k = j + 1;
77
+ break;
78
+ }
79
+ }
80
+ if (codepoints1[i] != codepoints2[j])
81
+ transposition_count++;
82
+ }
83
+ }
84
+
85
+ // count similarities in nonmatched characters
86
+ size_t similar_count = 0;
87
+ if (opt->adj_table && len1 > match_count)
88
+ for (size_t i = 0; i < len1; i++)
89
+ if (!short_codes_flag[i])
90
+ for (size_t j = 0; j < len2; j++)
91
+ if (!long_codes_flag[j])
92
+ if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
93
+ codepoints2[j])) {
94
+ similar_count += 3;
95
+ break;
96
+ }
97
+
98
+ double m = (double)match_count;
99
+ double t = (double)(transposition_count / 2);
100
+ if (opt->adj_table)
101
+ m = similar_count / 10.0 + m;
102
+ return (m / len1 + m / len2 + (m - t) / m) / 3;
103
+ }
104
+
105
+ double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
106
+ uint32_t *codepoints2, size_t len2,
107
+ Options *opt) {
108
+ double jaro_distance =
109
+ jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
110
+
111
+ if (jaro_distance < opt->threshold)
112
+ return jaro_distance;
113
+ else {
114
+ size_t prefix = 0;
115
+ size_t max_4 = len1 > 4 ? 4 : len1;
116
+ for (prefix = 0;
117
+ prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
118
+ ;
119
+ return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
120
+ }
121
+ }
@@ -0,0 +1,17 @@
1
+ #pragma once
2
+
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+
6
/* Tunables accepted by the distance functions declared below. */
typedef struct {
  double weight;    /* multiplier applied per matching prefix character */
  double threshold; /* Jaro values below this get no prefix bonus */
  char ignore_case; /* non-zero: compare case-insensitively */
  char adj_table;   /* non-zero: consult the default adjusting table */
} Options;

/* Option values used when the caller does not override them. */
extern const Options DEFAULT_OPTIONS;

/* Jaro similarity of two sequences of character codes. */
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                uint32_t *codepoints2, size_t len2,
                                Options *opt);

/* Jaro-Winkler similarity of two sequences of character codes. */
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *opt);
@@ -0,0 +1,70 @@
1
+ #include "codepoints.h"
2
+ #include "jaro.h"
3
+ #include "ruby.h"
4
+
5
+ VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
6
+
7
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
8
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
9
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
10
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
11
+ uint32_t *codepoints2, size_t len2,
12
+ Options *));
13
+
14
+ void Init_jaro_winkler_ext(void) {
15
+ rb_mJaroWinkler = rb_define_module("JaroWinkler");
16
+ rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
17
+ rb_eInvalidWeightError =
18
+ rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
19
+ rb_define_singleton_method(rb_mJaroWinkler, "distance",
20
+ rb_jaro_winkler_distance, -1);
21
+ rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
22
+ -1);
23
+ }
24
+
25
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
26
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
27
+ uint32_t *codepoints2, size_t len2,
28
+ Options *)) {
29
+ VALUE s1, s2, opt;
30
+ CodePoints cp1, cp2;
31
+
32
+ rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
33
+ codepoints_init(&cp1, s1);
34
+ codepoints_init(&cp2, s2);
35
+
36
+ Options c_opt = DEFAULT_OPTIONS;
37
+ if (TYPE(opt) == T_HASH) {
38
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
39
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
40
+ ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
41
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
42
+ if (!NIL_P(weight))
43
+ c_opt.weight = NUM2DBL(weight);
44
+ if (c_opt.weight > 0.25)
45
+ rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
46
+ "otherwise the distance can become "
47
+ "larger than 1.");
48
+ if (!NIL_P(threshold))
49
+ c_opt.threshold = NUM2DBL(threshold);
50
+ if (!NIL_P(ignore_case))
51
+ c_opt.ignore_case =
52
+ (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
53
+ if (!NIL_P(adj_table))
54
+ c_opt.adj_table =
55
+ (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
56
+ }
57
+ VALUE ret = rb_float_new(
58
+ (*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
59
+ codepoints_free(&cp1);
60
+ codepoints_free(&cp2);
61
+ return ret;
62
+ }
63
+
64
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
65
+ return distance(argc, argv, self, jaro_distance_from_codes);
66
+ }
67
+
68
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
69
+ return distance(argc, argv, self, jaro_winkler_distance_from_codes);
70
+ }
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'jaro_winkler/version'
4
+
5
+ if RUBY_ENGINE == 'ruby'
6
+ require 'jaro_winkler/jaro_winkler_ext'
7
+ else
8
+ require 'jaro_winkler/jaro_winkler_pure'
9
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module JaroWinkler
  # Symmetric table of character pairs that are treated as similar:
  # DEFAULT_ADJ_TABLE[a][b] is true for every listed pair, in both orders.
  #
  # Looking up an unknown first character creates an empty inner hash, so a
  # lookup such as DEFAULT_ADJ_TABLE['?']['!'] is always safe to write. The
  # inner hashes are plain hashes, so an unlisted pair yields nil. (A
  # recursive default block on the inner hashes would instead return a new,
  # truthy Hash for every unlisted pair.)
  DEFAULT_ADJ_TABLE = Hash.new { |table, char| table[char] = {} }
  [
    %w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
    %w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
    %w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
    %w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |s1, s2|
    DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
  end
end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'jaro_winkler/adjusting_table'
4
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false },
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
  }.freeze

  class << self
    # Jaro-Winkler similarity of two strings.
    #
    # Options: :weight (prefix scaling factor, at most 0.25), :threshold
    # (Jaro values below it get no prefix bonus), :ignore_case, :adj_table.
    # Raises InvalidWeightError when :weight exceeds 0.25.
    def distance(str1, str2, options = {})
      _distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    # Jaro similarity of two strings. Options: :ignore_case, :adj_table.
    def jaro_distance(str1, str2, options = {})
      _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    private

    # Jaro-Winkler similarity of two codepoint arrays.
    def _distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro_winkler].merge options
      raise InvalidWeightError if options[:weight] > 0.25
      jaro_distance = _jaro_distance(codes1, codes2, options)

      if jaro_distance < options[:threshold]
        jaro_distance
      else
        # Make codes1 the shorter array so the prefix scan stays in bounds.
        codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
        len1 = codes1.length
        max_4 = len1 > 4 ? 4 : len1
        prefix = 0
        prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
        jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
      end
    end

    # Jaro similarity of two codepoint arrays. With :ignore_case the arrays
    # are case-folded in place (ASCII a-z become A-Z).
    def _jaro_distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro].merge options

      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      len1 = codes1.length
      len2 = codes2.length
      return 0.0 if len1 == 0 || len2 == 0

      if options[:ignore_case]
        codes1.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
        codes2.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
      end

      window = len2 / 2 - 1
      window = 0 if window < 0
      # Integers used as bit sets: bit i is set once character i is matched.
      flags1 = 0
      flags2 = 0

      # // count number of matching characters
      match_count = 0
      i = 0
      while i < len1
        left = i >= window ? i - window : 0
        right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
        j = left
        while j <= right
          if flags2[j] == 0 && codes1[i] == codes2[j]
            flags1 |= (1 << i)
            flags2 |= (1 << j)
            match_count += 1
            break
          end
          j += 1
        end
        i += 1
      end

      return 0.0 if match_count == 0

      # // count number of transpositions
      transposition_count = j = k = 0
      i = 0
      while i < len1
        if flags1[i] == 1
          j = k
          while j < len2
            if flags2[j] == 1
              k = j + 1
              break
            end
            j += 1
          end
          transposition_count += 1 if codes1[i] != codes2[j]
        end
        i += 1
      end

      # // count similarities in nonmatched characters
      similar_count = 0
      if options[:adj_table] && len1 > match_count
        i = 0
        while i < len1
          if flags1[i] == 0
            j = 0
            while j < len2
              if flags2[j] == 0
                # Compare against true explicitly: a table whose default
                # block auto-creates entries returns a (truthy) Hash for an
                # unlisted pair, which must not count as a similarity.
                if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)] == true
                  similar_count += 3
                  break
                end
              end
              j += 1
            end
          end
          i += 1
        end
      end

      m = match_count.to_f
      # Integer division on purpose: an odd count is rounded down.
      t = transposition_count / 2
      m = similar_count / 10.0 + m if options[:adj_table]
      (m / len1 + m / len2 + (m - t) / m) / 3
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module JaroWinkler
4
+ VERSION = '1.5.1'
5
+ end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro_winkler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.1
5
+ platform: universal-java-10
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.7'
19
+ name: bundler
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '12.0'
33
+ name: rake
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '12.0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ name: rake-compiler
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ name: minitest
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: |-
70
+ jaro_winkler is an implementation of Jaro-Winkler \
71
+ distance algorithm which is written in C extension and will fallback to pure \
72
+ Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
73
+ C and Ruby implementation support any kind of string encoding, such as \
74
+ UTF-8, EUC-JP, Big5, etc.
75
+ email: tonytonyjan@gmail.com
76
+ executables: []
77
+ extensions: []
78
+ extra_rdoc_files: []
79
+ files:
80
+ - ext/jaro_winkler/adj_matrix.c
81
+ - ext/jaro_winkler/adj_matrix.h
82
+ - ext/jaro_winkler/codepoints.c
83
+ - ext/jaro_winkler/codepoints.h
84
+ - ext/jaro_winkler/jaro.c
85
+ - ext/jaro_winkler/jaro.h
86
+ - ext/jaro_winkler/jaro_winkler.c
87
+ - lib/jaro_winkler.rb
88
+ - lib/jaro_winkler/adjusting_table.rb
89
+ - lib/jaro_winkler/jaro_winkler_pure.rb
90
+ - lib/jaro_winkler/version.rb
91
+ homepage: https://github.com/tonytonyjan/jaro_winkler
92
+ licenses:
93
+ - MIT
94
+ metadata: {}
95
+ post_install_message:
96
+ rdoc_options: []
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project:
111
+ rubygems_version: 2.6.14.1
112
+ signing_key:
113
+ specification_version: 4
114
+ summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
115
+ which supports any kind of string encoding.
116
+ test_files: []