jaro_winkler 1.5.1-java → 1.5.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
4
- data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
2
+ SHA256:
3
+ metadata.gz: 47ffec43f4a902a16038fa817a68df9f5caea07ad68c4afe43c87b934b2ea1c8
4
+ data.tar.gz: 6a6cfd3195c5c03de0204fa25426a20ab0882cb8f012540022ac205f7c2cefad
5
5
  SHA512:
6
- metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
7
- data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
6
+ metadata.gz: 5cb9917bb131d2d5b51f99c1733dfb6ae6c695b1707edf8ff6f688d6a959c31536d5531cb31e020e6fe85df516ba1351f1f87d178d2f9aefa43beb7e99a916b5
7
+ data.tar.gz: 92973324ff6da1ba02bddd4d9a7a69b9badbc8c2556d2176b753e263e185ed0de84dd74310548f4ece54e47f558d581a1ab738e80a60c985438315c3a8a830d6
@@ -14,10 +14,12 @@ module JaroWinkler
14
14
 
15
15
  class << self
16
16
  def distance(str1, str2, options = {})
17
+ validate!(str1, str2)
17
18
  _distance str1.codepoints.to_a, str2.codepoints.to_a, options
18
19
  end
19
20
 
20
21
  def jaro_distance(str1, str2, options = {})
22
+ validate!(str1, str2)
21
23
  _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
22
24
  end
23
25
 
@@ -125,5 +127,9 @@ module JaroWinkler
125
127
  m = similar_count / 10.0 + m if options[:adj_table]
126
128
  (m / len1 + m / len2 + (m - t) / m) / 3
127
129
  end
130
+
131
+ def validate!(str1, str2)
132
+ raise TypeError unless str1.is_a?(String) && str2.is_a?(String)
133
+ end
128
134
  end
129
135
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module JaroWinkler
4
- VERSION = '1.5.1'
4
+ VERSION = '1.5.2'
5
5
  end
metadata CHANGED
@@ -1,22 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.1
4
+ version: 1.5.2
5
5
  platform: java
6
6
  authors:
7
7
  - Jian Weihang
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-06-13 00:00:00.000000000 Z
11
+ date: 2019-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: bundler
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '1.7'
19
- name: bundler
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
@@ -25,12 +25,12 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.7'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: rake
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '12.0'
33
- name: rake
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
@@ -39,12 +39,12 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '12.0'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - ">="
45
46
  - !ruby/object:Gem::Version
46
47
  version: '0'
47
- name: rake-compiler
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
@@ -53,12 +53,12 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: minitest
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
59
  - - ">="
59
60
  - !ruby/object:Gem::Version
60
61
  version: '0'
61
- name: minitest
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
@@ -77,13 +77,6 @@ executables: []
77
77
  extensions: []
78
78
  extra_rdoc_files: []
79
79
  files:
80
- - ext/jaro_winkler/adj_matrix.c
81
- - ext/jaro_winkler/adj_matrix.h
82
- - ext/jaro_winkler/codepoints.c
83
- - ext/jaro_winkler/codepoints.h
84
- - ext/jaro_winkler/jaro.c
85
- - ext/jaro_winkler/jaro.h
86
- - ext/jaro_winkler/jaro_winkler.c
87
80
  - lib/jaro_winkler.rb
88
81
  - lib/jaro_winkler/adjusting_table.rb
89
82
  - lib/jaro_winkler/jaro_winkler_pure.rb
@@ -92,7 +85,7 @@ homepage: https://github.com/tonytonyjan/jaro_winkler
92
85
  licenses:
93
86
  - MIT
94
87
  metadata: {}
95
- post_install_message:
88
+ post_install_message:
96
89
  rdoc_options: []
97
90
  require_paths:
98
91
  - lib
@@ -107,9 +100,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
100
  - !ruby/object:Gem::Version
108
101
  version: '0'
109
102
  requirements: []
110
- rubyforge_project:
111
- rubygems_version: 2.6.14.1
112
- signing_key:
103
+ rubyforge_project:
104
+ rubygems_version: 2.7.3
105
+ signing_key:
113
106
  specification_version: 4
114
107
  summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
115
108
  which supports any kind of string encoding.
@@ -1,97 +0,0 @@
1
- #include "adj_matrix.h"
2
- #include "codepoints.h"
3
- #include "ruby.h"
4
-
5
- const char *DEFAULT_ADJ_TABLE[] = {
6
- "A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
7
- "O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
8
- "C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
9
- "S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
10
- "I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
11
- "O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
12
-
13
- void node_free(Node *head);
14
-
15
- AdjMatrix *adj_matrix_new(uint32_t length) {
16
- AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
17
- matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
18
- matrix->table = malloc(matrix->length * sizeof(Node **));
19
- for (size_t i = 0; i < matrix->length; i++) {
20
- matrix->table[i] = malloc(matrix->length * sizeof(Node *));
21
- for (size_t j = 0; j < matrix->length; j++)
22
- matrix->table[i][j] = NULL;
23
- }
24
- return matrix;
25
- }
26
-
27
- void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
28
- uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
29
- ADJ_MATRIX_DEFAULT_LENGTH,
30
- h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
31
- ADJ_MATRIX_DEFAULT_LENGTH;
32
- Node *new_node = malloc(sizeof(Node));
33
- new_node->x = h1;
34
- new_node->y = h2;
35
- new_node->next = NULL;
36
- if (matrix->table[h1][h2] == NULL) {
37
- matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
38
- } else {
39
- Node *previous = NULL;
40
- for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
41
- previous = i;
42
- previous->next = new_node;
43
- }
44
- }
45
-
46
- char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
47
- uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
48
- ADJ_MATRIX_DEFAULT_LENGTH,
49
- h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
50
- ADJ_MATRIX_DEFAULT_LENGTH;
51
- Node *node = matrix->table[h1][h2];
52
- if (node == NULL)
53
- return 0;
54
- else {
55
- for (Node *i = node; i != NULL; i = i->next)
56
- if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
57
- return 1;
58
- return 0;
59
- }
60
- }
61
-
62
- void node_free(Node *head) {
63
- if (head == NULL)
64
- return;
65
- node_free(head->next);
66
- free(head);
67
- }
68
-
69
- void adj_matrix_free(AdjMatrix *matrix) {
70
- for (size_t i = 0; i < matrix->length; i++) {
71
- for (size_t j = 0; j < matrix->length; j++)
72
- if (matrix->table[i][j] != NULL) {
73
- node_free(matrix->table[i][j]);
74
- matrix->table[i][j] = matrix->table[j][i] = NULL;
75
- }
76
- free(matrix->table[i]);
77
- }
78
- free(matrix->table);
79
- free(matrix);
80
- }
81
-
82
- AdjMatrix *adj_matrix_default() {
83
- static char first_time = 1;
84
- static AdjMatrix *ret_matrix;
85
- if (first_time) {
86
- ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
87
- size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
88
- for (size_t i = 0; i < length; i += 2) {
89
- uint64_t code_1, code_2;
90
- code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
91
- code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
92
- adj_matrix_add(ret_matrix, code_1, code_2);
93
- }
94
- first_time = 0;
95
- }
96
- return ret_matrix;
97
- }
@@ -1,22 +0,0 @@
1
- #pragma once
2
-
3
- #include "stdint.h"
4
-
5
- #define ADJ_MATRIX_DEFAULT_LENGTH 958
6
- #define ADJ_MATRIX_SEED 9527
7
-
8
- typedef struct _node {
9
- struct _node *next;
10
- uint64_t x, y;
11
- } Node;
12
-
13
- typedef struct {
14
- Node ***table;
15
- uint32_t length;
16
- } AdjMatrix;
17
-
18
- AdjMatrix *adj_matrix_new(uint32_t length);
19
- void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
20
- char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
21
- void adj_matrix_free(AdjMatrix *matrix);
22
- AdjMatrix *adj_matrix_default();
@@ -1,61 +0,0 @@
1
- #include "codepoints.h"
2
- #include "ruby.h"
3
- #include "ruby/encoding.h"
4
- #include <stdint.h>
5
- #include <stdlib.h>
6
- #include <string.h>
7
-
8
- // this function is copied from string.c
9
- static inline int single_byte_optimizable(VALUE str) {
10
- rb_encoding *enc;
11
-
12
- /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
13
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
14
- return 1;
15
-
16
- enc = rb_enc_get(str);
17
- if (rb_enc_mbmaxlen(enc) == 1)
18
- return 1;
19
-
20
- /* Conservative. Possibly single byte.
21
- * "\xa1" in Shift_JIS for example. */
22
- return 0;
23
- }
24
-
25
- void codepoints_init(CodePoints *codepoints, VALUE str) {
26
- size_t i, length;
27
- int32_t n;
28
- uint32_t c;
29
- const char *ptr, *end;
30
- rb_encoding *enc;
31
-
32
- if (single_byte_optimizable(str)) {
33
- length = RSTRING_LEN(str);
34
- ptr = RSTRING_PTR(str);
35
- codepoints->data = malloc(length * sizeof(*codepoints->data));
36
- for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
37
- codepoints->data[i] = ptr[i] & 0xff;
38
- } else {
39
- codepoints->length = 0;
40
- codepoints->size = 32;
41
- codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
42
- str = rb_str_new_frozen(str);
43
- ptr = RSTRING_PTR(str);
44
- end = RSTRING_END(str);
45
- enc = rb_enc_get(str);
46
-
47
- while (ptr < end) {
48
- c = rb_enc_codepoint_len(ptr, end, &n, enc);
49
- if (codepoints->length == codepoints->size) {
50
- codepoints->size *= 2;
51
- codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
52
- codepoints->size);
53
- }
54
- codepoints->data[codepoints->length++] = c;
55
- ptr += n;
56
- }
57
- RB_GC_GUARD(str);
58
- }
59
- }
60
-
61
- void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
@@ -1,13 +0,0 @@
1
- #pragma once
2
- #include "ruby.h"
3
- #include <stddef.h>
4
- #include <stdint.h>
5
-
6
- typedef struct {
7
- uint32_t *data;
8
- size_t length;
9
- size_t size;
10
- } CodePoints;
11
-
12
- void codepoints_init(CodePoints *, VALUE str);
13
- void codepoints_free(CodePoints *);
@@ -1,121 +0,0 @@
1
- #include "jaro.h"
2
- #include "adj_matrix.h"
3
- #include "codepoints.h"
4
-
5
- #include <ctype.h>
6
- #include <stdlib.h>
7
- #include <string.h>
8
-
9
- #define DEFAULT_WEIGHT 0.1
10
- #define DEFAULT_THRESHOLD 0.7
11
- #define SWAP(x, y) \
12
- do { \
13
- __typeof__(x) SWAP = x; \
14
- x = y; \
15
- y = SWAP; \
16
- } while (0)
17
-
18
- const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
19
- .threshold = DEFAULT_THRESHOLD,
20
- .ignore_case = 0,
21
- .adj_table = 0};
22
-
23
- double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
24
- uint32_t *codepoints2, size_t len2,
25
- Options *opt) {
26
- if (!len1 || !len2)
27
- return 0.0;
28
-
29
- if (len1 > len2) {
30
- SWAP(codepoints1, codepoints2);
31
- SWAP(len1, len2);
32
- }
33
-
34
- if (opt->ignore_case) {
35
- for (size_t i = 0; i < len1; i++)
36
- codepoints1[i] = tolower(codepoints1[i]);
37
- for (size_t i = 0; i < len2; i++)
38
- codepoints2[i] = tolower(codepoints2[i]);
39
- }
40
-
41
- int32_t window_size = (int32_t)len2 / 2 - 1;
42
- if (window_size < 0)
43
- window_size = 0;
44
-
45
- char short_codes_flag[len1];
46
- char long_codes_flag[len2];
47
- memset(short_codes_flag, 0, len1);
48
- memset(long_codes_flag, 0, len2);
49
-
50
- // count number of matching characters
51
- size_t match_count = 0;
52
- for (size_t i = 0; i < len1; i++) {
53
- size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
54
- size_t right =
55
- (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
56
- if (right > len2 - 1)
57
- right = len2 - 1;
58
- for (size_t j = left; j <= right; j++) {
59
- if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
60
- short_codes_flag[i] = long_codes_flag[j] = 1;
61
- match_count++;
62
- break;
63
- }
64
- }
65
- }
66
-
67
- if (!match_count)
68
- return 0.0;
69
-
70
- // count number of transpositions
71
- size_t transposition_count = 0, j = 0, k = 0;
72
- for (size_t i = 0; i < len1; i++) {
73
- if (short_codes_flag[i]) {
74
- for (j = k; j < len2; j++) {
75
- if (long_codes_flag[j]) {
76
- k = j + 1;
77
- break;
78
- }
79
- }
80
- if (codepoints1[i] != codepoints2[j])
81
- transposition_count++;
82
- }
83
- }
84
-
85
- // count similarities in nonmatched characters
86
- size_t similar_count = 0;
87
- if (opt->adj_table && len1 > match_count)
88
- for (size_t i = 0; i < len1; i++)
89
- if (!short_codes_flag[i])
90
- for (size_t j = 0; j < len2; j++)
91
- if (!long_codes_flag[j])
92
- if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
93
- codepoints2[j])) {
94
- similar_count += 3;
95
- break;
96
- }
97
-
98
- double m = (double)match_count;
99
- double t = (double)(transposition_count / 2);
100
- if (opt->adj_table)
101
- m = similar_count / 10.0 + m;
102
- return (m / len1 + m / len2 + (m - t) / m) / 3;
103
- }
104
-
105
- double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
106
- uint32_t *codepoints2, size_t len2,
107
- Options *opt) {
108
- double jaro_distance =
109
- jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
110
-
111
- if (jaro_distance < opt->threshold)
112
- return jaro_distance;
113
- else {
114
- size_t prefix = 0;
115
- size_t max_4 = len1 > 4 ? 4 : len1;
116
- for (prefix = 0;
117
- prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
118
- ;
119
- return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
120
- }
121
- }
@@ -1,17 +0,0 @@
1
- #pragma once
2
-
3
- #include <stddef.h>
4
- #include <stdint.h>
5
-
6
- typedef struct {
7
- double weight, threshold;
8
- char ignore_case, adj_table;
9
- } Options;
10
-
11
- extern const Options DEFAULT_OPTIONS;
12
-
13
- double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
14
- uint32_t *codepoints2, size_t len2, Options *);
15
- double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
16
- uint32_t *codepoints2, size_t len2,
17
- Options *);
@@ -1,70 +0,0 @@
1
- #include "codepoints.h"
2
- #include "jaro.h"
3
- #include "ruby.h"
4
-
5
- VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
6
-
7
- VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
8
- VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
9
- VALUE distance(size_t argc, VALUE *argv, VALUE self,
10
- double (*distance_fn)(uint32_t *codepoints1, size_t len1,
11
- uint32_t *codepoints2, size_t len2,
12
- Options *));
13
-
14
- void Init_jaro_winkler_ext(void) {
15
- rb_mJaroWinkler = rb_define_module("JaroWinkler");
16
- rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
17
- rb_eInvalidWeightError =
18
- rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
19
- rb_define_singleton_method(rb_mJaroWinkler, "distance",
20
- rb_jaro_winkler_distance, -1);
21
- rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
22
- -1);
23
- }
24
-
25
- VALUE distance(size_t argc, VALUE *argv, VALUE self,
26
- double (*distance_fn)(uint32_t *codepoints1, size_t len1,
27
- uint32_t *codepoints2, size_t len2,
28
- Options *)) {
29
- VALUE s1, s2, opt;
30
- CodePoints cp1, cp2;
31
-
32
- rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
33
- codepoints_init(&cp1, s1);
34
- codepoints_init(&cp2, s2);
35
-
36
- Options c_opt = DEFAULT_OPTIONS;
37
- if (TYPE(opt) == T_HASH) {
38
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
39
- threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
40
- ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
41
- adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
42
- if (!NIL_P(weight))
43
- c_opt.weight = NUM2DBL(weight);
44
- if (c_opt.weight > 0.25)
45
- rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
46
- "otherwise the distance can become "
47
- "larger than 1.");
48
- if (!NIL_P(threshold))
49
- c_opt.threshold = NUM2DBL(threshold);
50
- if (!NIL_P(ignore_case))
51
- c_opt.ignore_case =
52
- (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
53
- if (!NIL_P(adj_table))
54
- c_opt.adj_table =
55
- (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
56
- }
57
- VALUE ret = rb_float_new(
58
- (*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
59
- codepoints_free(&cp1);
60
- codepoints_free(&cp2);
61
- return ret;
62
- }
63
-
64
- VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
65
- return distance(argc, argv, self, jaro_distance_from_codes);
66
- }
67
-
68
- VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
69
- return distance(argc, argv, self, jaro_winkler_distance_from_codes);
70
- }