jaro_winkler 1.5.1-java → 1.5.2-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
4
- data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
2
+ SHA256:
3
+ metadata.gz: 47ffec43f4a902a16038fa817a68df9f5caea07ad68c4afe43c87b934b2ea1c8
4
+ data.tar.gz: 6a6cfd3195c5c03de0204fa25426a20ab0882cb8f012540022ac205f7c2cefad
5
5
  SHA512:
6
- metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
7
- data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
6
+ metadata.gz: 5cb9917bb131d2d5b51f99c1733dfb6ae6c695b1707edf8ff6f688d6a959c31536d5531cb31e020e6fe85df516ba1351f1f87d178d2f9aefa43beb7e99a916b5
7
+ data.tar.gz: 92973324ff6da1ba02bddd4d9a7a69b9badbc8c2556d2176b753e263e185ed0de84dd74310548f4ece54e47f558d581a1ab738e80a60c985438315c3a8a830d6
@@ -14,10 +14,12 @@ module JaroWinkler
14
14
 
15
15
  class << self
16
16
  def distance(str1, str2, options = {})
17
+ validate!(str1, str2)
17
18
  _distance str1.codepoints.to_a, str2.codepoints.to_a, options
18
19
  end
19
20
 
20
21
  def jaro_distance(str1, str2, options = {})
22
+ validate!(str1, str2)
21
23
  _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
22
24
  end
23
25
 
@@ -125,5 +127,9 @@ module JaroWinkler
125
127
  m = similar_count / 10.0 + m if options[:adj_table]
126
128
  (m / len1 + m / len2 + (m - t) / m) / 3
127
129
  end
130
+
131
+ def validate!(str1, str2)
132
+ raise TypeError unless str1.is_a?(String) && str2.is_a?(String)
133
+ end
128
134
  end
129
135
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module JaroWinkler
4
- VERSION = '1.5.1'
4
+ VERSION = '1.5.2'
5
5
  end
metadata CHANGED
@@ -1,22 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.1
4
+ version: 1.5.2
5
5
  platform: java
6
6
  authors:
7
7
  - Jian Weihang
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-06-13 00:00:00.000000000 Z
11
+ date: 2019-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: bundler
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '1.7'
19
- name: bundler
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
@@ -25,12 +25,12 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.7'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: rake
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '12.0'
33
- name: rake
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
@@ -39,12 +39,12 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '12.0'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - ">="
45
46
  - !ruby/object:Gem::Version
46
47
  version: '0'
47
- name: rake-compiler
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
@@ -53,12 +53,12 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: minitest
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
59
  - - ">="
59
60
  - !ruby/object:Gem::Version
60
61
  version: '0'
61
- name: minitest
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
@@ -77,13 +77,6 @@ executables: []
77
77
  extensions: []
78
78
  extra_rdoc_files: []
79
79
  files:
80
- - ext/jaro_winkler/adj_matrix.c
81
- - ext/jaro_winkler/adj_matrix.h
82
- - ext/jaro_winkler/codepoints.c
83
- - ext/jaro_winkler/codepoints.h
84
- - ext/jaro_winkler/jaro.c
85
- - ext/jaro_winkler/jaro.h
86
- - ext/jaro_winkler/jaro_winkler.c
87
80
  - lib/jaro_winkler.rb
88
81
  - lib/jaro_winkler/adjusting_table.rb
89
82
  - lib/jaro_winkler/jaro_winkler_pure.rb
@@ -92,7 +85,7 @@ homepage: https://github.com/tonytonyjan/jaro_winkler
92
85
  licenses:
93
86
  - MIT
94
87
  metadata: {}
95
- post_install_message:
88
+ post_install_message:
96
89
  rdoc_options: []
97
90
  require_paths:
98
91
  - lib
@@ -107,9 +100,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
100
  - !ruby/object:Gem::Version
108
101
  version: '0'
109
102
  requirements: []
110
- rubyforge_project:
111
- rubygems_version: 2.6.14.1
112
- signing_key:
103
+ rubyforge_project:
104
+ rubygems_version: 2.7.3
105
+ signing_key:
113
106
  specification_version: 4
114
107
  summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
115
108
  which supports any kind of string encoding.
@@ -1,97 +0,0 @@
1
- #include "adj_matrix.h"
2
- #include "codepoints.h"
3
- #include "ruby.h"
4
-
5
- const char *DEFAULT_ADJ_TABLE[] = {
6
- "A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
7
- "O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
8
- "C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
9
- "S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
10
- "I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
11
- "O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
12
-
13
- void node_free(Node *head);
14
-
15
- AdjMatrix *adj_matrix_new(uint32_t length) {
16
- AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
17
- matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
18
- matrix->table = malloc(matrix->length * sizeof(Node **));
19
- for (size_t i = 0; i < matrix->length; i++) {
20
- matrix->table[i] = malloc(matrix->length * sizeof(Node *));
21
- for (size_t j = 0; j < matrix->length; j++)
22
- matrix->table[i][j] = NULL;
23
- }
24
- return matrix;
25
- }
26
-
27
- void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
28
- uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
29
- ADJ_MATRIX_DEFAULT_LENGTH,
30
- h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
31
- ADJ_MATRIX_DEFAULT_LENGTH;
32
- Node *new_node = malloc(sizeof(Node));
33
- new_node->x = h1;
34
- new_node->y = h2;
35
- new_node->next = NULL;
36
- if (matrix->table[h1][h2] == NULL) {
37
- matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
38
- } else {
39
- Node *previous = NULL;
40
- for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
41
- previous = i;
42
- previous->next = new_node;
43
- }
44
- }
45
-
46
- char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
47
- uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
48
- ADJ_MATRIX_DEFAULT_LENGTH,
49
- h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
50
- ADJ_MATRIX_DEFAULT_LENGTH;
51
- Node *node = matrix->table[h1][h2];
52
- if (node == NULL)
53
- return 0;
54
- else {
55
- for (Node *i = node; i != NULL; i = i->next)
56
- if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
57
- return 1;
58
- return 0;
59
- }
60
- }
61
-
62
- void node_free(Node *head) {
63
- if (head == NULL)
64
- return;
65
- node_free(head->next);
66
- free(head);
67
- }
68
-
69
- void adj_matrix_free(AdjMatrix *matrix) {
70
- for (size_t i = 0; i < matrix->length; i++) {
71
- for (size_t j = 0; j < matrix->length; j++)
72
- if (matrix->table[i][j] != NULL) {
73
- node_free(matrix->table[i][j]);
74
- matrix->table[i][j] = matrix->table[j][i] = NULL;
75
- }
76
- free(matrix->table[i]);
77
- }
78
- free(matrix->table);
79
- free(matrix);
80
- }
81
-
82
- AdjMatrix *adj_matrix_default() {
83
- static char first_time = 1;
84
- static AdjMatrix *ret_matrix;
85
- if (first_time) {
86
- ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
87
- size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
88
- for (size_t i = 0; i < length; i += 2) {
89
- uint64_t code_1, code_2;
90
- code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
91
- code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
92
- adj_matrix_add(ret_matrix, code_1, code_2);
93
- }
94
- first_time = 0;
95
- }
96
- return ret_matrix;
97
- }
@@ -1,22 +0,0 @@
1
- #pragma once
2
-
3
- #include "stdint.h"
4
-
5
- #define ADJ_MATRIX_DEFAULT_LENGTH 958
6
- #define ADJ_MATRIX_SEED 9527
7
-
8
- typedef struct _node {
9
- struct _node *next;
10
- uint64_t x, y;
11
- } Node;
12
-
13
- typedef struct {
14
- Node ***table;
15
- uint32_t length;
16
- } AdjMatrix;
17
-
18
- AdjMatrix *adj_matrix_new(uint32_t length);
19
- void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
20
- char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
21
- void adj_matrix_free(AdjMatrix *matrix);
22
- AdjMatrix *adj_matrix_default();
@@ -1,61 +0,0 @@
1
- #include "codepoints.h"
2
- #include "ruby.h"
3
- #include "ruby/encoding.h"
4
- #include <stdint.h>
5
- #include <stdlib.h>
6
- #include <string.h>
7
-
8
- // this function is copied from string.c
9
- static inline int single_byte_optimizable(VALUE str) {
10
- rb_encoding *enc;
11
-
12
- /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
13
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
14
- return 1;
15
-
16
- enc = rb_enc_get(str);
17
- if (rb_enc_mbmaxlen(enc) == 1)
18
- return 1;
19
-
20
- /* Conservative. Possibly single byte.
21
- * "\xa1" in Shift_JIS for example. */
22
- return 0;
23
- }
24
-
25
- void codepoints_init(CodePoints *codepoints, VALUE str) {
26
- size_t i, length;
27
- int32_t n;
28
- uint32_t c;
29
- const char *ptr, *end;
30
- rb_encoding *enc;
31
-
32
- if (single_byte_optimizable(str)) {
33
- length = RSTRING_LEN(str);
34
- ptr = RSTRING_PTR(str);
35
- codepoints->data = malloc(length * sizeof(*codepoints->data));
36
- for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
37
- codepoints->data[i] = ptr[i] & 0xff;
38
- } else {
39
- codepoints->length = 0;
40
- codepoints->size = 32;
41
- codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
42
- str = rb_str_new_frozen(str);
43
- ptr = RSTRING_PTR(str);
44
- end = RSTRING_END(str);
45
- enc = rb_enc_get(str);
46
-
47
- while (ptr < end) {
48
- c = rb_enc_codepoint_len(ptr, end, &n, enc);
49
- if (codepoints->length == codepoints->size) {
50
- codepoints->size *= 2;
51
- codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
52
- codepoints->size);
53
- }
54
- codepoints->data[codepoints->length++] = c;
55
- ptr += n;
56
- }
57
- RB_GC_GUARD(str);
58
- }
59
- }
60
-
61
- void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
@@ -1,13 +0,0 @@
1
- #pragma once
2
- #include "ruby.h"
3
- #include <stddef.h>
4
- #include <stdint.h>
5
-
6
- typedef struct {
7
- uint32_t *data;
8
- size_t length;
9
- size_t size;
10
- } CodePoints;
11
-
12
- void codepoints_init(CodePoints *, VALUE str);
13
- void codepoints_free(CodePoints *);
@@ -1,121 +0,0 @@
1
- #include "jaro.h"
2
- #include "adj_matrix.h"
3
- #include "codepoints.h"
4
-
5
- #include <ctype.h>
6
- #include <stdlib.h>
7
- #include <string.h>
8
-
9
- #define DEFAULT_WEIGHT 0.1
10
- #define DEFAULT_THRESHOLD 0.7
11
- #define SWAP(x, y) \
12
- do { \
13
- __typeof__(x) SWAP = x; \
14
- x = y; \
15
- y = SWAP; \
16
- } while (0)
17
-
18
- const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
19
- .threshold = DEFAULT_THRESHOLD,
20
- .ignore_case = 0,
21
- .adj_table = 0};
22
-
23
- double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
24
- uint32_t *codepoints2, size_t len2,
25
- Options *opt) {
26
- if (!len1 || !len2)
27
- return 0.0;
28
-
29
- if (len1 > len2) {
30
- SWAP(codepoints1, codepoints2);
31
- SWAP(len1, len2);
32
- }
33
-
34
- if (opt->ignore_case) {
35
- for (size_t i = 0; i < len1; i++)
36
- codepoints1[i] = tolower(codepoints1[i]);
37
- for (size_t i = 0; i < len2; i++)
38
- codepoints2[i] = tolower(codepoints2[i]);
39
- }
40
-
41
- int32_t window_size = (int32_t)len2 / 2 - 1;
42
- if (window_size < 0)
43
- window_size = 0;
44
-
45
- char short_codes_flag[len1];
46
- char long_codes_flag[len2];
47
- memset(short_codes_flag, 0, len1);
48
- memset(long_codes_flag, 0, len2);
49
-
50
- // count number of matching characters
51
- size_t match_count = 0;
52
- for (size_t i = 0; i < len1; i++) {
53
- size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
54
- size_t right =
55
- (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
56
- if (right > len2 - 1)
57
- right = len2 - 1;
58
- for (size_t j = left; j <= right; j++) {
59
- if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
60
- short_codes_flag[i] = long_codes_flag[j] = 1;
61
- match_count++;
62
- break;
63
- }
64
- }
65
- }
66
-
67
- if (!match_count)
68
- return 0.0;
69
-
70
- // count number of transpositions
71
- size_t transposition_count = 0, j = 0, k = 0;
72
- for (size_t i = 0; i < len1; i++) {
73
- if (short_codes_flag[i]) {
74
- for (j = k; j < len2; j++) {
75
- if (long_codes_flag[j]) {
76
- k = j + 1;
77
- break;
78
- }
79
- }
80
- if (codepoints1[i] != codepoints2[j])
81
- transposition_count++;
82
- }
83
- }
84
-
85
- // count similarities in nonmatched characters
86
- size_t similar_count = 0;
87
- if (opt->adj_table && len1 > match_count)
88
- for (size_t i = 0; i < len1; i++)
89
- if (!short_codes_flag[i])
90
- for (size_t j = 0; j < len2; j++)
91
- if (!long_codes_flag[j])
92
- if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
93
- codepoints2[j])) {
94
- similar_count += 3;
95
- break;
96
- }
97
-
98
- double m = (double)match_count;
99
- double t = (double)(transposition_count / 2);
100
- if (opt->adj_table)
101
- m = similar_count / 10.0 + m;
102
- return (m / len1 + m / len2 + (m - t) / m) / 3;
103
- }
104
-
105
- double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
106
- uint32_t *codepoints2, size_t len2,
107
- Options *opt) {
108
- double jaro_distance =
109
- jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
110
-
111
- if (jaro_distance < opt->threshold)
112
- return jaro_distance;
113
- else {
114
- size_t prefix = 0;
115
- size_t max_4 = len1 > 4 ? 4 : len1;
116
- for (prefix = 0;
117
- prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
118
- ;
119
- return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
120
- }
121
- }
@@ -1,17 +0,0 @@
1
- #pragma once
2
-
3
- #include <stddef.h>
4
- #include <stdint.h>
5
-
6
- typedef struct {
7
- double weight, threshold;
8
- char ignore_case, adj_table;
9
- } Options;
10
-
11
- extern const Options DEFAULT_OPTIONS;
12
-
13
- double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
14
- uint32_t *codepoints2, size_t len2, Options *);
15
- double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
16
- uint32_t *codepoints2, size_t len2,
17
- Options *);
@@ -1,70 +0,0 @@
1
- #include "codepoints.h"
2
- #include "jaro.h"
3
- #include "ruby.h"
4
-
5
- VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
6
-
7
- VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
8
- VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
9
- VALUE distance(size_t argc, VALUE *argv, VALUE self,
10
- double (*distance_fn)(uint32_t *codepoints1, size_t len1,
11
- uint32_t *codepoints2, size_t len2,
12
- Options *));
13
-
14
- void Init_jaro_winkler_ext(void) {
15
- rb_mJaroWinkler = rb_define_module("JaroWinkler");
16
- rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
17
- rb_eInvalidWeightError =
18
- rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
19
- rb_define_singleton_method(rb_mJaroWinkler, "distance",
20
- rb_jaro_winkler_distance, -1);
21
- rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
22
- -1);
23
- }
24
-
25
- VALUE distance(size_t argc, VALUE *argv, VALUE self,
26
- double (*distance_fn)(uint32_t *codepoints1, size_t len1,
27
- uint32_t *codepoints2, size_t len2,
28
- Options *)) {
29
- VALUE s1, s2, opt;
30
- CodePoints cp1, cp2;
31
-
32
- rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
33
- codepoints_init(&cp1, s1);
34
- codepoints_init(&cp2, s2);
35
-
36
- Options c_opt = DEFAULT_OPTIONS;
37
- if (TYPE(opt) == T_HASH) {
38
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
39
- threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
40
- ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
41
- adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
42
- if (!NIL_P(weight))
43
- c_opt.weight = NUM2DBL(weight);
44
- if (c_opt.weight > 0.25)
45
- rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
46
- "otherwise the distance can become "
47
- "larger than 1.");
48
- if (!NIL_P(threshold))
49
- c_opt.threshold = NUM2DBL(threshold);
50
- if (!NIL_P(ignore_case))
51
- c_opt.ignore_case =
52
- (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
53
- if (!NIL_P(adj_table))
54
- c_opt.adj_table =
55
- (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
56
- }
57
- VALUE ret = rb_float_new(
58
- (*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
59
- codepoints_free(&cp1);
60
- codepoints_free(&cp2);
61
- return ret;
62
- }
63
-
64
- VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
65
- return distance(argc, argv, self, jaro_distance_from_codes);
66
- }
67
-
68
- VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
69
- return distance(argc, argv, self, jaro_winkler_distance_from_codes);
70
- }