jaro_winkler 1.4.0-java → 1.5.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
4
- data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
3
+ metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
4
+ data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
5
5
  SHA512:
6
- metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
7
- data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
6
+ metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
7
+ data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
@@ -1,66 +1,75 @@
1
1
  #include "adj_matrix.h"
2
- #include "code.h"
3
-
4
- #include <stdlib.h>
2
+ #include "codepoints.h"
3
+ #include "ruby.h"
5
4
 
6
5
  const char *DEFAULT_ADJ_TABLE[] = {
7
- "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
8
- "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
9
- "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
10
- "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
11
- };
6
+ "A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
7
+ "O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
8
+ "C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
9
+ "S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
10
+ "I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
11
+ "O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
12
12
 
13
- extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
14
13
  void node_free(Node *head);
15
14
 
16
- AdjMatrix* adj_matrix_new(unsigned int length){
15
+ AdjMatrix *adj_matrix_new(uint32_t length) {
17
16
  AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
18
17
  matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
19
- matrix->table = malloc(matrix->length * sizeof(Node**));
20
- for(int i = 0; i < matrix->length; i++){
21
- matrix->table[i] = malloc(matrix->length * sizeof(Node*));
22
- for (int j = 0; j < matrix->length; j++)
18
+ matrix->table = malloc(matrix->length * sizeof(Node **));
19
+ for (size_t i = 0; i < matrix->length; i++) {
20
+ matrix->table[i] = malloc(matrix->length * sizeof(Node *));
21
+ for (size_t j = 0; j < matrix->length; j++)
23
22
  matrix->table[i][j] = NULL;
24
23
  }
25
24
  return matrix;
26
25
  }
27
26
 
28
- void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
29
- unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
30
- h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
31
- Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
32
- if(matrix->table[h1][h2] == NULL){
27
+ void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
28
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
29
+ ADJ_MATRIX_DEFAULT_LENGTH,
30
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
31
+ ADJ_MATRIX_DEFAULT_LENGTH;
32
+ Node *new_node = malloc(sizeof(Node));
33
+ new_node->x = h1;
34
+ new_node->y = h2;
35
+ new_node->next = NULL;
36
+ if (matrix->table[h1][h2] == NULL) {
33
37
  matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
34
- }
35
- else{
38
+ } else {
36
39
  Node *previous = NULL;
37
- for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
40
+ for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
41
+ previous = i;
38
42
  previous->next = new_node;
39
43
  }
40
44
  }
41
45
 
42
- char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
43
- unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
44
- h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
46
+ char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
47
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
48
+ ADJ_MATRIX_DEFAULT_LENGTH,
49
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
50
+ ADJ_MATRIX_DEFAULT_LENGTH;
45
51
  Node *node = matrix->table[h1][h2];
46
- if(node == NULL) return 0;
47
- else{
48
- for(Node *i = node; i != NULL; i = i->next)
49
- if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
52
+ if (node == NULL)
53
+ return 0;
54
+ else {
55
+ for (Node *i = node; i != NULL; i = i->next)
56
+ if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
57
+ return 1;
50
58
  return 0;
51
59
  }
52
60
  }
53
61
 
54
- void node_free(Node *head){
55
- if(head == NULL) return;
62
+ void node_free(Node *head) {
63
+ if (head == NULL)
64
+ return;
56
65
  node_free(head->next);
57
66
  free(head);
58
67
  }
59
68
 
60
- void adj_matrix_free(AdjMatrix *matrix){
61
- for(int i = 0; i < matrix->length; i++){
62
- for(int j = 0; j < matrix->length; j++)
63
- if(matrix->table[i][j] != NULL){
69
+ void adj_matrix_free(AdjMatrix *matrix) {
70
+ for (size_t i = 0; i < matrix->length; i++) {
71
+ for (size_t j = 0; j < matrix->length; j++)
72
+ if (matrix->table[i][j] != NULL) {
64
73
  node_free(matrix->table[i][j]);
65
74
  matrix->table[i][j] = matrix->table[j][i] = NULL;
66
75
  }
@@ -70,20 +79,19 @@ void adj_matrix_free(AdjMatrix *matrix){
70
79
  free(matrix);
71
80
  }
72
81
 
73
- AdjMatrix* adj_matrix_default(){
82
+ AdjMatrix *adj_matrix_default() {
74
83
  static char first_time = 1;
75
84
  static AdjMatrix *ret_matrix;
76
- if(first_time){
85
+ if (first_time) {
77
86
  ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
78
- int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
79
- for(int i = 0; i < length; i += 2){
80
- unsigned long long code_1, code_2;
81
- int dummy_length;
82
- utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
- utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
87
+ size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
88
+ for (size_t i = 0; i < length; i += 2) {
89
+ uint64_t code_1, code_2;
90
+ code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
91
+ code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
84
92
  adj_matrix_add(ret_matrix, code_1, code_2);
85
93
  }
86
94
  first_time = 0;
87
95
  }
88
96
  return ret_matrix;
89
- }
97
+ }
@@ -1,22 +1,22 @@
1
- #ifndef ADJ_MATRIX_H
2
- #define ADJ_MATRIX_H
1
+ #pragma once
2
+
3
+ #include "stdint.h"
4
+
3
5
  #define ADJ_MATRIX_DEFAULT_LENGTH 958
4
6
  #define ADJ_MATRIX_SEED 9527
5
7
 
6
- typedef struct _node{
8
+ typedef struct _node {
7
9
  struct _node *next;
8
- unsigned long long x, y;
10
+ uint64_t x, y;
9
11
  } Node;
10
12
 
11
- typedef struct{
13
+ typedef struct {
12
14
  Node ***table;
13
- unsigned int length;
15
+ uint32_t length;
14
16
  } AdjMatrix;
15
17
 
16
- AdjMatrix* adj_matrix_new (unsigned int length);
17
- void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
- char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
- void adj_matrix_free (AdjMatrix *matrix);
20
- AdjMatrix* adj_matrix_default();
21
-
22
- #endif
18
+ AdjMatrix *adj_matrix_new(uint32_t length);
19
+ void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
20
+ char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
21
+ void adj_matrix_free(AdjMatrix *matrix);
22
+ AdjMatrix *adj_matrix_default();
@@ -0,0 +1,61 @@
1
+ #include "codepoints.h"
2
+ #include "ruby.h"
3
+ #include "ruby/encoding.h"
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ // this function is copied from string.c
9
+ static inline int single_byte_optimizable(VALUE str) {
10
+ rb_encoding *enc;
11
+
12
+ /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
13
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
14
+ return 1;
15
+
16
+ enc = rb_enc_get(str);
17
+ if (rb_enc_mbmaxlen(enc) == 1)
18
+ return 1;
19
+
20
+ /* Conservative. Possibly single byte.
21
+ * "\xa1" in Shift_JIS for example. */
22
+ return 0;
23
+ }
24
+
25
+ void codepoints_init(CodePoints *codepoints, VALUE str) {
26
+ size_t i, length;
27
+ int32_t n;
28
+ uint32_t c;
29
+ const char *ptr, *end;
30
+ rb_encoding *enc;
31
+
32
+ if (single_byte_optimizable(str)) {
33
+ length = RSTRING_LEN(str);
34
+ ptr = RSTRING_PTR(str);
35
+ codepoints->data = malloc(length * sizeof(*codepoints->data));
36
+ for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
37
+ codepoints->data[i] = ptr[i] & 0xff;
38
+ } else {
39
+ codepoints->length = 0;
40
+ codepoints->size = 32;
41
+ codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
42
+ str = rb_str_new_frozen(str);
43
+ ptr = RSTRING_PTR(str);
44
+ end = RSTRING_END(str);
45
+ enc = rb_enc_get(str);
46
+
47
+ while (ptr < end) {
48
+ c = rb_enc_codepoint_len(ptr, end, &n, enc);
49
+ if (codepoints->length == codepoints->size) {
50
+ codepoints->size *= 2;
51
+ codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
52
+ codepoints->size);
53
+ }
54
+ codepoints->data[codepoints->length++] = c;
55
+ ptr += n;
56
+ }
57
+ RB_GC_GUARD(str);
58
+ }
59
+ }
60
+
61
+ void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
@@ -0,0 +1,13 @@
1
+ #pragma once
2
+ #include "ruby.h"
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+
6
+ typedef struct {
7
+ uint32_t *data;
8
+ size_t length;
9
+ size_t size;
10
+ } CodePoints;
11
+
12
+ void codepoints_init(CodePoints *, VALUE str);
13
+ void codepoints_free(CodePoints *);
@@ -1,73 +1,62 @@
1
1
  #include "jaro.h"
2
- #include "code.h"
3
2
  #include "adj_matrix.h"
3
+ #include "codepoints.h"
4
4
 
5
- #include <string.h>
6
- #include <stdlib.h>
7
5
  #include <ctype.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
8
 
9
- #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
10
-
11
- double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
12
- double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
13
-
14
- double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
15
- if(!short_str_len || !long_str_len) return 0.0;
16
-
17
- unsigned long long *short_codes, *long_codes;
18
- int short_codes_len, long_codes_len;
19
- string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
- string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
-
22
- double ret = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
23
-
24
- free(short_codes); free(long_codes);
25
- return ret;
26
- }
27
-
28
- double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
29
- if(!short_str_len || !long_str_len) return 0.0;
30
-
31
- unsigned long long *short_codes, *long_codes;
32
- int short_codes_len, long_codes_len;
33
- string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
34
- string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
35
-
36
- double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
37
-
38
- free(short_codes); free(long_codes);
39
- return ret;
40
- }
41
-
42
- double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
43
- if(!short_codes_len || !long_codes_len) return 0.0;
44
-
45
- if(short_codes_len > long_codes_len){
46
- SWAP(short_codes, long_codes);
47
- SWAP(short_codes_len, long_codes_len);
9
+ #define DEFAULT_WEIGHT 0.1
10
+ #define DEFAULT_THRESHOLD 0.7
11
+ #define SWAP(x, y) \
12
+ do { \
13
+ __typeof__(x) SWAP = x; \
14
+ x = y; \
15
+ y = SWAP; \
16
+ } while (0)
17
+
18
+ const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
19
+ .threshold = DEFAULT_THRESHOLD,
20
+ .ignore_case = 0,
21
+ .adj_table = 0};
22
+
23
+ double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
24
+ uint32_t *codepoints2, size_t len2,
25
+ Options *opt) {
26
+ if (!len1 || !len2)
27
+ return 0.0;
28
+
29
+ if (len1 > len2) {
30
+ SWAP(codepoints1, codepoints2);
31
+ SWAP(len1, len2);
48
32
  }
49
33
 
50
- if(opt->ignore_case){
51
- for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
52
- for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
34
+ if (opt->ignore_case) {
35
+ for (size_t i = 0; i < len1; i++)
36
+ codepoints1[i] = tolower(codepoints1[i]);
37
+ for (size_t i = 0; i < len2; i++)
38
+ codepoints2[i] = tolower(codepoints2[i]);
53
39
  }
54
40
 
55
- int window_size = long_codes_len/2 - 1;
56
- if(window_size < 0) window_size = 0;
41
+ int32_t window_size = (int32_t)len2 / 2 - 1;
42
+ if (window_size < 0)
43
+ window_size = 0;
57
44
 
58
- char short_codes_flag[short_codes_len];
59
- char long_codes_flag[long_codes_len];
60
- memset(short_codes_flag, 0, short_codes_len);
61
- memset(long_codes_flag, 0, long_codes_len);
45
+ char short_codes_flag[len1];
46
+ char long_codes_flag[len2];
47
+ memset(short_codes_flag, 0, len1);
48
+ memset(long_codes_flag, 0, len2);
62
49
 
63
50
  // count number of matching characters
64
- int match_count = 0;
65
- for(int i = 0; i < short_codes_len; i++){
66
- int left = (i >= window_size) ? i - window_size : 0;
67
- int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
68
- if(right > long_codes_len - 1) right = long_codes_len - 1;
69
- for(int j = left; j <= right; j++){
70
- if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
51
+ size_t match_count = 0;
52
+ for (size_t i = 0; i < len1; i++) {
53
+ size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
54
+ size_t right =
55
+ (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
56
+ if (right > len2 - 1)
57
+ right = len2 - 1;
58
+ for (size_t j = left; j <= right; j++) {
59
+ if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
71
60
  short_codes_flag[i] = long_codes_flag[j] = 1;
72
61
  match_count++;
73
62
  break;
@@ -75,48 +64,58 @@ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes
75
64
  }
76
65
  }
77
66
 
78
- if(!match_count) return 0.0;
67
+ if (!match_count)
68
+ return 0.0;
79
69
 
80
70
  // count number of transpositions
81
- int transposition_count = 0, j = 0, k = 0;
82
- for(int i = 0; i < short_codes_len; i++){
83
- if(short_codes_flag[i]){
84
- for(j = k; j < long_codes_len; j++){
85
- if(long_codes_flag[j]){
71
+ size_t transposition_count = 0, j = 0, k = 0;
72
+ for (size_t i = 0; i < len1; i++) {
73
+ if (short_codes_flag[i]) {
74
+ for (j = k; j < len2; j++) {
75
+ if (long_codes_flag[j]) {
86
76
  k = j + 1;
87
77
  break;
88
78
  }
89
79
  }
90
- if(short_codes[i] != long_codes[j]) transposition_count++;
80
+ if (codepoints1[i] != codepoints2[j])
81
+ transposition_count++;
91
82
  }
92
83
  }
93
84
 
94
85
  // count similarities in nonmatched characters
95
- int similar_count = 0;
96
- if(opt->adj_table && short_codes_len > match_count)
97
- for(int i = 0; i < short_codes_len; i++)
98
- if(!short_codes_flag[i])
99
- for(int j = 0; j < long_codes_len; j++)
100
- if(!long_codes_flag[j])
101
- if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
86
+ size_t similar_count = 0;
87
+ if (opt->adj_table && len1 > match_count)
88
+ for (size_t i = 0; i < len1; i++)
89
+ if (!short_codes_flag[i])
90
+ for (size_t j = 0; j < len2; j++)
91
+ if (!long_codes_flag[j])
92
+ if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
93
+ codepoints2[j])) {
102
94
  similar_count += 3;
103
95
  break;
104
96
  }
105
97
 
106
98
  double m = (double)match_count;
107
- double t = (double)(transposition_count/2);
108
- if(opt->adj_table) m = similar_count/10.0 + m;
109
- return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
99
+ double t = (double)(transposition_count / 2);
100
+ if (opt->adj_table)
101
+ m = similar_count / 10.0 + m;
102
+ return (m / len1 + m / len2 + (m - t) / m) / 3;
110
103
  }
111
104
 
112
- double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
113
- double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
114
-
115
- if(jaro_distance < opt->threshold) return jaro_distance;
116
- else{
117
- int prefix = 0;
118
- int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
119
- for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
120
- return jaro_distance + prefix*opt->weight*(1-jaro_distance);
105
+ double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
106
+ uint32_t *codepoints2, size_t len2,
107
+ Options *opt) {
108
+ double jaro_distance =
109
+ jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
110
+
111
+ if (jaro_distance < opt->threshold)
112
+ return jaro_distance;
113
+ else {
114
+ size_t prefix = 0;
115
+ size_t max_4 = len1 > 4 ? 4 : len1;
116
+ for (prefix = 0;
117
+ prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
118
+ ;
119
+ return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
121
120
  }
122
- }
121
+ }
@@ -1,17 +1,17 @@
1
- #ifndef LIBJARO_JARO_H
2
- #define LIBJARO_JARO_H
1
+ #pragma once
3
2
 
4
- #define DEFAULT_WEIGHT 0.1
5
- #define DEFAULT_THRESHOLD 0.7
3
+ #include <stddef.h>
4
+ #include <stdint.h>
6
5
 
7
- typedef struct LibJaroOption{
6
+ typedef struct {
8
7
  double weight, threshold;
9
8
  char ignore_case, adj_table;
10
- } LibJaroOption;
9
+ } Options;
11
10
 
11
+ extern const Options DEFAULT_OPTIONS;
12
12
 
13
- static const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
14
- double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
15
- double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
16
-
17
- #endif
13
+ double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
14
+ uint32_t *codepoints2, size_t len2, Options *);
15
+ double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
16
+ uint32_t *codepoints2, size_t len2,
17
+ Options *);
@@ -1,45 +1,70 @@
1
- #include "ruby.h"
1
+ #include "codepoints.h"
2
2
  #include "jaro.h"
3
+ #include "ruby.h"
3
4
 
4
- VALUE rb_mJaroWinkler,
5
- rb_eError,
6
- rb_eInvalidWeightError;
5
+ VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
7
6
 
8
- VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
9
- VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
10
- VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
7
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
8
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
9
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
10
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
11
+ uint32_t *codepoints2, size_t len2,
12
+ Options *));
11
13
 
12
- void Init_jaro_winkler_ext(void){
14
+ void Init_jaro_winkler_ext(void) {
13
15
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
14
16
  rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
15
- rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
16
- rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
17
- rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
17
+ rb_eInvalidWeightError =
18
+ rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
19
+ rb_define_singleton_method(rb_mJaroWinkler, "distance",
20
+ rb_jaro_winkler_distance, -1);
21
+ rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
22
+ -1);
18
23
  }
19
24
 
20
-
21
- VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
25
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
26
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
27
+ uint32_t *codepoints2, size_t len2,
28
+ Options *)) {
22
29
  VALUE s1, s2, opt;
23
- rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
24
- LibJaroOption c_opt = DEFAULT_OPT;
25
- if(TYPE(opt) == T_HASH){
30
+ CodePoints cp1, cp2;
31
+
32
+ rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
33
+ codepoints_init(&cp1, s1);
34
+ codepoints_init(&cp2, s2);
35
+
36
+ Options c_opt = DEFAULT_OPTIONS;
37
+ if (TYPE(opt) == T_HASH) {
26
38
  VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
27
39
  threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
28
40
  ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
29
41
  adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
30
- if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
31
- if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
32
- if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
33
- if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
34
- if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
42
+ if (!NIL_P(weight))
43
+ c_opt.weight = NUM2DBL(weight);
44
+ if (c_opt.weight > 0.25)
45
+ rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
46
+ "otherwise the distance can become "
47
+ "larger than 1.");
48
+ if (!NIL_P(threshold))
49
+ c_opt.threshold = NUM2DBL(threshold);
50
+ if (!NIL_P(ignore_case))
51
+ c_opt.ignore_case =
52
+ (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
53
+ if (!NIL_P(adj_table))
54
+ c_opt.adj_table =
55
+ (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
35
56
  }
36
- return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
57
+ VALUE ret = rb_float_new(
58
+ (*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
59
+ codepoints_free(&cp1);
60
+ codepoints_free(&cp2);
61
+ return ret;
37
62
  }
38
63
 
39
- VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
40
- return distance(argc, argv, self, jaro_distance);
64
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
65
+ return distance(argc, argv, self, jaro_distance_from_codes);
41
66
  }
42
67
 
43
- VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
44
- return distance(argc, argv, self, jaro_winkler_distance);
45
- }
68
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
69
+ return distance(argc, argv, self, jaro_winkler_distance_from_codes);
70
+ }
@@ -1,9 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'jaro_winkler/version'
2
4
 
3
- case RUBY_PLATFORM
4
- when 'java'
5
- require 'jaro_winkler/jaro_winkler_pure'
6
- else
5
+ if RUBY_ENGINE == 'ruby'
7
6
  require 'jaro_winkler/jaro_winkler_ext'
7
+ else
8
+ require 'jaro_winkler/jaro_winkler_pure'
8
9
  end
9
-
@@ -1,19 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module JaroWinkler
2
- DEFAULT_ADJ_TABLE = Hash.new
4
+ DEFAULT_ADJ_TABLE = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
3
5
  [
4
- ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
5
- ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
6
- ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
7
- ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
6
+ %w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
7
+ %w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
8
+ %w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
9
+ %w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
8
10
  ['E', ' '], ['Y', ' '], ['S', ' ']
9
- ].each{ |s1, s2|
10
- if not DEFAULT_ADJ_TABLE.has_key?(s1)
11
- DEFAULT_ADJ_TABLE[s1] = Hash.new
12
- end
13
- if not DEFAULT_ADJ_TABLE.has_key?(s2)
14
- DEFAULT_ADJ_TABLE[s2] = Hash.new
15
- end
11
+ ].each do |s1, s2|
16
12
  DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
17
- }
18
- DEFAULT_ADJ_TABLE.default = Hash.new
13
+ end
19
14
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'jaro_winkler/adjusting_table'
2
4
  module JaroWinkler
3
5
  class Error < RuntimeError; end
@@ -6,120 +8,122 @@ module JaroWinkler
6
8
  DEFAULT_WEIGHT = 0.1
7
9
  DEFAULT_THRESHOLD = 0.7
8
10
  DEFAULT_OPTIONS = {
9
- jaro: {adj_table: false, ignore_case: false},
10
- jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
11
- }
12
-
13
- module_function
14
-
15
- def distance str1, str2, options={}
16
- _distance str1.codepoints.to_a, str2.codepoints.to_a, options
17
- end
11
+ jaro: { adj_table: false, ignore_case: false },
12
+ jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
13
+ }.freeze
18
14
 
19
- def jaro_distance str1, str2, options={}
20
- _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
21
- end
15
+ class << self
16
+ def distance(str1, str2, options = {})
17
+ _distance str1.codepoints.to_a, str2.codepoints.to_a, options
18
+ end
22
19
 
23
- def _distance codes1, codes2, options={}
24
- options = DEFAULT_OPTIONS[:jaro_winkler].merge options
25
- raise InvalidWeightError if options[:weight] > 0.25
26
- jaro_distance = _jaro_distance(codes1, codes2, options);
20
+ def jaro_distance(str1, str2, options = {})
21
+ _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
22
+ end
27
23
 
28
- if jaro_distance < options[:threshold]
29
- jaro_distance
30
- else
31
- codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
32
- len1, len2 = codes1.length, codes2.length
33
- max_4 = len1 > 4 ? 4 : len1
34
- prefix = 0
35
- while prefix < max_4 && codes1[prefix] == codes2[prefix]
36
- prefix += 1
24
+ private
25
+
26
+ def _distance(codes1, codes2, options = {})
27
+ options = DEFAULT_OPTIONS[:jaro_winkler].merge options
28
+ raise InvalidWeightError if options[:weight] > 0.25
29
+ jaro_distance = _jaro_distance(codes1, codes2, options)
30
+
31
+ if jaro_distance < options[:threshold]
32
+ jaro_distance
33
+ else
34
+ codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
35
+ len1 = codes1.length
36
+ len2 = codes2.length
37
+ max_4 = len1 > 4 ? 4 : len1
38
+ prefix = 0
39
+ prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
40
+ jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
37
41
  end
38
- jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
39
42
  end
40
- end
41
-
42
- def _jaro_distance codes1, codes2, options={}
43
- options = DEFAULT_OPTIONS[:jaro].merge options
44
43
 
45
- codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
46
- len1, len2 = codes1.length, codes2.length
47
- return 0.0 if len1 == 0 || len2 == 0
44
+ def _jaro_distance(codes1, codes2, options = {})
45
+ options = DEFAULT_OPTIONS[:jaro].merge options
48
46
 
49
- if options[:ignore_case]
50
- codes1.map!{ |c| c >= 97 && c <= 122 ? c -= 32 : c }
51
- codes2.map!{ |c| c >= 97 && c <= 122 ? c -= 32 : c }
52
- end
47
+ codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
48
+ len1 = codes1.length
49
+ len2 = codes2.length
50
+ return 0.0 if len1 == 0 || len2 == 0
53
51
 
54
- window = len2/2 - 1
55
- window = 0 if(window < 0)
56
- flags1, flags2 = 0, 0
57
-
58
- # // count number of matching characters
59
- match_count = 0;
60
- i = 0
61
- while i < len1
62
- left = (i >= window) ? i - window : 0
63
- right = (i + window <= len2 - 1) ? (i + window) : (len2 - 1)
64
- right = len2 - 1 if right > len2 - 1
65
- j = left
66
- while j <= right
67
- if flags2[j] == 0 && codes1[i] == codes2[j]
68
- flags1 |= (1 << i)
69
- flags2 |= (1 << j)
70
- match_count += 1
71
- break
72
- end
73
- j +=1
52
+ if options[:ignore_case]
53
+ codes1.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
54
+ codes2.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
74
55
  end
75
- i += 1
76
- end
77
56
 
78
- return 0.0 if match_count == 0
79
-
80
- # // count number of transpositions
81
- transposition_count = j = k = 0
82
- i = 0
83
- while i < len1
84
- if flags1[i] == 1
85
- j = k
86
- while j < len2
87
- if flags2[j] == 1
88
- k = j + 1;
89
- break;
57
+ window = len2 / 2 - 1
58
+ window = 0 if window < 0
59
+ flags1 = 0
60
+ flags2 = 0
61
+
62
+ # // count number of matching characters
63
+ match_count = 0
64
+ i = 0
65
+ while i < len1
66
+ left = i >= window ? i - window : 0
67
+ right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
68
+ right = len2 - 1 if right > len2 - 1
69
+ j = left
70
+ while j <= right
71
+ if flags2[j] == 0 && codes1[i] == codes2[j]
72
+ flags1 |= (1 << i)
73
+ flags2 |= (1 << j)
74
+ match_count += 1
75
+ break
90
76
  end
91
77
  j += 1
92
78
  end
93
- transposition_count += 1 if codes1[i] != codes2[j]
79
+ i += 1
94
80
  end
95
- i += 1
96
- end
97
81
 
98
- # // count similarities in nonmatched characters
99
- similar_count = 0
100
- if options[:adj_table] && len1 > match_count
82
+ return 0.0 if match_count == 0
83
+
84
+ # // count number of transpositions
85
+ transposition_count = j = k = 0
101
86
  i = 0
102
87
  while i < len1
103
- if flags1[i] == 0
104
- j = 0
88
+ if flags1[i] == 1
89
+ j = k
105
90
  while j < len2
106
- if flags2[j] == 0
107
- if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
108
- similar_count += 3
109
- break
110
- end
91
+ if flags2[j] == 1
92
+ k = j + 1
93
+ break
111
94
  end
112
95
  j += 1
113
96
  end
97
+ transposition_count += 1 if codes1[i] != codes2[j]
114
98
  end
115
99
  i += 1
116
100
  end
117
- end
118
101
 
119
- m = match_count.to_f
120
- t = transposition_count/2
121
- m = similar_count/10.0 + m if options[:adj_table]
122
- (m/len1 + m/len2 + (m-t)/m) / 3
123
- end
102
+ # // count similarities in nonmatched characters
103
+ similar_count = 0
104
+ if options[:adj_table] && len1 > match_count
105
+ i = 0
106
+ while i < len1
107
+ if flags1[i] == 0
108
+ j = 0
109
+ while j < len2
110
+ if flags2[j] == 0
111
+ if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
112
+ similar_count += 3
113
+ break
114
+ end
115
+ end
116
+ j += 1
117
+ end
118
+ end
119
+ i += 1
120
+ end
121
+ end
124
122
 
125
- end
123
+ m = match_count.to_f
124
+ t = transposition_count / 2
125
+ m = similar_count / 10.0 + m if options[:adj_table]
126
+ (m / len1 + m / len2 + (m - t) / m) / 3
127
+ end
128
+ end
129
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module JaroWinkler
2
- VERSION = '1.4.0'
4
+ VERSION = '1.5.1'
3
5
  end
metadata CHANGED
@@ -1,72 +1,77 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.1
5
5
  platform: java
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-12 00:00:00.000000000 Z
11
+ date: 2018-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
15
- version_requirements: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.7'
20
14
  requirement: !ruby/object:Gem::Requirement
21
15
  requirements:
22
- - - ~>
16
+ - - "~>"
23
17
  - !ruby/object:Gem::Version
24
18
  version: '1.7'
25
- prerelease: false
19
+ name: bundler
26
20
  type: :development
27
- - !ruby/object:Gem::Dependency
28
- name: rake
21
+ prerelease: false
29
22
  version_requirements: !ruby/object:Gem::Requirement
30
23
  requirements:
31
- - - ~>
24
+ - - "~>"
32
25
  - !ruby/object:Gem::Version
33
- version: '10.0'
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
34
28
  requirement: !ruby/object:Gem::Requirement
35
29
  requirements:
36
- - - ~>
30
+ - - "~>"
37
31
  - !ruby/object:Gem::Version
38
- version: '10.0'
39
- prerelease: false
32
+ version: '12.0'
33
+ name: rake
40
34
  type: :development
41
- - !ruby/object:Gem::Dependency
42
- name: rake-compiler
35
+ prerelease: false
43
36
  version_requirements: !ruby/object:Gem::Requirement
44
37
  requirements:
45
- - - '>='
38
+ - - "~>"
46
39
  - !ruby/object:Gem::Version
47
- version: '0'
40
+ version: '12.0'
41
+ - !ruby/object:Gem::Dependency
48
42
  requirement: !ruby/object:Gem::Requirement
49
43
  requirements:
50
- - - '>='
44
+ - - ">="
51
45
  - !ruby/object:Gem::Version
52
46
  version: '0'
53
- prerelease: false
47
+ name: rake-compiler
54
48
  type: :development
55
- - !ruby/object:Gem::Dependency
56
- name: minitest
49
+ prerelease: false
57
50
  version_requirements: !ruby/object:Gem::Requirement
58
51
  requirements:
59
- - - '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
62
56
  requirement: !ruby/object:Gem::Requirement
63
57
  requirements:
64
- - - '>='
58
+ - - ">="
65
59
  - !ruby/object:Gem::Version
66
60
  version: '0'
67
- prerelease: false
61
+ name: minitest
68
62
  type: :development
69
- description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: |-
70
+ jaro_winkler is an implementation of Jaro-Winkler \
71
+ distance algorithm which is written in C extension and will fallback to pure \
72
+ Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
73
+ C and Ruby implementation support any kind of string encoding, such as \
74
+ UTF-8, EUC-JP, Big5, etc.
70
75
  email: tonytonyjan@gmail.com
71
76
  executables: []
72
77
  extensions: []
@@ -74,12 +79,11 @@ extra_rdoc_files: []
74
79
  files:
75
80
  - ext/jaro_winkler/adj_matrix.c
76
81
  - ext/jaro_winkler/adj_matrix.h
77
- - ext/jaro_winkler/code.c
78
- - ext/jaro_winkler/code.h
82
+ - ext/jaro_winkler/codepoints.c
83
+ - ext/jaro_winkler/codepoints.h
79
84
  - ext/jaro_winkler/jaro.c
80
85
  - ext/jaro_winkler/jaro.h
81
86
  - ext/jaro_winkler/jaro_winkler.c
82
- - ext/jaro_winkler/murmur_hash2.c
83
87
  - lib/jaro_winkler.rb
84
88
  - lib/jaro_winkler/adjusting_table.rb
85
89
  - lib/jaro_winkler/jaro_winkler_pure.rb
@@ -94,18 +98,19 @@ require_paths:
94
98
  - lib
95
99
  required_ruby_version: !ruby/object:Gem::Requirement
96
100
  requirements:
97
- - - '>='
101
+ - - ">="
98
102
  - !ruby/object:Gem::Version
99
103
  version: '0'
100
104
  required_rubygems_version: !ruby/object:Gem::Requirement
101
105
  requirements:
102
- - - '>='
106
+ - - ">="
103
107
  - !ruby/object:Gem::Version
104
108
  version: '0'
105
109
  requirements: []
106
110
  rubyforge_project:
107
- rubygems_version: 2.4.5
111
+ rubygems_version: 2.6.14.1
108
112
  signing_key:
109
113
  specification_version: 4
110
- summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.
114
+ summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
115
+ which supports any kind of string encoding.
111
116
  test_files: []
@@ -1,29 +0,0 @@
1
- #include <stdlib.h>
2
- #include <string.h>
3
-
4
- void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
5
- unsigned char first_char = str[0];
6
- if(first_char >= 252) *ret_byte_length = 6; // 1111110x
7
- else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
8
- else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
9
- else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
10
- else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
11
- else *ret_byte_length = 1;
12
- *ret_code = 0;
13
- memcpy(ret_code, str, *ret_byte_length);
14
- }
15
-
16
- void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
17
- unsigned int code;
18
- char byte_length;
19
-
20
- *ret_codes = calloc(length, sizeof(long long));
21
- *ret_length = 0;
22
-
23
- for(int i = 0; i < length;){
24
- int byte_length;
25
- utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
26
- *ret_length += 1;
27
- i += byte_length;
28
- }
29
- }
@@ -1,7 +0,0 @@
1
- #ifndef CODE_H
2
- #define CODE_H
3
-
4
- void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
- void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
-
7
- #endif
@@ -1,64 +0,0 @@
1
- //-----------------------------------------------------------------------------
2
- // MurmurHash2, by Austin Appleby
3
-
4
- // Note - This code makes a few assumptions about how your machine behaves -
5
-
6
- // 1. We can read a 4-byte value from any address without crashing
7
- // 2. sizeof(int) == 4
8
-
9
- // And it has a few limitations -
10
-
11
- // 1. It will not work incrementally.
12
- // 2. It will not produce the same results on little-endian and big-endian
13
- // machines.
14
-
15
- unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
- {
17
- // 'm' and 'r' are mixing constants generated offline.
18
- // They're not really 'magic', they just happen to work well.
19
-
20
- const unsigned int m = 0x5bd1e995;
21
- const int r = 24;
22
-
23
- // Initialize the hash to a 'random' value
24
-
25
- unsigned int h = seed ^ len;
26
-
27
- // Mix 4 bytes at a time into the hash
28
-
29
- const unsigned char * data = (const unsigned char *)key;
30
-
31
- while(len >= 4)
32
- {
33
- unsigned int k = *(unsigned int *)data;
34
-
35
- k *= m;
36
- k ^= k >> r;
37
- k *= m;
38
-
39
- h *= m;
40
- h ^= k;
41
-
42
- data += 4;
43
- len -= 4;
44
- }
45
-
46
- // Handle the last few bytes of the input array
47
-
48
- switch(len)
49
- {
50
- case 3: h ^= data[2] << 16;
51
- case 2: h ^= data[1] << 8;
52
- case 1: h ^= data[0];
53
- h *= m;
54
- };
55
-
56
- // Do a few final mixes of the hash to ensure the last few
57
- // bytes are well-incorporated.
58
-
59
- h ^= h >> 13;
60
- h *= m;
61
- h ^= h >> 15;
62
-
63
- return h;
64
- }