jaro_winkler 1.4.0-java → 1.5.1-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
4
- data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
3
+ metadata.gz: a3fd3fabbc7662b62ead8988ef48c53a89d75b6f
4
+ data.tar.gz: 7ba3eff5e134aadab37aa52f22665d78e66f6dcd
5
5
  SHA512:
6
- metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
7
- data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
6
+ metadata.gz: 3ee03982e280949d7069e5f25d4e1c5103b047abe5853e6497f5926a3cc5155c11f0876fc93461f7f26b1eb0985d9cf987b109d5898386928419032533cd1c8f
7
+ data.tar.gz: 902531a857d93d74bce572bfc8c604e61dcaf5853794ad36d530208bf70c85d2f43b84624550bbee1e4ddc0fac80fe3be91d96764390d060b1b7230f96c24520
@@ -1,66 +1,75 @@
1
1
  #include "adj_matrix.h"
2
- #include "code.h"
3
-
4
- #include <stdlib.h>
2
+ #include "codepoints.h"
3
+ #include "ruby.h"
5
4
 
6
5
  const char *DEFAULT_ADJ_TABLE[] = {
7
- "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
8
- "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
9
- "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
10
- "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
11
- };
6
+ "A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
7
+ "O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
8
+ "C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
9
+ "S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
10
+ "I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
11
+ "O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
12
12
 
13
- extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
14
13
  void node_free(Node *head);
15
14
 
16
- AdjMatrix* adj_matrix_new(unsigned int length){
15
+ AdjMatrix *adj_matrix_new(uint32_t length) {
17
16
  AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
18
17
  matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
19
- matrix->table = malloc(matrix->length * sizeof(Node**));
20
- for(int i = 0; i < matrix->length; i++){
21
- matrix->table[i] = malloc(matrix->length * sizeof(Node*));
22
- for (int j = 0; j < matrix->length; j++)
18
+ matrix->table = malloc(matrix->length * sizeof(Node **));
19
+ for (size_t i = 0; i < matrix->length; i++) {
20
+ matrix->table[i] = malloc(matrix->length * sizeof(Node *));
21
+ for (size_t j = 0; j < matrix->length; j++)
23
22
  matrix->table[i][j] = NULL;
24
23
  }
25
24
  return matrix;
26
25
  }
27
26
 
28
- void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
29
- unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
30
- h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
31
- Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
32
- if(matrix->table[h1][h2] == NULL){
27
+ void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
28
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
29
+ ADJ_MATRIX_DEFAULT_LENGTH,
30
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
31
+ ADJ_MATRIX_DEFAULT_LENGTH;
32
+ Node *new_node = malloc(sizeof(Node));
33
+ new_node->x = h1;
34
+ new_node->y = h2;
35
+ new_node->next = NULL;
36
+ if (matrix->table[h1][h2] == NULL) {
33
37
  matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
34
- }
35
- else{
38
+ } else {
36
39
  Node *previous = NULL;
37
- for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
40
+ for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
41
+ previous = i;
38
42
  previous->next = new_node;
39
43
  }
40
44
  }
41
45
 
42
- char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
43
- unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
44
- h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
46
+ char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
47
+ uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
48
+ ADJ_MATRIX_DEFAULT_LENGTH,
49
+ h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
50
+ ADJ_MATRIX_DEFAULT_LENGTH;
45
51
  Node *node = matrix->table[h1][h2];
46
- if(node == NULL) return 0;
47
- else{
48
- for(Node *i = node; i != NULL; i = i->next)
49
- if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
52
+ if (node == NULL)
53
+ return 0;
54
+ else {
55
+ for (Node *i = node; i != NULL; i = i->next)
56
+ if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
57
+ return 1;
50
58
  return 0;
51
59
  }
52
60
  }
53
61
 
54
- void node_free(Node *head){
55
- if(head == NULL) return;
62
+ void node_free(Node *head) {
63
+ if (head == NULL)
64
+ return;
56
65
  node_free(head->next);
57
66
  free(head);
58
67
  }
59
68
 
60
- void adj_matrix_free(AdjMatrix *matrix){
61
- for(int i = 0; i < matrix->length; i++){
62
- for(int j = 0; j < matrix->length; j++)
63
- if(matrix->table[i][j] != NULL){
69
+ void adj_matrix_free(AdjMatrix *matrix) {
70
+ for (size_t i = 0; i < matrix->length; i++) {
71
+ for (size_t j = 0; j < matrix->length; j++)
72
+ if (matrix->table[i][j] != NULL) {
64
73
  node_free(matrix->table[i][j]);
65
74
  matrix->table[i][j] = matrix->table[j][i] = NULL;
66
75
  }
@@ -70,20 +79,19 @@ void adj_matrix_free(AdjMatrix *matrix){
70
79
  free(matrix);
71
80
  }
72
81
 
73
- AdjMatrix* adj_matrix_default(){
82
+ AdjMatrix *adj_matrix_default() {
74
83
  static char first_time = 1;
75
84
  static AdjMatrix *ret_matrix;
76
- if(first_time){
85
+ if (first_time) {
77
86
  ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
78
- int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
79
- for(int i = 0; i < length; i += 2){
80
- unsigned long long code_1, code_2;
81
- int dummy_length;
82
- utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
- utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
87
+ size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
88
+ for (size_t i = 0; i < length; i += 2) {
89
+ uint64_t code_1, code_2;
90
+ code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
91
+ code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
84
92
  adj_matrix_add(ret_matrix, code_1, code_2);
85
93
  }
86
94
  first_time = 0;
87
95
  }
88
96
  return ret_matrix;
89
- }
97
+ }
@@ -1,22 +1,22 @@
1
- #ifndef ADJ_MATRIX_H
2
- #define ADJ_MATRIX_H
1
+ #pragma once
2
+
3
+ #include "stdint.h"
4
+
3
5
  #define ADJ_MATRIX_DEFAULT_LENGTH 958
4
6
  #define ADJ_MATRIX_SEED 9527
5
7
 
6
- typedef struct _node{
8
+ typedef struct _node {
7
9
  struct _node *next;
8
- unsigned long long x, y;
10
+ uint64_t x, y;
9
11
  } Node;
10
12
 
11
- typedef struct{
13
+ typedef struct {
12
14
  Node ***table;
13
- unsigned int length;
15
+ uint32_t length;
14
16
  } AdjMatrix;
15
17
 
16
- AdjMatrix* adj_matrix_new (unsigned int length);
17
- void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
- char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
- void adj_matrix_free (AdjMatrix *matrix);
20
- AdjMatrix* adj_matrix_default();
21
-
22
- #endif
18
+ AdjMatrix *adj_matrix_new(uint32_t length);
19
+ void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
20
+ char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
21
+ void adj_matrix_free(AdjMatrix *matrix);
22
+ AdjMatrix *adj_matrix_default();
@@ -0,0 +1,61 @@
1
+ #include "codepoints.h"
2
+ #include "ruby.h"
3
+ #include "ruby/encoding.h"
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ // this function is copied from string.c
9
+ static inline int single_byte_optimizable(VALUE str) {
10
+ rb_encoding *enc;
11
+
12
+ /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
13
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
14
+ return 1;
15
+
16
+ enc = rb_enc_get(str);
17
+ if (rb_enc_mbmaxlen(enc) == 1)
18
+ return 1;
19
+
20
+ /* Conservative. Possibly single byte.
21
+ * "\xa1" in Shift_JIS for example. */
22
+ return 0;
23
+ }
24
+
25
+ void codepoints_init(CodePoints *codepoints, VALUE str) {
26
+ size_t i, length;
27
+ int32_t n;
28
+ uint32_t c;
29
+ const char *ptr, *end;
30
+ rb_encoding *enc;
31
+
32
+ if (single_byte_optimizable(str)) {
33
+ length = RSTRING_LEN(str);
34
+ ptr = RSTRING_PTR(str);
35
+ codepoints->data = malloc(length * sizeof(*codepoints->data));
36
+ for (i = 0, codepoints->length = 0; i < length; i++, codepoints->length++)
37
+ codepoints->data[i] = ptr[i] & 0xff;
38
+ } else {
39
+ codepoints->length = 0;
40
+ codepoints->size = 32;
41
+ codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
42
+ str = rb_str_new_frozen(str);
43
+ ptr = RSTRING_PTR(str);
44
+ end = RSTRING_END(str);
45
+ enc = rb_enc_get(str);
46
+
47
+ while (ptr < end) {
48
+ c = rb_enc_codepoint_len(ptr, end, &n, enc);
49
+ if (codepoints->length == codepoints->size) {
50
+ codepoints->size *= 2;
51
+ codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) *
52
+ codepoints->size);
53
+ }
54
+ codepoints->data[codepoints->length++] = c;
55
+ ptr += n;
56
+ }
57
+ RB_GC_GUARD(str);
58
+ }
59
+ }
60
+
61
+ void codepoints_free(CodePoints *codepoints) { free(codepoints->data); }
@@ -0,0 +1,13 @@
1
+ #pragma once
2
+ #include "ruby.h"
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+
6
+ typedef struct {
7
+ uint32_t *data;
8
+ size_t length;
9
+ size_t size;
10
+ } CodePoints;
11
+
12
+ void codepoints_init(CodePoints *, VALUE str);
13
+ void codepoints_free(CodePoints *);
@@ -1,73 +1,62 @@
1
1
  #include "jaro.h"
2
- #include "code.h"
3
2
  #include "adj_matrix.h"
3
+ #include "codepoints.h"
4
4
 
5
- #include <string.h>
6
- #include <stdlib.h>
7
5
  #include <ctype.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
8
 
9
- #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
10
-
11
- double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
12
- double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
13
-
14
- double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
15
- if(!short_str_len || !long_str_len) return 0.0;
16
-
17
- unsigned long long *short_codes, *long_codes;
18
- int short_codes_len, long_codes_len;
19
- string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
- string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
-
22
- double ret = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
23
-
24
- free(short_codes); free(long_codes);
25
- return ret;
26
- }
27
-
28
- double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
29
- if(!short_str_len || !long_str_len) return 0.0;
30
-
31
- unsigned long long *short_codes, *long_codes;
32
- int short_codes_len, long_codes_len;
33
- string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
34
- string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
35
-
36
- double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
37
-
38
- free(short_codes); free(long_codes);
39
- return ret;
40
- }
41
-
42
- double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
43
- if(!short_codes_len || !long_codes_len) return 0.0;
44
-
45
- if(short_codes_len > long_codes_len){
46
- SWAP(short_codes, long_codes);
47
- SWAP(short_codes_len, long_codes_len);
9
+ #define DEFAULT_WEIGHT 0.1
10
+ #define DEFAULT_THRESHOLD 0.7
11
+ #define SWAP(x, y) \
12
+ do { \
13
+ __typeof__(x) SWAP = x; \
14
+ x = y; \
15
+ y = SWAP; \
16
+ } while (0)
17
+
18
+ const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
19
+ .threshold = DEFAULT_THRESHOLD,
20
+ .ignore_case = 0,
21
+ .adj_table = 0};
22
+
23
+ double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
24
+ uint32_t *codepoints2, size_t len2,
25
+ Options *opt) {
26
+ if (!len1 || !len2)
27
+ return 0.0;
28
+
29
+ if (len1 > len2) {
30
+ SWAP(codepoints1, codepoints2);
31
+ SWAP(len1, len2);
48
32
  }
49
33
 
50
- if(opt->ignore_case){
51
- for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
52
- for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
34
+ if (opt->ignore_case) {
35
+ for (size_t i = 0; i < len1; i++)
36
+ codepoints1[i] = tolower(codepoints1[i]);
37
+ for (size_t i = 0; i < len2; i++)
38
+ codepoints2[i] = tolower(codepoints2[i]);
53
39
  }
54
40
 
55
- int window_size = long_codes_len/2 - 1;
56
- if(window_size < 0) window_size = 0;
41
+ int32_t window_size = (int32_t)len2 / 2 - 1;
42
+ if (window_size < 0)
43
+ window_size = 0;
57
44
 
58
- char short_codes_flag[short_codes_len];
59
- char long_codes_flag[long_codes_len];
60
- memset(short_codes_flag, 0, short_codes_len);
61
- memset(long_codes_flag, 0, long_codes_len);
45
+ char short_codes_flag[len1];
46
+ char long_codes_flag[len2];
47
+ memset(short_codes_flag, 0, len1);
48
+ memset(long_codes_flag, 0, len2);
62
49
 
63
50
  // count number of matching characters
64
- int match_count = 0;
65
- for(int i = 0; i < short_codes_len; i++){
66
- int left = (i >= window_size) ? i - window_size : 0;
67
- int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
68
- if(right > long_codes_len - 1) right = long_codes_len - 1;
69
- for(int j = left; j <= right; j++){
70
- if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
51
+ size_t match_count = 0;
52
+ for (size_t i = 0; i < len1; i++) {
53
+ size_t left = (i >= (size_t)window_size) ? i - window_size : 0;
54
+ size_t right =
55
+ (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
56
+ if (right > len2 - 1)
57
+ right = len2 - 1;
58
+ for (size_t j = left; j <= right; j++) {
59
+ if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
71
60
  short_codes_flag[i] = long_codes_flag[j] = 1;
72
61
  match_count++;
73
62
  break;
@@ -75,48 +64,58 @@ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes
75
64
  }
76
65
  }
77
66
 
78
- if(!match_count) return 0.0;
67
+ if (!match_count)
68
+ return 0.0;
79
69
 
80
70
  // count number of transpositions
81
- int transposition_count = 0, j = 0, k = 0;
82
- for(int i = 0; i < short_codes_len; i++){
83
- if(short_codes_flag[i]){
84
- for(j = k; j < long_codes_len; j++){
85
- if(long_codes_flag[j]){
71
+ size_t transposition_count = 0, j = 0, k = 0;
72
+ for (size_t i = 0; i < len1; i++) {
73
+ if (short_codes_flag[i]) {
74
+ for (j = k; j < len2; j++) {
75
+ if (long_codes_flag[j]) {
86
76
  k = j + 1;
87
77
  break;
88
78
  }
89
79
  }
90
- if(short_codes[i] != long_codes[j]) transposition_count++;
80
+ if (codepoints1[i] != codepoints2[j])
81
+ transposition_count++;
91
82
  }
92
83
  }
93
84
 
94
85
  // count similarities in nonmatched characters
95
- int similar_count = 0;
96
- if(opt->adj_table && short_codes_len > match_count)
97
- for(int i = 0; i < short_codes_len; i++)
98
- if(!short_codes_flag[i])
99
- for(int j = 0; j < long_codes_len; j++)
100
- if(!long_codes_flag[j])
101
- if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
86
+ size_t similar_count = 0;
87
+ if (opt->adj_table && len1 > match_count)
88
+ for (size_t i = 0; i < len1; i++)
89
+ if (!short_codes_flag[i])
90
+ for (size_t j = 0; j < len2; j++)
91
+ if (!long_codes_flag[j])
92
+ if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
93
+ codepoints2[j])) {
102
94
  similar_count += 3;
103
95
  break;
104
96
  }
105
97
 
106
98
  double m = (double)match_count;
107
- double t = (double)(transposition_count/2);
108
- if(opt->adj_table) m = similar_count/10.0 + m;
109
- return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
99
+ double t = (double)(transposition_count / 2);
100
+ if (opt->adj_table)
101
+ m = similar_count / 10.0 + m;
102
+ return (m / len1 + m / len2 + (m - t) / m) / 3;
110
103
  }
111
104
 
112
- double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
113
- double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
114
-
115
- if(jaro_distance < opt->threshold) return jaro_distance;
116
- else{
117
- int prefix = 0;
118
- int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
119
- for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
120
- return jaro_distance + prefix*opt->weight*(1-jaro_distance);
105
+ double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
106
+ uint32_t *codepoints2, size_t len2,
107
+ Options *opt) {
108
+ double jaro_distance =
109
+ jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
110
+
111
+ if (jaro_distance < opt->threshold)
112
+ return jaro_distance;
113
+ else {
114
+ size_t prefix = 0;
115
+ size_t max_4 = len1 > 4 ? 4 : len1;
116
+ for (prefix = 0;
117
+ prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++)
118
+ ;
119
+ return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
121
120
  }
122
- }
121
+ }
@@ -1,17 +1,17 @@
1
- #ifndef LIBJARO_JARO_H
2
- #define LIBJARO_JARO_H
1
+ #pragma once
3
2
 
4
- #define DEFAULT_WEIGHT 0.1
5
- #define DEFAULT_THRESHOLD 0.7
3
+ #include <stddef.h>
4
+ #include <stdint.h>
6
5
 
7
- typedef struct LibJaroOption{
6
+ typedef struct {
8
7
  double weight, threshold;
9
8
  char ignore_case, adj_table;
10
- } LibJaroOption;
9
+ } Options;
11
10
 
11
+ extern const Options DEFAULT_OPTIONS;
12
12
 
13
- static const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
14
- double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
15
- double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
16
-
17
- #endif
13
+ double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
14
+ uint32_t *codepoints2, size_t len2, Options *);
15
+ double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
16
+ uint32_t *codepoints2, size_t len2,
17
+ Options *);
@@ -1,45 +1,70 @@
1
- #include "ruby.h"
1
+ #include "codepoints.h"
2
2
  #include "jaro.h"
3
+ #include "ruby.h"
3
4
 
4
- VALUE rb_mJaroWinkler,
5
- rb_eError,
6
- rb_eInvalidWeightError;
5
+ VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
7
6
 
8
- VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
9
- VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
10
- VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
7
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
8
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
9
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
10
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
11
+ uint32_t *codepoints2, size_t len2,
12
+ Options *));
11
13
 
12
- void Init_jaro_winkler_ext(void){
14
+ void Init_jaro_winkler_ext(void) {
13
15
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
14
16
  rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
15
- rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
16
- rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
17
- rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
17
+ rb_eInvalidWeightError =
18
+ rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
19
+ rb_define_singleton_method(rb_mJaroWinkler, "distance",
20
+ rb_jaro_winkler_distance, -1);
21
+ rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
22
+ -1);
18
23
  }
19
24
 
20
-
21
- VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
25
+ VALUE distance(size_t argc, VALUE *argv, VALUE self,
26
+ double (*distance_fn)(uint32_t *codepoints1, size_t len1,
27
+ uint32_t *codepoints2, size_t len2,
28
+ Options *)) {
22
29
  VALUE s1, s2, opt;
23
- rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
24
- LibJaroOption c_opt = DEFAULT_OPT;
25
- if(TYPE(opt) == T_HASH){
30
+ CodePoints cp1, cp2;
31
+
32
+ rb_scan_args((int32_t)argc, argv, "2:", &s1, &s2, &opt);
33
+ codepoints_init(&cp1, s1);
34
+ codepoints_init(&cp2, s2);
35
+
36
+ Options c_opt = DEFAULT_OPTIONS;
37
+ if (TYPE(opt) == T_HASH) {
26
38
  VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
27
39
  threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
28
40
  ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
29
41
  adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
30
- if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
31
- if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
32
- if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
33
- if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
34
- if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
42
+ if (!NIL_P(weight))
43
+ c_opt.weight = NUM2DBL(weight);
44
+ if (c_opt.weight > 0.25)
45
+ rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
46
+ "otherwise the distance can become "
47
+ "larger than 1.");
48
+ if (!NIL_P(threshold))
49
+ c_opt.threshold = NUM2DBL(threshold);
50
+ if (!NIL_P(ignore_case))
51
+ c_opt.ignore_case =
52
+ (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
53
+ if (!NIL_P(adj_table))
54
+ c_opt.adj_table =
55
+ (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
35
56
  }
36
- return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
57
+ VALUE ret = rb_float_new(
58
+ (*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
59
+ codepoints_free(&cp1);
60
+ codepoints_free(&cp2);
61
+ return ret;
37
62
  }
38
63
 
39
- VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
40
- return distance(argc, argv, self, jaro_distance);
64
+ VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self) {
65
+ return distance(argc, argv, self, jaro_distance_from_codes);
41
66
  }
42
67
 
43
- VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
44
- return distance(argc, argv, self, jaro_winkler_distance);
45
- }
68
+ VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self) {
69
+ return distance(argc, argv, self, jaro_winkler_distance_from_codes);
70
+ }
@@ -1,9 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'jaro_winkler/version'
2
4
 
3
- case RUBY_PLATFORM
4
- when 'java'
5
- require 'jaro_winkler/jaro_winkler_pure'
6
- else
5
+ if RUBY_ENGINE == 'ruby'
7
6
  require 'jaro_winkler/jaro_winkler_ext'
7
+ else
8
+ require 'jaro_winkler/jaro_winkler_pure'
8
9
  end
9
-
@@ -1,19 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module JaroWinkler
2
- DEFAULT_ADJ_TABLE = Hash.new
4
+ DEFAULT_ADJ_TABLE = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
3
5
  [
4
- ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
5
- ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
6
- ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
7
- ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
6
+ %w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
7
+ %w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
8
+ %w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
9
+ %w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
8
10
  ['E', ' '], ['Y', ' '], ['S', ' ']
9
- ].each{ |s1, s2|
10
- if not DEFAULT_ADJ_TABLE.has_key?(s1)
11
- DEFAULT_ADJ_TABLE[s1] = Hash.new
12
- end
13
- if not DEFAULT_ADJ_TABLE.has_key?(s2)
14
- DEFAULT_ADJ_TABLE[s2] = Hash.new
15
- end
11
+ ].each do |s1, s2|
16
12
  DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
17
- }
18
- DEFAULT_ADJ_TABLE.default = Hash.new
13
+ end
19
14
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'jaro_winkler/adjusting_table'
2
4
  module JaroWinkler
3
5
  class Error < RuntimeError; end
@@ -6,120 +8,122 @@ module JaroWinkler
6
8
  DEFAULT_WEIGHT = 0.1
7
9
  DEFAULT_THRESHOLD = 0.7
8
10
  DEFAULT_OPTIONS = {
9
- jaro: {adj_table: false, ignore_case: false},
10
- jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
11
- }
12
-
13
- module_function
14
-
15
- def distance str1, str2, options={}
16
- _distance str1.codepoints.to_a, str2.codepoints.to_a, options
17
- end
11
+ jaro: { adj_table: false, ignore_case: false },
12
+ jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
13
+ }.freeze
18
14
 
19
- def jaro_distance str1, str2, options={}
20
- _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
21
- end
15
+ class << self
16
+ def distance(str1, str2, options = {})
17
+ _distance str1.codepoints.to_a, str2.codepoints.to_a, options
18
+ end
22
19
 
23
- def _distance codes1, codes2, options={}
24
- options = DEFAULT_OPTIONS[:jaro_winkler].merge options
25
- raise InvalidWeightError if options[:weight] > 0.25
26
- jaro_distance = _jaro_distance(codes1, codes2, options);
20
+ def jaro_distance(str1, str2, options = {})
21
+ _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
22
+ end
27
23
 
28
- if jaro_distance < options[:threshold]
29
- jaro_distance
30
- else
31
- codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
32
- len1, len2 = codes1.length, codes2.length
33
- max_4 = len1 > 4 ? 4 : len1
34
- prefix = 0
35
- while prefix < max_4 && codes1[prefix] == codes2[prefix]
36
- prefix += 1
24
+ private
25
+
26
+ def _distance(codes1, codes2, options = {})
27
+ options = DEFAULT_OPTIONS[:jaro_winkler].merge options
28
+ raise InvalidWeightError if options[:weight] > 0.25
29
+ jaro_distance = _jaro_distance(codes1, codes2, options)
30
+
31
+ if jaro_distance < options[:threshold]
32
+ jaro_distance
33
+ else
34
+ codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
35
+ len1 = codes1.length
36
+ len2 = codes2.length
37
+ max_4 = len1 > 4 ? 4 : len1
38
+ prefix = 0
39
+ prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
40
+ jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
37
41
  end
38
- jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
39
42
  end
40
- end
41
-
42
- def _jaro_distance codes1, codes2, options={}
43
- options = DEFAULT_OPTIONS[:jaro].merge options
44
43
 
45
- codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
46
- len1, len2 = codes1.length, codes2.length
47
- return 0.0 if len1 == 0 || len2 == 0
44
+ def _jaro_distance(codes1, codes2, options = {})
45
+ options = DEFAULT_OPTIONS[:jaro].merge options
48
46
 
49
- if options[:ignore_case]
50
- codes1.map!{ |c| c >= 97 && c <= 122 ? c -= 32 : c }
51
- codes2.map!{ |c| c >= 97 && c <= 122 ? c -= 32 : c }
52
- end
47
+ codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
48
+ len1 = codes1.length
49
+ len2 = codes2.length
50
+ return 0.0 if len1 == 0 || len2 == 0
53
51
 
54
- window = len2/2 - 1
55
- window = 0 if(window < 0)
56
- flags1, flags2 = 0, 0
57
-
58
- # // count number of matching characters
59
- match_count = 0;
60
- i = 0
61
- while i < len1
62
- left = (i >= window) ? i - window : 0
63
- right = (i + window <= len2 - 1) ? (i + window) : (len2 - 1)
64
- right = len2 - 1 if right > len2 - 1
65
- j = left
66
- while j <= right
67
- if flags2[j] == 0 && codes1[i] == codes2[j]
68
- flags1 |= (1 << i)
69
- flags2 |= (1 << j)
70
- match_count += 1
71
- break
72
- end
73
- j +=1
52
+ if options[:ignore_case]
53
+ codes1.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
54
+ codes2.map! { |c| c >= 97 && c <= 122 ? c -= 32 : c }
74
55
  end
75
- i += 1
76
- end
77
56
 
78
- return 0.0 if match_count == 0
79
-
80
- # // count number of transpositions
81
- transposition_count = j = k = 0
82
- i = 0
83
- while i < len1
84
- if flags1[i] == 1
85
- j = k
86
- while j < len2
87
- if flags2[j] == 1
88
- k = j + 1;
89
- break;
57
+ window = len2 / 2 - 1
58
+ window = 0 if window < 0
59
+ flags1 = 0
60
+ flags2 = 0
61
+
62
+ # // count number of matching characters
63
+ match_count = 0
64
+ i = 0
65
+ while i < len1
66
+ left = i >= window ? i - window : 0
67
+ right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
68
+ right = len2 - 1 if right > len2 - 1
69
+ j = left
70
+ while j <= right
71
+ if flags2[j] == 0 && codes1[i] == codes2[j]
72
+ flags1 |= (1 << i)
73
+ flags2 |= (1 << j)
74
+ match_count += 1
75
+ break
90
76
  end
91
77
  j += 1
92
78
  end
93
- transposition_count += 1 if codes1[i] != codes2[j]
79
+ i += 1
94
80
  end
95
- i += 1
96
- end
97
81
 
98
- # // count similarities in nonmatched characters
99
- similar_count = 0
100
- if options[:adj_table] && len1 > match_count
82
+ return 0.0 if match_count == 0
83
+
84
+ # // count number of transpositions
85
+ transposition_count = j = k = 0
101
86
  i = 0
102
87
  while i < len1
103
- if flags1[i] == 0
104
- j = 0
88
+ if flags1[i] == 1
89
+ j = k
105
90
  while j < len2
106
- if flags2[j] == 0
107
- if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
108
- similar_count += 3
109
- break
110
- end
91
+ if flags2[j] == 1
92
+ k = j + 1
93
+ break
111
94
  end
112
95
  j += 1
113
96
  end
97
+ transposition_count += 1 if codes1[i] != codes2[j]
114
98
  end
115
99
  i += 1
116
100
  end
117
- end
118
101
 
119
- m = match_count.to_f
120
- t = transposition_count/2
121
- m = similar_count/10.0 + m if options[:adj_table]
122
- (m/len1 + m/len2 + (m-t)/m) / 3
123
- end
102
+ # // count similarities in nonmatched characters
103
+ similar_count = 0
104
+ if options[:adj_table] && len1 > match_count
105
+ i = 0
106
+ while i < len1
107
+ if flags1[i] == 0
108
+ j = 0
109
+ while j < len2
110
+ if flags2[j] == 0
111
+ if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
112
+ similar_count += 3
113
+ break
114
+ end
115
+ end
116
+ j += 1
117
+ end
118
+ end
119
+ i += 1
120
+ end
121
+ end
124
122
 
125
- end
123
+ m = match_count.to_f
124
+ t = transposition_count / 2
125
+ m = similar_count / 10.0 + m if options[:adj_table]
126
+ (m / len1 + m / len2 + (m - t) / m) / 3
127
+ end
128
+ end
129
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module JaroWinkler
2
- VERSION = '1.4.0'
4
+ VERSION = '1.5.1'
3
5
  end
metadata CHANGED
@@ -1,72 +1,77 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.1
5
5
  platform: java
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-12 00:00:00.000000000 Z
11
+ date: 2018-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
15
- version_requirements: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.7'
20
14
  requirement: !ruby/object:Gem::Requirement
21
15
  requirements:
22
- - - ~>
16
+ - - "~>"
23
17
  - !ruby/object:Gem::Version
24
18
  version: '1.7'
25
- prerelease: false
19
+ name: bundler
26
20
  type: :development
27
- - !ruby/object:Gem::Dependency
28
- name: rake
21
+ prerelease: false
29
22
  version_requirements: !ruby/object:Gem::Requirement
30
23
  requirements:
31
- - - ~>
24
+ - - "~>"
32
25
  - !ruby/object:Gem::Version
33
- version: '10.0'
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
34
28
  requirement: !ruby/object:Gem::Requirement
35
29
  requirements:
36
- - - ~>
30
+ - - "~>"
37
31
  - !ruby/object:Gem::Version
38
- version: '10.0'
39
- prerelease: false
32
+ version: '12.0'
33
+ name: rake
40
34
  type: :development
41
- - !ruby/object:Gem::Dependency
42
- name: rake-compiler
35
+ prerelease: false
43
36
  version_requirements: !ruby/object:Gem::Requirement
44
37
  requirements:
45
- - - '>='
38
+ - - "~>"
46
39
  - !ruby/object:Gem::Version
47
- version: '0'
40
+ version: '12.0'
41
+ - !ruby/object:Gem::Dependency
48
42
  requirement: !ruby/object:Gem::Requirement
49
43
  requirements:
50
- - - '>='
44
+ - - ">="
51
45
  - !ruby/object:Gem::Version
52
46
  version: '0'
53
- prerelease: false
47
+ name: rake-compiler
54
48
  type: :development
55
- - !ruby/object:Gem::Dependency
56
- name: minitest
49
+ prerelease: false
57
50
  version_requirements: !ruby/object:Gem::Requirement
58
51
  requirements:
59
- - - '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
62
56
  requirement: !ruby/object:Gem::Requirement
63
57
  requirements:
64
- - - '>='
58
+ - - ">="
65
59
  - !ruby/object:Gem::Version
66
60
  version: '0'
67
- prerelease: false
61
+ name: minitest
68
62
  type: :development
69
- description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: |-
70
+ jaro_winkler is an implementation of Jaro-Winkler \
71
+ distance algorithm which is written in C extension and will fallback to pure \
72
+ Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of \
73
+ C and Ruby implementation support any kind of string encoding, such as \
74
+ UTF-8, EUC-JP, Big5, etc.
70
75
  email: tonytonyjan@gmail.com
71
76
  executables: []
72
77
  extensions: []
@@ -74,12 +79,11 @@ extra_rdoc_files: []
74
79
  files:
75
80
  - ext/jaro_winkler/adj_matrix.c
76
81
  - ext/jaro_winkler/adj_matrix.h
77
- - ext/jaro_winkler/code.c
78
- - ext/jaro_winkler/code.h
82
+ - ext/jaro_winkler/codepoints.c
83
+ - ext/jaro_winkler/codepoints.h
79
84
  - ext/jaro_winkler/jaro.c
80
85
  - ext/jaro_winkler/jaro.h
81
86
  - ext/jaro_winkler/jaro_winkler.c
82
- - ext/jaro_winkler/murmur_hash2.c
83
87
  - lib/jaro_winkler.rb
84
88
  - lib/jaro_winkler/adjusting_table.rb
85
89
  - lib/jaro_winkler/jaro_winkler_pure.rb
@@ -94,18 +98,19 @@ require_paths:
94
98
  - lib
95
99
  required_ruby_version: !ruby/object:Gem::Requirement
96
100
  requirements:
97
- - - '>='
101
+ - - ">="
98
102
  - !ruby/object:Gem::Version
99
103
  version: '0'
100
104
  required_rubygems_version: !ruby/object:Gem::Requirement
101
105
  requirements:
102
- - - '>='
106
+ - - ">="
103
107
  - !ruby/object:Gem::Version
104
108
  version: '0'
105
109
  requirements: []
106
110
  rubyforge_project:
107
- rubygems_version: 2.4.5
111
+ rubygems_version: 2.6.14.1
108
112
  signing_key:
109
113
  specification_version: 4
110
- summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.
114
+ summary: An implementation of Jaro-Winkler distance algorithm written \ in C extension
115
+ which supports any kind of string encoding.
111
116
  test_files: []
@@ -1,29 +0,0 @@
1
- #include <stdlib.h>
2
- #include <string.h>
3
-
4
- void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
5
- unsigned char first_char = str[0];
6
- if(first_char >= 252) *ret_byte_length = 6; // 1111110x
7
- else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
8
- else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
9
- else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
10
- else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
11
- else *ret_byte_length = 1;
12
- *ret_code = 0;
13
- memcpy(ret_code, str, *ret_byte_length);
14
- }
15
-
16
- void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
17
- unsigned int code;
18
- char byte_length;
19
-
20
- *ret_codes = calloc(length, sizeof(long long));
21
- *ret_length = 0;
22
-
23
- for(int i = 0; i < length;){
24
- int byte_length;
25
- utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
26
- *ret_length += 1;
27
- i += byte_length;
28
- }
29
- }
@@ -1,7 +0,0 @@
1
- #ifndef CODE_H
2
- #define CODE_H
3
-
4
- void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
- void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
-
7
- #endif
@@ -1,64 +0,0 @@
1
- //-----------------------------------------------------------------------------
2
- // MurmurHash2, by Austin Appleby
3
-
4
- // Note - This code makes a few assumptions about how your machine behaves -
5
-
6
- // 1. We can read a 4-byte value from any address without crashing
7
- // 2. sizeof(int) == 4
8
-
9
- // And it has a few limitations -
10
-
11
- // 1. It will not work incrementally.
12
- // 2. It will not produce the same results on little-endian and big-endian
13
- // machines.
14
-
15
- unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
- {
17
- // 'm' and 'r' are mixing constants generated offline.
18
- // They're not really 'magic', they just happen to work well.
19
-
20
- const unsigned int m = 0x5bd1e995;
21
- const int r = 24;
22
-
23
- // Initialize the hash to a 'random' value
24
-
25
- unsigned int h = seed ^ len;
26
-
27
- // Mix 4 bytes at a time into the hash
28
-
29
- const unsigned char * data = (const unsigned char *)key;
30
-
31
- while(len >= 4)
32
- {
33
- unsigned int k = *(unsigned int *)data;
34
-
35
- k *= m;
36
- k ^= k >> r;
37
- k *= m;
38
-
39
- h *= m;
40
- h ^= k;
41
-
42
- data += 4;
43
- len -= 4;
44
- }
45
-
46
- // Handle the last few bytes of the input array
47
-
48
- switch(len)
49
- {
50
- case 3: h ^= data[2] << 16;
51
- case 2: h ^= data[1] << 8;
52
- case 1: h ^= data[0];
53
- h *= m;
54
- };
55
-
56
- // Do a few final mixes of the hash to ensure the last few
57
- // bytes are well-incorporated.
58
-
59
- h ^= h >> 13;
60
- h *= m;
61
- h ^= h >> 15;
62
-
63
- return h;
64
- }