jaro_winkler 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/README.md +76 -6
- data/Rakefile +5 -0
- data/benchmark/native.rb +25 -0
- data/benchmark/pure.rb +20 -0
- data/ext/jaro_winkler/extconf.rb +3 -0
- data/ext/jaro_winkler/jaro_winkler.c +32 -0
- data/ext/jaro_winkler/jaro_winkler.h +8 -0
- data/ext/jaro_winkler/strcmp95.c +199 -0
- data/jaro_winkler.gemspec +2 -0
- data/lib/jaro_winkler.rb +5 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +31 -13
- metadata +32 -11
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MDFmODkwMGZkZTQ5OTk4YjZlODRjOWVmNjQxMDVhMjQ1NDlhYzhiMg==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f84ae47f4c9a9afe40e7b01c7d49359dd6986009
|
4
|
+
data.tar.gz: 6994fd05b085d89e203f1c5663915bdc9ce88735
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
NzM1ZGRjM2EwZTUxMzM4MWQzM2JhZDE3NjdkMmE2NTVlODdiMjM2YmJlMWY3
|
11
|
-
N2I4MjdhZjkwNzcxNTEyODE4ZTdkNjg1NmZjMjQ0ZWNkN2MzODM=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
ZDc4YWQ3NDJlYmYwMDg2MWQyODVkYzI1MjgyMzRhY2EyZDdmNzBiMWQ2MWIw
|
14
|
-
M2I3OWYyZTZkOTc4YzFjODM3MWM3OThmOTRhYjM4YjIwYTc0YTZhMjBmZWJk
|
15
|
-
ZTg4M2Y2MWEzMWU0YzM2ZWQ3MTBmYTZjZjUxMTI4NDMxODQzYTM=
|
6
|
+
metadata.gz: b2bf1f5c392750f82e203aafed8264eaa79750b9a3a11209cc221d8506cbf8b1840cd8822b631f0d5de5c6b8bd2c2c471b0992c55986918b81b9e4ef5236e072
|
7
|
+
data.tar.gz: c02296f3c27531a518f752177cca86c7e00670a1c3e7566bb6c3b499f7f676148a44fd3ec6848eedd50072bcd94a2c4781d9667440b44cd4032e24570f10a862
|
data/README.md
CHANGED
@@ -2,22 +2,26 @@
|
|
2
2
|
|
3
3
|
It's a pure Ruby implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm.
|
4
4
|
|
5
|
-
|
5
|
+
# Installation
|
6
6
|
|
7
7
|
```
|
8
8
|
gem install jaro_winkler
|
9
9
|
```
|
10
10
|
|
11
|
-
|
11
|
+
# Usage
|
12
12
|
|
13
13
|
```ruby
|
14
14
|
require 'jaro_winkler'
|
15
|
-
JaroWinkler.
|
15
|
+
JaroWinkler.distance "MARTHA", "MARHTA"
|
16
16
|
# => 0.9611
|
17
|
-
JaroWinkler.
|
17
|
+
JaroWinkler.distance "MARTHA", "marhta", case_match: true
|
18
18
|
# => 0.9611
|
19
|
-
JaroWinkler.
|
19
|
+
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
20
|
# => 0.9778
|
21
|
+
|
22
|
+
# Native
|
23
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
+
JaroWinkler.distance "MARTHA", "MARHTA", native: true
|
21
25
|
```
|
22
26
|
|
23
27
|
## Options
|
@@ -26,4 +30,70 @@ Name | Type | Default | Note
|
|
26
30
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
27
31
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
28
32
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
29
|
-
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above a this.
|
33
|
+
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
|
34
|
+
native | boolean | false | Use the native version. Note that it ignores all the other options.
|
35
|
+
|
36
|
+
## Pure Ruby vs. Native
|
37
|
+
|
38
|
+
| Pure | Native
|
39
|
+
-------------- | ---- | ------
|
40
|
+
UTF-8 Support | Yes | No
|
41
|
+
Option Setting | Yes | No
|
42
|
+
|
43
|
+
# Why This?
|
44
|
+
|
45
|
+
There is another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match); it uses the same algorithm and provides both C and Ruby implementations.
|
46
|
+
|
47
|
+
I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks Ruby convention, and it contains some odd code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it has a bug:
|
48
|
+
|
49
|
+
string 1 | string 2 | origin | fuzzy-string-match | jaro_winkler
|
50
|
+
---------- | ---------- | -------- | ------------------ | ------------------
|
51
|
+
"henka" | "henkan" | 0.966667 | 0.9722 (wrong) | 0.9666666666666667
|
52
|
+
"al" | "al" | 1.000000 | 1.0 | 1.0
|
53
|
+
"martha" | "marhta" | 0.961111 | 0.9611 | 0.9611111111111111
|
54
|
+
"jones" | "johnson" | 0.832381 | 0.8323 | 0.8323809523809523
|
55
|
+
"abcvwxyz" | "cabvwxyz" | 0.958333 | 0.9583 | 0.9583333333333333
|
56
|
+
"dwayne" | "duane" | 0.840000 | 0.8400 | 0.84
|
57
|
+
"dixon" | "dicksonx" | 0.813333 | 0.8133 | 0.8133333333333332
|
58
|
+
"fvie" | "ten" | 0.000000 | 0.0 | 0
|
59
|
+
|
60
|
+
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
61
|
+
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
62
|
+
|
63
|
+
## Benchmark
|
64
|
+
|
65
|
+
- jaro_winkler (1.0.1)
|
66
|
+
- fuzzy-string-match (0.9.6)
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
require 'benchmark'
|
70
|
+
require 'jaro_winkler'
|
71
|
+
require 'fuzzystringmatch'
|
72
|
+
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
73
|
+
|
74
|
+
n = 100000
|
75
|
+
Benchmark.bmbm do |x|
|
76
|
+
x.report 'jaro_winkler ' do
|
77
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2) } }
|
78
|
+
end
|
79
|
+
|
80
|
+
x.report 'fuzzystringmatch' do
|
81
|
+
jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
|
82
|
+
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
# user system total real
|
87
|
+
# jaro_winkler 12.480000 0.010000 12.490000 ( 12.497828)
|
88
|
+
# fuzzystringmatch 14.990000 0.010000 15.000000 ( 15.014898)
|
89
|
+
```
|
90
|
+
|
91
|
+
# Todo
|
92
|
+
|
93
|
+
- Speed up `#distance(s1, s2, native: true)`
|
94
|
+
- Support UTF-8 in native version.
|
95
|
+
- Add more options to the native version.
|
96
|
+
- case_match
|
97
|
+
- weight
|
98
|
+
- threshold
|
99
|
+
- adjusting word table (It's from the original C implementation.)
|
data/Rakefile
CHANGED
data/benchmark/native.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'jaro_winkler'
|
3
|
+
require 'fuzzystringmatch'
|
4
|
+
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
5
|
+
|
6
|
+
n = 100000
|
7
|
+
Benchmark.bmbm do |x|
|
8
|
+
x.report '#c_distance(s1, s2)' do
|
9
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
|
10
|
+
end
|
11
|
+
|
12
|
+
x.report '#distance(s1, s2, native: true)' do
|
13
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2, native: true) } }
|
14
|
+
end
|
15
|
+
|
16
|
+
x.report 'fuzzystringmatch' do
|
17
|
+
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
18
|
+
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# user system total real
|
23
|
+
# #c_distance(s1, s2) 0.270000 0.000000 0.270000 ( 0.270250)
|
24
|
+
# #distance(s1, s2, native: true) 2.030000 0.050000 2.080000 ( 2.075878)
|
25
|
+
# fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.141239)
|
data/benchmark/pure.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'jaro_winkler'
|
3
|
+
require 'fuzzystringmatch'
|
4
|
+
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
5
|
+
|
6
|
+
n = 100000
|
7
|
+
Benchmark.bmbm do |x|
|
8
|
+
x.report 'jaro_winkler ' do
|
9
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2) } }
|
10
|
+
end
|
11
|
+
|
12
|
+
x.report 'fuzzystringmatch' do
|
13
|
+
jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
|
14
|
+
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# user system total real
|
19
|
+
# jaro_winkler 12.480000 0.010000 12.490000 ( 12.497828)
|
20
|
+
# fuzzystringmatch 14.990000 0.010000 15.000000 ( 15.014898)
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#include "jaro_winkler.h"
|
2
|
+
#define MAX(X,Y) ((X) < (Y) ? (Y) : (X))
|
3
|
+
VALUE rb_mJaroWinkler;
|
4
|
+
|
5
|
+
static VALUE distance(VALUE self, VALUE s1, VALUE s2){
|
6
|
+
Check_Type(s1, T_STRING); Check_Type(s2, T_STRING);
|
7
|
+
// Check encoding
|
8
|
+
VALUE s1_ascii_only = TYPE(rb_funcall(s1, rb_intern("ascii_only?"), 0));
|
9
|
+
VALUE s2_ascii_only = TYPE(rb_funcall(s2, rb_intern("ascii_only?"), 0));
|
10
|
+
if(s1_ascii_only == T_FALSE || s2_ascii_only == T_FALSE) printf("WARNING: Non-ASCII string detected.\n");
|
11
|
+
|
12
|
+
VALUE *s1_ptr = &s1, *s2_ptr = &s2;
|
13
|
+
// guarantee the length of s1_ptr is less than or equal to that of s2_ptr
|
14
|
+
if(RSTRING_LEN(s1) > RSTRING_LEN(s2)){ VALUE *tmp = s1_ptr; s1_ptr = s2_ptr; s2_ptr = tmp; }
|
15
|
+
int min_length = RSTRING_LEN(*s1_ptr), max_length = RSTRING_LEN(*s2_ptr);
|
16
|
+
char *c_s1_ptr = StringValuePtr(*s1_ptr), *c_s2_ptr = StringValuePtr(*s2_ptr);
|
17
|
+
int opt[] = {1, 0};
|
18
|
+
if(min_length != max_length){
|
19
|
+
// padding spaces
|
20
|
+
char buf[max_length];
|
21
|
+
for(int i = min_length; i < max_length; ++i) buf[i] = ' ';
|
22
|
+
memcpy(buf, c_s1_ptr, min_length);
|
23
|
+
c_s1_ptr = buf;
|
24
|
+
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
25
|
+
}
|
26
|
+
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
27
|
+
}
|
28
|
+
|
29
|
+
void Init_jaro_winkler(void){
|
30
|
+
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
31
|
+
rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, 2);
|
32
|
+
}
|
@@ -0,0 +1,199 @@
|
|
1
|
+
|
2
|
+
/* strcmp95.c Version 2 */
|
3
|
+
|
4
|
+
/* The strcmp95 function returns a double precision value from 0.0 (total
|
5
|
+
disagreement) to 1.0 (character-by-character agreement). The returned
|
6
|
+
value is a measure of the similarity of the two strings. */
|
7
|
+
|
8
|
+
/* Date of Release: Jan. 26, 1994 */
|
9
|
+
/* Modified: April 24, 1994 Corrected the processing of the single length
|
10
|
+
character strings.
|
11
|
+
Authors: This function was written using the logic from code written by
|
12
|
+
Bill Winkler, George McLaughlin and Matt Jaro with modifications
|
13
|
+
by Maureen Lynch.
|
14
|
+
Comment: This is the official string comparator to be used for matching
|
15
|
+
during the 1995 Test Census. */
|
16
|
+
|
17
|
+
#include <ctype.h>
|
18
|
+
#include <string.h>
|
19
|
+
|
20
|
+
#define NOTNUM(c) ((c>57) || (c<48))
|
21
|
+
#define INRANGE(c) ((c>0) && (c<91))
|
22
|
+
#define MAX_VAR_SIZE 61
|
23
|
+
#define NULL60 " "
|
24
|
+
|
25
|
+
double strcmp95(char *ying, char *yang, long y_length, int *ind_c)
|
26
|
+
|
27
|
+
{
|
28
|
+
/* Arguments:
|
29
|
+
|
30
|
+
ying and yang are pointers to the 2 strings to be compared. The strings
|
31
|
+
need not be NUL-terminated strings because the length is passed.
|
32
|
+
|
33
|
+
y_length is the length of the strings.
|
34
|
+
|
35
|
+
ind_c is an array that is used to define whether certain options should be
|
36
|
+
activated. A nonzero value indicates the option is deactivated.
|
37
|
+
The options are:
|
38
|
+
ind_c[0] Increase the probability of a match when the number of matched
|
39
|
+
characters is large. This option allows for a little more
|
40
|
+
tolerance when the strings are large. It is not an appropriate
|
41
|
+
test when comparing fixed length fields such as phone and
|
42
|
+
social security numbers.
|
43
|
+
ind_c[1] All lower case characters are converted to upper case prior
|
44
|
+
to the comparison. Disabling this feature means that the lower
|
45
|
+
case string "code" will not be recognized as the same as the
|
46
|
+
upper case string "CODE". Also, the adjustment for similar
|
47
|
+
characters section only applies to uppercase characters.
|
48
|
+
|
49
|
+
The suggested values are all zeros for character strings such as names. */
|
50
|
+
|
51
|
+
static int pass=1/* TODO: make it an option parameter */, adjwt[91][91];
|
52
|
+
static char sp[39][2] =
|
53
|
+
{'A','E', 'A','I', 'A','O', 'A','U', 'B','V', 'E','I', 'E','O', 'E','U',
|
54
|
+
'I','O', 'I','U', 'O','U', 'I','Y', 'E','Y', 'C','G', 'E','F',
|
55
|
+
'W','U', 'W','V', 'X','K', 'S','Z', 'X','S', 'Q','C', 'U','V',
|
56
|
+
'M','N', 'L','I', 'Q','O', 'P','R', 'I','J', '2','Z', '5','S',
|
57
|
+
'8','B', '1','I', '1','L', '0','O', '0','Q', 'C','K', 'G','J',
|
58
|
+
'E',' ', 'Y',' ', 'S',' '};
|
59
|
+
|
60
|
+
char ying_hold[MAX_VAR_SIZE],
|
61
|
+
yang_hold[MAX_VAR_SIZE],
|
62
|
+
ying_flag[MAX_VAR_SIZE],
|
63
|
+
yang_flag[MAX_VAR_SIZE];
|
64
|
+
|
65
|
+
double weight, Num_sim;
|
66
|
+
|
67
|
+
long minv, search_range, lowlim, ying_length,
|
68
|
+
hilim, N_trans, Num_com, yang_length;
|
69
|
+
|
70
|
+
int yl1, yi_st, N_simi;
|
71
|
+
|
72
|
+
register int i, j, k;
|
73
|
+
|
74
|
+
/* Initialize the adjwt array on the first call to the function only.
|
75
|
+
The adjwt array is used to give partial credit for characters that
|
76
|
+
may be errors due to known phonetic or character recognition errors.
|
77
|
+
A typical example is to match the letter "O" with the number "0" */
|
78
|
+
if (!pass) {
|
79
|
+
pass++;
|
80
|
+
for (i=0; i<91; i++) for (j=0; j<91; j++) adjwt[i][j] = 0;
|
81
|
+
for (i=0; i<36; i++) {
|
82
|
+
adjwt[sp[i][0]][sp[i][1]] = 3;
|
83
|
+
adjwt[sp[i][1]][sp[i][0]] = 3;
|
84
|
+
} }
|
85
|
+
|
86
|
+
/* If either string is blank - return - added in Version 2 */
|
87
|
+
if (!strncmp(ying,NULL60,y_length)) return(0.0);
|
88
|
+
if (!strncmp(yang,NULL60,y_length)) return(0.0);
|
89
|
+
|
90
|
+
/* Identify the strings to be compared by stripping off all leading and
|
91
|
+
trailing spaces. */
|
92
|
+
k = y_length - 1;
|
93
|
+
for(j = 0;((ying[j]==' ') && (j < k));j++);
|
94
|
+
for(i = k;((ying[i]==' ') && (i > 0));i--);
|
95
|
+
ying_length = i + 1 - j;
|
96
|
+
yi_st = j;
|
97
|
+
|
98
|
+
for(j = 0;((yang[j]==' ') && (j < k));j++);
|
99
|
+
for(i = k;((yang[i]==' ') && (i > 0));i--);
|
100
|
+
yang_length = i + 1 - j;
|
101
|
+
|
102
|
+
ying_hold[0]=yang_hold[0]=0;
|
103
|
+
strncat(ying_hold,&ying[yi_st],ying_length);
|
104
|
+
strncat(yang_hold,&yang[j],yang_length);
|
105
|
+
|
106
|
+
if (ying_length > yang_length) {
|
107
|
+
search_range = ying_length;
|
108
|
+
minv = yang_length;
|
109
|
+
}
|
110
|
+
else {
|
111
|
+
search_range = yang_length;
|
112
|
+
minv = ying_length;
|
113
|
+
}
|
114
|
+
|
115
|
+
/* If either string is blank - return */
|
116
|
+
/* if (!minv) return(0.0); removed in version 2 */
|
117
|
+
|
118
|
+
/* Blank out the flags */
|
119
|
+
ying_flag[0] = yang_flag[0] = 0;
|
120
|
+
strncat(ying_flag,NULL60,search_range);
|
121
|
+
strncat(yang_flag,NULL60,search_range);
|
122
|
+
search_range = (search_range/2) - 1;
|
123
|
+
if (search_range < 0) search_range = 0; /* added in version 2 */
|
124
|
+
|
125
|
+
/* Convert all lower case characters to upper case. */
|
126
|
+
if (!ind_c[1]) {
|
127
|
+
for (i = 0;i < ying_length;i++) if (islower(ying_hold[i])) ying_hold[i] -= 32;
|
128
|
+
for (j = 0;j < yang_length;j++) if (islower(yang_hold[j])) yang_hold[j] -= 32;
|
129
|
+
}
|
130
|
+
|
131
|
+
/* Looking only within the search range, count and flag the matched pairs. */
|
132
|
+
Num_com = 0;
|
133
|
+
yl1 = yang_length - 1;
|
134
|
+
for (i = 0;i < ying_length;i++) {
|
135
|
+
lowlim = (i >= search_range) ? i - search_range : 0;
|
136
|
+
hilim = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
|
137
|
+
for (j = lowlim;j <= hilim;j++) {
|
138
|
+
if ((yang_flag[j] != '1') && (yang_hold[j] == ying_hold[i])) {
|
139
|
+
yang_flag[j] = '1';
|
140
|
+
ying_flag[i] = '1';
|
141
|
+
Num_com++;
|
142
|
+
break;
|
143
|
+
} } }
|
144
|
+
|
145
|
+
/* If no characters in common - return */
|
146
|
+
if (!Num_com) return(0.0);
|
147
|
+
|
148
|
+
/* Count the number of transpositions */
|
149
|
+
k = N_trans = 0;
|
150
|
+
for (i = 0;i < ying_length;i++) {
|
151
|
+
if (ying_flag[i] == '1') {
|
152
|
+
for (j = k;j < yang_length;j++) {
|
153
|
+
if (yang_flag[j] == '1') {
|
154
|
+
k = j + 1;
|
155
|
+
break;
|
156
|
+
} }
|
157
|
+
if (ying_hold[i] != yang_hold[j]) N_trans++;
|
158
|
+
} }
|
159
|
+
N_trans = N_trans / 2;
|
160
|
+
|
161
|
+
/* adjust for similarities in nonmatched characters */
|
162
|
+
N_simi = 0;
|
163
|
+
if (minv > Num_com) {
|
164
|
+
for (i = 0;i < ying_length;i++) {
|
165
|
+
if (ying_flag[i] == ' ' && INRANGE(ying_hold[i])) {
|
166
|
+
for (j = 0;j < yang_length;j++) {
|
167
|
+
if (yang_flag[j] == ' ' && INRANGE(yang_hold[j])) {
|
168
|
+
if (adjwt[ying_hold[i]][yang_hold[j]] > 0) {
|
169
|
+
N_simi += adjwt[ying_hold[i]][yang_hold[j]];
|
170
|
+
yang_flag[j] = '2';
|
171
|
+
break;
|
172
|
+
} } } } } }
|
173
|
+
Num_sim = ((double) N_simi)/10.0 + Num_com;
|
174
|
+
|
175
|
+
/* Main weight computation. */
|
176
|
+
weight= Num_sim / ((double) ying_length) + Num_sim / ((double) yang_length)
|
177
|
+
+ ((double) (Num_com - N_trans)) / ((double) Num_com);
|
178
|
+
weight = weight / 3.0;
|
179
|
+
|
180
|
+
/* Continue to boost the weight if the strings are similar */
|
181
|
+
if (weight > 0.7) {
|
182
|
+
|
183
|
+
/* Adjust for having up to the first 4 characters in common */
|
184
|
+
j = (minv >= 4) ? 4 : minv;
|
185
|
+
for (i=0;((i<j)&&(ying_hold[i]==yang_hold[i])&&(NOTNUM(ying_hold[i])));i++);
|
186
|
+
if (i) weight += i * 0.1 * (1.0 - weight);
|
187
|
+
|
188
|
+
/* Optionally adjust for long strings. */
|
189
|
+
/* After agreeing beginning chars, at least two more must agree and
|
190
|
+
the agreeing characters must be > .5 of remaining characters. */
|
191
|
+
if ((!ind_c[0]) && (minv>4) && (Num_com>i+1) && (2*Num_com>=minv+i))
|
192
|
+
if (NOTNUM(ying_hold[0]))
|
193
|
+
weight += (double) (1.0-weight) *
|
194
|
+
((double) (Num_com-i-1) / ((double) (ying_length+yang_length-i*2+2)));
|
195
|
+
}
|
196
|
+
|
197
|
+
return(weight);
|
198
|
+
|
199
|
+
} /* strcmp95 */
|
data/jaro_winkler.gemspec
CHANGED
@@ -8,6 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = JaroWinkler::VERSION
|
9
9
|
spec.authors = ["Jian Weihang"]
|
10
10
|
spec.email = ["tonytonyjan@gmail.com"]
|
11
|
+
spec.extensions = ["ext/jaro_winkler/extconf.rb"]
|
11
12
|
spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
|
12
13
|
spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
|
13
14
|
spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
|
@@ -20,4 +21,5 @@ Gem::Specification.new do |spec|
|
|
20
21
|
|
21
22
|
spec.add_development_dependency "bundler", "~> 1.7"
|
22
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
|
+
spec.add_development_dependency "rake-compiler"
|
23
25
|
end
|
data/lib/jaro_winkler.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'jaro_winkler/jaro_winkler.so'
|
1
2
|
module JaroWinkler
|
2
3
|
module_function
|
3
4
|
def jaro_distance s1, s2
|
@@ -33,11 +34,12 @@ module JaroWinkler
|
|
33
34
|
end
|
34
35
|
end
|
35
36
|
# Don't divide transpositions by 2 since it's been counted directly by above code.
|
36
|
-
matches == 0 ? 0 :
|
37
|
+
matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0
|
37
38
|
end
|
38
39
|
|
39
|
-
def
|
40
|
-
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
40
|
+
def distance s1, s2, options = {}
|
41
|
+
options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
|
42
|
+
return c_distance(s1, s2) if options[:native]
|
41
43
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
42
44
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
43
45
|
s1, s2 = s1.downcase, s2.downcase if case_match
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -1,24 +1,42 @@
|
|
1
1
|
require 'jaro_winkler'
|
2
|
-
|
2
|
+
include JaroWinkler
|
3
3
|
describe JaroWinkler do
|
4
|
+
before(:all) do
|
5
|
+
@ary = [
|
6
|
+
['henka', 'henkan', 0.9667],
|
7
|
+
['al', 'al', 1.0],
|
8
|
+
['martha', 'marhta', 0.9611],
|
9
|
+
['jones', 'johnson', 0.8323],
|
10
|
+
['abcvwxyz', 'cabvwxyz', 0.9583],
|
11
|
+
['dwayne', 'duane', 0.8400],
|
12
|
+
['dixon', 'dicksonx', 0.8133],
|
13
|
+
['fvie', 'ten', 0.0],
|
14
|
+
['tony', 'tony', 1.0],
|
15
|
+
['tonytonyjan', 'tonytonyjan', 1.0],
|
16
|
+
['', '', 0.0],
|
17
|
+
['tony', '', 0.0],
|
18
|
+
['', 'tony', 0.0]
|
19
|
+
]
|
20
|
+
end
|
21
|
+
|
4
22
|
it 'works' do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
23
|
+
@ary.each do |s1, s2, ans|
|
24
|
+
expect(distance(s1, s2)).to be_within(0.0001).of(ans)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'supports C extension' do
|
29
|
+
@ary.each do |s1, s2, ans|
|
30
|
+
expect(distance(s1, s2, native: true)).to be_within(0.0001).of(ans)
|
31
|
+
end
|
14
32
|
end
|
15
33
|
|
16
34
|
it 'can ignore case' do
|
17
|
-
expect(
|
35
|
+
expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
18
36
|
end
|
19
37
|
|
20
38
|
it 'can set weight' do
|
21
|
-
expect(
|
22
|
-
expect{
|
39
|
+
expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
40
|
+
expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
23
41
|
end
|
24
42
|
end
|
metadata
CHANGED
@@ -1,56 +1,77 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.7'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Pure Ruby implementation of Jaro-Winkler distance algorithm.
|
42
56
|
email:
|
43
57
|
- tonytonyjan@gmail.com
|
44
58
|
executables: []
|
45
|
-
extensions:
|
59
|
+
extensions:
|
60
|
+
- ext/jaro_winkler/extconf.rb
|
46
61
|
extra_rdoc_files: []
|
47
62
|
files:
|
48
|
-
- .gitignore
|
49
|
-
- .rspec
|
63
|
+
- ".gitignore"
|
64
|
+
- ".rspec"
|
50
65
|
- Gemfile
|
51
66
|
- LICENSE.txt
|
52
67
|
- README.md
|
53
68
|
- Rakefile
|
69
|
+
- benchmark/native.rb
|
70
|
+
- benchmark/pure.rb
|
71
|
+
- ext/jaro_winkler/extconf.rb
|
72
|
+
- ext/jaro_winkler/jaro_winkler.c
|
73
|
+
- ext/jaro_winkler/jaro_winkler.h
|
74
|
+
- ext/jaro_winkler/strcmp95.c
|
54
75
|
- jaro_winkler.gemspec
|
55
76
|
- lib/jaro_winkler.rb
|
56
77
|
- lib/jaro_winkler/version.rb
|
@@ -66,12 +87,12 @@ require_paths:
|
|
66
87
|
- lib
|
67
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
68
89
|
requirements:
|
69
|
-
- -
|
90
|
+
- - ">="
|
70
91
|
- !ruby/object:Gem::Version
|
71
92
|
version: '0'
|
72
93
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
94
|
requirements:
|
74
|
-
- -
|
95
|
+
- - ">="
|
75
96
|
- !ruby/object:Gem::Version
|
76
97
|
version: '0'
|
77
98
|
requirements: []
|