hotwater 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +45 -0
- data/README.md +70 -0
- data/Rakefile +25 -0
- data/ext/hotwater/Rakefile +5 -0
- data/ext/hotwater/damerau_levenshtein.c +68 -0
- data/ext/hotwater/hotwater.h +23 -0
- data/ext/hotwater/jaro.c +144 -0
- data/ext/hotwater/levenshtein.c +51 -0
- data/ext/hotwater/ngram.c +173 -0
- data/hotwater.gemspec +26 -0
- data/lib/hotwater/damerau_levenshtein_ffi.rb +26 -0
- data/lib/hotwater/jaro_ffi.rb +26 -0
- data/lib/hotwater/levenshtein_ffi.rb +25 -0
- data/lib/hotwater/ngram_ffi.rb +18 -0
- data/lib/hotwater/version.rb +3 -0
- data/lib/hotwater.rb +16 -0
- data/spec/hotwater/damerau_levenshtein_ffi_spec.rb +29 -0
- data/spec/hotwater/jaro_ffi_spec.rb +19 -0
- data/spec/hotwater/levenshtein_ffi_spec.rb +29 -0
- data/spec/hotwater/ngram_ffi_spec.rb +75 -0
- metadata +143 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
Copyright (c) 2013 Colin Surprenant <colin.surprenant@gmail.com>
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
|
15
|
+
--------
|
16
|
+
|
17
|
+
C code from the https://github.com/sunlightlabs/jellyfish project
|
18
|
+
|
19
|
+
Copyright (c) 2010, Sunlight Labs
|
20
|
+
|
21
|
+
All rights reserved.
|
22
|
+
|
23
|
+
Redistribution and use in source and binary forms, with or without modification,
|
24
|
+
are permitted provided that the following conditions are met:
|
25
|
+
|
26
|
+
* Redistributions of source code must retain the above copyright notice,
|
27
|
+
this list of conditions and the following disclaimer.
|
28
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
29
|
+
this list of conditions and the following disclaimer in the documentation
|
30
|
+
and/or other materials provided with the distribution.
|
31
|
+
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
32
|
+
used to endorse or promote products derived from this software without
|
33
|
+
specific prior written permission.
|
34
|
+
|
35
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
36
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
37
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
38
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
39
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
40
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
41
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
42
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
43
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
44
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
45
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# Hotwater v0.1.0
|
2
|
+
|
3
|
+
Ruby & JRuby gem with fast **string edit distance** C implementations using FFI bindings.
|
4
|
+
|
5
|
+
### Algorithms
|
6
|
+
|
7
|
+
- Levenshtein & Damerau Levenshtein
|
8
|
+
- Jaro & Jaro Winkler
|
9
|
+
- N-Gram
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'hotwater'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install hotwater
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
Hotwater.levenshtein_distance("abc", "acb") # => 2
|
29
|
+
Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
|
30
|
+
|
31
|
+
# do normalization based on the string sizes
|
32
|
+
Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
|
33
|
+
Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
|
34
|
+
|
35
|
+
Hotwater.jaro_distance("martha", "marhta").round(4) # => 0.9444
|
36
|
+
Hotwater.jaro_winkler_distance("martha", "marhta").round(4) # => 0.9611
|
37
|
+
|
38
|
+
# default is bigram
|
39
|
+
Hotwater.ngram_distance("natural", "contrary").round(4) # => 0.25
|
40
|
+
|
41
|
+
# specify trigram
|
42
|
+
Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
|
43
|
+
```
|
44
|
+
|
45
|
+
## Development
|
46
|
+
|
47
|
+
1. Fork it
|
48
|
+
2. Install gems `$ bundle install`
|
49
|
+
3. Compile lib `$ rake compile`
|
50
|
+
4. Run specs `$ rake spec`
|
51
|
+
5. Clean compiler generated files `$ rake clean`
|
52
|
+
|
53
|
+
## Contributing
|
54
|
+
|
55
|
+
1. Fork it
|
56
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
57
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
58
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
59
|
+
5. Create new Pull Request
|
60
|
+
|
61
|
+
## Credits
|
62
|
+
- Some C code from the https://github.com/sunlightlabs/jellyfish project
|
63
|
+
- N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
|
64
|
+
|
65
|
+
## Author
|
66
|
+
Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
|
67
|
+
|
68
|
+
## License
|
69
|
+
Hotwater is distributed under the Apache License, Version 2.0.
|
70
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'bundler/gem_tasks'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
require 'ffi-compiler/compile_task'
|
7
|
+
|
8
|
+
task :default => :spec

# Define the :spec task once, when the Rakefile is loaded. Creating the
# RSpec rake task from inside another task's body would only register it
# while :spec is already executing.
desc "run specs"
RSpec::Core::RakeTask.new(:spec)

desc "compiler tasks"
namespace "ffi-compiler" do
  FFI::Compiler::CompileTask.new('ext/hotwater/hotwater') do |_compiler|
    # no additional compiler configuration
  end
end

# Convenience alias so the library can be built with `rake compile`.
task :compile => ["ffi-compiler:default"]

# Build artifacts removed by `rake clean`.
CLEAN.include('ext/**/*{.o,.log,.so,.bundle}')
CLEAN.include('lib/**/*{.o,.log,.so,.bundle}')
CLEAN.include('ext/**/Makefile')
|
25
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
#include "hotwater.h"
|
4
|
+
#include <string.h>
|
5
|
+
|
6
|
+
/*
 * Edit distance between s1 and s2 counting insertions, deletions,
 * substitutions and transpositions of two adjacent characters.
 * Strings are compared byte by byte; a NULL pointer is treated as "".
 *
 * Returns the distance, or -1 when the work matrix cannot be allocated.
 */
int damerau_levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;

    size_t i, j;
    size_t cost, candidate, best;
    size_t *dist;

    /* When one side is empty the distance is the length of the other. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* Refuse sizes whose byte count would overflow size_t. */
    if (rows > ((size_t) -1) / sizeof(size_t) / cols) {
        return -1;
    }

    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (i = 1; i < rows; i++) {
        for (j = 1; j < cols; j++) {
            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;

            /* deletion */
            best = dist[((i - 1) * cols) + j] + 1;

            /* insertion */
            candidate = dist[(i * cols) + (j - 1)] + 1;
            if (candidate < best) {
                best = candidate;
            }

            /* substitution (or match when cost is 0) */
            candidate = dist[((i - 1) * cols) + (j - 1)] + cost;
            if (candidate < best) {
                best = candidate;
            }

            /* Transposition of the two previous characters. Both indexes
               only need to be >= 2 so that a swap of the very first pair
               ("ab" vs "ba") is recognised as well. */
            if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] &&
                s1[i - 2] == s2[j - 1]) {
                candidate = dist[((i - 2) * cols) + (j - 2)] + cost;
                if (candidate < best) {
                    best = candidate;
                }
            }

            dist[(i * cols) + j] = best;
        }
    }

    best = dist[(cols * rows) - 1];
    free(dist);

    return (int) best;
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
/*
 * Public declarations shared by the hotwater C sources.
 *
 * The include guard avoids a leading underscore followed by an uppercase
 * letter, because such identifiers are reserved for the implementation.
 */
#ifndef HOTWATER_H
#define HOTWATER_H

#include <stdbool.h>
#include <stdlib.h>

/* Function-like macros: each argument may be evaluated twice, so do not
   pass expressions with side effects. */
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

/* Jaro / Jaro-Winkler similarity (jaro.c). */
double jaro_winkler_distance(const char *str1, const char *str2, bool long_tolerance);
double jaro_distance(const char *str1, const char *str2);

/* Edit distances (levenshtein.c, damerau_levenshtein.c);
   -1 signals an allocation failure. */
int levenshtein_distance(const char *str1, const char *str2);
int damerau_levenshtein_distance(const char *str1, const char *str2);

/* N-gram similarity (ngram.c); -1 signals an allocation failure. */
double ngram_distance(const char *source, const char *target, int n);

#endif /* HOTWATER_H */
|
data/ext/hotwater/jaro.c
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
/*
|
4
|
+
Colin Surprenant, Feb 2013
|
5
|
+
- modified error return to -1 and small cosmetic cleanups
|
6
|
+
*/
|
7
|
+
|
8
|
+
|
9
|
+
#include <ctype.h>
|
10
|
+
#include <string.h>
|
11
|
+
#include <stdio.h>
|
12
|
+
#include <stdlib.h>
|
13
|
+
#include "hotwater.h"
|
14
|
+
|
15
|
+
#define NOTNUM(c) ((c>57) || (c<48))
|
16
|
+
#define INRANGE(c) ((c>0) && (c<91))
|
17
|
+
|
18
|
+
/* borrowed heavily from strcmp95.c
|
19
|
+
* http://www.census.gov/geo/msb/stand/strcmp.c
|
20
|
+
*/
|
21
|
+
/*
 * Shared implementation of the Jaro and Jaro-Winkler similarity.
 *
 * Arguments:
 *   ying, yang      the two strings to compare (NULL is treated as empty)
 *   long_tolerance  when true, raise the score further if many characters
 *                   agree; not appropriate for fixed length fields such as
 *                   phone or social security numbers
 *   winklerize      when true, apply the Winkler common-prefix boost
 *
 * Returns a similarity where 1 means identical, 0 when either string is
 * empty or nothing matches, and -1.0 when memory cannot be allocated.
 */
double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bool winklerize)
{
    char *ying_flag, *yang_flag;

    double weight;

    long ying_length, yang_length, min_len;
    long search_range;
    long lowlim, hilim;
    long trans_count, common_chars;

    long i, j, k;

    if (!ying || !yang) return 0;

    // ensure that neither string is blank
    ying_length = (long) strlen(ying);
    yang_length = (long) strlen(yang);
    if (!ying_length || !yang_length) return 0;

    // the search window derives from the longer string, while the prefix
    // and long-string adjustments are bounded by the shorter one
    if (ying_length > yang_length) {
        search_range = ying_length;
        min_len = yang_length;
    } else {
        search_range = yang_length;
        min_len = ying_length;
    }

    search_range = (search_range / 2) - 1;
    if (search_range < 0) search_range = 0;

    // zero-filled match flags, on the heap so that long inputs cannot
    // exhaust the stack and an allocation failure can be reported
    ying_flag = calloc((size_t) ying_length + 1, sizeof(char));
    if (!ying_flag) return -1.0;

    yang_flag = calloc((size_t) yang_length + 1, sizeof(char));
    if (!yang_flag) {
        free(ying_flag);
        return -1.0;
    }

    // Looking only within the search range, count and flag the matched pairs.
    common_chars = 0;
    for (i = 0; i < ying_length; i++) {
        lowlim = (i >= search_range) ? i - search_range : 0;
        hilim = (i + search_range <= yang_length - 1) ? (i + search_range) : yang_length - 1;
        for (j = lowlim; j <= hilim; j++) {
            if (!yang_flag[j] && yang[j] == ying[i]) {
                yang_flag[j] = 1;
                ying_flag[i] = 1;
                common_chars++;
                break;
            }
        }
    }

    // If no characters in common - return
    if (!common_chars) {
        free(ying_flag);
        free(yang_flag);
        return 0;
    }

    // Count the number of transpositions: walk the matched characters of
    // both strings in order and count the positions where they differ
    k = trans_count = 0;
    for (i = 0; i < ying_length; i++) {
        if (!ying_flag[i]) continue;
        for (j = k; j < yang_length; j++) {
            if (yang_flag[j]) {
                k = j + 1;
                break;
            }
        }
        if (ying[i] != yang[j]) {
            trans_count++;
        }
    }
    trans_count /= 2;

    free(ying_flag);
    free(yang_flag);

    // Main weight computation.
    weight = common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
        + ((double) (common_chars - trans_count)) / ((double) common_chars);
    weight /= 3.0;

    // Continue to boost the weight if the strings are similar
    if (winklerize && weight > 0.7) {

        // Adjust for having up to the first 4 characters in common
        // (digits do not count towards the common prefix)
        j = (min_len >= 4) ? 4 : min_len;
        for (i = 0; (i < j) && (ying[i] == yang[i]) && !isdigit((unsigned char) ying[i]); i++);
        if (i) {
            weight += i * 0.1 * (1.0 - weight);
        }

        /* Optionally adjust for long strings. */
        /* After agreeing beginning chars, at least two more must agree and
           the agreeing characters must be > .5 of remaining characters.
        */
        if ((long_tolerance) && (min_len > 4) && (common_chars > i + 1) && (2 * common_chars >= min_len + i)) {
            if (!isdigit((unsigned char) ying[0])) {
                weight += (double) (1.0 - weight) *
                    ((double) (common_chars - i - 1) / ((double) (ying_length + yang_length - i * 2 + 2)));
            }
        }
    }

    return weight;
}


/* Jaro-Winkler similarity of ying and yang; see _jaro_winkler. */
double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
{
    return _jaro_winkler(ying, yang, long_tolerance, true);
}

/* Plain Jaro similarity of ying and yang; see _jaro_winkler. */
double jaro_distance(const char *ying, const char *yang)
{
    return _jaro_winkler(ying, yang, false, false);
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
#include "hotwater.h"
|
4
|
+
#include <string.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
|
8
|
+
/*
 * Levenshtein edit distance between s1 and s2 (insertions, deletions and
 * substitutions, each costing 1). Strings are compared byte by byte; a
 * NULL pointer is treated as "".
 *
 * Returns the distance, or -1 when the work matrix cannot be allocated.
 */
int levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;
    size_t i, j;

    size_t candidate, best;
    size_t *dist;

    /* When one side is empty the distance is the length of the other. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* Refuse sizes whose byte count would overflow size_t. */
    if (rows > ((size_t) -1) / sizeof(size_t) / cols) {
        return -1;
    }

    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (j = 1; j < cols; j++) {
        for (i = 1; i < rows; i++) {
            if (s1[i - 1] == s2[j - 1]) {
                /* matching characters cost nothing */
                dist[(i * cols) + j] = dist[((i - 1) * cols) + (j - 1)];
                continue;
            }

            /* deletion */
            best = dist[((i - 1) * cols) + j] + 1;

            /* insertion */
            candidate = dist[(i * cols) + (j - 1)] + 1;
            if (candidate < best) {
                best = candidate;
            }

            /* substitution */
            candidate = dist[((i - 1) * cols) + (j - 1)] + 1;
            if (candidate < best) {
                best = candidate;
            }

            dist[(i * cols) + j] = best;
        }
    }

    best = dist[(cols * rows) - 1];

    free(dist);

    return (int) best;
}
|
@@ -0,0 +1,173 @@
|
|
1
|
+
/*
|
2
|
+
Colin Surprenant, Feb 2013
|
3
|
+
- converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
|
4
|
+
- fixed segfault bug in substring n parameter, which did not surface in Java
|
5
|
+
*/
|
6
|
+
|
7
|
+
/* package org.apache.lucene.search.spell; */
|
8
|
+
|
9
|
+
/**
|
10
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
11
|
+
* contributor license agreements. See the NOTICE file distributed with
|
12
|
+
* this work for additional information regarding copyright ownership.
|
13
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
14
|
+
* (the "License"); you may not use this file except in compliance with
|
15
|
+
* the License. You may obtain a copy of the License at
|
16
|
+
*
|
17
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
18
|
+
*
|
19
|
+
* Unless required by applicable law or agreed to in writing, software
|
20
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
21
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
22
|
+
* See the License for the specific language governing permissions and
|
23
|
+
* limitations under the License.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/**
|
27
|
+
* N-Gram version of edit distance based on paper by Grzegorz Kondrak,
|
28
|
+
* "N-gram similarity and distance". Proceedings of the Twelfth International
|
29
|
+
* Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
|
30
|
+
* Buenos Aires, Argentina, November 2005.
|
31
|
+
* http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
|
32
|
+
*
|
33
|
+
* This implementation uses the position-based optimization to compute partial
|
34
|
+
* matches of n-gram sub-strings and adds a null-character prefix of size n-1
|
35
|
+
* so that the first character is contained in the same number of n-grams as
|
36
|
+
* a middle character. Null-character prefix matches are discounted so that
|
37
|
+
* strings with no matching characters will return a distance of 0.
|
38
|
+
*
|
39
|
+
*/
|
40
|
+
|
41
|
+
#include "hotwater.h"
|
42
|
+
#include <string.h>
|
43
|
+
#include <stdlib.h>
|
44
|
+
#include <stdio.h>
|
45
|
+
|
46
|
+
/*
 * Returns a newly allocated, NUL-terminated copy of the n bytes of s that
 * start at offset. The caller owns the result and must free() it.
 *
 * Returns NULL when s is NULL or empty, when offset or n is negative, when
 * the requested range reaches past the end of s, or when allocation fails.
 */
char* substring(const char* s, int offset, int n) {
    size_t len;
    char *copy;

    if (s == NULL || offset < 0 || n < 0) {
        return NULL;
    }

    len = strlen(s);
    if (len == 0 || (size_t) offset + (size_t) n > len) {
        return NULL;
    }

    copy = malloc((size_t) n + 1);
    if (copy == NULL) {
        return NULL;
    }

    memcpy(copy, s + offset, (size_t) n);
    copy[n] = '\0';
    return copy;
}
|
52
|
+
|
53
|
+
/*
 * N-gram similarity of source and target using n-grams of size n.
 *
 * Returns a value where 1 means identical strings, or -1 when memory
 * cannot be allocated. NULL strings are treated as empty and an n below 1
 * is treated as 1 (unigrams), which avoids a division by zero and a
 * negative allocation size.
 */
double ngram_distance (const char *source, const char *target, int n) {
    int sl = source == NULL ? 0 : (int) strlen(source);
    int tl = target == NULL ? 0 : (int) strlen(target);
    int longest = sl > tl ? sl : tl;

    int i;    // iterates through source
    int j;    // iterates through target
    int ni;   // iterates through the characters of an n-gram
    int cost;
    int tn;

    char *sa;    // source with a prefix of n - 1 null characters
    char *t_j;   // jth n-gram of target
    double *p;   // 'previous' cost array, horizontally
    double *d;   // cost array, horizontally
    double *_d;  // placeholder to assist in swapping p and d
    double ec, best, p_sl;

    if (n < 1) {
        n = 1;
    }

    if (sl == 0 || tl == 0) {
        return (sl == tl) ? 1 : 0;
    }

    // a string shorter than n: score the characters that agree position
    // by position against the longer length
    if (sl < n || tl < n) {
        int shortest = sl < tl ? sl : tl;
        cost = 0;
        for (i = 0; i < shortest; i++) {
            if (source[i] == target[i]) {
                cost++;
            }
        }
        return (double) cost / (double) longest;
    }

    // all buffers are zero-filled; free(NULL) is a no-op, so one combined
    // failure path is enough
    sa = calloc((size_t) sl + (size_t) n, sizeof(char));
    t_j = calloc((size_t) n + 1, sizeof(char));
    p = calloc((size_t) sl + 1, sizeof(double));
    d = calloc((size_t) sl + 1, sizeof(double));
    if (!sa || !t_j || !p || !d) {
        free(sa);
        free(t_j);
        free(p);
        free(d);
        return -1;
    }

    // construct sa: the first n - 1 bytes stay 0 (prefix), then source
    memcpy(sa + (n - 1), source, (size_t) sl);

    for (i = 0; i <= sl; i++) {
        p[i] = i;
    }

    for (j = 1; j <= tl; j++) {
        // construct the t_j n-gram in place; the buffer is reused on every
        // iteration instead of being freed and allocated again
        if (j < n) {
            memset(t_j, 0, (size_t) (n - j));              // prefix
            memcpy(t_j + (n - j), target, (size_t) j);
        }
        else {
            memcpy(t_j, target + (j - n), (size_t) n);
        }

        d[0] = j;
        for (i = 1; i <= sl; i++) {
            cost = 0;
            tn = n;

            // compare the ith n-gram of sa to t_j
            for (ni = 0; ni < n; ni++) {
                if (sa[i - 1 + ni] != t_j[ni]) {
                    cost++;
                }
                else if (sa[i - 1 + ni] == 0) { // discount matches on prefix
                    tn--;
                }
            }
            ec = (double) cost / (double) tn;

            // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
            best = d[i - 1] + 1;
            if (p[i] + 1 < best) {
                best = p[i] + 1;
            }
            if (p[i - 1] + ec < best) {
                best = p[i - 1] + ec;
            }
            d[i] = best;
        }

        // the current row becomes the 'previous' row of the next iteration
        _d = p;
        p = d;
        d = _d;
    }

    // our last action in the above loop was to switch d and p, so p now
    // actually has the most recent cost counts
    p_sl = p[sl];

    free(p);
    free(d);
    free(t_j);
    free(sa);

    return 1.0 - (p_sl / (double) longest);
}
|
data/hotwater.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Make lib/ loadable so the version constant can be read at build time.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'hotwater/version'

Gem::Specification.new do |gem|
  gem.name          = "hotwater"
  gem.version       = Hotwater::VERSION
  gem.authors       = ["Colin Surprenant"]
  gem.email         = ["colin.surprenant@gmail.com"]
  gem.description   = "Ruby & JRuby gem with fast string edit distance C implementation using FFI bindings"
  gem.summary       = "Fast string edit distance"
  gem.homepage      = "http://github.com/colinsurprenant/hotwater"
  # License as stated in README.md and LICENSE.txt.
  gem.license       = "Apache-2.0"

  # Package every file tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  # The native library is compiled at install time through this Rakefile.
  gem.extensions    = ["ext/hotwater/Rakefile"]

  gem.add_dependency 'rake'
  gem.add_dependency 'ffi'
  gem.add_dependency 'ffi-compiler'

  gem.add_development_dependency 'rspec'
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # int damerau_levenshtein_distance(const char *str1, const char *str2)
    attach_function :damerau_levenshtein_distance, [:string, :string], :int
  end

  # Damerau-Levenshtein edit distance between s1 and s2, as computed by the
  # C library.
  #
  # Raises RuntimeError when the C function reports an allocation failure (-1).
  def damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Distance normalized by the size of the longer string: 1.0 for identical
  # strings, lower values for less similar strings, and 0.0 when both strings
  # are empty.
  #
  # Sizes are measured in bytes because the C function compares bytes; using
  # character counts could produce negative scores for multibyte strings.
  def normalized_damerau_levenshtein_distance(s1, s2)
    distance = damerau_levenshtein_distance(s1, s2)
    longest = [s1.bytesize, s2.bytesize].max
    # only two empty strings have nothing to normalize by
    return 0.0 if longest.zero?
    (longest - distance.to_f) / longest
  end

  module_function :damerau_levenshtein_distance, :normalized_damerau_levenshtein_distance

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # double jaro_distance(const char *ying, const char *yang)
    attach_function :jaro_distance, [:string, :string], :double

    # double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
    attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
  end

  # Jaro score of s1 and s2 as computed by the C library.
  # Raises RuntimeError when the C function returns a negative value.
  def jaro_distance(s1, s2)
    C::jaro_distance(s1, s2).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  # Jaro-Winkler score of s1 and s2; long_tolerance is passed through to the
  # C function. Raises RuntimeError when the C function returns a negative
  # value.
  def jaro_winkler_distance(s1, s2, long_tolerance = false)
    C::jaro_winkler_distance(s1, s2, long_tolerance).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  module_function :jaro_distance, :jaro_winkler_distance
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # int levenshtein_distance(const char *str1, const char *str2);
    attach_function :levenshtein_distance, [:string, :string], :int
  end

  # Levenshtein edit distance between s1 and s2, as computed by the C library.
  #
  # Raises RuntimeError when the C function reports an allocation failure (-1).
  def levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Distance normalized by the size of the longer string: 1.0 for identical
  # strings, lower values for less similar strings, and 0.0 when both strings
  # are empty.
  #
  # Sizes are measured in bytes because the C function compares bytes; using
  # character counts could produce negative scores for multibyte strings.
  def normalized_levenshtein_distance(s1, s2)
    distance = levenshtein_distance(s1, s2)
    longest = [s1.bytesize, s2.bytesize].max
    # only two empty strings have nothing to normalize by
    return 0.0 if longest.zero?
    (longest - distance.to_f) / longest
  end

  module_function :levenshtein_distance, :normalized_levenshtein_distance
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # double ngram_distance(const char *source, const char *target, int n);
    attach_function :ngram_distance, [:string, :string, :int], :double
  end

  # N-gram score of s1 and s2 using n-grams of size n (bigrams by default),
  # as computed by the C library.
  # Raises RuntimeError when the C function returns -1.
  def ngram_distance(s1, s2, n = 2)
    C::ngram_distance(s1, s2, n).tap do |score|
      raise("memory allocation error") if score == -1
    end
  end

  module_function :ngram_distance

end
|
data/lib/hotwater.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'hotwater/version'
|
2
|
+
require 'ffi'
|
3
|
+
require 'ffi-compiler/loader'
|
4
|
+
|
5
|
+
module Hotwater
  # Holder for the raw FFI bindings. The hotwater/*_ffi files required by
  # this file attach their C functions to this module.
  module C
    extend FFI::Library

    # Locate the compiled 'hotwater' library and bind this module to it.
    ffi_lib(FFI::Compiler::Loader.find('hotwater'))
  end
end
|
11
|
+
|
12
|
+
require 'hotwater/levenshtein_ffi'
|
13
|
+
require 'hotwater/damerau_levenshtein_ffi'
|
14
|
+
require 'hotwater/jaro_ffi'
|
15
|
+
require 'hotwater/ngram_ffi'
|
16
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute damerau_levenshtein_distance" do
    # [s1, s2, expected distance]
    [
      ["", "", 0],
      ["abc", "", 3],
      ["bc", "abc", 1],
      ["ca", "abc", 3],
      ["abc", "acb", 1],
      ["kitten", "sitting", 3],
      ["Saturday", "Sunday", 3],
      ["teusday", "tuesday", 1],
      ["teusday", "thursday", 2],
    ].each do |s1, s2, expected|
      Hotwater.damerau_levenshtein_distance(s1, s2).should == expected
    end
  end

  it "should compute normalized_damerau_levenshtein_distance" do
    # pairs compared against the exact value
    [
      ["", ""],
      ["abc", ""],
      ["ca", "abc"],
    ].each do |s1, s2|
      Hotwater.normalized_damerau_levenshtein_distance(s1, s2).should == 0.0
    end

    # pairs compared after rounding to 4 decimals
    [
      ["bc", "abc", 0.6667],
      ["abc", "acb", 0.6667],
      ["kitten", "sitting", 0.5714],
      ["Saturday", "Sunday", 0.625],
      ["teusday", "tuesday", 0.8571],
      ["teusday", "thursday", 0.75],
    ].each do |s1, s2, expected|
      Hotwater.normalized_damerau_levenshtein_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute jaro_distance" do
    Hotwater.jaro_distance("", "").should == 0.0

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["dixon", "dicksonx", 0.7667],
      ["martha", "marhta", 0.9444],
      ["dwayne", "duane", 0.8222],
    ].each do |s1, s2, expected|
      Hotwater.jaro_distance(s1, s2).round(4).should == expected
    end
  end

  it "should compute jaro_winkler_distance" do
    Hotwater.jaro_winkler_distance("", "").should == 0.0

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["dixon", "dicksonx", 0.8133],
      ["martha", "marhta", 0.9611],
      ["dwayne", "duane", 0.84],
    ].each do |s1, s2, expected|
      Hotwater.jaro_winkler_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute levenshtein_distance" do
    # [s1, s2, expected distance]
    [
      ["", "", 0],
      ["abc", "", 3],
      ["bc", "abc", 1],
      ["ca", "abc", 3],
      ["abc", "acb", 2],
      ["kitten", "sitting", 3],
      ["Saturday", "Sunday", 3],
      ["teusday", "tuesday", 2],
      ["teusday", "thursday", 2],
    ].each do |s1, s2, expected|
      Hotwater.levenshtein_distance(s1, s2).should == expected
    end
  end

  it "should compute normalized_levenshtein_distance" do
    # pairs compared against the exact value
    [
      ["", ""],
      ["abc", ""],
      ["ca", "abc"],
    ].each do |s1, s2|
      Hotwater.normalized_levenshtein_distance(s1, s2).should == 0.0
    end

    # pairs compared after rounding to 4 decimals
    [
      ["bc", "abc", 0.6667],
      ["abc", "acb", 0.3333],
      ["kitten", "sitting", 0.5714],
      ["Saturday", "Sunday", 0.625],
      ["teusday", "tuesday", 0.7143],
      ["teusday", "thursday", 0.75],
    ].each do |s1, s2, expected|
      Hotwater.normalized_levenshtein_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do
|
5
|
+
|
6
|
+
it "should compute unigram distance" do
|
7
|
+
Hotwater.ngram_distance("", "al", 1).round(4).should == 0.0
|
8
|
+
Hotwater.ngram_distance("al", "al", 1).round(4).should == 1.0
|
9
|
+
Hotwater.ngram_distance("a", "a", 1).round(4).should == 1.0
|
10
|
+
Hotwater.ngram_distance("b", "a", 1).round(4).should == 0.0
|
11
|
+
Hotwater.ngram_distance("martha", "marhta", 1).round(4).should == 0.6667
|
12
|
+
Hotwater.ngram_distance("jones", "johnson", 1).round(4).should == 0.4286
|
13
|
+
Hotwater.ngram_distance("natural", "contrary", 1).round(4).should == 0.25
|
14
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 1).round(4).should == 0.75
|
15
|
+
Hotwater.ngram_distance("dwayne", "duane", 1).round(4).should == 0.6667
|
16
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 1).round(4).should == 0.5
|
17
|
+
Hotwater.ngram_distance("six", "ten", 1).round(4).should == 0.0
|
18
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 1).round(4).should == Hotwater.ngram_distance("zac ephron", "kai ephron", 1).round(4)
|
19
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 1).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 1)
|
20
|
+
Hotwater.ngram_distance("12345678", "12890678", 1).round(4).should == Hotwater.ngram_distance("12345678", "72385698", 1)
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should compute bigram distance" do
|
24
|
+
Hotwater.ngram_distance("", "al", 2).round(4).should == 0.0
|
25
|
+
Hotwater.ngram_distance("al", "al", 2).round(4).should == 1.0
|
26
|
+
Hotwater.ngram_distance("a", "a", 2).round(4).should == 1.0
|
27
|
+
Hotwater.ngram_distance("b", "a", 2).round(4).should == 0.0
|
28
|
+
Hotwater.ngram_distance("a", "aa", 2).round(4).should == 0.5
|
29
|
+
Hotwater.ngram_distance("martha", "marhta", 2).round(4).should == 0.6667
|
30
|
+
Hotwater.ngram_distance("jones", "johnson", 2).round(4).should == 0.4286
|
31
|
+
Hotwater.ngram_distance("natural", "contrary", 2).round(4).should == 0.25
|
32
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 2).round(4).should == 0.625
|
33
|
+
Hotwater.ngram_distance("dwayne", "duane", 2).round(4).should == 0.5833
|
34
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 2).round(4).should == 0.5
|
35
|
+
Hotwater.ngram_distance("six", "ten", 2).round(4).should == 0.0
|
36
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 2).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 2).round(4)
|
37
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 2).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 2)
|
38
|
+
Hotwater.ngram_distance("0012345678", "0012890678", 2).round(4).should == Hotwater.ngram_distance("0012345678", "0072385698", 2)
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should compute bigram distance by default" do
|
42
|
+
Hotwater.ngram_distance("", "al").round(4).should == 0.0
|
43
|
+
Hotwater.ngram_distance("al", "al").round(4).should == 1.0
|
44
|
+
Hotwater.ngram_distance("a", "a").round(4).should == 1.0
|
45
|
+
Hotwater.ngram_distance("b", "a").round(4).should == 0.0
|
46
|
+
Hotwater.ngram_distance("a", "aa").round(4).should == 0.5
|
47
|
+
Hotwater.ngram_distance("martha", "marhta").round(4).should == 0.6667
|
48
|
+
Hotwater.ngram_distance("jones", "johnson").round(4).should == 0.4286
|
49
|
+
Hotwater.ngram_distance("natural", "contrary").round(4).should == 0.25
|
50
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz").round(4).should == 0.625
|
51
|
+
Hotwater.ngram_distance("dwayne", "duane").round(4).should == 0.5833
|
52
|
+
Hotwater.ngram_distance("dixon", "dicksonx").round(4).should == 0.5
|
53
|
+
Hotwater.ngram_distance("six", "ten").round(4).should == 0.0
|
54
|
+
Hotwater.ngram_distance("zac ephron", "zac efron").round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron").round(4)
|
55
|
+
Hotwater.ngram_distance("brittney spears", "britney spears").round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman")
|
56
|
+
Hotwater.ngram_distance("0012345678", "0012890678").round(4).should == Hotwater.ngram_distance("0012345678", "0072385698")
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should compute trigram distance" do
|
60
|
+
Hotwater.ngram_distance("", "al", 3).round(4).should == 0.0
|
61
|
+
Hotwater.ngram_distance("al", "al", 3).round(4).should == 1.0
|
62
|
+
Hotwater.ngram_distance("a", "a", 3).round(4).should == 1.0
|
63
|
+
Hotwater.ngram_distance("b", "a", 3).round(4).should == 0.0
|
64
|
+
Hotwater.ngram_distance("martha", "marhta", 3).round(4).should == 0.7222
|
65
|
+
Hotwater.ngram_distance("jones", "johnson", 3).round(4).should == 0.4762
|
66
|
+
Hotwater.ngram_distance("natural", "contrary", 3).round(4).should == 0.2083
|
67
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 3).round(4).should == 0.5625
|
68
|
+
Hotwater.ngram_distance("dwayne", "duane", 3).round(4).should == 0.5278
|
69
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 3).round(4).should == 0.4583
|
70
|
+
Hotwater.ngram_distance("six", "ten", 3).round(4).should == 0.0
|
71
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 3).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 3).round(4)
|
72
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 3).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 3)
|
73
|
+
Hotwater.ngram_distance("0012345678", "0012890678", 3).round(4).should < Hotwater.ngram_distance("0012345678", "0072385698", 3)
|
74
|
+
end
|
75
|
+
end
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hotwater
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Colin Surprenant
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: ffi
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: ffi-compiler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rspec
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Ruby & JRuby gem with fast string edit distance C implementation using
|
79
|
+
FFI bindings
|
80
|
+
email:
|
81
|
+
- colin.surprenant@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions:
|
84
|
+
- ext/hotwater/Rakefile
|
85
|
+
extra_rdoc_files: []
|
86
|
+
files:
|
87
|
+
- .gitignore
|
88
|
+
- Gemfile
|
89
|
+
- LICENSE.txt
|
90
|
+
- README.md
|
91
|
+
- Rakefile
|
92
|
+
- ext/hotwater/Rakefile
|
93
|
+
- ext/hotwater/damerau_levenshtein.c
|
94
|
+
- ext/hotwater/hotwater.h
|
95
|
+
- ext/hotwater/jaro.c
|
96
|
+
- ext/hotwater/levenshtein.c
|
97
|
+
- ext/hotwater/ngram.c
|
98
|
+
- hotwater.gemspec
|
99
|
+
- lib/hotwater.rb
|
100
|
+
- lib/hotwater/damerau_levenshtein_ffi.rb
|
101
|
+
- lib/hotwater/jaro_ffi.rb
|
102
|
+
- lib/hotwater/levenshtein_ffi.rb
|
103
|
+
- lib/hotwater/ngram_ffi.rb
|
104
|
+
- lib/hotwater/version.rb
|
105
|
+
- spec/hotwater/damerau_levenshtein_ffi_spec.rb
|
106
|
+
- spec/hotwater/jaro_ffi_spec.rb
|
107
|
+
- spec/hotwater/levenshtein_ffi_spec.rb
|
108
|
+
- spec/hotwater/ngram_ffi_spec.rb
|
109
|
+
homepage: http://github.com/colinsurprenant/hotwater
|
110
|
+
licenses: []
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
segments:
|
122
|
+
- 0
|
123
|
+
hash: -289401610859280349
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
+
none: false
|
126
|
+
requirements:
|
127
|
+
- - ! '>='
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
segments:
|
131
|
+
- 0
|
132
|
+
hash: -289401610859280349
|
133
|
+
requirements: []
|
134
|
+
rubyforge_project:
|
135
|
+
rubygems_version: 1.8.23
|
136
|
+
signing_key:
|
137
|
+
specification_version: 3
|
138
|
+
summary: Fast string edit distance
|
139
|
+
test_files:
|
140
|
+
- spec/hotwater/damerau_levenshtein_ffi_spec.rb
|
141
|
+
- spec/hotwater/jaro_ffi_spec.rb
|
142
|
+
- spec/hotwater/levenshtein_ffi_spec.rb
|
143
|
+
- spec/hotwater/ngram_ffi_spec.rb
|