hotwater 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +45 -0
- data/README.md +70 -0
- data/Rakefile +25 -0
- data/ext/hotwater/Rakefile +5 -0
- data/ext/hotwater/damerau_levenshtein.c +68 -0
- data/ext/hotwater/hotwater.h +23 -0
- data/ext/hotwater/jaro.c +144 -0
- data/ext/hotwater/levenshtein.c +51 -0
- data/ext/hotwater/ngram.c +173 -0
- data/hotwater.gemspec +26 -0
- data/lib/hotwater/damerau_levenshtein_ffi.rb +26 -0
- data/lib/hotwater/jaro_ffi.rb +26 -0
- data/lib/hotwater/levenshtein_ffi.rb +25 -0
- data/lib/hotwater/ngram_ffi.rb +18 -0
- data/lib/hotwater/version.rb +3 -0
- data/lib/hotwater.rb +16 -0
- data/spec/hotwater/damerau_levenshtein_ffi_spec.rb +29 -0
- data/spec/hotwater/jaro_ffi_spec.rb +19 -0
- data/spec/hotwater/levenshtein_ffi_spec.rb +29 -0
- data/spec/hotwater/ngram_ffi_spec.rb +75 -0
- metadata +143 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
Copyright (c) 2013 Colin Surprenant <colin.surprenant@gmail.com>
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
|
15
|
+
--------
|
16
|
+
|
17
|
+
C code from the https://github.com/sunlightlabs/jellyfish project
|
18
|
+
|
19
|
+
Copyright (c) 2010, Sunlight Labs
|
20
|
+
|
21
|
+
All rights reserved.
|
22
|
+
|
23
|
+
Redistribution and use in source and binary forms, with or without modification,
|
24
|
+
are permitted provided that the following conditions are met:
|
25
|
+
|
26
|
+
* Redistributions of source code must retain the above copyright notice,
|
27
|
+
this list of conditions and the following disclaimer.
|
28
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
29
|
+
this list of conditions and the following disclaimer in the documentation
|
30
|
+
and/or other materials provided with the distribution.
|
31
|
+
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
32
|
+
used to endorse or promote products derived from this software without
|
33
|
+
specific prior written permission.
|
34
|
+
|
35
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
36
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
37
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
38
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
39
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
40
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
41
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
42
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
43
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
44
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
45
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# Hotwater v0.1.0
|
2
|
+
|
3
|
+
Ruby & JRuby gem with fast **string edit distance** C implementations using FFI bindings.
|
4
|
+
|
5
|
+
### Algorithms
|
6
|
+
|
7
|
+
- Levenshtein & Damerau Levenshtein
|
8
|
+
- Jaro & Jaro Winkler
|
9
|
+
- N-Gram
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'hotwater'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install hotwater
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
Hotwater.levenshtein_distance("abc", "acb") # => 2
|
29
|
+
Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
|
30
|
+
|
31
|
+
# do normalization based on the string sizes
|
32
|
+
Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
|
33
|
+
Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
|
34
|
+
|
35
|
+
Hotwater.jaro_distance("martha", "marhta").round(4) # => 0.9444
|
36
|
+
Hotwater.jaro_winkler_distance("martha", "marhta").round(4) # => 0.9611
|
37
|
+
|
38
|
+
# default is bigram
|
39
|
+
Hotwater.ngram_distance("natural", "contrary").round(4) # => 0.25
|
40
|
+
|
41
|
+
# specify trigram
|
42
|
+
Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
|
43
|
+
```
|
44
|
+
|
45
|
+
## Development
|
46
|
+
|
47
|
+
1. Fork it
|
48
|
+
2. Install gems `$ bundle install`
|
49
|
+
3. Compile lib `$ rake compile`
|
50
|
+
4. Run specs `$ rake spec`
|
51
|
+
5. Clean compiler generated files `$ rake clean`
|
52
|
+
|
53
|
+
## Contributing
|
54
|
+
|
55
|
+
1. Fork it
|
56
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
57
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
58
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
59
|
+
5. Create new Pull Request
|
60
|
+
|
61
|
+
## Credits
|
62
|
+
- Some C code from the https://github.com/sunlightlabs/jellyfish project
|
63
|
+
- N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
|
64
|
+
|
65
|
+
## Author
|
66
|
+
Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
|
67
|
+
|
68
|
+
## License
|
69
|
+
Hotwater is distributed under the Apache License, Version 2.0.
|
70
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'bundler/gem_tasks'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
require 'ffi-compiler/compile_task'
|
7
|
+
|
8
|
+
task :default => :spec

# Define the :spec task directly. Creating the RSpec rake task from inside
# another task's action only works because rake happens to pick up actions
# appended while the task is already running.
desc "run specs"
RSpec::Core::RakeTask.new(:spec)

desc "compiler tasks"
namespace "ffi-compiler" do
  # Builds the native library from the C sources under ext/hotwater.
  FFI::Compiler::CompileTask.new('ext/hotwater/hotwater') do |c|
  end
end
task :compile => ["ffi-compiler:default"]

# Files removed by `rake clean`: compiler output and generated Makefiles.
CLEAN.include('ext/**/*{.o,.log,.so,.bundle}')
CLEAN.include('lib/**/*{.o,.log,.so,.bundle}')
CLEAN.include('ext/**/Makefile')
|
25
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
#include "hotwater.h"
|
4
|
+
#include <string.h>
|
5
|
+
|
6
|
+
/*
 * Restricted Damerau-Levenshtein (optimal string alignment) distance between
 * two NUL-terminated byte strings: the minimum number of insertions,
 * deletions, substitutions and adjacent transpositions turning s1 into s2.
 *
 * A NULL argument is treated as an empty string.
 * Returns the distance, or -1 if the work matrix cannot be allocated.
 */
int damerau_levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = (s1 == NULL) ? 0 : strlen(s1);
    size_t s2_len = (s2 == NULL) ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;
    size_t i, j;
    size_t cost, best, candidate;
    size_t *dist;

    /* With one side empty the distance is the length of the other side. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* Refuse inputs whose matrix size would overflow size_t. */
    if (rows > ((size_t) -1) / sizeof(size_t) / cols) {
        return -1;
    }

    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from or to the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (i = 1; i < rows; i++) {
        for (j = 1; j < cols; j++) {
            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;

            best = dist[((i - 1) * cols) + j] + 1;              /* deletion */
            candidate = dist[(i * cols) + (j - 1)] + 1;         /* insertion */
            if (candidate < best) {
                best = candidate;
            }
            candidate = dist[((i - 1) * cols) + (j - 1)] + cost; /* substitution */
            if (candidate < best) {
                best = candidate;
            }

            /* Adjacent transposition. Two characters are available on each
               side as soon as i > 1 and j > 1; the previous "> 2" guard
               missed a swap of the first two characters ("ba" vs "ab"). */
            if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] &&
                s1[i - 2] == s2[j - 1]) {
                candidate = dist[((i - 2) * cols) + (j - 2)] + cost;
                if (candidate < best) {
                    best = candidate;
                }
            }

            dist[(i * cols) + j] = best;
        }
    }

    best = dist[(cols * rows) - 1];
    free(dist);

    return (int) best;
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#ifndef _HOTWATER_H_
|
2
|
+
#define _HOTWATER_H_
|
3
|
+
|
4
|
+
#include <stdbool.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
|
7
|
+
#ifndef MIN
|
8
|
+
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#ifndef MAX
|
12
|
+
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
13
|
+
#endif
|
14
|
+
|
15
|
+
double jaro_winkler_distance(const char *str1, const char *str2, bool long_tolerance);
|
16
|
+
double jaro_distance(const char *str1, const char *str2);
|
17
|
+
|
18
|
+
int levenshtein_distance(const char *str1, const char *str2);
|
19
|
+
int damerau_levenshtein_distance(const char *str1, const char *str2);
|
20
|
+
|
21
|
+
double ngram_distance(const char *source, const char *target, int n);
|
22
|
+
|
23
|
+
#endif /* _HOTWATER_H_ */
|
data/ext/hotwater/jaro.c
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
/*
|
4
|
+
Colin Surprenant, Feb 2013
|
5
|
+
- modified error return to -1 and small cosmetic cleanups
|
6
|
+
*/
|
7
|
+
|
8
|
+
|
9
|
+
#include <ctype.h>
|
10
|
+
#include <string.h>
|
11
|
+
#include <stdio.h>
|
12
|
+
#include <stdlib.h>
|
13
|
+
#include "hotwater.h"
|
14
|
+
|
15
|
+
#define NOTNUM(c) ((c>57) || (c<48))
|
16
|
+
#define INRANGE(c) ((c>0) && (c<91))
|
17
|
+
|
18
|
+
/* borrowed heavily from strcmp95.c
|
19
|
+
* http://www.census.gov/geo/msb/stand/strcmp.c
|
20
|
+
*/
|
21
|
+
/*
 * Shared implementation of the Jaro and Jaro-Winkler similarity.
 *
 * Arguments:
 *   ying, yang      the two NUL-terminated strings to compare; a NULL pointer
 *                   is treated like an empty string.
 *   long_tolerance  increase the score when the number of matched characters
 *                   is large. Not appropriate for fixed-length fields such as
 *                   phone or social security numbers.
 *   winklerize      apply the Winkler common-prefix boost (and, when
 *                   requested, the long-string adjustment).
 *
 * Returns the similarity weight, 0 when either string is empty or nothing
 * matches, and -1.0 when the scratch buffer cannot be allocated.
 */
double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bool winklerize)
{
    char *flags, *ying_flag, *yang_flag;
    double weight;
    long ying_length, yang_length, min_len, max_len;
    long search_range;
    long lowlim, hilim;
    long trans_count, common_chars;
    long i, j, k;

    /* ensure that neither string is missing or blank */
    if (!ying || !yang) return 0;
    ying_length = (long) strlen(ying);
    yang_length = (long) strlen(yang);
    if (!ying_length || !yang_length) return 0;

    /* The match window derives from the longer string, the Winkler
       adjustments from the shorter one (as in strcmp95). */
    if (ying_length > yang_length) {
        max_len = ying_length;
        min_len = yang_length;
    } else {
        max_len = yang_length;
        min_len = ying_length;
    }

    /* One zeroed heap buffer holds both flag arrays. Heap allocation keeps
       stack use independent of input size and can actually report failure. */
    flags = calloc((size_t) (ying_length + yang_length + 2), sizeof(char));
    if (!flags) return -1.0;
    ying_flag = flags;
    yang_flag = flags + ying_length + 1;

    search_range = (max_len / 2) - 1;
    if (search_range < 0) search_range = 0;

    /* Looking only within the search range, count and flag the matched pairs. */
    common_chars = 0;
    for (i = 0; i < ying_length; i++) {
        lowlim = (i >= search_range) ? i - search_range : 0;
        hilim = (i + search_range <= yang_length - 1) ? (i + search_range) : yang_length - 1;
        for (j = lowlim; j <= hilim; j++) {
            if (!yang_flag[j] && yang[j] == ying[i]) {
                yang_flag[j] = 1;
                ying_flag[i] = 1;
                common_chars++;
                break;
            }
        }
    }

    /* If no characters in common - return */
    if (!common_chars) {
        free(flags);
        return 0;
    }

    /* Count the number of transpositions: walk the flagged characters of
       both strings in order and count the positions where they differ. */
    k = trans_count = 0;
    for (i = 0; i < ying_length; i++) {
        if (ying_flag[i]) {
            for (j = k; j < yang_length; j++) {
                if (yang_flag[j]) {
                    k = j + 1;
                    break;
                }
            }
            if (ying[i] != yang[j]) {
                trans_count++;
            }
        }
    }
    trans_count /= 2;

    free(flags);

    /* Main weight computation. */
    weight = common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
           + ((double) (common_chars - trans_count)) / ((double) common_chars);
    weight /= 3.0;

    /* Continue to boost the weight if the strings are similar */
    if (winklerize && weight > 0.7) {

        /* Adjust for having up to the first 4 characters in common;
           a digit ('0'..'9') ends the counted prefix. */
        j = (min_len >= 4) ? 4 : min_len;
        for (i = 0; (i < j) && (ying[i] == yang[i]) && (ying[i] > '9' || ying[i] < '0'); i++);
        if (i) {
            weight += i * 0.1 * (1.0 - weight);
        }

        /* Optionally adjust for long strings.
           After agreeing beginning chars, at least two more must agree and
           the agreeing characters must be > .5 of remaining characters. */
        if ((long_tolerance) && (min_len > 4) && (common_chars > i + 1) && (2 * common_chars >= min_len + i)) {
            if (ying[0] > '9' || ying[0] < '0') {
                weight += (double) (1.0 - weight) *
                    ((double) (common_chars - i - 1) / ((double) (ying_length + yang_length - i * 2 + 2)));
            }
        }
    }

    return weight;
}
|
134
|
+
|
135
|
+
|
136
|
+
/*
 * Jaro-Winkler similarity of two strings: delegates to _jaro_winkler with the
 * Winkler adjustment switched on and the caller's long_tolerance setting.
 */
double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
{
    const bool winklerize = true;
    double weight = _jaro_winkler(ying, yang, long_tolerance, winklerize);

    return weight;
}
|
140
|
+
|
141
|
+
/*
 * Plain Jaro similarity of two strings: delegates to _jaro_winkler with both
 * the long-string tolerance and the Winkler adjustment switched off.
 */
double jaro_distance(const char *ying, const char *yang)
{
    const bool long_tolerance = false;
    const bool winklerize = false;
    double weight = _jaro_winkler(ying, yang, long_tolerance, winklerize);

    return weight;
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/* from the https://github.com/sunlightlabs/jellyfish project */
|
2
|
+
|
3
|
+
#include "hotwater.h"
|
4
|
+
#include <string.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
|
8
|
+
/*
 * Levenshtein edit distance between two NUL-terminated byte strings: the
 * minimum number of single-byte insertions, deletions and substitutions
 * turning s1 into s2.
 *
 * A NULL argument is treated as an empty string, consistent with
 * damerau_levenshtein_distance.
 * Returns the distance, or -1 if the work matrix cannot be allocated.
 */
int levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = (s1 == NULL) ? 0 : strlen(s1);
    size_t s2_len = (s2 == NULL) ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;
    size_t i, j;
    size_t best, candidate;
    size_t *dist;

    /* With one side empty the distance is the length of the other side. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* Refuse inputs whose matrix size would overflow size_t. */
    if (rows > ((size_t) -1) / sizeof(size_t) / cols) {
        return -1;
    }

    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from or to the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (j = 1; j < cols; j++) {
        for (i = 1; i < rows; i++) {
            if (s1[i - 1] == s2[j - 1]) {
                /* Matching characters cost nothing: carry the diagonal over. */
                dist[(i * cols) + j] = dist[((i - 1) * cols) + (j - 1)];
            } else {
                best = dist[((i - 1) * cols) + j] + 1;              /* deletion */
                candidate = dist[(i * cols) + (j - 1)] + 1;         /* insertion */
                if (candidate < best) {
                    best = candidate;
                }
                candidate = dist[((i - 1) * cols) + (j - 1)] + 1;   /* substitution */
                if (candidate < best) {
                    best = candidate;
                }
                dist[(i * cols) + j] = best;
            }
        }
    }

    best = dist[(cols * rows) - 1];
    free(dist);

    return (int) best;
}
|
@@ -0,0 +1,173 @@
|
|
1
|
+
/*
|
2
|
+
Colin Surprenant, Feb 2013
|
3
|
+
- converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
|
4
|
+
- fixed segfault bug in substring n parameter, which did not surface in Java
|
5
|
+
*/
|
6
|
+
|
7
|
+
/* package org.apache.lucene.search.spell; */
|
8
|
+
|
9
|
+
/**
|
10
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
11
|
+
* contributor license agreements. See the NOTICE file distributed with
|
12
|
+
* this work for additional information regarding copyright ownership.
|
13
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
14
|
+
* (the "License"); you may not use this file except in compliance with
|
15
|
+
* the License. You may obtain a copy of the License at
|
16
|
+
*
|
17
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
18
|
+
*
|
19
|
+
* Unless required by applicable law or agreed to in writing, software
|
20
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
21
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
22
|
+
* See the License for the specific language governing permissions and
|
23
|
+
* limitations under the License.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/**
|
27
|
+
* N-Gram version of edit distance based on paper by Grzegorz Kondrak,
|
28
|
+
* "N-gram similarity and distance". Proceedings of the Twelfth International
|
29
|
+
* Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
|
30
|
+
* Buenos Aires, Argentina, November 2005.
|
31
|
+
* http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
|
32
|
+
*
|
33
|
+
* This implementation uses the position-based optimization to compute partial
|
34
|
+
* matches of n-gram sub-strings and adds a null-character prefix of size n-1
|
35
|
+
* so that the first character is contained in the same number of n-grams as
|
36
|
+
* a middle character. Null-character prefix matches are discounted so that
|
37
|
+
* strings with no matching characters will return a distance of 0.
|
38
|
+
*
|
39
|
+
*/
|
40
|
+
|
41
|
+
#include "hotwater.h"
|
42
|
+
#include <string.h>
|
43
|
+
#include <stdlib.h>
|
44
|
+
#include <stdio.h>
|
45
|
+
|
46
|
+
/*
 * Returns a newly allocated, NUL-terminated copy of the n bytes of s that
 * start at offset. The caller owns the result and must free() it.
 *
 * Returns 0 (NULL) when s is NULL or empty, when offset or n is negative,
 * when offset + n reaches past the end of s, or when allocation fails.
 */
char* substring(const char* s, int offset, int n) {
    size_t len;
    char *copy;

    /* Reject negative values explicitly instead of relying on their implicit
       conversion to a huge unsigned number in the length comparisons. */
    if (s == 0 || offset < 0 || n < 0) {
        return 0;
    }

    len = strlen(s);
    if (len == 0 || len < (size_t) offset || len - (size_t) offset < (size_t) n) {
        return 0;
    }

    /* malloc + memcpy rather than strndup, which is not ISO C. */
    copy = malloc((size_t) n + 1);
    if (!copy) {
        return 0;
    }
    memcpy(copy, s + offset, (size_t) n);
    copy[n] = '\0';
    return copy;
}
|
52
|
+
|
53
|
+
/*
 * N-gram similarity of source and target (Kondrak's n-gram edit distance,
 * ported from Lucene's NGramDistance), using n-grams of size n.
 *
 * A NULL string is treated as empty. Two empty strings score 1, one empty
 * string scores 0. When either string is shorter than n the score is the
 * fraction of positions holding equal characters.
 *
 * Returns the score, or -1 when n < 1 (for non-empty inputs) or when memory
 * cannot be allocated.
 */
double ngram_distance (const char *source, const char *target, int n) {
    int sl = source ? (int) strlen(source) : 0;
    int tl = target ? (int) strlen(target) : 0;
    int longest = (sl > tl) ? sl : tl;
    int shortest = (sl < tl) ? sl : tl;
    int i, j, ni;
    int cost, tn;
    char *sa;      /* source with a prefix of n-1 zero bytes */
    char *t_j;     /* j-th n-gram of target */
    double *p;     /* 'previous' cost row */
    double *d;     /* current cost row */
    double *swap;  /* placeholder to assist in swapping p and d */
    double ec, best, alt, p_sl;

    if (sl == 0 || tl == 0) {
        return (sl == tl) ? 1 : 0;
    }

    /* An n-gram size below 1 would divide by zero or request a negative
       allocation size further down. */
    if (n < 1) {
        return -1;
    }

    if (sl < n || tl < n) {
        cost = 0;
        for (i = 0; i < shortest; i++) {
            if (source[i] == target[i]) {
                cost++;
            }
        }
        return (double) cost / (double) longest;
    }

    sa = calloc((size_t) sl + (size_t) n, sizeof(char));
    p = calloc((size_t) sl + 1, sizeof(double));
    d = calloc((size_t) sl + 1, sizeof(double));
    t_j = calloc((size_t) n + 1, sizeof(char));
    if (!sa || !p || !d || !t_j) {
        free(sa);
        free(p);
        free(d);
        free(t_j);
        return -1;
    }

    /* calloc already zeroed the n-1 prefix bytes; append the source. */
    memcpy(sa + (n - 1), source, (size_t) sl);

    for (i = 0; i <= sl; i++) {
        p[i] = i;
    }

    for (j = 1; j <= tl; j++) {
        /* Build the j-th n-gram of target in the reusable buffer instead of
           allocating a fresh copy for every target character. */
        if (j < n) {
            memset(t_j, 0, (size_t) (n - j));            /* zero prefix */
            memcpy(t_j + (n - j), target, (size_t) j);
        }
        else {
            memcpy(t_j, target + (j - n), (size_t) n);
        }

        d[0] = j;
        for (i = 1; i <= sl; i++) {
            cost = 0;
            tn = n;
            /* compare the i-th n-gram of sa to t_j */
            for (ni = 0; ni < n; ni++) {
                if (sa[i - 1 + ni] != t_j[ni]) {
                    cost++;
                }
                else if (sa[i - 1 + ni] == 0) { /* discount matches on prefix */
                    tn--;
                }
            }
            /* tn stays >= 1: the last byte of every sa n-gram is a real,
               non-zero source character. */
            ec = (double) cost / (double) tn;

            /* minimum of cell to the left+1, to the top+1, diagonally left and up +cost */
            best = d[i - 1] + 1;
            alt = p[i] + 1;
            if (alt < best) {
                best = alt;
            }
            alt = p[i - 1] + ec;
            if (alt < best) {
                best = alt;
            }
            d[i] = best;
        }

        /* the current row becomes the 'previous' row */
        swap = p;
        p = d;
        d = swap;
    }

    /* the last action in the loop was to swap d and p, so p holds the most
       recent cost counts */
    p_sl = p[sl];

    free(p);
    free(d);
    free(t_j);
    free(sa);

    return 1.0 - (p_sl / (double) longest);
}
|
data/hotwater.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'hotwater/version'
|
5
|
+
|
6
|
+
# Gem metadata, packaged file list and dependencies for hotwater.
Gem::Specification.new do |spec|
  spec.name        = "hotwater"
  spec.version     = Hotwater::VERSION
  spec.authors     = ["Colin Surprenant"]
  spec.email       = ["colin.surprenant@gmail.com"]
  spec.summary     = "Fast string edit distance"
  spec.description = "Ruby & JRuby gem with fast string edit distance C implementation using FFI bindings"
  spec.homepage    = "http://github.com/colinsurprenant/hotwater"

  # Package every file tracked by git; executables and test files are picked
  # out of that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # The native library is built at install time through this Rakefile.
  spec.extensions = ["ext/hotwater/Rakefile"]

  # Runtime dependencies.
  %w[rake ffi ffi-compiler].each do |dependency|
    spec.add_dependency dependency
  end

  spec.add_development_dependency 'rspec'
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # int damerau_levenshtein_distance(const char *str1, const char *str2)
    attach_function :damerau_levenshtein_distance, [:string, :string], :int
  end

  # Damerau-Levenshtein edit distance between s1 and s2, computed by the
  # native library on the strings' bytes.
  #
  # Raises RuntimeError when the native call reports an allocation failure (-1).
  def damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Similarity score derived from the Damerau-Levenshtein distance:
  # (max_length - distance) / max_length. Identical non-empty strings score
  # 1.0; two empty strings score 0.0.
  #
  # Raises RuntimeError when the native call reports an allocation failure (-1).
  def normalized_damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    # The native code counts bytes, so normalize by byte length; character
    # counts would produce negative scores for multi-byte strings.
    max = [s1.to_s.bytesize, s2.to_s.bytesize].max
    # Guard the division on the length rather than on the distance: a zero
    # distance between identical strings must score 1.0, not 0.0.
    return 0.0 if max == 0
    (max - result.to_f) / max
  end

  module_function :damerau_levenshtein_distance, :normalized_damerau_levenshtein_distance

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # double jaro_distance(const char *ying, const char *yang)
    attach_function :jaro_distance, [:string, :string], :double

    # double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
    attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
  end

  # Jaro score of s1 and s2 as computed by the native library.
  # A negative native result is reported as a RuntimeError.
  def jaro_distance(s1, s2)
    score = C.jaro_distance(s1, s2)
    if score < 0.0
      raise("memory allocation error")
    end
    score
  end

  # Jaro-Winkler score of s1 and s2 as computed by the native library;
  # long_tolerance is passed through to the native function.
  # A negative native result is reported as a RuntimeError.
  def jaro_winkler_distance(s1, s2, long_tolerance = false)
    score = C.jaro_winkler_distance(s1, s2, long_tolerance)
    if score < 0.0
      raise("memory allocation error")
    end
    score
  end

  module_function :jaro_distance, :jaro_winkler_distance
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # int levenshtein_distance(const char *str1, const char *str2);
    attach_function :levenshtein_distance, [:string, :string], :int
  end

  # Levenshtein edit distance between s1 and s2, computed by the native
  # library on the strings' bytes.
  #
  # Raises RuntimeError when the native call reports an allocation failure (-1).
  def levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Similarity score derived from the Levenshtein distance:
  # (max_length - distance) / max_length. Identical non-empty strings score
  # 1.0; two empty strings score 0.0.
  #
  # Raises RuntimeError when the native call reports an allocation failure (-1).
  def normalized_levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    # The native code counts bytes, so normalize by byte length; character
    # counts would produce negative scores for multi-byte strings.
    max = [s1.to_s.bytesize, s2.to_s.bytesize].max
    # Guard the division on the length rather than on the distance: a zero
    # distance between identical strings must score 1.0, not 0.0.
    return 0.0 if max == 0
    (max - result.to_f) / max
  end

  module_function :levenshtein_distance, :normalized_levenshtein_distance
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
module Hotwater

  module C
    # double ngram_distance(const char *source, const char *target, int n);
    attach_function :ngram_distance, [:string, :string, :int], :double
  end

  # N-gram score of s1 and s2 as computed by the native library, using
  # n-grams of size n (bigrams by default).
  #
  # Raises ArgumentError when n is smaller than 1, and RuntimeError when the
  # native call reports an allocation failure (-1).
  def ngram_distance(s1, s2, n = 2)
    # An n-gram size below 1 has no meaning, so reject it before it reaches
    # the native code.
    raise(ArgumentError, "n must be >= 1") if n < 1
    result = C::ngram_distance(s1, s2, n)
    raise("memory allocation error") if result == -1
    result
  end

  module_function :ngram_distance

end
|
data/lib/hotwater.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'hotwater/version'
|
2
|
+
require 'ffi'
|
3
|
+
require 'ffi-compiler/loader'
|
4
|
+
|
5
|
+
module Hotwater
  # Holds the raw FFI bindings; the per-algorithm files required below attach
  # their native functions to this module.
  module C
    extend FFI::Library

    # Locate the compiled 'hotwater' library and bind this module to it.
    native_library = FFI::Compiler::Loader.find('hotwater')
    ffi_lib native_library
  end
end
|
11
|
+
|
12
|
+
require 'hotwater/levenshtein_ffi'
|
13
|
+
require 'hotwater/damerau_levenshtein_ffi'
|
14
|
+
require 'hotwater/jaro_ffi'
|
15
|
+
require 'hotwater/ngram_ffi'
|
16
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute damerau_levenshtein_distance" do
    # [s1, s2, expected distance]
    [
      ["", "", 0],
      ["abc", "", 3],
      ["bc", "abc", 1],
      ["ca", "abc", 3],
      ["abc", "acb", 1],
      ["kitten", "sitting", 3],
      ["Saturday", "Sunday", 3],
      ["teusday", "tuesday", 1],
      ["teusday", "thursday", 2],
    ].each do |s1, s2, expected|
      Hotwater.damerau_levenshtein_distance(s1, s2).should == expected
    end
  end

  it "should compute normalized_damerau_levenshtein_distance" do
    # pairs whose score is compared without rounding
    [
      ["", ""],
      ["abc", ""],
      ["ca", "abc"],
    ].each do |s1, s2|
      Hotwater.normalized_damerau_levenshtein_distance(s1, s2).should == 0.0
    end

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["bc", "abc", 0.6667],
      ["abc", "acb", 0.6667],
      ["kitten", "sitting", 0.5714],
      ["Saturday", "Sunday", 0.625],
      ["teusday", "tuesday", 0.8571],
      ["teusday", "thursday", 0.75],
    ].each do |s1, s2, expected|
      Hotwater.normalized_damerau_levenshtein_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute jaro_distance" do
    Hotwater.jaro_distance("", "").should == 0.0

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["dixon", "dicksonx", 0.7667],
      ["martha", "marhta", 0.9444],
      ["dwayne", "duane", 0.8222],
    ].each do |s1, s2, expected|
      Hotwater.jaro_distance(s1, s2).round(4).should == expected
    end
  end

  it "should compute jaro_winkler_distance" do
    Hotwater.jaro_winkler_distance("", "").should == 0.0

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["dixon", "dicksonx", 0.8133],
      ["martha", "marhta", 0.9611],
      ["dwayne", "duane", 0.84],
    ].each do |s1, s2, expected|
      Hotwater.jaro_winkler_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do

  it "should compute levenshtein_distance" do
    # [s1, s2, expected distance]
    [
      ["", "", 0],
      ["abc", "", 3],
      ["bc", "abc", 1],
      ["ca", "abc", 3],
      ["abc", "acb", 2],
      ["kitten", "sitting", 3],
      ["Saturday", "Sunday", 3],
      ["teusday", "tuesday", 2],
      ["teusday", "thursday", 2],
    ].each do |s1, s2, expected|
      Hotwater.levenshtein_distance(s1, s2).should == expected
    end
  end

  it "should compute normalized_levenshtein_distance" do
    # pairs whose score is compared without rounding
    [
      ["", ""],
      ["abc", ""],
      ["ca", "abc"],
    ].each do |s1, s2|
      Hotwater.normalized_levenshtein_distance(s1, s2).should == 0.0
    end

    # [s1, s2, expected score rounded to 4 decimals]
    [
      ["bc", "abc", 0.6667],
      ["abc", "acb", 0.3333],
      ["kitten", "sitting", 0.5714],
      ["Saturday", "Sunday", 0.625],
      ["teusday", "tuesday", 0.7143],
      ["teusday", "thursday", 0.75],
    ].each do |s1, s2, expected|
      Hotwater.normalized_levenshtein_distance(s1, s2).round(4).should == expected
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'hotwater'
|
3
|
+
|
4
|
+
describe Hotwater do
|
5
|
+
|
6
|
+
it "should compute unigram distance" do
|
7
|
+
Hotwater.ngram_distance("", "al", 1).round(4).should == 0.0
|
8
|
+
Hotwater.ngram_distance("al", "al", 1).round(4).should == 1.0
|
9
|
+
Hotwater.ngram_distance("a", "a", 1).round(4).should == 1.0
|
10
|
+
Hotwater.ngram_distance("b", "a", 1).round(4).should == 0.0
|
11
|
+
Hotwater.ngram_distance("martha", "marhta", 1).round(4).should == 0.6667
|
12
|
+
Hotwater.ngram_distance("jones", "johnson", 1).round(4).should == 0.4286
|
13
|
+
Hotwater.ngram_distance("natural", "contrary", 1).round(4).should == 0.25
|
14
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 1).round(4).should == 0.75
|
15
|
+
Hotwater.ngram_distance("dwayne", "duane", 1).round(4).should == 0.6667
|
16
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 1).round(4).should == 0.5
|
17
|
+
Hotwater.ngram_distance("six", "ten", 1).round(4).should == 0.0
|
18
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 1).round(4).should == Hotwater.ngram_distance("zac ephron", "kai ephron", 1).round(4)
|
19
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 1).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 1)
|
20
|
+
Hotwater.ngram_distance("12345678", "12890678", 1).round(4).should == Hotwater.ngram_distance("12345678", "72385698", 1)
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should compute bigram distance" do
|
24
|
+
Hotwater.ngram_distance("", "al", 2).round(4).should == 0.0
|
25
|
+
Hotwater.ngram_distance("al", "al", 2).round(4).should == 1.0
|
26
|
+
Hotwater.ngram_distance("a", "a", 2).round(4).should == 1.0
|
27
|
+
Hotwater.ngram_distance("b", "a", 2).round(4).should == 0.0
|
28
|
+
Hotwater.ngram_distance("a", "aa", 2).round(4).should == 0.5
|
29
|
+
Hotwater.ngram_distance("martha", "marhta", 2).round(4).should == 0.6667
|
30
|
+
Hotwater.ngram_distance("jones", "johnson", 2).round(4).should == 0.4286
|
31
|
+
Hotwater.ngram_distance("natural", "contrary", 2).round(4).should == 0.25
|
32
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 2).round(4).should == 0.625
|
33
|
+
Hotwater.ngram_distance("dwayne", "duane", 2).round(4).should == 0.5833
|
34
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 2).round(4).should == 0.5
|
35
|
+
Hotwater.ngram_distance("six", "ten", 2).round(4).should == 0.0
|
36
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 2).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 2).round(4)
|
37
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 2).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 2)
|
38
|
+
Hotwater.ngram_distance("0012345678", "0012890678", 2).round(4).should == Hotwater.ngram_distance("0012345678", "0072385698", 2)
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should compute bigram distance by default" do
|
42
|
+
Hotwater.ngram_distance("", "al").round(4).should == 0.0
|
43
|
+
Hotwater.ngram_distance("al", "al").round(4).should == 1.0
|
44
|
+
Hotwater.ngram_distance("a", "a").round(4).should == 1.0
|
45
|
+
Hotwater.ngram_distance("b", "a").round(4).should == 0.0
|
46
|
+
Hotwater.ngram_distance("a", "aa").round(4).should == 0.5
|
47
|
+
Hotwater.ngram_distance("martha", "marhta").round(4).should == 0.6667
|
48
|
+
Hotwater.ngram_distance("jones", "johnson").round(4).should == 0.4286
|
49
|
+
Hotwater.ngram_distance("natural", "contrary").round(4).should == 0.25
|
50
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz").round(4).should == 0.625
|
51
|
+
Hotwater.ngram_distance("dwayne", "duane").round(4).should == 0.5833
|
52
|
+
Hotwater.ngram_distance("dixon", "dicksonx").round(4).should == 0.5
|
53
|
+
Hotwater.ngram_distance("six", "ten").round(4).should == 0.0
|
54
|
+
Hotwater.ngram_distance("zac ephron", "zac efron").round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron").round(4)
|
55
|
+
Hotwater.ngram_distance("brittney spears", "britney spears").round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman")
|
56
|
+
Hotwater.ngram_distance("0012345678", "0012890678").round(4).should == Hotwater.ngram_distance("0012345678", "0072385698")
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should compute trigram distance" do
|
60
|
+
Hotwater.ngram_distance("", "al", 3).round(4).should == 0.0
|
61
|
+
Hotwater.ngram_distance("al", "al", 3).round(4).should == 1.0
|
62
|
+
Hotwater.ngram_distance("a", "a", 3).round(4).should == 1.0
|
63
|
+
Hotwater.ngram_distance("b", "a", 3).round(4).should == 0.0
|
64
|
+
Hotwater.ngram_distance("martha", "marhta", 3).round(4).should == 0.7222
|
65
|
+
Hotwater.ngram_distance("jones", "johnson", 3).round(4).should == 0.4762
|
66
|
+
Hotwater.ngram_distance("natural", "contrary", 3).round(4).should == 0.2083
|
67
|
+
Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 3).round(4).should == 0.5625
|
68
|
+
Hotwater.ngram_distance("dwayne", "duane", 3).round(4).should == 0.5278
|
69
|
+
Hotwater.ngram_distance("dixon", "dicksonx", 3).round(4).should == 0.4583
|
70
|
+
Hotwater.ngram_distance("six", "ten", 3).round(4).should == 0.0
|
71
|
+
Hotwater.ngram_distance("zac ephron", "zac efron", 3).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 3).round(4)
|
72
|
+
Hotwater.ngram_distance("brittney spears", "britney spears", 3).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 3)
|
73
|
+
Hotwater.ngram_distance("0012345678", "0012890678", 3).round(4).should < Hotwater.ngram_distance("0012345678", "0072385698", 3)
|
74
|
+
end
|
75
|
+
end
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hotwater
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Colin Surprenant
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: ffi
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: ffi-compiler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rspec
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Ruby & JRuby gem with fast string edit distance C implementation using
|
79
|
+
FFI bindings
|
80
|
+
email:
|
81
|
+
- colin.surprenant@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions:
|
84
|
+
- ext/hotwater/Rakefile
|
85
|
+
extra_rdoc_files: []
|
86
|
+
files:
|
87
|
+
- .gitignore
|
88
|
+
- Gemfile
|
89
|
+
- LICENSE.txt
|
90
|
+
- README.md
|
91
|
+
- Rakefile
|
92
|
+
- ext/hotwater/Rakefile
|
93
|
+
- ext/hotwater/damerau_levenshtein.c
|
94
|
+
- ext/hotwater/hotwater.h
|
95
|
+
- ext/hotwater/jaro.c
|
96
|
+
- ext/hotwater/levenshtein.c
|
97
|
+
- ext/hotwater/ngram.c
|
98
|
+
- hotwater.gemspec
|
99
|
+
- lib/hotwater.rb
|
100
|
+
- lib/hotwater/damerau_levenshtein_ffi.rb
|
101
|
+
- lib/hotwater/jaro_ffi.rb
|
102
|
+
- lib/hotwater/levenshtein_ffi.rb
|
103
|
+
- lib/hotwater/ngram_ffi.rb
|
104
|
+
- lib/hotwater/version.rb
|
105
|
+
- spec/hotwater/damerau_levenshtein_ffi_spec.rb
|
106
|
+
- spec/hotwater/jaro_ffi_spec.rb
|
107
|
+
- spec/hotwater/levenshtein_ffi_spec.rb
|
108
|
+
- spec/hotwater/ngram_ffi_spec.rb
|
109
|
+
homepage: http://github.com/colinsurprenant/hotwater
|
110
|
+
licenses: []
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
segments:
|
122
|
+
- 0
|
123
|
+
hash: -289401610859280349
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
+
none: false
|
126
|
+
requirements:
|
127
|
+
- - ! '>='
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
segments:
|
131
|
+
- 0
|
132
|
+
hash: -289401610859280349
|
133
|
+
requirements: []
|
134
|
+
rubyforge_project:
|
135
|
+
rubygems_version: 1.8.23
|
136
|
+
signing_key:
|
137
|
+
specification_version: 3
|
138
|
+
summary: Fast string edit distance
|
139
|
+
test_files:
|
140
|
+
- spec/hotwater/damerau_levenshtein_ffi_spec.rb
|
141
|
+
- spec/hotwater/jaro_ffi_spec.rb
|
142
|
+
- spec/hotwater/levenshtein_ffi_spec.rb
|
143
|
+
- spec/hotwater/ngram_ffi_spec.rb
|