hotwater 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/CHANGELOG.md +6 -0
- data/README.md +29 -14
- data/Rakefile +1 -1
- data/ext/hotwater/Rakefile +1 -1
- data/ext/hotwater/jaro.c +2 -1
- data/ext/hotwater/ngram.c +11 -7
- data/hotwater.gemspec +1 -1
- data/lib/hotwater/damerau_levenshtein_ffi.rb +11 -1
- data/lib/hotwater/jaro_ffi.rb +15 -0
- data/lib/hotwater/levenshtein_ffi.rb +11 -1
- data/lib/hotwater/ngram_ffi.rb +5 -0
- data/lib/hotwater/version.rb +1 -1
- data/spec/hotwater/damerau_levenshtein_ffi_spec.rb +4 -0
- data/spec/hotwater/levenshtein_ffi_spec.rb +4 -0
- metadata +7 -5
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -1,26 +1,33 @@
|
|
1
|
-
# Hotwater v0.1.
|
1
|
+
# Hotwater v0.1.2
|
2
2
|
|
3
|
-
|
3
|
+
[](http://travis-ci.org/colinsurprenant/hotwater)
|
4
|
+
|
5
|
+
Ruby & JRuby gem providing fast C implementations of **string edit distance** algorithms, with FFI bindings.
|
4
6
|
|
5
7
|
### Algorithms
|
6
8
|
|
7
|
-
- Levenshtein & Damerau Levenshtein
|
8
|
-
- Jaro & Jaro
|
9
|
-
- N-Gram
|
9
|
+
- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) & [Damerau Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) distance
|
10
|
+
- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance
|
11
|
+
- [N-Gram](https://en.wikipedia.org/wiki/N-gram) distance
|
10
12
|
|
11
13
|
## Installation
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
-
|
15
|
+
Tested on **OSX 10.8.2** and **Linux 12.10** with
|
16
|
+
- MRI Ruby 1.9.3 p385
|
17
|
+
- JRuby 1.7.2 (1.9.3 p327)
|
16
18
|
|
19
|
+
Add this line to your application's Gemfile:
|
20
|
+
```ruby
|
21
|
+
gem 'hotwater'
|
22
|
+
```
|
17
23
|
And then execute:
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
```sh
|
25
|
+
$ bundle
|
26
|
+
```
|
21
27
|
Or install it yourself as:
|
22
|
-
|
23
|
-
|
28
|
+
```sh
|
29
|
+
$ gem install hotwater
|
30
|
+
```
|
24
31
|
|
25
32
|
## Usage
|
26
33
|
|
@@ -28,7 +35,8 @@ Or install it yourself as:
|
|
28
35
|
Hotwater.levenshtein_distance("abc", "acb") # => 2
|
29
36
|
Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
|
30
37
|
|
31
|
-
#
|
38
|
+
# normalization based on the string sizes
|
39
|
+
# where an edit on a small string has more weight than on a longer string
|
32
40
|
Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
|
33
41
|
Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
|
34
42
|
|
@@ -59,12 +67,19 @@ Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
|
|
59
67
|
5. Create new Pull Request
|
60
68
|
|
61
69
|
## Credits
|
70
|
+
|
62
71
|
- Some C code from the https://github.com/sunlightlabs/jellyfish project
|
63
72
|
- N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
|
64
73
|
|
74
|
+
## Why?
|
75
|
+
|
76
|
+
Why Hotwater? As stated in the credits section, some of the C code comes from the [jellyfish Python project](https://github.com/sunlightlabs/jellyfish). Jellyfish made me think right away about New Brunswick beaches where I have been a couple of times in the past years. There is this legend about New Brunswick having warm water beaches. I even saw a tourism promotion TV commercial selling NB as having warm water. This is a lie! :P I never experienced warm water (in the generally accepted definition) in NB, only lots of jellyfish :D (that being said, I have enjoyed every bit of my visits in New Brunswick and I really do not care about warm water really ;)
|
77
|
+
|
65
78
|
## Author
|
79
|
+
|
66
80
|
Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
|
67
81
|
|
68
82
|
## License
|
83
|
+
|
69
84
|
Hotwater is distributed under the Apache License, Version 2.0.
|
70
85
|
|
data/Rakefile
CHANGED
data/ext/hotwater/Rakefile
CHANGED
data/ext/hotwater/jaro.c
CHANGED
@@ -10,6 +10,7 @@ Colin Surprenant, Feb 2013
|
|
10
10
|
#include <string.h>
|
11
11
|
#include <stdio.h>
|
12
12
|
#include <stdlib.h>
|
13
|
+
#include <alloca.h>
|
13
14
|
#include "hotwater.h"
|
14
15
|
|
15
16
|
#define NOTNUM(c) ((c>57) || (c<48))
|
@@ -52,7 +53,7 @@ double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bo
|
|
52
53
|
search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;
|
53
54
|
|
54
55
|
// Blank out the flags
|
55
|
-
ying_flag = alloca(ying_length + 1);
|
56
|
+
ying_flag = (char *)alloca(ying_length + 1);
|
56
57
|
if (!ying_flag) return -1.0;
|
57
58
|
|
58
59
|
yang_flag = alloca(yang_length + 1);
|
data/ext/hotwater/ngram.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
Colin Surprenant, Feb 2013
|
3
3
|
- converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
|
4
|
-
- fixed segfault bug in substring n parameter, which did not surface in Java
|
4
|
+
- fixed segfault bug in call to substring for n parameter, which did not surface in Java
|
5
5
|
*/
|
6
6
|
|
7
7
|
/* package org.apache.lucene.search.spell; */
|
@@ -66,7 +66,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
66
66
|
int cost = 0;
|
67
67
|
if (sl < n || tl < n) {
|
68
68
|
int ni = MIN(sl, tl);
|
69
|
-
|
69
|
+
int i;
|
70
|
+
for (i = 0; i < ni; i++) {
|
70
71
|
if (source[i] == target[i]) {
|
71
72
|
cost++;
|
72
73
|
}
|
@@ -85,7 +86,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
85
86
|
double* _d; // placeholder to assist in swapping p and d
|
86
87
|
|
87
88
|
// construct sa with prefix
|
88
|
-
|
89
|
+
int i;
|
90
|
+
for (i = 0; i < sa_len; i++) {
|
89
91
|
if (i < n - 1) {
|
90
92
|
sa[i] = 0 ; //add prefix
|
91
93
|
}
|
@@ -108,7 +110,7 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
108
110
|
}
|
109
111
|
|
110
112
|
// indexes into strings s and t
|
111
|
-
|
113
|
+
i = 0; // iterates through source
|
112
114
|
int j = 0; // iterates through target
|
113
115
|
|
114
116
|
char* t_j = calloc(n + 1, sizeof(char)); // jth n-gram of t
|
@@ -125,10 +127,11 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
125
127
|
for (j = 1; j <= tl; j++) {
|
126
128
|
// construct t_j n-gram
|
127
129
|
if (j < n) {
|
128
|
-
|
130
|
+
int ti;
|
131
|
+
for (ti = 0; ti < n - j; ti++) {
|
129
132
|
t_j[ti] = 0; //add prefix
|
130
133
|
}
|
131
|
-
for (
|
134
|
+
for (ti = n - j; ti < n; ti++) {
|
132
135
|
t_j[ti] = target[ti - (n - j)];
|
133
136
|
}
|
134
137
|
}
|
@@ -142,7 +145,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
142
145
|
int tn = n;
|
143
146
|
// compare sa to t_j
|
144
147
|
|
145
|
-
|
148
|
+
int ni;
|
149
|
+
for (ni = 0; ni < n; ni++) {
|
146
150
|
if (sa[i - 1 + ni] != t_j[ni]) {
|
147
151
|
cost++;
|
148
152
|
}
|
data/hotwater.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Hotwater::VERSION
|
9
9
|
gem.authors = ["Colin Surprenant"]
|
10
10
|
gem.email = ["colin.surprenant@gmail.com"]
|
11
|
-
gem.description = "Ruby & JRuby gem with fast string edit distance C
|
11
|
+
gem.description = "Ruby & JRuby gem with fast string edit distance algorithms C implementations with FFI bindings"
|
12
12
|
gem.summary = "Fast string edit distance"
|
13
13
|
gem.homepage = "http://github.com/colinsurprenant/hotwater"
|
14
14
|
|
@@ -7,16 +7,26 @@ module Hotwater
|
|
7
7
|
attach_function :damerau_levenshtein_distance, [:string, :string], :int
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute Damerau Levenshtein edit distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @return [Integer] edit distance
|
10
14
|
def damerau_levenshtein_distance(s1, s2)
|
11
15
|
result = C::damerau_levenshtein_distance(s1, s2)
|
12
16
|
raise("memory allocation error") if result == -1
|
13
17
|
result
|
14
18
|
end
|
15
19
|
|
20
|
+
# compute normalized Damerau Levenshtein edit distance between 2 strings
|
21
|
+
# normalization weights the edit distance by the string lengths, where
|
22
|
+
# an edit on a small string has more impact than on a longer string
|
23
|
+
# @param s1 [String] first string
|
24
|
+
# @param s2 [String] second string
|
25
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
16
26
|
def normalized_damerau_levenshtein_distance(s1, s2)
|
17
27
|
result = C::damerau_levenshtein_distance(s1, s2)
|
18
28
|
raise("memory allocation error") if result == -1
|
19
|
-
return 0.0 if result == 0
|
29
|
+
return 0.0 if result == 0.0
|
20
30
|
max = [s1.size, s2.size].max
|
21
31
|
(max - result.to_f) / max
|
22
32
|
end
|
data/lib/hotwater/jaro_ffi.rb
CHANGED
@@ -10,12 +10,27 @@ module Hotwater
|
|
10
10
|
attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
|
11
11
|
end
|
12
12
|
|
13
|
+
# compute Jaro edit distance between 2 strings
|
14
|
+
# @param s1 [String] first string
|
15
|
+
# @param s2 [String] second string
|
16
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
13
17
|
def jaro_distance(s1, s2)
|
14
18
|
result = C::jaro_distance(s1, s2)
|
15
19
|
raise("memory allocation error") if result < 0.0
|
16
20
|
result
|
17
21
|
end
|
18
22
|
|
23
|
+
# compute Jaro-Winkler edit distance between 2 strings
|
24
|
+
#
|
25
|
+
# setting `long_tolerance = true` increases the probability of a match when the number
|
26
|
+
# of matched characters is large. This option allows for a little more
|
27
|
+
# tolerance when the strings are large. It is not an appropriate
|
28
|
+
# test when comparing fixed length fields such as phone and social security numbers.
|
29
|
+
#
|
30
|
+
# @param s1 [String] first string
|
31
|
+
# @param s2 [String] second string
|
32
|
+
# @param long_tolerance [Boolean] add more tolerance when the strings are large. Default `false`
|
33
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
19
34
|
def jaro_winkler_distance(s1, s2, long_tolerance = false)
|
20
35
|
result = C::jaro_winkler_distance(s1, s2, long_tolerance)
|
21
36
|
raise("memory allocation error") if result < 0.0
|
@@ -7,16 +7,26 @@ module Hotwater
|
|
7
7
|
attach_function :levenshtein_distance, [:string, :string], :int
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute Levenshtein edit distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @return [Integer] edit distance
|
10
14
|
def levenshtein_distance(s1, s2)
|
11
15
|
result = C::levenshtein_distance(s1, s2)
|
12
16
|
raise("memory allocation error") if result == -1
|
13
17
|
result
|
14
18
|
end
|
15
19
|
|
20
|
+
# compute normalized Levenshtein edit distance between 2 strings
|
21
|
+
# normalization weights the edit distance by the string lengths, where
|
22
|
+
# an edit on a small string has more impact than on a longer string
|
23
|
+
# @param s1 [String] first string
|
24
|
+
# @param s2 [String] second string
|
25
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
16
26
|
def normalized_levenshtein_distance(s1, s2)
|
17
27
|
result = C::levenshtein_distance(s1, s2)
|
18
28
|
raise("memory allocation error") if result == -1
|
19
|
-
return 0.0 if result == 0
|
29
|
+
return 0.0 if result == 0.0
|
20
30
|
max = [s1.size, s2.size].max
|
21
31
|
(max - result.to_f) / max
|
22
32
|
end
|
data/lib/hotwater/ngram_ffi.rb
CHANGED
@@ -7,6 +7,11 @@ module Hotwater
|
|
7
7
|
attach_function :ngram_distance, [:string, :string, :int], :double
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute N-Gram distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @param n [Integer] number of characters per gram, default is 2
|
14
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
10
15
|
def ngram_distance(s1, s2, n = 2)
|
11
16
|
result = C::ngram_distance(s1, s2, n)
|
12
17
|
raise("memory allocation error") if result == -1
|
data/lib/hotwater/version.rb
CHANGED
@@ -25,5 +25,9 @@ describe Hotwater do
|
|
25
25
|
Hotwater.normalized_damerau_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
|
26
26
|
Hotwater.normalized_damerau_levenshtein_distance("teusday", "tuesday").round(4).should == 0.8571
|
27
27
|
Hotwater.normalized_damerau_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
|
28
|
+
|
29
|
+
Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
|
30
|
+
Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
|
31
|
+
Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
|
28
32
|
end
|
29
33
|
end
|
@@ -25,5 +25,9 @@ describe Hotwater do
|
|
25
25
|
Hotwater.normalized_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
|
26
26
|
Hotwater.normalized_levenshtein_distance("teusday", "tuesday").round(4).should == 0.7143
|
27
27
|
Hotwater.normalized_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
|
28
|
+
|
29
|
+
Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
|
30
|
+
Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
|
31
|
+
Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
|
28
32
|
end
|
29
33
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hotwater
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -75,8 +75,8 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
-
description: Ruby & JRuby gem with fast string edit distance C
|
79
|
-
FFI bindings
|
78
|
+
description: Ruby & JRuby gem with fast string edit distance algorithms C implementations
|
79
|
+
with FFI bindings
|
80
80
|
email:
|
81
81
|
- colin.surprenant@gmail.com
|
82
82
|
executables: []
|
@@ -85,6 +85,8 @@ extensions:
|
|
85
85
|
extra_rdoc_files: []
|
86
86
|
files:
|
87
87
|
- .gitignore
|
88
|
+
- .travis.yml
|
89
|
+
- CHANGELOG.md
|
88
90
|
- Gemfile
|
89
91
|
- LICENSE.txt
|
90
92
|
- README.md
|
@@ -120,7 +122,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
120
122
|
version: '0'
|
121
123
|
segments:
|
122
124
|
- 0
|
123
|
-
hash:
|
125
|
+
hash: 3182707895951543481
|
124
126
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
127
|
none: false
|
126
128
|
requirements:
|
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
131
|
version: '0'
|
130
132
|
segments:
|
131
133
|
- 0
|
132
|
-
hash:
|
134
|
+
hash: 3182707895951543481
|
133
135
|
requirements: []
|
134
136
|
rubyforge_project:
|
135
137
|
rubygems_version: 1.8.23
|