hotwater 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/CHANGELOG.md +6 -0
- data/README.md +29 -14
- data/Rakefile +1 -1
- data/ext/hotwater/Rakefile +1 -1
- data/ext/hotwater/jaro.c +2 -1
- data/ext/hotwater/ngram.c +11 -7
- data/hotwater.gemspec +1 -1
- data/lib/hotwater/damerau_levenshtein_ffi.rb +11 -1
- data/lib/hotwater/jaro_ffi.rb +15 -0
- data/lib/hotwater/levenshtein_ffi.rb +11 -1
- data/lib/hotwater/ngram_ffi.rb +5 -0
- data/lib/hotwater/version.rb +1 -1
- data/spec/hotwater/damerau_levenshtein_ffi_spec.rb +4 -0
- data/spec/hotwater/levenshtein_ffi_spec.rb +4 -0
- metadata +7 -5
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -1,26 +1,33 @@
|
|
1
|
-
# Hotwater v0.1.
|
1
|
+
# Hotwater v0.1.2
|
2
2
|
|
3
|
-
|
3
|
+
[![build status](https://secure.travis-ci.org/colinsurprenant/hotwater.png)](http://travis-ci.org/colinsurprenant/hotwater)
|
4
|
+
|
5
|
+
Ruby & JRuby gem with fast C implementations of **string edit distance** algorithms with FFI bindings.
|
4
6
|
|
5
7
|
### Algorithms
|
6
8
|
|
7
|
-
- Levenshtein & Damerau Levenshtein
|
8
|
-
- Jaro & Jaro
|
9
|
-
- N-Gram
|
9
|
+
- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) & [Damerau Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) distance
|
10
|
+
- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance
|
11
|
+
- [N-Gram](https://en.wikipedia.org/wiki/N-gram) distance
|
10
12
|
|
11
13
|
## Installation
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
-
|
15
|
+
Tested on **OSX 10.8.2** and **Linux 12.10** with
|
16
|
+
- MRI Ruby 1.9.3 p385
|
17
|
+
- JRuby 1.7.2 (1.9.3 p327)
|
16
18
|
|
19
|
+
Add this line to your application's Gemfile:
|
20
|
+
```ruby
|
21
|
+
gem 'hotwater'
|
22
|
+
```
|
17
23
|
And then execute:
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
```sh
|
25
|
+
$ bundle
|
26
|
+
```
|
21
27
|
Or install it yourself as:
|
22
|
-
|
23
|
-
|
28
|
+
```sh
|
29
|
+
$ gem install hotwater
|
30
|
+
```
|
24
31
|
|
25
32
|
## Usage
|
26
33
|
|
@@ -28,7 +35,8 @@ Or install it yourself as:
|
|
28
35
|
Hotwater.levenshtein_distance("abc", "acb") # => 2
|
29
36
|
Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
|
30
37
|
|
31
|
-
#
|
38
|
+
# normalization based on the string sizes
|
39
|
+
# where an edit on a small string has more weight than on a longer string
|
32
40
|
Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
|
33
41
|
Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
|
34
42
|
|
@@ -59,12 +67,19 @@ Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
|
|
59
67
|
5. Create new Pull Request
|
60
68
|
|
61
69
|
## Credits
|
70
|
+
|
62
71
|
- Some C code from the https://github.com/sunlightlabs/jellyfish project
|
63
72
|
- N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
|
64
73
|
|
74
|
+
## Why?
|
75
|
+
|
76
|
+
Why Hotwater? As stated in the credits section, some of the C code comes from the [jellyfish Python project](https://github.com/sunlightlabs/jellyfish). Jellyfish made me think right away about New Brunswick beaches where I have been a couple of times in the past years. There is this legend about New Brunswick having warm water beaches. I even saw a tourism promotion TV commercial selling NB as having warm water. This is a lie! :P I never experienced warm water (in the generally accepted definition) in NB, only lots of jellyfish :D (that being said, I have enjoyed every bit of my visits in New Brunswick and I really do not care about warm water ;)
|
77
|
+
|
65
78
|
## Author
|
79
|
+
|
66
80
|
Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
|
67
81
|
|
68
82
|
## License
|
83
|
+
|
69
84
|
Hotwater is distributed under the Apache License, Version 2.0.
|
70
85
|
|
data/Rakefile
CHANGED
data/ext/hotwater/Rakefile
CHANGED
data/ext/hotwater/jaro.c
CHANGED
@@ -10,6 +10,7 @@ Colin Surprenant, Feb 2013
|
|
10
10
|
#include <string.h>
|
11
11
|
#include <stdio.h>
|
12
12
|
#include <stdlib.h>
|
13
|
+
#include <alloca.h>
|
13
14
|
#include "hotwater.h"
|
14
15
|
|
15
16
|
#define NOTNUM(c) ((c>57) || (c<48))
|
@@ -52,7 +53,7 @@ double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bo
|
|
52
53
|
search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;
|
53
54
|
|
54
55
|
// Blank out the flags
|
55
|
-
ying_flag = alloca(ying_length + 1);
|
56
|
+
ying_flag = (char *)alloca(ying_length + 1);
|
56
57
|
if (!ying_flag) return -1.0;
|
57
58
|
|
58
59
|
yang_flag = alloca(yang_length + 1);
|
data/ext/hotwater/ngram.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
Colin Surprenant, Feb 2013
|
3
3
|
- converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
|
4
|
-
- fixed segfault bug in substring n parameter, which did not surface in Java
|
4
|
+
- fixed segfault bug in call to substring for n parameter, which did not surface in Java
|
5
5
|
*/
|
6
6
|
|
7
7
|
/* package org.apache.lucene.search.spell; */
|
@@ -66,7 +66,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
66
66
|
int cost = 0;
|
67
67
|
if (sl < n || tl < n) {
|
68
68
|
int ni = MIN(sl, tl);
|
69
|
-
|
69
|
+
int i;
|
70
|
+
for (i = 0; i < ni; i++) {
|
70
71
|
if (source[i] == target[i]) {
|
71
72
|
cost++;
|
72
73
|
}
|
@@ -85,7 +86,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
85
86
|
double* _d; // placeholder to assist in swapping p and d
|
86
87
|
|
87
88
|
// construct sa with prefix
|
88
|
-
|
89
|
+
int i;
|
90
|
+
for (i = 0; i < sa_len; i++) {
|
89
91
|
if (i < n - 1) {
|
90
92
|
sa[i] = 0 ; //add prefix
|
91
93
|
}
|
@@ -108,7 +110,7 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
108
110
|
}
|
109
111
|
|
110
112
|
// indexes into strings s and t
|
111
|
-
|
113
|
+
i = 0; // iterates through source
|
112
114
|
int j = 0; // iterates through target
|
113
115
|
|
114
116
|
char* t_j = calloc(n + 1, sizeof(char)); // jth n-gram of t
|
@@ -125,10 +127,11 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
125
127
|
for (j = 1; j <= tl; j++) {
|
126
128
|
// construct t_j n-gram
|
127
129
|
if (j < n) {
|
128
|
-
|
130
|
+
int ti;
|
131
|
+
for (ti = 0; ti < n - j; ti++) {
|
129
132
|
t_j[ti] = 0; //add prefix
|
130
133
|
}
|
131
|
-
for (
|
134
|
+
for (ti = n - j; ti < n; ti++) {
|
132
135
|
t_j[ti] = target[ti - (n - j)];
|
133
136
|
}
|
134
137
|
}
|
@@ -142,7 +145,8 @@ double ngram_distance (const char *source, const char *target, int n) {
|
|
142
145
|
int tn = n;
|
143
146
|
// compare sa to t_j
|
144
147
|
|
145
|
-
|
148
|
+
int ni;
|
149
|
+
for (ni = 0; ni < n; ni++) {
|
146
150
|
if (sa[i - 1 + ni] != t_j[ni]) {
|
147
151
|
cost++;
|
148
152
|
}
|
data/hotwater.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Hotwater::VERSION
|
9
9
|
gem.authors = ["Colin Surprenant"]
|
10
10
|
gem.email = ["colin.surprenant@gmail.com"]
|
11
|
-
gem.description = "Ruby & JRuby gem with fast string edit distance C
|
11
|
+
gem.description = "Ruby & JRuby gem with fast string edit distance algorithms C implementations with FFI bindings"
|
12
12
|
gem.summary = "Fast string edit distance"
|
13
13
|
gem.homepage = "http://github.com/colinsurprenant/hotwater"
|
14
14
|
|
@@ -7,16 +7,26 @@ module Hotwater
|
|
7
7
|
attach_function :damerau_levenshtein_distance, [:string, :string], :int
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute Damerau Levenshtein edit distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @return [Integer] edit distance
|
10
14
|
def damerau_levenshtein_distance(s1, s2)
|
11
15
|
result = C::damerau_levenshtein_distance(s1, s2)
|
12
16
|
raise("memory allocation error") if result == -1
|
13
17
|
result
|
14
18
|
end
|
15
19
|
|
20
|
+
# compute normalized Damerau Levenshtein edit distance between 2 strings
|
21
|
+
# normalization weights the edit distance using the string lengths where
|
22
|
+
# an edit on a small string has more impact than on a longer string
|
23
|
+
# @param s1 [String] first string
|
24
|
+
# @param s2 [String] second string
|
25
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
16
26
|
def normalized_damerau_levenshtein_distance(s1, s2)
|
17
27
|
result = C::damerau_levenshtein_distance(s1, s2)
|
18
28
|
raise("memory allocation error") if result == -1
|
19
|
-
return 0.0 if result == 0
|
29
|
+
return 0.0 if result == 0.0
|
20
30
|
max = [s1.size, s2.size].max
|
21
31
|
(max - result.to_f) / max
|
22
32
|
end
|
data/lib/hotwater/jaro_ffi.rb
CHANGED
@@ -10,12 +10,27 @@ module Hotwater
|
|
10
10
|
attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
|
11
11
|
end
|
12
12
|
|
13
|
+
# compute Jaro edit distance between 2 strings
|
14
|
+
# @param s1 [String] first string
|
15
|
+
# @param s2 [String] second string
|
16
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
13
17
|
def jaro_distance(s1, s2)
|
14
18
|
result = C::jaro_distance(s1, s2)
|
15
19
|
raise("memory allocation error") if result < 0.0
|
16
20
|
result
|
17
21
|
end
|
18
22
|
|
23
|
+
# compute Jaro-Winkler edit distance between 2 strings
|
24
|
+
#
|
25
|
+
# setting `long_tolerance = true` increases the probability of a match when the number
|
26
|
+
# of matched characters is large. This option allows for a little more
|
27
|
+
# tolerance when the strings are large. It is not an appropriate
|
28
|
+
# test when comparing fixed length fields such as phone and social security numbers.
|
29
|
+
#
|
30
|
+
# @param s1 [String] first string
|
31
|
+
# @param s2 [String] second string
|
32
|
+
# @param long_tolerance [Boolean] add more tolerance when the strings are large. Default `false`
|
33
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
19
34
|
def jaro_winkler_distance(s1, s2, long_tolerance = false)
|
20
35
|
result = C::jaro_winkler_distance(s1, s2, long_tolerance)
|
21
36
|
raise("memory allocation error") if result < 0.0
|
@@ -7,16 +7,26 @@ module Hotwater
|
|
7
7
|
attach_function :levenshtein_distance, [:string, :string], :int
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute Levenshtein edit distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @return [Integer] edit distance
|
10
14
|
def levenshtein_distance(s1, s2)
|
11
15
|
result = C::levenshtein_distance(s1, s2)
|
12
16
|
raise("memory allocation error") if result == -1
|
13
17
|
result
|
14
18
|
end
|
15
19
|
|
20
|
+
# compute normalized Levenshtein edit distance between 2 strings
|
21
|
+
# normalization weights the edit distance using the string lengths where
|
22
|
+
# an edit on a small string has more impact than on a longer string
|
23
|
+
# @param s1 [String] first string
|
24
|
+
# @param s2 [String] second string
|
25
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
16
26
|
def normalized_levenshtein_distance(s1, s2)
|
17
27
|
result = C::levenshtein_distance(s1, s2)
|
18
28
|
raise("memory allocation error") if result == -1
|
19
|
-
return 0.0 if result == 0
|
29
|
+
return 0.0 if result == 0.0
|
20
30
|
max = [s1.size, s2.size].max
|
21
31
|
(max - result.to_f) / max
|
22
32
|
end
|
data/lib/hotwater/ngram_ffi.rb
CHANGED
@@ -7,6 +7,11 @@ module Hotwater
|
|
7
7
|
attach_function :ngram_distance, [:string, :string, :int], :double
|
8
8
|
end
|
9
9
|
|
10
|
+
# compute N-Gram distance between 2 strings
|
11
|
+
# @param s1 [String] first string
|
12
|
+
# @param s2 [String] second string
|
13
|
+
# @param n [Integer] number of characters per gram, default is 2
|
14
|
+
# @return [Float] edit distance between 0.0 and 1.0
|
10
15
|
def ngram_distance(s1, s2, n = 2)
|
11
16
|
result = C::ngram_distance(s1, s2, n)
|
12
17
|
raise("memory allocation error") if result == -1
|
data/lib/hotwater/version.rb
CHANGED
@@ -25,5 +25,9 @@ describe Hotwater do
|
|
25
25
|
Hotwater.normalized_damerau_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
|
26
26
|
Hotwater.normalized_damerau_levenshtein_distance("teusday", "tuesday").round(4).should == 0.8571
|
27
27
|
Hotwater.normalized_damerau_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
|
28
|
+
|
29
|
+
Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
|
30
|
+
Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
|
31
|
+
Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
|
28
32
|
end
|
29
33
|
end
|
@@ -25,5 +25,9 @@ describe Hotwater do
|
|
25
25
|
Hotwater.normalized_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
|
26
26
|
Hotwater.normalized_levenshtein_distance("teusday", "tuesday").round(4).should == 0.7143
|
27
27
|
Hotwater.normalized_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
|
28
|
+
|
29
|
+
Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
|
30
|
+
Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
|
31
|
+
Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
|
28
32
|
end
|
29
33
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hotwater
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -75,8 +75,8 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
-
description: Ruby & JRuby gem with fast string edit distance C
|
79
|
-
FFI bindings
|
78
|
+
description: Ruby & JRuby gem with fast string edit distance algorithms C implementations
|
79
|
+
with FFI bindings
|
80
80
|
email:
|
81
81
|
- colin.surprenant@gmail.com
|
82
82
|
executables: []
|
@@ -85,6 +85,8 @@ extensions:
|
|
85
85
|
extra_rdoc_files: []
|
86
86
|
files:
|
87
87
|
- .gitignore
|
88
|
+
- .travis.yml
|
89
|
+
- CHANGELOG.md
|
88
90
|
- Gemfile
|
89
91
|
- LICENSE.txt
|
90
92
|
- README.md
|
@@ -120,7 +122,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
120
122
|
version: '0'
|
121
123
|
segments:
|
122
124
|
- 0
|
123
|
-
hash:
|
125
|
+
hash: 3182707895951543481
|
124
126
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
127
|
none: false
|
126
128
|
requirements:
|
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
131
|
version: '0'
|
130
132
|
segments:
|
131
133
|
- 0
|
132
|
-
hash:
|
134
|
+
hash: 3182707895951543481
|
133
135
|
requirements: []
|
134
136
|
rubyforge_project:
|
135
137
|
rubygems_version: 1.8.23
|