hotwater 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .ruby-version
19
+ vendor/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in hotwater.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,45 @@
1
+ Copyright (c) 2013 Colin Surprenant <colin.surprenant@gmail.com>
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
15
+ --------
16
+
17
+ C code from the https://github.com/sunlightlabs/jellyfish project
18
+
19
+ Copyright (c) 2010, Sunlight Labs
20
+
21
+ All rights reserved.
22
+
23
+ Redistribution and use in source and binary forms, with or without modification,
24
+ are permitted provided that the following conditions are met:
25
+
26
+ * Redistributions of source code must retain the above copyright notice,
27
+ this list of conditions and the following disclaimer.
28
+ * Redistributions in binary form must reproduce the above copyright notice,
29
+ this list of conditions and the following disclaimer in the documentation
30
+ and/or other materials provided with the distribution.
31
+ * Neither the name of Sunlight Labs nor the names of its contributors may be
32
+ used to endorse or promote products derived from this software without
33
+ specific prior written permission.
34
+
35
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
39
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # Hotwater v0.1.0
2
+
3
+ Ruby & JRuby gem with fast **string edit distance** C implementations using FFI bindings.
4
+
5
+ ### Algorithms
6
+
7
+ - Levenshtein & Damerau Levenshtein
8
+ - Jaro & Jaro Winkler
9
+ - N-Gram
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'hotwater'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install hotwater
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ Hotwater.levenshtein_distance("abc", "acb") # => 2
29
+ Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
30
+
31
+ # do normalization based on the string sizes
32
+ Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
33
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
34
+
35
+ Hotwater.jaro_distance("martha", "marhta").round(4) # => 0.9444
36
+ Hotwater.jaro_winkler_distance("martha", "marhta").round(4) # => 0.9611
37
+
38
+ # default is bigram
39
+ Hotwater.ngram_distance("natural", "contrary").round(4) # => 0.25
40
+
41
+ # specify trigram
42
+ Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
43
+ ```
44
+
45
+ ## Development
46
+
47
+ 1. Fork it
48
+ 2. Install gems `$ bundle install`
49
+ 3. Compile lib `$ rake compile`
50
+ 4. Run specs `$ rake spec`
51
+ 5. Clean compiler generated files `$ rake clean`
52
+
53
+ ## Contributing
54
+
55
+ 1. Fork it
56
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
57
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
58
+ 4. Push to the branch (`git push origin my-new-feature`)
59
+ 5. Create new Pull Request
60
+
61
+ ## Credits
62
+ - Some C code from the https://github.com/sunlightlabs/jellyfish project
63
+ - N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
64
+
65
+ ## Author
66
+ Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
67
+
68
+ ## License
69
+ Hotwater is distributed under the Apache License, Version 2.0.
70
+
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ require 'bundler/setup'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'bundler/gem_tasks'
5
+ require 'rspec/core/rake_task'
6
+ require 'ffi-compiler/compile_task'
7
+
8
+ task :default => :spec
9
+
10
# Register the :spec task while the Rakefile is being loaded. Building the
# RSpec::Core::RakeTask inside another task's body would only define it once
# that outer task is already executing, which is not how the RSpec rake
# integration is meant to be wired up.
desc "run specs"
RSpec::Core::RakeTask.new(:spec)
14
+
15
# Native library build: the ffi-compiler task for ext/hotwater/hotwater is
# declared inside its own "ffi-compiler" namespace.
desc "compiler tasks"
namespace "ffi-compiler" do
  FFI::Compiler::CompileTask.new('ext/hotwater/hotwater') do |c|
  end
end
# `rake compile` is an alias for the namespaced default task
# (presumably the one CompileTask defines -- confirm against ffi-compiler).
task :compile => ["ffi-compiler:default"]

# Files matched by these globs are registered on rake/clean's CLEAN list:
# object files, logs, shared libraries/bundles and generated Makefiles.
CLEAN.include('ext/**/*{.o,.log,.so,.bundle}')
CLEAN.include('lib/**/*{.o,.log,.so,.bundle}')
CLEAN.include('ext/**/Makefile')
25
+
@@ -0,0 +1,5 @@
1
+ require 'ffi-compiler/compile_task'
2
+
3
# Declares the ffi-compiler task for the 'hotwater' native library.
FFI::Compiler::CompileTask.new('hotwater') do |c|
  # No custom compiler configuration is applied; the block is intentionally empty.
end
@@ -0,0 +1,68 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ #include "hotwater.h"
4
+ #include <string.h>
5
+
6
/* Smallest of three size_t values. */
static size_t dl_min3(size_t a, size_t b, size_t c)
{
    size_t m = a < b ? a : b;
    return m < c ? m : c;
}

/*
 * Damerau-Levenshtein distance between two NUL-terminated byte strings,
 * in the restricted form where an adjacent transposition is looked up two
 * rows/columns back in the matrix.
 *
 * s1, s2: strings to compare; NULL is treated as the empty string.
 *
 * Returns the number of single-byte insertions, deletions, substitutions
 * and adjacent transpositions, or -1 when the work matrix cannot be
 * allocated.
 */
int damerau_levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;
    size_t i, j;
    size_t cost, best, swapped;
    size_t *dist;

    /* With one side empty the distance is the length of the other side. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* Refuse matrix sizes whose byte count would wrap around size_t. */
    if (rows > ((size_t) -1) / sizeof(size_t) / cols) {
        return -1;
    }

    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from or to the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (i = 1; i < rows; i++) {
        for (j = 1; j < cols; j++) {
            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;

            best = dl_min3(dist[((i - 1) * cols) + j] + 1,          /* deletion */
                           dist[(i * cols) + (j - 1)] + 1,          /* insertion */
                           dist[((i - 1) * cols) + (j - 1)] + cost); /* substitution */

            /*
             * Adjacent transposition. Two characters on each side are
             * needed, i.e. i >= 2 and j >= 2; the previous "> 2" guard
             * skipped a swap of the very first two characters.
             */
            if (i > 1 && j > 1 &&
                s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1]) {
                swapped = dist[((i - 2) * cols) + (j - 2)] + cost;
                if (swapped < best) {
                    best = swapped;
                }
            }

            dist[(i * cols) + j] = best;
        }
    }

    best = dist[(cols * rows) - 1];
    free(dist);

    return (int) best;
}
@@ -0,0 +1,23 @@
1
/*
 * Public interface of the hotwater native library.
 *
 * The include guard deliberately avoids a leading underscore followed by an
 * upper-case letter: such identifiers are reserved for the implementation.
 */
#ifndef HOTWATER_H
#define HOTWATER_H

#include <stdbool.h>
#include <stdlib.h>

/* Function-like min/max; arguments may be evaluated more than once. */
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

/*
 * Jaro / Jaro-Winkler scores. Both return 0 when either string is empty and
 * a negative value (-1.0) when working memory cannot be obtained.
 */
double jaro_winkler_distance(const char *str1, const char *str2, bool long_tolerance);
double jaro_distance(const char *str1, const char *str2);

/* Edit distances counted in bytes; -1 signals an allocation failure. */
int levenshtein_distance(const char *str1, const char *str2);
int damerau_levenshtein_distance(const char *str1, const char *str2);

/*
 * N-gram score with n-grams of size n: 1 when both strings are empty, 0 when
 * only one of them is, -1 on allocation failure.
 */
double ngram_distance(const char *source, const char *target, int n);

#endif /* HOTWATER_H */
@@ -0,0 +1,144 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ /*
4
+ Colin Surprenant, Feb 2013
5
+ - modified error return to -1 and small cosmetic cleanups
6
+ */
7
+
8
+
9
+ #include <ctype.h>
10
+ #include <string.h>
11
+ #include <stdio.h>
12
+ #include <stdlib.h>
13
+ #include "hotwater.h"
14
+
15
+ #define NOTNUM(c) ((c>57) || (c<48))
16
+ #define INRANGE(c) ((c>0) && (c<91))
17
+
18
+ /* borrowed heavily from strcmp95.c
19
+ * http://www.census.gov/geo/msb/stand/strcmp.c
20
+ */
21
/* True when c is not an ASCII decimal digit ('0'..'9'). */
static int jw_not_digit(char c)
{
    return (c > '9') || (c < '0');
}

/*
 * Shared implementation of the Jaro and Jaro-Winkler scores.
 *
 * ying, yang
 *     The two NUL-terminated strings to compare. NULL is treated like an
 *     empty string.
 *
 * long_tolerance
 *     Increase the probability of a match when the number of matched
 *     characters is large. This allows a little more tolerance when the
 *     strings are long; it is not an appropriate test for fixed length
 *     fields such as phone and social security numbers.
 *
 * winklerize
 *     When true, apply the Winkler common-prefix boost (and, optionally,
 *     the long-string adjustment) on top of the plain Jaro weight.
 *
 * Returns 0 when either string is empty or nothing matches, -1.0 when the
 * match flags cannot be allocated, otherwise the computed weight.
 */
double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bool winklerize)
{
    char *flags, *ying_flag, *yang_flag;
    double weight;
    long ying_length, yang_length, min_len, search_range;
    long lowlim, hilim;
    long trans_count, common_chars;
    long i, j, k;

    /* Neither string may be missing or blank. */
    if (ying == NULL || yang == NULL) return 0;
    ying_length = (long) strlen(ying);
    yang_length = (long) strlen(yang);
    if (!ying_length || !yang_length) return 0;

    /*
     * The search window derives from the LONGER string while min_len is the
     * SHORTER one; both used to be set to the longer length.
     */
    if (ying_length > yang_length) {
        search_range = ying_length;
        min_len = yang_length;
    } else {
        search_range = yang_length;
        min_len = ying_length;
    }

    /*
     * One zeroed heap block holds both flag arrays. The stack (alloca) is
     * avoided because the sizes come straight from caller-supplied strings.
     */
    flags = calloc((size_t) ying_length + (size_t) yang_length, 1);
    if (!flags) return -1.0;
    ying_flag = flags;
    yang_flag = flags + ying_length;

    search_range = (search_range / 2) - 1;
    if (search_range < 0) search_range = 0;

    /* Looking only within the search range, count and flag the matched pairs. */
    common_chars = 0;
    for (i = 0; i < ying_length; i++) {
        lowlim = (i >= search_range) ? i - search_range : 0;
        hilim = (i + search_range <= yang_length - 1) ? (i + search_range) : yang_length - 1;
        for (j = lowlim; j <= hilim; j++) {
            if (!yang_flag[j] && yang[j] == ying[i]) {
                yang_flag[j] = 1;
                ying_flag[i] = 1;
                common_chars++;
                break;
            }
        }
    }

    /* No characters in common. */
    if (!common_chars) {
        free(flags);
        return 0;
    }

    /* Count transpositions: matched characters that appear in a different order. */
    k = trans_count = 0;
    for (i = 0; i < ying_length; i++) {
        if (ying_flag[i]) {
            for (j = k; j < yang_length; j++) {
                if (yang_flag[j]) {
                    k = j + 1;
                    break;
                }
            }
            if (ying[i] != yang[j]) {
                trans_count++;
            }
        }
    }
    trans_count /= 2;
    free(flags);

    /* Main weight computation. */
    weight = common_chars / ((double) ying_length)
           + common_chars / ((double) yang_length)
           + ((double) (common_chars - trans_count)) / ((double) common_chars);
    weight /= 3.0;

    /* Continue to boost the weight if the strings are similar. */
    if (winklerize && weight > 0.7) {

        /* Adjust for having up to the first 4 characters in common. */
        j = (min_len >= 4) ? 4 : min_len;
        for (i = 0; (i < j) && (ying[i] == yang[i]) && jw_not_digit(ying[i]); i++);
        if (i) {
            weight += i * 0.1 * (1.0 - weight);
        }

        /*
         * Optionally adjust for long strings. After agreeing beginning
         * chars, at least two more must agree and the agreeing characters
         * must be > .5 of remaining characters.
         */
        if (long_tolerance && (min_len > 4) && (common_chars > i + 1) &&
            (2 * common_chars >= min_len + i)) {
            if (jw_not_digit(ying[0])) {
                weight += (1.0 - weight) *
                    ((double) (common_chars - i - 1) /
                     ((double) (ying_length + yang_length - i * 2 + 2)));
            }
        }
    }

    return weight;
}


/* Jaro-Winkler score of ying and yang; see _jaro_winkler for long_tolerance. */
double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
{
    return _jaro_winkler(ying, yang, long_tolerance, true);
}

/* Plain Jaro score of ying and yang (no prefix boost, no long-string adjustment). */
double jaro_distance(const char *ying, const char *yang)
{
    return _jaro_winkler(ying, yang, false, false);
}
@@ -0,0 +1,51 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ #include "hotwater.h"
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+ #include <stdio.h>
7
+
8
/*
 * Levenshtein edit distance between two NUL-terminated byte strings.
 *
 * s1, s2: strings to compare; NULL is treated as the empty string, matching
 *         damerau_levenshtein_distance.
 *
 * Returns the minimum number of single-byte insertions, deletions and
 * substitutions turning s1 into s2, or -1 when memory cannot be allocated.
 *
 * Only two rows of the dynamic-programming table are kept, so memory use
 * grows with the length of s2 rather than with the product of both lengths.
 */
int levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t i, j;
    unsigned *prev, *curr, *swap;
    unsigned best, result;

    /* With one side empty the distance is the length of the other side. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    prev = malloc((s2_len + 1) * sizeof(unsigned));
    curr = malloc((s2_len + 1) * sizeof(unsigned));
    if (!prev || !curr) {
        free(prev);
        free(curr);
        return -1;
    }

    /* Row 0: distance from the empty prefix of s1 to each prefix of s2. */
    for (j = 0; j <= s2_len; j++) {
        prev[j] = (unsigned) j;
    }

    for (i = 1; i <= s1_len; i++) {
        curr[0] = (unsigned) i;
        for (j = 1; j <= s2_len; j++) {
            if (s1[i - 1] == s2[j - 1]) {
                /* Same byte: carry the diagonal value over unchanged. */
                curr[j] = prev[j - 1];
            } else {
                best = prev[j];                                /* deletion */
                if (curr[j - 1] < best) best = curr[j - 1];    /* insertion */
                if (prev[j - 1] < best) best = prev[j - 1];    /* substitution */
                curr[j] = best + 1;
            }
        }
        /* The row just filled becomes the previous row of the next pass. */
        swap = prev;
        prev = curr;
        curr = swap;
    }

    result = prev[s2_len];

    free(prev);
    free(curr);

    return (int) result;
}
@@ -0,0 +1,173 @@
1
+ /*
2
+ Colin Surprenant, Feb 2013
3
+ - converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
4
+ - fixed segfault bug in substring n parameter, which did not surface in Java
5
+ */
6
+
7
+ /* package org.apache.lucene.search.spell; */
8
+
9
+ /**
10
+ * Licensed to the Apache Software Foundation (ASF) under one or more
11
+ * contributor license agreements. See the NOTICE file distributed with
12
+ * this work for additional information regarding copyright ownership.
13
+ * The ASF licenses this file to You under the Apache License, Version 2.0
14
+ * (the "License"); you may not use this file except in compliance with
15
+ * the License. You may obtain a copy of the License at
16
+ *
17
+ * http://www.apache.org/licenses/LICENSE-2.0
18
+ *
19
+ * Unless required by applicable law or agreed to in writing, software
20
+ * distributed under the License is distributed on an "AS IS" BASIS,
21
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
+ * See the License for the specific language governing permissions and
23
+ * limitations under the License.
24
+ */
25
+
26
+ /**
27
+ * N-Gram version of edit distance based on paper by Grzegorz Kondrak,
28
+ * "N-gram similarity and distance". Proceedings of the Twelfth International
29
+ * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
30
+ * Buenos Aires, Argentina, November 2005.
31
+ * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
32
+ *
33
+ * This implementation uses the position-based optimization to compute partial
34
+ * matches of n-gram sub-strings and adds a null-character prefix of size n-1
35
+ * so that the first character is contained in the same number of n-grams as
36
+ * a middle character. Null-character prefix matches are discounted so that
37
+ * strings with no matching characters will return a distance of 0.
38
+ *
39
+ */
40
+
41
+ #include "hotwater.h"
42
+ #include <string.h>
43
+ #include <stdlib.h>
44
+ #include <stdio.h>
45
+
46
+ char* substring(const char* s, int offset, int n) {
47
+ if (s == 0 || strlen(s) == 0 || strlen(s) < offset || strlen(s) < (offset + n)) {
48
+ return 0;
49
+ }
50
+ return strndup(s + offset, n);
51
+ }
52
+
53
/*
 * N-gram score of source against target using n-grams of size n.
 *
 * source, target: NUL-terminated strings; NULL is treated as empty.
 * n:              n-gram size, must be >= 1.
 *
 * Returns 1 when both strings are empty and 0 when only one is. When either
 * string is shorter than n, returns the share of positions holding the same
 * byte relative to the longer length. Otherwise returns
 * 1 - (n-gram edit cost / longer length). Returns -1 when n < 1 or when
 * working memory cannot be allocated.
 */
double ngram_distance (const char *source, const char *target, int n) {
    int sl = source ? (int) strlen(source) : 0;
    int tl = target ? (int) strlen(target) : 0;
    int longest = sl > tl ? sl : tl;

    if (sl == 0 || tl == 0) {
        return (sl == tl) ? 1 : 0;
    }

    // an n-gram needs at least one character; smaller sizes would lead to a
    // division by zero and to invalid buffer sizes further down
    if (n < 1) {
        return -1;
    }

    if (sl < n || tl < n) {
        int shortest = sl < tl ? sl : tl;
        int same = 0;
        for (int i = 0; i < shortest; i++) {
            if (source[i] == target[i]) {
                same++;
            }
        }
        return (double)same / (double)longest;
    }

    // sa: source preceded by n - 1 null characters (left zeroed by calloc)
    char* sa = calloc((size_t)sl + (size_t)n, sizeof(char));
    double* p = calloc((size_t)sl + 1, sizeof(double)); // 'previous' cost array, horizontally
    double* d = calloc((size_t)sl + 1, sizeof(double)); // cost array, horizontally
    char* t_j = calloc((size_t)n + 1, sizeof(char));    // jth n-gram of target
    if (!sa || !p || !d || !t_j) {
        free(sa);
        free(p);
        free(d);
        free(t_j);
        return -1;
    }

    memcpy(sa + (n - 1), source, (size_t)sl);

    for (int i = 0; i <= sl; i++) {
        p[i] = i;
    }

    for (int j = 1; j <= tl; j++) {
        // build the jth n-gram in place: it ends at target[j - 1]; positions
        // falling before the start of target hold the null prefix
        for (int ti = 0; ti < n; ti++) {
            int at = j - n + ti;
            t_j[ti] = at < 0 ? 0 : target[at];
        }

        d[0] = j;
        for (int i = 1; i <= sl; i++) {
            int cost = 0;
            int tn = n;

            // compare the ith n-gram of sa to t_j
            for (int ni = 0; ni < n; ni++) {
                char c = sa[i - 1 + ni];
                if (c != t_j[ni]) {
                    cost++;
                }
                else if (c == 0) { // discount matches on prefix
                    tn--;
                }
            }
            double ec = (double)cost / (double)tn;

            // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
            double best = d[i - 1] + 1;
            if (p[i] + 1 < best) {
                best = p[i] + 1;
            }
            if (p[i - 1] + ec < best) {
                best = p[i - 1] + ec;
            }
            d[i] = best;
        }

        // the row just computed becomes the 'previous' row
        double* _d = p;
        p = d;
        d = _d;
    }

    // the last action of the loop swapped d and p, so p holds the most
    // recent cost counts
    double p_sl = p[sl];

    free(p);
    free(d);
    free(t_j);
    free(sa);

    return 1.0 - (p_sl / (double)longest);
}
data/hotwater.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'hotwater/version'

Gem::Specification.new do |gem|
  gem.name          = "hotwater"
  gem.version       = Hotwater::VERSION
  gem.authors       = ["Colin Surprenant"]
  gem.email         = ["colin.surprenant@gmail.com"]
  gem.description   = "Ruby & JRuby gem with fast string edit distance C implementation using FFI bindings"
  gem.summary       = "Fast string edit distance"
  gem.homepage      = "http://github.com/colinsurprenant/hotwater"
  # Mirrors LICENSE.txt: Apache 2.0 for the gem itself, plus the three-clause
  # BSD terms attached to the C code taken from the jellyfish project.
  gem.licenses      = ["Apache-2.0", "BSD-3-Clause"]

  # Package every file tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  # The native library is built at install time through this Rakefile.
  gem.extensions    = ["ext/hotwater/Rakefile"]

  gem.add_dependency 'rake'
  gem.add_dependency 'ffi'
  gem.add_dependency 'ffi-compiler'

  gem.add_development_dependency 'rspec'
end
@@ -0,0 +1,26 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # int damerau_levenshtein_distance(const char *str1, const char *str2)
    attach_function :damerau_levenshtein_distance, [:string, :string], :int
  end

  # Damerau-Levenshtein distance between s1 and s2, computed by the native
  # library.
  #
  # @param s1 [String]
  # @param s2 [String]
  # @return [Integer] the distance reported by the native function
  # @raise [RuntimeError] when the native function signals failure with -1
  def damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Damerau-Levenshtein distance scaled by the longer string:
  # (longest - distance) / longest.
  #
  # Lengths are measured in bytes because the native code compares bytes.
  # Two empty strings yield 0.0 (nothing to normalize against); identical
  # non-empty strings yield 1.0 rather than being short-circuited to 0.0.
  #
  # @param s1 [String]
  # @param s2 [String]
  # @return [Float]
  # @raise [RuntimeError] when the native function signals failure with -1
  def normalized_damerau_levenshtein_distance(s1, s2)
    distance = damerau_levenshtein_distance(s1, s2)
    longest = [s1.bytesize, s2.bytesize].max
    return 0.0 if longest.zero?
    (longest - distance.to_f) / longest
  end

  module_function :damerau_levenshtein_distance, :normalized_damerau_levenshtein_distance

end
@@ -0,0 +1,26 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # double jaro_distance(const char *ying, const char *yang)
    attach_function :jaro_distance, [:string, :string], :double

    # double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
    attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
  end

  # Jaro score of s1 and s2 as reported by the native library.
  # Raises when the native call returns a negative value.
  def jaro_distance(s1, s2)
    C::jaro_distance(s1, s2).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  # Jaro-Winkler score of s1 and s2 as reported by the native library;
  # long_tolerance is handed through to the native function unchanged.
  # Raises when the native call returns a negative value.
  def jaro_winkler_distance(s1, s2, long_tolerance = false)
    C::jaro_winkler_distance(s1, s2, long_tolerance).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  module_function :jaro_distance, :jaro_winkler_distance
end
@@ -0,0 +1,25 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # int levenshtein_distance(const char *str1, const char *str2);
    attach_function :levenshtein_distance, [:string, :string], :int
  end

  # Levenshtein distance between s1 and s2, computed by the native library.
  #
  # @param s1 [String]
  # @param s2 [String]
  # @return [Integer] the distance reported by the native function
  # @raise [RuntimeError] when the native function signals failure with -1
  def levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Levenshtein distance scaled by the longer string:
  # (longest - distance) / longest.
  #
  # Lengths are measured in bytes because the native code compares bytes.
  # Two empty strings yield 0.0 (nothing to normalize against); identical
  # non-empty strings yield 1.0 rather than being short-circuited to 0.0.
  #
  # @param s1 [String]
  # @param s2 [String]
  # @return [Float]
  # @raise [RuntimeError] when the native function signals failure with -1
  def normalized_levenshtein_distance(s1, s2)
    distance = levenshtein_distance(s1, s2)
    longest = [s1.bytesize, s2.bytesize].max
    return 0.0 if longest.zero?
    (longest - distance.to_f) / longest
  end

  module_function :levenshtein_distance, :normalized_levenshtein_distance
end
@@ -0,0 +1,18 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # double ngram_distance(const char *source, const char *target, int n);
    attach_function :ngram_distance, [:string, :string, :int], :double
  end

  # N-gram score of s1 against s2, computed by the native library.
  #
  # @param s1 [String]
  # @param s2 [String]
  # @param n [Integer] n-gram size, at least 1 (2 = bigram, the default)
  # @return [Float] the score reported by the native function
  # @raise [ArgumentError] when n is smaller than 1
  # @raise [RuntimeError] when the native function signals failure with a
  #   negative value
  def ngram_distance(s1, s2, n = 2)
    # Sizes below 1 have no meaning for an n-gram; reject them here instead
    # of handing them to the native code.
    raise ArgumentError, "n must be >= 1" if n < 1

    result = C::ngram_distance(s1, s2, n)
    raise("memory allocation error") if result < 0.0
    result
  end

  module_function :ngram_distance

end
@@ -0,0 +1,3 @@
1
module Hotwater
  # Gem version string; read by hotwater.gemspec when the gem is built.
  VERSION = "0.1.0"
end
data/lib/hotwater.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'hotwater/version'
2
+ require 'ffi'
3
+ require 'ffi-compiler/loader'
4
+
5
module Hotwater
  # Namespace holding the raw FFI bindings. The per-algorithm files required
  # after this module reopen it and attach their native functions.
  module C
    extend FFI::Library
    # Bind to the library that FFI::Compiler::Loader resolves for 'hotwater'
    # (presumably the one built from ext/hotwater -- confirm against ffi-compiler).
    ffi_lib FFI::Compiler::Loader.find('hotwater')
  end
end
11
+
12
+ require 'hotwater/levenshtein_ffi'
13
+ require 'hotwater/damerau_levenshtein_ffi'
14
+ require 'hotwater/jaro_ffi'
15
+ require 'hotwater/ngram_ffi'
16
+
@@ -0,0 +1,29 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute damerau_levenshtein_distance" do
7
+ Hotwater.damerau_levenshtein_distance("", "").should == 0
8
+ Hotwater.damerau_levenshtein_distance("abc", "").should == 3
9
+ Hotwater.damerau_levenshtein_distance("bc", "abc").should == 1
10
+ Hotwater.damerau_levenshtein_distance("ca", "abc").should == 3
11
+ Hotwater.damerau_levenshtein_distance("abc", "acb").should == 1
12
+ Hotwater.damerau_levenshtein_distance("kitten", "sitting").should == 3
13
+ Hotwater.damerau_levenshtein_distance("Saturday", "Sunday").should == 3
14
+ Hotwater.damerau_levenshtein_distance("teusday", "tuesday").should == 1
15
+ Hotwater.damerau_levenshtein_distance("teusday", "thursday").should == 2
16
+ end
17
+
18
+ it "should compute normalized_damerau_levenshtein_distance" do
19
+ Hotwater.normalized_damerau_levenshtein_distance("", "").should == 0.0
20
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "").should == 0.0
21
+ Hotwater.normalized_damerau_levenshtein_distance("bc", "abc").round(4).should == 0.6667
22
+ Hotwater.normalized_damerau_levenshtein_distance("ca", "abc").should == 0.0
23
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4).should == 0.6667
24
+ Hotwater.normalized_damerau_levenshtein_distance("kitten", "sitting").round(4).should == 0.5714
25
+ Hotwater.normalized_damerau_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
+ Hotwater.normalized_damerau_levenshtein_distance("teusday", "tuesday").round(4).should == 0.8571
27
+ Hotwater.normalized_damerau_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+ end
29
+ end
@@ -0,0 +1,19 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute jaro_distance" do
7
+ Hotwater.jaro_distance("", "").should == 0.0
8
+ Hotwater.jaro_distance("dixon", "dicksonx").round(4).should == 0.7667
9
+ Hotwater.jaro_distance("martha", "marhta").round(4).should == 0.9444
10
+ Hotwater.jaro_distance("dwayne", "duane").round(4).should == 0.8222
11
+ end
12
+
13
+ it "should compute jaro_winkler_distance" do
14
+ Hotwater.jaro_winkler_distance("", "").should == 0.0
15
+ Hotwater.jaro_winkler_distance("dixon", "dicksonx").round(4).should == 0.8133
16
+ Hotwater.jaro_winkler_distance("martha", "marhta").round(4).should == 0.9611
17
+ Hotwater.jaro_winkler_distance("dwayne", "duane").round(4).should == 0.84
18
+ end
19
+ end
@@ -0,0 +1,29 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute levenshtein_distance" do
7
+ Hotwater.levenshtein_distance("", "").should == 0
8
+ Hotwater.levenshtein_distance("abc", "").should == 3
9
+ Hotwater.levenshtein_distance("bc", "abc").should == 1
10
+ Hotwater.levenshtein_distance("ca", "abc").should == 3
11
+ Hotwater.levenshtein_distance("abc", "acb").should == 2
12
+ Hotwater.levenshtein_distance("kitten", "sitting").should == 3
13
+ Hotwater.levenshtein_distance("Saturday", "Sunday").should == 3
14
+ Hotwater.levenshtein_distance("teusday", "tuesday").should == 2
15
+ Hotwater.levenshtein_distance("teusday", "thursday").should == 2
16
+ end
17
+
18
+ it "should compute normalized_levenshtein_distance" do
19
+ Hotwater.normalized_levenshtein_distance("", "").should == 0.0
20
+ Hotwater.normalized_levenshtein_distance("abc", "").should == 0.0
21
+ Hotwater.normalized_levenshtein_distance("bc", "abc").round(4).should == 0.6667
22
+ Hotwater.normalized_levenshtein_distance("ca", "abc").should == 0.0
23
+ Hotwater.normalized_levenshtein_distance("abc", "acb").round(4).should == 0.3333
24
+ Hotwater.normalized_levenshtein_distance("kitten", "sitting").round(4).should == 0.5714
25
+ Hotwater.normalized_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
+ Hotwater.normalized_levenshtein_distance("teusday", "tuesday").round(4).should == 0.7143
27
+ Hotwater.normalized_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+ end
29
+ end
@@ -0,0 +1,75 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute unigram distance" do
7
+ Hotwater.ngram_distance("", "al", 1).round(4).should == 0.0
8
+ Hotwater.ngram_distance("al", "al", 1).round(4).should == 1.0
9
+ Hotwater.ngram_distance("a", "a", 1).round(4).should == 1.0
10
+ Hotwater.ngram_distance("b", "a", 1).round(4).should == 0.0
11
+ Hotwater.ngram_distance("martha", "marhta", 1).round(4).should == 0.6667
12
+ Hotwater.ngram_distance("jones", "johnson", 1).round(4).should == 0.4286
13
+ Hotwater.ngram_distance("natural", "contrary", 1).round(4).should == 0.25
14
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 1).round(4).should == 0.75
15
+ Hotwater.ngram_distance("dwayne", "duane", 1).round(4).should == 0.6667
16
+ Hotwater.ngram_distance("dixon", "dicksonx", 1).round(4).should == 0.5
17
+ Hotwater.ngram_distance("six", "ten", 1).round(4).should == 0.0
18
+ Hotwater.ngram_distance("zac ephron", "zac efron", 1).round(4).should == Hotwater.ngram_distance("zac ephron", "kai ephron", 1).round(4)
19
+ Hotwater.ngram_distance("brittney spears", "britney spears", 1).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 1)
20
+ Hotwater.ngram_distance("12345678", "12890678", 1).round(4).should == Hotwater.ngram_distance("12345678", "72385698", 1)
21
+ end
22
+
23
+ it "should compute bigram distance" do
24
+ Hotwater.ngram_distance("", "al", 2).round(4).should == 0.0
25
+ Hotwater.ngram_distance("al", "al", 2).round(4).should == 1.0
26
+ Hotwater.ngram_distance("a", "a", 2).round(4).should == 1.0
27
+ Hotwater.ngram_distance("b", "a", 2).round(4).should == 0.0
28
+ Hotwater.ngram_distance("a", "aa", 2).round(4).should == 0.5
29
+ Hotwater.ngram_distance("martha", "marhta", 2).round(4).should == 0.6667
30
+ Hotwater.ngram_distance("jones", "johnson", 2).round(4).should == 0.4286
31
+ Hotwater.ngram_distance("natural", "contrary", 2).round(4).should == 0.25
32
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 2).round(4).should == 0.625
33
+ Hotwater.ngram_distance("dwayne", "duane", 2).round(4).should == 0.5833
34
+ Hotwater.ngram_distance("dixon", "dicksonx", 2).round(4).should == 0.5
35
+ Hotwater.ngram_distance("six", "ten", 2).round(4).should == 0.0
36
+ Hotwater.ngram_distance("zac ephron", "zac efron", 2).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 2).round(4)
37
+ Hotwater.ngram_distance("brittney spears", "britney spears", 2).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 2)
38
+ Hotwater.ngram_distance("0012345678", "0012890678", 2).round(4).should == Hotwater.ngram_distance("0012345678", "0072385698", 2)
39
+ end
40
+
41
+ it "should compute bigram distance by default" do
42
+ Hotwater.ngram_distance("", "al").round(4).should == 0.0
43
+ Hotwater.ngram_distance("al", "al").round(4).should == 1.0
44
+ Hotwater.ngram_distance("a", "a").round(4).should == 1.0
45
+ Hotwater.ngram_distance("b", "a").round(4).should == 0.0
46
+ Hotwater.ngram_distance("a", "aa").round(4).should == 0.5
47
+ Hotwater.ngram_distance("martha", "marhta").round(4).should == 0.6667
48
+ Hotwater.ngram_distance("jones", "johnson").round(4).should == 0.4286
49
+ Hotwater.ngram_distance("natural", "contrary").round(4).should == 0.25
50
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz").round(4).should == 0.625
51
+ Hotwater.ngram_distance("dwayne", "duane").round(4).should == 0.5833
52
+ Hotwater.ngram_distance("dixon", "dicksonx").round(4).should == 0.5
53
+ Hotwater.ngram_distance("six", "ten").round(4).should == 0.0
54
+ Hotwater.ngram_distance("zac ephron", "zac efron").round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron").round(4)
55
+ Hotwater.ngram_distance("brittney spears", "britney spears").round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman")
56
+ Hotwater.ngram_distance("0012345678", "0012890678").round(4).should == Hotwater.ngram_distance("0012345678", "0072385698")
57
+ end
58
+
59
+ it "should compute trigram distance" do
60
+ Hotwater.ngram_distance("", "al", 3).round(4).should == 0.0
61
+ Hotwater.ngram_distance("al", "al", 3).round(4).should == 1.0
62
+ Hotwater.ngram_distance("a", "a", 3).round(4).should == 1.0
63
+ Hotwater.ngram_distance("b", "a", 3).round(4).should == 0.0
64
+ Hotwater.ngram_distance("martha", "marhta", 3).round(4).should == 0.7222
65
+ Hotwater.ngram_distance("jones", "johnson", 3).round(4).should == 0.4762
66
+ Hotwater.ngram_distance("natural", "contrary", 3).round(4).should == 0.2083
67
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 3).round(4).should == 0.5625
68
+ Hotwater.ngram_distance("dwayne", "duane", 3).round(4).should == 0.5278
69
+ Hotwater.ngram_distance("dixon", "dicksonx", 3).round(4).should == 0.4583
70
+ Hotwater.ngram_distance("six", "ten", 3).round(4).should == 0.0
71
+ Hotwater.ngram_distance("zac ephron", "zac efron", 3).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 3).round(4)
72
+ Hotwater.ngram_distance("brittney spears", "britney spears", 3).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 3)
73
+ Hotwater.ngram_distance("0012345678", "0012890678", 3).round(4).should < Hotwater.ngram_distance("0012345678", "0072385698", 3)
74
+ end
75
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hotwater
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Colin Surprenant
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ffi
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: ffi-compiler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rspec
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Ruby & JRuby gem with fast string edit distance C implementation using
79
+ FFI bindings
80
+ email:
81
+ - colin.surprenant@gmail.com
82
+ executables: []
83
+ extensions:
84
+ - ext/hotwater/Rakefile
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - Gemfile
89
+ - LICENSE.txt
90
+ - README.md
91
+ - Rakefile
92
+ - ext/hotwater/Rakefile
93
+ - ext/hotwater/damerau_levenshtein.c
94
+ - ext/hotwater/hotwater.h
95
+ - ext/hotwater/jaro.c
96
+ - ext/hotwater/levenshtein.c
97
+ - ext/hotwater/ngram.c
98
+ - hotwater.gemspec
99
+ - lib/hotwater.rb
100
+ - lib/hotwater/damerau_levenshtein_ffi.rb
101
+ - lib/hotwater/jaro_ffi.rb
102
+ - lib/hotwater/levenshtein_ffi.rb
103
+ - lib/hotwater/ngram_ffi.rb
104
+ - lib/hotwater/version.rb
105
+ - spec/hotwater/damerau_levenshtein_ffi_spec.rb
106
+ - spec/hotwater/jaro_ffi_spec.rb
107
+ - spec/hotwater/levenshtein_ffi_spec.rb
108
+ - spec/hotwater/ngram_ffi_spec.rb
109
+ homepage: http://github.com/colinsurprenant/hotwater
110
+ licenses: []
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ segments:
122
+ - 0
123
+ hash: -289401610859280349
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ none: false
126
+ requirements:
127
+ - - ! '>='
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ segments:
131
+ - 0
132
+ hash: -289401610859280349
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.23
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Fast string edit distance
139
+ test_files:
140
+ - spec/hotwater/damerau_levenshtein_ffi_spec.rb
141
+ - spec/hotwater/jaro_ffi_spec.rb
142
+ - spec/hotwater/levenshtein_ffi_spec.rb
143
+ - spec/hotwater/ngram_ffi_spec.rb