hotwater 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .ruby-version
19
+ vendor/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in hotwater.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,45 @@
1
+ Copyright (c) 2013 Colin Surprenant <colin.surprenant@gmail.com>
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
15
+ --------
16
+
17
+ C code from the https://github.com/sunlightlabs/jellyfish project
18
+
19
+ Copyright (c) 2010, Sunlight Labs
20
+
21
+ All rights reserved.
22
+
23
+ Redistribution and use in source and binary forms, with or without modification,
24
+ are permitted provided that the following conditions are met:
25
+
26
+ * Redistributions of source code must retain the above copyright notice,
27
+ this list of conditions and the following disclaimer.
28
+ * Redistributions in binary form must reproduce the above copyright notice,
29
+ this list of conditions and the following disclaimer in the documentation
30
+ and/or other materials provided with the distribution.
31
+ * Neither the name of Sunlight Labs nor the names of its contributors may be
32
+ used to endorse or promote products derived from this software without
33
+ specific prior written permission.
34
+
35
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
39
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # Hotwater v0.1.0
2
+
3
+ Ruby & JRuby gem with fast **string edit distance** C implementations using FFI bindings.
4
+
5
+ ### Algorithms
6
+
7
+ - Levenshtein & Damerau Levenshtein
8
+ - Jaro & Jaro Winkler
9
+ - N-Gram
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'hotwater'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install hotwater
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ Hotwater.levenshtein_distance("abc", "acb") # => 2
29
+ Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
30
+
31
+ # do normalization based on the string sizes
32
+ Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
33
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
34
+
35
+ Hotwater.jaro_distance("martha", "marhta").round(4) # => 0.9444
36
+ Hotwater.jaro_winkler_distance("martha", "marhta").round(4) # => 0.9611
37
+
38
+ # default is bigram
39
+ Hotwater.ngram_distance("natural", "contrary").round(4) # => 0.25
40
+
41
+ # specify trigram
42
+ Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
43
+ ```
44
+
45
+ ## Development
46
+
47
+ 1. Fork it
48
+ 2. Install gems `$ bundle install`
49
+ 3. Compile lib `$ rake compile`
50
+ 4. Run specs `$ rake spec`
51
+ 5. Clean compiler generated files `$ rake clean`
52
+
53
+ ## Contributing
54
+
55
+ 1. Fork it
56
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
57
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
58
+ 4. Push to the branch (`git push origin my-new-feature`)
59
+ 5. Create new Pull Request
60
+
61
+ ## Credits
62
+ - Some C code from the https://github.com/sunlightlabs/jellyfish project
63
+ - N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
64
+
65
+ ## Author
66
+ Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
67
+
68
+ ## License
69
+ Hotwater is distributed under the Apache License, Version 2.0.
70
+
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ require 'bundler/setup'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'bundler/gem_tasks'
5
+ require 'rspec/core/rake_task'
6
+ require 'ffi-compiler/compile_task'
7
+
8
task :default => :spec

# Register the RSpec task once, when the Rakefile is loaded.
# RSpec::Core::RakeTask.new itself defines a Rake task; calling it from inside
# the body of a hand-written :spec task would only register the real task
# while `rake spec` is already executing.
desc "run specs"
RSpec::Core::RakeTask.new(:spec)

# Build tasks for the C extension, generated by ffi-compiler and kept in
# their own namespace so that `rake compile` can alias the default one.
desc "compiler tasks"
namespace "ffi-compiler" do
  FFI::Compiler::CompileTask.new('ext/hotwater/hotwater')
end
task :compile => ["ffi-compiler:default"]

# Files removed by `rake clean`: compiler output under ext/ and lib/.
CLEAN.include('ext/**/*{.o,.log,.so,.bundle}')
CLEAN.include('lib/**/*{.o,.log,.so,.bundle}')
CLEAN.include('ext/**/Makefile')
25
+
@@ -0,0 +1,5 @@
1
+ require 'ffi-compiler/compile_task'
2
+
3
+ FFI::Compiler::CompileTask.new('hotwater') do |c|
4
+ # nothing yet bro
5
+ end
@@ -0,0 +1,68 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ #include "hotwater.h"
4
+ #include <string.h>
5
+
6
/*
 * Damerau-Levenshtein distance (optimal string alignment variant) between
 * two NUL-terminated byte strings: the number of single-byte insertions,
 * deletions, substitutions and adjacent transpositions needed to turn s1
 * into s2.
 *
 * A NULL pointer is treated as the empty string.
 * Returns the distance, or -1 if the work matrix cannot be allocated.
 */
int damerau_levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;

    size_t i, j;
    size_t deletion, insertion, substitution, transposition, best;
    size_t cost;
    size_t *dist;

    /* With one side empty the distance is simply the other side's length. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* dist is a rows x cols matrix stored row by row. */
    dist = malloc(rows * cols * sizeof(size_t));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from or to the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = j;
    }

    for (i = 1; i < rows; i++) {
        for (j = 1; j < cols; j++) {
            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;

            deletion     = dist[((i - 1) * cols) + j] + 1;
            insertion    = dist[(i * cols) + (j - 1)] + 1;
            substitution = dist[((i - 1) * cols) + (j - 1)] + cost;

            best = deletion < insertion ? deletion : insertion;
            if (substitution < best) {
                best = substitution;
            }

            /*
             * Adjacent transposition. Two characters are available on each
             * side as soon as i > 1 and j > 1; the previous `> 2` guard
             * skipped a swap of the first two characters ("ab" vs "ba").
             */
            if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] &&
                s1[i - 2] == s2[j - 1]) {
                transposition = dist[((i - 2) * cols) + (j - 2)] + cost;
                if (transposition < best) {
                    best = transposition;
                }
            }

            dist[(i * cols) + j] = best;
        }
    }

    /* The bottom-right cell holds the distance between the full strings. */
    best = dist[(cols * rows) - 1];
    free(dist);

    return (int) best;
}
@@ -0,0 +1,23 @@
1
/*
 * hotwater.h - shared macros and prototypes of the hotwater C library.
 *
 * The include guard avoids the previous `_HOTWATER_H_` spelling: identifiers
 * starting with an underscore followed by an uppercase letter are reserved
 * for the C implementation.
 */
#ifndef HOTWATER_H
#define HOTWATER_H

#include <stdbool.h>
#include <stdlib.h>

/*
 * Two-argument minimum / maximum. These are macros: each argument may be
 * evaluated more than once, so do not pass expressions with side effects.
 */
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

/* Jaro / Jaro-Winkler weights; a negative value signals an allocation error. */
double jaro_winkler_distance(const char *str1, const char *str2, bool long_tolerance);
double jaro_distance(const char *str1, const char *str2);

/* Edit distances; -1 signals an allocation error. */
int levenshtein_distance(const char *str1, const char *str2);
int damerau_levenshtein_distance(const char *str1, const char *str2);

/* N-gram score computed with n-grams of size n; -1 signals an error. */
double ngram_distance(const char *source, const char *target, int n);

#endif /* HOTWATER_H */
@@ -0,0 +1,144 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ /*
4
+ Colin Surprenant, Feb 2013
5
+ - modified error return to -1 and small cosmetic cleanups
6
+ */
7
+
8
+
9
+ #include <ctype.h>
10
+ #include <string.h>
11
+ #include <stdio.h>
12
+ #include <stdlib.h>
13
+ #include "hotwater.h"
14
+
15
+ #define NOTNUM(c) ((c>57) || (c<48))
16
+ #define INRANGE(c) ((c>0) && (c<91))
17
+
18
+ /* borrowed heavily from strcmp95.c
19
+ * http://www.census.gov/geo/msb/stand/strcmp.c
20
+ */
21
/* True when c is not an ASCII decimal digit ('0'..'9'). */
static int jw_not_digit(char c)
{
    return (c > '9') || (c < '0');
}

/*
 * Shared implementation of the Jaro and Jaro-Winkler weights.
 *
 * Arguments:
 *   ying, yang      the two NUL-terminated strings to compare.
 *   long_tolerance  when set (and winklerize is set), additionally boosts the
 *                   weight of long strings that share many characters. Not
 *                   appropriate for fixed-length fields such as phone or
 *                   social security numbers.
 *   winklerize      when set, applies the Winkler boost for up to four
 *                   agreeing, non-digit leading characters.
 *
 * Returns 0 when either string is NULL or empty or when no characters match,
 * -1.0 when the scratch buffer cannot be allocated, and the computed weight
 * otherwise.
 *
 * The match flags live on the heap rather than in alloca() storage: the
 * inputs are caller-sized, so a stack allocation could overflow the stack,
 * and alloca() has no failure return for the -1.0 path to detect.
 */
double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bool winklerize)
{
    char *flags, *ying_flag, *yang_flag;

    double weight;

    long ying_length, yang_length, min_len;
    long search_range;
    long lowlim, hilim;
    long trans_count, common_chars;

    long i, j, k;

    /* Nothing to compare when a string is missing or blank. */
    if (ying == NULL || yang == NULL) return 0;
    ying_length = (long) strlen(ying);
    yang_length = (long) strlen(yang);
    if (!ying_length || !yang_length) return 0;

    /*
     * NOTE(review): despite its name, min_len receives the LONGER length,
     * exactly as in the code this was taken from. It only influences the
     * long_tolerance test below; confirm against strcmp95 before changing.
     */
    search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;

    /* One zero-filled buffer holds both flag arrays. */
    flags = calloc((size_t) ying_length + (size_t) yang_length + 2, sizeof(char));
    if (!flags) return -1.0;
    ying_flag = flags;
    yang_flag = flags + ying_length + 1;

    search_range = (search_range / 2) - 1;
    if (search_range < 0) search_range = 0;

    /* Looking only within the search range, count and flag the matched pairs. */
    common_chars = 0;
    for (i = 0; i < ying_length; i++) {
        lowlim = (i >= search_range) ? i - search_range : 0;
        hilim = (i + search_range <= yang_length - 1) ? (i + search_range) : yang_length - 1;
        for (j = lowlim; j <= hilim; j++) {
            if (!yang_flag[j] && yang[j] == ying[i]) {
                yang_flag[j] = 1;
                ying_flag[i] = 1;
                common_chars++;
                break;
            }
        }
    }

    /* If no characters in common - return. */
    if (!common_chars) {
        free(flags);
        return 0;
    }

    /*
     * Count the transpositions: walk the flagged characters of both strings
     * in order and count the positions where they disagree.
     */
    k = trans_count = 0;
    for (i = 0; i < ying_length; i++) {
        if (ying_flag[i]) {
            for (j = k; j < yang_length; j++) {
                if (yang_flag[j]) {
                    k = j + 1;
                    break;
                }
            }
            if (ying[i] != yang[j]) {
                trans_count++;
            }
        }
    }
    trans_count /= 2;

    /* The flags are not needed past this point. */
    free(flags);

    /* Main weight computation. */
    weight = common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
        + ((double) (common_chars - trans_count)) / ((double) common_chars);
    weight /= 3.0;

    /* Continue to boost the weight if the strings are similar. */
    if (winklerize && weight > 0.7) {

        /* Adjust for having up to the first 4 characters in common. */
        j = (min_len >= 4) ? 4 : min_len;
        for (i = 0; (i < j) && (ying[i] == yang[i]) && jw_not_digit(ying[i]); i++);
        if (i) {
            weight += i * 0.1 * (1.0 - weight);
        }

        /* Optionally adjust for long strings.
           After agreeing beginning chars, at least two more must agree and
           the agreeing characters must be > .5 of remaining characters.
        */
        if (long_tolerance && (min_len > 4) && (common_chars > i + 1) && (2 * common_chars >= min_len + i)) {
            if (jw_not_digit(ying[0])) {
                weight += (1.0 - weight) *
                    ((double) (common_chars - i - 1) / ((double) (ying_length + yang_length - i * 2 + 2)));
            }
        }
    }

    return weight;
}


/* Jaro-Winkler weight of ying and yang; see _jaro_winkler for long_tolerance. */
double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
{
    return _jaro_winkler(ying, yang, long_tolerance, true);
}

/* Plain Jaro weight of ying and yang (no Winkler prefix boost). */
double jaro_distance(const char *ying, const char *yang)
{
    return _jaro_winkler(ying, yang, false, false);
}
@@ -0,0 +1,51 @@
1
+ /* from the https://github.com/sunlightlabs/jellyfish project */
2
+
3
+ #include "hotwater.h"
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+ #include <stdio.h>
7
+
8
/*
 * Levenshtein distance between two NUL-terminated byte strings: the number
 * of single-byte insertions, deletions and substitutions needed to turn s1
 * into s2.
 *
 * A NULL pointer is treated as the empty string, matching
 * damerau_levenshtein_distance.
 * Returns the distance, or -1 if the work matrix cannot be allocated.
 */
int levenshtein_distance(const char *s1, const char *s2)
{
    size_t s1_len = s1 == NULL ? 0 : strlen(s1);
    size_t s2_len = s2 == NULL ? 0 : strlen(s2);
    size_t rows = s1_len + 1;
    size_t cols = s2_len + 1;
    size_t i, j;

    unsigned result;
    unsigned deletion, insertion, substitution, best;
    unsigned *dist;

    /* With one side empty the distance is simply the other side's length. */
    if (s1_len == 0) {
        return (int) s2_len;
    }
    if (s2_len == 0) {
        return (int) s1_len;
    }

    /* dist is a rows x cols matrix stored row by row. */
    dist = malloc(rows * cols * sizeof(unsigned));
    if (!dist) {
        return -1;
    }

    /* First column / first row: distance from or to the empty prefix. */
    for (i = 0; i < rows; i++) {
        dist[i * cols] = (unsigned) i;
    }
    for (j = 0; j < cols; j++) {
        dist[j] = (unsigned) j;
    }

    /*
     * Fill the matrix row by row so memory is visited sequentially. Each
     * cell only needs its upper, left and upper-left neighbours, which are
     * already final in this order.
     */
    for (i = 1; i < rows; i++) {
        for (j = 1; j < cols; j++) {
            if (s1[i - 1] == s2[j - 1]) {
                /* Equal characters: carry the diagonal value over. */
                dist[(i * cols) + j] = dist[((i - 1) * cols) + (j - 1)];
            } else {
                deletion     = dist[((i - 1) * cols) + j] + 1;
                insertion    = dist[(i * cols) + (j - 1)] + 1;
                substitution = dist[((i - 1) * cols) + (j - 1)] + 1;

                best = deletion < insertion ? deletion : insertion;
                if (substitution < best) {
                    best = substitution;
                }
                dist[(i * cols) + j] = best;
            }
        }
    }

    /* The bottom-right cell holds the distance between the full strings. */
    result = dist[(cols * rows) - 1];

    free(dist);

    return (int) result;
}
@@ -0,0 +1,173 @@
1
+ /*
2
+ Colin Surprenant, Feb 2013
3
+ - converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
4
+ - fixed segfault bug in substring n parameter, which did not surface in Java
5
+ */
6
+
7
+ /* package org.apache.lucene.search.spell; */
8
+
9
+ /**
10
+ * Licensed to the Apache Software Foundation (ASF) under one or more
11
+ * contributor license agreements. See the NOTICE file distributed with
12
+ * this work for additional information regarding copyright ownership.
13
+ * The ASF licenses this file to You under the Apache License, Version 2.0
14
+ * (the "License"); you may not use this file except in compliance with
15
+ * the License. You may obtain a copy of the License at
16
+ *
17
+ * http://www.apache.org/licenses/LICENSE-2.0
18
+ *
19
+ * Unless required by applicable law or agreed to in writing, software
20
+ * distributed under the License is distributed on an "AS IS" BASIS,
21
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
+ * See the License for the specific language governing permissions and
23
+ * limitations under the License.
24
+ */
25
+
26
+ /**
27
+ * N-Gram version of edit distance based on paper by Grzegorz Kondrak,
28
+ * "N-gram similarity and distance". Proceedings of the Twelfth International
29
+ * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
30
+ * Buenos Aires, Argentina, November 2005.
31
+ * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
32
+ *
33
+ * This implementation uses the position-based optimization to compute partial
34
+ * matches of n-gram sub-strings and adds a null-character prefix of size n-1
35
+ * so that the first character is contained in the same number of n-grams as
36
+ * a middle character. Null-character prefix matches are discounted so that
37
+ * strings with no matching characters will return a distance of 0.
38
+ *
39
+ */
40
+
41
+ #include "hotwater.h"
42
+ #include <string.h>
43
+ #include <stdlib.h>
44
+ #include <stdio.h>
45
+
46
+ char* substring(const char* s, int offset, int n) {
47
+ if (s == 0 || strlen(s) == 0 || strlen(s) < offset || strlen(s) < (offset + n)) {
48
+ return 0;
49
+ }
50
+ return strndup(s + offset, n);
51
+ }
52
+
53
/*
 * N-gram score of source and target using n-grams of size n.
 *
 * Identical strings score 1 and strings with no matching characters score 0.
 * Both strings are conceptually prefixed with n-1 NUL characters so that the
 * first character takes part in as many n-grams as a middle character;
 * matches on that prefix are discounted.
 *
 * A NULL pointer is treated as the empty string. When either string is empty
 * the result is 1 if both are empty and 0 otherwise. When either string is
 * shorter than n, the result is the fraction of positions that hold equal
 * characters.
 *
 * Returns -1 when n is smaller than 1 (for non-empty inputs) or when memory
 * cannot be allocated.
 */
double ngram_distance (const char *source, const char *target, int n) {
    int sl = source == NULL ? 0 : (int) strlen(source);
    int tl = target == NULL ? 0 : (int) strlen(target);
    int longest = sl > tl ? sl : tl;
    int shortest = sl < tl ? sl : tl;

    int i, j, ni;
    int cost, tn, t_start;

    char* sa;   // source with its NUL prefix of n-1 characters
    double* p;  // 'previous' cost array, horizontally
    double* d;  // cost array, horizontally
    double* _d; // placeholder to assist in swapping p and d
    double ec, best, p_sl;

    if (sl == 0 || tl == 0) {
        return (sl == tl) ? 1 : 0;
    }

    // an n-gram needs at least one character; n == 0 would divide by zero below
    if (n < 1) {
        return -1;
    }

    // a string shorter than n has no full n-gram: compare position by position
    if (sl < n || tl < n) {
        cost = 0;
        for (i = 0; i < shortest; i++) {
            if (source[i] == target[i]) {
                cost++;
            }
        }
        return (double)cost / (double)longest;
    }

    // calloc zero-fills, so the first n-1 bytes of sa already are the prefix
    sa = calloc((size_t)sl + (size_t)n, sizeof(char));
    p = calloc((size_t)sl + 1, sizeof(double));
    d = calloc((size_t)sl + 1, sizeof(double));
    if (!sa || !p || !d) {
        free(sa);
        free(p);
        free(d);
        return -1;
    }
    memcpy(sa + (n - 1), source, (size_t)sl);

    for (i = 0; i <= sl; i++) {
        p[i] = i;
    }

    for (j = 1; j <= tl; j++) {
        // The j-th n-gram of target covers target[j-n .. j-1]. Indexes below
        // zero fall into the NUL prefix, so the n-gram is read in place
        // instead of being copied into a scratch buffer on every iteration.
        t_start = j - n;
        d[0] = j;
        for (i = 1; i <= sl; i++) {
            cost = 0;
            tn = n;
            // compare the i-th n-gram of sa to the j-th n-gram of target
            for (ni = 0; ni < n; ni++) {
                char s_ch = sa[i - 1 + ni];
                char t_ch = (t_start + ni < 0) ? 0 : target[t_start + ni];
                if (s_ch != t_ch) {
                    cost++;
                }
                else if (s_ch == 0) { // discount matches on prefix
                    tn--;
                }
            }
            // tn stays >= 1: the last character of every sa n-gram comes from source
            ec = (double)cost / (double)tn;

            // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
            best = d[i - 1] + 1;
            if (p[i] + 1 < best) {
                best = p[i] + 1;
            }
            if (p[i - 1] + ec < best) {
                best = p[i - 1] + ec;
            }
            d[i] = best;
        }
        // copy current distance counts to 'previous row' distance counts
        _d = p;
        p = d;
        d = _d;
    }

    // our last action in the above loop was to switch d and p, so p now
    // actually has the most recent cost counts
    p_sl = p[sl];

    free(p);
    free(d);
    free(sa);

    return 1.0 - (p_sl / (double)longest);
}
data/hotwater.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'hotwater/version'
5
+
6
# Gem packaging metadata for hotwater.
Gem::Specification.new do |gem|
  gem.name          = "hotwater"
  gem.version       = Hotwater::VERSION
  gem.authors       = ["Colin Surprenant"]
  gem.email         = ["colin.surprenant@gmail.com"]
  gem.description   = "Ruby & JRuby gem with fast string edit distance C implementation using FFI bindings"
  gem.summary       = "Fast string edit distance"
  gem.homepage      = "http://github.com/colinsurprenant/hotwater"
  # Declared so the published gem carries the licence stated in LICENSE.txt.
  gem.license       = "Apache-2.0"

  # Package every file tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  # The native library is compiled at install time through this Rakefile,
  # which is why rake and ffi-compiler are runtime dependencies.
  gem.extensions    = ["ext/hotwater/Rakefile"]

  gem.add_dependency 'rake'
  gem.add_dependency 'ffi'
  gem.add_dependency 'ffi-compiler'

  gem.add_development_dependency 'rspec'
end
@@ -0,0 +1,26 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # int damerau_levenshtein_distance(const char *str1, const char *str2)
    attach_function :damerau_levenshtein_distance, [:string, :string], :int
  end

  # Damerau-Levenshtein edit distance between s1 and s2, computed by the C
  # library.
  #
  # @return [Integer] the number of edits
  # @raise [RuntimeError] when the C library reports an allocation failure (-1)
  def damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Damerau-Levenshtein distance scaled by the longer string, as a score
  # where 1.0 means identical strings and 0.0 means nothing in common.
  # Two empty strings score 0.0.
  #
  # The C library counts bytes, so the score is normalized by byte length;
  # dividing by the character count could leave the 0.0..1.0 range for
  # multibyte strings. Identical non-empty strings score 1.0 rather than the
  # 0.0 the previous early return produced.
  #
  # @return [Float] the normalized score
  # @raise [RuntimeError] when the C library reports an allocation failure (-1)
  def normalized_damerau_levenshtein_distance(s1, s2)
    result = C::damerau_levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    max = [s1.to_s.bytesize, s2.to_s.bytesize].max
    return 0.0 if max == 0
    (max - result.to_f) / max
  end

  module_function :damerau_levenshtein_distance, :normalized_damerau_levenshtein_distance

end
@@ -0,0 +1,26 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # C prototype: double jaro_distance(const char *ying, const char *yang)
    attach_function :jaro_distance, [:string, :string], :double

    # C prototype: double jaro_winkler_distance(const char *ying, const char *yang, bool long_tolerance)
    attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
  end

  # Jaro weight of s1 and s2 as computed by the C library.
  # Raises when the library reports an error through a negative value.
  def jaro_distance(s1, s2)
    C.jaro_distance(s1, s2).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  # Jaro-Winkler weight of s1 and s2; long_tolerance is handed to the C
  # library unchanged.
  # Raises when the library reports an error through a negative value.
  def jaro_winkler_distance(s1, s2, long_tolerance = false)
    C.jaro_winkler_distance(s1, s2, long_tolerance).tap do |score|
      raise("memory allocation error") if score < 0.0
    end
  end

  module_function :jaro_distance, :jaro_winkler_distance
end
@@ -0,0 +1,25 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # int levenshtein_distance(const char *str1, const char *str2);
    attach_function :levenshtein_distance, [:string, :string], :int
  end

  # Levenshtein edit distance between s1 and s2, computed by the C library.
  #
  # @return [Integer] the number of edits
  # @raise [RuntimeError] when the C library reports an allocation failure (-1)
  def levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    result
  end

  # Levenshtein distance scaled by the longer string, as a score where 1.0
  # means identical strings and 0.0 means nothing in common. Two empty
  # strings score 0.0.
  #
  # The C library counts bytes, so the score is normalized by byte length;
  # dividing by the character count could leave the 0.0..1.0 range for
  # multibyte strings. Identical non-empty strings score 1.0 rather than the
  # 0.0 the previous early return produced.
  #
  # @return [Float] the normalized score
  # @raise [RuntimeError] when the C library reports an allocation failure (-1)
  def normalized_levenshtein_distance(s1, s2)
    result = C::levenshtein_distance(s1, s2)
    raise("memory allocation error") if result == -1
    max = [s1.to_s.bytesize, s2.to_s.bytesize].max
    return 0.0 if max == 0
    (max - result.to_f) / max
  end

  module_function :levenshtein_distance, :normalized_levenshtein_distance
end
@@ -0,0 +1,18 @@
1
+ require 'ffi'
2
+
3
module Hotwater

  module C
    # double ngram_distance(const char *source, const char *target, int n);
    attach_function :ngram_distance, [:string, :string, :int], :double
  end

  # N-gram score of s1 and s2 as computed by the C library.
  #
  # @param n [Integer] n-gram size, 2 (bigram) by default; must be at least 1
  # @return [Float] the score returned by the C library
  # @raise [ArgumentError] when n is smaller than 1
  # @raise [RuntimeError] when the C library reports an error (-1)
  def ngram_distance(s1, s2, n = 2)
    # An n-gram needs at least one character; reject the size here instead
    # of handing it to the C code.
    raise(ArgumentError, "n must be greater than or equal to 1") if n < 1
    result = C::ngram_distance(s1, s2, n)
    raise("memory allocation error") if result == -1
    result
  end

  module_function :ngram_distance

end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the hotwater gem.
module Hotwater
  # Release number of the gem; read by the gemspec.
  VERSION = '0.1.0'
end
data/lib/hotwater.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'hotwater/version'
2
+ require 'ffi'
3
+ require 'ffi-compiler/loader'
4
+
5
+ module Hotwater
6
+ module C
7
+ extend FFI::Library
8
+ ffi_lib FFI::Compiler::Loader.find('hotwater')
9
+ end
10
+ end
11
+
12
+ require 'hotwater/levenshtein_ffi'
13
+ require 'hotwater/damerau_levenshtein_ffi'
14
+ require 'hotwater/jaro_ffi'
15
+ require 'hotwater/ngram_ffi'
16
+
@@ -0,0 +1,29 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute damerau_levenshtein_distance" do
7
+ Hotwater.damerau_levenshtein_distance("", "").should == 0
8
+ Hotwater.damerau_levenshtein_distance("abc", "").should == 3
9
+ Hotwater.damerau_levenshtein_distance("bc", "abc").should == 1
10
+ Hotwater.damerau_levenshtein_distance("ca", "abc").should == 3
11
+ Hotwater.damerau_levenshtein_distance("abc", "acb").should == 1
12
+ Hotwater.damerau_levenshtein_distance("kitten", "sitting").should == 3
13
+ Hotwater.damerau_levenshtein_distance("Saturday", "Sunday").should == 3
14
+ Hotwater.damerau_levenshtein_distance("teusday", "tuesday").should == 1
15
+ Hotwater.damerau_levenshtein_distance("teusday", "thursday").should == 2
16
+ end
17
+
18
+ it "should compute normalized_damerau_levenshtein_distance" do
19
+ Hotwater.normalized_damerau_levenshtein_distance("", "").should == 0.0
20
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "").should == 0.0
21
+ Hotwater.normalized_damerau_levenshtein_distance("bc", "abc").round(4).should == 0.6667
22
+ Hotwater.normalized_damerau_levenshtein_distance("ca", "abc").should == 0.0
23
+ Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4).should == 0.6667
24
+ Hotwater.normalized_damerau_levenshtein_distance("kitten", "sitting").round(4).should == 0.5714
25
+ Hotwater.normalized_damerau_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
+ Hotwater.normalized_damerau_levenshtein_distance("teusday", "tuesday").round(4).should == 0.8571
27
+ Hotwater.normalized_damerau_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+ end
29
+ end
@@ -0,0 +1,19 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute jaro_distance" do
7
+ Hotwater.jaro_distance("", "").should == 0.0
8
+ Hotwater.jaro_distance("dixon", "dicksonx").round(4).should == 0.7667
9
+ Hotwater.jaro_distance("martha", "marhta").round(4).should == 0.9444
10
+ Hotwater.jaro_distance("dwayne", "duane").round(4).should == 0.8222
11
+ end
12
+
13
+ it "should compute jaro_winkler_distance" do
14
+ Hotwater.jaro_winkler_distance("", "").should == 0.0
15
+ Hotwater.jaro_winkler_distance("dixon", "dicksonx").round(4).should == 0.8133
16
+ Hotwater.jaro_winkler_distance("martha", "marhta").round(4).should == 0.9611
17
+ Hotwater.jaro_winkler_distance("dwayne", "duane").round(4).should == 0.84
18
+ end
19
+ end
@@ -0,0 +1,29 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute levenshtein_distance" do
7
+ Hotwater.levenshtein_distance("", "").should == 0
8
+ Hotwater.levenshtein_distance("abc", "").should == 3
9
+ Hotwater.levenshtein_distance("bc", "abc").should == 1
10
+ Hotwater.levenshtein_distance("ca", "abc").should == 3
11
+ Hotwater.levenshtein_distance("abc", "acb").should == 2
12
+ Hotwater.levenshtein_distance("kitten", "sitting").should == 3
13
+ Hotwater.levenshtein_distance("Saturday", "Sunday").should == 3
14
+ Hotwater.levenshtein_distance("teusday", "tuesday").should == 2
15
+ Hotwater.levenshtein_distance("teusday", "thursday").should == 2
16
+ end
17
+
18
+ it "should compute normalized_levenshtein_distance" do
19
+ Hotwater.normalized_levenshtein_distance("", "").should == 0.0
20
+ Hotwater.normalized_levenshtein_distance("abc", "").should == 0.0
21
+ Hotwater.normalized_levenshtein_distance("bc", "abc").round(4).should == 0.6667
22
+ Hotwater.normalized_levenshtein_distance("ca", "abc").should == 0.0
23
+ Hotwater.normalized_levenshtein_distance("abc", "acb").round(4).should == 0.3333
24
+ Hotwater.normalized_levenshtein_distance("kitten", "sitting").round(4).should == 0.5714
25
+ Hotwater.normalized_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
+ Hotwater.normalized_levenshtein_distance("teusday", "tuesday").round(4).should == 0.7143
27
+ Hotwater.normalized_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+ end
29
+ end
@@ -0,0 +1,75 @@
1
+ require 'rspec'
2
+ require 'hotwater'
3
+
4
+ describe Hotwater do
5
+
6
+ it "should compute unigram distance" do
7
+ Hotwater.ngram_distance("", "al", 1).round(4).should == 0.0
8
+ Hotwater.ngram_distance("al", "al", 1).round(4).should == 1.0
9
+ Hotwater.ngram_distance("a", "a", 1).round(4).should == 1.0
10
+ Hotwater.ngram_distance("b", "a", 1).round(4).should == 0.0
11
+ Hotwater.ngram_distance("martha", "marhta", 1).round(4).should == 0.6667
12
+ Hotwater.ngram_distance("jones", "johnson", 1).round(4).should == 0.4286
13
+ Hotwater.ngram_distance("natural", "contrary", 1).round(4).should == 0.25
14
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 1).round(4).should == 0.75
15
+ Hotwater.ngram_distance("dwayne", "duane", 1).round(4).should == 0.6667
16
+ Hotwater.ngram_distance("dixon", "dicksonx", 1).round(4).should == 0.5
17
+ Hotwater.ngram_distance("six", "ten", 1).round(4).should == 0.0
18
+ Hotwater.ngram_distance("zac ephron", "zac efron", 1).round(4).should == Hotwater.ngram_distance("zac ephron", "kai ephron", 1).round(4)
19
+ Hotwater.ngram_distance("brittney spears", "britney spears", 1).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 1)
20
+ Hotwater.ngram_distance("12345678", "12890678", 1).round(4).should == Hotwater.ngram_distance("12345678", "72385698", 1)
21
+ end
22
+
23
+ it "should compute bigram distance" do
24
+ Hotwater.ngram_distance("", "al", 2).round(4).should == 0.0
25
+ Hotwater.ngram_distance("al", "al", 2).round(4).should == 1.0
26
+ Hotwater.ngram_distance("a", "a", 2).round(4).should == 1.0
27
+ Hotwater.ngram_distance("b", "a", 2).round(4).should == 0.0
28
+ Hotwater.ngram_distance("a", "aa", 2).round(4).should == 0.5
29
+ Hotwater.ngram_distance("martha", "marhta", 2).round(4).should == 0.6667
30
+ Hotwater.ngram_distance("jones", "johnson", 2).round(4).should == 0.4286
31
+ Hotwater.ngram_distance("natural", "contrary", 2).round(4).should == 0.25
32
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 2).round(4).should == 0.625
33
+ Hotwater.ngram_distance("dwayne", "duane", 2).round(4).should == 0.5833
34
+ Hotwater.ngram_distance("dixon", "dicksonx", 2).round(4).should == 0.5
35
+ Hotwater.ngram_distance("six", "ten", 2).round(4).should == 0.0
36
+ Hotwater.ngram_distance("zac ephron", "zac efron", 2).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 2).round(4)
37
+ Hotwater.ngram_distance("brittney spears", "britney spears", 2).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 2)
38
+ Hotwater.ngram_distance("0012345678", "0012890678", 2).round(4).should == Hotwater.ngram_distance("0012345678", "0072385698", 2)
39
+ end
40
+
41
+ it "should compute bigram distance by default" do
42
+ Hotwater.ngram_distance("", "al").round(4).should == 0.0
43
+ Hotwater.ngram_distance("al", "al").round(4).should == 1.0
44
+ Hotwater.ngram_distance("a", "a").round(4).should == 1.0
45
+ Hotwater.ngram_distance("b", "a").round(4).should == 0.0
46
+ Hotwater.ngram_distance("a", "aa").round(4).should == 0.5
47
+ Hotwater.ngram_distance("martha", "marhta").round(4).should == 0.6667
48
+ Hotwater.ngram_distance("jones", "johnson").round(4).should == 0.4286
49
+ Hotwater.ngram_distance("natural", "contrary").round(4).should == 0.25
50
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz").round(4).should == 0.625
51
+ Hotwater.ngram_distance("dwayne", "duane").round(4).should == 0.5833
52
+ Hotwater.ngram_distance("dixon", "dicksonx").round(4).should == 0.5
53
+ Hotwater.ngram_distance("six", "ten").round(4).should == 0.0
54
+ Hotwater.ngram_distance("zac ephron", "zac efron").round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron").round(4)
55
+ Hotwater.ngram_distance("brittney spears", "britney spears").round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman")
56
+ Hotwater.ngram_distance("0012345678", "0012890678").round(4).should == Hotwater.ngram_distance("0012345678", "0072385698")
57
+ end
58
+
59
+ it "should compute trigram distance" do
60
+ Hotwater.ngram_distance("", "al", 3).round(4).should == 0.0
61
+ Hotwater.ngram_distance("al", "al", 3).round(4).should == 1.0
62
+ Hotwater.ngram_distance("a", "a", 3).round(4).should == 1.0
63
+ Hotwater.ngram_distance("b", "a", 3).round(4).should == 0.0
64
+ Hotwater.ngram_distance("martha", "marhta", 3).round(4).should == 0.7222
65
+ Hotwater.ngram_distance("jones", "johnson", 3).round(4).should == 0.4762
66
+ Hotwater.ngram_distance("natural", "contrary", 3).round(4).should == 0.2083
67
+ Hotwater.ngram_distance("abcvwxyz", "cabvwxyz", 3).round(4).should == 0.5625
68
+ Hotwater.ngram_distance("dwayne", "duane", 3).round(4).should == 0.5278
69
+ Hotwater.ngram_distance("dixon", "dicksonx", 3).round(4).should == 0.4583
70
+ Hotwater.ngram_distance("six", "ten", 3).round(4).should == 0.0
71
+ Hotwater.ngram_distance("zac ephron", "zac efron", 3).round(4).should > Hotwater.ngram_distance("zac ephron", "kai ephron", 3).round(4)
72
+ Hotwater.ngram_distance("brittney spears", "britney spears", 3).round(4).should > Hotwater.ngram_distance("brittney spears", "brittney startzman", 3)
73
+ Hotwater.ngram_distance("0012345678", "0012890678", 3).round(4).should < Hotwater.ngram_distance("0012345678", "0072385698", 3)
74
+ end
75
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hotwater
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Colin Surprenant
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ffi
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: ffi-compiler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rspec
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Ruby & JRuby gem with fast string edit distance C implementation using
79
+ FFI bindings
80
+ email:
81
+ - colin.surprenant@gmail.com
82
+ executables: []
83
+ extensions:
84
+ - ext/hotwater/Rakefile
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - Gemfile
89
+ - LICENSE.txt
90
+ - README.md
91
+ - Rakefile
92
+ - ext/hotwater/Rakefile
93
+ - ext/hotwater/damerau_levenshtein.c
94
+ - ext/hotwater/hotwater.h
95
+ - ext/hotwater/jaro.c
96
+ - ext/hotwater/levenshtein.c
97
+ - ext/hotwater/ngram.c
98
+ - hotwater.gemspec
99
+ - lib/hotwater.rb
100
+ - lib/hotwater/damerau_levenshtein_ffi.rb
101
+ - lib/hotwater/jaro_ffi.rb
102
+ - lib/hotwater/levenshtein_ffi.rb
103
+ - lib/hotwater/ngram_ffi.rb
104
+ - lib/hotwater/version.rb
105
+ - spec/hotwater/damerau_levenshtein_ffi_spec.rb
106
+ - spec/hotwater/jaro_ffi_spec.rb
107
+ - spec/hotwater/levenshtein_ffi_spec.rb
108
+ - spec/hotwater/ngram_ffi_spec.rb
109
+ homepage: http://github.com/colinsurprenant/hotwater
110
+ licenses: []
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ segments:
122
+ - 0
123
+ hash: -289401610859280349
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ none: false
126
+ requirements:
127
+ - - ! '>='
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ segments:
131
+ - 0
132
+ hash: -289401610859280349
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.23
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Fast string edit distance
139
+ test_files:
140
+ - spec/hotwater/damerau_levenshtein_ffi_spec.rb
141
+ - spec/hotwater/jaro_ffi_spec.rb
142
+ - spec/hotwater/levenshtein_ffi_spec.rb
143
+ - spec/hotwater/ngram_ffi_spec.rb