hotwater 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - jruby-19mode
5
+ - jruby-head
6
+
@@ -0,0 +1,6 @@
1
+ # 0.1.0, 02-25-2013
2
+ - initial release
3
+
4
+ # 0.1.2, 02-25-2013
5
+ - linux compilation issue
6
+ - travis support
data/README.md CHANGED
@@ -1,26 +1,33 @@
1
- # Hotwater v0.1.0
1
+ # Hotwater v0.1.2
2
2
 
3
- Ruby & JRuby gem with fast **string edit distance** C implementations using FFI bindings.
3
+ [![build status](https://secure.travis-ci.org/colinsurprenant/hotwater.png)](http://travis-ci.org/colinsurprenant/hotwater)
4
+
5
+ Ruby & JRuby gem with fast C implementations of **string edit distance** algorithms with FFI bindings.
4
6
 
5
7
  ### Algorithms
6
8
 
7
- - Levenshtein & Damerau Levenshtein
8
- - Jaro & Jaro Winkler
9
- - N-Gram
9
+ - [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) & [Damerau Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) distance
10
+ - [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance
11
+ - [N-Gram](https://en.wikipedia.org/wiki/N-gram) distance
10
12
 
11
13
  ## Installation
12
14
 
13
- Add this line to your application's Gemfile:
14
-
15
- gem 'hotwater'
15
+ Tested on **OSX 10.8.2** and **Linux 12.10** with
16
+ - MRI Ruby 1.9.3 p385
17
+ - JRuby 1.7.2 (1.9.3 p327)
16
18
 
19
+ Add this line to your application's Gemfile:
20
+ ```ruby
21
+ gem 'hotwater'
22
+ ```
17
23
  And then execute:
18
-
19
- $ bundle
20
-
24
+ ```sh
25
+ $ bundle
26
+ ```
21
27
  Or install it yourself as:
22
-
23
- $ gem install hotwater
28
+ ```sh
29
+ $ gem install hotwater
30
+ ```
24
31
 
25
32
  ## Usage
26
33
 
@@ -28,7 +35,8 @@ Or install it yourself as:
28
35
  Hotwater.levenshtein_distance("abc", "acb") # => 2
29
36
  Hotwater.damerau_levenshtein_distance("abc", "acb") # => 1
30
37
 
31
- # do normalization based on the string sizes
38
+ # normalization based on the string sizes
39
+ # where an edit on a small string has more weight than on a longer string
32
40
  Hotwater.normalized_levenshtein_distance("abc", "acb").round(4) # => 0.3333
33
41
  Hotwater.normalized_damerau_levenshtein_distance("abc", "acb").round(4) # => 0.6667
34
42
 
@@ -59,12 +67,19 @@ Hotwater.ngram_distance("natural", "contrary", 3).round(4) # => 0.2083
59
67
  5. Create new Pull Request
60
68
 
61
69
  ## Credits
70
+
62
71
  - Some C code from the https://github.com/sunlightlabs/jellyfish project
63
72
  - N-Gram ported from Apache Lucene 4.0.0 NGramDistance.java
64
73
 
74
+ ## Why?
75
+
76
+ Why Hotwater? As stated in the credits section, some of the C code comes from the [jellyfish Python project](https://github.com/sunlightlabs/jellyfish). Jellyfish made me think right away about New Brunswick beaches where I have been a couple of times in the past years. There is this legend about New Brunswick having warm water beaches. I even saw a tourism promotion TV commercial selling NB as having warm water. This is a lie! :P I never experienced warm water (in the generally accepted definition) in NB, only lots of jellyfish :D (that being said, I have enjoyed every bit of my visits in New Brunswick and I really do not care about warm water really ;)
77
+
65
78
  ## Author
79
+
66
80
  Colin Surprenant, [@colinsurprenant](http://twitter.com/colinsurprenant), [http://github.com/colinsurprenant](http://github.com/colinsurprenant), colin.surprenant@gmail.com
67
81
 
68
82
  ## License
83
+
69
84
  Hotwater is distributed under the Apache License, Version 2.0.
70
85
 
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ require 'bundler/gem_tasks'
5
5
  require 'rspec/core/rake_task'
6
6
  require 'ffi-compiler/compile_task'
7
7
 
8
- task :default => :spec
8
+ task :default => [:clean, :compile, :spec]
9
9
 
10
10
  desc "run specs"
11
11
  task :spec do
@@ -1,5 +1,5 @@
1
1
  require 'ffi-compiler/compile_task'
2
2
 
3
3
  FFI::Compiler::CompileTask.new('hotwater') do |c|
4
- # nothing yet bro
4
+ # nothing to see here, move along
5
5
  end
@@ -10,6 +10,7 @@ Colin Surprenant, Feb 2013
10
10
  #include <string.h>
11
11
  #include <stdio.h>
12
12
  #include <stdlib.h>
13
+ #include <alloca.h>
13
14
  #include "hotwater.h"
14
15
 
15
16
  #define NOTNUM(c) ((c>57) || (c<48))
@@ -52,7 +53,7 @@ double _jaro_winkler(const char *ying, const char *yang, bool long_tolerance, bo
52
53
  search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;
53
54
 
54
55
  // Blank out the flags
55
- ying_flag = alloca(ying_length + 1);
56
+ ying_flag = (char *)alloca(ying_length + 1);
56
57
  if (!ying_flag) return -1.0;
57
58
 
58
59
  yang_flag = alloca(yang_length + 1);
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  Colin Surprenant, Feb 2013
3
3
  - converted in C from org/apache/lucene/search/spell/NGramDistance.java v4.0.0
4
- - fixed segfault bug in substring n parameter, which did not surface in Java
4
+ - fixed segfault bug in call to substring for n parameter, which did not surface in Java
5
5
  */
6
6
 
7
7
  /* package org.apache.lucene.search.spell; */
@@ -66,7 +66,8 @@ double ngram_distance (const char *source, const char *target, int n) {
66
66
  int cost = 0;
67
67
  if (sl < n || tl < n) {
68
68
  int ni = MIN(sl, tl);
69
- for (int i = 0; i < ni; i++) {
69
+ int i;
70
+ for (i = 0; i < ni; i++) {
70
71
  if (source[i] == target[i]) {
71
72
  cost++;
72
73
  }
@@ -85,7 +86,8 @@ double ngram_distance (const char *source, const char *target, int n) {
85
86
  double* _d; // placeholder to assist in swapping p and d
86
87
 
87
88
  // construct sa with prefix
88
- for (int i = 0; i < sa_len; i++) {
89
+ int i;
90
+ for (i = 0; i < sa_len; i++) {
89
91
  if (i < n - 1) {
90
92
  sa[i] = 0 ; //add prefix
91
93
  }
@@ -108,7 +110,7 @@ double ngram_distance (const char *source, const char *target, int n) {
108
110
  }
109
111
 
110
112
  // indexes into strings s and t
111
- int i = 0; // iterates through source
113
+ i = 0; // iterates through source
112
114
  int j = 0; // iterates through target
113
115
 
114
116
  char* t_j = calloc(n + 1, sizeof(char)); // jth n-gram of t
@@ -125,10 +127,11 @@ double ngram_distance (const char *source, const char *target, int n) {
125
127
  for (j = 1; j <= tl; j++) {
126
128
  // construct t_j n-gram
127
129
  if (j < n) {
128
- for (int ti = 0; ti < n - j; ti++) {
130
+ int ti;
131
+ for (ti = 0; ti < n - j; ti++) {
129
132
  t_j[ti] = 0; //add prefix
130
133
  }
131
- for (int ti = n - j; ti < n; ti++) {
134
+ for (ti = n - j; ti < n; ti++) {
132
135
  t_j[ti] = target[ti - (n - j)];
133
136
  }
134
137
  }
@@ -142,7 +145,8 @@ double ngram_distance (const char *source, const char *target, int n) {
142
145
  int tn = n;
143
146
  // compare sa to t_j
144
147
 
145
- for (int ni = 0; ni < n; ni++) {
148
+ int ni;
149
+ for (ni = 0; ni < n; ni++) {
146
150
  if (sa[i - 1 + ni] != t_j[ni]) {
147
151
  cost++;
148
152
  }
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.version = Hotwater::VERSION
9
9
  gem.authors = ["Colin Surprenant"]
10
10
  gem.email = ["colin.surprenant@gmail.com"]
11
- gem.description = "Ruby & JRuby gem with fast string edit distance C implementation using FFI bindings"
11
+ gem.description = "Ruby & JRuby gem with fast string edit distance algorithms C implementations with FFI bindings"
12
12
  gem.summary = "Fast string edit distance"
13
13
  gem.homepage = "http://github.com/colinsurprenant/hotwater"
14
14
 
@@ -7,16 +7,26 @@ module Hotwater
7
7
  attach_function :damerau_levenshtein_distance, [:string, :string], :int
8
8
  end
9
9
 
10
+ # compute Damerau Levenshtein edit distance between 2 strings
11
+ # @param s1 [String] first string
12
+ # @param s2 [String] second string
13
+ # @return [Integer] edit distance
10
14
  def damerau_levenshtein_distance(s1, s2)
11
15
  result = C::damerau_levenshtein_distance(s1, s2)
12
16
  raise("memory allocation error") if result == -1
13
17
  result
14
18
  end
15
19
 
20
+ # compute normalized Damerau Levenshtein edit distance between 2 strings
21
+ # normalization weights the edit distance using the string lengths, where
22
+ # an edit on a small string has more impact than on a longer string
23
+ # @param s1 [String] first string
24
+ # @param s2 [String] second string
25
+ # @return [Float] edit distance between 0.0 and 1.0
16
26
  def normalized_damerau_levenshtein_distance(s1, s2)
17
27
  result = C::damerau_levenshtein_distance(s1, s2)
18
28
  raise("memory allocation error") if result == -1
19
- return 0.0 if result == 0
29
+ return 0.0 if result == 0.0
20
30
  max = [s1.size, s2.size].max
21
31
  (max - result.to_f) / max
22
32
  end
@@ -10,12 +10,27 @@ module Hotwater
10
10
  attach_function :jaro_winkler_distance, [:string, :string, :bool], :double
11
11
  end
12
12
 
13
+ # compute Jaro edit distance between 2 strings
14
+ # @param s1 [String] first string
15
+ # @param s2 [String] second string
16
+ # @return [Float] edit distance between 0.0 and 1.0
13
17
  def jaro_distance(s1, s2)
14
18
  result = C::jaro_distance(s1, s2)
15
19
  raise("memory allocation error") if result < 0.0
16
20
  result
17
21
  end
18
22
 
23
+ # compute Jaro-Winkler edit distance between 2 strings
24
+ #
25
+ # setting `long_tolerance = true` increases the probability of a match when the number
26
+ # of matched characters is large. This option allows for a little more
27
+ # tolerance when the strings are large. It is not an appropriate
28
+ # test when comparing fixed length fields such as phone and social security numbers.
29
+ #
30
+ # @param s1 [String] first string
31
+ # @param s2 [String] second string
32
+ # @param long_tolerance [Boolean] add more tolerance when the strings are large. Default `false`
33
+ # @return [Float] edit distance between 0.0 and 1.0
19
34
  def jaro_winkler_distance(s1, s2, long_tolerance = false)
20
35
  result = C::jaro_winkler_distance(s1, s2, long_tolerance)
21
36
  raise("memory allocation error") if result < 0.0
@@ -7,16 +7,26 @@ module Hotwater
7
7
  attach_function :levenshtein_distance, [:string, :string], :int
8
8
  end
9
9
 
10
+ # compute Levenshtein edit distance between 2 strings
11
+ # @param s1 [String] first string
12
+ # @param s2 [String] second string
13
+ # @return [Integer] edit distance
10
14
  def levenshtein_distance(s1, s2)
11
15
  result = C::levenshtein_distance(s1, s2)
12
16
  raise("memory allocation error") if result == -1
13
17
  result
14
18
  end
15
19
 
20
+ # compute normalized Levenshtein edit distance between 2 strings
21
+ # normalization weights the edit distance using the string lengths, where
22
+ # an edit on a small string has more impact than on a longer string
23
+ # @param s1 [String] first string
24
+ # @param s2 [String] second string
25
+ # @return [Float] edit distance between 0.0 and 1.0
16
26
  def normalized_levenshtein_distance(s1, s2)
17
27
  result = C::levenshtein_distance(s1, s2)
18
28
  raise("memory allocation error") if result == -1
19
- return 0.0 if result == 0
29
+ return 0.0 if result == 0.0
20
30
  max = [s1.size, s2.size].max
21
31
  (max - result.to_f) / max
22
32
  end
@@ -7,6 +7,11 @@ module Hotwater
7
7
  attach_function :ngram_distance, [:string, :string, :int], :double
8
8
  end
9
9
 
10
+ # compute N-Gram distance between 2 strings
11
+ # @param s1 [String] first string
12
+ # @param s2 [String] second string
13
+ # @param n [Integer] number of characters per gram, default is 2
14
+ # @return [Float] edit distance between 0.0 and 1.0
10
15
  def ngram_distance(s1, s2, n = 2)
11
16
  result = C::ngram_distance(s1, s2, n)
12
17
  raise("memory allocation error") if result == -1
@@ -1,3 +1,3 @@
1
1
  module Hotwater
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -25,5 +25,9 @@ describe Hotwater do
25
25
  Hotwater.normalized_damerau_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
26
  Hotwater.normalized_damerau_levenshtein_distance("teusday", "tuesday").round(4).should == 0.8571
27
27
  Hotwater.normalized_damerau_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+
29
+ Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
30
+ Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
31
+ Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
28
32
  end
29
33
  end
@@ -25,5 +25,9 @@ describe Hotwater do
25
25
  Hotwater.normalized_levenshtein_distance("Saturday", "Sunday").round(4).should == 0.625
26
26
  Hotwater.normalized_levenshtein_distance("teusday", "tuesday").round(4).should == 0.7143
27
27
  Hotwater.normalized_levenshtein_distance("teusday", "thursday").round(4).should == 0.75
28
+
29
+ Hotwater.normalized_levenshtein_distance("aaaa", "aaab").round(4).should == 0.75
30
+ Hotwater.normalized_levenshtein_distance("aaaaa", "aaaab").round(4).should == 0.8
31
+ Hotwater.normalized_levenshtein_distance("aaaaaa", "aaaaab").round(4).should == 0.8333
28
32
  end
29
33
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hotwater
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -75,8 +75,8 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
- description: Ruby & JRuby gem with fast string edit distance C implementation using
79
- FFI bindings
78
+ description: Ruby & JRuby gem with fast string edit distance algorithms C implementations
79
+ with FFI bindings
80
80
  email:
81
81
  - colin.surprenant@gmail.com
82
82
  executables: []
@@ -85,6 +85,8 @@ extensions:
85
85
  extra_rdoc_files: []
86
86
  files:
87
87
  - .gitignore
88
+ - .travis.yml
89
+ - CHANGELOG.md
88
90
  - Gemfile
89
91
  - LICENSE.txt
90
92
  - README.md
@@ -120,7 +122,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
120
122
  version: '0'
121
123
  segments:
122
124
  - 0
123
- hash: -289401610859280349
125
+ hash: 3182707895951543481
124
126
  required_rubygems_version: !ruby/object:Gem::Requirement
125
127
  none: false
126
128
  requirements:
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
131
  version: '0'
130
132
  segments:
131
133
  - 0
132
- hash: -289401610859280349
134
+ hash: 3182707895951543481
133
135
  requirements: []
134
136
  rubyforge_project:
135
137
  rubygems_version: 1.8.23