strcmp 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -0
- data/Gemfile.lock +29 -0
- data/README.md +27 -0
- data/Rakefile +18 -0
- data/VERSION +1 -0
- data/ext/strcmp/.gitignore +4 -0
- data/ext/strcmp/extconf.rb +3 -0
- data/ext/strcmp/jaro_winkler +0 -0
- data/ext/strcmp/jaro_winkler.c +99 -0
- data/ext/strcmp/jaro_winkler.h +2 -0
- data/ext/strcmp/test.c +15 -0
- data/lib/strcmp.rb +36 -0
- data/spec/jaro_winkler_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/strcmp.gemspec +62 -0
- metadata +128 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.2.1)
|
5
|
+
ffi (1.1.5)
|
6
|
+
git (1.2.5)
|
7
|
+
jeweler (1.8.4)
|
8
|
+
bundler (~> 1.0)
|
9
|
+
git (>= 1.2.5)
|
10
|
+
rake
|
11
|
+
rdoc
|
12
|
+
rake (10.0.3)
|
13
|
+
rdoc (3.9.5)
|
14
|
+
rspec (2.13.0)
|
15
|
+
rspec-core (~> 2.13.0)
|
16
|
+
rspec-expectations (~> 2.13.0)
|
17
|
+
rspec-mocks (~> 2.13.0)
|
18
|
+
rspec-core (2.13.0)
|
19
|
+
rspec-expectations (2.13.0)
|
20
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
21
|
+
rspec-mocks (2.13.0)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
ffi (~> 1.1.5)
|
28
|
+
jeweler
|
29
|
+
rspec (= 2.13.0)
|
data/README.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
strcmp
|
2
|
+
======
|
3
|
+
|
4
|
+
Strcmp is a simple Ruby gem that is used to compute how similar two strings are
|
5
|
+
to one another. It includes a fast C implementation of the
|
6
|
+
[jaro-winkler](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance algorithm
|
7
|
+
and, soon, the Levenshtein distance algorithm. More metrics are on the way. I claim no
|
8
|
+
ownership of the algorithm.
|
9
|
+
|
10
|
+
Installing
|
11
|
+
==========
|
12
|
+
|
13
|
+
gem install strcmp
|
14
|
+
|
15
|
+
Usage
|
16
|
+
=====
|
17
|
+
|
18
|
+
Using the gem is simple. First add the require statement to the file you wish to
|
19
|
+
use it in.
|
20
|
+
|
21
|
+
require 'strcmp'
|
22
|
+
|
23
|
+
To use the gem call:
|
24
|
+
|
25
|
+
StrCmp.jaro_winkler("martha", "marhta") => 0.961111
|
26
|
+
StrCmp.jaro("martha", "marhta") => 0.944444
|
27
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Defines the gem packaging and release tasks through Jeweler. When Jeweler
# cannot be loaded, a hint is printed instead of aborting the Rakefile.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    # Identity and descriptive metadata.
    spec.name = "strcmp"
    spec.summary = "A gem to compare strings and compute a distance or other metric between two strings."
    spec.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorith and soon the levenshtein distance algorithm. More metrics on the way."
    spec.email = "rostepher.dev@gmail.com"
    spec.homepage = "http://github.com/Rostepher/strcmp"
    spec.authors = ["Ross Bayer"]

    # Runtime dependency first, then the development-only ones.
    spec.add_dependency "ffi"
    %w[rspec jeweler].each do |dev_gem|
      spec.add_development_dependency dev_gem
    end
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler does not seem to be here... Try installing it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
Binary file
|
@@ -0,0 +1,99 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
|
5
|
+
#define TRUE 1
|
6
|
+
#define FALSE 0
|
7
|
+
|
8
|
+
#define max(a, b) ((a) > (b) ? (a) : (b))
|
9
|
+
#define min(a, b) ((a) < (b) ? (a) : (b))
|
10
|
+
#define equal(a, b) (tolower(a) == tolower(b))
|
11
|
+
#define not_equal(a, b) (tolower(a) != tolower(b))
|
12
|
+
|
13
|
+
/*
 * Case-insensitive equality of two characters.
 *
 * The casts to unsigned char keep tolower() well defined for bytes >= 0x80
 * on platforms where plain char is signed.
 */
static int jaro_chars_match(char a, char b) {
    return tolower((unsigned char) a) == tolower((unsigned char) b);
}

/*
 * Computes the Jaro similarity of two NUL-terminated strings, comparing
 * characters case-insensitively.
 *
 * foo, bar: the strings to compare; neither is modified.
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one is empty or
 * when no characters match, and otherwise
 *     (m/|foo| + m/|bar| + (m - t)/m) / 3
 * where m is the number of matching characters and t is half the number of
 * matched characters that appear in a different order.
 *
 * Also returns 0.0 if either pointer is NULL or if the scratch buffers cannot
 * be allocated; there is no separate error channel.
 */
double jaro_distance(char *foo, char *bar) {
    if (foo == NULL || bar == NULL) return 0.0;

    // cache the lengths so strlen is not called repeatedly
    int foo_len = (int) strlen(foo);
    int bar_len = (int) strlen(bar);

    // two empty strings are identical; one empty string matches nothing
    if (foo_len == 0 || bar_len == 0)
        return (foo_len == 0 && bar_len == 0) ? 1.0 : 0.0;

    // max distance between two chars to still be considered matching;
    // integer division already floors the result. For a longest length of 1
    // the formula gives -1, which would make the search window empty and
    // score "a" vs "a" as 0, so clamp it to 0.
    int longest = foo_len > bar_len ? foo_len : bar_len;
    int match_distance = longest / 2 - 1;
    if (match_distance < 0) match_distance = 0;

    // per-character flags recording whether that position has been matched
    int *foo_matches = calloc((size_t) foo_len, sizeof(int));
    int *bar_matches = calloc((size_t) bar_len, sizeof(int));
    if (foo_matches == NULL || bar_matches == NULL) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // find the matches: each char of foo claims the first unclaimed equal
    // char of bar inside its window
    int matches = 0;
    for (int i = 0; i < foo_len; i++) {
        int start = i > match_distance ? i - match_distance : 0;
        int end = i + match_distance + 1;
        if (end > bar_len) end = bar_len;

        for (int k = start; k < end; k++) {
            if (bar_matches[k]) continue;
            if (!jaro_chars_match(foo[i], bar[k])) continue;
            foo_matches[i] = 1;
            bar_matches[k] = 1;
            matches++;
            break;
        }
    }

    if (matches == 0) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // count transpositions: walk the matched chars of both strings in order
    // and count the positions where they differ. k cannot run off the end of
    // bar_matches because both arrays hold the same number of set flags.
    int out_of_order = 0;
    int k = 0;
    for (int i = 0; i < foo_len; i++) {
        if (!foo_matches[i]) continue;
        while (!bar_matches[k]) k++;
        if (!jaro_chars_match(foo[i], bar[k])) out_of_order++;
        k++;
    }

    free(foo_matches);
    free(bar_matches);

    // each transposition was counted once from each side, hence the halving
    double m = (double) matches;
    double transpositions = out_of_order / 2.0;

    return ((m / foo_len) +
            (m / bar_len) +
            ((m - transpositions) / m)) / 3.0;
}
|
87
|
+
|
88
|
+
/*
 * Computes the Jaro-Winkler similarity of two NUL-terminated strings.
 *
 * The Jaro score from jaro_distance() is boosted according to the length of
 * the common, case-insensitive prefix of the two strings (capped at 3
 * characters) using a scaling factor of 0.1:
 *     jaro + prefix_length * 0.1 * (1 - jaro)
 *
 * foo, bar: the strings to compare; neither is modified.
 * Returns 0.0 if either pointer is NULL.
 */
double jaro_winkler_distance(char *foo, char *bar) {
    if (foo == NULL || bar == NULL) return 0.0;

    // compute the jaro distance
    double jaro = jaro_distance(foo, bar);

    // length of the common prefix, max 3. The scan stops at the terminator
    // of either string so that the NUL byte is never counted as a matching
    // character and nothing past the end of a short string is read. The
    // unsigned char casts keep tolower() defined for bytes >= 0x80.
    int prefix_length = 0;
    while (prefix_length < 3
           && foo[prefix_length] != '\0'
           && bar[prefix_length] != '\0'
           && tolower((unsigned char) foo[prefix_length]) ==
              tolower((unsigned char) bar[prefix_length])) {
        prefix_length++;
    }

    // 0.1 is the default scaling factor
    return jaro + prefix_length * 0.1 * (1 - jaro);
}
|
data/ext/strcmp/test.c
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include "jaro_winkler.h"
|
3
|
+
|
4
|
+
/*
 * Manual smoke test: prints jaro_distance() for a fixed set of string pairs,
 * once in each argument order. Command-line arguments are ignored.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    // left[n] is compared against right[n]
    char *left[] = { "", "foobar", "martha", "dwayne", "dixon" };
    char *right[] = { "", "", "marhta", "duane", "dicksonx" };
    int pair_count = (int) (sizeof(left) / sizeof(left[0]));

    for (int n = 0; n < pair_count; n++) {
        double forward = jaro_distance(left[n], right[n]);
        double backward = jaro_distance(right[n], left[n]);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", left[n], right[n], forward);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", right[n], left[n], backward);
    }

    return 0;
}
|
data/lib/strcmp.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
# String-similarity helpers backed by the C functions jaro_distance and
# jaro_winkler_distance, which are bound through FFI.
module StrCmp
  class << self
    extend FFI::Library

    # Candidate file names for the compiled extension, handed to ffi_lib in
    # this order.
    base_path = File.dirname(__FILE__) + "/../ext/strcmp/strcmp"
    ffi_lib(['.bundle', '.so', '.dylib', ''].map { |suffix| base_path + suffix })

    # Jaro score of two strings, computed by the native jaro_distance.
    # Raises TypeError unless both arguments are Strings.
    def jaro(string1, string2)
      [string1, string2].each { |candidate| validate(candidate) }
      ffi_jaro(string1, string2)
    end

    # Jaro-Winkler score of two strings, computed by the native
    # jaro_winkler_distance.
    # Raises TypeError unless both arguments are Strings.
    def jaro_winkler(string1, string2)
      [string1, string2].each { |candidate| validate(candidate) }
      ffi_jaro_winkler(string1, string2)
    end

    private

    # Bind the C functions under ffi_-prefixed Ruby names.
    attach_function :ffi_jaro, :jaro_distance, [:string, :string], :double
    attach_function :ffi_jaro_winkler, :jaro_winkler_distance, [:string, :string], :double

    # Guard used by the public methods: anything that is not a String is
    # rejected with a TypeError before it reaches the native call.
    def validate(object)
      return if object.kind_of?(String)

      raise TypeError, "Wrong argument type #{object.class} (expected String)"
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for StrCmp.jaro_winkler, the public entry point defined in
# lib/strcmp.rb. Each row of test_cases is [string, string, expected score],
# with the expected score given to three decimal places.
describe StrCmp do
  test_cases = [
    ["", "", 1.0],
    ["foobar", "", 0.0],
    ["martha", "marhta", 0.961],
    ["dwayne", "duane", 0.84],
    ["dixon", "dicksonx", 0.813]
  ]

  test_cases.each do |str1, str2, distance|
    it "should calculate the distance #{distance} from #{str1} and #{str2}" do
      # Checked in both argument orders. A tolerance is used instead of
      # float equality because the expected values are rounded.
      expect(StrCmp.jaro_winkler(str1, str2)).to be_within(0.0005).of(distance)
      expect(StrCmp.jaro_winkler(str2, str1)).to be_within(0.0005).of(distance)
    end
  end

  it "should raise an error if either arguemnt is nil" do
    expect { StrCmp.jaro_winkler("", nil) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler(nil, "") }.to raise_error TypeError
  end

  it "should raise an error if either argument is not a string" do
    expect { StrCmp.jaro_winkler("foo", /bar/) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("42", 42) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("Object", Object.new) }.to raise_error TypeError
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../lib/strcmp"
|
data/strcmp.gemspec
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "strcmp"
|
8
|
+
s.version = "1.0.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Ross Bayer"]
|
12
|
+
s.date = "2013-03-05"
|
13
|
+
s.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorith and soon the levenshtein distance algorithm. More metrics on the way."
|
14
|
+
s.email = "rostepher.dev@gmail.com"
|
15
|
+
s.extensions = ["ext/strcmp/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"README.md",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"ext/strcmp/.gitignore",
|
26
|
+
"ext/strcmp/extconf.rb",
|
27
|
+
"ext/strcmp/jaro_winkler",
|
28
|
+
"ext/strcmp/jaro_winkler.c",
|
29
|
+
"ext/strcmp/jaro_winkler.h",
|
30
|
+
"ext/strcmp/test.c",
|
31
|
+
"lib/strcmp.rb",
|
32
|
+
"spec/jaro_winkler_spec.rb",
|
33
|
+
"spec/spec_helper.rb",
|
34
|
+
"strcmp.gemspec"
|
35
|
+
]
|
36
|
+
s.homepage = "http://github.com/Rostepher/strcmp"
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = "1.8.25"
|
39
|
+
s.summary = "A gem to compare strings and compute a distance or other metric between two strings."
|
40
|
+
|
41
|
+
if s.respond_to? :specification_version then
|
42
|
+
s.specification_version = 3
|
43
|
+
|
44
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
+
s.add_runtime_dependency(%q<ffi>, ["~> 1.1.5"])
|
46
|
+
s.add_runtime_dependency(%q<ffi>, [">= 0"])
|
47
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
49
|
+
else
|
50
|
+
s.add_dependency(%q<ffi>, ["~> 1.1.5"])
|
51
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
52
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
53
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
54
|
+
end
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<ffi>, ["~> 1.1.5"])
|
57
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
58
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
59
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: strcmp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ross Bayer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-05 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: ffi
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.1.5
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.5
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: ffi
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: jeweler
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A simple ruby gem that is used to compute how similar two strings are
|
79
|
+
to one another. It includes a fast C implementation of the jaro-winkler distance
|
80
|
+
algorith and soon the levenshtein distance algorithm. More metrics on the way.
|
81
|
+
email: rostepher.dev@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions:
|
84
|
+
- ext/strcmp/extconf.rb
|
85
|
+
extra_rdoc_files:
|
86
|
+
- README.md
|
87
|
+
files:
|
88
|
+
- Gemfile
|
89
|
+
- Gemfile.lock
|
90
|
+
- README.md
|
91
|
+
- Rakefile
|
92
|
+
- VERSION
|
93
|
+
- ext/strcmp/.gitignore
|
94
|
+
- ext/strcmp/extconf.rb
|
95
|
+
- ext/strcmp/jaro_winkler
|
96
|
+
- ext/strcmp/jaro_winkler.c
|
97
|
+
- ext/strcmp/jaro_winkler.h
|
98
|
+
- ext/strcmp/test.c
|
99
|
+
- lib/strcmp.rb
|
100
|
+
- spec/jaro_winkler_spec.rb
|
101
|
+
- spec/spec_helper.rb
|
102
|
+
- strcmp.gemspec
|
103
|
+
homepage: http://github.com/Rostepher/strcmp
|
104
|
+
licenses: []
|
105
|
+
post_install_message:
|
106
|
+
rdoc_options: []
|
107
|
+
require_paths:
|
108
|
+
- lib
|
109
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
none: false
|
111
|
+
requirements:
|
112
|
+
- - ! '>='
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '0'
|
115
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
requirements: []
|
122
|
+
rubyforge_project:
|
123
|
+
rubygems_version: 1.8.25
|
124
|
+
signing_key:
|
125
|
+
specification_version: 3
|
126
|
+
summary: A gem to compare strings and compute a distance or other metric between two
|
127
|
+
strings.
|
128
|
+
test_files: []
|