strcmp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -0
- data/Gemfile.lock +29 -0
- data/README.md +27 -0
- data/Rakefile +18 -0
- data/VERSION +1 -0
- data/ext/strcmp/.gitignore +4 -0
- data/ext/strcmp/extconf.rb +3 -0
- data/ext/strcmp/jaro_winkler +0 -0
- data/ext/strcmp/jaro_winkler.c +99 -0
- data/ext/strcmp/jaro_winkler.h +2 -0
- data/ext/strcmp/test.c +15 -0
- data/lib/strcmp.rb +36 -0
- data/spec/jaro_winkler_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/strcmp.gemspec +62 -0
- metadata +128 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.2.1)
|
5
|
+
ffi (1.1.5)
|
6
|
+
git (1.2.5)
|
7
|
+
jeweler (1.8.4)
|
8
|
+
bundler (~> 1.0)
|
9
|
+
git (>= 1.2.5)
|
10
|
+
rake
|
11
|
+
rdoc
|
12
|
+
rake (10.0.3)
|
13
|
+
rdoc (3.9.5)
|
14
|
+
rspec (2.13.0)
|
15
|
+
rspec-core (~> 2.13.0)
|
16
|
+
rspec-expectations (~> 2.13.0)
|
17
|
+
rspec-mocks (~> 2.13.0)
|
18
|
+
rspec-core (2.13.0)
|
19
|
+
rspec-expectations (2.13.0)
|
20
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
21
|
+
rspec-mocks (2.13.0)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
ffi (~> 1.1.5)
|
28
|
+
jeweler
|
29
|
+
rspec (= 2.13.0)
|
data/README.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
strcmp
|
2
|
+
======
|
3
|
+
|
4
|
+
Strcmp is a simple ruby gem that is used to compute how similar two strings are
|
5
|
+
to one another. It includes a fast C implementation of the
|
6
|
+
[jaro-winkler](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance algorithm
|
7
|
+
and soon the levenshtein distance algorithm. More metrics on the way. I claim no
|
8
|
+
ownership of the algorithm.
|
9
|
+
|
10
|
+
Installing
|
11
|
+
==========
|
12
|
+
|
13
|
+
gem install strcmp
|
14
|
+
|
15
|
+
Usage
|
16
|
+
=====
|
17
|
+
|
18
|
+
Using the gem is simple. First add the require statement to the file you wish to
|
19
|
+
use it in.
|
20
|
+
|
21
|
+
require 'strcmp'
|
22
|
+
|
23
|
+
To use the gem call:
|
24
|
+
|
25
|
+
StrCmp.jaro_winkler("martha", "marhta") => 0.961111
|
26
|
+
StrCmp.jaro("martha", "marhta") => 0.944444
|
27
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Gem packaging tasks, provided by jeweler. If jeweler is not installed the
# tasks are simply not defined and a hint is printed instead of failing.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strcmp"
    gemspec.summary = "A gem to compare strings and compute a distance or other metric between two strings."
    gemspec.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorith and soon the levenshtein distance algorithm. More metrics on the way."
    gemspec.email = "rostepher.dev@gmail.com"
    gemspec.homepage = "http://github.com/Rostepher/strcmp"
    gemspec.authors = ["Ross Bayer"]
    # The runtime dependency on ffi is intentionally NOT repeated here.
    # The bundle already pins it (Gemfile.lock lists `ffi (~> 1.1.5)`), and
    # declaring an unversioned "ffi" on top of that produced a gemspec with
    # two ffi entries (`~> 1.1.5` and `>= 0`).
    # NOTE(review): presumes jeweler keeps importing the Gemfile's default
    # group into the generated gemspec - confirm after `rake gemspec`.
    gemspec.add_development_dependency "rspec"
    gemspec.add_development_dependency "jeweler"
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler does not seem to be here... Try installing it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
Binary file
|
@@ -0,0 +1,99 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
|
5
|
+
#define TRUE 1
#define FALSE 0

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
/* Case-insensitive character comparison. Operands are converted to
 * unsigned char first: passing a negative char value to tolower() is
 * undefined behaviour. */
#define equal(a, b) (tolower((unsigned char)(a)) == tolower((unsigned char)(b)))
#define not_equal(a, b) (tolower((unsigned char)(a)) != tolower((unsigned char)(b)))

/*
 * Computes the Jaro similarity of two NUL-terminated strings. Characters
 * are compared case-insensitively.
 *
 * foo, bar: the strings to compare; neither is modified.
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one of them is
 * empty or when no characters match, and otherwise
 *   (m / |foo| + m / |bar| + (m - t) / m) / 3
 * where m is the number of matching characters and t is half the number of
 * matched characters that appear in a different order.
 * Also returns 0.0 if the scratch buffers cannot be allocated.
 */
double jaro_distance(char *foo, char *bar) {
    // length of the strings, stops the repeated use of strlen
    int foo_len = strlen(foo);
    int bar_len = strlen(bar);

    // if both strings are empty return 1
    // if only one of the strings is empty return 0
    if (foo_len == 0 || bar_len == 0)
        return (foo_len == 0 && bar_len == 0) ? 1.0 : 0.0;

    // max distance between two chars to be considered matching
    // floor() is omitted due to integer division rules
    int match_distance = max(foo_len, bar_len) / 2 - 1;
    // for two one-character strings the formula yields -1, which would make
    // the search window empty and report identical strings as dissimilar
    if (match_distance < 0) match_distance = 0;

    // arrays of bools that signify if that char in the matching string has a match
    int *foo_matches = calloc(foo_len, sizeof(int));
    int *bar_matches = calloc(bar_len, sizeof(int));
    if (foo_matches == NULL || bar_matches == NULL) {
        // both lengths are non-zero here, so NULL means allocation failure
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // number of matches and transpositions
    double matches = 0.0;
    double transpositions = 0.0;

    // find the matches
    for (int i = 0; i < foo_len; i++) {
        // start and end take into account the match distance
        int start = max(0, i - match_distance);
        int end = min(i + match_distance + 1, bar_len);

        // pair foo[i] with the first unmatched, equal char of bar in the window
        for (int k = start; k < end; k++) {
            // if bar already has a match continue
            if (bar_matches[k]) continue;
            // if foo and bar are not equal continue
            if (not_equal(foo[i], bar[k])) continue;
            // otherwise record the match on both sides
            foo_matches[i] = TRUE;
            bar_matches[k] = TRUE;
            matches++;
            break;
        }
    }

    // if there are no matches return 0
    if (matches == 0) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // count transpositions: walk the matched chars of both strings in order
    int k = 0;
    for (int i = 0; i < foo_len; i++) {
        // skip chars of foo that have no match
        if (!foo_matches[i]) continue;
        // advance k to the next matched char of bar; both strings hold the
        // same number of matched chars, so k stays inside bar
        while (!bar_matches[k]) k++;
        // matched chars that differ at the same rank are out of order
        if (not_equal(foo[i], bar[k])) transpositions++;
        k++;
    }

    // divide the number of transpositions by two as per the algorithm specs
    // this division is valid because the counted transpositions include both
    // instances of the transposed characters.
    transpositions /= 2.0;

    // release the scratch buffers
    free(foo_matches);
    free(bar_matches);

    // return the jaro distance
    return ((matches / foo_len) +
            (matches / bar_len) +
            ((matches - transpositions) / matches)) / 3.0;
}
|
87
|
+
|
88
|
+
/*
 * Computes the Jaro-Winkler similarity of two NUL-terminated strings: the
 * Jaro similarity boosted according to the length of their common prefix.
 *
 * foo, bar: the strings to compare; neither is modified.
 *
 * Returns jaro + prefix_length * 0.1 * (1 - jaro), where prefix_length is
 * the number of leading characters (at most 3) that are equal ignoring case.
 */
double jaro_winkler_distance(char *foo, char *bar) {
    // compute the jaro distance
    double jaro = jaro_distance(foo, bar);

    // length of the common prefix, capped at 3. The scan stops at the first
    // NUL of either string so the terminator is never counted as a matching
    // character and nothing beyond the end of a short string is read.
    int prefix_length = 0;
    while (prefix_length < 3 &&
           foo[prefix_length] != '\0' &&
           bar[prefix_length] != '\0' &&
           // unsigned char cast: tolower() is undefined for negative values
           tolower((unsigned char) foo[prefix_length]) ==
               tolower((unsigned char) bar[prefix_length])) {
        prefix_length++;
    }

    // 0.1 is the default scaling factor
    return jaro + prefix_length * 0.1 * (1 - jaro);
}
|
data/ext/strcmp/test.c
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include "jaro_winkler.h"
|
3
|
+
|
4
|
+
/*
 * Manual smoke test: prints the Jaro distance of a fixed set of string
 * pairs, in both argument orders. The command-line arguments are unused.
 */
int main(int argc, char *argv[]) {
    // each row is one pair of strings to compare
    char *pairs[][2] = {
        { "",       ""         },
        { "foobar", ""         },
        { "martha", "marhta"   },
        { "dwayne", "duane"    },
        { "dixon",  "dicksonx" }
    };
    int pair_count = sizeof(pairs) / sizeof(pairs[0]);

    for (int row = 0; row < pair_count; row++) {
        char *left = pairs[row][0];
        char *right = pairs[row][1];

        // compute both orders first, then report them
        double forward = jaro_distance(left, right);
        double backward = jaro_distance(right, left);

        printf("jaro_distance(\"%s\", \"%s\") => %f\n", left, right, forward);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", right, left, backward);
    }
}
|
data/lib/strcmp.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
# String similarity metrics backed by the C implementation in ext/strcmp,
# bound through FFI.
module StrCmp
  class << self
    extend FFI::Library

    # The compiled library sits next to the extension sources; its file
    # suffix varies, so every candidate is handed to ffi_lib in this order.
    base_path = File.dirname(__FILE__) + "/../ext/strcmp/strcmp"
    ffi_lib(['.bundle', '.so', '.dylib', ''].map { |suffix| base_path + suffix })

    # Jaro distance between two strings.
    # Raises TypeError unless both arguments are Strings.
    def jaro(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro(string1, string2)
    end

    # Jaro-Winkler distance between two strings.
    # Raises TypeError unless both arguments are Strings.
    def jaro_winkler(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro_winkler(string1, string2)
    end

    private

    # Bind the C entry points under ffi_-prefixed names.
    attach_function :ffi_jaro, :jaro_distance, [:string, :string], :double
    attach_function :ffi_jaro_winkler, :jaro_winkler_distance, [:string, :string], :double

    # Guard used by the public methods: anything that is not a String is
    # rejected with a TypeError.
    def validate(object)
      return if object.kind_of?(String)

      raise TypeError, "Wrong argument type #{object.class} (expected String)"
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises StrCmp.jaro_winkler, the public entry point defined in
# lib/strcmp.rb. Each value example checks both argument orders.
describe StrCmp do
  # [first string, second string, expected distance rounded to 3 decimals]
  test_cases = [
    ["", "", 1.0],
    ["foobar", "", 0.0],
    ["martha", "marhta", 0.961],
    ["dwayne", "duane", 0.84],
    ["dixon", "dicksonx", 0.813]
  ]

  test_cases.each do |str1, str2, distance|
    it "should calculate the distance #{distance} from #{str1} and #{str2}" do
      # a bare `==` comparison asserts nothing; wrap it in an expectation
      expect(StrCmp.jaro_winkler(str1, str2).round(3)).to eq(distance)
      expect(StrCmp.jaro_winkler(str2, str1).round(3)).to eq(distance)
    end
  end

  it "should raise an error if either arguemnt is nil" do
    expect { StrCmp.jaro_winkler("", nil) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler(nil, "") }.to raise_error TypeError
  end

  it "should raise an error if either argument is not a string" do
    expect { StrCmp.jaro_winkler("foo", /bar/) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("42", 42) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("Object", Object.new) }.to raise_error TypeError
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../lib/strcmp"
|
data/strcmp.gemspec
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for strcmp 1.0.0.
# ffi is declared exactly once, as `~> 1.1.5`: listing the same gem a second
# time with `>= 0` adds nothing and is a duplicate dependency.
Gem::Specification.new do |s|
  s.name = "strcmp"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ross Bayer"]
  s.date = "2013-03-05"
  s.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorith and soon the levenshtein distance algorithm. More metrics on the way."
  s.email = "rostepher.dev@gmail.com"
  s.extensions = ["ext/strcmp/extconf.rb"]
  s.extra_rdoc_files = [
    "README.md"
  ]
  s.files = [
    "Gemfile",
    "Gemfile.lock",
    "README.md",
    "Rakefile",
    "VERSION",
    "ext/strcmp/.gitignore",
    "ext/strcmp/extconf.rb",
    "ext/strcmp/jaro_winkler",
    "ext/strcmp/jaro_winkler.c",
    "ext/strcmp/jaro_winkler.h",
    "ext/strcmp/test.c",
    "lib/strcmp.rb",
    "spec/jaro_winkler_spec.rb",
    "spec/spec_helper.rb",
    "strcmp.gemspec"
  ]
  s.homepage = "http://github.com/Rostepher/strcmp"
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.25"
  s.summary = "A gem to compare strings and compute a distance or other metric between two strings."

  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      # RubyGems that distinguishes runtime from development dependencies
      s.add_runtime_dependency(%q<ffi>, ["~> 1.1.5"])
      s.add_development_dependency(%q<rspec>, [">= 0"])
      s.add_development_dependency(%q<jeweler>, [">= 0"])
    else
      s.add_dependency(%q<ffi>, ["~> 1.1.5"])
      s.add_dependency(%q<rspec>, [">= 0"])
      s.add_dependency(%q<jeweler>, [">= 0"])
    end
  else
    s.add_dependency(%q<ffi>, ["~> 1.1.5"])
    s.add_dependency(%q<rspec>, [">= 0"])
    s.add_dependency(%q<jeweler>, [">= 0"])
  end
end
|
62
|
+
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: strcmp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ross Bayer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-05 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: ffi
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.1.5
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.5
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: ffi
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: jeweler
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A simple ruby gem that is used to compute how similar two strings are
|
79
|
+
to one another. It includes a fast C implementation of the jaro-winkler distance
|
80
|
+
algorith and soon the levenshtein distance algorithm. More metrics on the way.
|
81
|
+
email: rostepher.dev@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions:
|
84
|
+
- ext/strcmp/extconf.rb
|
85
|
+
extra_rdoc_files:
|
86
|
+
- README.md
|
87
|
+
files:
|
88
|
+
- Gemfile
|
89
|
+
- Gemfile.lock
|
90
|
+
- README.md
|
91
|
+
- Rakefile
|
92
|
+
- VERSION
|
93
|
+
- ext/strcmp/.gitignore
|
94
|
+
- ext/strcmp/extconf.rb
|
95
|
+
- ext/strcmp/jaro_winkler
|
96
|
+
- ext/strcmp/jaro_winkler.c
|
97
|
+
- ext/strcmp/jaro_winkler.h
|
98
|
+
- ext/strcmp/test.c
|
99
|
+
- lib/strcmp.rb
|
100
|
+
- spec/jaro_winkler_spec.rb
|
101
|
+
- spec/spec_helper.rb
|
102
|
+
- strcmp.gemspec
|
103
|
+
homepage: http://github.com/Rostepher/strcmp
|
104
|
+
licenses: []
|
105
|
+
post_install_message:
|
106
|
+
rdoc_options: []
|
107
|
+
require_paths:
|
108
|
+
- lib
|
109
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
none: false
|
111
|
+
requirements:
|
112
|
+
- - ! '>='
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '0'
|
115
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
requirements: []
|
122
|
+
rubyforge_project:
|
123
|
+
rubygems_version: 1.8.25
|
124
|
+
signing_key:
|
125
|
+
specification_version: 3
|
126
|
+
summary: A gem to compare strings and compute a distance or other metric between two
|
127
|
+
strings.
|
128
|
+
test_files: []
|