strcmp 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# Fetch gems over HTTPS. The `:rubygems` shorthand is deprecated by Bundler
# because it points at the plain-HTTP index.
source 'https://rubygems.org'

# Runtime dependency (pessimistically pinned to the 1.1.x series).
gem 'ffi', '~> 1.1.5'

# Tooling that is only needed to run the specs and to build the gem.
group :test do
  gem 'rspec', '2.13.0'
  gem 'jeweler'
end
@@ -0,0 +1,29 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.2.1)
5
+ ffi (1.1.5)
6
+ git (1.2.5)
7
+ jeweler (1.8.4)
8
+ bundler (~> 1.0)
9
+ git (>= 1.2.5)
10
+ rake
11
+ rdoc
12
+ rake (10.0.3)
13
+ rdoc (3.9.5)
14
+ rspec (2.13.0)
15
+ rspec-core (~> 2.13.0)
16
+ rspec-expectations (~> 2.13.0)
17
+ rspec-mocks (~> 2.13.0)
18
+ rspec-core (2.13.0)
19
+ rspec-expectations (2.13.0)
20
+ diff-lcs (>= 1.1.3, < 2.0)
21
+ rspec-mocks (2.13.0)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ ffi (~> 1.1.5)
28
+ jeweler
29
+ rspec (= 2.13.0)
@@ -0,0 +1,27 @@
1
+ strcmp
2
+ ======
3
+
4
+ Strcmp is a simple ruby gem that is used to compute how similar two strings are
5
+ to one another. It includes a fast C implementation of the
6
+ [jaro-winkler](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance algorithm
7
+ and soon the levenshtein distance algorithm. More metrics on the way. I claim no
8
+ ownership of the algorithm.
9
+
10
+ Installing
11
+ ==========
12
+
13
+ gem install strcmp
14
+
15
+ Usage
16
+ =====
17
+
18
+ Using the gem is simple. First add the require statement to the file you wish to
19
+ use it in.
20
+
21
+ require 'strcmp'
22
+
23
+ To use the gem call:
24
+
25
+ StrCmp.jaro_winkler("martha", "marhta") => 0.961111
26
+ StrCmp.jaro("martha", "marhta") => 0.944444
27
+
@@ -0,0 +1,18 @@
1
# Packaging tasks. Jeweler is optional: when it is not installed the
# LoadError is rescued and a hint is printed instead of aborting rake.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strcmp"
    gemspec.summary = "A gem to compare strings and compute a distance or other metric between two strings."
    gemspec.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorithm and soon the levenshtein distance algorithm. More metrics on the way."
    gemspec.email = "rostepher.dev@gmail.com"
    gemspec.homepage = "http://github.com/Rostepher/strcmp"
    gemspec.authors = ["Ross Bayer"]
    # The runtime dependency on ffi is deliberately not repeated here: the
    # generated gemspec already carries the Gemfile's `ffi ~> 1.1.5`, and
    # declaring it again produced a second, unconstrained `ffi >= 0` entry.
    # NOTE(review): relies on Jeweler importing Gemfile dependencies - confirm
    # by running `rake gemspec` and checking that ffi is still listed once.
    gemspec.add_development_dependency "rspec"
    gemspec.add_development_dependency "jeweler"
  end

  # Adds the tasks used to publish the built gem.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler does not seem to be here... Try installing it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,4 @@
1
+ *.out
2
+ *.bundle
3
+ *.o
4
+ Makefile
@@ -0,0 +1,3 @@
1
# Generates the Makefile that compiles the C sources in this directory into
# the "strcmp" library.
require 'mkmf'

# The C sources declare variables inside `for` statements and use `//`
# comments, so a modern language standard is requested ahead of the flags
# mkmf already collected.
language_standard = "-std=c11 "
$CFLAGS = language_standard + $CFLAGS

create_makefile("strcmp")
Binary file
@@ -0,0 +1,99 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <ctype.h>
4
+
5
/* Integer booleans used for the match-flag arrays. */
#define TRUE 1
#define FALSE 0

/* Classic min/max macros; note that each argument may be evaluated twice. */
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

/*
 * Case-insensitive character comparison. Each argument is evaluated exactly
 * once. The casts to unsigned char are required: passing a negative value
 * (other than EOF) to tolower() is undefined behaviour, and plain char is
 * signed on many platforms, so bytes >= 0x80 would otherwise be negative.
 */
#define equal(a, b) (tolower((unsigned char) (a)) == tolower((unsigned char) (b)))
#define not_equal(a, b) (tolower((unsigned char) (a)) != tolower((unsigned char) (b)))
12
+
13
/*
 * Case-insensitive comparison of two bytes. The casts to unsigned char keep
 * tolower() well defined for bytes >= 0x80 when plain char is signed.
 */
static int jaro_chars_match(char a, char b) {
    return tolower((unsigned char) a) == tolower((unsigned char) b);
}

/*
 * Computes the Jaro similarity of two NUL-terminated strings, comparing
 * characters case-insensitively.
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one is empty or
 * when no characters match, and otherwise
 *   (m / len(foo) + m / len(bar) + (m - t) / m) / 3
 * where m is the number of matching characters and t is half the number of
 * matched characters that appear in a different order.
 *
 * There is no error channel in the return type, so 0.0 is also returned when
 * the scratch buffers cannot be allocated.
 */
double jaro_distance(char *foo, char *bar) {
    // length of the strings, stops the repeated use of strlen
    int foo_len = (int) strlen(foo);
    int bar_len = (int) strlen(bar);

    // if both strings are empty return 1
    // if only one of the strings is empty return 0
    if (foo_len == 0) return bar_len == 0 ? 1.0 : 0.0;
    if (bar_len == 0) return 0.0;

    // max distance between two chars to be considered matching;
    // floor() is omitted due to integer division rules
    int longest = foo_len > bar_len ? foo_len : bar_len;
    int match_distance = longest / 2 - 1;
    // a longest length of 1 would give -1 and an empty search window, which
    // made identical one-character strings score 0.0; clamp to 0 instead
    if (match_distance < 0) match_distance = 0;

    // flag arrays that record whether the char at that index has a match
    int *foo_matches = calloc(foo_len, sizeof(int));
    int *bar_matches = calloc(bar_len, sizeof(int));
    if (foo_matches == NULL || bar_matches == NULL) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // number of matches and transpositions
    double matches = 0.0;
    double transpositions = 0.0;

    // find the matches
    for (int i = 0; i < foo_len; i++) {
        // the search window in bar is centred on i and clipped to bar
        int start = i > match_distance ? i - match_distance : 0;
        int end = i + match_distance + 1;
        if (end > bar_len) end = bar_len;

        // claim the first unclaimed, equal character inside the window
        for (int k = start; k < end; k++) {
            // this char of bar is already paired with an earlier char of foo
            if (bar_matches[k]) continue;
            // the characters differ
            if (!jaro_chars_match(foo[i], bar[k])) continue;
            // otherwise record the pair and move on to the next char of foo
            foo_matches[i] = 1;
            bar_matches[k] = 1;
            matches++;
            break;
        }
    }

    // if there are no matches return 0
    if (matches == 0) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // count transpositions: walk the matched chars of both strings in order
    // and count the positions at which they disagree
    int k = 0;
    for (int i = 0; i < foo_len; i++) {
        // skip chars of foo that were not matched
        if (!foo_matches[i]) continue;
        // advance k to the next matched char of bar; both arrays hold the
        // same number of set flags, so k stays within bar
        while (!bar_matches[k]) k++;
        if (!jaro_chars_match(foo[i], bar[k])) transpositions++;
        k++;
    }

    // each transposed pair was counted once per member, so halve the count
    transpositions /= 2.0;

    free(foo_matches);
    free(bar_matches);

    // return the jaro distance
    return ((matches / foo_len) +
            (matches / bar_len) +
            ((matches - transpositions) / matches)) / 3.0;
}
87
+
88
/*
 * Computes the Jaro-Winkler similarity of two NUL-terminated strings: the
 * Jaro similarity boosted for strings that share a common prefix.
 *
 * At most the first 3 characters count towards the prefix and they are
 * compared case-insensitively. The scaling factor is fixed at 0.1.
 */
double jaro_winkler_distance(char *foo, char *bar) {
    // compute the jaro distance
    double jaro = jaro_distance(foo, bar);

    // length of the common prefix of the two strings, capped at 3.
    // The NUL checks stop the scan at the end of the shorter string; without
    // them two equal strings shorter than two characters were read past
    // their terminators.
    int prefix_length = 0;
    while (prefix_length < 3 &&
           foo[prefix_length] != '\0' &&
           bar[prefix_length] != '\0' &&
           // unsigned char keeps tolower() defined for bytes >= 0x80
           tolower((unsigned char) foo[prefix_length]) ==
               tolower((unsigned char) bar[prefix_length])) {
        prefix_length++;
    }

    // 0.1 is the default scaling factor
    return jaro + prefix_length * 0.1 * (1 - jaro);
}
@@ -0,0 +1,2 @@
1
#ifndef STRCMP_JARO_WINKLER_H
#define STRCMP_JARO_WINKLER_H

/*
 * String similarity functions implemented in jaro_winkler.c. Both take two
 * NUL-terminated strings and return a similarity score as a double.
 */

/* Jaro similarity of the two strings. */
double jaro_distance(char *, char *);

/* Jaro similarity boosted for a shared prefix (Jaro-Winkler). */
double jaro_winkler_distance(char *, char *);

#endif /* STRCMP_JARO_WINKLER_H */
@@ -0,0 +1,15 @@
1
+ #include <stdio.h>
2
+ #include "jaro_winkler.h"
3
+
4
/*
 * Manual smoke test: prints the Jaro distance of a few fixed string pairs,
 * in both argument orders, so the two lines of each pair can be compared.
 */
int main(int argc, char *argv[]) {
    // the command line is not used; the inputs are the fixed pairs below
    (void) argc;
    (void) argv;

    char *str1[] = { "", "foobar", "martha", "dwayne", "dixon" };
    char *str2[] = { "", "", "marhta", "duane", "dicksonx" };

    // derive the number of pairs from the table so the loop bound cannot
    // drift out of step when pairs are added or removed
    int pair_count = (int) (sizeof(str1) / sizeof(str1[0]));

    for (int i = 0; i < pair_count; i++) {
        double dist_1 = jaro_distance(str1[i], str2[i]);
        double dist_2 = jaro_distance(str2[i], str1[i]);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", str1[i], str2[i], dist_1);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", str2[i], str1[i], dist_2);
    }

    return 0;
}
@@ -0,0 +1,36 @@
1
+ require 'ffi'
2
+
3
# String similarity metrics backed by the C functions in ext/strcmp, which
# are bound through FFI.
module StrCmp
  class << self
    extend FFI::Library

    # Hand FFI every candidate file name of the compiled library, in the
    # order they should be tried.
    base_path = File.dirname(__FILE__) + "/../ext/strcmp/strcmp"
    ffi_lib(['.bundle', '.so', '.dylib', ''].map { |suffix| base_path + suffix })

    # Returns the value of the C function jaro_distance for the two strings.
    # Raises TypeError unless both arguments are Strings.
    def jaro(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro(string1, string2)
    end

    # Returns the value of the C function jaro_winkler_distance for the two
    # strings. Raises TypeError unless both arguments are Strings.
    def jaro_winkler(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro_winkler(string1, string2)
    end

    private

    # bind the C functions under ffi_-prefixed names
    attach_function :ffi_jaro, :jaro_distance, [:string, :string], :double
    attach_function :ffi_jaro_winkler, :jaro_winkler_distance, [:string, :string], :double

    # Raises TypeError when the given object is not a String (nil included).
    def validate(object)
      return if object.kind_of?(String)

      raise TypeError, "Wrong argument type #{object.class} (expected String)"
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
# Specs for StrCmp.jaro_winkler, the public entry point defined in
# lib/strcmp.rb (there is no JaroWinkler constant in this gem).
describe StrCmp do
  # [first string, second string, expected Jaro-Winkler value rounded to 3 places]
  test_cases = [
    ["", "", 1.0],
    ["foobar", "", 0.0],
    ["martha", "marhta", 0.961],
    ["dwayne", "duane", 0.84],
    ["dixon", "dicksonx", 0.813]
  ]

  test_cases.each do |str1, str2, distance|
    it "should calculate the distance #{distance} from #{str1} and #{str2}" do
      # a bare `==` is evaluated and discarded; only an expectation can fail
      # the example. Both argument orders are checked.
      expect(StrCmp.jaro_winkler(str1, str2).round(3)).to eq(distance)
      expect(StrCmp.jaro_winkler(str2, str1).round(3)).to eq(distance)
    end
  end

  it "should raise an error if either argument is nil" do
    expect { StrCmp.jaro_winkler("", nil) }.to raise_error(TypeError)
    expect { StrCmp.jaro_winkler(nil, "") }.to raise_error(TypeError)
  end

  it "should raise an error if either argument is not a string" do
    expect { StrCmp.jaro_winkler("foo", /bar/) }.to raise_error(TypeError)
    expect { StrCmp.jaro_winkler("42", 42) }.to raise_error(TypeError)
    expect { StrCmp.jaro_winkler("Object", Object.new) }.to raise_error(TypeError)
  end
end
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + "/../lib/strcmp"
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "strcmp"
8
+ s.version = "1.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Ross Bayer"]
12
+ s.date = "2013-03-05"
13
+ s.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorith and soon the levenshtein distance algorithm. More metrics on the way."
14
+ s.email = "rostepher.dev@gmail.com"
15
+ s.extensions = ["ext/strcmp/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ "Gemfile",
21
+ "Gemfile.lock",
22
+ "README.md",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "ext/strcmp/.gitignore",
26
+ "ext/strcmp/extconf.rb",
27
+ "ext/strcmp/jaro_winkler",
28
+ "ext/strcmp/jaro_winkler.c",
29
+ "ext/strcmp/jaro_winkler.h",
30
+ "ext/strcmp/test.c",
31
+ "lib/strcmp.rb",
32
+ "spec/jaro_winkler_spec.rb",
33
+ "spec/spec_helper.rb",
34
+ "strcmp.gemspec"
35
+ ]
36
+ s.homepage = "http://github.com/Rostepher/strcmp"
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = "1.8.25"
39
+ s.summary = "A gem to compare strings and compute a distance or other metric between two strings."
40
+
41
+ if s.respond_to? :specification_version then
42
+ s.specification_version = 3
43
+
44
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
45
+ s.add_runtime_dependency(%q<ffi>, ["~> 1.1.5"])
46
+ s.add_runtime_dependency(%q<ffi>, [">= 0"])
47
+ s.add_development_dependency(%q<rspec>, [">= 0"])
48
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
49
+ else
50
+ s.add_dependency(%q<ffi>, ["~> 1.1.5"])
51
+ s.add_dependency(%q<ffi>, [">= 0"])
52
+ s.add_dependency(%q<rspec>, [">= 0"])
53
+ s.add_dependency(%q<jeweler>, [">= 0"])
54
+ end
55
+ else
56
+ s.add_dependency(%q<ffi>, ["~> 1.1.5"])
57
+ s.add_dependency(%q<ffi>, [">= 0"])
58
+ s.add_dependency(%q<rspec>, [">= 0"])
59
+ s.add_dependency(%q<jeweler>, [">= 0"])
60
+ end
61
+ end
62
+
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: strcmp
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ross Bayer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ffi
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.1.5
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.5
30
+ - !ruby/object:Gem::Dependency
31
+ name: ffi
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: jeweler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: A simple ruby gem that is used to compute how similar two strings are
79
+ to one another. It includes a fast C implementation of the jaro-winkler distance
80
+ algorith and soon the levenshtein distance algorithm. More metrics on the way.
81
+ email: rostepher.dev@gmail.com
82
+ executables: []
83
+ extensions:
84
+ - ext/strcmp/extconf.rb
85
+ extra_rdoc_files:
86
+ - README.md
87
+ files:
88
+ - Gemfile
89
+ - Gemfile.lock
90
+ - README.md
91
+ - Rakefile
92
+ - VERSION
93
+ - ext/strcmp/.gitignore
94
+ - ext/strcmp/extconf.rb
95
+ - ext/strcmp/jaro_winkler
96
+ - ext/strcmp/jaro_winkler.c
97
+ - ext/strcmp/jaro_winkler.h
98
+ - ext/strcmp/test.c
99
+ - lib/strcmp.rb
100
+ - spec/jaro_winkler_spec.rb
101
+ - spec/spec_helper.rb
102
+ - strcmp.gemspec
103
+ homepage: http://github.com/Rostepher/strcmp
104
+ licenses: []
105
+ post_install_message:
106
+ rdoc_options: []
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ! '>='
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 1.8.25
124
+ signing_key:
125
+ specification_version: 3
126
+ summary: A gem to compare strings and compute a distance or other metric between two
127
+ strings.
128
+ test_files: []