strcmp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# Fetch gems over HTTPS. The `:rubygems` symbol shorthand is deprecated by
# Bundler because it resolves to the plain-HTTP endpoint.
source 'https://rubygems.org'

# Runtime dependency: the compiled C library is loaded through FFI.
gem 'ffi', '~> 1.1.5'

# Tooling needed only to run the specs and package the gem.
group :test do
  gem 'rspec', '2.13.0'
  gem 'jeweler'
end
@@ -0,0 +1,29 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.2.1)
5
+ ffi (1.1.5)
6
+ git (1.2.5)
7
+ jeweler (1.8.4)
8
+ bundler (~> 1.0)
9
+ git (>= 1.2.5)
10
+ rake
11
+ rdoc
12
+ rake (10.0.3)
13
+ rdoc (3.9.5)
14
+ rspec (2.13.0)
15
+ rspec-core (~> 2.13.0)
16
+ rspec-expectations (~> 2.13.0)
17
+ rspec-mocks (~> 2.13.0)
18
+ rspec-core (2.13.0)
19
+ rspec-expectations (2.13.0)
20
+ diff-lcs (>= 1.1.3, < 2.0)
21
+ rspec-mocks (2.13.0)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ ffi (~> 1.1.5)
28
+ jeweler
29
+ rspec (= 2.13.0)
@@ -0,0 +1,27 @@
1
+ strcmp
2
+ ======
3
+
4
+ Strcmp is a simple ruby gem that is used to compute how similar two strings are
5
+ to one another. It includes a fast C implementation of the
6
+ [jaro-winkler](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) distance algorithm
7
+ and soon the levenshtein distance algorithm. More metrics on the way. I claim no
8
+ ownership of the algorithm.
9
+
10
+ Installing
11
+ ==========
12
+
13
+ gem install strcmp
14
+
15
+ Usage
16
+ =====
17
+
18
+ Using the gem is simple. First add the require statement to the file you wish to
19
+ use it in.
20
+
21
+ require 'strcmp'
22
+
23
+ To use the gem call:
24
+
25
+ StrCmp.jaro_winkler("martha", "marhta") => 0.961111
26
+ StrCmp.jaro("martha", "marhta") => 0.944444
27
+
@@ -0,0 +1,18 @@
1
# Gem packaging tasks. Jeweler is only needed when building or releasing the
# gem, so a missing installation prints a hint instead of aborting rake.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strcmp"
    gemspec.summary = "A gem to compare strings and compute a distance or other metric between two strings."
    gemspec.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorithm and soon the levenshtein distance algorithm. More metrics on the way."
    gemspec.email = "rostepher.dev@gmail.com"
    gemspec.homepage = "http://github.com/Rostepher/strcmp"
    gemspec.authors = ["Ross Bayer"]
    # The ffi runtime dependency (~> 1.1.5) is taken from the Gemfile.
    # Declaring it again here would emit a second, unconstrained ffi
    # dependency in the generated gemspec, which RubyGems validation rejects.
    # rspec and jeweler live in the Gemfile's :test group, so they are
    # declared explicitly as development dependencies.
    gemspec.add_development_dependency "rspec"
    gemspec.add_development_dependency "jeweler"
  end

  # Adds the tasks used to release the built gem.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler does not seem to be here... Try installing it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,4 @@
1
+ *.out
2
+ *.bundle
3
+ *.o
4
+ Makefile
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
# Prepend the C11 language standard to the compiler flags prepared by mkmf,
# then generate the Makefile that builds the "strcmp" library.
$CFLAGS = "-std=c11 #{$CFLAGS}"
create_makefile("strcmp")
Binary file
@@ -0,0 +1,99 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <ctype.h>
4
+
5
+ #define TRUE 1
6
+ #define FALSE 0
7
+
8
+ #define max(a, b) ((a) > (b) ? (a) : (b))
9
+ #define min(a, b) ((a) < (b) ? (a) : (b))
10
+ #define equal(a, b) (tolower(a) == tolower(b))
11
+ #define not_equal(a, b) (tolower(a) != tolower(b))
12
+
13
// Case-insensitive inequality test for two bytes. The casts to unsigned char
// keep tolower() well-defined for bytes >= 0x80 on platforms where plain char
// is signed.
static int jaro_bytes_differ(char a, char b) {
    return tolower((unsigned char) a) != tolower((unsigned char) b);
}

// Computes the Jaro similarity of two NUL-terminated strings, comparing bytes
// case-insensitively.
//
// foo, bar: the strings to compare; neither may be NULL.
//
// Returns a value in [0.0, 1.0]: 1.0 when both strings are empty, 0.0 when
// exactly one of them is empty or when no characters match. 0.0 is also
// returned if the scratch buffers cannot be allocated.
double jaro_distance(char *foo, char *bar) {
    // cache the lengths so strlen is not called repeatedly
    int foo_len = (int) strlen(foo);
    int bar_len = (int) strlen(bar);

    // two empty strings are identical; an empty and a non-empty string share
    // nothing
    if (foo_len == 0) return bar_len == 0 ? 1.0 : 0.0;
    if (bar_len == 0) return 0.0;

    // max distance between two chars to be considered matching; integer
    // division already floors. Clamp at zero: when the longer string has a
    // single character the formula yields -1, which would produce an empty
    // search window and score identical one-character strings as 0.
    int longest = foo_len > bar_len ? foo_len : bar_len;
    int match_distance = longest / 2 - 1;
    if (match_distance < 0) match_distance = 0;

    // flags recording which characters of each string already have a match
    int *foo_matches = calloc(foo_len, sizeof(int));
    int *bar_matches = calloc(bar_len, sizeof(int));
    if (foo_matches == NULL || bar_matches == NULL) {
        // free(NULL) is a no-op, so both calls are safe here
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // number of matches and transpositions
    double matches = 0.0;
    double transpositions = 0.0;

    // find the matches
    for (int i = 0; i < foo_len; i++) {
        // window of bar positions close enough to i to count as a match
        int start = i > match_distance ? i - match_distance : 0;
        int end = i + match_distance + 1;
        if (end > bar_len) end = bar_len;

        for (int k = start; k < end; k++) {
            // each character of bar may be matched only once
            if (bar_matches[k]) continue;
            if (jaro_bytes_differ(foo[i], bar[k])) continue;
            // first unmatched equal character in the window wins
            foo_matches[i] = 1;
            bar_matches[k] = 1;
            matches++;
            break;
        }
    }

    // if there are no matches return 0
    if (matches == 0) {
        free(foo_matches);
        free(bar_matches);
        return 0.0;
    }

    // count transpositions: walk the matched characters of both strings in
    // order and count the positions where they disagree. Both strings have the
    // same number of matched characters, so k never runs past bar.
    int k = 0;
    for (int i = 0; i < foo_len; i++) {
        if (!foo_matches[i]) continue;
        // advance to the next matched character of bar
        while (!bar_matches[k]) k++;
        if (jaro_bytes_differ(foo[i], bar[k])) transpositions++;
        k++;
    }

    // each transposed pair was counted once from each side, so halve it
    transpositions /= 2.0;

    free(foo_matches);
    free(bar_matches);

    // return the jaro distance
    return ((matches / foo_len) +
            (matches / bar_len) +
            ((matches - transpositions) / matches)) / 3.0;
}
87
+
88
// Computes the Jaro-Winkler similarity of two NUL-terminated strings: the Jaro
// similarity boosted for strings that share a common prefix. Bytes are
// compared case-insensitively.
//
// foo, bar: the strings to compare; neither may be NULL.
//
// Returns jaro + prefix_length * 0.1 * (1 - jaro), where prefix_length is the
// number of leading characters the strings have in common, capped at 3.
double jaro_winkler_distance(char *foo, char *bar) {
    // compute the jaro distance
    double jaro = jaro_distance(foo, bar);

    // count the common leading characters, at most 3. Stop at the terminator
    // of either string so that the NUL byte is never counted as part of the
    // prefix and no byte beyond the end of a short string is read.
    int prefix_length = 0;
    while (prefix_length < 3 &&
           foo[prefix_length] != '\0' &&
           bar[prefix_length] != '\0' &&
           tolower((unsigned char) foo[prefix_length]) ==
               tolower((unsigned char) bar[prefix_length])) {
        prefix_length++;
    }

    // 0.1 is the default scaling factor
    return jaro + prefix_length * 0.1 * (1 - jaro);
}
@@ -0,0 +1,2 @@
1
#ifndef JARO_WINKLER_H
#define JARO_WINKLER_H

/* Jaro similarity of two NUL-terminated strings, in the range [0.0, 1.0]. */
double jaro_distance(char *, char *);

/* Jaro-Winkler similarity: the Jaro similarity boosted by a shared prefix. */
double jaro_winkler_distance(char *, char *);

#endif /* JARO_WINKLER_H */
@@ -0,0 +1,15 @@
1
+ #include <stdio.h>
2
+ #include "jaro_winkler.h"
3
+
4
// Smoke test: prints the Jaro similarity of each sample pair, evaluated in
// both argument orders so the two results can be compared by eye.
int main(int argc, char *argv[]) {
    char *left[]  = { "", "foobar", "martha", "dwayne", "dixon" };
    char *right[] = { "", "", "marhta", "duane", "dicksonx" };
    int pair_count = (int) (sizeof(left) / sizeof(left[0]));

    int idx = 0;
    while (idx < pair_count) {
        char *a = left[idx];
        char *b = right[idx];

        // compute both directions before printing anything
        double forward = jaro_distance(a, b);
        double backward = jaro_distance(b, a);

        printf("jaro_distance(\"%s\", \"%s\") => %f\n", a, b, forward);
        printf("jaro_distance(\"%s\", \"%s\") => %f\n", b, a, backward);
        idx++;
    }

    return 0;
}
@@ -0,0 +1,36 @@
1
+ require 'ffi'
2
+
3
# String similarity metrics backed by the compiled C library in ext/strcmp,
# which is bound through FFI.
module StrCmp
  class << self
    extend FFI::Library

    # The compiled library may carry any of several platform-specific
    # suffixes (or none); hand FFI every candidate, in this order.
    base_path = "#{File.dirname(__FILE__)}/../ext/strcmp/strcmp"
    ffi_lib(['.bundle', '.so', '.dylib', ''].map { |suffix| "#{base_path}#{suffix}" })

    # Jaro similarity of two strings.
    #
    # Raises TypeError unless both arguments are Strings.
    def jaro(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro(string1, string2)
    end

    # Jaro-Winkler similarity of two strings.
    #
    # Raises TypeError unless both arguments are Strings.
    def jaro_winkler(string1, string2)
      [string1, string2].each { |argument| validate(argument) }
      ffi_jaro_winkler(string1, string2)
    end

    private
    # bind the C entry points under ffi_-prefixed names
    attach_function :ffi_jaro, :jaro_distance, [:string, :string], :double
    attach_function :ffi_jaro_winkler, :jaro_winkler_distance, [:string, :string], :double

    # Raises TypeError when the given object is not a String.
    def validate(object)
      return if object.kind_of?(String)

      raise TypeError, "Wrong argument type #{object.class} (expected String)"
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
# Exercises the public StrCmp.jaro_winkler API. Expected values are the
# Jaro-Winkler similarities rounded to three decimal places.
describe StrCmp do
  test_cases = [
    ["", "", 1.0],
    ["foobar", "", 0.0],
    ["martha", "marhta", 0.961],
    ["dwayne", "duane", 0.84],
    ["dixon", "dicksonx", 0.813]
  ]

  test_cases.each do |str1, str2, distance|
    it "should calculate the distance #{distance} from #{str1} and #{str2}" do
      # a bare `==` asserts nothing; wrap each comparison in an expectation,
      # and check both argument orders
      expect(StrCmp.jaro_winkler(str1, str2).round(3)).to eq(distance)
      expect(StrCmp.jaro_winkler(str2, str1).round(3)).to eq(distance)
    end
  end

  it "should raise an error if either argument is nil" do
    expect { StrCmp.jaro_winkler("", nil) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler(nil, "") }.to raise_error TypeError
  end

  it "should raise an error if either argument is not a string" do
    expect { StrCmp.jaro_winkler("foo", /bar/) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("42", 42) }.to raise_error TypeError
    expect { StrCmp.jaro_winkler("Object", Object.new) }.to raise_error TypeError
  end
end
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + "/../lib/strcmp"
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for strcmp 1.0.0. Each dependency is declared exactly once
# per branch; the branches only differ in which dependency API the running
# RubyGems version offers.
Gem::Specification.new do |s|
  s.name = "strcmp"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ross Bayer"]
  s.date = "2013-03-05"
  s.description = "A simple ruby gem that is used to compute how similar two strings are to one another. It includes a fast C implementation of the jaro-winkler distance algorithm and soon the levenshtein distance algorithm. More metrics on the way."
  s.email = "rostepher.dev@gmail.com"
  s.extensions = ["ext/strcmp/extconf.rb"]
  s.extra_rdoc_files = [
    "README.md"
  ]
  s.files = [
    "Gemfile",
    "Gemfile.lock",
    "README.md",
    "Rakefile",
    "VERSION",
    "ext/strcmp/.gitignore",
    "ext/strcmp/extconf.rb",
    "ext/strcmp/jaro_winkler",
    "ext/strcmp/jaro_winkler.c",
    "ext/strcmp/jaro_winkler.h",
    "ext/strcmp/test.c",
    "lib/strcmp.rb",
    "spec/jaro_winkler_spec.rb",
    "spec/spec_helper.rb",
    "strcmp.gemspec"
  ]
  s.homepage = "http://github.com/Rostepher/strcmp"
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.25"
  s.summary = "A gem to compare strings and compute a distance or other metric between two strings."

  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      # ffi is listed once: a second ">= 0" entry for the same gem is a
      # duplicate dependency, which RubyGems validation rejects
      s.add_runtime_dependency(%q<ffi>, ["~> 1.1.5"])
      s.add_development_dependency(%q<rspec>, [">= 0"])
      s.add_development_dependency(%q<jeweler>, [">= 0"])
    else
      # RubyGems before 1.2.0 has no runtime/development distinction
      s.add_dependency(%q<ffi>, ["~> 1.1.5"])
      s.add_dependency(%q<rspec>, [">= 0"])
      s.add_dependency(%q<jeweler>, [">= 0"])
    end
  else
    s.add_dependency(%q<ffi>, ["~> 1.1.5"])
    s.add_dependency(%q<rspec>, [">= 0"])
    s.add_dependency(%q<jeweler>, [">= 0"])
  end
end
62
+
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: strcmp
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ross Bayer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ffi
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.1.5
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.5
30
+ - !ruby/object:Gem::Dependency
31
+ name: ffi
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: jeweler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: A simple ruby gem that is used to compute how similar two strings are
79
+ to one another. It includes a fast C implementation of the jaro-winkler distance
80
+ algorith and soon the levenshtein distance algorithm. More metrics on the way.
81
+ email: rostepher.dev@gmail.com
82
+ executables: []
83
+ extensions:
84
+ - ext/strcmp/extconf.rb
85
+ extra_rdoc_files:
86
+ - README.md
87
+ files:
88
+ - Gemfile
89
+ - Gemfile.lock
90
+ - README.md
91
+ - Rakefile
92
+ - VERSION
93
+ - ext/strcmp/.gitignore
94
+ - ext/strcmp/extconf.rb
95
+ - ext/strcmp/jaro_winkler
96
+ - ext/strcmp/jaro_winkler.c
97
+ - ext/strcmp/jaro_winkler.h
98
+ - ext/strcmp/test.c
99
+ - lib/strcmp.rb
100
+ - spec/jaro_winkler_spec.rb
101
+ - spec/spec_helper.rb
102
+ - strcmp.gemspec
103
+ homepage: http://github.com/Rostepher/strcmp
104
+ licenses: []
105
+ post_install_message:
106
+ rdoc_options: []
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ! '>='
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 1.8.25
124
+ signing_key:
125
+ specification_version: 3
126
+ summary: A gem to compare strings and compute a distance or other metric between two
127
+ strings.
128
+ test_files: []