levenshtein_str 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f32f298f472987c392ffb24891f0acf74e85217d14d9eb43e4f7d38c19a63e73
4
+ data.tar.gz: 34ccf02e8fb61a6c47094fc082f2d278887aca57ade4d41e9276735896155615
5
+ SHA512:
6
+ metadata.gz: d7f605c8aa11767b27ac77bd7afa89d9581a2541fb9338a68545b30bceb0c2bbe49bcb197f4d13a8365a7a0c1cade2d32b624b1e5f14f646e10fb30e415e4852
7
+ data.tar.gz: 2ab3c4a7d014d145091ee005322b8523bc4cfc9c5b9812b0bc6b2570b1942bcb3db9841c61506513edc44322fd3d464b7d06d1840179c578b82a956ad11c04a5
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,19 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ levenshtein_str (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ benchmark-ips (2.8.2)
10
+
11
+ PLATFORMS
12
+ ruby
13
+
14
+ DEPENDENCIES
15
+ benchmark-ips
16
+ levenshtein_str!
17
+
18
+ BUNDLED WITH
19
+ 2.1.4
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # String#levenshtein
2
+
3
+ A performant Ruby gem for getting the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) between two strings, with the legwork done in Crystal.
4
+
5
+ ## Installation:
6
+
7
+ ### System Requirements
8
+
9
+ - crystal
10
+ - macOS (untested on Linux)
11
+
12
+ ```ruby
13
+ # Gemfile
14
+ gem "levenshtein_str"
15
+ ```
16
+
17
+ then
18
+
19
+ `bundle install`
20
+
21
+ ## Usage:
22
+
23
+ ```ruby
24
+ # app.rb
25
+ require 'levenshtein_str'
26
+
27
+ puts "hello".levenshtein("world")
28
+ ```
29
+
30
+ then
31
+
32
+ `ruby app.rb # => 4`
33
+
34
+ ## Benchmarks
35
+
36
+ See the `benchmark` script for the benchmark code and the full results.
37
+
38
+ `./benchmark`
39
+
40
+ - "" and "" Same-ish, sometimes Ruby fractionally quicker (no type conversion overheads)
41
+ - "abd" and "abc" Crystal ~2x quicker
42
+ - "abc" and "abcde" Crystal ~2x quicker
43
+ - "abcdefghi" and "0123456789" Crystal ~2.25x quicker
44
+ - [whole alphabet] vs "012345" Crystal ~2.37x quicker
45
+
46
+ ## Template
47
+
48
+ Based on [this](https://github.com/johansenja/crystal_gem_template) template for writing Ruby gems in Crystal.
49
+
50
+ ## Contributions && Testing
51
+
52
+ Testing for now can just be done using Ruby:
53
+
54
+ `rspec`
55
+
56
+ Contributions are welcome.
57
+
58
+ ## License
59
+
60
+ [MIT](https://rem.mit-license.org)
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require "bundler/gem_tasks"
2
+
3
# Builds the Crystal extension: installs shards, removes stale binaries, then
# runs `make` inside ext/levenshtein_str.
desc "compile the Crystal native extensions"
task :compile do
  puts "compiling native extensions"
  # Every step is chained with `&&`. A single `&` after `make` would send the
  # build to the background instead of sequencing it, so its exit status would
  # never gate the rest of the command.
  `cd ext/levenshtein_str && shards && make clean && make && cd ../../`
end

# Removes the compiled bundle(s) through the extension's `make clean` target.
desc "cleaning up compiled binaries"
task :clean do
  puts "cleaning up extensions"
  `cd ext/levenshtein_str && make clean && cd ../../`
end
data/benchmark ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__) + "/lib"
3
+
4
+ # Aiming to mimic how the Crystal looks fairly closely, with a wrapper method,
5
+ # and then a separate method for the logic
6
+
7
# Pure-Ruby reference implementation of the Levenshtein (edit) distance. It is
# deliberately kept close in shape to the Crystal version (a full matrix, a
# wrapper method plus a separate logic method) so the benchmark compares like
# with like.
#
# @param first [String]
# @param second [String]
# @return [Integer] the minimum number of single-character insertions,
#   deletions and substitutions needed to turn one string into the other
def _levenshtein_rb(first, second)
  len1 = first.size
  len2 = second.size
  return len1 if len2.zero?
  return len2 if len1.zero?

  # matrix[i][j] is the distance between the first i characters of `second`
  # and the first j characters of `first`. Row 0 and column 0 are the
  # distances from an empty prefix, i.e. simply the prefix length.
  matrix = [(0..len1).to_a]
  (1..len2).each { |i| matrix << [i] + [0] * len1 }

  (1..len2).each do |i|
    (1..len1).each do |j|
      matrix[i][j] =
        if first[j - 1] == second[i - 1]
          # Matching characters cost nothing extra.
          matrix[i - 1][j - 1]
        else
          # Cheapest of the three neighbouring cells, plus one edit.
          [matrix[i - 1][j], matrix[i][j - 1], matrix[i - 1][j - 1]].min + 1
        end
    end
  end

  matrix.last.last
end

class String
  # Levenshtein distance between this string and +second+, computed in pure Ruby.
  #
  # @param second [String]
  # @return [Integer]
  def levenshtein_rb(second)
    _levenshtein_rb(self, second)
  end
end
38
+
39
# Compiles the Crystal extension, then runs one benchmark-ips comparison per
# input pair: the pure-Ruby String#levenshtein_rb against the Crystal-backed
# String#levenshtein.
def bm
  load "Rakefile"

  Rake::Task["compile"].invoke

  # Required only after compiling: levenshtein_str loads the freshly built bundle.
  require "benchmark/ips"
  require "levenshtein_str"

  # Each row: [Ruby report label, Crystal report label, receiver, argument].
  # Labels are kept exactly as they appear in the recorded RESULTS below.
  cases = [
    ["Blank strings (Ruby)", "Blank string (Crystal)", "", ""], # blank strings
    ["abd vs abc (Ruby)", "abd vs abc (Crystal)", "abd", "abc"], # 1 char diff
    ["abc vs abcde (Ruby)", "abc vs abcde (Crystal)", "abc", "abcde"], # short diff
    ["abcdefghi vs 0123456789 (Ruby)", "abcdefghi vs 0123456789 (Crystal)",
     "abcdefghi", "0123456789"], # medium diff
    ["[whole alphabet] vs 012345 (Ruby)", "[whole alphabet] vs 012345 (Crystal)",
     "abcdefghijklmnopqrstuvwxyz", "012345"], # long diff
  ]

  cases.each do |ruby_label, crystal_label, left, right|
    Benchmark.ips do |bmark|
      # The input strings are created once in the table above, so the timed
      # loops measure the distance computation rather than literal allocation.
      bmark.report(ruby_label) { 100_000.times { left.levenshtein_rb(right) } }
      bmark.report(crystal_label) { 100_000.times { left.levenshtein(right) } }
      bmark.compare!
    end
  end
end

begin
  bm
ensure
  # Guarded so that a failure before the Rakefile finished loading surfaces as
  # the original error instead of a NameError raised from this cleanup step.
  Rake::Task["clean"].invoke if defined?(Rake::Task) && Rake::Task.task_defined?("clean")
end
88
+
89
+ # RESULTS
90
+
91
+ # Warming up --------------------------------------
92
+ # Blank strings (Ruby) 3.000 i/100ms
93
+ # Blank string (Crystal)
94
+ # 3.000 i/100ms
95
+ # Calculating -------------------------------------
96
+ # Blank strings (Ruby) 48.982 (±16.3%) i/s - 237.000 in 5.004116s
97
+ # Blank string (Crystal)
98
+ # 47.140 (±14.8%) i/s - 231.000 in 5.030098s
99
+
100
+ # Comparison:
101
+ # Blank strings (Ruby): 49.0 i/s
102
+ # Blank string (Crystal): 47.1 i/s - same-ish: difference falls within error
103
+
104
+ # Warming up --------------------------------------
105
+ # abd vs abc (Ruby) 1.000 i/100ms
106
+ # abd vs abc (Crystal) 1.000 i/100ms
107
+ # Calculating -------------------------------------
108
+ # abd vs abc (Ruby) 1.367 (± 0.0%) i/s - 7.000 in 5.123125s
109
+ # abd vs abc (Crystal) 2.750 (± 0.0%) i/s - 14.000 in 5.101675s
110
+
111
+ # Comparison:
112
+ # abd vs abc (Crystal): 2.7 i/s
113
+ # abd vs abc (Ruby): 1.4 i/s - 2.01x (± 0.00) slower
114
+
115
+ # Warming up --------------------------------------
116
+ # abc vs abcde (Ruby) 1.000 i/100ms
117
+ # abc vs abcde (Crystal)
118
+ # 1.000 i/100ms
119
+ # Calculating -------------------------------------
120
+ # abc vs abcde (Ruby) 0.901 (± 0.0%) i/s - 5.000 in 5.547298s
121
+ # abc vs abcde (Crystal)
122
+ # 1.792 (± 0.0%) i/s - 9.000 in 5.023881s
123
+
124
+ # Comparison:
125
+ # abc vs abcde (Crystal): 1.8 i/s
126
+ # abc vs abcde (Ruby): 0.9 i/s - 1.99x (± 0.00) slower
127
+
128
+ # Warming up --------------------------------------
129
+ # abcdefghi vs 0123456789 (Ruby)
130
+ # 1.000 i/100ms
131
+ # abcdefghi vs 0123456789 (Crystal)
132
+ # 1.000 i/100ms
133
+ # Calculating -------------------------------------
134
+ # abcdefghi vs 0123456789 (Ruby)
135
+ # 0.196 (± 0.0%) i/s - 1.000 in 5.095124s
136
+ # abcdefghi vs 0123456789 (Crystal)
137
+ # 0.441 (± 0.0%) i/s - 3.000 in 6.804299s
138
+
139
+ # Comparison:
140
+ # abcdefghi vs 0123456789 (Crystal): 0.4 i/s
141
+ # abcdefghi vs 0123456789 (Ruby): 0.2 i/s - 2.25x (± 0.00) slower
142
+
143
+ # Warming up --------------------------------------
144
+ # [whole alphabet] vs 012345 (Ruby)
145
+ # 1.000 i/100ms
146
+ # [whole alphabet] vs 012345 (Crystal)
147
+ # 1.000 i/100ms
148
+ # Calculating -------------------------------------
149
+ # [whole alphabet] vs 012345 (Ruby)
150
+ # 0.123 (± 0.0%) i/s - 1.000 in 8.150069s
151
+ # [whole alphabet] vs 012345 (Crystal)
152
+ # 0.291 (± 0.0%) i/s - 2.000 in 6.886041s
153
+
154
+ # Comparison:
155
+ # [whole alphabet] vs 012345 (Crystal): 0.3 i/s
156
+ # [whole alphabet] vs 012345 (Ruby): 0.1 i/s - 2.37x (± 0.00) slower
@@ -0,0 +1,15 @@
1
CRYSTAL = crystal
TARGET = ../../lib/levenshtein_str.bundle

# None of these targets produces a file with its own name; declaring them
# phony keeps a stray file called e.g. `clean` or `shards` from making the
# target look up to date.
.PHONY: install all shards clean

install: all

all: shards $(TARGET)

# Runs the `shards` tool in this directory (shard.yml lives alongside).
shards:
	shards

# Builds the loadable bundle from the Crystal entry point.
# NOTE(review): `-undefined,dynamic_lookup` presumably defers the Ruby C API
# symbols to load time - confirm against the linker documentation.
$(TARGET): ./src/levenshtein_str.cr
	$(CRYSTAL) $< --link-flags "-dynamic -bundle -Wl,-undefined,dynamic_lookup" -o $(TARGET)

clean:
	rm -f ../../**/*.bundle*
@@ -0,0 +1,5 @@
1
require "mkmf"

# Stop the install early, with guidance, when no `crystal` executable can be
# found.
unless find_executable("crystal")
  abort <<~ERR
    You need crystal installed to use this gem.
    Please check out https://crystal-lang.org/ for information on how to install it.
  ERR
end
@@ -0,0 +1,18 @@
1
+ name: crysal_gem_template
2
+ version: 0.1.0
3
+
4
+ authors:
5
+ - johansenja
6
+
7
+ targets:
8
+ levenshtein_str:
9
+ main: src/levenshtein_str.cr
10
+ #
11
+ # add dependencies and development_dependencies:
12
+ # dependencies:
13
+ # pg:
14
+ # github: will/crystal-pg
15
+ # version: "~> 0.5"
16
+ #
17
+ # development_dependencies:
18
+ # ...
@@ -0,0 +1,39 @@
1
+ require "./lib/cr_ruby"
2
+
3
# Levenshtein (edit) distance between two strings, counted in characters:
# the minimum number of single-character insertions, deletions and
# substitutions needed to turn one string into the other.
def levenshtein(first : String, second : String)
  len1, len2 = first.size, second.size
  return len1 if len2.zero?
  return len2 if len1.zero?

  # Decode each string once. Indexing a String by character position inside
  # the nested loop can require a scan per lookup when the string is not
  # single-byte; arrays give constant-time access.
  chars1 = first.chars
  chars2 = second.chars

  # matrix[i][j] is the distance between the first i characters of `second`
  # and the first j characters of `first`; row 0 / column 0 are the distances
  # from an empty prefix.
  matrix = [(0..len1).to_a]
  (1..len2).each do |i|
    matrix << [i] + [0] * len1
  end

  (1..len2).each do |i|
    (1..len1).each do |j|
      if chars1[j - 1] == chars2[i - 1]
        # Matching characters cost nothing extra.
        matrix[i][j] = matrix[i - 1][j - 1]
      else
        # Cheapest neighbouring cell plus one edit.
        matrix[i][j] = {
          matrix[i - 1][j],
          matrix[i][j - 1],
          matrix[i - 1][j - 1],
        }.min + 1
      end
    end
  end

  matrix.last.last
end
28
+
29
# Ruby-facing overload: converts both Ruby VALUEs to Crystal strings, computes
# the distance with the String overload, and converts the Int32 result back to
# a Ruby VALUE.
def levenshtein(self : CrRuby::VALUE, other : CrRuby::VALUE)
  levenshtein(String.from_ruby(self), String.from_ruby(other)).to_ruby
end

# Extension entry point, exported under the C name Init_levenshtein_str.
# NOTE(review): presumably Ruby calls this when the bundle is required - confirm.
fun init = Init_levenshtein_str
  # Start the Crystal GC and runtime before any Crystal code runs.
  GC.init
  LibCrystalMain.__crystal_main(0, Pointer(Pointer(UInt8)).null)

  # Obtain the String class VALUE and register `levenshtein` on it, taking one
  # argument (the receiver is passed as the first VALUE).
  string = CrRuby.rb_define_class("String", CrRuby.rb_cObject)
  CrRuby.rb_define_method(string, "levenshtein", ->levenshtein(CrRuby::VALUE, CrRuby::VALUE), 1)
end
@@ -0,0 +1,58 @@
1
# Bindings for the subset of the Ruby C API used by this extension.
lib CrRuby
  # every ruby Object is a VALUE
  type VALUE = Void*
  # signature of a ruby method taking one argument: (receiver, argument) -> result
  type METHOD_FUNC = VALUE, VALUE -> VALUE

  # a ruby Class is a VALUE; this global is Ruby's Object class
  $rb_cObject : VALUE

  # a ruby Module is also a VALUE
  # NOTE(review): rb_mObject does not look like a symbol the Ruby C API
  # exports - confirm before referencing it.
  $rb_mObject : VALUE

  # build a Ruby String VALUE from a C string pointer
  fun rb_str_new_cstr(str : UInt8*) : VALUE

  # return a Ruby String VALUE for the given VALUE
  fun rb_str_to_str(value: VALUE) : VALUE

  # get a C string pointer from a Ruby String; takes a pointer to the VALUE
  fun rb_string_value_cstr(value_ptr : VALUE*) : UInt8*

  # ruby Integer to crystal int
  fun rb_num2int(value : VALUE) : Int32

  # crystal int to ruby Integer
  fun rb_int2inum(value : Int32) : VALUE

  # define (or look up) a ruby class by name, with the given superclass
  fun rb_define_class(name: UInt8*, super: VALUE) : VALUE

  # define (or look up) a ruby module by name
  fun rb_define_module(name: UInt8*) : VALUE

  # define an instance method on klass; argc is the number of arguments it takes
  fun rb_define_method(klass: VALUE, name: UInt8*, func: METHOD_FUNC, argc: Int32)
end
37
+
38
# Conversions between Crystal strings and Ruby String VALUEs.
class String
  # Creates a Ruby String VALUE from this string via rb_str_new_cstr.
  def to_ruby
    CrRuby.rb_str_new_cstr self
  end

  # Builds a Crystal String from a Ruby VALUE.
  def self.from_ruby(ruby_str : CrRuby::VALUE)
    # Convert first, so the pointer handed to rb_string_value_cstr refers to
    # the VALUE returned by rb_str_to_str.
    plain_string = CrRuby.rb_str_to_str(ruby_str)
    c_string = CrRuby.rb_string_value_cstr(pointerof(plain_string))
    # NOTE(review): presumably this copies up to the first NUL byte, so Ruby
    # strings with embedded NULs would not round-trip - confirm.
    new(c_string)
  end
end
49
+
50
# Conversions between Crystal Int32 values and Ruby Integer VALUEs.
struct Int32
  # Wraps this integer in a Ruby Integer VALUE via rb_int2inum.
  def to_ruby
    CrRuby.rb_int2inum(self)
  end

  # Reads a Ruby Integer VALUE as an Int32 via rb_num2int.
  def self.from_ruby(ruby_int : CrRuby::VALUE)
    CrRuby.rb_num2int(ruby_int)
  end
end
@@ -0,0 +1,3 @@
1
+ module LevenshteinStr
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,2 @@
1
+ require "levenshtein_str/version"
2
+ require "levenshtein_str.bundle"
@@ -0,0 +1,15 @@
1
load "Rakefile"

RSpec.configure do |config|
  # Build a fresh copy of the extension before any example runs, then load it.
  config.before(:suite) do
    Rake::Task["clean"].invoke
    Rake::Task["compile"].invoke
    require "levenshtein_str"
  end

  # Remove the compiled binaries once the suite is done.
  config.after(:suite) do
    # Rake runs a task at most once per process, and "clean" already ran in
    # before(:suite); re-arm it so this second invocation actually executes.
    clean_task = Rake::Task["clean"]
    clean_task.reenable
    clean_task.invoke
  end
end
@@ -0,0 +1,41 @@
1
require_relative "./spec_helper"

# Behavioural spec for the Crystal-backed String#levenshtein.
RSpec.describe String do
  describe "#levenshtein" do
    context "with one empty string" do
      it "returns the other string's length" do
        expect("".levenshtein("abc")).to eq 3
        expect("abc".levenshtein("")).to eq 3
      end
    end

    context "with two equal strings" do
      it "returns 0 if empty" do
        expect("".levenshtein("")).to eq 0
      end

      it "returns 0 if not empty" do
        expect("abc".levenshtein("abc")).to eq 0
      end
    end

    context "with 2 same length strings" do
      it "can accurately return the difference" do
        # c => p
        # d => w
        # h => l
        expect("abcdefgh".levenshtein("abpwefgl")).to eq 3
      end
    end

    context "with 2 varying length strings" do
      it "returns the length difference if otherwise the same" do
        expect("abcde".levenshtein("abc")).to eq 2
      end

      it "correctly evaluates complex strings" do
        # "h" is the only character the two strings share, so one of the 16
        # characters of the longer string can be kept as-is.
        expect("abcdefghijklmnop".levenshtein("435hq09yrz")).to eq 15
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein_str
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - johansenja
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-05-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: benchmark-ips
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ description: Use String#levenshtein(other_str) to get the levenshtein distance between
28
+ 2 strings. Useful for measuring approximate string similarity, and fuzzy matching.
29
+ email:
30
+ executables: []
31
+ extensions:
32
+ - ext/levenshtein_str/extconf.rb
33
+ extra_rdoc_files: []
34
+ files:
35
+ - Gemfile
36
+ - Gemfile.lock
37
+ - README.md
38
+ - Rakefile
39
+ - benchmark
40
+ - ext/levenshtein_str/Makefile
41
+ - ext/levenshtein_str/extconf.rb
42
+ - ext/levenshtein_str/shard.yml
43
+ - ext/levenshtein_str/src/levenshtein_str.cr
44
+ - ext/levenshtein_str/src/lib/cr_ruby.cr
45
+ - lib/levenshtein_str.rb
46
+ - lib/levenshtein_str/version.rb
47
+ - spec/spec_helper.rb
48
+ - spec/string_spec.rb
49
+ homepage: https://github.com/johansenja/levenshtein_str
50
+ licenses:
51
+ - MIT
52
+ metadata:
53
+ source_code_uri: https://github.com/johansenja/levenshtein_str
54
+ post_install_message:
55
+ rdoc_options: []
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubygems_version: 3.0.8
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Get the Levenshtein distance between two strings
73
+ test_files:
74
+ - spec/spec_helper.rb
75
+ - spec/string_spec.rb