levenshtein_str 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f32f298f472987c392ffb24891f0acf74e85217d14d9eb43e4f7d38c19a63e73
4
+ data.tar.gz: 34ccf02e8fb61a6c47094fc082f2d278887aca57ade4d41e9276735896155615
5
+ SHA512:
6
+ metadata.gz: d7f605c8aa11767b27ac77bd7afa89d9581a2541fb9338a68545b30bceb0c2bbe49bcb197f4d13a8365a7a0c1cade2d32b624b1e5f14f646e10fb30e415e4852
7
+ data.tar.gz: 2ab3c4a7d014d145091ee005322b8523bc4cfc9c5b9812b0bc6b2570b1942bcb3db9841c61506513edc44322fd3d464b7d06d1840179c578b82a956ad11c04a5
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,19 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ levenshtein_str (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ benchmark-ips (2.8.2)
10
+
11
+ PLATFORMS
12
+ ruby
13
+
14
+ DEPENDENCIES
15
+ benchmark-ips
16
+ levenshtein_str!
17
+
18
+ BUNDLED WITH
19
+ 2.1.4
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # String#levenshtein
2
+
3
+ A performant Ruby gem for computing the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) between two strings, with the legwork done in Crystal.
4
+
5
+ ## Installation:
6
+
7
+ ### System Requirements
8
+
9
+ - crystal
10
+ - macOS (untested on Linux)
11
+
12
+ ```ruby
13
+ # Gemfile
14
+ gem "levenshtein_str"
15
+ ```
16
+
17
+ then
18
+
19
+ `bundle install`
20
+
21
+ ## Usage:
22
+
23
+ ```ruby
24
+ # app.rb
25
+ require 'levenshtein_str'
26
+
27
+ puts "hello".levenshtein("world")
28
+ ```
29
+
30
+ then
31
+
32
+ `ruby app.rb # => 4`
33
+
34
+ ## Benchmarks
35
+
36
+ See the `benchmark` script for the benchmark code and full results.
37
+
38
+ `./benchmark`
39
+
40
+ - "" and "" Same-ish, sometimes Ruby fractionally quicker (no type conversion overheads)
41
+ - "abd" and "abc" Crystal ~2x quicker
42
+ - "abc" and "abcde" Crystal ~2x quicker
43
+ - "abcdefghi" and "0123456789" Crystal ~2.25x quicker
44
+ - [whole alphabet] vs "012345" Crystal ~2.37x quicker
45
+
46
+ ## Template
47
+
48
+ Based on [this](https://github.com/johansenja/crystal_gem_template) template for writing Ruby gems in Crystal.
49
+
50
+ ## Contributions && Testing
51
+
52
+ Testing for now can just be done using Ruby:
53
+
54
+ `rspec`
55
+
56
+ Contributions are welcome.
57
+
58
+ ## License
59
+
60
+ [MIT](https://rem.mit-license.org)
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require "bundler/gem_tasks"
2
+
3
desc "compile the Crystal native extensions"
task :compile do
  puts "compiling native extensions"
  # Every step is chained with `&&` so a failure stops the build. `sh` (unlike
  # backticks) streams the tools' output and raises when the command exits
  # non-zero, so a broken build is no longer silently ignored.
  sh "cd ext/levenshtein_str && shards && make clean && make"
end

desc "cleaning up compiled binaries"
task :clean do
  puts "cleaning up extensions"
  # The `cd` only affects the spawned shell, so there is no need to cd back.
  sh "cd ext/levenshtein_str && make clean"
end
data/benchmark ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__) + "/lib"
3
+
4
+ # Aiming to mimic how the Crystal looks fairly closely, with a wrapper method,
5
+ # and then a separate method for the logic
6
+
7
# Pure-Ruby reference implementation of the Levenshtein distance, used as the
# baseline the Crystal extension is benchmarked against. It intentionally keeps
# the same full-matrix shape as the Crystal code so the comparison stays
# like-for-like.
#
# @param first [String] source string
# @param second [String] target string
# @return [Integer] minimum number of single-character insertions, deletions
#   and substitutions needed to turn +first+ into +second+
def _levenshtein_rb(first, second)
  len1 = first.size
  len2 = second.size
  return len1 if len2.zero?
  return len2 if len1.zero?

  # Row 0 is 0..len1 (distance from the empty prefix of `second`); every later
  # row starts with its own index in column 0 and is filled in below.
  matrix = [(0..len1).to_a]
  (1..len2).each do |i|
    matrix << [i] + [0] * len1
  end

  (1..len2).each do |i|
    (1..len1).each do |j|
      matrix[i][j] =
        if first[j - 1] == second[i - 1]
          # Matching characters cost nothing: carry the diagonal value over.
          matrix[i - 1][j - 1]
        else
          # Cheapest of the three neighbouring cells, plus one edit.
          [
            matrix[i - 1][j],
            matrix[i][j - 1],
            matrix[i - 1][j - 1],
          ].min + 1
        end
    end
  end

  matrix.last.last
end

# Benchmark-only extension of String that mirrors the String#levenshtein
# method provided by the Crystal extension.
class String
  # @param second [String] string to compare the receiver against
  # @return [Integer] Levenshtein distance between the receiver and +second+
  def levenshtein_rb(second)
    _levenshtein_rb(self, second)
  end
end
38
+
39
# Compiles the Crystal extension, then benchmarks String#levenshtein_rb (pure
# Ruby) against String#levenshtein (Crystal) for each input pair below.
def bm
  load "Rakefile"

  Rake::Task["compile"].invoke

  # Required only after compiling: lib/levenshtein_str.rb loads the bundle that
  # the compile task has just produced.
  require "benchmark/ips"
  require "levenshtein_str"

  # [Ruby report label, Crystal report label, receiver, argument]
  cases = [
    ["Blank strings (Ruby)", "Blank string (Crystal)", "", ""],                       # blank strings
    ["abd vs abc (Ruby)", "abd vs abc (Crystal)", "abd", "abc"],                      # 1 char diff
    ["abc vs abcde (Ruby)", "abc vs abcde (Crystal)", "abc", "abcde"],                # short diff
    ["abcdefghi vs 0123456789 (Ruby)", "abcdefghi vs 0123456789 (Crystal)",
     "abcdefghi", "0123456789"],                                                      # medium diff
    ["[whole alphabet] vs 012345 (Ruby)", "[whole alphabet] vs 012345 (Crystal)",
     "abcdefghijklmnopqrstuvwxyz", "012345"],                                         # long diff
  ]

  cases.each do |ruby_label, crystal_label, first, second|
    Benchmark.ips do |bmark|
      bmark.report(ruby_label) { 100_000.times { first.levenshtein_rb(second) } }
      bmark.report(crystal_label) { 100_000.times { first.levenshtein(second) } }
      bmark.compare!
    end
  end
end

begin
  bm
ensure
  # Only clean up if Rake actually loaded; otherwise a NameError raised here
  # would hide the error that stopped `bm` in the first place.
  Rake::Task["clean"].invoke if defined?(Rake::Task) && Rake::Task.task_defined?("clean")
end
88
+
89
+ # RESULTS
90
+
91
+ # Warming up --------------------------------------
92
+ # Blank strings (Ruby) 3.000 i/100ms
93
+ # Blank string (Crystal)
94
+ # 3.000 i/100ms
95
+ # Calculating -------------------------------------
96
+ # Blank strings (Ruby) 48.982 (±16.3%) i/s - 237.000 in 5.004116s
97
+ # Blank string (Crystal)
98
+ # 47.140 (±14.8%) i/s - 231.000 in 5.030098s
99
+
100
+ # Comparison:
101
+ # Blank strings (Ruby): 49.0 i/s
102
+ # Blank string (Crystal): 47.1 i/s - same-ish: difference falls within error
103
+
104
+ # Warming up --------------------------------------
105
+ # abd vs abc (Ruby) 1.000 i/100ms
106
+ # abd vs abc (Crystal) 1.000 i/100ms
107
+ # Calculating -------------------------------------
108
+ # abd vs abc (Ruby) 1.367 (± 0.0%) i/s - 7.000 in 5.123125s
109
+ # abd vs abc (Crystal) 2.750 (± 0.0%) i/s - 14.000 in 5.101675s
110
+
111
+ # Comparison:
112
+ # abd vs abc (Crystal): 2.7 i/s
113
+ # abd vs abc (Ruby): 1.4 i/s - 2.01x (± 0.00) slower
114
+
115
+ # Warming up --------------------------------------
116
+ # abc vs abcde (Ruby) 1.000 i/100ms
117
+ # abc vs abcde (Crystal)
118
+ # 1.000 i/100ms
119
+ # Calculating -------------------------------------
120
+ # abc vs abcde (Ruby) 0.901 (± 0.0%) i/s - 5.000 in 5.547298s
121
+ # abc vs abcde (Crystal)
122
+ # 1.792 (± 0.0%) i/s - 9.000 in 5.023881s
123
+
124
+ # Comparison:
125
+ # abc vs abcde (Crystal): 1.8 i/s
126
+ # abc vs abcde (Ruby): 0.9 i/s - 1.99x (± 0.00) slower
127
+
128
+ # Warming up --------------------------------------
129
+ # abcdefghi vs 0123456789 (Ruby)
130
+ # 1.000 i/100ms
131
+ # abcdefghi vs 0123456789 (Crystal)
132
+ # 1.000 i/100ms
133
+ # Calculating -------------------------------------
134
+ # abcdefghi vs 0123456789 (Ruby)
135
+ # 0.196 (± 0.0%) i/s - 1.000 in 5.095124s
136
+ # abcdefghi vs 0123456789 (Crystal)
137
+ # 0.441 (± 0.0%) i/s - 3.000 in 6.804299s
138
+
139
+ # Comparison:
140
+ # abcdefghi vs 0123456789 (Crystal): 0.4 i/s
141
+ # abcdefghi vs 0123456789 (Ruby): 0.2 i/s - 2.25x (± 0.00) slower
142
+
143
+ # Warming up --------------------------------------
144
+ # [whole alphabet] vs 012345 (Ruby)
145
+ # 1.000 i/100ms
146
+ # [whole alphabet] vs 012345 (Crystal)
147
+ # 1.000 i/100ms
148
+ # Calculating -------------------------------------
149
+ # [whole alphabet] vs 012345 (Ruby)
150
+ # 0.123 (± 0.0%) i/s - 1.000 in 8.150069s
151
+ # [whole alphabet] vs 012345 (Crystal)
152
+ # 0.291 (± 0.0%) i/s - 2.000 in 6.886041s
153
+
154
+ # Comparison:
155
+ # [whole alphabet] vs 012345 (Crystal): 0.3 i/s
156
+ # [whole alphabet] vs 012345 (Ruby): 0.1 i/s - 2.37x (± 0.00) slower
@@ -0,0 +1,15 @@
1
# Builds src/levenshtein_str.cr into a loadable bundle under the gem's lib/.
CRYSTAL = crystal
TARGET = ../../lib/levenshtein_str.bundle

# None of these targets produce a file of the same name; declaring them phony
# keeps them running even if such a file or directory ever appears.
.PHONY: install all shards clean

install: all

all: shards $(TARGET)

# Resolve the dependencies declared in shard.yml.
shards:
	shards

# Rebuild the bundle whenever the Crystal entry point changes.
$(TARGET): ./src/levenshtein_str.cr
	$(CRYSTAL) $< --link-flags "-dynamic -bundle -Wl,-undefined,dynamic_lookup" -o $(TARGET)

# Remove compiled bundles (and files sharing the .bundle prefix) from the gem tree.
clean:
	rm -f ../../**/*.bundle*
@@ -0,0 +1,5 @@
1
require "mkmf"

# The extension is compiled with Crystal, so stop with installation guidance
# when no `crystal` executable can be found.
unless find_executable("crystal")
  abort <<~ERR
    You need crystal installed to use this gem.
    Please check out https://crystal-lang.org/ for information on how to install it.
  ERR
end
@@ -0,0 +1,18 @@
1
+ name: crysal_gem_template
2
+ version: 0.1.0
3
+
4
+ authors:
5
+ - johansenja
6
+
7
+ targets:
8
+ levenshtein_str:
9
+ main: src/levenshtein_str.cr
10
+ #
11
+ # add dependencies and development_dependencies:
12
+ # dependencies:
13
+ # pg:
14
+ # github: will/crystal-pg
15
+ # version: "~> 0.5"
16
+ #
17
+ # development_dependencies:
18
+ # ...
@@ -0,0 +1,39 @@
1
+ require "./lib/cr_ruby"
2
+
3
# Returns the Levenshtein distance between *first* and *second*: the minimum
# number of single-character insertions, deletions and substitutions needed to
# turn one string into the other.
def levenshtein(first : String, second : String)
  len1, len2 = first.size, second.size
  return len1 if len2.zero?
  return len2 if len1.zero?

  # Index into char arrays rather than the strings themselves: String#[] has to
  # scan from the start for strings containing multi-byte characters, which
  # would be repeated for every cell of the matrix.
  first_chars = first.chars
  second_chars = second.chars

  # Row 0 is 0..len1; every later row starts with its own index in column 0.
  matrix = [(0..len1).to_a]
  (1..len2).each do |i|
    matrix << [i] + [0] * len1
  end

  (1..len2).each do |i|
    (1..len1).each do |j|
      if first_chars[j - 1] == second_chars[i - 1]
        # Matching characters cost nothing: carry the diagonal value over.
        matrix[i][j] = matrix[i - 1][j - 1]
      else
        # Cheapest of the three neighbouring cells, plus one edit.
        matrix[i][j] = {
          matrix[i - 1][j],
          matrix[i][j - 1],
          matrix[i - 1][j - 1],
        }.min + 1
      end
    end
  end

  matrix.last.last
end
28
+
29
# Ruby-facing overload: converts both Ruby VALUEs to Crystal strings, computes
# the distance, and converts the resulting Int32 back into a Ruby VALUE.
def levenshtein(self : CrRuby::VALUE, other : CrRuby::VALUE)
  receiver = String.from_ruby(self)
  argument = String.from_ruby(other)
  levenshtein(receiver, argument).to_ruby
end
32
+
33
# Exported as `Init_levenshtein_str`. Initialises the Crystal GC and main
# routine, then registers `levenshtein` as a one-argument method on Ruby's
# String class.
fun init = Init_levenshtein_str
  GC.init
  LibCrystalMain.__crystal_main(0, Pointer(Pointer(UInt8)).null)

  string_class = CrRuby.rb_define_class("String", CrRuby.rb_cObject)
  # The trailing 1 is the argc passed to rb_define_method.
  CrRuby.rb_define_method(
    string_class,
    "levenshtein",
    ->levenshtein(CrRuby::VALUE, CrRuby::VALUE),
    1
  )
end
@@ -0,0 +1,58 @@
1
+ lib CrRuby
2
+ # every ruby Object is a VALUE
3
+ type VALUE = Void*
4
+ # ruby method typing
5
+ type METHOD_FUNC = VALUE, VALUE -> VALUE
6
+
7
+ # a ruby Class is a VALUE
8
+ $rb_cObject : VALUE
9
+
10
+ # a ruby Module is also a VALUE
11
+ $rb_mObject : VALUE
12
+
13
+ # convert C str to Ruby str
14
+ fun rb_str_new_cstr(str : UInt8*) : VALUE
15
+
16
+ # convert Ruby str to plain string
17
+ fun rb_str_to_str(value: VALUE) : VALUE
18
+
19
+ # convert plain string pointer to C string pointer
20
+ fun rb_string_value_cstr(value_ptr : VALUE*) : UInt8*
21
+
22
+ # ruby Integer to crystal int
23
+ fun rb_num2int(value : VALUE) : Int32
24
+
25
+ # crystal int to ruby Integer
26
+ fun rb_int2inum(value : Int32) : VALUE
27
+
28
+ # define ruby class in C
29
+ fun rb_define_class(name: UInt8*, super: VALUE) : VALUE
30
+
31
+ # define ruby module in C
32
+ fun rb_define_module(name: UInt8*) : VALUE
33
+
34
+ # define ruby method in C
35
+ fun rb_define_method(klass: VALUE, name: UInt8*, func: METHOD_FUNC, argc: Int32)
36
+ end
37
+
38
# Conversions between Crystal strings and Ruby String VALUEs.
class String
  # Passes this string to `rb_str_new_cstr` and returns the resulting VALUE.
  def to_ruby
    CrRuby.rb_str_new_cstr(self)
  end

  # Builds a Crystal String from *ruby_str* by way of `rb_str_to_str` and
  # `rb_string_value_cstr`.
  def self.from_ruby(ruby_str : CrRuby::VALUE)
    str_value = CrRuby.rb_str_to_str(ruby_str)
    # rb_string_value_cstr takes a pointer to the VALUE, hence the local.
    String.new(CrRuby.rb_string_value_cstr(pointerof(str_value)))
  end
end
49
+
50
# Conversions between Crystal Int32 values and Ruby Integer VALUEs.
struct Int32
  # Returns this integer as a Ruby VALUE via `rb_int2inum`.
  def to_ruby
    CrRuby.rb_int2inum self
  end

  # Reads *ruby_int* as an Int32 via `rb_num2int`.
  def self.from_ruby(ruby_int : CrRuby::VALUE)
    CrRuby.rb_num2int ruby_int
  end
end
@@ -0,0 +1,3 @@
1
# Namespace for gem-level constants.
module LevenshteinStr
  # Current gem version; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,2 @@
1
+ require "levenshtein_str/version"
2
+ require "levenshtein_str.bundle"
@@ -0,0 +1,15 @@
1
load "Rakefile"

RSpec.configure do |config|
  # Build a fresh extension before any example runs, then load it.
  config.before(:suite) do
    Rake::Task["clean"].invoke
    Rake::Task["compile"].invoke
    require "levenshtein_str"
  end

  # Remove the compiled extension once the suite has finished.
  config.after(:suite) do
    # A Rake task runs at most once per process unless it is re-enabled, and
    # "clean" was already invoked in before(:suite), so a plain `invoke` here
    # would be a no-op.
    clean_task = Rake::Task["clean"]
    clean_task.reenable
    clean_task.invoke
  end
end
@@ -0,0 +1,41 @@
1
+ require_relative "./spec_helper"
2
+
3
# Behavioural examples for the String#levenshtein method added by the extension.
RSpec.describe String do
  describe "#levenshtein" do
    context "with one empty string" do
      it "returns the other string's length" do
        expect("".levenshtein("abc")).to eq 3
        expect("abc".levenshtein("")).to eq 3
      end
    end

    context "with two equal strings" do
      it "returns 0 if empty" do
        expect("".levenshtein("")).to eq 0
      end

      it "returns 0 if not empty" do
        expect("abc".levenshtein("abc")).to eq 0
      end
    end

    context "with 2 same length strings" do
      it "can accurately return the difference" do
        # Three substitutions:
        # c => p
        # d => w
        # h => l
        expect("abcdefgh".levenshtein("abpwefgl")).to eq 3
      end
    end

    context "with 2 varying length strings" do
      it "returns the length difference if otherwise the same" do
        expect("abcde".levenshtein("abc")).to eq 2
      end

      it "correctly evaluates complex strings" do
        expect("abcdefghijklmnop".levenshtein("435hq09yrz")).to eq 15
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein_str
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - johansenja
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-05-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: benchmark-ips
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ description: Use String#levenshtein(other_str) to get the levenshtein distance between
28
+ 2 strings. Useful for measuring approximate string similarity, and fuzzy matching.
29
+ email:
30
+ executables: []
31
+ extensions:
32
+ - ext/levenshtein_str/extconf.rb
33
+ extra_rdoc_files: []
34
+ files:
35
+ - Gemfile
36
+ - Gemfile.lock
37
+ - README.md
38
+ - Rakefile
39
+ - benchmark
40
+ - ext/levenshtein_str/Makefile
41
+ - ext/levenshtein_str/extconf.rb
42
+ - ext/levenshtein_str/shard.yml
43
+ - ext/levenshtein_str/src/levenshtein_str.cr
44
+ - ext/levenshtein_str/src/lib/cr_ruby.cr
45
+ - lib/levenshtein_str.rb
46
+ - lib/levenshtein_str/version.rb
47
+ - spec/spec_helper.rb
48
+ - spec/string_spec.rb
49
+ homepage: https://github.com/johansenja/levenshtein_str
50
+ licenses:
51
+ - MIT
52
+ metadata:
53
+ source_code_uri: https://github.com/johansenja/levenshtein_str
54
+ post_install_message:
55
+ rdoc_options: []
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubygems_version: 3.0.8
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Get the Levenshtein distance between two strings
73
+ test_files:
74
+ - spec/spec_helper.rb
75
+ - spec/string_spec.rb