damerau-levenshtein 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '08637e3e7eb3789b7910dbb1d824aea39ac1e0a20a939cbf035a95f4657e50a0'
4
+ data.tar.gz: 430fbf0ac5d6723becbb1be76f075422bcee05cf9a0e45ef7405dd065d321cb3
5
+ SHA512:
6
+ metadata.gz: b0a24c262162ebcb8a310d352f6c2388f4e2067b9183d4804048909af2af81f504f477d9da8f9c614204e0c63905f7fff3e59e29c4c08ae4170fe886477ab37b
7
+ data.tar.gz: 18ca5a38b06fc6d1d0177bcc30f5a9c37f6a040cbc1de061f41ba0cfd187d133baafe7c2fe57f1435eb7afd802f0f43502ed71ee7a77302348dd9acf407265e0
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
@@ -0,0 +1,27 @@
1
+ Gemfile.lock
2
+ *.sw?
3
+ .DS_Store
4
+ tmp
5
+ *.o
6
+ *.bundle
7
+ *.gem
8
+ .nvimlog
9
+ .vim.custom
10
+ .byebug_history
11
+
12
+ # rcov generated
13
+ coverage
14
+
15
+ # rdoc generated
16
+ rdoc
17
+
18
+ # yard generated
19
+ doc
20
+ .yardoc
21
+
22
+ # bundler
23
+ .bundle
24
+ bin
25
+
26
+ # generated gems
27
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,15 @@
1
+ AllCops:
2
+ Exclude:
3
+ - features/**/*
4
+ - db/**/*
5
+ - bundle_bin/**/*
6
+ Include:
7
+ - damerau-levenshtein.gemspec
8
+ Metrics/ModuleLength:
9
+ Max: 1000
10
+ Style/StringLiterals:
11
+ EnforcedStyle: double_quotes
12
+ Layout/DotPosition:
13
+ EnforcedStyle: trailing
14
+ Naming/FileName:
15
+ Enabled: false
@@ -0,0 +1 @@
1
+ 2.5.7
@@ -0,0 +1,11 @@
1
+ rvm:
2
+ - 2.5
3
+ - 2.6
4
+ before_install: "gem update bundler"
5
+ script:
6
+ - "bundle exec rake"
7
+ - "bundle exec rake features"
8
+ branches:
9
+ only:
10
+ - master
11
+
@@ -0,0 +1,25 @@
1
+ damerau-levenshtein CHANGELOG
2
+ =============================
3
+
4
+ 1.3.1 -- gems update
5
+
6
+ 1.3.0 -- (issue #10) shows difference between two strings
7
+
8
+ 1.2.0 -- add edit distance for array of integers (by @azhi)
9
+
10
+ 1.1.3 -- add ruby 2.3.1 to travis tests by request from @greysteil
11
+
12
+ 1.1.2 -- remove unnecessary production dependencies (by @ixti)
13
+
14
+ 1.1.1 -- fix random negative distance bug (by @Skarlit)
15
+
16
+ 1.1.0 -- relax and update gem dependencies, update specs
17
+
18
+ 1.0.3 -- gems update
19
+
20
+ 1.0.2 -- includes refactoring by https://github.com/luislavena to remove
21
+ dependency on jeweler
22
+
23
+ 1.0.1 -- fixed gem dependencies
24
+
25
+ 1.0.0 -- stable version
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2011-2018 Dmitry Mozzherin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,250 @@
1
+ # damerau-levenshtein #
2
+
3
+ [![Gem Version][gem_svg]][gem]
4
+ [![Continuous Integration Status][ci_svg]][ci]
5
+ [![Dependency Status][dep_svg]][dep]
6
+ [![Coverage Status][cov_svg]][cov]
7
+
8
+ The damerau-levenshtein gem finds the [edit distance][ed] between two
9
+ UTF-8 or ASCII encoded strings with O(N\*M) efficiency.
10
+
11
+ This gem implements pure Levenshtein algorithm, Damerau modification of it
12
+ (where 2 character transposition counts as 1 edit distance). It also includes
13
+ Boehmer & Rees 2008 modification of Damerau algorithm, where transposition
14
+ of blocks bigger than 1 character is taken into account as well
15
+ [(Rees 2014)][rees2014].
16
+
17
+ ```ruby
18
+ require "damerau-levenshtein"
19
+ DamerauLevenshtein.distance("Something", "Smoething") #returns 1
20
+ ```
21
+
22
+ It also returns a diff between two strings according to the Levenshtein algorithm.
23
+ The diff is expressed by tags `<ins>`, `<del>`, and `<subst>`. Such tags make
24
+ it possible to highlight the difference between strings in a flexible way.
25
+
26
+ ```ruby
27
+ require "damerau-levenshtein"
28
+ differ = DamerauLevenshtein::Differ.new
29
+ differ.run("corn", "cron")
30
+ # output: ["c<subst>or</subst>n", "c<subst>ro</subst>n"]
31
+ ```
32
+
33
+ ## Dependencies ##
34
+
35
+ sudo apt-get install build-essential libgmp3-dev
36
+
37
+ ## Installation ##
38
+
39
+ gem install damerau-levenshtein
40
+
41
+ ## Examples ##
42
+
43
+ ```ruby
44
+ require "damerau-levenshtein"
45
+ dl = DamerauLevenshtein
46
+ ```
47
+
48
+ * compare using Damerau Levenshtein algorithm
49
+
50
+ ```ruby
51
+ dl.distance("Something", "Smoething") #returns 1
52
+ ```
53
+
54
+ * compare using Levenshtein algorithm
55
+
56
+ ```ruby
57
+ dl.distance("Something", "Smoething", 0) #returns 2
58
+ ```
59
+
60
+ * compare using Boehmer & Rees modification
61
+
62
+ ```ruby
63
+ dl.distance("Something", "meSothing", 2) #returns 2 instead of 4
64
+ ```
65
+
66
+ * comparison of words with UTF-8 characters should work fine:
67
+
68
+ ```ruby
69
+ dl.distance("Sjöstedt", "Sjostedt") #returns 1
70
+ ```
71
+
72
+ * compare two arrays
73
+
74
+ ```ruby
75
+ dl.array_distance([1,2,3,5], [1,2,3,4]) #returns 1
76
+ ```
77
+
78
+ * return diff between two strings
79
+
80
+ ```ruby
81
+ differ = DamerauLevenshtein::Differ.new
82
+ differ.run("Something", "smthg")
83
+ ```
84
+
85
+ * return diff between two strings in raw format
86
+
87
+ ```ruby
88
+ differ = DamerauLevenshtein::Differ.new
89
+ differ.format = :raw
90
+ differ.run("Something", "smthg")
91
+ ```
92
+
93
+ ## API Description ##
94
+
95
+ ### Methods ###
96
+
97
+ #### DamerauLevenshtein.version
98
+
99
+ ```ruby
100
+ DamerauLevenshtein.version
101
+ #returns version number of the gem
102
+ ```
103
+
104
+ #### DamerauLevenshtein.distance
105
+
106
+ ```ruby
107
+ DamerauLevenshtein.distance(string1, string2, block_size, max_distance)
108
+ #returns edit distance between 2 strings
109
+
110
+ DamerauLevenshtein.string_distance(string1, string2, block_size, max_distance)
111
+ # an alias for .distance
112
+
113
+ DamerauLevenshtein.array_distance(array1, array2, block_size, max_distance)
114
+ # returns edit distance between 2 arrays of integers
115
+ ```
116
+
117
+ `DamerauLevenshtein.distance` and `.array_distance` take 4 arguments:
118
+
119
+ * `string1` (`array1` for `.array_distance`)
120
+ * `string2` (`array2` for `.array_distance`)
121
+ * `block_size` (default is 1)
122
+ * `max_distance` (default is 10)
123
+
124
+ `block_size` determines maximum number of characters in a transposition block:
125
+
126
+ block_size = 0
127
+ (transposition does not count -- it is a pure Levenshtein algorithm)
128
+
129
+ block_size = 1
130
+ (transposition between 2 adjacent characters --
131
+ it is pure Damerau-Levenshtein algorithm)
132
+
133
+ block_size = 2
134
+ (transposition between blocks as big as 2 characters -- so abcd and cdab
135
+ counts as edit distance 2, not 4)
136
+
137
+ block_size = 3
138
+ (transposition between blocks as big as 3 characters --
139
+ so abcdef and defabc counts as edit distance 3, not 6)
140
+
141
+ etc.
142
+
143
+ `max_distance` -- is a threshold after which the algorithm gives up and
144
+ returns a value greater than `max_distance` instead of the real edit distance.
145
+
146
+ Levenshtein algorithm is expensive, so it makes sense to give up when edit
147
+ distance is becoming too big. The argument max_distance does just that.
148
+
149
+ ```ruby
150
+
151
+ DamerauLevenshtein.distance("abcdefg", "1234567", 0, 3)
152
+ # output: 4 -- it gave up when edit distance exceeded 3
153
+
154
+ ```
155
+
156
+ #### DamerauLevenshtein::Differ
157
+
158
+ `differ = DamerauLevenshtein::Differ.new` creates a new instance of the `Differ` class, which returns the difference between two strings
159
+
160
+ `differ.format` shows current format for diff. Default is `:tag` format
161
+
162
+ `differ.format = :raw` changes current format for diffs. Possible values are `:tag` and `:raw`
163
+
164
+ `differ.run("String1", "String2")` returns difference between two strings.
165
+
166
+ For example:
167
+
168
+ ```ruby
169
+ differ = DamerauLevenshtein::Differ.new
170
+ differ.run("Something", "smthng")
171
+ # output: ["<ins>S</ins><subst>o</subst>m<ins>e</ins>th<ins>i</ins>ng",
172
+ # "<del>S</del><subst>s</subst>m<del>e</del>th<del>i</del>ng"]
173
+
174
+ ```
175
+
176
+ Or with parsing:
177
+
178
+ ```ruby
179
+ require "damerau-levenshtein"
180
+ require "nokogiri"
181
+
182
+ differ = DamerauLevenshtein::Differ.new
183
+ res = differ.run("Something", "Smothing!")
184
+ nodes = Nokogiri::XML("<root>#{res.first}</root>")
185
+
186
+ markup = nodes.root.children.map do |n|
187
+ case n.name
188
+ when "text"
189
+ n.text
190
+ when "del"
191
+ "~~#{n.children.first.text}~~"
192
+ when "ins"
193
+ "*#{n.children.first.text}*"
194
+ when "subst"
195
+ "**#{n.children.first.text}**"
196
+ end
197
+ end.join("")
198
+
199
+ puts markup
200
+ ```
201
+
202
+ ## Contributing to damerau-levenshtein ##
203
+
204
+ * Check out the latest master to make sure the feature hasn't been
205
+ implemented or the bug hasn't been fixed yet
206
+ * Check out the issue tracker to make sure someone hasn't already requested
207
+ it and/or contributed it
208
+ * Fork the project
209
+ * Start a feature/bugfix branch
210
+ * Commit and push until you are happy with your contribution
211
+ * Make sure to add tests for it. This is important so I don't break it
212
+ in a future version unintentionally.
213
+ * Please try not to mess with the Rakefile, version, or history. If you want
214
+ to have your own version, or is otherwise necessary, that is fine, but please
215
+ isolate to its own commit so I can cherry-pick around it.
216
+
217
+ ## Versioning ##
218
+
219
+ This gem is following practices of [Semantic Versioning][semver]
220
+
221
+ ## Authors ##
222
+
223
+ [Dmitry Mozzherin][dimus]
224
+
225
+ ## Contributors ##
226
+
227
+ [lazylester][lazylester], [Ran Xie][skarlit], [Alexey Zapparov][ixti], [azhi][azhi], [Josephine Wright][jozr]
228
+
229
+ ## Copyright ##
230
+
231
+ Copyright (c) 2011-2018 Dmitry Mozzherin. See LICENSE.txt for
232
+ further details.
233
+
234
+ [gem_svg]: https://badge.fury.io/rb/damerau-levenshtein.svg
235
+ [gem]: http://badge.fury.io/rb/damerau-levenshtein
236
+ [ci_svg]: https://secure.travis-ci.org/GlobalNamesArchitecture/damerau-levenshtein.svg
237
+ [ci]: http://travis-ci.org/GlobalNamesArchitecture/damerau-levenshtein
238
+ [dep_svg]: https://gemnasium.com/GlobalNamesArchitecture/damerau-levenshtein.svg
239
+ [dep]: https://gemnasium.com/GlobalNamesArchitecture/damerau-levenshtein
240
+ [cov_svg]: https://coveralls.io/repos/GlobalNamesArchitecture/damerau-levenshtein/badge.svg?branch=master
241
+ [cov]: https://coveralls.io/r/GlobalNamesArchitecture/damerau-levenshtein?branch=master
242
+ [ed]: http://en.wikipedia.org/wiki/Edit_distance
243
+ [semver]: http://semver.org/
244
+ [dimus]: https://github.com/dimus
245
+ [lazylester]: https://github.com/lazylester
246
+ [skarlit]: https://github.com/Skarlit
247
+ [ixti]: https://github.com/ixti
248
+ [azhi]: https://github.com/azhi
249
+ [jozr]: https://github.com/jozr
250
+ [rees2014]: https://dx.doi.org/10.1371/journal.pone.0107510
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+ require "cucumber/rake/task"
6
+ require "rubocop/rake_task"
7
+ require "rake/dsl_definition"
8
+ require "rake"
9
+ require "rake/extensiontask"
10
+ require "rspec"
11
+ # "spec" task: runs the RSpec files matching spec/**/*_spec.rb
12
+ RSpec::Core::RakeTask.new(:spec) do |rspec|
13
+ rspec.pattern = FileList["spec/**/*_spec.rb"]
14
+ end
15
+ # "features" task: Cucumber task created without a configuration block
16
+ Cucumber::Rake::Task.new(:features)
17
+ # C extension task: sources in ext/damerau_levenshtein, output in lib/damerau-levenshtein
18
+ Rake::ExtensionTask.new("damerau_levenshtein") do |extension|
19
+ extension.ext_dir = "ext/damerau_levenshtein"
20
+ extension.lib_dir = "lib/damerau-levenshtein"
21
+ end
22
+ # specs and features get :compile added as a prerequisite
23
+ Rake::Task[:spec].prerequisites << :compile
24
+ Rake::Task[:features].prerequisites << :compile
25
+ # default task: rubocop first, then spec
26
+ RuboCop::RakeTask.new
27
+ task default: %i[rubocop spec]
28
+
29
+ desc "open an irb session preloaded with this gem"
30
+ task :console do
31
+ sh "irb -r pp -r ./lib/damerau-levenshtein.rb"
32
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.push File.expand_path("lib", __dir__)
4
+
5
+ require "damerau-levenshtein/version"
6
+ # Gem specification: identity, packaged files, C extension and dev dependencies
7
+ Gem::Specification.new do |s|
8
+ s.required_ruby_version = "~> 2.5" # pessimistic constraint: 2.5 and later 2.x, below 3.0
9
+ s.name = "damerau-levenshtein"
10
+ s.version = DamerauLevenshtein::VERSION
11
+ s.homepage = "https://github.com/GlobalNamesArchitecture/damerau-levenshtein"
12
+ s.authors = ["Dmitry Mozzherin"]
13
+ s.email = "dmozzherin@gmail.com"
14
+ s.license = "MIT"
15
+ s.summary = "Calculation of editing distance for 2 strings " \
16
+ "using Levenshtein or Damerau-Levenshtein algorithms"
17
+ s.description = "This gem implements pure Levenshtein algorithm, " \
18
+ "Damerau modification (where 2 character " \
19
+ "transposition counts as 1 edit distance). It also " \
20
+ "includes Boehmer & Rees 2008 modification, " \
21
+ "to handle transposition in blocks with more than " \
22
+ "2 characters (Boehmer & Rees 2008)."
23
+ s.files = `git ls-files -z`.split("\x0").
24
+ reject { |f| f.match(%r{^(test|spec|features)/}) } # git-tracked files outside test/, spec/ and features/
25
+ s.extensions = ["ext/damerau_levenshtein/extconf.rb"] # extconf.rb of the C extension
26
+ s.require_paths = ["lib", "lib/damerau-levenshtein"]
27
+ # development-only dependencies; no runtime dependency is declared
28
+ s.add_development_dependency "activesupport", "~> 6.0"
29
+ s.add_development_dependency "bundler", "~> 2.0"
30
+ s.add_development_dependency "byebug", "~> 11.0"
31
+ s.add_development_dependency "coveralls", "~> 0.8"
32
+ s.add_development_dependency "cucumber", "~> 3.1"
33
+ s.add_development_dependency "rake", "~> 13.0"
34
+ s.add_development_dependency "rake-compiler", "~> 1.0"
35
+ s.add_development_dependency "rspec", "~> 3.9"
36
+ s.add_development_dependency "rubocop", "~> 0.76"
37
+ s.add_development_dependency "ruby-prof", "~> 1.0"
38
+ s.add_development_dependency "shoulda", "~> 3.6"
39
+ end
@@ -0,0 +1,145 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE DamerauLevenshteinBinding = Qnil;
4
+
5
+ void Init_damerau_levenshtein();
6
+
7
+ VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance);
8
+
9
+ /* Extension entry point: defines module DamerauLevenshteinBinding and its
10
+  * four-argument method internal_distance. */
11
+ void Init_damerau_levenshtein() {
12
+   DamerauLevenshteinBinding = rb_define_module("DamerauLevenshteinBinding");
13
+   rb_define_method(DamerauLevenshteinBinding, "internal_distance", method_internal_distance, 4);
14
+ }
15
+
16
+ /*
17
+  * Edit distance between two Ruby arrays of integers.
18
+  *
19
+  * _s, _t         arrays; every element is converted with NUM2LL
20
+  * _block_size    0 selects pure Levenshtein (no transpositions); otherwise
21
+  *                the largest transposition block that is tried
22
+  * _max_distance  when the smallest value computed for one position of _s
23
+  *                exceeds it, the scan stops and that value is returned
24
+  *
25
+  * Raises TypeError when _s or _t is not an Array. Scratch memory comes from
26
+  * ALLOCV_N, so it is reclaimed even when an element conversion raises.
27
+  */
28
+ VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance){
29
+   int i, i1, j, j1, k, half_sl, half_tl, max_block, block;
30
+   int cost, distance, del, ins, subs, transp;
31
+   int stop_execution = 0;
32
+   int min = 0;
33
+   int current_distance = 0;
34
+   int pure_levenshtein = 0;
35
+   int block_size, max_distance, sl, tl;
36
+   long long *s, *t;
37
+   int *d;
38
+   VALUE s_buf = 0, t_buf = 0, d_buf = 0;
39
+
40
+   /* a non-Array argument raises TypeError instead of being read as an array */
41
+   Check_Type(_s, T_ARRAY);
42
+   Check_Type(_t, T_ARRAY);
43
+
44
+   block_size = NUM2INT(_block_size);
45
+   max_distance = NUM2INT(_max_distance);
46
+   sl = (int) RARRAY_LEN(_s);
47
+   tl = (int) RARRAY_LEN(_t);
48
+
49
+   if (block_size == 0) {
50
+     pure_levenshtein = 1;
51
+     block_size = 1;
52
+   }
53
+
54
+   if (sl == 0) return INT2NUM(tl);
55
+   if (tl == 0) return INT2NUM(sl);
56
+   /* single elements that differ (compared as VALUEs) are one edit apart */
57
+   if (sl == 1 && tl == 1 && rb_ary_entry(_s, 0) != rb_ary_entry(_t, 0)) return INT2NUM(1);
58
+
59
+   /* ALLOCV_N replaces the variable-length arrays and the unchecked malloc */
60
+   s = ALLOCV_N(long long, s_buf, sl);
61
+   t = ALLOCV_N(long long, t_buf, tl);
62
+   for (i = 0; i < sl; i++) s[i] = NUM2LL(rb_ary_entry(_s, i));
63
+   for (i = 0; i < tl; i++) t[i] = NUM2LL(rb_ary_entry(_t, i));
64
+
65
+   sl++;
66
+   tl++;
67
+
68
+   /* cap the block at half of each input length (no cap when that half is 0) */
69
+   half_sl = (sl - 1)/2;
70
+   half_tl = (tl - 1)/2;
71
+   max_block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
72
+   max_block = max_block < half_tl || half_tl == 0 ? max_block : half_tl;
73
+
74
+   /* one-dimensional representation of the (len(t)+1) x (len(s)+1) matrix */
75
+   d = ALLOCV_N(int, d_buf, (long) sl * tl);
76
+   /* first column: cells for the empty prefix of s */
77
+   for (i = 0; i < tl; i++) {
78
+     d[i*sl] = i;
79
+   }
80
+
81
+   /* fill up the matrix with scores */
82
+   for (i = 1; i < sl; i++) {
83
+     d[i] = i;
84
+     if (stop_execution == 1) break;
85
+     current_distance = 10000; /* start value for the running minimum */
86
+     for (j = 1; j < tl; j++) {
87
+       cost = 1;
88
+       if (s[i-1] == t[j-1]) cost = 0;
89
+
90
+       block = max_block;
91
+       while (block >= 1) {
92
+         int swap1 = 0;
93
+         int swap2 = 0;
94
+         /* compare blocks only while every index stays inside s and t;
95
+          * otherwise this cell gets no transposition.
96
+          * NOTE(review): each loop indexes both arrays from the same k, so the
97
+          * compared blocks line up with cell (i, j) only when i == j -- confirm. */
98
+         if (pure_levenshtein == 0 && i >= 2*block && j >= 2*block && i < tl && j < sl) {
99
+           swap1 = 1;
100
+           swap2 = 1;
101
+           i1 = i - (block * 2);
102
+           j1 = j - (block * 2);
103
+           for (k = i1; k < i1 + block; k++) {
104
+             if (s[k] != t[k + block]) {
105
+               swap1 = 0;
106
+               break;
107
+             }
108
+           }
109
+           for (k = j1; k < j1 + block; k++) {
110
+             if (t[k] != s[k + block]) {
111
+               swap2 = 0;
112
+               break;
113
+             }
114
+           }
115
+         }
116
+
117
+         del = d[j*sl + i - 1] + 1;
118
+         ins = d[(j-1)*sl + i] + 1;
119
+         min = del;
120
+         if (ins < min) min = ins;
121
+         if (swap1 == 1 && swap2 == 1) {
122
+           transp = d[(j - block * 2) * sl + i - block * 2] + cost + block - 1;
123
+           if (transp < min) min = transp;
124
+           block = 0;
125
+         } else if (block == 1) {
126
+           subs = d[(j-1)*sl + i - 1] + cost;
127
+           if (subs < min) min = subs;
128
+         }
129
+         block--;
130
+       }
131
+       d[j*sl+i] = min;
132
+       if (current_distance > min) current_distance = min;
133
+     }
134
+     if (current_distance > max_distance) {
135
+       stop_execution = 1;
136
+     }
137
+   }
138
+   /* an early stop reports the last running minimum instead of the final cell */
139
+   distance = stop_execution == 1 ? current_distance : d[sl * tl - 1];
140
+
141
+   ALLOCV_END(d_buf);
142
+   ALLOCV_END(t_buf);
143
+   ALLOCV_END(s_buf);
144
+   return INT2NUM(distance);
145
+ }
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Loads mkmf which is used to make makefiles for Ruby extensions
4
+ require "mkmf"
5
+
6
+ # The destination (same target path as the create_makefile call below)
7
+ dir_config("damerau-levenshtein/damerau_levenshtein")
8
+
9
+ # Generate the makefile for damerau-levenshtein/damerau_levenshtein
10
+ create_makefile("damerau-levenshtein/damerau_levenshtein")
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "damerau-levenshtein/version"
4
+ require_relative "damerau-levenshtein/damerau_levenshtein"
5
+ require_relative "damerau-levenshtein/formatter"
6
+ require_relative "damerau-levenshtein/differ"
7
+
8
+ # Damerau-Levenshtein edit distance; extends DamerauLevenshteinBinding for internal_distance
9
+ module DamerauLevenshtein
10
+ extend DamerauLevenshteinBinding
11
+ # Returns the VERSION constant
12
+ def self.version
13
+ VERSION
14
+ end
15
+ # Edit distance between two strings, compared as arrays of unpack("U*") code points
16
+ def self.distance(str1, str2, block_size = 1, max_distance = 10)
17
+ internal_distance(
18
+ str1.unpack("U*"), str2.unpack("U*"),
19
+ block_size, max_distance
20
+ )
21
+ end
22
+ # Alias for .distance; all arguments are forwarded unchanged
23
+ def self.string_distance(*args)
24
+ distance(*args)
25
+ end
26
+ # Edit distance between two arrays, handed to internal_distance as they are
27
+ def self.array_distance(array1, array2, block_size = 1, max_distance = 10)
28
+ internal_distance(array1, array2, block_size, max_distance)
29
+ end
30
+
31
+ # keep backward compatibility - internal_distance was called distance_utf
32
+ # before
33
+ def self.distance_utf(*args)
34
+ internal_distance(*args)
35
+ end
36
+ end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DamerauLevenshtein
4
+ # Shows the difference between two strings in character by character
5
+ # resolution
6
+ class Differ
7
+ FORMATS = %i[raw tag].freeze
8
+ attr_reader :format
9
+ # Starts with the :tag format and an empty matrix
10
+ def initialize
11
+ @format = :tag
12
+ @matrix = []
13
+ end
14
+ # Sets the output format; values that are not in FORMATS leave it unchanged
15
+ def format=(new_format)
16
+ new_format = new_format.to_sym
17
+ @format = new_format if FORMATS.include?(new_format)
18
+ end
19
+ # Builds the distance matrix for str1/str2, traces it back and formats the result
20
+ def run(str1, str2)
21
+ @len1 = str1.size.freeze
22
+ @len2 = str2.size.freeze
23
+ prepare_matrix
24
+ edit_distance(str1, str2)
25
+ raw = trace_back
26
+ formatter_factory.show(raw, str1, str2)
27
+ end
28
+
29
+ private
30
+ # Wraps the formatter module that matches @format in a Formatter
31
+ def formatter_factory
32
+ formatter =
33
+ case @format
34
+ when :tag
35
+ DamerauLevenshtein::FormatterTag
36
+ when :raw
37
+ DamerauLevenshtein::FormatterRaw
38
+ end
39
+ Formatter.new(formatter)
40
+ end
41
+ # Fills @matrix cell by cell; rows follow str2 (i), columns follow str1 (j)
42
+ def edit_distance(str1, str2)
43
+ (1..@len2).each do |i|
44
+ (1..@len1).each do |j|
45
+ no_change(i, j) && next if str2[i - 1] == str1[j - 1]
46
+ @matrix[i][j] = [del(i, j), ins(i, j), subst(i, j)].min + 1
47
+ end
48
+ end
49
+ end
50
+ # Walks from cell [@len2, @len1] back to [0, 0], prepending one entry per step
51
+ def trace_back
52
+ res = []
53
+ cell = [@len2, @len1]
54
+ while cell != [0, 0]
55
+ cell, char = char_data(cell)
56
+ res.unshift char
57
+ end
58
+ res
59
+ end
60
+ # Returns [previous cell, { distance:, type: }] for one trace-back step
61
+ def char_data(cell)
62
+ char = { distance: @matrix[cell[0]][cell[1]] }
63
+ val = find_previous(cell)
64
+ previous_value = val[0][0]
65
+ char[:type] = previous_value == char[:distance] ? :same : val[1] # equal value means no edit
66
+ cell = val.pop
67
+ [cell, char]
68
+ end
69
+ # Candidates are [[value, priority], type, cell]; ties go to subst (0), then ins (1), then del (2)
70
+ def find_previous(cell)
71
+ candidates = [[[ins(*cell), 1], :ins, [cell[0], cell[1] - 1]],
72
+ [[del(*cell), 2], :del, [cell[0] - 1, cell[1]]],
73
+ [[subst(*cell), 0], :subst, [cell[0] - 1, cell[1] - 1]]]
74
+ select_cell(candidates)
75
+ end
76
+ # Drops candidates whose cell has a negative index and returns the one that sorts first
77
+ def select_cell(candidates)
78
+ candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
79
+ sort_by(&:first).first
80
+ end
81
+ # Value of the cell one row up
82
+ def del(i, j)
83
+ @matrix[i - 1][j]
84
+ end
85
+ # Value of the cell one column to the left
86
+ def ins(i, j)
87
+ @matrix[i][j - 1]
88
+ end
89
+ # Value of the diagonal (up-left) cell
90
+ def subst(i, j)
91
+ @matrix[i - 1][j - 1]
92
+ end
93
+ # Copies the diagonal value into the cell (used when the characters match)
94
+ def no_change(i, j)
95
+ @matrix[i][j] = @matrix[i - 1][j - 1]
96
+ end
97
+ # Builds @len2 + 1 rows of @len1 + 1 cells; first row and first column are pre-filled
98
+ def prepare_matrix
99
+ @matrix = []
100
+ @matrix << (0..@len1).to_a
101
+ @len2.times do |i|
102
+ ary = [i + 1] + (1..@len1).map { nil }
103
+ @matrix << ary
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DamerauLevenshtein
4
+ # Formats supplied strings according to their differences
5
+ class Formatter
6
+ def initialize(formatter)
7
+ @formatter = formatter
8
+ end
9
+ # Delegates to the show method of the formatter given to the constructor
10
+ def show(raw_format, str1, str2)
11
+ @formatter.show(raw_format, str1, str2)
12
+ end
13
+ end
14
+
15
+ # Outputs raw format for two strings
16
+ module FormatterRaw
17
+ def self.show(raw_format, _, _)
18
+ raw_format
19
+ end
20
+ end
21
+
22
+ # Outputs strings marked with tags; show returns [tagged str1, tagged str2]
23
+ module FormatterTag
24
+ class << self
25
+ def show(raw_format, str1, str2)
26
+ inverted_raw_format = raw_format.map do |e|
27
+ type = invert_type(e[:type])
28
+ { distance: e[:distance], type: type }
29
+ end
30
+ [show_string(raw_format, str1, str2),
31
+ show_string(inverted_raw_format, str2, str1)]
32
+ end
33
+
34
+ private
35
+ # Swaps :del and :ins; any other type is returned unchanged
36
+ def invert_type(type)
37
+ case type
38
+ when :del
39
+ :ins
40
+ when :ins
41
+ :del
42
+ else
43
+ type
44
+ end
45
+ end
46
+ # Builds one tagged string from the raw entries and closes the last open tag
47
+ def show_string(raw, str1, str2)
48
+ data = { res: [], type: nil, deletes: 0, inserts: 0,
49
+ str1: str1, str2: str2 }
50
+ raw.each_with_index do |e, i|
51
+ process_entry(e, i, data)
52
+ end
53
+ data[:res] << format("</%<type>s>", data) if data[:type] != :same # NOTE(review): empty raw leaves type nil, so "</>" is appended
54
+ data[:res].join("")
55
+ end
56
+ # Emits tags when the entry type changes (or for the first entry), then the character
57
+ def process_entry(e, i, data)
58
+ if data[:type] && e[:type] != data[:type]
59
+ insert_tags(e, data)
60
+ elsif data[:type].nil?
61
+ data[:res] << format("<%<type>s>", e) if e[:type] != :same
62
+ end
63
+ insert_letter(e, i, data)
64
+ end
65
+ # Closes the previous tag and opens the new one; :same never gets a tag
66
+ def insert_tags(entry, data)
67
+ data[:res] << format("</%<type>s>", data) if data[:type] != :same
68
+ data[:res] << format("<%<type>s>", entry) if entry[:type] != :same
69
+ end
70
+ # Appends the character for this entry, then records inserts and the current type
71
+ def insert_letter(entry, index, data)
72
+ if entry[:type] == :del
73
+ insert_del(index, data)
74
+ else
75
+ insert_others(index, data)
76
+ end
77
+ data[:inserts] += 1 if entry[:type] == :ins
78
+ data[:type] = entry[:type]
79
+ end
80
+ # :del characters are read from str2, shifted back by the inserts counted so far
81
+ def insert_del(i, data)
82
+ data[:res] << data[:str2][i - data[:inserts]]
83
+ data[:deletes] += 1
84
+ end
85
+ # All other characters are read from str1, shifted back by the deletes counted so far
86
+ def insert_others(i, data)
87
+ data[:res] << data[:str1][i - data[:deletes]]
88
+ end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Damerau Levenshtein algorithm
4
+ module DamerauLevenshtein
5
+ VERSION = "1.3.2" # gem version string
6
+ end
metadata ADDED
@@ -0,0 +1,222 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: damerau-levenshtein
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.2
5
+ platform: ruby
6
+ authors:
7
+ - Dmitry Mozzherin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-11-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '6.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '6.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '11.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '11.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.8'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: cucumber
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '13.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '13.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake-compiler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.9'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.9'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.76'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.76'
139
+ - !ruby/object:Gem::Dependency
140
+ name: ruby-prof
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '1.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: shoulda
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.6'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.6'
167
+ description: This gem implements pure Levenshtein algorithm, Damerau modification
168
+ (where 2 character transposition counts as 1 edit distance). It also includes Boehmer
169
+ & Rees 2008 modification, to handle transposition in blocks with more than 2 characters
170
+ (Boehmer & Rees 2008).
171
+ email: dmozzherin@gmail.com
172
+ executables: []
173
+ extensions:
174
+ - ext/damerau_levenshtein/extconf.rb
175
+ extra_rdoc_files: []
176
+ files:
177
+ - ".document"
178
+ - ".gitignore"
179
+ - ".rspec"
180
+ - ".rubocop.yml"
181
+ - ".ruby-version"
182
+ - ".travis.yml"
183
+ - CHANGELOG.md
184
+ - Gemfile
185
+ - LICENSE.txt
186
+ - README.md
187
+ - Rakefile
188
+ - damerau-levenshtein.gemspec
189
+ - ext/damerau_levenshtein/damerau_levenshtein.c
190
+ - ext/damerau_levenshtein/extconf.rb
191
+ - lib/damerau-levenshtein.rb
192
+ - lib/damerau-levenshtein/damerau_levenshtein.so
193
+ - lib/damerau-levenshtein/differ.rb
194
+ - lib/damerau-levenshtein/formatter.rb
195
+ - lib/damerau-levenshtein/version.rb
196
+ homepage: https://github.com/GlobalNamesArchitecture/damerau-levenshtein
197
+ licenses:
198
+ - MIT
199
+ metadata: {}
200
+ post_install_message:
201
+ rdoc_options: []
202
+ require_paths:
203
+ - lib
204
+ - lib/damerau-levenshtein
205
+ required_ruby_version: !ruby/object:Gem::Requirement
206
+ requirements:
207
+ - - "~>"
208
+ - !ruby/object:Gem::Version
209
+ version: '2.5'
210
+ required_rubygems_version: !ruby/object:Gem::Requirement
211
+ requirements:
212
+ - - ">="
213
+ - !ruby/object:Gem::Version
214
+ version: '0'
215
+ requirements: []
216
+ rubyforge_project:
217
+ rubygems_version: 2.7.6.2
218
+ signing_key:
219
+ specification_version: 4
220
+ summary: Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein
221
+ algorithms
222
+ test_files: []