damerau-levenshtein 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '08637e3e7eb3789b7910dbb1d824aea39ac1e0a20a939cbf035a95f4657e50a0'
4
+ data.tar.gz: 430fbf0ac5d6723becbb1be76f075422bcee05cf9a0e45ef7405dd065d321cb3
5
+ SHA512:
6
+ metadata.gz: b0a24c262162ebcb8a310d352f6c2388f4e2067b9183d4804048909af2af81f504f477d9da8f9c614204e0c63905f7fff3e59e29c4c08ae4170fe886477ab37b
7
+ data.tar.gz: 18ca5a38b06fc6d1d0177bcc30f5a9c37f6a040cbc1de061f41ba0cfd187d133baafe7c2fe57f1435eb7afd802f0f43502ed71ee7a77302348dd9acf407265e0
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
@@ -0,0 +1,27 @@
1
+ Gemfile.lock
2
+ *.sw?
3
+ .DS_Store
4
+ tmp
5
+ *.o
6
+ *.bundle
7
+ *.gem
8
+ .nvimlog
9
+ .vim.custom
10
+ .byebug_history
11
+
12
+ # rcov generated
13
+ coverage
14
+
15
+ # rdoc generated
16
+ rdoc
17
+
18
+ # yard generated
19
+ doc
20
+ .yardoc
21
+
22
+ # bundler
23
+ .bundle
24
+ bin
25
+
26
+ # generated gems
27
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,15 @@
1
+ AllCops:
2
+ Exclude:
3
+ - features/**/*
4
+ - db/**/*
5
+ - bundle_bin/**/*
6
+ Include:
7
+ - damerau-levenshtein.gemspec
8
+ Metrics/ModuleLength:
9
+ Max: 1000
10
+ Style/StringLiterals:
11
+ EnforcedStyle: double_quotes
12
+ Layout/DotPosition:
13
+ EnforcedStyle: trailing
14
+ Naming/FileName:
15
+ Enabled: false
@@ -0,0 +1 @@
1
+ 2.5.7
@@ -0,0 +1,11 @@
1
+ rvm:
2
+ - 2.5
3
+ - 2.6
4
+ before_install: "gem update bundler"
5
+ script:
6
+ - "bundle exec rake"
7
+ - "bundle exec rake features"
8
+ branches:
9
+ only:
10
+ - master
11
+
@@ -0,0 +1,25 @@
1
+ damerau-levenshtein CHANGELOG
2
+ =============================
3
+
4
+ 1.3.1 -- gems update
5
+
6
+ 1.3.0 -- (issue #10) shows difference between two strings
7
+
8
+ 1.2.0 -- add edit distance for array of integers (by @azhi)
9
+
10
+ 1.1.3 -- add ruby 2.3.1 to travis tests by request from @greysteil
11
+
12
+ 1.1.2 -- remove unnecessary production dependencies (by @ixti)
13
+
14
+ 1.1.1 -- fix random negative distance bug (by @Skarlit)
15
+
16
+ 1.1.0 -- relax and update gem dependencies, update specs
17
+
18
+ 1.0.3 -- gems update
19
+
20
+ 1.0.2 -- includes refactoring by https://github.com/luislavena to remove
21
+ dependency on jeweler
22
+
23
+ 1.0.1 -- fixed gem dependencies
24
+
25
+ 1.0.0 -- stable version
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2011-2018 Dmitry Mozzherin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,250 @@
1
+ # damerau-levenshtein #
2
+
3
+ [![Gem Version][gem_svg]][gem]
4
+ [![Continuous Integration Status][ci_svg]][ci]
5
+ [![Dependency Status][dep_svg]][dep]
6
+ [![Coverage Status][cov_svg]][cov]
7
+
8
+ The damerau-levenshtein gem finds the [edit distance][ed] between two
9
+ UTF-8 or ASCII encoded strings with O(N\*M) efficiency.
10
+
11
+ This gem implements pure Levenshtein algorithm, Damerau modification of it
12
+ (where 2 character transposition counts as 1 edit distance). It also includes
13
+ Boehmer & Rees 2008 modification of Damerau algorithm, where transposition
14
+ of blocks bigger than 1 character is taken into account as well
15
+ [(Rees 2014)][rees2014].
16
+
17
+ ```ruby
18
+ require "damerau-levenshtein"
19
+ DamerauLevenshtein.distance("Something", "Smoething") #returns 1
20
+ ```
21
+
22
+ It also returns a diff between two strings according to Levenshtein algorithm.
23
+ The diff is expressed by tags `<ins>`, `<del>`, and `<subst>`. Such tags make
24
+ it possible to highlight differences between strings in a flexible way.
25
+
26
+ ```ruby
27
+ require "damerau-levenshtein"
28
+ differ = DamerauLevenshtein::Differ.new
29
+ differ.run("corn", "cron")
30
+ # output: ["c<subst>or</subst>n", "c<subst>ro</subst>n"]
31
+ ```
32
+
33
+ ## Dependencies ##
34
+
35
+ sudo apt-get install build-essential libgmp3-dev
36
+
37
+ ## Installation ##
38
+
39
+ gem install damerau-levenshtein
40
+
41
+ ## Examples ##
42
+
43
+ ```ruby
44
+ require "damerau-levenshtein"
45
+ dl = DamerauLevenshtein
46
+ ```
47
+
48
+ * compare using Damerau Levenshtein algorithm
49
+
50
+ ```ruby
51
+ dl.distance("Something", "Smoething") #returns 1
52
+ ```
53
+
54
+ * compare using Levenshtein algorithm
55
+
56
+ ```ruby
57
+ dl.distance("Something", "Smoething", 0) #returns 2
58
+ ```
59
+
60
+ * compare using Boehmer & Rees modification
61
+
62
+ ```ruby
63
+ dl.distance("Something", "meSothing", 2) #returns 2 instead of 4
64
+ ```
65
+
66
+ * comparison of words with UTF-8 characters should work fine:
67
+
68
+ ```ruby
69
+ dl.distance("Sjöstedt", "Sjostedt") #returns 1
70
+ ```
71
+
72
+ * compare two arrays
73
+
74
+ ```ruby
75
+ dl.array_distance([1,2,3,5], [1,2,3,4]) #returns 1
76
+ ```
77
+
78
+ * return diff between two strings
79
+
80
+ ```ruby
81
+ differ = DamerauLevenshtein::Differ.new
82
+ differ.run("Something", "smthg")
83
+ ```
84
+
85
+ * return diff between two strings in raw format
86
+
87
+ ```ruby
88
+ differ = DamerauLevenshtein::Differ.new
89
+ differ.format = :raw
90
+ differ.run("Something", "smthg")
91
+ ```
92
+
93
+ ## API Description ##
94
+
95
+ ### Methods ###
96
+
97
+ #### DamerauLevenshtein.version
98
+
99
+ ```ruby
100
+ DamerauLevenshtein.version
101
+ #returns version number of the gem
102
+ ```
103
+
104
+ #### DamerauLevenshtein.distance
105
+
106
+ ```ruby
107
+ DamerauLevenshtein.distance(string1, string2, block_size, max_distance)
108
+ #returns edit distance between 2 strings
109
+
110
+ DamerauLevenshtein.string_distance(string1, string2, block_size, max_distance)
111
+ # an alias for .distance
112
+
113
+ DamerauLevenshtein.array_distance(array1, array2, block_size, max_distance)
114
+ # returns edit distance between 2 arrays of integers
115
+ ```
116
+
117
+ `DamerauLevenshtein.distance` and `.array_distance` take 4 arguments:
118
+
119
+ * `string1` (`array1` for `.array_distance`)
120
+ * `string2` (`array2` for `.array_distance`)
121
+ * `block_size` (default is 1)
122
+ * `max_distance` (default is 10)
123
+
124
+ `block_size` determines maximum number of characters in a transposition block:
125
+
126
+ block_size = 0
127
+ (transposition does not count -- it is a pure Levenshtein algorithm)
128
+
129
+ block_size = 1
130
+ (transposition between 2 adjacent characters --
131
+ it is pure Damerau-Levenshtein algorithm)
132
+
133
+ block_size = 2
134
+ (transposition between blocks as big as 2 characters -- so abcd and cdab
135
+ counts as edit distance 2, not 4)
136
+
137
+ block_size = 3
138
+ (transposition between blocks as big as 3 characters --
139
+ so abcdef and defabc counts as edit distance 3, not 6)
140
+
141
+ etc.
142
+
143
+ `max_distance` -- is a threshold after which algorithm gives up and
144
+ returns a value greater than max_distance instead of the real edit distance.
145
+
146
+ Levenshtein algorithm is expensive, so it makes sense to give up when edit
147
+ distance is becoming too big. The argument max_distance does just that.
148
+
149
+ ```ruby
150
+
151
+ DamerauLevenshtein.distance("abcdefg", "1234567", 0, 3)
152
+ # output: 4 -- it gave up when edit distance exceeded 3
153
+
154
+ ```
155
+
156
+ #### DamerauLevenshtein::Differ
157
+
158
+ `differ = DamerauLevenshtein::Differ.new` creates a new differ instance that returns the difference between two strings
159
+
160
+ `differ.format` shows current format for diff. Default is `:tag` format
161
+
162
+ `differ.format = :raw` changes current format for diffs. Possible values are `:tag` and `:raw`
163
+
164
+ `differ.run("String1", "String2")` returns difference between two strings.
165
+
166
+ For example:
167
+
168
+ ```ruby
169
+ differ = DamerauLevenshtein::Differ.new
170
+ differ.run("Something", "smthng")
171
+ # output: ["<ins>S</ins><subst>o</subst>m<ins>e</ins>th<ins>i</ins>ng",
172
+ # "<del>S</del><subst>s</subst>m<del>e</del>th<del>i</del>ng"]
173
+
174
+ ```
175
+
176
+ Or with parsing:
177
+
178
+ ```ruby
179
+ require "damerau-levenshtein"
180
+ require "nokogiri"
181
+
182
+ differ = DamerauLevenshtein::Differ.new
183
+ res = differ.run("Something", "Smothing!")
184
+ nodes = Nokogiri::XML("<root>#{res.first}</root>")
185
+
186
+ markup = nodes.root.children.map do |n|
187
+ case n.name
188
+ when "text"
189
+ n.text
190
+ when "del"
191
+ "~~#{n.children.first.text}~~"
192
+ when "ins"
193
+ "*#{n.children.first.text}*"
194
+ when "subst"
195
+ "**#{n.children.first.text}**"
196
+ end
197
+ end.join("")
198
+
199
+ puts markup
200
+ ```
201
+
202
+ ## Contributing to damerau-levenshtein ##
203
+
204
+ * Check out the latest master to make sure the feature hasn't been
205
+ implemented or the bug hasn't been fixed yet
206
+ * Check out the issue tracker to make sure someone already hasn't requested
207
+ it and/or contributed it
208
+ * Fork the project
209
+ * Start a feature/bugfix branch
210
+ * Commit and push until you are happy with your contribution
211
+ * Make sure to add tests for it. This is important so I don't break it
212
+ in a future version unintentionally.
213
+ * Please try not to mess with the Rakefile, version, or history. If you want
214
+ to have your own version, or is otherwise necessary, that is fine, but please
215
+ isolate to its own commit so I can cherry-pick around it.
216
+
217
+ ## Versioning ##
218
+
219
+ This gem is following practices of [Semantic Versioning][semver]
220
+
221
+ ## Authors ##
222
+
223
+ [Dmitry Mozzherin][dimus]
224
+
225
+ ## Contributors ##
226
+
227
+ [lazylester][lazylester], [Ran Xie][skarlit], [Alexey Zapparov][ixti], [azhi][azhi], [Josephine Wright][jozr]
228
+
229
+ ## Copyright ##
230
+
231
+ Copyright (c) 2011-2018 Dmitry Mozzherin. See LICENSE.txt for
232
+ further details.
233
+
234
+ [gem_svg]: https://badge.fury.io/rb/damerau-levenshtein.svg
235
+ [gem]: http://badge.fury.io/rb/damerau-levenshtein
236
+ [ci_svg]: https://secure.travis-ci.org/GlobalNamesArchitecture/damerau-levenshtein.svg
237
+ [ci]: http://travis-ci.org/GlobalNamesArchitecture/damerau-levenshtein
238
+ [dep_svg]: https://gemnasium.com/GlobalNamesArchitecture/damerau-levenshtein.svg
239
+ [dep]: https://gemnasium.com/GlobalNamesArchitecture/damerau-levenshtein
240
+ [cov_svg]: https://coveralls.io/repos/GlobalNamesArchitecture/damerau-levenshtein/badge.svg?branch=master
241
+ [cov]: https://coveralls.io/r/GlobalNamesArchitecture/damerau-levenshtein?branch=master
242
+ [ed]: http://en.wikipedia.org/wiki/Edit_distance
243
+ [semver]: http://semver.org/
244
+ [dimus]: https://github.com/dimus
245
+ [lazylester]: https://github.com/lazylester
246
+ [skarlit]: https://github.com/Skarlit
247
+ [ixti]: https://github.com/ixti
248
+ [azhi]: https://github.com/azhi
249
+ [jozr]: https://github.com/jozr
250
+ [rees2014]: https://dx.doi.org/10.1371/journal.pone.0107510
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+ require "cucumber/rake/task"
6
+ require "rubocop/rake_task"
7
+ require "rake/dsl_definition"
8
+ require "rake"
9
+ require "rake/extensiontask"
10
+ require "rspec"
11
+
12
+ RSpec::Core::RakeTask.new(:spec) do |rspec|
13
+ rspec.pattern = FileList["spec/**/*_spec.rb"]
14
+ end
15
+
16
+ Cucumber::Rake::Task.new(:features)
17
+
18
+ Rake::ExtensionTask.new("damerau_levenshtein") do |extension|
19
+ extension.ext_dir = "ext/damerau_levenshtein"
20
+ extension.lib_dir = "lib/damerau-levenshtein"
21
+ end
22
+
23
+ Rake::Task[:spec].prerequisites << :compile
24
+ Rake::Task[:features].prerequisites << :compile
25
+
26
+ RuboCop::RakeTask.new
27
+ task default: %i[rubocop spec]
28
+
29
+ desc "open an irb session preloaded with this gem"
30
+ task :console do
31
+ sh "irb -r pp -r ./lib/damerau-levenshtein.rb"
32
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.push File.expand_path("lib", __dir__)
4
+
5
+ require "damerau-levenshtein/version"
6
+
7
+ Gem::Specification.new do |s|
8
+ s.required_ruby_version = "~> 2.5"
9
+ s.name = "damerau-levenshtein"
10
+ s.version = DamerauLevenshtein::VERSION
11
+ s.homepage = "https://github.com/GlobalNamesArchitecture/damerau-levenshtein"
12
+ s.authors = ["Dmitry Mozzherin"]
13
+ s.email = "dmozzherin@gmail.com"
14
+ s.license = "MIT"
15
+ s.summary = "Calculation of editing distance for 2 strings " \
16
+ "using Levenshtein or Damerau-Levenshtein algorithms"
17
+ s.description = "This gem implements pure Levenshtein algorithm, " \
18
+ "Damerau modification (where 2 character " \
19
+ "transposition counts as 1 edit distance). It also " \
20
+ "includes Boehmer & Rees 2008 modification, " \
21
+ "to handle transposition in blocks with more than " \
22
+ "2 characters (Boehmer & Rees 2008)."
23
+ s.files = `git ls-files -z`.split("\x0").
24
+ reject { |f| f.match(%r{^(test|spec|features)/}) }
25
+ s.extensions = ["ext/damerau_levenshtein/extconf.rb"]
26
+ s.require_paths = ["lib", "lib/damerau-levenshtein"]
27
+
28
+ s.add_development_dependency "activesupport", "~> 6.0"
29
+ s.add_development_dependency "bundler", "~> 2.0"
30
+ s.add_development_dependency "byebug", "~> 11.0"
31
+ s.add_development_dependency "coveralls", "~> 0.8"
32
+ s.add_development_dependency "cucumber", "~> 3.1"
33
+ s.add_development_dependency "rake", "~> 13.0"
34
+ s.add_development_dependency "rake-compiler", "~> 1.0"
35
+ s.add_development_dependency "rspec", "~> 3.9"
36
+ s.add_development_dependency "rubocop", "~> 0.76"
37
+ s.add_development_dependency "ruby-prof", "~> 1.0"
38
+ s.add_development_dependency "shoulda", "~> 3.6"
39
+ end
@@ -0,0 +1,116 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE DamerauLevenshteinBinding = Qnil;
4
+
5
+ void Init_damerau_levenshtein();
6
+
7
+ VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance);
8
+
9
+ void Init_damerau_levenshtein() {
10
+ DamerauLevenshteinBinding = rb_define_module("DamerauLevenshteinBinding");
11
+ rb_define_method(DamerauLevenshteinBinding, "internal_distance", method_internal_distance, 4);
12
+ }
13
+
14
+ VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance){
15
+ VALUE *sv = RARRAY_PTR(_s);
16
+ VALUE *tv = RARRAY_PTR(_t);
17
+ int i, i1, j, j1, k, half_tl, cost, *d, distance, del, ins, subs, transp, block;
18
+ int half_sl;
19
+ int stop_execution = 0;
20
+ int min = 0;
21
+ int current_distance = 0;
22
+ int pure_levenshtein = 0;
23
+ int block_size = NUM2INT(_block_size);
24
+ int max_distance = NUM2INT(_max_distance);
25
+ int sl = (int) RARRAY_LEN(_s);
26
+ int tl = (int) RARRAY_LEN(_t);
27
+ long long s[sl];
28
+ long long t[tl];
29
+
30
+ if (block_size == 0) {
31
+ pure_levenshtein = 1;
32
+ block_size = 1;
33
+ }
34
+
35
+
36
+ if (sl == 0) return INT2NUM(tl);
37
+ if (tl == 0) return INT2NUM(sl);
38
+ //case of lengths 1 must present or it will break further in the code
39
+ if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
40
+
41
+
42
+ for (i=0; i < sl; i++) s[i] = NUM2LL(sv[i]);
43
+ for (i=0; i < tl; i++) t[i] = NUM2LL(tv[i]);
44
+
45
+ sl++;
46
+ tl++;
47
+
48
+ //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
49
+ d = malloc((sizeof(int))*(sl)*(tl));
50
+ //populate 'vertical' row starting from the 2nd position (first one is filled already)
51
+ for(i = 0; i < tl; i++){
52
+ d[i*sl] = i;
53
+ }
54
+
55
+ //fill up array with scores
56
+ for(i = 1; i<sl; i++){
57
+ d[i] = i;
58
+ if (stop_execution == 1) break;
59
+ current_distance = 10000;
60
+ for(j = 1; j<tl; j++){
61
+
62
+ cost = 1;
63
+ if(s[i-1] == t[j-1]) cost = 0;
64
+
65
+ half_sl = (sl - 1)/2;
66
+ half_tl = (tl - 1)/2;
67
+
68
+ block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
69
+ block = block < half_tl || half_tl == 0 ? block : half_tl;
70
+
71
+ while (block >= 1){
72
+ int swap1 = 1;
73
+ int swap2 = 1;
74
+ i1 = i - (block * 2);
75
+ j1 = j - (block * 2);
76
+ for (k = i1; k < i1 + block; k++) {
77
+ if (s[k] != t[k + block]){
78
+ swap1 = 0;
79
+ break;
80
+ }
81
+ }
82
+ for (k = j1; k < j1 + block; k++) {
83
+ if (t[k] != s[k + block]){
84
+ swap2 = 0;
85
+ break;
86
+ }
87
+ }
88
+
89
+ del = d[j*sl + i - 1] + 1;
90
+ ins = d[(j-1)*sl + i] + 1;
91
+ min = del;
92
+ if (ins < min) min = ins;
93
+ //if (i == 2 && j==2) return INT2NUM(swap2+5);
94
+ if (pure_levenshtein == 0 && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1){
95
+ transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
96
+ if (transp < min) min = transp;
97
+ block = 0;
98
+ } else if (block == 1) {
99
+ subs = d[(j-1)*sl + i - 1] + cost;
100
+ if (subs < min) min = subs;
101
+ }
102
+ block--;
103
+ }
104
+ d[j*sl+i]=min;
105
+ if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
106
+ }
107
+ if (current_distance > max_distance) {
108
+ stop_execution = 1;
109
+ }
110
+ }
111
+ distance=d[sl * tl - 1];
112
+ if (stop_execution == 1) distance = current_distance;
113
+
114
+ free(d);
115
+ return INT2NUM(distance);
116
+ }
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Loads mkmf which is used to make makefiles for Ruby extensions
4
+ require "mkmf"
5
+
6
+ # The destination
7
+ dir_config("damerau-levenshtein/damerau_levenshtein")
8
+
9
+ # Do the work
10
+ create_makefile("damerau-levenshtein/damerau_levenshtein")
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "damerau-levenshtein/version"
4
+ require_relative "damerau-levenshtein/damerau_levenshtein"
5
+ require_relative "damerau-levenshtein/formatter"
6
+ require_relative "damerau-levenshtein/differ"
7
+
8
# Module-level API for edit-distance calculations. The actual computation is
# delegated to +internal_distance+, which comes from DamerauLevenshteinBinding.
module DamerauLevenshtein
  extend DamerauLevenshteinBinding

  class << self
    # Returns the gem version constant.
    def version
      VERSION
    end

    # Edit distance between two strings. Both strings are unpacked into
    # arrays of UTF-8 code points ("U*") before being handed to the binding.
    def distance(str1, str2, block_size = 1, max_distance = 10)
      codepoints1 = str1.unpack("U*")
      codepoints2 = str2.unpack("U*")
      internal_distance(codepoints1, codepoints2, block_size, max_distance)
    end

    # Alias for .distance; forwards every argument unchanged.
    def string_distance(*args)
      distance(*args)
    end

    # Edit distance between two arrays, passed to the binding as they are.
    def array_distance(array1, array2, block_size = 1, max_distance = 10)
      internal_distance(array1, array2, block_size, max_distance)
    end

    # keep backward compatibility - internal_distance was called distance_utf
    # before
    def distance_utf(*args)
      internal_distance(*args)
    end
  end
end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
module DamerauLevenshtein
  # Builds a Levenshtein score matrix for two strings, walks it back from the
  # last cell to the origin, and hands the resulting per-step records to a
  # formatter selected by the current format.
  class Differ
    FORMATS = %i[raw tag].freeze
    attr_reader :format

    def initialize
      @format = :tag
      @matrix = []
    end

    # Switches the output format. Values outside FORMATS are ignored and the
    # current format is kept.
    def format=(new_format)
      requested = new_format.to_sym
      @format = requested if FORMATS.include?(requested)
    end

    # Returns the formatter's rendering of the difference between str1 and
    # str2.
    def run(str1, str2)
      @len1 = str1.size.freeze
      @len2 = str2.size.freeze
      prepare_matrix
      edit_distance(str1, str2)
      steps = trace_back
      formatter_factory.show(steps, str1, str2)
    end

    private

    # Wraps the formatter module matching the current format.
    def formatter_factory
      strategy =
        case @format
        when :tag then DamerauLevenshtein::FormatterTag
        when :raw then DamerauLevenshtein::FormatterRaw
        end
      Formatter.new(strategy)
    end

    # Fills every cell of the matrix; rows follow str2, columns follow str1.
    def edit_distance(str1, str2)
      1.upto(@len2) do |i|
        1.upto(@len1) do |j|
          if str2[i - 1] == str1[j - 1]
            no_change(i, j)
          else
            @matrix[i][j] = [del(i, j), ins(i, j), subst(i, j)].min + 1
          end
        end
      end
    end

    # Walks from the bottom-right cell to [0, 0], collecting one record per
    # step in left-to-right order.
    def trace_back
      steps = []
      cell = [@len2, @len1]
      until cell == [0, 0]
        cell, step = char_data(cell)
        steps.unshift(step)
      end
      steps
    end

    # Returns [previous_cell, record] for one step of the walk. The step is
    # :same when the chosen predecessor holds the same distance as the cell.
    def char_data(cell)
      row, col = cell
      distance = @matrix[row][col]
      (previous_value, _rank), kind, previous_cell = find_previous(cell)
      type = previous_value == distance ? :same : kind
      [previous_cell, { distance: distance, type: type }]
    end

    # Lists the three possible predecessors; the second element of each sort
    # key breaks ties in favour of substitution, then insertion, then deletion.
    def find_previous(cell)
      row, col = cell
      candidates = [
        [[ins(row, col), 1], :ins, [row, col - 1]],
        [[del(row, col), 2], :del, [row - 1, col]],
        [[subst(row, col), 0], :subst, [row - 1, col - 1]]
      ]
      select_cell(candidates)
    end

    # Drops predecessors that fall outside the matrix and picks the one with
    # the smallest sort key.
    def select_cell(candidates)
      inside = candidates.select { |candidate| candidate.last.none?(&:negative?) }
      inside.min_by(&:first)
    end

    def del(i, j)
      @matrix[i - 1][j]
    end

    def ins(i, j)
      @matrix[i][j - 1]
    end

    def subst(i, j)
      @matrix[i - 1][j - 1]
    end

    def no_change(i, j)
      @matrix[i][j] = @matrix[i - 1][j - 1]
    end

    # Creates the matrix with its first row and first column filled in and
    # every other cell set to nil.
    def prepare_matrix
      @matrix = [(0..@len1).to_a]
      1.upto(@len2) { |row| @matrix << [row, *Array.new(@len1)] }
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
module DamerauLevenshtein
  # Delegates rendering to the formatter object it was constructed with.
  class Formatter
    def initialize(formatter)
      @formatter = formatter
    end

    # Forwards the raw diff records and both strings to the wrapped formatter.
    def show(raw_format, str1, str2)
      @formatter.show(raw_format, str1, str2)
    end
  end

  # Outputs raw format for two strings
  module FormatterRaw
    # Returns the raw diff records unchanged; the strings are not used.
    def self.show(raw_format, _, _)
      raw_format
    end
  end

  # Outputs strings marked with tags
  module FormatterTag
    class << self
      # Returns a two-element array: the first string rendered with the given
      # records, and the second string rendered with :ins and :del swapped.
      def show(raw_format, str1, str2)
        inverted_raw_format = raw_format.map do |e|
          type = invert_type(e[:type])
          { distance: e[:distance], type: type }
        end
        [show_string(raw_format, str1, str2),
         show_string(inverted_raw_format, str2, str1)]
      end

      private

      # Swaps :ins and :del; any other type is returned as is.
      def invert_type(type)
        case type
        when :del
          :ins
        when :ins
          :del
        else
          type
        end
      end

      # Renders one tagged string from the list of diff records.
      def show_string(raw, str1, str2)
        data = { res: [], type: nil, deletes: 0, inserts: 0,
                 str1: str1, str2: str2 }
        raw.each_with_index do |e, i|
          process_entry(e, i, data)
        end
        # data[:type] is still nil when there were no records at all; in that
        # case no tag was opened, so there is nothing to close.
        if data[:type] && data[:type] != :same
          data[:res] << format("</%<type>s>", data)
        end
        data[:res].join("")
      end

      # Emits tag changes for one record (if its type differs from the
      # previous one) and then the record's letter.
      def process_entry(e, i, data)
        if data[:type] && e[:type] != data[:type]
          insert_tags(e, data)
        elsif data[:type].nil?
          data[:res] << format("<%<type>s>", e) if e[:type] != :same
        end
        insert_letter(e, i, data)
      end

      # Closes the tag of the previous type and opens the tag of the new one;
      # :same never gets a tag.
      def insert_tags(entry, data)
        data[:res] << format("</%<type>s>", data) if data[:type] != :same
        data[:res] << format("<%<type>s>", entry) if entry[:type] != :same
      end

      # Appends the letter for this record and updates the running counters
      # and the current type.
      def insert_letter(entry, index, data)
        if entry[:type] == :del
          insert_del(index, data)
        else
          insert_others(index, data)
        end
        data[:inserts] += 1 if entry[:type] == :ins
        data[:type] = entry[:type]
      end

      # A :del letter is taken from str2, offset by the inserts seen so far.
      def insert_del(i, data)
        data[:res] << data[:str2][i - data[:inserts]]
        data[:deletes] += 1
      end

      # Every other letter is taken from str1, offset by the deletes seen so
      # far.
      def insert_others(i, data)
        data[:res] << data[:str1][i - data[:deletes]]
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Damerau Levenshtein algorithm
4
+ module DamerauLevenshtein
5
+ VERSION = "1.3.2"
6
+ end
metadata ADDED
@@ -0,0 +1,222 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: damerau-levenshtein
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.2
5
+ platform: ruby
6
+ authors:
7
+ - Dmitry Mozzherin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-11-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '6.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '6.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '11.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '11.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.8'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: cucumber
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '13.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '13.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake-compiler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.9'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.9'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.76'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.76'
139
+ - !ruby/object:Gem::Dependency
140
+ name: ruby-prof
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '1.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: shoulda
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.6'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.6'
167
+ description: This gem implements pure Levenshtein algorithm, Damerau modification
168
+ (where 2 character transposition counts as 1 edit distance). It also includes Boehmer
169
+ & Rees 2008 modification, to handle transposition in blocks with more than 2 characters
170
+ (Boehmer & Rees 2008).
171
+ email: dmozzherin@gmail.com
172
+ executables: []
173
+ extensions:
174
+ - ext/damerau_levenshtein/extconf.rb
175
+ extra_rdoc_files: []
176
+ files:
177
+ - ".document"
178
+ - ".gitignore"
179
+ - ".rspec"
180
+ - ".rubocop.yml"
181
+ - ".ruby-version"
182
+ - ".travis.yml"
183
+ - CHANGELOG.md
184
+ - Gemfile
185
+ - LICENSE.txt
186
+ - README.md
187
+ - Rakefile
188
+ - damerau-levenshtein.gemspec
189
+ - ext/damerau_levenshtein/damerau_levenshtein.c
190
+ - ext/damerau_levenshtein/extconf.rb
191
+ - lib/damerau-levenshtein.rb
192
+ - lib/damerau-levenshtein/damerau_levenshtein.so
193
+ - lib/damerau-levenshtein/differ.rb
194
+ - lib/damerau-levenshtein/formatter.rb
195
+ - lib/damerau-levenshtein/version.rb
196
+ homepage: https://github.com/GlobalNamesArchitecture/damerau-levenshtein
197
+ licenses:
198
+ - MIT
199
+ metadata: {}
200
+ post_install_message:
201
+ rdoc_options: []
202
+ require_paths:
203
+ - lib
204
+ - lib/damerau-levenshtein
205
+ required_ruby_version: !ruby/object:Gem::Requirement
206
+ requirements:
207
+ - - "~>"
208
+ - !ruby/object:Gem::Version
209
+ version: '2.5'
210
+ required_rubygems_version: !ruby/object:Gem::Requirement
211
+ requirements:
212
+ - - ">="
213
+ - !ruby/object:Gem::Version
214
+ version: '0'
215
+ requirements: []
216
+ rubyforge_project:
217
+ rubygems_version: 2.7.6.2
218
+ signing_key:
219
+ specification_version: 4
220
+ summary: Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein
221
+ algorithms
222
+ test_files: []