damerau-levenshtein 1.1.3 → 1.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +4 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/.travis.yml +3 -4
- data/CHANGELOG.md +7 -1
- data/Gemfile +3 -1
- data/LICENSE.txt +1 -1
- data/README.md +127 -41
- data/Rakefile +17 -10
- data/damerau-levenshtein.gemspec +16 -11
- data/ext/damerau_levenshtein/damerau_levenshtein.c +7 -7
- data/ext/damerau_levenshtein/extconf.rb +2 -0
- data/lib/damerau-levenshtein.rb +23 -4
- data/lib/damerau-levenshtein/damerau_levenshtein.so +0 -0
- data/lib/damerau-levenshtein/differ.rb +107 -0
- data/lib/damerau-levenshtein/formatter.rb +91 -0
- data/lib/damerau-levenshtein/version.rb +3 -1
- metadata +68 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1df0e124b37dd40f7d57bd0d96abcb1e0bdfef4d7dc38439ecdcddb8d292aa9b
|
4
|
+
data.tar.gz: '082576cec439a97f6c3bf262c52ec0e6f9dd26805971d9bc3791cdf9f5e2f1dc'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c705b20404d0f2a344cd55693fb207666b16505da67390a82f31ebb1df37fa8c260f121da6fe70447179ae14df34281b9cbfa2332ddcced34afb42acfc861817
|
7
|
+
data.tar.gz: 846aaab0f6c38fdc4bd4ab626de6aec11ddf213b845b75867cad3ee4bd728bf121efb12b0886ead512cfad24798f052bb2c81dd2b36ba43b8f1633d569768feb
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
AllCops:
|
2
|
+
NewCops: disable
|
2
3
|
Exclude:
|
3
4
|
- features/**/*
|
4
5
|
- db/**/*
|
@@ -9,7 +10,7 @@ Metrics/ModuleLength:
|
|
9
10
|
Max: 1000
|
10
11
|
Style/StringLiterals:
|
11
12
|
EnforcedStyle: double_quotes
|
12
|
-
|
13
|
+
Layout/DotPosition:
|
13
14
|
EnforcedStyle: trailing
|
14
|
-
|
15
|
+
Naming/FileName:
|
15
16
|
Enabled: false
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.5.8
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
damerau-levenshtein CHANGELOG
|
2
2
|
=============================
|
3
3
|
|
4
|
-
1.
|
4
|
+
1.3.1 -- gems update
|
5
|
+
|
6
|
+
1.3.0 -- (issue #10) shows difference between two strings
|
7
|
+
|
8
|
+
1.2.0 -- add edit distance for array of integers (by @azhi)
|
9
|
+
|
10
|
+
1.1.3 -- add ruby 2.3.1 to travis tests by request from @greysteil
|
5
11
|
|
6
12
|
1.1.2 -- remove unnecessary production dependencies (by @ixti)
|
7
13
|
|
data/Gemfile
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,40 +1,44 @@
|
|
1
|
-
damerau-levenshtein
|
2
|
-
===================
|
1
|
+
# damerau-levenshtein #
|
3
2
|
|
4
3
|
[![Gem Version][gem_svg]][gem]
|
5
4
|
[![Continuous Integration Status][ci_svg]][ci]
|
6
5
|
[![Dependency Status][dep_svg]][dep]
|
7
6
|
[![Coverage Status][cov_svg]][cov]
|
8
7
|
|
9
|
-
The damerau-levenshtein gem allows to find edit distance between two
|
10
|
-
or ASCII encoded strings with O(N\*M) efficiency.
|
8
|
+
The damerau-levenshtein gem allows to find [edit distance][ed] between two
|
9
|
+
UTF-8 or ASCII encoded strings with O(N\*M) efficiency.
|
11
10
|
|
12
11
|
This gem implements pure Levenshtein algorithm, Damerau modification of it
|
13
12
|
(where 2 character transposition counts as 1 edit distance). It also includes
|
14
13
|
Boehmer & Rees 2008 modification of Damerau algorithm, where transposition
|
15
14
|
of bigger than 1 character blocks is taken into account as well
|
16
|
-
(
|
15
|
+
[(Rees 2014)][rees2014].
|
17
16
|
|
18
17
|
```ruby
|
19
18
|
require "damerau-levenshtein"
|
20
19
|
DamerauLevenshtein.distance("Something", "Smoething") #returns 1
|
21
20
|
```
|
22
21
|
|
23
|
-
|
24
|
-
|
22
|
+
It also returns a diff between two strings according to Levenshtein algorithm.
|
23
|
+
The diff is expressed by tags `<ins>`, `<del>`, and `<subst>`. Such tags make
|
24
|
+
it possible to highlight difference between strings in a flexible way.
|
25
25
|
|
26
|
-
|
27
|
-
|
26
|
+
```ruby
|
27
|
+
require "damerau-levenshtein"
|
28
|
+
differ = DamerauLevenshtein::Differ.new
|
29
|
+
differ.run("corn", "cron")
|
30
|
+
# output: ["c<subst>or</subst>n", "c<subst>ro</subst>n"]
|
31
|
+
```
|
32
|
+
|
33
|
+
## Dependencies ##
|
28
34
|
|
29
35
|
sudo apt-get install build-essential libgmp3-dev
|
30
36
|
|
31
|
-
Installation
|
32
|
-
------------
|
37
|
+
## Installation ##
|
33
38
|
|
34
39
|
gem install damerau-levenshtein
|
35
40
|
|
36
|
-
Examples
|
37
|
-
--------
|
41
|
+
## Examples ##
|
38
42
|
|
39
43
|
```ruby
|
40
44
|
require "damerau-levenshtein"
|
@@ -59,33 +63,63 @@ dl.distance("Something", "Smoething", 0) #returns 2
|
|
59
63
|
dl.distance("Something", "meSothing", 2) #returns 2 instead of 4
|
60
64
|
```
|
61
65
|
|
62
|
-
* comparison of words with
|
66
|
+
* comparison of words with UTF-8 characters should work fine:
|
63
67
|
|
64
68
|
```ruby
|
65
69
|
dl.distance("Sjöstedt", "Sjostedt") #returns 1
|
66
70
|
```
|
67
71
|
|
68
|
-
|
69
|
-
|
72
|
+
* compare two arrays
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
dl.array_distance([1,2,3,5], [1,2,3,4]) #returns 1
|
76
|
+
```
|
77
|
+
|
78
|
+
* return diff between two strings
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
differ = DamerauLevenshtein::Differ.new
|
82
|
+
differ.run("Something", "smthg")
|
83
|
+
```
|
84
|
+
|
85
|
+
* return diff between two strings in raw format
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
differ = DamerauLevenshtein::Differ.new
|
89
|
+
differ.format = :raw
|
90
|
+
differ.run("Something", "smthg")
|
91
|
+
```
|
92
|
+
|
93
|
+
## API Description ##
|
94
|
+
|
95
|
+
### Methods ###
|
70
96
|
|
71
|
-
|
97
|
+
#### DamerauLevenshtein.version
|
72
98
|
|
73
99
|
```ruby
|
74
100
|
DamerauLevenshtein.version
|
75
101
|
#returns version number of the gem
|
102
|
+
```
|
76
103
|
|
104
|
+
#### DamerauLevenshtein.distance
|
105
|
+
|
106
|
+
```ruby
|
77
107
|
DamerauLevenshtein.distance(string1, string2, block_size, max_distance)
|
78
|
-
#returns
|
79
|
-
```
|
108
|
+
#returns edit distance between 2 strings
|
80
109
|
|
110
|
+
DamerauLevenshtein.string_distance(string1, string2, block_size, max_distance)
|
111
|
+
# an alias for .distance
|
81
112
|
|
113
|
+
DamerauLevenshtein.array_distance(array1, array2, block_size, max_distance)
|
114
|
+
# returns edit distance between 2 arrays of integers
|
115
|
+
```
|
82
116
|
|
83
|
-
DamerauLevenshtein.distance
|
117
|
+
`DamerauLevenshtein.distance` and `.array_distance` take 4 arguments:
|
84
118
|
|
85
|
-
* string1
|
86
|
-
* string2
|
87
|
-
* block_size (default is 1)
|
88
|
-
* max_distance (default is 10)
|
119
|
+
* `string1` (`array1` for `.array_distance`)
|
120
|
+
* `string2` (`array2` for `.array_distance`)
|
121
|
+
* `block_size` (default is 1)
|
122
|
+
* `max_distance` (default is 10)
|
89
123
|
|
90
124
|
`block_size` determines maximum number of characters in a transposition block:
|
91
125
|
|
@@ -113,45 +147,93 @@ Levenshtein algorithm is expensive, so it makes sense to give up when edit
|
|
113
147
|
distance is becoming too big. The argument max_distance does just that.
|
114
148
|
|
115
149
|
```ruby
|
150
|
+
|
116
151
|
DamerauLevenshtein.distance("abcdefg", "1234567", 0, 3)
|
117
152
|
# output: 4 -- it gave up when edit distance exceeded 3
|
153
|
+
|
154
|
+
```
|
155
|
+
|
156
|
+
#### DamerauLevenshtein::Differ
|
157
|
+
|
158
|
+
`differ = DamerauLevenshtein::Differ.new` creates a new instance of the differ class, which returns the difference between two strings
|
159
|
+
|
160
|
+
`differ.format` shows current format for diff. Default is `:tag` format
|
161
|
+
|
162
|
+
`differ.format = :raw` changes current format for diffs. Possible values are `:tag` and `:raw`
|
163
|
+
|
164
|
+
`differ.run("String1", "String2")` returns difference between two strings.
|
165
|
+
|
166
|
+
For example:
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
differ = DamerauLevenshtein::Differ.new
|
170
|
+
differ.run("Something", "smthng")
|
171
|
+
# output: ["<ins>S</ins><subst>o</subst>m<ins>e</ins>th<ins>i</ins>ng",
|
172
|
+
# "<del>S</del><subst>s</subst>m<del>e</del>th<del>i</del>ng"]
|
173
|
+
|
174
|
+
```
|
175
|
+
|
176
|
+
Or with parsing:
|
177
|
+
|
178
|
+
```ruby
|
179
|
+
require "damerau-levenshtein"
|
180
|
+
require "nokogiri"
|
181
|
+
|
182
|
+
differ = DamerauLevenshtein::Differ.new
|
183
|
+
res = differ.run("Something", "Smothing!")
|
184
|
+
nodes = Nokogiri::XML("<root>#{res.first}</root>")
|
185
|
+
|
186
|
+
markup = nodes.root.children.map do |n|
|
187
|
+
case n.name
|
188
|
+
when "text"
|
189
|
+
n.text
|
190
|
+
when "del"
|
191
|
+
"~~#{n.children.first.text}~~"
|
192
|
+
when "ins"
|
193
|
+
"*#{n.children.first.text}*"
|
194
|
+
when "subst"
|
195
|
+
"**#{n.children.first.text}**"
|
196
|
+
end
|
197
|
+
end.join("")
|
198
|
+
|
199
|
+
puts markup
|
118
200
|
```
|
119
201
|
|
120
|
-
Contributing to damerau-levenshtein
|
121
|
-
-----------------------------------
|
202
|
+
## Contributing to damerau-levenshtein ##
|
122
203
|
|
123
204
|
* Check out the latest master to make sure the feature hasn't been
|
124
|
-
implemented or the bug hasn't been fixed yet
|
205
|
+
implemented or the bug hasn't been fixed yet
|
125
206
|
* Check out the issue tracker to make sure someone already hasn't requested
|
126
|
-
it and/or contributed it
|
207
|
+
it and/or contributed it
|
127
208
|
* Fork the project
|
128
209
|
* Start a feature/bugfix branch
|
129
210
|
* Commit and push until you are happy with your contribution
|
130
211
|
* Make sure to add tests for it. This is important so I don't break it
|
131
|
-
in a future version unintentionally.
|
212
|
+
in a future version unintentionally.
|
132
213
|
* Please try not to mess with the Rakefile, version, or history. If you want
|
133
|
-
to have your own version, or is otherwise necessary, that is fine, but please
|
134
|
-
isolate to its own commit so I can cherry-pick around it.
|
214
|
+
to have your own version, or is otherwise necessary, that is fine, but please
|
215
|
+
isolate to its own commit so I can cherry-pick around it.
|
135
216
|
|
136
|
-
Versioning
|
137
|
-
----------
|
217
|
+
## Versioning ##
|
138
218
|
|
139
219
|
This gem is following practices of [Semantic Versioning][semver]
|
140
220
|
|
141
|
-
Authors
|
142
|
-
-------
|
221
|
+
## Authors ##
|
143
222
|
|
144
223
|
[Dmitry Mozzherin][dimus]
|
145
224
|
|
146
|
-
Contributors
|
147
|
-
------------
|
225
|
+
## Contributors ##
|
148
226
|
|
149
|
-
[
|
227
|
+
[Alexey Zapparov][ixti],
|
228
|
+
[azhi][azhi],
|
229
|
+
[Fabian Winkler][wynksaiddestroy],
|
230
|
+
[Josephine Wright][jozr],
|
231
|
+
[lazylester][lazylester],
|
232
|
+
[Ran Xie][skarlit]
|
150
233
|
|
151
|
-
Copyright
|
152
|
-
---------
|
234
|
+
## Copyright ##
|
153
235
|
|
154
|
-
Copyright (c) 2011-
|
236
|
+
Copyright (c) 2011-2019 Dmitry Mozzherin. See LICENSE.txt for
|
155
237
|
further details.
|
156
238
|
|
157
239
|
[gem_svg]: https://badge.fury.io/rb/damerau-levenshtein.svg
|
@@ -168,3 +250,7 @@ further details.
|
|
168
250
|
[lazylester]: https://github.com/lazylester
|
169
251
|
[skarlit]: https://github.com/Skarlit
|
170
252
|
[ixti]: https://github.com/ixti
|
253
|
+
[azhi]: https://github.com/azhi
|
254
|
+
[jozr]: https://github.com/jozr
|
255
|
+
[rees2014]: https://dx.doi.org/10.1371/journal.pone.0107510
|
256
|
+
[wynksaiddestroy]: https://github.com/wynksaiddestroy
|
data/Rakefile
CHANGED
@@ -1,25 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "bundler/gem_tasks"
|
2
4
|
require "rspec/core/rake_task"
|
3
|
-
require
|
5
|
+
require "cucumber/rake/task"
|
4
6
|
require "rubocop/rake_task"
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
7
|
+
require "rake/dsl_definition"
|
8
|
+
require "rake"
|
9
|
+
require "rake/extensiontask"
|
10
|
+
require "rspec"
|
9
11
|
|
10
12
|
RSpec::Core::RakeTask.new(:spec) do |rspec|
|
11
|
-
rspec.pattern = FileList[
|
13
|
+
rspec.pattern = FileList["spec/**/*_spec.rb"]
|
12
14
|
end
|
13
15
|
|
14
16
|
Cucumber::Rake::Task.new(:features)
|
15
17
|
|
16
|
-
Rake::ExtensionTask.new(
|
17
|
-
|
18
|
-
|
18
|
+
Rake::ExtensionTask.new("damerau_levenshtein") do |extension|
|
19
|
+
extension.ext_dir = "ext/damerau_levenshtein"
|
20
|
+
extension.lib_dir = "lib/damerau-levenshtein"
|
19
21
|
end
|
20
22
|
|
21
23
|
Rake::Task[:spec].prerequisites << :compile
|
22
24
|
Rake::Task[:features].prerequisites << :compile
|
23
25
|
|
24
26
|
RuboCop::RakeTask.new
|
25
|
-
task :
|
27
|
+
task default: %i[rubocop spec]
|
28
|
+
|
29
|
+
desc "open an irb session preloaded with this gem"
|
30
|
+
task :console do
|
31
|
+
sh "irb -r pp -r ./lib/damerau-levenshtein.rb"
|
32
|
+
end
|
data/damerau-levenshtein.gemspec
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
$LOAD_PATH.push File.expand_path("lib", __dir__)
|
2
4
|
|
3
5
|
require "damerau-levenshtein/version"
|
4
6
|
|
5
7
|
Gem::Specification.new do |s|
|
8
|
+
s.required_ruby_version = ">= 2.5"
|
6
9
|
s.name = "damerau-levenshtein"
|
7
10
|
s.version = DamerauLevenshtein::VERSION
|
8
11
|
s.homepage = "https://github.com/GlobalNamesArchitecture/damerau-levenshtein"
|
@@ -15,21 +18,23 @@ Gem::Specification.new do |s|
|
|
15
18
|
"Damerau modification (where 2 character " \
|
16
19
|
"transposition counts as 1 edit distance). It also " \
|
17
20
|
"includes Boehmer & Rees 2008 modification, " \
|
18
|
-
"to handle transposition in
|
21
|
+
"to handle transposition in blocks with more than " \
|
19
22
|
"2 characters (Boehmer & Rees 2008)."
|
20
23
|
s.files = `git ls-files -z`.split("\x0").
|
21
24
|
reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
25
|
s.extensions = ["ext/damerau_levenshtein/extconf.rb"]
|
23
26
|
s.require_paths = ["lib", "lib/damerau-levenshtein"]
|
24
27
|
|
25
|
-
s.add_development_dependency "
|
26
|
-
s.add_development_dependency "
|
27
|
-
s.add_development_dependency "
|
28
|
-
s.add_development_dependency "ruby-prof", "~> 0.15"
|
29
|
-
s.add_development_dependency "shoulda", "~> 3.5"
|
30
|
-
s.add_development_dependency "rubocop", "~> 0.38"
|
28
|
+
s.add_development_dependency "activesupport", "~> 6.0"
|
29
|
+
s.add_development_dependency "bundler", "~> 2.1"
|
30
|
+
s.add_development_dependency "byebug", "~> 11.0"
|
31
31
|
s.add_development_dependency "coveralls", "~> 0.8"
|
32
|
-
s.add_development_dependency "
|
33
|
-
s.add_development_dependency "rake", "~>
|
34
|
-
s.add_development_dependency "rake-compiler", "~>
|
32
|
+
s.add_development_dependency "cucumber", "~> 4.1"
|
33
|
+
s.add_development_dependency "rake", "~> 13.0"
|
34
|
+
s.add_development_dependency "rake-compiler", "~> 1.1"
|
35
|
+
s.add_development_dependency "rspec", "~> 3.9"
|
36
|
+
s.add_development_dependency "rubocop", "~> 0.88"
|
37
|
+
s.add_development_dependency "ruby-prof", "~> 1.4"
|
38
|
+
s.add_development_dependency "shoulda", "~> 4.0"
|
39
|
+
s.add_development_dependency "solargraph", "~> 0.39"
|
35
40
|
end
|
@@ -4,14 +4,14 @@ VALUE DamerauLevenshteinBinding = Qnil;
|
|
4
4
|
|
5
5
|
void Init_damerau_levenshtein();
|
6
6
|
|
7
|
-
VALUE
|
7
|
+
VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance);
|
8
8
|
|
9
9
|
void Init_damerau_levenshtein() {
|
10
10
|
DamerauLevenshteinBinding = rb_define_module("DamerauLevenshteinBinding");
|
11
|
-
rb_define_method(DamerauLevenshteinBinding, "
|
11
|
+
rb_define_method(DamerauLevenshteinBinding, "internal_distance", method_internal_distance, 4);
|
12
12
|
}
|
13
13
|
|
14
|
-
VALUE
|
14
|
+
VALUE method_internal_distance(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance){
|
15
15
|
VALUE *sv = RARRAY_PTR(_s);
|
16
16
|
VALUE *tv = RARRAY_PTR(_t);
|
17
17
|
int i, i1, j, j1, k, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
@@ -24,8 +24,8 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
24
24
|
int max_distance = NUM2INT(_max_distance);
|
25
25
|
int sl = (int) RARRAY_LEN(_s);
|
26
26
|
int tl = (int) RARRAY_LEN(_t);
|
27
|
-
|
28
|
-
|
27
|
+
long long s[sl];
|
28
|
+
long long t[tl];
|
29
29
|
|
30
30
|
if (block_size == 0) {
|
31
31
|
pure_levenshtein = 1;
|
@@ -39,8 +39,8 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
39
39
|
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
40
40
|
|
41
41
|
|
42
|
-
for (i=0; i < sl; i++) s[i] =
|
43
|
-
for (i=0; i < tl; i++) t[i] =
|
42
|
+
for (i=0; i < sl; i++) s[i] = NUM2LL(sv[i]);
|
43
|
+
for (i=0; i < tl; i++) t[i] = NUM2LL(tv[i]);
|
44
44
|
|
45
45
|
sl++;
|
46
46
|
tl++;
|
data/lib/damerau-levenshtein.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
require_relative "damerau-levenshtein/version"
|
4
|
+
require_relative "damerau-levenshtein/damerau_levenshtein"
|
5
|
+
require_relative "damerau-levenshtein/formatter"
|
6
|
+
require_relative "damerau-levenshtein/differ"
|
5
7
|
|
6
8
|
# Damerau-Levenshtein algorithm
|
7
9
|
module DamerauLevenshtein
|
@@ -12,6 +14,23 @@ module DamerauLevenshtein
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def self.distance(str1, str2, block_size = 1, max_distance = 10)
|
15
|
-
|
17
|
+
internal_distance(
|
18
|
+
str1.unpack("U*"), str2.unpack("U*"),
|
19
|
+
block_size, max_distance
|
20
|
+
)
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.string_distance(*args)
|
24
|
+
distance(*args)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.array_distance(array1, array2, block_size = 1, max_distance = 10)
|
28
|
+
internal_distance(array1, array2, block_size, max_distance)
|
29
|
+
end
|
30
|
+
|
31
|
+
# keep backward compatibility - internal_distance was called distance_utf
|
32
|
+
# before
|
33
|
+
def self.distance_utf(*args)
|
34
|
+
internal_distance(*args)
|
16
35
|
end
|
17
36
|
end
|
Binary file
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DamerauLevenshtein
|
4
|
+
# Shows the difference between two strings in character by character
|
5
|
+
# resolution
|
6
|
+
class Differ
|
7
|
+
FORMATS = %i[raw tag].freeze
|
8
|
+
attr_reader :format
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
@format = :tag
|
12
|
+
@matrix = []
|
13
|
+
end
|
14
|
+
|
15
|
+
def format=(new_format)
|
16
|
+
new_format = new_format.to_sym
|
17
|
+
@format = new_format if FORMATS.include?(new_format)
|
18
|
+
end
|
19
|
+
|
20
|
+
def run(str1, str2)
|
21
|
+
@len1 = str1.size.freeze
|
22
|
+
@len2 = str2.size.freeze
|
23
|
+
prepare_matrix
|
24
|
+
edit_distance(str1, str2)
|
25
|
+
raw = trace_back
|
26
|
+
formatter_factory.show(raw, str1, str2)
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def formatter_factory
|
32
|
+
formatter =
|
33
|
+
case @format
|
34
|
+
when :tag
|
35
|
+
DamerauLevenshtein::FormatterTag
|
36
|
+
when :raw
|
37
|
+
DamerauLevenshtein::FormatterRaw
|
38
|
+
end
|
39
|
+
Formatter.new(formatter)
|
40
|
+
end
|
41
|
+
|
42
|
+
def edit_distance(str1, str2)
|
43
|
+
(1..@len2).each do |i|
|
44
|
+
(1..@len1).each do |j|
|
45
|
+
no_change(i, j) && next if str2[i - 1] == str1[j - 1]
|
46
|
+
@matrix[i][j] = [del(i, j), ins(i, j), subst(i, j)].min + 1
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def trace_back
|
52
|
+
res = []
|
53
|
+
cell = [@len2, @len1]
|
54
|
+
while cell != [0, 0]
|
55
|
+
cell, char = char_data(cell)
|
56
|
+
res.unshift char
|
57
|
+
end
|
58
|
+
res
|
59
|
+
end
|
60
|
+
|
61
|
+
def char_data(cell)
|
62
|
+
char = { distance: @matrix[cell[0]][cell[1]] }
|
63
|
+
val = find_previous(cell)
|
64
|
+
previous_value = val[0][0]
|
65
|
+
char[:type] = previous_value == char[:distance] ? :same : val[1]
|
66
|
+
cell = val.pop
|
67
|
+
[cell, char]
|
68
|
+
end
|
69
|
+
|
70
|
+
def find_previous(cell)
|
71
|
+
candidates = [[[ins(*cell), 1], :ins, [cell[0], cell[1] - 1]],
|
72
|
+
[[del(*cell), 2], :del, [cell[0] - 1, cell[1]]],
|
73
|
+
[[subst(*cell), 0], :subst, [cell[0] - 1, cell[1] - 1]]]
|
74
|
+
select_cell(candidates)
|
75
|
+
end
|
76
|
+
|
77
|
+
def select_cell(candidates)
|
78
|
+
candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
|
79
|
+
sort_by(&:first).first
|
80
|
+
end
|
81
|
+
|
82
|
+
def del(i, j)
|
83
|
+
@matrix[i - 1][j]
|
84
|
+
end
|
85
|
+
|
86
|
+
def ins(i, j)
|
87
|
+
@matrix[i][j - 1]
|
88
|
+
end
|
89
|
+
|
90
|
+
def subst(i, j)
|
91
|
+
@matrix[i - 1][j - 1]
|
92
|
+
end
|
93
|
+
|
94
|
+
def no_change(i, j)
|
95
|
+
@matrix[i][j] = @matrix[i - 1][j - 1]
|
96
|
+
end
|
97
|
+
|
98
|
+
def prepare_matrix
|
99
|
+
@matrix = []
|
100
|
+
@matrix << (0..@len1).to_a
|
101
|
+
@len2.times do |i|
|
102
|
+
ary = [i + 1] + (1..@len1).map { nil }
|
103
|
+
@matrix << ary
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DamerauLevenshtein
|
4
|
+
# Formats supplied strings according to their differences
|
5
|
+
class Formatter
|
6
|
+
def initialize(formatter)
|
7
|
+
@formatter = formatter
|
8
|
+
end
|
9
|
+
|
10
|
+
def show(raw_format, str1, str2)
|
11
|
+
@formatter.show(raw_format, str1, str2)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
# Outputs raw format for two strings
|
16
|
+
module FormatterRaw
|
17
|
+
def self.show(raw_format, _, _)
|
18
|
+
raw_format
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Outputs strings marked with tags
|
23
|
+
module FormatterTag
|
24
|
+
class << self
|
25
|
+
def show(raw_format, str1, str2)
|
26
|
+
inverted_raw_format = raw_format.map do |e|
|
27
|
+
type = invert_type(e[:type])
|
28
|
+
{ distance: e[:distance], type: type }
|
29
|
+
end
|
30
|
+
[show_string(raw_format, str1, str2),
|
31
|
+
show_string(inverted_raw_format, str2, str1)]
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def invert_type(type)
|
37
|
+
case type
|
38
|
+
when :del
|
39
|
+
:ins
|
40
|
+
when :ins
|
41
|
+
:del
|
42
|
+
else
|
43
|
+
type
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def show_string(raw, str1, str2)
|
48
|
+
data = { res: [], type: nil, deletes: 0, inserts: 0,
|
49
|
+
str1: str1, str2: str2 }
|
50
|
+
raw.each_with_index do |e, i|
|
51
|
+
process_entry(e, i, data)
|
52
|
+
end
|
53
|
+
data[:res] << format("</%<type>s>", data) if data[:type] != :same
|
54
|
+
data[:res].join("")
|
55
|
+
end
|
56
|
+
|
57
|
+
def process_entry(e, i, data)
|
58
|
+
if data[:type] && e[:type] != data[:type]
|
59
|
+
insert_tags(e, data)
|
60
|
+
elsif data[:type].nil?
|
61
|
+
data[:res] << format("<%<type>s>", e) if e[:type] != :same
|
62
|
+
end
|
63
|
+
insert_letter(e, i, data)
|
64
|
+
end
|
65
|
+
|
66
|
+
def insert_tags(entry, data)
|
67
|
+
data[:res] << format("</%<type>s>", data) if data[:type] != :same
|
68
|
+
data[:res] << format("<%<type>s>", entry) if entry[:type] != :same
|
69
|
+
end
|
70
|
+
|
71
|
+
def insert_letter(entry, index, data)
|
72
|
+
if entry[:type] == :del
|
73
|
+
insert_del(index, data)
|
74
|
+
else
|
75
|
+
insert_others(index, data)
|
76
|
+
end
|
77
|
+
data[:inserts] += 1 if entry[:type] == :ins
|
78
|
+
data[:type] = entry[:type]
|
79
|
+
end
|
80
|
+
|
81
|
+
def insert_del(i, data)
|
82
|
+
data[:res] << data[:str2][i - data[:inserts]]
|
83
|
+
data[:deletes] += 1
|
84
|
+
end
|
85
|
+
|
86
|
+
def insert_others(i, data)
|
87
|
+
data[:res] << data[:str1][i - data[:deletes]]
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
metadata
CHANGED
@@ -1,158 +1,186 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: damerau-levenshtein
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: activesupport
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '6.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '6.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: byebug
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '11.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '11.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.8'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.8'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: cucumber
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|
44
72
|
requirements:
|
45
73
|
- - "~>"
|
46
74
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
75
|
+
version: '4.1'
|
48
76
|
type: :development
|
49
77
|
prerelease: false
|
50
78
|
version_requirements: !ruby/object:Gem::Requirement
|
51
79
|
requirements:
|
52
80
|
- - "~>"
|
53
81
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
82
|
+
version: '4.1'
|
55
83
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
84
|
+
name: rake
|
57
85
|
requirement: !ruby/object:Gem::Requirement
|
58
86
|
requirements:
|
59
87
|
- - "~>"
|
60
88
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0
|
89
|
+
version: '13.0'
|
62
90
|
type: :development
|
63
91
|
prerelease: false
|
64
92
|
version_requirements: !ruby/object:Gem::Requirement
|
65
93
|
requirements:
|
66
94
|
- - "~>"
|
67
95
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0
|
96
|
+
version: '13.0'
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
98
|
+
name: rake-compiler
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
72
100
|
requirements:
|
73
101
|
- - "~>"
|
74
102
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
103
|
+
version: '1.1'
|
76
104
|
type: :development
|
77
105
|
prerelease: false
|
78
106
|
version_requirements: !ruby/object:Gem::Requirement
|
79
107
|
requirements:
|
80
108
|
- - "~>"
|
81
109
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
110
|
+
version: '1.1'
|
83
111
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
112
|
+
name: rspec
|
85
113
|
requirement: !ruby/object:Gem::Requirement
|
86
114
|
requirements:
|
87
115
|
- - "~>"
|
88
116
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
117
|
+
version: '3.9'
|
90
118
|
type: :development
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
93
121
|
requirements:
|
94
122
|
- - "~>"
|
95
123
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
124
|
+
version: '3.9'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
126
|
+
name: rubocop
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
100
128
|
requirements:
|
101
129
|
- - "~>"
|
102
130
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0.
|
131
|
+
version: '0.88'
|
104
132
|
type: :development
|
105
133
|
prerelease: false
|
106
134
|
version_requirements: !ruby/object:Gem::Requirement
|
107
135
|
requirements:
|
108
136
|
- - "~>"
|
109
137
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0.
|
138
|
+
version: '0.88'
|
111
139
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
140
|
+
name: ruby-prof
|
113
141
|
requirement: !ruby/object:Gem::Requirement
|
114
142
|
requirements:
|
115
143
|
- - "~>"
|
116
144
|
- !ruby/object:Gem::Version
|
117
|
-
version: '1.
|
145
|
+
version: '1.4'
|
118
146
|
type: :development
|
119
147
|
prerelease: false
|
120
148
|
version_requirements: !ruby/object:Gem::Requirement
|
121
149
|
requirements:
|
122
150
|
- - "~>"
|
123
151
|
- !ruby/object:Gem::Version
|
124
|
-
version: '1.
|
152
|
+
version: '1.4'
|
125
153
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
154
|
+
name: shoulda
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
128
156
|
requirements:
|
129
157
|
- - "~>"
|
130
158
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
159
|
+
version: '4.0'
|
132
160
|
type: :development
|
133
161
|
prerelease: false
|
134
162
|
version_requirements: !ruby/object:Gem::Requirement
|
135
163
|
requirements:
|
136
164
|
- - "~>"
|
137
165
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
166
|
+
version: '4.0'
|
139
167
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
168
|
+
name: solargraph
|
141
169
|
requirement: !ruby/object:Gem::Requirement
|
142
170
|
requirements:
|
143
171
|
- - "~>"
|
144
172
|
- !ruby/object:Gem::Version
|
145
|
-
version: '0.
|
173
|
+
version: '0.39'
|
146
174
|
type: :development
|
147
175
|
prerelease: false
|
148
176
|
version_requirements: !ruby/object:Gem::Requirement
|
149
177
|
requirements:
|
150
178
|
- - "~>"
|
151
179
|
- !ruby/object:Gem::Version
|
152
|
-
version: '0.
|
180
|
+
version: '0.39'
|
153
181
|
description: This gem implements pure Levenshtein algorithm, Damerau modification
|
154
182
|
(where 2 character transposition counts as 1 edit distance). It also includes Boehmer
|
155
|
-
& Rees 2008 modification, to handle transposition in
|
183
|
+
& Rees 2008 modification, to handle transposition in blocks with more than 2 characters
|
156
184
|
(Boehmer & Rees 2008).
|
157
185
|
email: dmozzherin@gmail.com
|
158
186
|
executables: []
|
@@ -176,12 +204,14 @@ files:
|
|
176
204
|
- ext/damerau_levenshtein/extconf.rb
|
177
205
|
- lib/damerau-levenshtein.rb
|
178
206
|
- lib/damerau-levenshtein/damerau_levenshtein.so
|
207
|
+
- lib/damerau-levenshtein/differ.rb
|
208
|
+
- lib/damerau-levenshtein/formatter.rb
|
179
209
|
- lib/damerau-levenshtein/version.rb
|
180
210
|
homepage: https://github.com/GlobalNamesArchitecture/damerau-levenshtein
|
181
211
|
licenses:
|
182
212
|
- MIT
|
183
213
|
metadata: {}
|
184
|
-
post_install_message:
|
214
|
+
post_install_message:
|
185
215
|
rdoc_options: []
|
186
216
|
require_paths:
|
187
217
|
- lib
|
@@ -190,16 +220,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
190
220
|
requirements:
|
191
221
|
- - ">="
|
192
222
|
- !ruby/object:Gem::Version
|
193
|
-
version: '
|
223
|
+
version: '2.5'
|
194
224
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
195
225
|
requirements:
|
196
226
|
- - ">="
|
197
227
|
- !ruby/object:Gem::Version
|
198
228
|
version: '0'
|
199
229
|
requirements: []
|
200
|
-
rubyforge_project:
|
201
|
-
rubygems_version: 2.
|
202
|
-
signing_key:
|
230
|
+
rubyforge_project:
|
231
|
+
rubygems_version: 2.7.6.2
|
232
|
+
signing_key:
|
203
233
|
specification_version: 4
|
204
234
|
summary: Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein
|
205
235
|
algorithms
|