string_metric 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +134 -0
- data/Rakefile +1 -0
- data/benchmarks/levenshtein.rb +42 -0
- data/lib/string_metric.rb +5 -0
- data/lib/string_metric/levenshtein.rb +65 -0
- data/lib/string_metric/levenshtein/experiment.rb +28 -0
- data/lib/string_metric/levenshtein/iterative_with_full_matrix.rb +47 -0
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb +57 -0
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb +58 -0
- data/lib/string_metric/levenshtein/recursive.rb +38 -0
- data/lib/string_metric/version.rb +3 -0
- data/spec/fixtures/levenshtein.csv +11 -0
- data/spec/lib/levenshtein/experiment_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_full_matric_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb +6 -0
- data/spec/lib/levenshtein/recursive_spec.rb +6 -0
- data/spec/lib/levenshtein_spec.rb +13 -0
- data/spec/spec_helper.rb +29 -0
- data/spec/support/levenshtein.rb +53 -0
- data/string_metric.gemspec +33 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 32870ce170c62b4036f6b8eaf3f92e43280a0246
|
4
|
+
data.tar.gz: a7f38c88be3211f6ef1d82c71d701a0655081570
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8668f405e8568c4254ef7856660a87bd4488d8e4ddbcea95f185575562787ad072696868a831431b821d95f868a8a1f67e1f2b057272375dbda3f0fbe73a6ccb
|
7
|
+
data.tar.gz: 9e9b25912b3f695161262fb331cc8f043fe8b269b29f1880ee9d8bce8dbf367c339de18b9c3756d15d9ee0e5ba677e1ff5b2837909869c0b1984797b75638f5b
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Skroutz S.A.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# StringMetric
|
2
|
+
|
3
|
+
[](https://travis-ci.org/skroutz/string_metric)
|
4
|
+
[](https://codeclimate.com/github/skroutz/string_metric)
|
5
|
+
[](https://coveralls.io/r/skroutz/string_metric?branch=master)
|
6
|
+
|
7
|
+
A simple library with String Metric algorithms. If you want to read more about
|
8
|
+
String Metric algorithms please read [here](https://en.wikipedia.org/wiki/String_metric).
|
9
|
+
|
10
|
+
This library wants to support __MRI__ (1.9.3, 2.0.0, 2.1.0), __JRuby__ and
|
11
|
+
__Rubinius__.
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'string_metric'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
Or install it yourself as:
|
24
|
+
|
25
|
+
$ gem install string_metric
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Levenshtein Distance
|
30
|
+
|
31
|
+
The public api for Levenshtein Distance is the method
|
32
|
+
`StringMetric::Levenshtein.distance`.
|
33
|
+
|
34
|
+
__Options__
|
35
|
+
|
36
|
+
* `:max_distance`: It sets an upper limit for the calculated distance. Can be
|
37
|
+
`Fixnum` or `Float`.
|
38
|
+
|
39
|
+
* `:insertion_cost`: It overrides the default (equals to 1) insertion penalty.
|
40
|
+
Can be `Fixnum` or `Float`.
|
41
|
+
|
42
|
+
* `:deletion_cost`: It overrides the default (equals to 1) deletion penalty.
|
43
|
+
Can be `Fixnum` or `Float`.
|
44
|
+
|
45
|
+
* `:substitution_cost`: It overrides the default (equals to 1) substitution
|
46
|
+
penalty. Can be `Fixnum` or `Float`.
|
47
|
+
|
48
|
+
* `:strategy`: The desired strategy for Levenshtein distance. Supported
|
49
|
+
strategies are `:recursive`, `:two_matrix_rows`, `:full_matrix` and
|
50
|
+
`:experiment`. The default strategy is `:two_matrix_rows`. One should not
|
51
|
+
depend on `:experiment` strategy.
|
52
|
+
|
53
|
+
__Examples__
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
|
57
|
+
require 'string_metric'
|
58
|
+
|
59
|
+
StringMetric::Levenshtein.distance("kitten", "sitting")
|
60
|
+
# Generates: 3
|
61
|
+
|
62
|
+
# Trim distance to :max_distance
|
63
|
+
StringMetric::Levenshtein.distance("kitten", "sitting",
|
64
|
+
max_distance: 2)
|
65
|
+
# Generates: 2
|
66
|
+
|
67
|
+
# Pass different costs for insert, delete or substitute actions
|
68
|
+
StringMetric::Levenshtein.distance("kitten", "sitting",
|
69
|
+
insertion_cost: 2,
|
70
|
+
deletion_cost: 2,
|
71
|
+
substitution_cost: 2)
|
72
|
+
# Generates: 6
|
73
|
+
|
74
|
+
```
|
75
|
+
|
76
|
+
## References
|
77
|
+
|
78
|
+
* [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
|
79
|
+
* [String Metric](https://en.wikipedia.org/wiki/String_metric)
|
80
|
+
|
81
|
+
## Benchmarks
|
82
|
+
|
83
|
+
You can run benchmarks with
|
84
|
+
|
85
|
+
```
|
86
|
+
$ bundle exec ruby benchmarks/*
|
87
|
+
```
|
88
|
+
|
89
|
+
or you can choose to benchmark a specific algorithm like:
|
90
|
+
|
91
|
+
```
|
92
|
+
$ bundle exec ruby benchmarks/levenshtein.rb
|
93
|
+
```
|
94
|
+
|
95
|
+
## Current Benchmarks status
|
96
|
+
|
97
|
+
__Levenshtein__
|
98
|
+
|
99
|
+
|
100
|
+
Implementation | User | Real
|
101
|
+
--------------------------------------------|-----------|-----------
|
102
|
+
Levenshtein::IterativeWithFullMatrix | 0.480000 | 0.475662
|
103
|
+
Levenshtein::IterativeWithTwoMatrixRows | 0.350000 | 0.352388
|
104
|
+
Levenshtein::Experiment | 0.420000 | 0.420000
|
105
|
+
Text::Levenshtein (from gem text) | 0.400000 | 0.400346
|
106
|
+
|
107
|
+
_Currently the set of fixtures is very small_
|
108
|
+
|
109
|
+
## Other implementations
|
110
|
+
|
111
|
+
__Levenshtein__
|
112
|
+
|
113
|
+
* this beautiful gem, [text](https://github.com/threedaymonk/text)
|
114
|
+
* ffi implementations, like [this](https://github.com/dbalatero/levenshtein-ffi) or check [The Ruby Toolbox](https://www.ruby-toolbox.com/projects/levenshtein-ffi)
|
115
|
+
|
116
|
+
__Various__
|
117
|
+
* Approximate String matching [library](https://github.com/flori/amatch)
|
118
|
+
|
119
|
+
## Tools
|
120
|
+
|
121
|
+
* Try to use [SemVer](http://semver.org/)
|
122
|
+
|
123
|
+
|
124
|
+
## Contributing
|
125
|
+
|
126
|
+
1. Fork it ( http://github.com/<my-github-username>/string_metric/fork )
|
127
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
128
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
129
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
130
|
+
5. Create new Pull Request
|
131
|
+
|
132
|
+
## Licence
|
133
|
+
|
134
|
+
string_metric is licensed under MIT. See [License](LICENSE.txt)
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
require "string_metric"
|
3
|
+
require "text"
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
if RUBY_ENGINE == "ruby"
|
7
|
+
require "pry"
|
8
|
+
end
|
9
|
+
|
10
|
+
# Times every non-recursive StringMetric strategy, plus Text::Levenshtein,
# over all fixture pairs. The recursive strategy is left out of the run.
Benchmark.bmbm(7) do |bench|
  iterations = 10_000
  options = { insertion_cost: 2 }

  # Each CSV row is "from, to, expected"; only the two strings are needed here.
  fixtures = CSV.foreach("spec/fixtures/levenshtein.csv").map do |from, to, _|
    [from.to_s.strip, to.to_s.strip]
  end

  benchmarked = StringMetric::Levenshtein::STRATEGIES.reject do |strategy, _|
    strategy == :recursive
  end

  benchmarked.each do |_, implementation|
    bench.report("#{implementation} implementation") do
      iterations.times do
        fixtures.each { |from, to| implementation.distance(from, to, options) }
      end
    end
  end

  bench.report("Text::Levenshtein implementation") do
    iterations.times do
      fixtures.each { |from, to| Text::Levenshtein.distance(from, to) }
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require_relative "levenshtein/experiment"
|
4
|
+
require_relative "levenshtein/iterative_with_two_matrix_rows"
|
5
|
+
require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
|
6
|
+
require_relative "levenshtein/iterative_with_full_matrix"
|
7
|
+
require_relative "levenshtein/recursive"
|
8
|
+
|
9
|
+
module StringMetric
  # Levenshtein Distance implementation
  #
  # @see https://en.wikipedia.org/wiki/Levenshtein_distance
  module Levenshtein

    # Maps every supported +:strategy+ symbol to the class implementing it.
    # Frozen so that callers cannot alter the lookup table at runtime.
    STRATEGIES = {
      experiment:         Experiment,
      full_matrix:        IterativeWithFullMatrix,
      recursive:          Recursive,
      two_matrix_rows:    IterativeWithTwoMatrixRows,
      two_matrix_rows_v2: IterativeWithTwoMatrixRowsOptimized
    }.freeze

    # Levenshtein Distance of two strings
    #
    # @param from [String] the first string
    # @param to [String] the second string
    # @param options [Hash] options; the whole hash is forwarded untouched to
    #   the selected strategy
    # @option options [Fixnum, Float] :max_distance If this option is passed then
    #   levenshtein distance is trimmed to this value (if greater)
    # @option options [Fixnum, Float] :insertion_cost If this option is passed then
    #   new insertion cost is taken into account (by default is 1)
    # @option options [Fixnum, Float] :deletion_cost If this option is passed then
    #   new deletion cost is taken into account (by default is 1)
    # @option options [Fixnum, Float] :substitution_cost If this option is passed then
    #   new substitution cost is taken into account (by default is 1)
    # @option options [Symbol] :strategy The desired strategy for Levenshtein
    #   distance. Supported strategies are the keys of STRATEGIES: :recursive,
    #   :two_matrix_rows, :two_matrix_rows_v2, :full_matrix and :experiment.
    #   When the option is missing or is not a key of STRATEGIES, the strategy
    #   returned by {default_strategy} is used.
    #   One should not depend on :experiment strategy.
    # @return [Fixnum, Float] the Levenshtein Distance
    def distance(from, to, options = {})
      strategy = pick_strategy(options[:strategy]) || Levenshtein.default_strategy

      strategy.distance(from, to, options)
    end
    module_function :distance

    # The strategy used when the caller does not select a known one.
    #
    # @return [Class] IterativeWithTwoMatrixRowsOptimized when RUBY_ENGINE is
    #   "ruby", IterativeWithTwoMatrixRows on every other engine
    def default_strategy
      if RUBY_ENGINE == "ruby"
        pick_strategy(:two_matrix_rows_v2)
      else
        pick_strategy(:two_matrix_rows)
      end
    end
    module_function :default_strategy

    # Looks a strategy class up by its symbol.
    #
    # @param symbol [Symbol, nil] a key of STRATEGIES
    # @return [Class, nil] the implementing class, or nil for an unknown key
    def pick_strategy(symbol)
      STRATEGIES[symbol]
    end
    module_function :pick_strategy
    private_class_method :pick_strategy
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Experimental strategy: drops characters that are equal at the same index
    # in both strings and hands what is left to Levenshtein.distance without
    # the :strategy option, so that the default strategy does the actual work.
    class Experiment
      # @param from [String] the first string (never modified)
      # @param to [String] the second string (never modified)
      # @param options [Hash] same options as Levenshtein.distance (never
      #   modified); :strategy is not forwarded
      # @return [Fixnum, Float] the distance reported for the reduced strings
      def self.distance(from, to, options = {})
        return 0 if from == to
        return to.size if from.size.zero?
        return from.size if to.size.zero?

        # Work on private copies: slice! below is destructive and would
        # otherwise alter (or fail on frozen) caller-owned strings.
        from_rest = from.dup
        to_rest   = to.dup

        # The index keeps advancing after a removal, exactly like the original
        # pass; once it runs past the shortened strings both lookups give nil
        # and slice! is a no-op.
        [from_rest.length, to_rest.length].min.times do |i|
          if from_rest[i] == to_rest[i]
            from_rest.slice!(i)
            to_rest.slice!(i)
          end
        end

        # Build a new hash instead of deleting from the caller's one.
        forwarded = options.reject { |key, _| key == :strategy }

        # Call default distance implementation
        Levenshtein.distance(from_rest, to_rest, forwarded)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Levenshtein distance computed with the complete dynamic-programming
    # matrix. d[i][j] holds the cost of turning the first j characters of
    # +from+ into the first i characters of +to+.
    class IterativeWithFullMatrix
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, never above :max_distance when
      #   that option is given
      def self.distance(from, to, options = {})
        return 0 if from == to

        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        m = from.size
        n = to.size

        # One side empty: only insertions or only deletions are needed, and
        # the limit applies to that result too.
        if m.zero? || n.zero?
          total = m.zero? ? n * insertion_cost : m * deletion_cost
          return (max_distance && total > max_distance) ? max_distance : total
        end

        d = Array.new(n + 1) { Array.new(m + 1, 0) }

        # Boundaries: empty -> prefix of +to+ is all insertions,
        # prefix of +from+ -> empty is all deletions.
        (1..m).each { |j| d[0][j] = j * deletion_cost }
        (1..n).each { |i| d[i][0] = i * insertion_cost }

        (1..m).each do |j|
          column_min = d[0][j]

          (1..n).each do |i|
            d[i][j] =
              if from[j - 1] == to[i - 1]
                d[i - 1][j - 1]
              else
                [d[i - 1][j] + insertion_cost,        # insertion
                 d[i][j - 1] + deletion_cost,         # deletion
                 d[i - 1][j - 1] + substitution_cost  # substitution
                ].min
              end

            column_min = d[i][j] if d[i][j] < column_min
          end

          # Every edit path crosses this column, so with non-negative costs
          # the final distance cannot be below the column minimum.
          return max_distance if max_distance && column_min > max_distance
        end

        x = d[n][m]
        (max_distance && x > max_distance) ? max_distance : x
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Levenshtein distance that keeps only two rows of the dynamic-programming
    # matrix: v0 is the finished row for the previous character of +to+, v1 is
    # the row being filled for the current one. Column j of a row refers to
    # the first j characters of +from+.
    class IterativeWithTwoMatrixRows
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, never above :max_distance when
      #   that option is given
      def self.distance(from, to, options = {})
        return 0 if from == to

        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        m = from.length
        n = to.length

        # One side empty: only insertions or only deletions are needed, and
        # the limit applies to that result too.
        if m.zero? || n.zero?
          total = m.zero? ? n * insertion_cost : m * deletion_cost
          return (max_distance && total > max_distance) ? max_distance : total
        end

        # Row for the empty prefix of +to+: delete j characters of +from+.
        v0 = (0..m).map { |j| j * deletion_cost }
        v1 = []
        x = 0

        n.times do |i|
          # First column: build i + 1 characters of +to+ from nothing.
          x = v1[0] = (i + 1) * insertion_cost

          sub_cell = v0[0]

          m.times do |j|
            cost = (from[j] == to[i]) ? 0 : substitution_cost

            ins_cell = v0[j + 1]

            x = [x + deletion_cost,         # deletion
                 ins_cell + insertion_cost, # insertion
                 sub_cell + cost            # substitution
                ].min

            v1[j + 1] = x

            sub_cell = ins_cell
          end

          # Every edit path crosses this row, so with non-negative costs the
          # final distance cannot be below the row minimum.
          return max_distance if max_distance && v1.min > max_distance

          v0 = v1.dup
        end

        (max_distance && x > max_distance) ? max_distance : x
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Two-row Levenshtein distance written to avoid per-cell allocations:
    # the minimum of the three candidates is found with plain comparisons and
    # the two row buffers are swapped instead of copied. Column j of a row
    # refers to the first j characters of +from+.
    class IterativeWithTwoMatrixRowsOptimized
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, never above :max_distance when
      #   that option is given
      def self.distance(from, to, options = {})
        return 0 if from == to

        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        m = from.length
        n = to.length

        # One side empty: only insertions or only deletions are needed, and
        # the limit applies to that result too.
        if m.zero? || n.zero?
          total = m.zero? ? n * insertion_cost : m * deletion_cost
          return (max_distance && total > max_distance) ? max_distance : total
        end

        # Row for the empty prefix of +to+: delete j characters of +from+.
        v0 = (0..m).map { |j| j * deletion_cost }
        v1 = []
        x = 0

        n.times do |i|
          # First column: build i + 1 characters of +to+ from nothing.
          x = v1[0] = (i + 1) * insertion_cost
          row_min = x

          sub_cell = v0[0]
          to_char  = to[i]

          m.times do |j|
            ins_cell = v0[j + 1]

            # deletion
            x += deletion_cost

            # insertion
            candidate = ins_cell + insertion_cost
            x = candidate if candidate < x

            # substitution (free when the characters match)
            candidate = (from[j] == to_char) ? sub_cell : sub_cell + substitution_cost
            x = candidate if candidate < x

            v1[j + 1] = x
            row_min = x if x < row_min

            sub_cell = ins_cell
          end

          # Every edit path crosses this row, so with non-negative costs the
          # final distance cannot be below the row minimum.
          return max_distance if max_distance && row_min > max_distance

          # All m + 1 cells of v1 are rewritten on the next pass, so the old
          # row can be reused as the next scratch buffer.
          v0, v1 = v1, v0
        end

        (max_distance && x > max_distance) ? max_distance : x
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Textbook recursive Levenshtein distance. Each call on two non-empty,
    # different strings makes three further calls on shortened strings and
    # nothing is memoised, so it is only practical for short inputs.
    class Recursive
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, never above :max_distance when
      #   that option is given
      def self.distance(from, to, options = {})
        return 0 if from == to

        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        result =
          if from.size.zero?
            # Nothing left to consume: insert every remaining character.
            to.size * insertion_cost
          elsif to.size.zero?
            # Nothing left to produce: delete every remaining character.
            from.size * deletion_cost
          else
            cost = (from[-1] == to[-1]) ? 0 : substitution_cost

            [distance(from.chop, to, options) + deletion_cost,
             distance(from, to.chop, options) + insertion_cost,
             distance(from.chop, to.chop, options) + cost
            ].min
          end

        # The limit applies to every outcome, base cases included.
        (max_distance && result > max_distance) ? max_distance : result
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
# Spec for the public facade: it must satisfy the shared distance examples
# and always expose some default strategy.
describe(StringMetric::Levenshtein) do
  it_behaves_like("Levenshtein Distance")

  describe("#default_strategy") do
    it("has a default strategy") { expect(described_class.default_strategy).to be }
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
|
8
|
+
require "string_metric"
|
9
|
+
|
10
|
+
if RUBY_ENGINE == "ruby"
|
11
|
+
require "pry"
|
12
|
+
end
|
13
|
+
|
14
|
+
require 'coveralls'
|
15
|
+
Coveralls.wear!
|
16
|
+
|
17
|
+
Dir["./spec/support/**/*.rb"].sort.each { |f| require f }
|
18
|
+
|
19
|
+
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Lets a bare symbol such as :focus act as metadata with a true value.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true

  # Run only examples tagged :focus, or the whole suite when none is tagged.
  rspec.filter_run(:focus)
  rspec.run_all_when_everything_filtered = true

  # Random ordering exposes examples that depend on each other. The seed is
  # printed after each run; pass it back to reproduce a given order:
  #     --seed 1234
  rspec.order = 'random'
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
# Shared behaviour every Levenshtein implementation has to satisfy.
#
# +options+ (optional Hash) is passed to every call of
# described_class.distance, so a group that includes these examples with, for
# instance, a :strategy has all of its examples run with that option.
shared_examples "Levenshtein Distance" do |options|
  options ||= {}

  describe ".distance" do
    context "when the two strings are equal" do
      it "is 0" do
        expect(described_class.distance("kitten", "kitten", options)).to eq 0
      end
    end

    context "when the first string is empty" do
      it "is the size of the second string" do
        expect(described_class.distance("","kitten", options)).to eq("kitten".size)
      end
    end

    context "when the second string is empty" do
      it "is the size of the first string" do
        expect(described_class.distance("kitten","", options)).to eq("kitten".size)
      end
    end

    context "when max_distance is passed as option" do
      context "and normal distance is greater than max_distance" do
        let(:max_distance) { 2 }

        it "is trimmed to max_distance" do
          # merge keeps the group's own options instead of dropping them
          limited = options.merge(max_distance: max_distance)

          expect(described_class.distance("kitten", "sitting", limited)).to eq max_distance
        end
      end
    end

    # One example per fixture row: "from, to, expected distance".
    CSV.foreach("spec/fixtures/levenshtein.csv") do |row|
      from, to, distance = row
      from = from.to_s.strip
      to   = to.to_s.strip

      it "calculates the distance from '#{from}' to '#{to}' correctly" do
        expect(described_class.distance(from, to, options)).to eq distance.to_i
      end
    end

    context "when insertion_cost is passed" do
      it "takes this cost into account" do
        cheap  = described_class.distance("kitten", "sitting", options.merge(insertion_cost: 1))
        costly = described_class.distance("kitten", "sitting", options.merge(insertion_cost: 2))

        expect(cheap).not_to eq(costly)
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'string_metric/version'
|
5
|
+
|
6
|
+
# Gem definition for string_metric.
Gem::Specification.new do |spec|
  spec.name          = "string_metric"
  spec.version       = StringMetric::VERSION
  spec.authors       = ["Giorgos Tsiftsis"]
  spec.email         = ["giorgos.tsiftsis@skroutz.gr"]
  spec.summary       = %q{A simple library with String Metric algorithms}
  spec.description   = %q{A simple library with String Metric algorithms}
  spec.homepage      = "https://github.com/chief/string_metric"
  spec.license       = "MIT"

  # The packaged files are whatever git tracks when the gem is built.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake", "~> 10.1.1"
  spec.add_development_dependency "rspec", "~> 2.14.1"
  spec.add_development_dependency "text", "~> 1.2.3"

  # A debugger is declared on MRI only. Versions are compared as versions,
  # not as strings, so multi-digit segments are ordered correctly.
  if RUBY_ENGINE == "ruby"
    if Gem::Version.new(RUBY_VERSION) > Gem::Version.new("1.9.3")
      spec.add_development_dependency "pry-byebug", "~> 1.2.1"
    else
      spec.add_development_dependency "pry", "~> 0.9.12.4"
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: string_metric
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Giorgos Tsiftsis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-01-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 10.1.1
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 10.1.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.14.1
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.14.1
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: text
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.2.3
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.2.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry-byebug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.1
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.1
|
83
|
+
description: A simple library with String Metric algorithms
|
84
|
+
email:
|
85
|
+
- giorgos.tsiftsis@skroutz.gr
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
93
|
+
- Gemfile
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- benchmarks/levenshtein.rb
|
98
|
+
- lib/string_metric.rb
|
99
|
+
- lib/string_metric/levenshtein.rb
|
100
|
+
- lib/string_metric/levenshtein/experiment.rb
|
101
|
+
- lib/string_metric/levenshtein/iterative_with_full_matrix.rb
|
102
|
+
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
|
103
|
+
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
|
104
|
+
- lib/string_metric/levenshtein/recursive.rb
|
105
|
+
- lib/string_metric/version.rb
|
106
|
+
- spec/fixtures/levenshtein.csv
|
107
|
+
- spec/lib/levenshtein/experiment_spec.rb
|
108
|
+
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
109
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
|
110
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
|
111
|
+
- spec/lib/levenshtein/recursive_spec.rb
|
112
|
+
- spec/lib/levenshtein_spec.rb
|
113
|
+
- spec/spec_helper.rb
|
114
|
+
- spec/support/levenshtein.rb
|
115
|
+
- string_metric.gemspec
|
116
|
+
homepage: https://github.com/chief/string_metric
|
117
|
+
licenses:
|
118
|
+
- MIT
|
119
|
+
metadata: {}
|
120
|
+
post_install_message:
|
121
|
+
rdoc_options: []
|
122
|
+
require_paths:
|
123
|
+
- lib
|
124
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
requirements: []
|
135
|
+
rubyforge_project:
|
136
|
+
rubygems_version: 2.2.0
|
137
|
+
signing_key:
|
138
|
+
specification_version: 4
|
139
|
+
summary: A simple library with String Metric algorithms
|
140
|
+
test_files:
|
141
|
+
- spec/fixtures/levenshtein.csv
|
142
|
+
- spec/lib/levenshtein/experiment_spec.rb
|
143
|
+
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
144
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
|
145
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
|
146
|
+
- spec/lib/levenshtein/recursive_spec.rb
|
147
|
+
- spec/lib/levenshtein_spec.rb
|
148
|
+
- spec/spec_helper.rb
|
149
|
+
- spec/support/levenshtein.rb
|