string_metric 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +134 -0
- data/Rakefile +1 -0
- data/benchmarks/levenshtein.rb +42 -0
- data/lib/string_metric.rb +5 -0
- data/lib/string_metric/levenshtein.rb +65 -0
- data/lib/string_metric/levenshtein/experiment.rb +28 -0
- data/lib/string_metric/levenshtein/iterative_with_full_matrix.rb +47 -0
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb +57 -0
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb +58 -0
- data/lib/string_metric/levenshtein/recursive.rb +38 -0
- data/lib/string_metric/version.rb +3 -0
- data/spec/fixtures/levenshtein.csv +11 -0
- data/spec/lib/levenshtein/experiment_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_full_matric_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb +6 -0
- data/spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb +6 -0
- data/spec/lib/levenshtein/recursive_spec.rb +6 -0
- data/spec/lib/levenshtein_spec.rb +13 -0
- data/spec/spec_helper.rb +29 -0
- data/spec/support/levenshtein.rb +53 -0
- data/string_metric.gemspec +33 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 32870ce170c62b4036f6b8eaf3f92e43280a0246
|
4
|
+
data.tar.gz: a7f38c88be3211f6ef1d82c71d701a0655081570
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8668f405e8568c4254ef7856660a87bd4488d8e4ddbcea95f185575562787ad072696868a831431b821d95f868a8a1f67e1f2b057272375dbda3f0fbe73a6ccb
|
7
|
+
data.tar.gz: 9e9b25912b3f695161262fb331cc8f043fe8b269b29f1880ee9d8bce8dbf367c339de18b9c3756d15d9ee0e5ba677e1ff5b2837909869c0b1984797b75638f5b
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Skroutz S.A.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# StringMetric
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/skroutz/string_metric.png?branch=master)](https://travis-ci.org/skroutz/string_metric)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/skroutz/string_metric.png)](https://codeclimate.com/github/skroutz/string_metric)
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/skroutz/string_metric/badge.png?branch=master)](https://coveralls.io/r/skroutz/string_metric?branch=master)
|
6
|
+
|
7
|
+
A simple library with String Metric algorithms. If you want to read more about
|
8
|
+
String Metric algorithms please read [here](https://en.wikipedia.org/wiki/String_metric).
|
9
|
+
|
10
|
+
This library wants to support __MRI__ (1.9.3, 2.0.0, 2.1.0), __JRuby__ and
|
11
|
+
__Rubinius__.
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'string_metric'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
Or install it yourself as:
|
24
|
+
|
25
|
+
$ gem install string_metric
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Levenshtein Distance
|
30
|
+
|
31
|
+
The public api for Levenshtein Distance is the method
|
32
|
+
`StringMetric::Levenshtein.distance`.
|
33
|
+
|
34
|
+
__Options__
|
35
|
+
|
36
|
+
* `:max_distance`: It sets an upper limit for the calculated distance. Can be
|
37
|
+
`Fixnum` or `Float`.
|
38
|
+
|
39
|
+
* `:insertion_cost`: It overrides the default (equals to 1) insertion penalty.
|
40
|
+
Can be `Fixnum` or `Float`.
|
41
|
+
|
42
|
+
* `:deletion_cost`: It overrides the default (equals to 1) deletion penalty.
|
43
|
+
Can be `Fixnum` or `Float`.
|
44
|
+
|
45
|
+
* `:substitution_cost`: It overrides the default (equals to 1) substitution
|
46
|
+
penalty. Can be `Fixnum` or `Float`.
|
47
|
+
|
48
|
+
* `:strategy`: The desired strategy for Levenshtein distance. Supported
|
49
|
+
strategies are `:recursive`, `:two_matrix_rows`, `:full_matrix` and
|
50
|
+
`:experiment`. The default strategy is `:two_matrix_rows` (`:two_matrix_rows_v2` on MRI). One should not
|
51
|
+
depend on `:experiment` strategy.
|
52
|
+
|
53
|
+
__Examples__
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
|
57
|
+
require 'string_metric'
|
58
|
+
|
59
|
+
StringMetric::Levenshtein.distance("kitten", "sitting")
|
60
|
+
# Generates: 3
|
61
|
+
|
62
|
+
# Trim distance to :max_distance
|
63
|
+
StringMetric::Levenshtein.distance("kitten", "sitting",
|
64
|
+
max_distance: 2)
|
65
|
+
# Generates: 2
|
66
|
+
|
67
|
+
# Pass different costs for insert, delete or substitute actions
|
68
|
+
StringMetric::Levenshtein.distance("kitten", "sitting",
|
69
|
+
insertion_cost: 2,
|
70
|
+
deletion_cost: 2,
|
71
|
+
substitution_cost: 2)
|
72
|
+
# Generates: 6
|
73
|
+
|
74
|
+
```
|
75
|
+
|
76
|
+
## References
|
77
|
+
|
78
|
+
* [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
|
79
|
+
* [String Metric](https://en.wikipedia.org/wiki/String_metric)
|
80
|
+
|
81
|
+
## Benchmarks
|
82
|
+
|
83
|
+
You can run benchmarks with
|
84
|
+
|
85
|
+
```
|
86
|
+
$ bundle exec ruby benchmarks/*
|
87
|
+
```
|
88
|
+
|
89
|
+
or you can choose to benchmark a specific algorithm like:
|
90
|
+
|
91
|
+
```
|
92
|
+
$ bundle exec ruby benchmarks/levenshtein.rb
|
93
|
+
```
|
94
|
+
|
95
|
+
## Current Benchmarks status
|
96
|
+
|
97
|
+
__Levenshtein__
|
98
|
+
|
99
|
+
|
100
|
+
Implementation | User | Real
|
101
|
+
--------------------------------------------|-----------|-----------
|
102
|
+
Levenshtein::IterativeWithFullMatrix | 0.480000 | 0.475662
|
103
|
+
Levenshtein::IterativeWithTwoMatrixRows | 0.350000 | 0.352388
|
104
|
+
Levenshtein::Experiment | 0.420000 | 0.420000
|
105
|
+
Text::Levenshtein (from gem text) | 0.400000 | 0.400346
|
106
|
+
|
107
|
+
_Currently the set of fixtures is very small_
|
108
|
+
|
109
|
+
## Other implementations
|
110
|
+
|
111
|
+
__Levenshtein__
|
112
|
+
|
113
|
+
* this beautiful gem, [text](https://github.com/threedaymonk/text)
|
114
|
+
* ffi implementations, like [this](https://github.com/dbalatero/levenshtein-ffi) or check [The Ruby Toolbox](https://www.ruby-toolbox.com/projects/levenshtein-ffi)
|
115
|
+
|
116
|
+
__Various__
|
117
|
+
* Approximate String matching [library](https://github.com/flori/amatch)
|
118
|
+
|
119
|
+
## Tools
|
120
|
+
|
121
|
+
* Try to use [SemVer](http://semver.org/)
|
122
|
+
|
123
|
+
|
124
|
+
## Contributing
|
125
|
+
|
126
|
+
1. Fork it ( http://github.com/<my-github-username>/string_metric/fork )
|
127
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
128
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
129
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
130
|
+
5. Create new Pull Request
|
131
|
+
|
132
|
+
## Licence
|
133
|
+
|
134
|
+
string_metric is licensed under MIT. See [License](LICENSE.txt)
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
require "string_metric"
|
3
|
+
require "text"
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
if RUBY_ENGINE == "ruby"
|
7
|
+
require "pry"
|
8
|
+
end
|
9
|
+
|
10
|
+
# Benchmarks every registered Levenshtein strategy (except the recursive one)
# against Text::Levenshtein, using the spec fixtures as input pairs.
Benchmark.bmbm(7) do |x|
  iterations = 10_000
  options    = { insertion_cost: 2 }

  # Resolve the fixtures relative to this file so the benchmark does not
  # depend on the directory it is started from.
  fixtures_path = File.expand_path("../../spec/fixtures/levenshtein.csv", __FILE__)

  # Only the first two CSV columns (from, to) are needed here.
  fixtures = CSV.read(fixtures_path).map do |from, to, _|
    [from.to_s.strip, to.to_s.strip]
  end

  StringMetric::Levenshtein::STRATEGIES.each do |strategy, implementation|
    # The recursive strategy is excluded from the benchmark.
    next if strategy == :recursive

    x.report("#{implementation} implementation") do
      iterations.times do
        fixtures.each do |from, to|
          implementation.distance(from, to, options)
        end
      end
    end
  end

  x.report("Text::Levenshtein implementation") do
    iterations.times do
      fixtures.each do |from, to|
        Text::Levenshtein.distance(from, to)
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require_relative "levenshtein/experiment"
|
4
|
+
require_relative "levenshtein/iterative_with_two_matrix_rows"
|
5
|
+
require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
|
6
|
+
require_relative "levenshtein/iterative_with_full_matrix"
|
7
|
+
require_relative "levenshtein/recursive"
|
8
|
+
|
9
|
+
module StringMetric
  # Levenshtein Distance implementation
  #
  # @see https://en.wikipedia.org/wiki/Levenshtein_distance
  module Levenshtein

    # Maps the value accepted by the :strategy option to the class that
    # implements it. Frozen so the registry cannot be altered by accident.
    STRATEGIES = {
      experiment:         Experiment,
      full_matrix:        IterativeWithFullMatrix,
      recursive:          Recursive,
      two_matrix_rows:    IterativeWithTwoMatrixRows,
      two_matrix_rows_v2: IterativeWithTwoMatrixRowsOptimized
    }.freeze

    # Levenshtein Distance of two strings
    #
    # @param from [String] the first string
    # @param to [String] the second string
    # @param options [Hash] options
    # @option options [Fixnum, Float] :max_distance If this option is passed then
    #   Levenshtein distance is trimmed to this value (if greater)
    # @option options [Fixnum, Float] :insertion_cost If this option is passed then
    #   new insertion cost is taken into account (by default is 1)
    # @option options [Fixnum, Float] :deletion_cost If this option is passed then
    #   new deletion cost is taken into account (by default is 1)
    # @option options [Fixnum, Float] :substitution_cost If this option is passed then
    #   new substitution cost is taken into account (by default is 1)
    # @option options [Symbol] :strategy The desired strategy for Levenshtein
    #   distance. Supported strategies are the keys of STRATEGIES. A missing
    #   or unknown strategy falls back to {default_strategy}.
    #   One should not depend on :experiment strategy.
    # @return [Fixnum, Float] the Levenshtein Distance
    def distance(from, to, options = {})
      strategy = pick_strategy(options[:strategy]) || Levenshtein.default_strategy

      strategy.distance(from, to, options)
    end
    module_function :distance

    # The strategy used when none (or an unknown one) is requested:
    # :two_matrix_rows_v2 on MRI (RUBY_ENGINE == "ruby"), :two_matrix_rows
    # on every other engine.
    #
    # @return [Class] the implementation class of the default strategy
    def default_strategy
      if RUBY_ENGINE == "ruby"
        pick_strategy(:two_matrix_rows_v2)
      else
        pick_strategy(:two_matrix_rows)
      end
    end
    module_function :default_strategy

    # Looks a strategy up by its symbol.
    #
    # @param symbol [Symbol, nil] a key of STRATEGIES
    # @return [Class, nil] the implementation class, or nil when unknown
    def pick_strategy(symbol)
      STRATEGIES[symbol]
    end
    module_function :pick_strategy
    private_class_method :pick_strategy
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Experimental strategy: trims the part both strings share at their start
    # and at their end, then delegates the remainder to the default strategy.
    #
    # Unlike deleting equal characters found at the same index in the middle
    # of the strings, removing a shared prefix and a shared suffix leaves the
    # Levenshtein distance unchanged (for non-negative costs).
    class Experiment
      # @param from [String] the first string (never modified)
      # @param to [String] the second string (never modified)
      # @param options [Hash] same options as Levenshtein.distance; the
      #   :strategy key is not forwarded so the default strategy is used.
      #   The hash itself is never modified.
      # @return [Fixnum, Float] whatever Levenshtein.distance returns for the
      #   trimmed strings
      def self.distance(from, to, options = {})
        shortest = [from.length, to.length].min

        # Length of the common prefix.
        prefix = 0
        prefix += 1 while prefix < shortest && from[prefix] == to[prefix]

        # Length of the common suffix, never overlapping the prefix.
        suffix = 0
        while suffix < shortest - prefix && from[-1 - suffix] == to[-1 - suffix]
          suffix += 1
        end

        # Work on slices so the caller's strings (possibly frozen) are untouched.
        trimmed_from = from[prefix, from.length - prefix - suffix]
        trimmed_to   = to[prefix, to.length - prefix - suffix]

        # Drop :strategy on a copy; forwarding it would select this class again.
        forwarded = options.reject { |key, _| key == :strategy }

        # Call default distance implementation
        Levenshtein.distance(trimmed_from, trimmed_to, forwarded)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Levenshtein distance computed iteratively while keeping the whole
    # (to.size + 1) x (from.size + 1) matrix in memory.
    class IterativeWithFullMatrix
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, trimmed to :max_distance if given
      def self.distance(from, to, options = {})
        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        return 0 if from == to
        # An empty side means pure insertions / pure deletions.
        return limit_to(to.size * insertion_cost, max_distance) if from.size.zero?
        return limit_to(from.size * deletion_cost, max_distance) if to.size.zero?

        # d[i][j] is the cost of turning from[0...j] into to[0...i].
        d = Array.new(to.size + 1) { Array.new(from.size + 1, 0) }

        (1..from.size).each { |j| d[0][j] = j * deletion_cost }
        (1..to.size).each { |i| d[i][0] = i * insertion_cost }

        (1..from.size).each do |j|
          column_min = d[0][j]

          (1..to.size).each do |i|
            d[i][j] =
              if from[j - 1] == to[i - 1]
                d[i - 1][j - 1]
              else
                [d[i - 1][j] + insertion_cost,        # insertion
                 d[i][j - 1] + deletion_cost,         # deletion
                 d[i - 1][j - 1] + substitution_cost  # substitution
                ].min
              end

            column_min = d[i][j] if d[i][j] < column_min
          end

          # With non-negative costs the final cell can never be smaller than
          # the smallest cell of a finished column, so once a whole column
          # exceeds the limit the answer is the limit itself.
          return max_distance if max_distance && column_min > max_distance
        end

        limit_to(d[to.size][from.size], max_distance)
      end

      # Trims value to max_distance when a limit is given and exceeded.
      def self.limit_to(value, max_distance)
        max_distance && value > max_distance ? max_distance : value
      end
      private_class_method :limit_to
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Levenshtein distance computed iteratively while keeping only two rows
    # of the dynamic-programming matrix in memory.
    class IterativeWithTwoMatrixRows
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, trimmed to :max_distance if given
      def self.distance(from, to, options = {})
        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        return 0 if from == to
        # An empty side means pure insertions / pure deletions.
        return limit_to(to.size * insertion_cost, max_distance) if from.size.zero?
        return limit_to(from.size * deletion_cost, max_distance) if to.size.zero?

        m = from.length
        n = to.length

        # previous_row[j]: cost of turning from[0...j] into the part of `to`
        # handled so far (initially the empty string, i.e. j deletions).
        previous_row = Array.new(m + 1) { |j| j * deletion_cost }
        current_row  = Array.new(m + 1)

        n.times do |i|
          # Turning the empty string into to[0..i] takes i + 1 insertions.
          x = current_row[0] = (i + 1) * insertion_cost

          sub_cell = previous_row[0]

          m.times do |j|
            cost = (from[j] == to[i]) ? 0 : substitution_cost

            ins_cell = previous_row[j + 1]

            x = [x + deletion_cost,         # deletion
                 ins_cell + insertion_cost, # insertion
                 sub_cell + cost            # substitution
                ].min

            current_row[j + 1] = x

            sub_cell = ins_cell
          end

          # With non-negative costs the final cell can never be smaller than
          # the smallest cell of a finished row, so once a whole row exceeds
          # the limit the answer is the limit itself.
          return max_distance if max_distance && current_row.min > max_distance

          # Every cell of the row is overwritten on the next pass, so the two
          # arrays can simply trade places instead of being copied.
          previous_row, current_row = current_row, previous_row
        end

        limit_to(previous_row[m], max_distance)
      end

      # Trims value to max_distance when a limit is given and exceeded.
      def self.limit_to(value, max_distance)
        max_distance && value > max_distance ? max_distance : value
      end
      private_class_method :limit_to
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Variant of the two-row iterative Levenshtein distance that picks the
    # cheapest operation through an in-place sort of the three candidates.
    class IterativeWithTwoMatrixRowsOptimized
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, trimmed to :max_distance if given
      def self.distance(from, to, options = {})
        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        return 0 if from == to
        # An empty side means pure insertions / pure deletions.
        return limit_to(to.size * insertion_cost, max_distance) if from.size.zero?
        return limit_to(from.size * deletion_cost, max_distance) if to.size.zero?

        m = from.length
        n = to.length

        # previous_row[j]: cost of turning from[0...j] into the part of `to`
        # handled so far (initially the empty string, i.e. j deletions).
        previous_row = Array.new(m + 1) { |j| j * deletion_cost }
        current_row  = Array.new(m + 1)

        n.times do |i|
          # Turning the empty string into to[0..i] takes i + 1 insertions.
          x = current_row[0] = (i + 1) * insertion_cost

          sub_cell = previous_row[0]

          m.times do |j|
            cost = (from[j] == to[i]) ? 0 : substitution_cost

            ins_cell = previous_row[j + 1]

            # Smallest of the three candidates, taken from a sorted array.
            x = [x + deletion_cost,         # deletion
                 ins_cell + insertion_cost, # insertion
                 sub_cell + cost            # substitution
                ].sort!.first

            current_row[j + 1] = x

            sub_cell = ins_cell
          end

          # With non-negative costs the final cell can never be smaller than
          # the smallest cell of a finished row, so once a whole row exceeds
          # the limit the answer is the limit itself.
          return max_distance if max_distance && current_row.min > max_distance

          # Every cell of the row is overwritten on the next pass, so the two
          # arrays can simply trade places instead of being copied.
          previous_row, current_row = current_row, previous_row
        end

        limit_to(previous_row[m], max_distance)
      end

      # Trims value to max_distance when a limit is given and exceeded.
      def self.limit_to(value, max_distance)
        max_distance && value > max_distance ? max_distance : value
      end
      private_class_method :limit_to
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Textbook recursive Levenshtein distance. Every call on two non-empty,
    # different strings recurses three times, so it is only practical for
    # short inputs.
    class Recursive
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] options
      # @option options [Fixnum, Float] :max_distance upper limit for the result
      # @option options [Fixnum, Float] :insertion_cost (1)
      # @option options [Fixnum, Float] :deletion_cost (1)
      # @option options [Fixnum, Float] :substitution_cost (1)
      # @return [Fixnum, Float] the distance, trimmed to :max_distance if given
      def self.distance(from, to, options = {})
        max_distance      = options[:max_distance]
        insertion_cost    = options.fetch(:insertion_cost, 1)
        deletion_cost     = options.fetch(:deletion_cost, 1)
        substitution_cost = options.fetch(:substitution_cost, 1)

        result =
          if from == to
            0
          elsif from.size.zero?
            to.size * insertion_cost     # only insertions are left
          elsif to.size.zero?
            from.size * deletion_cost    # only deletions are left
          else
            cost = (from[-1] == to[-1]) ? 0 : substitution_cost

            # Drop exactly one character. String#chop is avoided on purpose:
            # it removes a trailing "\r\n" as a single unit.
            from_head = from[0...-1]
            to_head   = to[0...-1]

            [distance(from_head, to, options) + deletion_cost,
             distance(from, to_head, options) + insertion_cost,
             distance(from_head, to_head, options) + cost
            ].min
          end

        max_distance && result > max_distance ? max_distance : result
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
# Runs the shared Levenshtein examples against the public entry point and
# checks that a default strategy is configured.
describe StringMetric::Levenshtein do
  it_behaves_like "Levenshtein Distance"

  # default_strategy is a module-level method, hence the "." prefix.
  describe '.default_strategy' do
    it "has a default strategy" do
      # The default must be one of the registered implementations.
      expect(described_class::STRATEGIES.values).to include(described_class.default_strategy)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
|
8
|
+
# Coverage tracking has to be switched on before the library is required;
# files loaded earlier are not instrumented.
require 'coveralls'
Coveralls.wear!

require "string_metric"

# pry is only required on MRI.
if RUBY_ENGINE == "ruby"
  require "pry"
end

# Shared examples and helpers.
Dir["./spec/support/**/*.rb"].sort.each { |f| require f }

RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = 'random'
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
# Behaviour every Levenshtein implementation has to satisfy.
#
# `options` (optional Hash) is handed to every `.distance` call, so a spec can
# pin e.g. a strategy for the whole group.
shared_examples "Levenshtein Distance" do |options|
  options ||= {}

  # Resolved relative to this file so the specs do not depend on the
  # directory rspec is started from.
  fixtures_path = File.expand_path("../../fixtures/levenshtein.csv", __FILE__)

  describe ".distance" do
    context "when the two strings are equal" do
      it "is 0" do
        expect(described_class.distance("kitten", "kitten", options)).to eq 0
      end
    end

    context "when the first string is empty" do
      it "is the size of the second string" do
        expect(described_class.distance("", "kitten", options)).to eq("kitten".size)
      end
    end

    context "when the second string is empty" do
      it "is the size of the first string" do
        expect(described_class.distance("kitten", "", options)).to eq("kitten".size)
      end
    end

    context "when max_distance is passed as option" do
      context "and normal distance is greater than max_distance" do
        let(:max_distance) { 2 }

        it "is trimmed to max_distance" do
          trimmed = described_class.distance("kitten", "sitting",
                                             options.merge(max_distance: max_distance))

          expect(trimmed).to eq max_distance
        end
      end
    end

    # One example per fixture row: from, to, expected distance.
    CSV.foreach(fixtures_path) do |row|
      from, to, distance = row
      from = from.to_s.strip
      to   = to.to_s.strip

      it "calculates the distance from '#{from}' to '#{to}' correctly" do
        expect(described_class.distance(from, to, options)).to eq distance.to_i
      end
    end

    context "when insertion_cost is passed" do
      it "takes this cost into account" do
        cheap  = described_class.distance("kitten", "sitting", options.merge(insertion_cost: 1))
        costly = described_class.distance("kitten", "sitting", options.merge(insertion_cost: 2))

        expect(cheap).not_to eq(costly)
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'string_metric/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "string_metric"
  spec.version       = StringMetric::VERSION
  spec.authors       = ["Giorgos Tsiftsis"]
  spec.email         = ["giorgos.tsiftsis@skroutz.gr"]
  spec.summary       = %q{A simple library with String Metric algorithms}
  spec.description   = %q{A simple library with String Metric algorithms}
  spec.homepage      = "https://github.com/chief/string_metric"
  spec.license       = "MIT"

  # The packaged files are whatever git tracks.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake", "~> 10.1.1"
  spec.add_development_dependency "rspec", "~> 2.14.1"
  spec.add_development_dependency "text", "~> 1.2.3"

  # Debugging helpers are only added on MRI.
  if RUBY_ENGINE == "ruby"
    # Compare versions segment by segment rather than as plain strings.
    if Gem::Version.new(RUBY_VERSION) > Gem::Version.new("1.9.3")
      spec.add_development_dependency "pry-byebug", "~> 1.2.1"
    else
      spec.add_development_dependency "pry", "~> 0.9.12.4"
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: string_metric
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Giorgos Tsiftsis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-01-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 10.1.1
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 10.1.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.14.1
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.14.1
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: text
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.2.3
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.2.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry-byebug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.1
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.1
|
83
|
+
description: A simple library with String Metric algorithms
|
84
|
+
email:
|
85
|
+
- giorgos.tsiftsis@skroutz.gr
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
93
|
+
- Gemfile
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- benchmarks/levenshtein.rb
|
98
|
+
- lib/string_metric.rb
|
99
|
+
- lib/string_metric/levenshtein.rb
|
100
|
+
- lib/string_metric/levenshtein/experiment.rb
|
101
|
+
- lib/string_metric/levenshtein/iterative_with_full_matrix.rb
|
102
|
+
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
|
103
|
+
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
|
104
|
+
- lib/string_metric/levenshtein/recursive.rb
|
105
|
+
- lib/string_metric/version.rb
|
106
|
+
- spec/fixtures/levenshtein.csv
|
107
|
+
- spec/lib/levenshtein/experiment_spec.rb
|
108
|
+
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
109
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
|
110
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
|
111
|
+
- spec/lib/levenshtein/recursive_spec.rb
|
112
|
+
- spec/lib/levenshtein_spec.rb
|
113
|
+
- spec/spec_helper.rb
|
114
|
+
- spec/support/levenshtein.rb
|
115
|
+
- string_metric.gemspec
|
116
|
+
homepage: https://github.com/chief/string_metric
|
117
|
+
licenses:
|
118
|
+
- MIT
|
119
|
+
metadata: {}
|
120
|
+
post_install_message:
|
121
|
+
rdoc_options: []
|
122
|
+
require_paths:
|
123
|
+
- lib
|
124
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
requirements: []
|
135
|
+
rubyforge_project:
|
136
|
+
rubygems_version: 2.2.0
|
137
|
+
signing_key:
|
138
|
+
specification_version: 4
|
139
|
+
summary: A simple library with String Metric algorithms
|
140
|
+
test_files:
|
141
|
+
- spec/fixtures/levenshtein.csv
|
142
|
+
- spec/lib/levenshtein/experiment_spec.rb
|
143
|
+
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
144
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
|
145
|
+
- spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
|
146
|
+
- spec/lib/levenshtein/recursive_spec.rb
|
147
|
+
- spec/lib/levenshtein_spec.rb
|
148
|
+
- spec/spec_helper.rb
|
149
|
+
- spec/support/levenshtein.rb
|