edits 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a437de45c8830ea622a9b57b84b3e92c54a58178
4
+ data.tar.gz: 36eff87bb3ba0e832f2cd908cb0bafe358e05a19
5
+ SHA512:
6
+ metadata.gz: bf9474115aacaff6cb0411c85cf8f81409d6cfaff793ef656821fc1fc33ae0d5315aa6af98c8dbe17a68aa38514b8adef134824e2ea6fd8c1434aea372e37b6c
7
+ data.tar.gz: 71199e4c9e67f4b78b90e21368dfd1162cd660fa38b01e40ea412e1a37aa29529d57ee78f1e5584f5a6319a22b2134165b638335b0e75b163650aaadaa9d67ae
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,22 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4
3
+ Metrics/AbcSize:
4
+ Enabled: false
5
+ Metrics/CyclomaticComplexity:
6
+ Enabled: false
7
+ Metrics/MethodLength:
8
+ Enabled: false
9
+ Metrics/PerceivedComplexity:
10
+ Enabled: false
11
+ Metrics/BlockLength:
12
+ Exclude:
13
+ - "tasks/**/*.rake"
14
+
15
+ Style/StringLiterals:
16
+ EnforcedStyle: double_quotes
17
+ Layout/AlignParameters:
18
+ EnforcedStyle: with_fixed_indentation
19
+ Layout/MultilineMethodCallIndentation:
20
+ EnforcedStyle: indented
21
+ Layout/MultilineOperationIndentation:
22
+ EnforcedStyle: indented
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.4.0
5
+ before_install: gem install bundler -v 1.15.4
@@ -0,0 +1 @@
1
+ -m markdown - LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6
+
7
+ # Specify your gem's dependencies in edits.gemspec
8
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Tom Crouch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,98 @@
1
+ # Edits
2
+
3
+ A collection of edit distance algorithms in Ruby.
4
+
5
+ Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro and Jaro-Winkler similarity.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'edits'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install edits
22
+
23
+ ## Usage
24
+
25
+ ### Levenshtein
26
+
27
+ Edit distance, accounting for deletion, addition and substitution.
28
+
29
+ ```ruby
30
+ Edits::Levenshtein.distance "raked", "bakers"
31
+ # => 3
32
+ Edits::Levenshtein.distance "iota", "atom"
33
+ # => 4
34
+ Edits::Levenshtein.distance "acer", "earn"
35
+ # => 4
36
+
37
+ # Max distance
38
+ Edits::Levenshtein.distance_with_max "iota", "atom", 2
39
+ # => 2
40
+ Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer]
41
+ # => "tram"
42
+ ```
43
+
44
+ ### Restricted Edit (Optimal Alignment)
45
+
46
+ Edit distance, accounting for deletion, addition, substitution and swapped
47
+ characters.
48
+
49
+ ```ruby
50
+ Edits::RestrictedEdit.distance "raked", "bakers"
51
+ # => 3
52
+ Edits::RestrictedEdit.distance "iota", "atom"
53
+ # => 3
54
+ Edits::RestrictedEdit.distance "acer", "earn"
55
+ # => 4
56
+ ```
57
+
58
+ ### Damerau-Levenshtein
59
+
60
+ Edit distance, accounting for deletion, addition, substitution and
61
+ transposition.
62
+
63
+ ```ruby
64
+ Edits::DamerauLevenshtein.distance "raked", "bakers"
65
+ # => 3
66
+ Edits::DamerauLevenshtein.distance "iota", "atom"
67
+ # => 3
68
+ Edits::DamerauLevenshtein.distance "acer", "earn"
69
+ # => 3
70
+ ```
71
+
72
+ ### Jaro & Jaro-Winkler
73
+
74
+ ```ruby
75
+ Edits::Jaro.similarity "information", "informant"
76
+ # => 0.9023569023569024
77
+ Edits::Jaro.distance "information", "informant"
78
+ # => 0.09764309764309764
79
+
80
+ Edits::JaroWinkler.similarity "information", "informant"
81
+ # => 0.9414141414141414
82
+ Edits::JaroWinkler.distance "information", "informant"
83
+ # => 0.05858585858585863
84
+ ```
85
+
86
+ ## Development
87
+
88
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
89
+
90
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
91
+
92
+ ## Contributing
93
+
94
+ Bug reports and pull requests are welcome on GitHub at https://github.com/tcrouch/edits.
95
+
96
+ ## License
97
+
98
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
# Pull in every project task definition found under tasks/.
Dir.glob("tasks/**/*.rake").each { |rake_file| load rake_file }

# Define the `spec` task and make it what a bare `rake` invocation runs.
RSpec::Core::RakeTask.new(:spec)
task default: :spec
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "edits"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "edits/version"

Gem::Specification.new do |spec|
  spec.name          = "edits"
  spec.version       = Edits::VERSION
  spec.authors       = ["Tom Crouch"]
  spec.email         = ["tom.crouch@gmail.com"]

  spec.summary       = "A collection of edit distance algorithms."
  # spec.description = "TODO: Write a longer description or delete this line."
  spec.homepage      = "https://github.com/tcrouch/edits"
  spec.license       = "MIT"

  # Package every git-tracked file except those under test directories.
  tracked = `git ls-files -z`.split("\x0")
  spec.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only dependencies, registered in the listed order.
  {
    "bundler" => "~> 1.15",
    "rake" => "~> 10.0",
    "rspec" => "~> 3.6",
    "benchmark-ips" => "~> 2.7",
    "redcarpet" => "~> 3.4",
    "yard" => "~> 0.9"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "edits/version"
4
+
5
+ require "edits/damerau_levenshtein"
6
+ require "edits/hamming"
7
+ require "edits/jaro"
8
+ require "edits/jaro_winkler"
9
+ require "edits/levenshtein"
10
+ require "edits/restricted_edit"
11
+
12
+ # A collection of edit distance algorithms
13
+ module Edits
14
+ # The algorithms themselves are defined in the files required above.
15
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Implementation of the Damerau-Levenshtein distance algorithm.
  #
  # Measures the distance between two sequences as the number of edits
  # separating them, where an edit is one of:
  # * Insertion
  # * Deletion
  # * Substitution
  # * Transposition
  module DamerauLevenshtein
    # Compute the Damerau-Levenshtein distance between two sequences.
    #
    # @example
    #   DamerauLevenshtein.distance("acer", "earn")
    #   # => 3
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # the shorter sequence always indexes the rows
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # comparing Integer codepoints is faster than comparing Strings
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      height = seq1.length
      width = seq2.length
      return width if height.zero?
      return height if width.zero?

      # Padding value standing in for an "infinite" distance; any value
      # larger than both lengths works.
      sentinel = height + width

      # The grid carries one extra leading row and column of padding.
      # For width=3, height=2 (sentinel=5) the complete initial state is:
      #   [[5, 5, 5, 5, 5],
      #    [5, 0, 1, 2, 3],
      #    [5, 1, 0, 0, 0],
      #    [5, 2, 0, 0, 0]]
      # Only the first two rows exist up front; the rest are appended below.
      grid = [
        Array.new(width + 2, sentinel),
        [sentinel, *(0..width)]
      ]

      # item => last row (1-based) in which it was seen; 0 when never seen
      last_seen_row = Hash.new(0)

      seq1.each_with_index do |item1, index1|
        row = index1 + 1

        current = Array.new(width + 2, 0)
        current[0] = sentinel
        current[1] = row
        grid << current
        previous = grid[row]

        # last column (1-based) of this row where the items matched
        matched_col = 0

        seq2.each_with_index do |item2, index2|
          col = index2 + 1
          matched_row = last_seen_row[item2]
          step = item1 == item2 ? 0 : 1

          # cost of transposing with the most recent matching pair, plus the
          # items skipped over on both sides to reach it
          transposition = grid[matched_row][matched_col] + 1 +
                          (row - matched_row - 1) +
                          (col - matched_col - 1)

          current[col + 1] = [
            previous[col] + step,  # substitution
            current[col] + 1,      # insertion
            previous[col + 1] + 1, # deletion
            transposition
          ].min

          matched_col = col if step.zero?
        end

        last_seen_row[item1] = row
      end

      grid[height + 1][width + 1]
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Hamming distance between two sequences.
  #
  # @see https://en.wikipedia.org/wiki/Hamming_distance
  module Hamming
    # Count the positions at which two sequences hold different items.
    #
    # Sequences of unequal length are accepted: every item of the longer
    # sequence beyond the end of the shorter one counts as one difference.
    # Integer (bitwise) inputs are not supported.
    #
    # @note A true distance metric, satisfies triangle inequality.
    #
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer] Hamming distance
    def self.distance(seq1, seq2)
      shared, longest = [seq1.length, seq2.length].minmax
      mismatches = shared.times.count { |i| seq1[i] != seq2[i] }

      (longest - shared) + mismatches
    end
  end
end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Jaro similarity and distance.
  #
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
  module Jaro
    # Calculate Jaro similarity
    #
    # `Sj = 1/3 * ((m / |A|) + (m / |B|) + ((m - t) / m))`
    #
    # Where `m` is the number of matches and `t` the number of transpositions.
    # Identical sequences score 1.0; if either sequence is empty, or nothing
    # matches, the score is 0.0.
    #
    # @example
    #   Edits::Jaro.similarity("information", "informant")
    #   # => 0.9023569023569024
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Float]
    def self.similarity(seq1, seq2)
      return 1.0 if seq1 == seq2
      return 0.0 if seq1.empty? || seq2.empty?

      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      match_count, transpositions = jaro_matches(seq1, seq2)
      return 0.0 if match_count.zero?

      m = match_count.to_f
      (1.0 / 3) * ((m / seq1.length) + (m / seq2.length) + ((m - transpositions) / m))
    end

    # Calculate Jaro distance, defined as `1 - similarity`.
    #
    # @example
    #   Edits::Jaro.distance("information", "informant")
    #   # => 0.09764309764309764
    # @param str1 [String, Array]
    # @param str2 [String, Array]
    # @return [Float]
    def self.distance(str1, str2)
      1.0 - similarity(str1, str2)
    end

    # Calculate number of Jaro matches and transpositions
    #
    # @param seq1 [Array] items of the first sequence
    # @param seq2 [Array] items of the second sequence
    # @return [(Integer, Integer)] matches and transpositions
    def self.jaro_matches(seq1, seq2)
      shorter, longer = seq1.length > seq2.length ? [seq2, seq1] : [seq1, seq2]

      # search window: (max(|A|, |B|) / 2) - 1, never below zero
      window = [(longer.length / 2) - 1, 0].max
      final = longer.length - 1

      shorter_hits = Array.new(shorter.length, false)
      longer_hits = Array.new(longer.length, false)
      matches = 0

      # Pass 1: pair each item of the shorter sequence with the first
      # still-unclaimed equal item inside its window of the longer sequence.
      shorter.each_with_index do |item, i|
        from = [i - window, 0].max
        to = [i + window, final].min

        hit = (from..to).find { |j| !longer_hits[j] && longer[j] == item }
        next if hit.nil?

        longer_hits[hit] = true
        shorter_hits[i] = true
        matches += 1
      end

      return [0, 0] if matches.zero?

      # Pass 2: walk the matched items of both sequences in order; every
      # pair that disagrees is one half-transposition.
      half_transpositions = 0
      cursor = 0

      shorter.each_with_index do |item, i|
        next unless shorter_hits[i]

        # advance to the next matched item of the longer sequence
        cursor += 1 until longer_hits[cursor]

        half_transpositions += 1 if item != longer[cursor]
        cursor += 1
      end

      [matches, half_transpositions / 2]
    end
    private_class_method :jaro_matches
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Jaro-Winkler similarity and distance.
  #
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
  module JaroWinkler
    # Prefix scaling factor for jaro-winkler metric. Default is 0.1
    # Should not exceed 0.25 or metric range will leave 0..1
    WINKLER_PREFIX_WEIGHT = 0.1

    # Threshold for boosting Jaro with winkler prefix multiplier.
    # Default is 0.7
    WINKLER_THRESHOLD = 0.7

    # Calculate Jaro-Winkler similarity of given sequences
    #
    # Boosts the Jaro similarity according to the length of a common prefix
    # of up to 4 items, where one exists. The boost is only applied when the
    # Jaro similarity is greater than the threshold.
    #
    # `Sw = Sj + (l * p * (1 - Sj))`
    #
    # Where `Sj` is Jaro similarity, `l` is prefix length, and `p` is prefix
    # weight
    #
    # @example
    #   Edits::JaroWinkler.similarity("information", "informant")
    #   # => 0.9414141414141414
    #
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @param threshold [Float] threshold for applying Winkler prefix weighting
    # @param weight [Float] weighting for common prefix, should not exceed 0.25
    # @return [Float]
    def self.similarity(seq1, seq2,
                        threshold: WINKLER_THRESHOLD,
                        weight: WINKLER_PREFIX_WEIGHT)
      jaro = Jaro.similarity(seq1, seq2)
      return jaro unless jaro > threshold

      # length of the common prefix, capped at 4 and at the shorter length
      limit = [seq1.length, seq2.length, 4].min
      prefix = 0
      prefix += 1 while prefix < limit && seq1[prefix] == seq2[prefix]
      return jaro if prefix < 1

      jaro + (prefix * weight * (1 - jaro))
    end

    # Calculate Jaro-Winkler distance, defined as `1 - similarity`.
    #
    # @note Not a true distance metric, fails to satisfy triangle inequality.
    #
    # @example
    #   Edits::JaroWinkler.distance("information", "informant")
    #   # => 0.05858585858585863
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @param threshold [Float] threshold for applying Winkler prefix weighting
    # @param weight [Float] weighting for common prefix, should not exceed 0.25
    # @return [Float]
    def self.distance(seq1, seq2,
                      threshold: WINKLER_THRESHOLD,
                      weight: WINKLER_PREFIX_WEIGHT)
      1.0 - similarity(seq1, seq2, threshold: threshold, weight: weight)
    end
  end
end
@@ -0,0 +1,161 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Implementation of Levenshtein distance algorithm.
  #
  # Determines distance between two sequences by counting edits, identifying:
  # - Insertion
  # - Deletion
  # - Substitution
  module Levenshtein
    # Calculate the Levenshtein (edit) distance of two sequences.
    #
    # @note A true distance metric, satisfies triangle inequality.
    # @example
    #   Levenshtein.distance('sand', 'hands')
    #   # => 2
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # keep the shorter sequence on the row axis
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # array of Integer codepoints outperforms String
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      rows = seq1.length
      cols = seq2.length
      # rows <= cols after the swap, so this also covers two empty sequences
      return cols if rows.zero?

      # Initialize first row of cost matrix.
      # The full initial state where cols=3, rows=2 would be:
      #   [[0, 1, 2, 3],
      #    [1, 0, 0, 0],
      #    [2, 0, 0, 0]]
      last_row = 0.upto(cols).to_a

      rows.times do |row|
        # cost of the cell to the left; starts at the row's first column
        last_col = row + 1
        seq1_item = seq1[row]

        cols.times do |col|
          deletion = last_row[col + 1] + 1
          insertion = last_col + 1
          substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)

          # step cost is min of operation costs
          cost = deletion < insertion ? deletion : insertion
          cost = substitution if substitution < cost

          # overwrite previous row as we progress
          last_row[col] = last_col
          last_col = cost
        end
        last_row[cols] = last_col
      end

      last_row[cols]
    end

    # Calculate the Levenshtein (edit) distance of two sequences, bounded by
    # a maximum value. The result never exceeds `max`, including when one of
    # the sequences is empty.
    #
    # @example
    #   Edits::Levenshtein.distance("cloud", "crayon")
    #   # => 5
    #   Edits::Levenshtein.distance_with_max("cloud", "crayon", 2)
    #   # => 2
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @param max [Integer] maximum distance
    # @return [Integer]
    def self.distance_with_max(seq1, seq2, max)
      # keep the shorter sequence on the row axis
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      rows = seq1.length
      cols = seq2.length
      # rows <= cols after the swap; the distance to an empty sequence is the
      # other length, which must still be capped at max
      return (cols > max ? max : cols) if rows.zero?
      # the length difference is a lower bound on the distance
      return max if (cols - rows) >= max

      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      last_row = 0.upto(cols).to_a

      rows.times do |row|
        last_col_cost = row + 1
        seq1_item = seq1[row]

        # only cells within max of the main diagonal can hold a cost <= max
        min_col = row > max ? row - max : 0
        max_col = row + max
        max_col = cols - 1 if max_col > cols - 1
        # column of the previous row lying on the diagonal that ends in the
        # final cell
        diagonal = cols - rows + row

        cols.times do |col|
          # costs never decrease along a diagonal, so the result is >= max
          return max if diagonal == col && last_row[col] >= max

          col_cost =
            if col < min_col || col > max_col
              # outside the band: any value above max will do
              max + 1
            else
              # step cost is min of operation costs
              deletion = last_row[col + 1] + 1
              insertion = last_col_cost + 1
              substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)

              cost = deletion < insertion ? deletion : insertion
              substitution < cost ? substitution : cost
            end

          # overwrite previous row as we progress
          last_row[col] = last_col_cost
          last_col_cost = col_cost
        end

        last_row[cols] = last_col_cost
      end

      last_row[cols] > max ? max : last_row[cols]
    end

    # Given a prototype string and an array of strings, determines which
    # string is most similar to the prototype.
    #
    # `Levenshtein.most_similar("foo", strings)` is functionally equivalent to
    # `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging
    # {.distance_with_max}. When several strings share the smallest distance,
    # the earliest of them is returned.
    #
    # @example
    #   Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer])
    #   # => "tram"
    # @param prototype [String]
    # @param strings [<String>]
    # @return [String, nil] most similar string, or nil for empty array
    def self.most_similar(prototype, strings)
      return nil if strings.empty?

      min_s = strings[0]
      min_d = distance(prototype, min_s)

      strings[1..-1].each do |s|
        # nothing can beat an exact match
        return min_s if min_d.zero?

        # only distances below the current best matter, so bound by it
        d = distance_with_max(prototype, s, min_d)
        if d < min_d
          min_d = d
          min_s = s
        end
      end

      min_s
    end
  end
end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Implements Restricted Damerau-Levenshtein distance (Optimal Alignment)
  # algorithm.
  #
  # Measures the distance between two sequences as the number of edits
  # separating them, where an edit is one of:
  # * Insertion
  # * Deletion
  # * Substitution
  # * Swap of two adjacent items
  module RestrictedEdit
    # Compute the Restricted Damerau-Levenshtein distance (Optimal Alignment)
    # between two sequences.
    #
    # @note Not a true distance metric, fails to satisfy triangle inequality.
    # @example
    #   RestrictedEdit.distance("iota", "atom")
    #   # => 3
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # the shorter sequence always indexes the rows
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # comparing Integer codepoints is faster than comparing Strings
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      height = seq1.length
      width = seq2.length
      return width if height.zero?
      return height if width.zero?

      # Only the current row and the two before it are kept.
      # For width=3, height=2 the complete initial matrix would be:
      #   [[0, 1, 2, 3],
      #    [1, 0, 0, 0],
      #    [2, 0, 0, 0]]
      two_back = []
      one_back = []
      current = (0..width).to_a

      seq1.each_with_index do |item1, i|
        # rotate the retained rows and start a fresh one
        two_back, one_back = one_back, current
        current = Array.new(width + 1, 0)
        current[0] = i + 1

        seq2.each_with_index do |item2, j|
          same = item1 == item2

          best = [
            one_back[j + 1] + 1,         # deletion
            current[j] + 1,              # insertion
            one_back[j] + (same ? 0 : 1) # substitution
          ].min

          # a swap applies when this pair and the preceding pair are each
          # other's mirror image
          swappable = !same &&
                      i.positive? && j.positive? &&
                      item1 == seq2[j - 1] &&
                      seq1[i - 1] == item2
          if swappable
            swapped = two_back[j - 1] + 1
            best = swapped if swapped < best
          end

          current[j + 1] = best
        end
      end

      current[width]
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Edits
  # Version string of the gem; read by the gemspec.
  VERSION = "0.1.0"
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "benchmark"
4
+ require "edits"
5
+
6
desc "Compare metrics"
task :benchmark do
  # System word list, shuffled with a fixed seed so the order is reproducible.
  words = File.read("/usr/share/dict/words")
    .split(/\n/).compact.shuffle(random: Random.new(1))

  # report label => module under test, in reporting order
  metrics = {
    "Hamming" => Edits::Hamming,
    "Levenshtein" => Edits::Levenshtein,
    "RestrictedEdit" => Edits::RestrictedEdit,
    "DamerauLevenshtein" => Edits::DamerauLevenshtein,
    "Jaro" => Edits::Jaro,
    "JaroWinkler" => Edits::JaroWinkler
  }

  Benchmark.bm(20) do |x|
    metrics.each do |label, metric|
      x.report(label) do
        # measure the distance between each pair of neighbouring words
        words.each_cons(2) { |a, b| metric.distance a, b }
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "benchmark"
4
+ require "benchmark/ips"
5
+ require "edits"
6
+
7
namespace :benchmark do
  desc "distance vs. distance_with_max (x100)"
  task :lev_max do
    # 101 words give 100 pairs of neighbours; the fixed seed keeps the
    # selection reproducible.
    words = File.read("/usr/share/dict/words")
      .split(/\n/).compact.shuffle(random: Random.new(1))
      .take(101)

    Benchmark.ips do |x|
      x.report("distance") do
        words.each_cons(2) do |a, b|
          Edits::Levenshtein.distance a, b
        end
      end

      # Labels are generated from the bound itself so the two cannot drift
      # apart (the largest bound used to be reported as "with max 50" while
      # actually running with 100).
      [1, 2, 3, 4, 6, 8, 100].each do |max|
        x.report("with max #{max}") do
          words.each_cons(2) do |a, b|
            Edits::Levenshtein.distance_with_max a, b, max
          end
        end
      end

      x.compare!
    end
  end

  desc "most_similar vs. min_by (100 words)"
  task :lev_similar do
    words = File.read("/usr/share/dict/words")
      .split(/\n/).compact.shuffle(random: Random.new(1))
      .take(100)

    Benchmark.ips do |x|
      x.report("most_similar") do
        Edits::Levenshtein.most_similar("wxyz", words)
      end

      x.report("min_by") do
        words.min_by { |s| Edits::Levenshtein.distance("wxyz", s) }
      end

      x.compare!
    end
  end

  # Compares ways of building a matrix row with one leading non-zero cell.
  task :rowgen1 do
    cols = 5
    rows = 3

    Benchmark.ips do |x|
      x.report "new, unshift" do
        Array.new(cols, 0).unshift(rows)
      end

      x.report "new, []=" do
        curr_row = Array.new(cols + 1, 0)
        curr_row[0] = rows
      end

      x.report "literal, concat" do
        [rows].concat(Array.new(cols, 0))
      end

      x.report "literal, +" do
        m = []
        m << [rows] + Array.new(cols, 0)
      end

      x.compare!
    end
  end

  # Compares ways of building a matrix row with two leading non-zero cells
  # and appending it to a matrix.
  task :rowgen2 do
    cols = 5
    rows = 3
    inf = cols + rows

    Benchmark.ips do |x|
      x.report "new, unshift" do
        m = []
        m << Array.new(cols, 0).unshift(rows, inf)
      end

      x.report "new, []=" do
        m = []
        curr_row = Array.new(cols + 2, 0)
        curr_row[0] = rows
        curr_row[1] = inf
        m << curr_row
      end

      x.report "literal, concat" do
        m = []
        m << [rows, inf].concat(Array.new(cols, 0))
      end

      x.report "literal, +" do
        m = []
        m << [rows, inf] + Array.new(cols, 0)
      end

      x.compare!
    end
  end
end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: edits
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tom Crouch
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-09-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.15'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.15'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: benchmark-ips
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: redcarpet
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.4'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.4'
83
+ - !ruby/object:Gem::Dependency
84
+ name: yard
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.9'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.9'
97
+ description:
98
+ email:
99
+ - tom.crouch@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".rubocop.yml"
107
+ - ".travis.yml"
108
+ - ".yardopts"
109
+ - Gemfile
110
+ - LICENSE.txt
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/setup
115
+ - edits.gemspec
116
+ - lib/edits.rb
117
+ - lib/edits/damerau_levenshtein.rb
118
+ - lib/edits/hamming.rb
119
+ - lib/edits/jaro.rb
120
+ - lib/edits/jaro_winkler.rb
121
+ - lib/edits/levenshtein.rb
122
+ - lib/edits/restricted_edit.rb
123
+ - lib/edits/version.rb
124
+ - tasks/benchmark.rake
125
+ - tasks/benchmark/levenshtein.rake
126
+ homepage: https://github.com/tcrouch/edits
127
+ licenses:
128
+ - MIT
129
+ metadata: {}
130
+ post_install_message:
131
+ rdoc_options: []
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 2.6.8
147
+ signing_key:
148
+ specification_version: 4
149
+ summary: A collection of edit distance algorithms.
150
+ test_files: []