edits 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/.travis.yml +4 -3
- data/README.md +8 -1
- data/edits.gemspec +3 -2
- data/lib/edits/compare.rb +2 -0
- data/lib/edits/damerau_levenshtein.rb +1 -1
- data/lib/edits/hamming.rb +4 -2
- data/lib/edits/jaro.rb +3 -2
- data/lib/edits/jaro_winkler.rb +11 -14
- data/lib/edits/levenshtein.rb +5 -3
- data/lib/edits/restricted_edit.rb +2 -2
- data/lib/edits/version.rb +1 -1
- data/tasks/benchmark/levenshtein.rake +3 -3
- metadata +19 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1b79624e5ce72cbf1e7623ecb909c42b1fc6a82763e922b30b25589a040f2c1
|
4
|
+
data.tar.gz: 26175c109066ecc63c4df37493493011724e9e4dfe0c57cbae47411f6a4daf85
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c4b25223ef3db51afad11160fab94028fe5ab9d35f8918b63ac6cc5c3894245bea843247717ef1f93e627923dbdaf6c5844a4a0863f00598515bf0e07aa87a8
|
7
|
+
data.tar.gz: 4d060b4eebc4a2cad7b37b1ba6e4e30a4c5c2f38640ce2244960bf5ee0cba06b62ac577beb271e0919df784c999e69a740d4ec47daedac0cf1f06f4c395d7ce4
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
@@ -3,8 +3,9 @@ language: ruby
|
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
5
|
- 2.3
|
6
|
-
- 2.4
|
7
|
-
- 2.5
|
6
|
+
- 2.4
|
7
|
+
- 2.5.3
|
8
|
+
- 2.6
|
8
9
|
- ruby-head
|
9
10
|
- rbx-3
|
10
11
|
matrix:
|
@@ -19,7 +20,7 @@ jobs:
|
|
19
20
|
include:
|
20
21
|
- stage: gem release
|
21
22
|
if: tag IS present
|
22
|
-
rvm: 2.
|
23
|
+
rvm: 2.5.3
|
23
24
|
script: echo "Deploying to rubygems.org ..."
|
24
25
|
deploy:
|
25
26
|
provider: rubygems
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Edits
|
2
2
|
|
3
3
|
[](https://travis-ci.org/tcrouch/edits)
|
4
|
-
[](https://app.codacy.com/app/t.crouch/edits?utm_source=github.com&utm_medium=referral&utm_content=tcrouch/edits&utm_campaign=Badge_Grade_Dashboard)
|
5
5
|
[](http://inch-ci.org/github/tcrouch/edits)
|
6
6
|
[](http://rubydoc.info/github/tcrouch/edits)
|
7
7
|
|
@@ -95,6 +95,13 @@ Edits::JaroWinkler.distance "information", "informant"
|
|
95
95
|
# => 0.05858585858585863
|
96
96
|
```
|
97
97
|
|
98
|
+
### Hamming
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
Edits::Hamming.distance("explorer", "exploded")
|
102
|
+
# => 2
|
103
|
+
```
|
104
|
+
|
98
105
|
## Development
|
99
106
|
|
100
107
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/edits.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
lib = File.expand_path("
|
3
|
+
lib = File.expand_path("lib", __dir__)
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
5
|
require "edits/version"
|
6
6
|
|
@@ -27,7 +27,8 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.require_paths = ["lib"]
|
28
28
|
|
29
29
|
spec.add_development_dependency "benchmark-ips", "~> 2.7"
|
30
|
-
spec.add_development_dependency "bundler", "~>
|
30
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
31
|
+
spec.add_development_dependency "codacy-coverage", "~> 2.1"
|
31
32
|
spec.add_development_dependency "rake", "~> 12.1"
|
32
33
|
spec.add_development_dependency "redcarpet", "~> 3.4"
|
33
34
|
spec.add_development_dependency "rspec", "~> 3.6"
|
data/lib/edits/compare.rb
CHANGED
@@ -18,11 +18,13 @@ module Edits
|
|
18
18
|
# @return [String, nil] most similar string, or nil for empty array
|
19
19
|
def most_similar(prototype, strings)
|
20
20
|
return nil if strings.empty?
|
21
|
+
|
21
22
|
min_s = strings[0]
|
22
23
|
min_d = distance(prototype, min_s)
|
23
24
|
|
24
25
|
strings[1..-1].each do |s|
|
25
26
|
return min_s if min_d.zero?
|
27
|
+
|
26
28
|
d = distance_with_max(prototype, s, min_d)
|
27
29
|
if d < min_d
|
28
30
|
min_d = d
|
data/lib/edits/damerau_levenshtein.rb
CHANGED
@@ -16,7 +16,7 @@ module Edits
|
|
16
16
|
# # => 3
|
17
17
|
# @param seq1 [String, Array]
|
18
18
|
# @param seq2 [String, Array]
|
19
|
-
# @return [Integer]
|
19
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
20
20
|
def self.distance(seq1, seq2)
|
21
21
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
22
22
|
|
data/lib/edits/hamming.rb
CHANGED
@@ -8,10 +8,12 @@ module Edits
|
|
8
8
|
# Calculate the Hamming distance between two sequences.
|
9
9
|
#
|
10
10
|
# @note A true distance metric, satisfies triangle inequality.
|
11
|
-
#
|
11
|
+
# @example
|
12
|
+
# Edits::Hamming.distance("explorer", "exploded")
|
13
|
+
# # => 2
|
12
14
|
# @param seq1 [String, Array]
|
13
15
|
# @param seq2 [String, Array]
|
14
|
-
# @return [Integer]
|
16
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
15
17
|
def self.distance(seq1, seq2)
|
16
18
|
# if seq1.is_a?(Integer) && seq2.is_a?(Integer)
|
17
19
|
# return (seq1 ^ seq2).to_s(2).count("1")
|
data/lib/edits/jaro.rb
CHANGED
@@ -16,7 +16,7 @@ module Edits
|
|
16
16
|
# # => 0.9023569023569024
|
17
17
|
# @param seq1 [String, Array]
|
18
18
|
# @param seq2 [String, Array]
|
19
|
-
# @return [Float] similarity,
|
19
|
+
# @return [Float] similarity, from 0.0 (none) to 1.0 (identical)
|
20
20
|
def self.similarity(seq1, seq2)
|
21
21
|
return 1.0 if seq1 == seq2
|
22
22
|
return 0.0 if seq1.empty? || seq2.empty?
|
@@ -39,7 +39,7 @@ module Edits
|
|
39
39
|
# Edits::Jaro.distance("information", "informant")
|
40
40
|
# # => 0.09764309764309764
|
41
41
|
# @param (see #distance)
|
42
|
-
# @return [Float] distance,
|
42
|
+
# @return [Float] distance, from 0.0 (identical) to 1.0 (distant)
|
43
43
|
def self.distance(str1, str2)
|
44
44
|
1.0 - similarity(str1, str2)
|
45
45
|
end
|
@@ -87,6 +87,7 @@ module Edits
|
|
87
87
|
seq1.length.times do |i|
|
88
88
|
# find a match in first string
|
89
89
|
next unless seq1_flags[i] == true
|
90
|
+
|
90
91
|
# go to location of next match on second string
|
91
92
|
j += 1 until seq2_flags[j]
|
92
93
|
|
data/lib/edits/jaro_winkler.rb
CHANGED
@@ -9,7 +9,7 @@ module Edits
|
|
9
9
|
# Should not exceed 0.25 or metric range will leave 0..1
|
10
10
|
WINKLER_PREFIX_WEIGHT = 0.1
|
11
11
|
|
12
|
-
# Threshold for boosting Jaro with
|
12
|
+
# Threshold for boosting Jaro with Winkler prefix multiplier.
|
13
13
|
# Default is 0.7
|
14
14
|
WINKLER_THRESHOLD = 0.7
|
15
15
|
|
@@ -31,27 +31,24 @@ module Edits
|
|
31
31
|
# @param seq2 [String, Array]
|
32
32
|
# @param threshold [Float] threshold for applying Winkler prefix weighting
|
33
33
|
# @param weight [Float] weighting for common prefix, should not exceed 0.25
|
34
|
-
# @return [Float] similarity,
|
34
|
+
# @return [Float] similarity, from 0.0 (none) to 1.0 (identical)
|
35
35
|
def self.similarity(
|
36
36
|
seq1, seq2,
|
37
37
|
threshold: WINKLER_THRESHOLD,
|
38
38
|
weight: WINKLER_PREFIX_WEIGHT
|
39
39
|
)
|
40
40
|
|
41
|
-
|
41
|
+
sj = Jaro.similarity(seq1, seq2)
|
42
|
+
return sj unless sj > threshold
|
42
43
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
max_bound = 4 if max_bound > 4
|
44
|
+
# size of common prefix, max 4
|
45
|
+
max_bound = seq1.length > seq2.length ? seq2.length : seq1.length
|
46
|
+
max_bound = 4 if max_bound > 4
|
47
47
|
|
48
|
-
|
49
|
-
|
48
|
+
l = 0
|
49
|
+
l += 1 until seq1[l] != seq2[l] || l >= max_bound
|
50
50
|
|
51
|
-
|
52
|
-
else
|
53
|
-
dj
|
54
|
-
end
|
51
|
+
l < 1 ? sj : sj + (l * weight * (1 - sj))
|
55
52
|
end
|
56
53
|
|
57
54
|
# Calculate Jaro-Winkler distance
|
@@ -62,7 +59,7 @@ module Edits
|
|
62
59
|
# Edits::JaroWinkler.distance("information", "informant")
|
63
60
|
# # => 0.05858585858585863
|
64
61
|
# @param (see #distance)
|
65
|
-
# @return [Float] distance,
|
62
|
+
# @return [Float] distance, from 0.0 (identical) to 1.0 (distant)
|
66
63
|
def self.distance(
|
67
64
|
seq1, seq2,
|
68
65
|
threshold: WINKLER_THRESHOLD,
|
data/lib/edits/levenshtein.rb
CHANGED
@@ -7,6 +7,8 @@ module Edits
|
|
7
7
|
# * Insertion
|
8
8
|
# * Deletion
|
9
9
|
# * Substitution
|
10
|
+
#
|
11
|
+
# @see https://en.wikipedia.org/wiki/Levenshtein_distance
|
10
12
|
module Levenshtein
|
11
13
|
extend Compare
|
12
14
|
|
@@ -14,11 +16,11 @@ module Edits
|
|
14
16
|
#
|
15
17
|
# @note A true distance metric, satisfies triangle inequality.
|
16
18
|
# @example
|
17
|
-
# Levenshtein.distance(
|
19
|
+
# Levenshtein.distance("sand", "hands")
|
18
20
|
# # => 2
|
19
21
|
# @param seq1 [String, Array]
|
20
22
|
# @param seq2 [String, Array]
|
21
|
-
# @return [Integer]
|
23
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
22
24
|
def self.distance(seq1, seq2)
|
23
25
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
24
26
|
|
@@ -74,7 +76,7 @@ module Edits
|
|
74
76
|
# @param seq1 [String, Array]
|
75
77
|
# @param seq2 [String, Array]
|
76
78
|
# @param max [Integer] maximum distance
|
77
|
-
# @return [Integer]
|
79
|
+
# @return [Integer] distance, from 0 (identical) to max (more distant)
|
78
80
|
def self.distance_with_max(seq1, seq2, max)
|
79
81
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
80
82
|
|
data/lib/edits/restricted_edit.rb
CHANGED
@@ -24,7 +24,7 @@ module Edits
|
|
24
24
|
# # => 3
|
25
25
|
# @param seq1 [String, Array]
|
26
26
|
# @param seq2 [String, Array]
|
27
|
-
# @return [Integer]
|
27
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
28
28
|
def self.distance(seq1, seq2)
|
29
29
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
30
30
|
|
@@ -96,7 +96,7 @@ module Edits
|
|
96
96
|
# @param seq1 [String, Array]
|
97
97
|
# @param seq2 [String, Array]
|
98
98
|
# @param max [Integer] maximum distance
|
99
|
-
# @return [Integer]
|
99
|
+
# @return [Integer] distance, from 0 (identical) to max (more distant)
|
100
100
|
def self.distance_with_max(seq1, seq2, max)
|
101
101
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
102
102
|
|
data/lib/edits/version.rb
CHANGED
data/tasks/benchmark/levenshtein.rake
CHANGED
@@ -5,7 +5,7 @@ require "benchmark/ips"
|
|
5
5
|
require "edits"
|
6
6
|
|
7
7
|
namespace :benchmark do
|
8
|
-
desc "levenshtein distance vs
|
8
|
+
desc "levenshtein distance vs distance_with_max (x100)"
|
9
9
|
task :lev_max do
|
10
10
|
words = File.read("/usr/share/dict/words")
|
11
11
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
@@ -64,7 +64,7 @@ namespace :benchmark do
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
|
-
desc "restricted distance vs
|
67
|
+
desc "restricted distance vs distance_with_max (x100)"
|
68
68
|
task :restricted_max do
|
69
69
|
words = File.read("/usr/share/dict/words")
|
70
70
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
@@ -123,7 +123,7 @@ namespace :benchmark do
|
|
123
123
|
end
|
124
124
|
end
|
125
125
|
|
126
|
-
desc "most_similar vs
|
126
|
+
desc "most_similar vs min_by (100 words)"
|
127
127
|
task :lev_similar do
|
128
128
|
words = File.read("/usr/share/dict/words")
|
129
129
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: edits
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Crouch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|
@@ -30,14 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: codacy-coverage
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.1'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.1'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -146,8 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
160
|
- !ruby/object:Gem::Version
|
147
161
|
version: '0'
|
148
162
|
requirements: []
|
149
|
-
|
150
|
-
rubygems_version: 2.7.6
|
163
|
+
rubygems_version: 3.0.3
|
151
164
|
signing_key:
|
152
165
|
specification_version: 4
|
153
166
|
summary: A collection of edit distance algorithms.
|