edits 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/.travis.yml +4 -3
- data/README.md +8 -1
- data/edits.gemspec +3 -2
- data/lib/edits/compare.rb +2 -0
- data/lib/edits/damerau_levenshtein.rb +1 -1
- data/lib/edits/hamming.rb +4 -2
- data/lib/edits/jaro.rb +3 -2
- data/lib/edits/jaro_winkler.rb +11 -14
- data/lib/edits/levenshtein.rb +5 -3
- data/lib/edits/restricted_edit.rb +2 -2
- data/lib/edits/version.rb +1 -1
- data/tasks/benchmark/levenshtein.rake +3 -3
- metadata +19 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1b79624e5ce72cbf1e7623ecb909c42b1fc6a82763e922b30b25589a040f2c1
|
4
|
+
data.tar.gz: 26175c109066ecc63c4df37493493011724e9e4dfe0c57cbae47411f6a4daf85
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c4b25223ef3db51afad11160fab94028fe5ab9d35f8918b63ac6cc5c3894245bea843247717ef1f93e627923dbdaf6c5844a4a0863f00598515bf0e07aa87a8
|
7
|
+
data.tar.gz: 4d060b4eebc4a2cad7b37b1ba6e4e30a4c5c2f38640ce2244960bf5ee0cba06b62ac577beb271e0919df784c999e69a740d4ec47daedac0cf1f06f4c395d7ce4
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
@@ -3,8 +3,9 @@ language: ruby
|
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
5
|
- 2.3
|
6
|
-
- 2.4
|
7
|
-
- 2.5
|
6
|
+
- 2.4
|
7
|
+
- 2.5.3
|
8
|
+
- 2.6
|
8
9
|
- ruby-head
|
9
10
|
- rbx-3
|
10
11
|
matrix:
|
@@ -19,7 +20,7 @@ jobs:
|
|
19
20
|
include:
|
20
21
|
- stage: gem release
|
21
22
|
if: tag IS present
|
22
|
-
rvm: 2.
|
23
|
+
rvm: 2.5.3
|
23
24
|
script: echo "Deploying to rubygems.org ..."
|
24
25
|
deploy:
|
25
26
|
provider: rubygems
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Edits
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/tcrouch/edits.svg?branch=master)](https://travis-ci.org/tcrouch/edits)
|
4
|
-
[![
|
4
|
+
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/64cb50b8e9ce4ec2a752d091e441b09d)](https://app.codacy.com/app/t.crouch/edits?utm_source=github.com&utm_medium=referral&utm_content=tcrouch/edits&utm_campaign=Badge_Grade_Dashboard)
|
5
5
|
[![Inline docs](http://inch-ci.org/github/tcrouch/edits.svg?branch=master)](http://inch-ci.org/github/tcrouch/edits)
|
6
6
|
[![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](http://rubydoc.info/github/tcrouch/edits)
|
7
7
|
|
@@ -95,6 +95,13 @@ Edits::JaroWinkler.distance "information", "informant"
|
|
95
95
|
# => 0.05858585858585863
|
96
96
|
```
|
97
97
|
|
98
|
+
### Hamming
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
Edits::Hamming.distance("explorer", "exploded")
|
102
|
+
# => 2
|
103
|
+
```
|
104
|
+
|
98
105
|
## Development
|
99
106
|
|
100
107
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/edits.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
lib = File.expand_path("
|
3
|
+
lib = File.expand_path("lib", __dir__)
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
5
|
require "edits/version"
|
6
6
|
|
@@ -27,7 +27,8 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.require_paths = ["lib"]
|
28
28
|
|
29
29
|
spec.add_development_dependency "benchmark-ips", "~> 2.7"
|
30
|
-
spec.add_development_dependency "bundler", "~>
|
30
|
+
spec.add_development_dependency "bundler", "~> 2.0"
|
31
|
+
spec.add_development_dependency "codacy-coverage", "~> 2.1"
|
31
32
|
spec.add_development_dependency "rake", "~> 12.1"
|
32
33
|
spec.add_development_dependency "redcarpet", "~> 3.4"
|
33
34
|
spec.add_development_dependency "rspec", "~> 3.6"
|
data/lib/edits/compare.rb
CHANGED
@@ -18,11 +18,13 @@ module Edits
|
|
18
18
|
# @return [String, nil] most similar string, or nil for empty array
|
19
19
|
def most_similar(prototype, strings)
|
20
20
|
return nil if strings.empty?
|
21
|
+
|
21
22
|
min_s = strings[0]
|
22
23
|
min_d = distance(prototype, min_s)
|
23
24
|
|
24
25
|
strings[1..-1].each do |s|
|
25
26
|
return min_s if min_d.zero?
|
27
|
+
|
26
28
|
d = distance_with_max(prototype, s, min_d)
|
27
29
|
if d < min_d
|
28
30
|
min_d = d
|
data/lib/edits/damerau_levenshtein.rb
CHANGED
@@ -16,7 +16,7 @@ module Edits
|
|
16
16
|
# # => 3
|
17
17
|
# @param seq1 [String, Array]
|
18
18
|
# @param seq2 [String, Array]
|
19
|
-
# @return [Integer]
|
19
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
20
20
|
def self.distance(seq1, seq2)
|
21
21
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
22
22
|
|
data/lib/edits/hamming.rb
CHANGED
@@ -8,10 +8,12 @@ module Edits
|
|
8
8
|
# Calculate the Hamming distance between two sequences.
|
9
9
|
#
|
10
10
|
# @note A true distance metric, satisfies triangle inequality.
|
11
|
-
#
|
11
|
+
# @example
|
12
|
+
# Edits::Hamming.distance("explorer", "exploded")
|
13
|
+
# # => 2
|
12
14
|
# @param seq1 [String, Array]
|
13
15
|
# @param seq2 [String, Array]
|
14
|
-
# @return [Integer]
|
16
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
15
17
|
def self.distance(seq1, seq2)
|
16
18
|
# if seq1.is_a?(Integer) && seq2.is_a?(Integer)
|
17
19
|
# return (seq1 ^ seq2).to_s(2).count("1")
|
data/lib/edits/jaro.rb
CHANGED
@@ -16,7 +16,7 @@ module Edits
|
|
16
16
|
# # => 0.9023569023569024
|
17
17
|
# @param seq1 [String, Array]
|
18
18
|
# @param seq2 [String, Array]
|
19
|
-
# @return [Float] similarity,
|
19
|
+
# @return [Float] similarity, from 0.0 (none) to 1.0 (identical)
|
20
20
|
def self.similarity(seq1, seq2)
|
21
21
|
return 1.0 if seq1 == seq2
|
22
22
|
return 0.0 if seq1.empty? || seq2.empty?
|
@@ -39,7 +39,7 @@ module Edits
|
|
39
39
|
# Edits::Jaro.distance("information", "informant")
|
40
40
|
# # => 0.09764309764309764
|
41
41
|
# @param (see #distance)
|
42
|
-
# @return [Float] distance,
|
42
|
+
# @return [Float] distance, from 0.0 (identical) to 1.0 (distant)
|
43
43
|
def self.distance(str1, str2)
|
44
44
|
1.0 - similarity(str1, str2)
|
45
45
|
end
|
@@ -87,6 +87,7 @@ module Edits
|
|
87
87
|
seq1.length.times do |i|
|
88
88
|
# find a match in first string
|
89
89
|
next unless seq1_flags[i] == true
|
90
|
+
|
90
91
|
# go to location of next match on second string
|
91
92
|
j += 1 until seq2_flags[j]
|
92
93
|
|
data/lib/edits/jaro_winkler.rb
CHANGED
@@ -9,7 +9,7 @@ module Edits
|
|
9
9
|
# Should not exceed 0.25 or metric range will leave 0..1
|
10
10
|
WINKLER_PREFIX_WEIGHT = 0.1
|
11
11
|
|
12
|
-
# Threshold for boosting Jaro with
|
12
|
+
# Threshold for boosting Jaro with Winkler prefix multiplier.
|
13
13
|
# Default is 0.7
|
14
14
|
WINKLER_THRESHOLD = 0.7
|
15
15
|
|
@@ -31,27 +31,24 @@ module Edits
|
|
31
31
|
# @param seq2 [String, Array]
|
32
32
|
# @param threshold [Float] threshold for applying Winkler prefix weighting
|
33
33
|
# @param weight [Float] weighting for common prefix, should not exceed 0.25
|
34
|
-
# @return [Float] similarity,
|
34
|
+
# @return [Float] similarity, from 0.0 (none) to 1.0 (identical)
|
35
35
|
def self.similarity(
|
36
36
|
seq1, seq2,
|
37
37
|
threshold: WINKLER_THRESHOLD,
|
38
38
|
weight: WINKLER_PREFIX_WEIGHT
|
39
39
|
)
|
40
40
|
|
41
|
-
|
41
|
+
sj = Jaro.similarity(seq1, seq2)
|
42
|
+
return sj unless sj > threshold
|
42
43
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
max_bound = 4 if max_bound > 4
|
44
|
+
# size of common prefix, max 4
|
45
|
+
max_bound = seq1.length > seq2.length ? seq2.length : seq1.length
|
46
|
+
max_bound = 4 if max_bound > 4
|
47
47
|
|
48
|
-
|
49
|
-
|
48
|
+
l = 0
|
49
|
+
l += 1 until seq1[l] != seq2[l] || l >= max_bound
|
50
50
|
|
51
|
-
|
52
|
-
else
|
53
|
-
dj
|
54
|
-
end
|
51
|
+
l < 1 ? sj : sj + (l * weight * (1 - sj))
|
55
52
|
end
|
56
53
|
|
57
54
|
# Calculate Jaro-Winkler distance
|
@@ -62,7 +59,7 @@ module Edits
|
|
62
59
|
# Edits::JaroWinkler.distance("information", "informant")
|
63
60
|
# # => 0.05858585858585863
|
64
61
|
# @param (see #distance)
|
65
|
-
# @return [Float] distance,
|
62
|
+
# @return [Float] distance, from 0.0 (identical) to 1.0 (distant)
|
66
63
|
def self.distance(
|
67
64
|
seq1, seq2,
|
68
65
|
threshold: WINKLER_THRESHOLD,
|
data/lib/edits/levenshtein.rb
CHANGED
@@ -7,6 +7,8 @@ module Edits
|
|
7
7
|
# * Insertion
|
8
8
|
# * Deletion
|
9
9
|
# * Substitution
|
10
|
+
#
|
11
|
+
# @see https://en.wikipedia.org/wiki/Levenshtein_distance
|
10
12
|
module Levenshtein
|
11
13
|
extend Compare
|
12
14
|
|
@@ -14,11 +16,11 @@ module Edits
|
|
14
16
|
#
|
15
17
|
# @note A true distance metric, satisfies triangle inequality.
|
16
18
|
# @example
|
17
|
-
# Levenshtein.distance(
|
19
|
+
# Levenshtein.distance("sand", "hands")
|
18
20
|
# # => 2
|
19
21
|
# @param seq1 [String, Array]
|
20
22
|
# @param seq2 [String, Array]
|
21
|
-
# @return [Integer]
|
23
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
22
24
|
def self.distance(seq1, seq2)
|
23
25
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
24
26
|
|
@@ -74,7 +76,7 @@ module Edits
|
|
74
76
|
# @param seq1 [String, Array]
|
75
77
|
# @param seq2 [String, Array]
|
76
78
|
# @param max [Integer] maximum distance
|
77
|
-
# @return [Integer]
|
79
|
+
# @return [Integer] distance, from 0 (identical) to max (more distant)
|
78
80
|
def self.distance_with_max(seq1, seq2, max)
|
79
81
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
80
82
|
|
data/lib/edits/restricted_edit.rb
CHANGED
@@ -24,7 +24,7 @@ module Edits
|
|
24
24
|
# # => 3
|
25
25
|
# @param seq1 [String, Array]
|
26
26
|
# @param seq2 [String, Array]
|
27
|
-
# @return [Integer]
|
27
|
+
# @return [Integer] distance, 0 (identical) or greater (more distant)
|
28
28
|
def self.distance(seq1, seq2)
|
29
29
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
30
30
|
|
@@ -96,7 +96,7 @@ module Edits
|
|
96
96
|
# @param seq1 [String, Array]
|
97
97
|
# @param seq2 [String, Array]
|
98
98
|
# @param max [Integer] maximum distance
|
99
|
-
# @return [Integer]
|
99
|
+
# @return [Integer] distance, from 0 (identical) to max (more distant)
|
100
100
|
def self.distance_with_max(seq1, seq2, max)
|
101
101
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
102
102
|
|
data/lib/edits/version.rb
CHANGED
data/tasks/benchmark/levenshtein.rake
CHANGED
@@ -5,7 +5,7 @@ require "benchmark/ips"
|
|
5
5
|
require "edits"
|
6
6
|
|
7
7
|
namespace :benchmark do
|
8
|
-
desc "levenshtein distance vs
|
8
|
+
desc "levenshtein distance vs distance_with_max (x100)"
|
9
9
|
task :lev_max do
|
10
10
|
words = File.read("/usr/share/dict/words")
|
11
11
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
@@ -64,7 +64,7 @@ namespace :benchmark do
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
|
-
desc "restricted distance vs
|
67
|
+
desc "restricted distance vs distance_with_max (x100)"
|
68
68
|
task :restricted_max do
|
69
69
|
words = File.read("/usr/share/dict/words")
|
70
70
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
@@ -123,7 +123,7 @@ namespace :benchmark do
|
|
123
123
|
end
|
124
124
|
end
|
125
125
|
|
126
|
-
desc "most_similar vs
|
126
|
+
desc "most_similar vs min_by (100 words)"
|
127
127
|
task :lev_similar do
|
128
128
|
words = File.read("/usr/share/dict/words")
|
129
129
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: edits
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Crouch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|
@@ -30,14 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: codacy-coverage
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.1'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.1'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -146,8 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
160
|
- !ruby/object:Gem::Version
|
147
161
|
version: '0'
|
148
162
|
requirements: []
|
149
|
-
|
150
|
-
rubygems_version: 2.7.6
|
163
|
+
rubygems_version: 3.0.3
|
151
164
|
signing_key:
|
152
165
|
specification_version: 4
|
153
166
|
summary: A collection of edit distance algorithms.
|