likeness 1.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/likeness.rb +79 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 27ccde505442d156b4a2a80a493c05b8cfa88ca2210d05cd6c473c991f7bf09b
|
4
|
+
data.tar.gz: b2e0173619708676cd1cc0a92936d716417eded6653785f4d37007334163600f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3a1428413c19c5e97d4c58cd211dcb2940fed6d943f7958251741b234b4d46187fba2ed5eedfb846d87766b0364469a8ad988266f89bf7229975248335ffb843
|
7
|
+
data.tar.gz: 6eb7515e3c599a4a5b8aef05ca724a8dc8d9aff924cef2fb35b06d853624504ed41cbf64cb21ec6d189ab90299239ae296230add48e00d09cdb484b541b90c0c
|
data/likeness.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
# Scores how alike two strings are by comparing the fixed-width "shingles"
# (overlapping substrings) extracted from each of their words.
#
# A Likeness instance holds the comparison settings (shingle width, word
# splitter, canonicalizing cleaner) so that many strings can be compared
# under the same rules.
class Likeness
  # Builds a comparer that can be used to compare many strings.
  #
  # @param options [Hash] optional settings
  # @option options [Integer] :width (2) size of the shingles to extract
  #   from each word
  # @option options [Regexp, String, #call] :splitter (/\W+/) how a cleaned
  #   string is broken into words. A pattern is handed to String#split; a
  #   callable (proc/lambda) receives the cleaned string and must return the
  #   list of words. Pass /\s+/ to split on strict whitespace; the default
  #   /\W+/ basically only leaves alpha-numerics.
  # @yieldparam str [String] the raw string to canonicalize
  # @yieldreturn [String] the canonical form that is handed to the splitter
  #
  # Without a block, strings are lower-cased and apostrophes that directly
  # follow a word character are dropped. A caller that wants no lower-casing
  # can pass a no-op block { |s| s }; a caller that does not want apostrophes
  # quietly ignored can pass (&:downcase).
  def initialize(options = {}, &cleaner)
    @width = options[:width] || 2

    # Anything that is not already callable is treated as a split pattern and
    # wrapped, so the rest of the class can always use @splitter.call.
    splitter = options[:splitter] || /\W+/
    @splitter = splitter.respond_to?(:call) ? splitter : lambda { |str| str.split(splitter) }

    # Honour the caller's block. The previous `@cleaner ||= ...` never looked
    # at the block, so the default cleaner was always used.
    @cleaner = cleaner || lambda { |str| str.downcase.gsub(/(?<=\w)'/, '') }
  end # initialize

  # Scores two SORTED shingle lists (as produced by #shingles).
  #
  # @param shingles1 [Array<String>] sorted shingles of the first string
  # @param shingles2 [Array<String>] sorted shingles of the second string
  # @return [Float] twice the number of matched shingles divided by the total
  #   number of shingles in both lists; ranges from 0.0 to 1.0
  def self.match(shingles1, shingles2)
    l1 = shingles1.length
    l2 = shingles2.length
    n = l1 + l2

    # Two empty lists match perfectly. NOTE: strings full of separator also
    # produce empty lists.
    return 1.0 if n.zero?

    # Count matching shingles by stepping through both sorted lists at once.
    i1 = i2 = k = 0
    while i1 < l1 && i2 < l2
      if shingles1[i1] < shingles2[i2]
        i1 += 1 # shingles1 is behind; let it catch up
      elsif shingles1[i1] > shingles2[i2]
        i2 += 1 # ditto for shingles2
      else
        # Same shingle on both sides: count it and consume it from each list.
        i1 += 1
        i2 += 1
        k += 1
      end
    end

    (2.0 * k) / n
  end # self.match

  # Compares two raw strings under this comparer's settings.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [Float] similarity score from 0.0 to 1.0
  def match(str1, str2)
    Likeness.match(shingles(str1), shingles(str2))
  end # match

  alias_method :[], :match # a succinct alternative to .match

  # Cleans the string, splits it into words, then collects the progressive
  # shingles of each word.
  #
  # @param str [String]
  # @return [Array<String>] flat, sorted list of shingles; a word shorter
  #   than the shingle width is kept whole as a single shingle
  def shingles(str)
    @splitter.call(@cleaner.call(str))
             .reject(&:empty?)
             .flat_map { |word| word.length < @width ? [word] : (0..word.length - @width).map { |i| word[i, @width] } }
             .sort
  end # shingles

  # Pre-computes the shingles of one string so it can be compared against
  # many others without being re-shingled each time.
  #
  # @param str [String]
  # @return [Likeness::Subject]
  def subject(str)
    Subject.new(self, str)
  end # subject

  # A string whose shingles have been computed once, bound to the Likeness
  # instance whose settings are used for every later comparison.
  class Subject
    # @param compar [Likeness] comparer supplying the shingling rules
    # @param str [String] the string to hold as the fixed side of comparisons
    def initialize(compar, str)
      @compar = compar
      @subject = @compar.shingles(str)
    end # initialize

    # Compares the stored subject with another string.
    #
    # @param str [String]
    # @return [Float] similarity score from 0.0 to 1.0
    def match(str)
      Likeness.match(@subject, @compar.shingles(str))
    end # match

    alias_method :[], :match # a succinct alternative to .match
  end # Likeness::Subject::
end # Likeness::
|
79
|
+
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: likeness
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0.rc1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Clarke
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-08-19 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Powerful string similarity determination using the Shingles method.
|
14
|
+
email: a.andrew.clarke@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- likeness.rb
|
20
|
+
homepage: https://github.com/AndrewClarke/likeness
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">"
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.3.1
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.7.7
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: String Similarity method.
|
44
|
+
test_files: []
|