likeness 1.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/likeness.rb +79 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 27ccde505442d156b4a2a80a493c05b8cfa88ca2210d05cd6c473c991f7bf09b
|
4
|
+
data.tar.gz: b2e0173619708676cd1cc0a92936d716417eded6653785f4d37007334163600f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3a1428413c19c5e97d4c58cd211dcb2940fed6d943f7958251741b234b4d46187fba2ed5eedfb846d87766b0364469a8ad988266f89bf7229975248335ffb843
|
7
|
+
data.tar.gz: 6eb7515e3c599a4a5b8aef05ca724a8dc8d9aff924cef2fb35b06d853624504ed41cbf64cb21ec6d189ab90299239ae296230add48e00d09cdb484b541b90c0c
|
data/likeness.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
# String similarity scoring based on character shingles (fixed-width substrings).
#
# Each string is canonicalized, split into words, and every word is broken into
# overlapping shingles. Two strings are scored by how many shingles they share:
# score = 2 * shared / (count1 + count2), ranging from 0.0 to 1.0.
#
#   likeness = Likeness.new
#   likeness.match("night", "nacht")   # => 0.25
#   likeness["night", "nacht"]         # => 0.25 (same thing)
class Likeness
  # Builds a comparer that can be reused to compare many strings.
  #
  # options[:width]    - size of the shingles extracted from each word (default 2).
  # options[:splitter] - how to split a cleaned string into words. Either a pattern
  #                      (Regexp or String, handed to String#split) or any object
  #                      responding to #call that returns the list of words.
  #                      Defaults to /\W+/, which basically only leaves alpha-num;
  #                      pass /\s+/ to split on strict whitespace instead.
  # cleaner            - optional canonicalizing block applied to each string before
  #                      splitting. The default lower-cases and drops apostrophes
  #                      that follow a word character. Pass { |s| s } for no
  #                      canonicalizing at all, or (&:downcase) to keep apostrophes.
  def initialize(options = {}, &cleaner)
    @width = options[:width] || 2

    # A pattern is wrapped in a lambda so that shingles() can always use #call.
    splitter = options[:splitter] || /\W+/
    @splitter = splitter.respond_to?(:call) ? splitter : lambda { |str| str.split(splitter) }

    # Honor the caller's block; only fall back to the default when none was given.
    @cleaner = cleaner || lambda { |str| str.downcase.gsub(/(?<=\w)'/, '') }
  end

  # Scores two SORTED shingle lists (as returned by #shingles).
  #
  # Returns a Float from 0.0 (nothing shared) to 1.0 (identical lists).
  # Two empty lists match perfectly. NOTE: strings full of separators also
  # produce empty lists.
  def self.match(shingles1, shingles2)
    len1 = shingles1.length
    len2 = shingles2.length
    total = len1 + len2
    return 1.0 if total.zero?

    # Both lists are sorted, so step through them together and count the shingles
    # they have in common. Duplicates only count as often as they appear in both.
    i1 = i2 = shared = 0
    while i1 < len1 && i2 < len2
      a = shingles1[i1]
      b = shingles2[i2]
      if a < b
        i1 += 1 # shingles1 is behind; let it catch up
      elsif a > b
        i2 += 1 # shingles2 is behind; let it catch up
      else
        i1 += 1
        i2 += 1
        shared += 1
      end
    end

    (2.0 * shared) / total
  end

  # Scores the similarity of two strings, from 0.0 to 1.0.
  def match(str1, str2)
    Likeness.match(shingles(str1), shingles(str2))
  end

  alias_method :[], :match # a succinct alternative to .match

  # Converts a string into a flat, sorted list of shingles.
  #
  # The string is cleaned, split into words, and empty words are dropped. A word
  # shorter than the shingle width is kept whole; any other word contributes every
  # substring of exactly @width characters.
  def shingles(str)
    words = @splitter.call(@cleaner.call(str)).reject(&:empty?)
    words.flat_map do |word|
      if word.length < @width
        [word]
      else
        (0..word.length - @width).map { |i| word[i, @width] }
      end
    end.sort
  end

  # Returns a Likeness::Subject that holds the shingles of str, so that it can be
  # compared against many other strings without recomputing them each time.
  def subject(str)
    Likeness::Subject.new(self, str)
  end

  # A string whose shingles were computed once, up front, by a given comparer.
  class Subject
    # compar - the Likeness instance used to shingle both the subject and candidates.
    # str    - the subject string.
    def initialize(compar, str)
      @compar = compar
      @subject = @compar.shingles(str)
    end

    # Scores the similarity of str against the subject, from 0.0 to 1.0.
    def match(str)
      Likeness.match(@subject, @compar.shingles(str))
    end

    alias_method :[], :match # a succinct alternative to .match
  end
end
|
79
|
+
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: likeness
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0.rc1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Clarke
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-08-19 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Powerful string similarity determination using the Shingles method.
|
14
|
+
email: a.andrew.clarke@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- likeness.rb
|
20
|
+
homepage: https://github.com/AndrewClarke/likeness
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">"
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.3.1
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.7.7
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: String Similarity method.
|
44
|
+
test_files: []
|