likeness 1.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/likeness.rb +79 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 27ccde505442d156b4a2a80a493c05b8cfa88ca2210d05cd6c473c991f7bf09b
4
+ data.tar.gz: b2e0173619708676cd1cc0a92936d716417eded6653785f4d37007334163600f
5
+ SHA512:
6
+ metadata.gz: 3a1428413c19c5e97d4c58cd211dcb2940fed6d943f7958251741b234b4d46187fba2ed5eedfb846d87766b0364469a8ad988266f89bf7229975248335ffb843
7
+ data.tar.gz: 6eb7515e3c599a4a5b8aef05ca724a8dc8d9aff924cef2fb35b06d853624504ed41cbf64cb21ec6d189ab90299239ae296230add48e00d09cdb484b541b90c0c
@@ -0,0 +1,79 @@
1
+
2
# String similarity scoring based on shingles (fixed-width character n-grams).
#
# Each string is canonicalized by a "cleaner", cut into words by a "splitter",
# and every word is expanded into its overlapping substrings of +width+
# characters. Two strings are then scored by how many shingles they share.
#
#   likeness = Likeness.new
#   likeness.match("night", "nacht")   # => 0.25
#   likeness["Hello", "hello"]         # => 1.0
class Likeness
  # Builds a comparer that can be reused to compare many strings.
  #
  # @param options [Hash]
  # @option options [Integer] :width (2) number of characters per shingle.
  # @option options [Regexp, String, #call] :splitter (/\W+/) how to cut a
  #   cleaned string into words. A Regexp or String is handed to String#split;
  #   anything responding to +call+ receives the cleaned string and must return
  #   the list of words. Pass /\s+/ to split on strict whitespace only.
  # @yield [str] optional canonicalizing block applied to every string before
  #   splitting. Defaults to lower-casing and dropping apostrophes that follow
  #   a word character ("Don't" => "dont"). Pass { |s| s } to disable cleaning,
  #   or (&:downcase) to keep apostrophes.
  def initialize(options = {}, &cleaner)
    @width = options[:width] || 2

    # Normalize the splitter to a callable so #shingles has a single code path.
    splitter = options[:splitter] || /\W+/
    @splitter = splitter.respond_to?(:call) ? splitter : lambda { |str| str.split(splitter) }

    # Honor the caller's block when one is given; the previous `@cleaner ||= ...`
    # never looked at the block, so the default was always installed.
    @cleaner = cleaner || lambda { |str| str.downcase.gsub(/(?<=\w)'/, '') }
  end

  # Scores two shingle lists as 2 * shared / (size1 + size2).
  #
  # Both lists must be sorted ascending (as returned by #shingles); they are
  # walked in a single merge-style pass. Duplicates are matched pairwise.
  #
  # @param shingles1 [Array<String>] sorted shingles of the first string
  # @param shingles2 [Array<String>] sorted shingles of the second string
  # @return [Float] 0.0 (nothing shared) to 1.0 (identical shingle lists)
  def self.match(shingles1, shingles2)
    size1 = shingles1.length
    size2 = shingles2.length
    total = size1 + size2

    # Two empty lists match perfectly. NOTE: strings made only of separators
    # also produce empty lists.
    return 1.0 if total.zero?

    i1 = i2 = shared = 0
    while i1 < size1 && i2 < size2
      order = shingles1[i1] <=> shingles2[i2]
      if order < 0
        i1 += 1 # shingles1 is behind; advance it
      elsif order > 0
        i2 += 1 # shingles2 is behind; advance it
      else
        # Same shingle on both sides: count it and consume both.
        i1 += 1
        i2 += 1
        shared += 1
      end
    end

    (2.0 * shared) / total
  end

  # Scores the similarity of two raw strings.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [Float] 0.0 to 1.0
  def match(str1, str2)
    Likeness.match(shingles(str1), shingles(str2))
  end

  alias_method :[], :match # a succinct alternative to .match

  # Cleans +str+, splits it into words and expands each word into its shingles.
  # Words shorter than the configured width are kept whole as a single shingle.
  #
  # @param str [String]
  # @return [Array<String>] flat, sorted list of shingles (duplicates kept)
  def shingles(str)
    words = @splitter.call(@cleaner.call(str)).reject(&:empty?)

    words.flat_map do |word|
      if word.length < @width
        word
      else
        0.upto(word.length - @width).map { |start| word[start, @width] }
      end
    end.sort
  end

  # Pre-computes the shingles of +str+ so it can be compared against many
  # other strings without being re-processed each time.
  #
  # @param str [String]
  # @return [Likeness::Subject]
  def subject(str)
    Likeness::Subject.new(self, str)
  end

  # A fixed string, already shingled, bound to the Likeness that produced it.
  class Subject
    # @param compar [Likeness] comparer whose settings are used for shingling
    # @param str [String] the string to compare others against
    def initialize(compar, str)
      @compar = compar
      @subject = @compar.shingles(str)
    end

    # Scores +str+ against the stored subject.
    #
    # @param str [String]
    # @return [Float] 0.0 to 1.0
    def match(str)
      Likeness.match(@subject, @compar.shingles(str))
    end

    alias_method :[], :match # a succinct alternative to .match
  end
end
79
+
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: likeness
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.rc1
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Clarke
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-08-19 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Powerful string similarity determination using the Shingles method.
14
+ email: a.andrew.clarke@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - likeness.rb
20
+ homepage: https://github.com/AndrewClarke/likeness
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">"
36
+ - !ruby/object:Gem::Version
37
+ version: 1.3.1
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.7.7
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: String Similarity method.
44
+ test_files: []