likeness 1.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/likeness.rb +79 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 27ccde505442d156b4a2a80a493c05b8cfa88ca2210d05cd6c473c991f7bf09b
4
+ data.tar.gz: b2e0173619708676cd1cc0a92936d716417eded6653785f4d37007334163600f
5
+ SHA512:
6
+ metadata.gz: 3a1428413c19c5e97d4c58cd211dcb2940fed6d943f7958251741b234b4d46187fba2ed5eedfb846d87766b0364469a8ad988266f89bf7229975248335ffb843
7
+ data.tar.gz: 6eb7515e3c599a4a5b8aef05ca724a8dc8d9aff924cef2fb35b06d853624504ed41cbf64cb21ec6d189ab90299239ae296230add48e00d09cdb484b541b90c0c
@@ -0,0 +1,79 @@
1
+
2
class Likeness
  # Builds a comparer that scores how alike two strings are using shingles
  # (fixed-width substrings taken from each word).
  #
  # options[:width]    - shingle size extracted from each word (default 2).
  # options[:splitter] - a Regexp handed to String#split (default /\W+/), or any
  #                      object responding to #call that turns a string into a
  #                      list of words. Pass /\s+/ to split on strict whitespace;
  #                      the default basically only leaves alpha-num.
  # &cleaner           - optional canonicalizing block applied to the string
  #                      before splitting. The default lower-cases and drops any
  #                      apostrophe that follows a word character. Pass
  #                      { |s| s } for no lower-casing, or (&:downcase) to keep
  #                      apostrophes.
  def initialize(options = {}, &cleaner)
    @width = options[:width] || 2

    # Keep the raw option in a local so the lambda can close over the pattern.
    splitter = options[:splitter] || /\W+/
    @splitter = splitter.is_a?(Regexp) ? ->(str) { str.split(splitter) } : splitter

    # Honor the caller's block; only fall back to the default when none was given.
    # (Previously the block was accepted but never stored, so it was ignored.)
    @cleaner = cleaner || ->(str) { str.downcase.gsub(/(?<=\w)'/, '') }
  end

  # Scores two SORTED shingle lists, as produced by #shingles.
  #
  # Returns a Float from 0.0 to 1.0: twice the number of shared shingles divided
  # by the combined length of both lists. Two empty lists score 1.0.
  # NOTE: strings made up only of separators also yield empty lists.
  def self.match(shingles1, shingles2)
    len1 = shingles1.length
    len2 = shingles2.length
    total = len1 + len2
    return 1.0 if total.zero?

    # Both lists are sorted, so walk them in step: advance whichever side is
    # behind, and count a hit (advancing both) when the current shingles agree.
    i1 = i2 = common = 0
    while i1 < len1 && i2 < len2
      left = shingles1[i1]
      right = shingles2[i2]
      if left < right
        i1 += 1
      elsif left > right
        i2 += 1
      else
        i1 += 1
        i2 += 1
        common += 1
      end
    end

    (2.0 * common) / total
  end

  # Scores the similarity of two strings (0.0 to 1.0) by shingling each one
  # and comparing the resulting lists.
  def match(str1, str2)
    Likeness.match(shingles(str1), shingles(str2))
  end

  alias_method :[], :match # a succinct alternative to .match

  # Cleans str, splits it into words, and returns a flat sorted list of the
  # @width-sized shingles of every word. A word shorter than @width is kept
  # whole as its own shingle; empty words are discarded.
  def shingles(str)
    @splitter.call(@cleaner.call(str))
             .reject(&:empty?)
             .flat_map { |word| word.length < @width ? word : (0..word.length - @width).map { |i| word[i, @width] } }
             .sort
  end

  # Returns a Likeness::Subject bound to this comparer and str, so str is
  # shingled once and can then be matched against many other strings.
  def subject(str)
    Likeness::Subject.new(self, str)
  end

  # A string whose shingles were computed once up front, for repeated matching.
  class Subject
    # compar - the Likeness instance used to shingle strings.
    # str    - the subject string; its shingles are computed here and reused.
    def initialize(compar, str)
      @compar = compar
      @subject = @compar.shingles(str)
    end

    # Scores str against the stored subject (0.0 to 1.0).
    def match(str)
      Likeness.match(@subject, @compar.shingles(str))
    end

    alias_method :[], :match # a succinct alternative to .match
  end # Likeness::Subject::
end # Likeness::
79
+
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: likeness
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.rc1
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Clarke
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-08-19 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Powerful string similarity determination using the Shingles method.
14
+ email: a.andrew.clarke@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - likeness.rb
20
+ homepage: https://github.com/AndrewClarke/likeness
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">"
36
+ - !ruby/object:Gem::Version
37
+ version: 1.3.1
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.7.7
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: String Similarity method.
44
+ test_files: []