levenshtein_comparator 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/levenshtein_comparator.rb +159 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b9eb28e2308b2dfb0db9dc0fcff05fbed95d58e3
4
+ data.tar.gz: 3c767a18435998d0ee3337e9db42bcf6ab893d4c
5
+ SHA512:
6
+ metadata.gz: 75b98f99eaf86a91a5a75fca343c9012254f04f3be0088933344071642796b9b5f0634c3b9d1c1bc7a485d6969d68cdfc8527deae15b4a24dce55281afe19f25
7
+ data.tar.gz: 25ab135e9ede8d6df2a6459e088896f8887f779f2b28e4c90ec2bbb1e83311e7ada4c04761341e1b602e49c59c90383e94f74b65dc5c490171257eb34ea1c402
@@ -0,0 +1,159 @@
1
+ # coding: utf-8
2
+
3
+ require 'levenshtein'
4
+ require 'htmlentities'
5
+
6
+ class LevenshteinComparator
7
+ attr_accessor :cleanified_strings
8
+
9
+ STOP_WORDS = [
10
+ "un",
11
+ "une",
12
+ "the",
13
+ "le",
14
+ "la",
15
+ "les",
16
+ "a",
17
+ "an",
18
+ "of",
19
+ "du",
20
+ "de",
21
+ "des",
22
+ "et",
23
+ "and",
24
+ "ne",
25
+ "en",
26
+ "au"
27
+ ]
28
+
29
+ ASCII_REGEXP_MAPPING = {
30
+ /[ÄÀÁÂÃÅĀĄĂ]/ => 'A',
31
+ /[âäàãáäåāăąǎǟǡǻȁȃȧẵặ]/ => 'a',
32
+ /[Æ]/ => 'Ae',
33
+ /[æ]/ => 'ae',
34
+ /[ÇĆČĈĊ]/ => 'C',
35
+ /[çćčĉċ]/ => 'c',
36
+ /[ĎĐ]/ => 'D',
37
+ /[ďđ]/ => 'd',
38
+ /[ÈÉÊËĒĘĚĔĖ]/ =>'E',
39
+ /[ëêéèẽēĕėẻȅȇẹȩęḙḛềếễểḕḗệḝ]/ => 'e',
40
+ /[ƒ]/ => 'f',
41
+ /[ĜĞĠĢ]/ => 'G',
42
+ /[ĝğġģ]/ => 'g',
43
+ /[ĤĦ]/ => 'H',
44
+ /[ĥħ]/ => 'h',
45
+ /[ÌÍÎÏĪĨĬĮİ]/ => 'I',
46
+ /[ìíîĩīĭïỉǐịįȉȋḭɨḯ]/ => 'i',
47
+ /[IJ]/ => 'IJ',
48
+ /[Ĵ]/ => 'J',
49
+ /[ĵ]/ => 'j',
50
+ /[Ķ]/ => 'K',
51
+ /[ķĸ]/ => 'k',
52
+ /[ŁĽĹĻĿ]/ => 'L',
53
+ /[łľĺļŀ]/ => 'l',
54
+ /[ÑŃŇŅŊ]/ => 'N',
55
+ /[ñńňņʼnŋ]/ => 'n',
56
+ /[ÒÓÔÕØŌŐŎÖ]/ => 'O',
57
+ /[òóôõōŏȯöỏőǒȍȏơǫọɵøồốỗổȱȫȭṍṏṑṓờớỡởợǭộǿ]/ => 'o',
58
+ /[Œ]/ => 'OE',
59
+ /[œ]/ => 'oe',
60
+ /[ŔŘŖ]/ =>'R',
61
+ /[ŕřŗ]/ =>'r',
62
+ /[ŚŠŞŜȘ]/ => 'S',
63
+ /[śšşŝș]/ => 's',
64
+ /[ß]/ => 'ss',
65
+ /[ŤŢŦȚ]/ => 'T',
66
+ /[ťţŧț]/ => 't',
67
+ /[ÜÙÚÛŪŮŰŬŨŲ]/ => 'U',
68
+ /[ùúûũūŭüủůűǔȕȗưụṳųṷṵṹṻǖǜǘǖǚừứữửự]/ => 'u',
69
+ /[Ŵ]/ => 'W',
70
+ /[ŵ]/ => 'w',
71
+ /[ỳýŷỹȳẏÿỷẙƴỵ]/ => 'y',
72
+ /[ŹŽŻ]/ =>'Z',
73
+ /[žżź]/ =>'z'
74
+ }
75
+
76
+ def initialize(s)
77
+ self.cleanified_strings = self.class.to_array(s)
78
+ end
79
+
80
+ def self.remove_parenthesis(s)
81
+ res = s.gsub(/([\(\[].*[\)\]])/, '')
82
+ res.strip
83
+ end
84
+
85
+ def self.remove_featuring(s)
86
+ res = s.gsub(/([fF]eat(\.|uring) .*)/, '')
87
+ res.strip
88
+ end
89
+
90
+ def self.unaccent!(s)
91
+ ASCII_REGEXP_MAPPING.each do |key, value|
92
+ s.gsub! key, value
93
+ end
94
+ s
95
+ end
96
+
97
+ def self.unaccent(s)
98
+ self.unaccent!(s.dup)
99
+ end
100
+
101
+ def self.decode_html_entities(s)
102
+ HTMLEntities.new.decode(s)
103
+ end
104
+
105
+ def self.remove_stop_words(a)
106
+ a - STOP_WORDS
107
+ end
108
+
109
+ # Cut the string into an array of words
110
+ # Two words separated by a dash (-) should be considered as :
111
+ # 1 word if the first or the second word is only 1 character
112
+ # 2 words otherwise
113
+ def self.to_array(s)
114
+ s = self.clean(s)
115
+
116
+ arr = s.gsub(/\b(\w{2,})-(\w{2,})\b/, '\1 \2').split.map do |w|
117
+ w.gsub(/[^A-Za-z0-9]/, '').downcase
118
+ end.delete_if do |w|
119
+ w.length < 2 && w !~ /\d/
120
+ end
121
+
122
+ self.remove_stop_words(arr)
123
+ end
124
+
125
+ def self.clean(s)
126
+ self.unaccent(
127
+ self.remove_featuring(
128
+ self.remove_parenthesis(
129
+ self.decode_html_entities(s)
130
+ )
131
+ )
132
+ )
133
+ end
134
+
135
+ def compare(pattern)
136
+ pattern = self.class.to_array(pattern)
137
+
138
+ size = cleanified_strings.size
139
+ cleanified_strings.delete_if do |word|
140
+ matched_word = pattern.find do |guess|
141
+ if word =~ /\d+/
142
+ guess == word
143
+ else
144
+ if guess.length > 4 and word.length > 4
145
+ Levenshtein.distance(guess, word) <= 2
146
+ elsif guess.length > 2 and word.length > 2
147
+ Levenshtein.distance(guess, word) <= 1
148
+ else
149
+ guess == word
150
+ end
151
+ end
152
+ end
153
+ # only deleting one of the words
154
+ pattern.delete_at(pattern.index(matched_word)) if matched_word
155
+ end
156
+ size != cleanified_strings.size ? cleanified_strings.size == 0 ? :ok : :almost : :ko
157
+ end
158
+
159
+ end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein_comparator
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Stéphane Akkaoui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Levenshtein Comparator allows you to compare two sentences and say if
14
+ there is a match, almost a match or nothing to compare.
15
+ email: sakkaoui@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/levenshtein_comparator.rb
21
+ homepage: https://github.com/meuble/levenshtein_comparator
22
+ licenses:
23
+ - WTFPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.5.1
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: A string comparator using Damerau-Levenshtein distance
45
+ test_files: []