levenshtein_comparator 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/levenshtein_comparator.rb +159 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b9eb28e2308b2dfb0db9dc0fcff05fbed95d58e3
|
4
|
+
data.tar.gz: 3c767a18435998d0ee3337e9db42bcf6ab893d4c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 75b98f99eaf86a91a5a75fca343c9012254f04f3be0088933344071642796b9b5f0634c3b9d1c1bc7a485d6969d68cdfc8527deae15b4a24dce55281afe19f25
|
7
|
+
data.tar.gz: 25ab135e9ede8d6df2a6459e088896f8887f779f2b28e4c90ec2bbb1e83311e7ada4c04761341e1b602e49c59c90383e94f74b65dc5c490171257eb34ea1c402
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'levenshtein'
|
4
|
+
require 'htmlentities'
|
5
|
+
|
6
|
+
# Fuzzy, word-by-word comparison of two short sentences (e.g. titles).
#
# Both sentences are normalized with {.to_array} (HTML entities decoded,
# bracketed parts and "feat." suffixes removed, accents stripped, split
# into lowercase alphanumeric words, stop words dropped). {#compare} then
# checks how many words of the reference sentence can be paired with a
# word of the candidate sentence, tolerating small Levenshtein distances.
class LevenshteinComparator
  # Normalized words of the reference sentence given to {#initialize}.
  attr_accessor :cleanified_strings

  # Words ignored by {.remove_stop_words} (French and English articles,
  # prepositions and conjunctions).
  STOP_WORDS = [
    "un",
    "une",
    "the",
    "le",
    "la",
    "les",
    "a",
    "an",
    "of",
    "du",
    "de",
    "des",
    "et",
    "and",
    "ne",
    "en",
    "au"
  ]

  # Maps a character class of accented letters / ligatures to its ASCII
  # replacement. Applied in order by {.unaccent!}.
  ASCII_REGEXP_MAPPING = {
    /[ÄÀÁÂÃÅĀĄĂ]/ => 'A',
    /[âäàãáäåāăąǎǟǡǻȁȃȧẵặ]/ => 'a',
    /[Æ]/ => 'Ae',
    /[æ]/ => 'ae',
    /[ÇĆČĈĊ]/ => 'C',
    /[çćčĉċ]/ => 'c',
    /[ĎĐ]/ => 'D',
    /[ďđ]/ => 'd',
    /[ÈÉÊËĒĘĚĔĖ]/ => 'E',
    /[ëêéèẽēĕėẻȅȇẹȩęḙḛềếễểḕḗệḝ]/ => 'e',
    /[ƒ]/ => 'f',
    /[ĜĞĠĢ]/ => 'G',
    /[ĝğġģ]/ => 'g',
    /[ĤĦ]/ => 'H',
    /[ĥħ]/ => 'h',
    /[ÌÍÎÏĪĨĬĮİ]/ => 'I',
    /[ìíîĩīĭïỉǐịįȉȋḭɨḯ]/ => 'i',
    # U+0132 / U+0133 are the single-code-point IJ ligatures. They are
    # written as escapes because a literal two-letter class [IJ] would
    # rewrite every plain "I" or "J" to "IJ".
    /\u0132/ => 'IJ',
    /\u0133/ => 'ij',
    /[Ĵ]/ => 'J',
    /[ĵ]/ => 'j',
    /[Ķ]/ => 'K',
    /[ķĸ]/ => 'k',
    /[ŁĽĹĻĿ]/ => 'L',
    /[łľĺļŀ]/ => 'l',
    /[ÑŃŇŅŊ]/ => 'N',
    # U+0149 is the single-code-point "n preceded by apostrophe"; escaped
    # so that a lone modifier apostrophe is not turned into "n".
    /[ñńňņ\u0149ŋ]/ => 'n',
    /[ÒÓÔÕØŌŐŎÖ]/ => 'O',
    /[òóôõōŏȯöỏőǒȍȏơǫọɵøồốỗổȱȫȭṍṏṑṓờớỡởợǭộǿ]/ => 'o',
    /[Œ]/ => 'OE',
    /[œ]/ => 'oe',
    /[ŔŘŖ]/ => 'R',
    /[ŕřŗ]/ => 'r',
    /[ŚŠŞŜȘ]/ => 'S',
    /[śšşŝș]/ => 's',
    /[ß]/ => 'ss',
    /[ŤŢŦȚ]/ => 'T',
    /[ťţŧț]/ => 't',
    /[ÜÙÚÛŪŮŰŬŨŲ]/ => 'U',
    /[ùúûũūŭüủůűǔȕȗưụṳųṷṵṹṻǖǜǘǖǚừứữửự]/ => 'u',
    /[Ŵ]/ => 'W',
    /[ŵ]/ => 'w',
    /[ỳýŷỹȳẏÿỷẙƴỵ]/ => 'y',
    /[ŹŽŻ]/ => 'Z',
    /[žżź]/ => 'z'
  }

  # @param s [String] the reference sentence; stored normalized in
  #   {#cleanified_strings}
  def initialize(s)
    self.cleanified_strings = self.class.to_array(s)
  end

  # Removes every "(...)" / "[...]" group from +s+ and strips the result.
  #
  # Innermost groups are removed repeatedly, so several separate groups
  # and nested groups are all dropped while the text between two separate
  # groups is kept. Unbalanced brackets are left untouched.
  #
  # @param s [String]
  # @return [String] a new string; +s+ is not modified
  def self.remove_parenthesis(s)
    res = s.dup
    # gsub! returns nil once nothing is left to remove, which ends the loop.
    nil while res.gsub!(/[\(\[][^\(\)\[\]]*[\)\]]/, '')
    res.strip
  end

  # Removes a trailing "feat. ..." / "featuring ..." credit and strips the
  # result. The keyword must start a word, so e.g. "Defeat. " is not cut.
  #
  # @param s [String]
  # @return [String] a new string
  def self.remove_featuring(s)
    res = s.gsub(/\b[fF]eat(\.|uring) .*/, '')
    res.strip
  end

  # Replaces accented characters in +s+ IN PLACE using
  # ASCII_REGEXP_MAPPING.
  #
  # @param s [String] mutated by this call
  # @return [String] the same object +s+
  def self.unaccent!(s)
    ASCII_REGEXP_MAPPING.each do |key, value|
      s.gsub! key, value
    end
    s
  end

  # Non-destructive variant of {.unaccent!}.
  #
  # @param s [String]
  # @return [String] an unaccented copy of +s+
  def self.unaccent(s)
    self.unaccent!(s.dup)
  end

  # Decodes HTML entities with the htmlentities gem.
  #
  # @param s [String]
  # @return [String]
  def self.decode_html_entities(s)
    HTMLEntities.new.decode(s)
  end

  # @param a [Array<String>] lowercase words
  # @return [Array<String>] +a+ without any word listed in STOP_WORDS
  def self.remove_stop_words(a)
    a - STOP_WORDS
  end

  # Cut the string into an array of words.
  # Two words separated by a dash (-) should be considered as :
  #   1 word if the first or the second word is only 1 character
  #   2 words otherwise
  # The lookahead leaves the right-hand word unconsumed, so every dash of a
  # chain such as "rock-and-roll" is examined, not just the first one.
  #
  # Each word is reduced to [A-Za-z0-9] and downcased; words shorter than
  # two characters are dropped unless they contain a digit; stop words are
  # removed last.
  #
  # @param s [String]
  # @return [Array<String>]
  def self.to_array(s)
    s = self.clean(s)

    words = s.gsub(/\b(\w{2,})-(?=\w{2,}\b)/, '\1 ').split.map do |w|
      w.gsub(/[^A-Za-z0-9]/, '').downcase
    end
    words = words.reject { |w| w.length < 2 && w !~ /\d/ }

    self.remove_stop_words(words)
  end

  # Runs the text clean-up pipeline: decode HTML entities, remove bracketed
  # groups, remove featuring credits, strip accents (in that order).
  #
  # @param s [String]
  # @return [String]
  def self.clean(s)
    decoded = self.decode_html_entities(s)
    without_groups = self.remove_parenthesis(decoded)
    self.unaccent(self.remove_featuring(without_groups))
  end

  # Compares +pattern+ against the reference sentence.
  #
  # Each reference word is paired with the first not-yet-used candidate
  # word that matches it (see #words_match?). The comparator's own state is
  # left untouched, so the same instance can be compared repeatedly.
  #
  # @param pattern [String] the candidate sentence
  # @return [Symbol] +:ok+ when every reference word was matched,
  #   +:almost+ when only some were, +:ko+ when none were (including when
  #   the reference has no words at all)
  def compare(pattern)
    candidates = self.class.to_array(pattern)
    # Work on a copy: deleting from cleanified_strings itself would make a
    # second call see only the leftovers of the first one.
    remaining = cleanified_strings.dup

    remaining.delete_if do |word|
      index = candidates.index { |guess| words_match?(word, guess) }
      # Consume the candidate so it cannot satisfy two reference words.
      candidates.delete_at(index) if index
    end

    if remaining.size == cleanified_strings.size
      :ko
    elsif remaining.empty?
      :ok
    else
      :almost
    end
  end

  private

  # Tells whether candidate word +guess+ is close enough to reference word
  # +word+: words containing a digit must be identical; two words longer
  # than 4 characters may differ by a distance of 2; two words longer than
  # 2 characters by a distance of 1; anything shorter must be identical.
  def words_match?(word, guess)
    return guess == word if word =~ /\d+/

    if guess.length > 4 && word.length > 4
      Levenshtein.distance(guess, word) <= 2
    elsif guess.length > 2 && word.length > 2
      Levenshtein.distance(guess, word) <= 1
    else
      guess == word
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein_comparator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stéphane Akkaoui
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-11-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Levenshtein Comparator allows you to compare two sentences and say if
|
14
|
+
there is a match, almost a match or nothing to compare.
|
15
|
+
email: sakkaoui@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/levenshtein_comparator.rb
|
21
|
+
homepage: https://github.com/meuble/levenshtein_comparator
|
22
|
+
licenses:
|
23
|
+
- WTFPL
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.5.1
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: A string comparator using Damerau-Levenshtein distance
|
45
|
+
test_files: []
|