levenshtein_comparator 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/levenshtein_comparator.rb +159 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b9eb28e2308b2dfb0db9dc0fcff05fbed95d58e3
|
4
|
+
data.tar.gz: 3c767a18435998d0ee3337e9db42bcf6ab893d4c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 75b98f99eaf86a91a5a75fca343c9012254f04f3be0088933344071642796b9b5f0634c3b9d1c1bc7a485d6969d68cdfc8527deae15b4a24dce55281afe19f25
|
7
|
+
data.tar.gz: 25ab135e9ede8d6df2a6459e088896f8887f779f2b28e4c90ec2bbb1e83311e7ada4c04761341e1b602e49c59c90383e94f74b65dc5c490171257eb34ea1c402
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'levenshtein'
|
4
|
+
require 'htmlentities'
|
5
|
+
|
6
|
+
# Compares a reference string against candidate strings word by word.
#
# Both strings are normalised (HTML entities decoded, bracketed groups and
# "featuring" suffixes removed, accents transliterated, stop words dropped)
# and the resulting words are matched with a length-dependent Levenshtein
# tolerance.
class LevenshteinComparator
  # Normalised words of the reference string given to #initialize.
  attr_accessor :cleanified_strings

  # Words ignored when comparing (French and English articles/conjunctions).
  STOP_WORDS = %w[
    un une the le la les a an of du de des et and ne en au
  ].freeze

  # Innermost "(...)" or "[...]" group, i.e. one containing no nested bracket
  # of the same kind.
  BRACKETED_GROUP = /\([^()]*\)|\[[^\[\]]*\]/

  # "feat." / "featuring" as a whole word, followed by a space and the rest of
  # the line.
  FEATURING_SUFFIX = /\bfeat(?:\.|uring) .*/i

  # A dash that has at least two word characters on each side.
  WORD_SEPARATING_DASH = /(?<=\w{2})-(?=\w{2})/

  # Accented character classes and their ASCII transliteration.
  # Ligatures that are easily mangled by Unicode normalisation are written as
  # explicit \u escapes so they can never degrade into plain ASCII letters.
  ASCII_REGEXP_MAPPING = {
    /[ÄÀÁÂÃÅĀĄĂ]/ => 'A',
    /[âäàãáäåāăąǎǟǡǻȁȃȧẵặ]/ => 'a',
    /[Æ]/ => 'Ae',
    /[æ]/ => 'ae',
    /[ÇĆČĈĊ]/ => 'C',
    /[çćčĉċ]/ => 'c',
    /[ĎĐ]/ => 'D',
    /[ďđ]/ => 'd',
    /[ÈÉÊËĒĘĚĔĖ]/ => 'E',
    /[ëêéèẽēĕėẻȅȇẹȩęḙḛềếễểḕḗệḝ]/ => 'e',
    /[ƒ]/ => 'f',
    /[ĜĞĠĢ]/ => 'G',
    /[ĝğġģ]/ => 'g',
    /[ĤĦ]/ => 'H',
    /[ĥħ]/ => 'h',
    /[ÌÍÎÏĪĨĬĮİ]/ => 'I',
    /[ìíîĩīĭïỉǐịįȉȋḭɨḯ]/ => 'i',
    /[\u0132]/ => 'IJ', # IJ ligature only; plain "I" and "J" must stay untouched
    /[\u0133]/ => 'ij', # ij ligature
    /[Ĵ]/ => 'J',
    /[ĵ]/ => 'j',
    /[Ķ]/ => 'K',
    /[ķĸ]/ => 'k',
    /[ŁĽĹĻĿ]/ => 'L',
    /[łľĺļŀ]/ => 'l',
    /[ÑŃŇŅŊ]/ => 'N',
    /[ñńňņ\u0149ŋ]/ => 'n', # \u0149 is "n preceded by apostrophe"
    /[ÒÓÔÕØŌŐŎÖ]/ => 'O',
    /[òóôõōŏȯöỏőǒȍȏơǫọɵøồốỗổȱȫȭṍṏṑṓờớỡởợǭộǿ]/ => 'o',
    /[Œ]/ => 'OE',
    /[œ]/ => 'oe',
    /[ŔŘŖ]/ => 'R',
    /[ŕřŗ]/ => 'r',
    /[ŚŠŞŜȘ]/ => 'S',
    /[śšşŝș]/ => 's',
    /[ß]/ => 'ss',
    /[ŤŢŦȚ]/ => 'T',
    /[ťţŧț]/ => 't',
    /[ÜÙÚÛŪŮŰŬŨŲ]/ => 'U',
    /[ùúûũūŭüủůűǔȕȗưụṳųṷṵṹṻǖǜǘǖǚừứữửự]/ => 'u',
    /[Ŵ]/ => 'W',
    /[ŵ]/ => 'w',
    /[\u00DD\u0176\u0178]/ => 'Y', # uppercase counterparts of the 'y' class
    /[ỳýŷỹȳẏÿỷẙƴỵ]/ => 'y',
    /[ŹŽŻ]/ => 'Z',
    /[žżź]/ => 'z'
  }.freeze

  # @param s [String] the reference string later candidates are compared to
  def initialize(s)
    self.cleanified_strings = self.class.to_array(s)
  end

  # Removes every "(...)" and "[...]" group, including nested ones, and strips
  # surrounding whitespace. Text located between two separate groups is kept.
  #
  # @param s [String]
  # @return [String] a new string; +s+ is not modified
  def self.remove_parenthesis(s)
    res = s
    # Each pass removes the innermost groups, so nested groups disappear after
    # a few iterations; every pass shortens the string, so the loop terminates.
    res = res.gsub(BRACKETED_GROUP, '') while res =~ BRACKETED_GROUP
    res.strip
  end

  # Removes a trailing "feat. ..." / "featuring ..." credit (any letter case).
  # The keyword must start a word, so "Defeat. ..." is left alone.
  #
  # @param s [String]
  # @return [String] a new, stripped string
  def self.remove_featuring(s)
    s.gsub(FEATURING_SUFFIX, '').strip
  end

  # Transliterates accented characters to ASCII in place.
  #
  # @param s [String] mutated by this call
  # @return [String] the same object +s+
  def self.unaccent!(s)
    ASCII_REGEXP_MAPPING.each do |accented, ascii|
      s.gsub!(accented, ascii)
    end
    s
  end

  # Non-mutating variant of .unaccent!.
  #
  # @param s [String]
  # @return [String] a transliterated copy of +s+
  def self.unaccent(s)
    unaccent!(s.dup)
  end

  # Decodes HTML entities using the htmlentities gem.
  #
  # @param s [String]
  # @return [String]
  def self.decode_html_entities(s)
    HTMLEntities.new.decode(s)
  end

  # @param a [Array<String>] downcased words
  # @return [Array<String>] +a+ without the entries listed in STOP_WORDS
  def self.remove_stop_words(a)
    a - STOP_WORDS
  end

  # Cuts the string into an array of normalised words.
  # Two words separated by a dash (-) are considered as:
  #   1 word if the first or the second word is only 1 character
  #   2 words otherwise
  # Chains such as "rock-and-roll" are split at every qualifying dash.
  #
  # Words are reduced to ASCII letters and digits and downcased; words shorter
  # than two characters are dropped unless they contain a digit; stop words
  # are removed last.
  #
  # @param s [String]
  # @return [Array<String>]
  def self.to_array(s)
    words = clean(s).gsub(WORD_SEPARATING_DASH, ' ').split.map do |w|
      w.gsub(/[^A-Za-z0-9]/, '').downcase
    end
    words.reject! { |w| w.length < 2 && w !~ /\d/ }

    remove_stop_words(words)
  end

  # Applies, in order: HTML entity decoding, bracketed group removal,
  # featuring removal and accent transliteration.
  #
  # @param s [String]
  # @return [String]
  def self.clean(s)
    decoded = decode_html_entities(s)
    unaccent(remove_featuring(remove_parenthesis(decoded)))
  end

  # Compares +pattern+ with the reference string.
  #
  # Every reference word may consume at most one word of the pattern. The
  # reference words are not modified, so the same comparator can be reused for
  # several patterns and always gives the same answer for the same pattern.
  #
  # @param pattern [String] candidate string
  # @return [Symbol] :ok when every reference word was matched, :almost when
  #   only some were, :ko when none was (or when there is no reference word)
  def compare(pattern)
    guesses = self.class.to_array(pattern)

    unmatched = cleanified_strings.reject do |word|
      index = guesses.index { |guess| word_matches?(word, guess) }
      next false unless index

      # Only consume one of the guesses so duplicates must be matched twice.
      guesses.delete_at(index)
      true
    end

    return :ko if unmatched.size == cleanified_strings.size

    unmatched.empty? ? :ok : :almost
  end

  private

  # Tells whether +guess+ is close enough to +word+.
  # Reference words containing a digit must match exactly; otherwise the
  # allowed Levenshtein distance grows with the length of the shorter word
  # (2 above four characters, 1 above two characters, 0 otherwise).
  def word_matches?(word, guess)
    return guess == word if word =~ /\d/

    shortest = [guess.length, word.length].min
    if shortest > 4
      Levenshtein.distance(guess, word) <= 2
    elsif shortest > 2
      Levenshtein.distance(guess, word) <= 1
    else
      guess == word
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein_comparator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stéphane Akkaoui
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-11-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Levenshtein Comparator allows you to compare two sentences and say if
|
14
|
+
there is a match, almost a match or nothing to compare.
|
15
|
+
email: sakkaoui@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/levenshtein_comparator.rb
|
21
|
+
homepage: https://github.com/meuble/levenshtein_comparator
|
22
|
+
licenses:
|
23
|
+
- WTFPL
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.5.1
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: A string comparator using Damerau-Levenshtein distance
|
45
|
+
test_files: []
|