levenshtein_comparator 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/levenshtein_comparator.rb +159 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b9eb28e2308b2dfb0db9dc0fcff05fbed95d58e3
4
+ data.tar.gz: 3c767a18435998d0ee3337e9db42bcf6ab893d4c
5
+ SHA512:
6
+ metadata.gz: 75b98f99eaf86a91a5a75fca343c9012254f04f3be0088933344071642796b9b5f0634c3b9d1c1bc7a485d6969d68cdfc8527deae15b4a24dce55281afe19f25
7
+ data.tar.gz: 25ab135e9ede8d6df2a6459e088896f8887f779f2b28e4c90ec2bbb1e83311e7ada4c04761341e1b602e49c59c90383e94f74b65dc5c490171257eb34ea1c402
@@ -0,0 +1,159 @@
1
+ # coding: utf-8
2
+
3
+ require 'levenshtein'
4
+ require 'htmlentities'
5
+
6
# Compares a reference sentence with guesses, word by word, tolerating small
# typos through the Levenshtein distance.
#
# The reference sentence and every guess go through the same normalisation
# (see LevenshteinComparator.to_array): HTML entities are decoded, bracketed
# groups and "feat." credits are dropped, accents are transliterated to
# ASCII, and short words and stop words are discarded.
#
# Relies on the HTMLEntities and Levenshtein constants being loaded by the
# surrounding file.
class LevenshteinComparator
  # Normalised words of the reference sentence that are still unmatched.
  # NOTE: #compare removes the words it matches from this array.
  attr_accessor :cleanified_strings

  # Articles and link words (French and English) ignored when comparing.
  STOP_WORDS = %w[
    un une the le la les a an of du de des et and ne en au
  ].freeze

  # Transliteration table applied in order by unaccent!: every character
  # matched by the key is replaced with the ASCII value.
  ASCII_REGEXP_MAPPING = {
    /[ÄÀÁÂÃÅĀĄĂ]/ => 'A',
    /[âäàãáäåāăąǎǟǡǻȁȃȧẵặ]/ => 'a',
    /[Æ]/ => 'Ae',
    /[æ]/ => 'ae',
    /[ÇĆČĈĊ]/ => 'C',
    /[çćčĉċ]/ => 'c',
    /[ĎĐ]/ => 'D',
    /[ďđ]/ => 'd',
    /[ÈÉÊËĒĘĚĔĖ]/ => 'E',
    /[ëêéèẽēĕėẻȅȇẹȩęḙḛềếễểḕḗệḝ]/ => 'e',
    /[ƒ]/ => 'f',
    /[ĜĞĠĢ]/ => 'G',
    /[ĝğġģ]/ => 'g',
    /[ĤĦ]/ => 'H',
    /[ĥħ]/ => 'h',
    /[ÌÍÎÏĪĨĬĮİ]/ => 'I',
    /[ìíîĩīĭïỉǐịįȉȋḭɨḯ]/ => 'i',
    # U+0132 / U+0133 are the IJ ligatures. They are written as escapes so
    # the pattern can never degrade into the plain ASCII letters "I" and "J".
    /\u0132/ => 'IJ',
    /\u0133/ => 'ij',
    /[Ĵ]/ => 'J',
    /[ĵ]/ => 'j',
    /[Ķ]/ => 'K',
    /[ķĸ]/ => 'k',
    /[ŁĽĹĻĿ]/ => 'L',
    /[łľĺļŀ]/ => 'l',
    /[ÑŃŇŅŊ]/ => 'N',
    # U+0149 (n preceded by apostrophe) is escaped for the same reason.
    /[ñńňņ\u0149ŋ]/ => 'n',
    /[ÒÓÔÕØŌŐŎÖ]/ => 'O',
    /[òóôõōŏȯöỏőǒȍȏơǫọɵøồốỗổȱȫȭṍṏṑṓờớỡởợǭộǿ]/ => 'o',
    /[Œ]/ => 'OE',
    /[œ]/ => 'oe',
    /[ŔŘŖ]/ => 'R',
    /[ŕřŗ]/ => 'r',
    /[ŚŠŞŜȘ]/ => 'S',
    /[śšşŝș]/ => 's',
    /[ß]/ => 'ss',
    /[ŤŢŦȚ]/ => 'T',
    /[ťţŧț]/ => 't',
    /[ÜÙÚÛŪŮŰŬŨŲ]/ => 'U',
    /[ùúûũūŭüủůűǔȕȗưụṳųṷṵṹṻǖǜǘǖǚừứữửự]/ => 'u',
    /[Ŵ]/ => 'W',
    /[ŵ]/ => 'w',
    # Upper-case Y with acute, circumflex and diaeresis.
    /[\u00DD\u0176\u0178]/ => 'Y',
    /[ỳýŷỹȳẏÿỷẙƴỵ]/ => 'y',
    /[ŹŽŻ]/ => 'Z',
    /[žżź]/ => 'z'
  }.freeze

  # An opening bracket, anything that is not a bracket, a closing bracket:
  # i.e. an innermost "(...)" or "[...]" group.
  BRACKETED_GROUP = /[\(\[][^\(\)\[\]]*[\)\]]/

  # "feat. X" / "featuring X" up to the end of the line. The leading \b keeps
  # words that merely end in "feat" (e.g. "Defeat. ") from matching.
  FEATURING = /\bfeat(?:\.|uring) .*/i

  # s - the reference sentence (String); it is normalised with to_array.
  def initialize(s)
    self.cleanified_strings = self.class.to_array(s)
  end

  # Returns a stripped copy of s without its "(...)" and "[...]" groups.
  #
  # Groups are removed innermost first and the pass is repeated until none is
  # left, so nested groups disappear entirely while the text located between
  # two separate groups is preserved.
  def self.remove_parenthesis(s)
    res = s.dup
    loop do
      # gsub! returns nil once there is nothing left to remove
      break unless res.gsub!(BRACKETED_GROUP, '')
    end
    res.strip
  end

  # Returns a stripped copy of s without the trailing "feat. ..." or
  # "featuring ..." credit.
  def self.remove_featuring(s)
    s.gsub(FEATURING, '').strip
  end

  # Transliterates accented characters of s to ASCII, in place.
  # Returns s itself.
  def self.unaccent!(s)
    ASCII_REGEXP_MAPPING.each do |accented, ascii|
      s.gsub!(accented, ascii)
    end
    s
  end

  # Non-destructive counterpart of unaccent!: works on a copy of s.
  def self.unaccent(s)
    unaccent!(s.dup)
  end

  # Returns s with its HTML entities decoded by HTMLEntities.
  def self.decode_html_entities(s)
    HTMLEntities.new.decode(s)
  end

  # Returns a new array holding the words of a that are not in STOP_WORDS.
  def self.remove_stop_words(a)
    a - STOP_WORDS
  end

  # Cuts the string into an array of normalised words.
  #
  # Two words separated by a dash (-) are considered as:
  #   1 word if the first or the second word is only 1 character
  #   2 words otherwise
  #
  # Every word is reduced to its ASCII letters and digits and downcased.
  # Words shorter than 2 characters are dropped unless they contain a digit,
  # and stop words are removed last.
  def self.to_array(s)
    words = clean(s)
            .gsub(/\b(\w{2,})-(\w{2,})\b/, '\1 \2')
            .split
            .map { |w| w.gsub(/[^A-Za-z0-9]/, '').downcase }
            .reject { |w| w.length < 2 && w !~ /\d/ }

    remove_stop_words(words)
  end

  # Runs the textual clean-up pipeline on s: decode HTML entities, drop
  # bracketed groups, drop the featuring credit, then remove accents.
  def self.clean(s)
    decoded = decode_html_entities(s)
    unaccent(remove_featuring(remove_parenthesis(decoded)))
  end

  # Matches the words of pattern against the remaining reference words.
  #
  # Each word of the guess can satisfy at most one reference word. Reference
  # words that are matched are REMOVED from cleanified_strings, so successive
  # calls are cumulative.
  #
  # Returns:
  #   :ok     - at least one word matched and no reference word is left
  #   :almost - at least one word matched but some reference words remain
  #   :ko     - nothing matched
  def compare(pattern)
    guesses = self.class.to_array(pattern)
    size_before = cleanified_strings.size

    cleanified_strings.delete_if do |word|
      matched_word = guesses.find { |guess| words_match?(word, guess) }
      # only deleting one of the words, so a guess is never used twice
      guesses.delete_at(guesses.index(matched_word)) if matched_word
      matched_word
    end

    return :ko if cleanified_strings.size == size_before

    cleanified_strings.empty? ? :ok : :almost
  end

  private

  # Tells whether guess is close enough to the reference word:
  #   - a reference word holding a digit must match exactly;
  #   - when both words are longer than 4 characters, a distance of 2 is allowed;
  #   - when both words are longer than 2 characters, a distance of 1 is allowed;
  #   - shorter words must match exactly.
  def words_match?(word, guess)
    return guess == word if word =~ /\d/

    shortest = [guess.length, word.length].min
    if shortest > 4
      Levenshtein.distance(guess, word) <= 2
    elsif shortest > 2
      Levenshtein.distance(guess, word) <= 1
    else
      guess == word
    end
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein_comparator
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Stéphane Akkaoui
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Levenshtein Comparator allows you to compare two sentences and say if
14
+ there is a match, almost a match or nothing to compare.
15
+ email: sakkaoui@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/levenshtein_comparator.rb
21
+ homepage: https://github.com/meuble/levenshtein_comparator
22
+ licenses:
23
+ - WTFPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.5.1
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: A string comparator using Damerau-Levenshtein distance
45
+ test_files: []