emiler 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/emiler.gemspec +2 -0
- data/lib/emiler.rb +96 -23
- data/lib/emiler/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d680b472c91879a2d30d06c5ff4252df1e2bf5
|
4
|
+
data.tar.gz: e7459390747cc3fb174acdad6f3fadaf137d7270
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05b5b0ce5d47397a655ab501b1d30e17066a667754b4c15608145b009b2d0c83dc8fdc0b5ee29fe6e5516d2a36d370f3364996e64630b76f9cf12a0b97518e42
|
7
|
+
data.tar.gz: 930f7d3c786b5744edfcea98e4ff0b0348498a455fd73e31535cd05727461dcc391b4eb6768776fa28899e7e348dd7a27996f83471abd10f1d7185a6fb5a9b4c
|
data/.rubocop.yml
CHANGED
data/emiler.gemspec
CHANGED
data/lib/emiler.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
require 'emiler/version'
|
2
2
|
require 'emiler/jarowinkler'
|
3
3
|
|
4
|
+
require 'phone'
|
5
|
+
|
4
6
|
module Emiler
|
5
7
|
INEXACT_MATCH_COEFFICIENT = Float(ENV['INEXACT_MATCH_COEFFICIENT'] || 0.8) # ENV values are Strings; coerce so the threshold arithmetic below works
|
6
8
|
RAISE_ON_MALFORMED_EMAIL = ENV['RAISE_ON_MALFORMED_EMAIL']
|
9
|
+
COMPANY_NAME_STOP_WORDS = %w(ltd gmbh inc).freeze
|
7
10
|
|
8
11
|
class JW
|
9
12
|
attr_reader :jw
|
@@ -27,36 +30,106 @@ module Emiler
|
|
27
30
|
end
|
28
31
|
end
|
29
32
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
33
|
+
class << self
|
34
|
+
def similarity item1, item2, type: :email
|
35
|
+
type = :default unless private_methods.include? :"similarity_#{type}"
|
36
|
+
item1, item2 = [item1, item2].map(&:to_s).map(&:strip).map(&:downcase)
|
37
|
+
{ jw: JW::MATCHER.distance(item1, item2) }.merge send(:"similarity_#{type}", item1, item2)
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
34
41
|
|
35
|
-
|
36
|
-
|
37
|
-
|
42
|
+
# stub for unknown types; returns `{ result: nil }` so that `similarity` reports only the jaro-winkler distance, with no type-specific verdict
|
43
|
+
def similarity_default(*)
|
44
|
+
{ result: nil }
|
38
45
|
end
|
39
46
|
|
40
|
-
|
47
|
+
# similarity for company names
|
48
|
+
def similarity_company_name c1, c2
|
49
|
+
return { full: 1.0,
|
50
|
+
distances: [1.0] * c1.split(/\s+/).size,
|
51
|
+
matches: c1.split(/\s+/).size,
|
52
|
+
result: true } if c1 == c2 # exact match
|
53
|
+
|
54
|
+
c1, c2 = [c1, c2].map { |c| c.split(/\s+/).reject(&COMPANY_NAME_STOP_WORDS.method(:include?)) }
|
55
|
+
return { full: 1.0 - (1.0 - INEXACT_MATCH_COEFFICIENT) / 2.0, name: 1.0, result: true } if c1 == c2 # match without stopwords
|
56
|
+
|
57
|
+
dists = c1.product(c2)
|
58
|
+
.map { |(w1, w2)| JW::MATCHER.distance(w1, w2) }
|
59
|
+
.sort
|
60
|
+
.reverse
|
61
|
+
count = [c1, c2].map(&:size).min
|
62
|
+
average = dists.take(count).map.with_index { |e, i| e * (1.0 - i.to_f / count) / count }.reduce(:+)
|
63
|
+
{ full: average, distances: dists, matches: dists.count(1.0), result: false }
|
64
|
+
end
|
41
65
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
66
|
+
# similarity for phone numbers
|
67
|
+
def similarity_phone p1, p2
|
68
|
+
return { full: 1.0,
|
69
|
+
distances: [1.0],
|
70
|
+
result: true } if p1 == p2 # exact match
|
47
71
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
72
|
+
p1, p2 = [p1, p2].map { |p| p.split(/[,;]/) }
|
73
|
+
.map do |p|
|
74
|
+
p.map do |e|
|
75
|
+
phone = e.delete('^0-9')
|
76
|
+
phone = case phone.length
|
77
|
+
when 0..6 then phone
|
78
|
+
when 7 then "+3493#{phone}" # consider Barcelona
|
79
|
+
when 8..9 then "+34#{phone}" # consider Spain
|
80
|
+
else "+#{phone}"
|
81
|
+
end
|
82
|
+
# rubocop:disable Style/RescueModifier
|
83
|
+
Phoner::Phone.parse(phone) rescue nil # Phoner::CountryCodeError
|
84
|
+
# rubocop:enable Style/RescueModifier
|
85
|
+
end.compact
|
86
|
+
end
|
53
87
|
|
54
|
-
|
88
|
+
dists = p1.product(p2)
|
89
|
+
.reject do |(pp1, pp2)|
|
90
|
+
pp1.country_code != pp2.country_code ||
|
91
|
+
pp1.area_code != pp2.area_code ||
|
92
|
+
pp1.number[0...-2] != pp2.number[0...-2]
|
93
|
+
end.map do |(pp1, pp2)|
|
94
|
+
case
|
95
|
+
when pp1.number[-2..-1] == pp2.number[-2..-1] then 1.0
|
96
|
+
when pp1.number[-2] == pp2.number[-2] then 0.9
|
97
|
+
else 0.8
|
98
|
+
end
|
99
|
+
end.sort.reverse
|
55
100
|
|
56
|
-
|
101
|
+
{ full: dists.first || 0.0, distances: dists, result: dists.first && dists.first >= INEXACT_MATCH_COEFFICIENT }
|
102
|
+
end
|
103
|
+
|
104
|
+
# rubocop:disable Metrics/AbcSize
|
105
|
+
# similarity for emails
|
106
|
+
def similarity_email e1, e2
|
107
|
+
return { full: 1.0,
|
108
|
+
name: 1.0,
|
109
|
+
domain: 1.0,
|
110
|
+
result: true } if e1 == e2
|
111
|
+
|
112
|
+
em1, em2 = [e1, e2].map { |e| e.split '@' }
|
113
|
+
if em1.size != 2 || em2.size != 2
|
114
|
+
raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
|
115
|
+
return JW::DUMMY
|
116
|
+
end
|
117
|
+
|
118
|
+
domain = case
|
119
|
+
when em1.last == em2.last then 1 # exact domain match
|
120
|
+
when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
|
121
|
+
else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
|
122
|
+
end
|
123
|
+
name = case
|
124
|
+
when em1.first == em2.first then 1 # exact match
|
125
|
+
when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
|
126
|
+
else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
|
127
|
+
end
|
128
|
+
full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
|
129
|
+
{ full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
|
130
|
+
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
57
132
|
end
|
58
|
-
# rubocop:enable Metrics/AbcSize
|
59
133
|
|
60
|
-
private_constant :JW
|
61
|
-
module_function :similarity
|
134
|
+
private_constant :JW
|
62
135
|
end
|
data/lib/emiler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: emiler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksei Matiushkin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: phone
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.3
|
69
83
|
description: This library is kinda analogue of Jaro-Winkler distance between two emails.
|
70
84
|
email:
|
71
85
|
- aleksei.matiushkin@kantox.com
|