emiler 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/emiler.gemspec +2 -0
- data/lib/emiler.rb +96 -23
- data/lib/emiler/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d680b472c91879a2d30d06c5ff4252df1e2bf5
|
4
|
+
data.tar.gz: e7459390747cc3fb174acdad6f3fadaf137d7270
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05b5b0ce5d47397a655ab501b1d30e17066a667754b4c15608145b009b2d0c83dc8fdc0b5ee29fe6e5516d2a36d370f3364996e64630b76f9cf12a0b97518e42
|
7
|
+
data.tar.gz: 930f7d3c786b5744edfcea98e4ff0b0348498a455fd73e31535cd05727461dcc391b4eb6768776fa28899e7e348dd7a27996f83471abd10f1d7185a6fb5a9b4c
|
data/.rubocop.yml
CHANGED
data/emiler.gemspec
CHANGED
data/lib/emiler.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
require 'emiler/version'
|
2
2
|
require 'emiler/jarowinkler'
|
3
3
|
|
4
|
+
require 'phone'
|
5
|
+
|
4
6
|
module Emiler
|
5
7
|
INEXACT_MATCH_COEFFICIENT = ENV['INEXACT_MATCH_COEFFICIENT'] || 0.8
|
6
8
|
RAISE_ON_MALFORMED_EMAIL = ENV['RAISE_ON_MALFORMED_EMAIL']
|
9
|
+
COMPANY_NAME_STOP_WORDS = %w(ltd gmbh inc).freeze
|
7
10
|
|
8
11
|
class JW
|
9
12
|
attr_reader :jw
|
@@ -27,36 +30,106 @@ module Emiler
|
|
27
30
|
end
|
28
31
|
end
|
29
32
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
33
|
+
class << self
|
34
|
+
def similarity item1, item2, type: :email
|
35
|
+
type = :default unless private_methods.include? :"similarity_#{type}"
|
36
|
+
item1, item2 = [item1, item2].map(&:to_s).map(&:strip).map(&:downcase)
|
37
|
+
{ jw: JW::MATCHER.distance(item1, item2) }.merge send(:"similarity_#{type}", item1, item2)
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
34
41
|
|
35
|
-
|
36
|
-
|
37
|
-
|
42
|
+
# stub for unknown types; returns empty hash for `similarity` to return jaro-winkler distance only
|
43
|
+
def similarity_default(*)
|
44
|
+
{ result: nil }
|
38
45
|
end
|
39
46
|
|
40
|
-
|
47
|
+
# similarity for company names
|
48
|
+
def similarity_company_name c1, c2
|
49
|
+
return { full: 1.0,
|
50
|
+
distances: [1.0] * c1.split(/\s+/).size,
|
51
|
+
matches: c1.split(/\s+/).size,
|
52
|
+
result: true } if c1 == c2 # exact match
|
53
|
+
|
54
|
+
c1, c2 = [c1, c2].map { |c| c.split(/\s+/).reject(&COMPANY_NAME_STOP_WORDS.method(:include?)) }
|
55
|
+
return { full: 1.0 - (1.0 - INEXACT_MATCH_COEFFICIENT) / 2.0, name: 1.0, result: true } if c1 == c2 # match without stopwords
|
56
|
+
|
57
|
+
dists = c1.product(c2)
|
58
|
+
.map { |(w1, w2)| JW::MATCHER.distance(w1, w2) }
|
59
|
+
.sort
|
60
|
+
.reverse
|
61
|
+
count = [c1, c2].map(&:size).min
|
62
|
+
average = dists.take(count).map.with_index { |e, i| e * (1.0 - i.to_f / count) / count }.reduce(:+)
|
63
|
+
{ full: average, distances: dists, matches: dists.count(1.0), result: false }
|
64
|
+
end
|
41
65
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
66
|
+
# similarity for phone numbers
|
67
|
+
def similarity_phone p1, p2
|
68
|
+
return { full: 1.0,
|
69
|
+
distances: [1.0],
|
70
|
+
result: true } if p1 == p2 # exact match
|
47
71
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
72
|
+
p1, p2 = [p1, p2].map { |p| p.split(/[,;]/) }
|
73
|
+
.map do |p|
|
74
|
+
p.map do |e|
|
75
|
+
phone = e.delete('^0-9')
|
76
|
+
phone = case phone.length
|
77
|
+
when 0..6 then phone
|
78
|
+
when 7 then "+3493#{phone}" # consider Barcelona
|
79
|
+
when 8..9 then "+34#{phone}" # consider Spain
|
80
|
+
else "+#{phone}"
|
81
|
+
end
|
82
|
+
# rubocop:disable Style/RescueModifier
|
83
|
+
Phoner::Phone.parse(phone) rescue nil # Phoner::CountryCodeError
|
84
|
+
# rubocop:enable Style/RescueModifier
|
85
|
+
end.compact
|
86
|
+
end
|
53
87
|
|
54
|
-
|
88
|
+
dists = p1.product(p2)
|
89
|
+
.reject do |(pp1, pp2)|
|
90
|
+
pp1.country_code != pp2.country_code ||
|
91
|
+
pp1.area_code != pp2.area_code ||
|
92
|
+
pp1.number[0...-2] != pp2.number[0...-2]
|
93
|
+
end.map do |(pp1, pp2)|
|
94
|
+
case
|
95
|
+
when pp1.number[-2..-1] == pp2.number[-2..-1] then 1.0
|
96
|
+
when pp1.number[-2] == pp2.number[-2] then 0.9
|
97
|
+
else 0.8
|
98
|
+
end
|
99
|
+
end.sort.reverse
|
55
100
|
|
56
|
-
|
101
|
+
{ full: dists.first || 0.0, distances: dists, result: dists.first && dists.first >= INEXACT_MATCH_COEFFICIENT }
|
102
|
+
end
|
103
|
+
|
104
|
+
# rubocop:disable Metrics/AbcSize
|
105
|
+
# similarity for emails
|
106
|
+
def similarity_email e1, e2
|
107
|
+
return { full: 1.0,
|
108
|
+
name: 1.0,
|
109
|
+
domain: 1.0,
|
110
|
+
result: true } if e1 == e2
|
111
|
+
|
112
|
+
em1, em2 = [e1, e2].map { |e| e.split '@' }
|
113
|
+
if em1.size != 2 || em2.size != 2
|
114
|
+
raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
|
115
|
+
return JW::DUMMY
|
116
|
+
end
|
117
|
+
|
118
|
+
domain = case
|
119
|
+
when em1.last == em2.last then 1 # exact domain match
|
120
|
+
when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
|
121
|
+
else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
|
122
|
+
end
|
123
|
+
name = case
|
124
|
+
when em1.first == em2.first then 1 # exact match
|
125
|
+
when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
|
126
|
+
else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
|
127
|
+
end
|
128
|
+
full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
|
129
|
+
{ full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
|
130
|
+
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
57
132
|
end
|
58
|
-
# rubocop:enable Metrics/AbcSize
|
59
133
|
|
60
|
-
private_constant :JW
|
61
|
-
module_function :similarity
|
134
|
+
private_constant :JW
|
62
135
|
end
|
data/lib/emiler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: emiler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksei Matiushkin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: phone
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.3
|
69
83
|
description: This library is kinda analogue of Jaro-Winkler distance between two emails.
|
70
84
|
email:
|
71
85
|
- aleksei.matiushkin@kantox.com
|