emiler 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9087d21f342c6fbf527446f3b9d12a7704d24fc3
4
- data.tar.gz: 620b32bfe7e591eb14330ceb04e6c34a83258d5b
3
+ metadata.gz: 17d680b472c91879a2d30d06c5ff4252df1e2bf5
4
+ data.tar.gz: e7459390747cc3fb174acdad6f3fadaf137d7270
5
5
  SHA512:
6
- metadata.gz: e6214d1f660358148d59dce104fb9b223d3896f89905fb45cdb4c1c72d2478c8ffba7f5d762c28efc030330a7db1837683b965355551a22a148c8101a98a6c7c
7
- data.tar.gz: c48623402f161cabe1afb70d80ea515a31315f2c16700b61e9e4b797f329958eea4430254e29c1f4105a406c3009d6f71fd220697de74f9e4447a5d46bb10d64
6
+ metadata.gz: 05b5b0ce5d47397a655ab501b1d30e17066a667754b4c15608145b009b2d0c83dc8fdc0b5ee29fe6e5516d2a36d370f3364996e64630b76f9cf12a0b97518e42
7
+ data.tar.gz: 930f7d3c786b5744edfcea98e4ff0b0348498a455fd73e31535cd05727461dcc391b4eb6768776fa28899e7e348dd7a27996f83471abd10f1d7185a6fb5a9b4c
@@ -54,3 +54,6 @@ Metrics/MethodLength:
54
54
 
55
55
  Metrics/PerceivedComplexity:
56
56
  Max: 20
57
+
58
+ Metrics/AbcSize:
59
+ Max: 42
@@ -28,4 +28,6 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency 'rake', '~> 10.0'
29
29
  spec.add_development_dependency 'rspec', '~> 3.0'
30
30
  spec.add_development_dependency 'pry', '~> 0.10'
31
+
32
+ spec.add_dependency 'phone', '~> 1.2.3'
31
33
  end
@@ -1,9 +1,12 @@
1
1
  require 'emiler/version'
2
2
  require 'emiler/jarowinkler'
3
3
 
4
+ require 'phone'
5
+
4
6
  module Emiler
5
7
  INEXACT_MATCH_COEFFICIENT = ENV['INEXACT_MATCH_COEFFICIENT'] || 0.8
6
8
  RAISE_ON_MALFORMED_EMAIL = ENV['RAISE_ON_MALFORMED_EMAIL']
9
+ COMPANY_NAME_STOP_WORDS = %w(ltd gmbh inc).freeze
7
10
 
8
11
  class JW
9
12
  attr_reader :jw
@@ -27,36 +30,106 @@ module Emiler
27
30
  end
28
31
  end
29
32
 
30
- # rubocop:disable Metrics/AbcSize
31
- def similarity e1, e2
32
- e1, e2 = [e1, e2].map(&:to_s).map(&:downcase)
33
- em1, em2 = [e1, e2].map { |e| e.split '@' }
33
+ class << self
34
+ def similarity item1, item2, type: :email
35
+ type = :default unless private_methods.include? :"similarity_#{type}"
36
+ item1, item2 = [item1, item2].map(&:to_s).map(&:strip).map(&:downcase)
37
+ { jw: JW::MATCHER.distance(item1, item2) }.merge send(:"similarity_#{type}", item1, item2)
38
+ end
39
+
40
+ private
34
41
 
35
- if em1.size != 2 || em2.size != 2
36
- raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
37
- return JW::DUMMY
42
+ # stub for unknown types; returns `{ result: nil }` so that `similarity` yields only the jaro-winkler distance
43
+ def similarity_default(*)
44
+ { result: nil }
38
45
  end
39
46
 
40
- jw = JW::MATCHER.distance e1, e2
47
+ # similarity for company names
48
+ def similarity_company_name c1, c2
49
+ return { full: 1.0,
50
+ distances: [1.0] * c1.split(/\s+/).size,
51
+ matches: c1.split(/\s+/).size,
52
+ result: true } if c1 == c2 # exact match
53
+
54
+ c1, c2 = [c1, c2].map { |c| c.split(/\s+/).reject(&COMPANY_NAME_STOP_WORDS.method(:include?)) }
55
+ return { full: 1.0 - (1.0 - INEXACT_MATCH_COEFFICIENT) / 2.0, name: 1.0, result: true } if c1 == c2 # match without stopwords
56
+
57
+ dists = c1.product(c2)
58
+ .map { |(w1, w2)| JW::MATCHER.distance(w1, w2) }
59
+ .sort
60
+ .reverse
61
+ count = [c1, c2].map(&:size).min
62
+ average = dists.take(count).map.with_index { |e, i| e * (1.0 - i.to_f / count) / count }.reduce(:+)
63
+ { full: average, distances: dists, matches: dists.count(1.0), result: false }
64
+ end
41
65
 
42
- domain = case
43
- when em1.last == em2.last then 1 # exact domain match
44
- when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
45
- else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
46
- end
66
+ # similarity for phone numbers
67
+ def similarity_phone p1, p2
68
+ return { full: 1.0,
69
+ distances: [1.0],
70
+ result: true } if p1 == p2 # exact match
47
71
 
48
- name = case
49
- when em1.first == em2.first then 1 # exact match
50
- when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
51
- else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
52
- end
72
+ p1, p2 = [p1, p2].map { |p| p.split(/[,;]/) }
73
+ .map do |p|
74
+ p.map do |e|
75
+ phone = e.delete('^0-9')
76
+ phone = case phone.length
77
+ when 0..6 then phone
78
+ when 7 then "+3493#{phone}" # consider Barcelona
79
+ when 8..9 then "+34#{phone}" # consider Spain
80
+ else "+#{phone}"
81
+ end
82
+ # rubocop:disable Style/RescueModifier
83
+ Phoner::Phone.parse(phone) rescue nil # Phoner::CountryCodeError
84
+ # rubocop:enable Style/RescueModifier
85
+ end.compact
86
+ end
53
87
 
54
- full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
88
+ dists = p1.product(p2)
89
+ .reject do |(pp1, pp2)|
90
+ pp1.country_code != pp2.country_code ||
91
+ pp1.area_code != pp2.area_code ||
92
+ pp1.number[0...-2] != pp2.number[0...-2]
93
+ end.map do |(pp1, pp2)|
94
+ case
95
+ when pp1.number[-2..-1] == pp2.number[-2..-1] then 1.0
96
+ when pp1.number[-2] == pp2.number[-2] then 0.9
97
+ else 0.8
98
+ end
99
+ end.sort.reverse
55
100
 
56
- { jw: jw, full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
101
+ { full: dists.first || 0.0, distances: dists, result: dists.first && dists.first >= INEXACT_MATCH_COEFFICIENT }
102
+ end
103
+
104
+ # rubocop:disable Metrics/AbcSize
105
+ # similarity for emails
106
+ def similarity_email e1, e2
107
+ return { full: 1.0,
108
+ name: 1.0,
109
+ domain: 1.0,
110
+ result: true } if e1 == e2
111
+
112
+ em1, em2 = [e1, e2].map { |e| e.split '@' }
113
+ if em1.size != 2 || em2.size != 2
114
+ raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
115
+ return JW::DUMMY
116
+ end
117
+
118
+ domain = case
119
+ when em1.last == em2.last then 1 # exact domain match
120
+ when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
121
+ else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
122
+ end
123
+ name = case
124
+ when em1.first == em2.first then 1 # exact match
125
+ when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
126
+ else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
127
+ end
128
+ full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
129
+ { full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
130
+ end
131
+ # rubocop:enable Metrics/AbcSize
57
132
  end
58
- # rubocop:enable Metrics/AbcSize
59
133
 
60
- private_constant :JW, :MalformedEmailError
61
- module_function :similarity
134
+ private_constant :JW
62
135
  end
@@ -1,3 +1,3 @@
1
1
  module Emiler
2
- VERSION = "0.2.3"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: emiler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksei Matiushkin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-27 00:00:00.000000000 Z
11
+ date: 2016-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: phone
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.2.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.2.3
69
83
  description: This library is kinda analogue of Jaro-Winkler distance between two emails.
70
84
  email:
71
85
  - aleksei.matiushkin@kantox.com