emiler 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9087d21f342c6fbf527446f3b9d12a7704d24fc3
4
- data.tar.gz: 620b32bfe7e591eb14330ceb04e6c34a83258d5b
3
+ metadata.gz: 17d680b472c91879a2d30d06c5ff4252df1e2bf5
4
+ data.tar.gz: e7459390747cc3fb174acdad6f3fadaf137d7270
5
5
  SHA512:
6
- metadata.gz: e6214d1f660358148d59dce104fb9b223d3896f89905fb45cdb4c1c72d2478c8ffba7f5d762c28efc030330a7db1837683b965355551a22a148c8101a98a6c7c
7
- data.tar.gz: c48623402f161cabe1afb70d80ea515a31315f2c16700b61e9e4b797f329958eea4430254e29c1f4105a406c3009d6f71fd220697de74f9e4447a5d46bb10d64
6
+ metadata.gz: 05b5b0ce5d47397a655ab501b1d30e17066a667754b4c15608145b009b2d0c83dc8fdc0b5ee29fe6e5516d2a36d370f3364996e64630b76f9cf12a0b97518e42
7
+ data.tar.gz: 930f7d3c786b5744edfcea98e4ff0b0348498a455fd73e31535cd05727461dcc391b4eb6768776fa28899e7e348dd7a27996f83471abd10f1d7185a6fb5a9b4c
@@ -54,3 +54,6 @@ Metrics/MethodLength:
54
54
 
55
55
  Metrics/PerceivedComplexity:
56
56
  Max: 20
57
+
58
+ Metrics/AbcSize:
59
+ Max: 42
@@ -28,4 +28,6 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency 'rake', '~> 10.0'
29
29
  spec.add_development_dependency 'rspec', '~> 3.0'
30
30
  spec.add_development_dependency 'pry', '~> 0.10'
31
+
32
+ spec.add_dependency 'phone', '~> 1.2.3'
31
33
  end
@@ -1,9 +1,12 @@
1
1
  require 'emiler/version'
2
2
  require 'emiler/jarowinkler'
3
3
 
4
+ require 'phone'
5
+
4
6
  module Emiler
5
7
  INEXACT_MATCH_COEFFICIENT = ENV['INEXACT_MATCH_COEFFICIENT'] || 0.8
6
8
  RAISE_ON_MALFORMED_EMAIL = ENV['RAISE_ON_MALFORMED_EMAIL']
9
+ COMPANY_NAME_STOP_WORDS = %w(ltd gmbh inc).freeze
7
10
 
8
11
  class JW
9
12
  attr_reader :jw
@@ -27,36 +30,106 @@ module Emiler
27
30
  end
28
31
  end
29
32
 
30
- # rubocop:disable Metrics/AbcSize
31
- def similarity e1, e2
32
- e1, e2 = [e1, e2].map(&:to_s).map(&:downcase)
33
- em1, em2 = [e1, e2].map { |e| e.split '@' }
33
+ class << self
34
+ def similarity item1, item2, type: :email
35
+ type = :default unless private_methods.include? :"similarity_#{type}"
36
+ item1, item2 = [item1, item2].map(&:to_s).map(&:strip).map(&:downcase)
37
+ { jw: JW::MATCHER.distance(item1, item2) }.merge send(:"similarity_#{type}", item1, item2)
38
+ end
39
+
40
+ private
34
41
 
35
- if em1.size != 2 || em2.size != 2
36
- raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
37
- return JW::DUMMY
42
+ # stub for unknown types; returns empty hash for `similarity` to return jaro-winkler distance only
43
+ def similarity_default(*)
44
+ { result: nil }
38
45
  end
39
46
 
40
- jw = JW::MATCHER.distance e1, e2
47
+ # similarity for company names
48
+ def similarity_company_name c1, c2
49
+ return { full: 1.0,
50
+ distances: [1.0] * c1.split(/\s+/).size,
51
+ matches: c1.split(/\s+/).size,
52
+ result: true } if c1 == c2 # exact match
53
+
54
+ c1, c2 = [c1, c2].map { |c| c.split(/\s+/).reject(&COMPANY_NAME_STOP_WORDS.method(:include?)) }
55
+ return { full: 1.0 - (1.0 - INEXACT_MATCH_COEFFICIENT) / 2.0, name: 1.0, result: true } if c1 == c2 # match without stopwords
56
+
57
+ dists = c1.product(c2)
58
+ .map { |(w1, w2)| JW::MATCHER.distance(w1, w2) }
59
+ .sort
60
+ .reverse
61
+ count = [c1, c2].map(&:size).min
62
+ average = dists.take(count).map.with_index { |e, i| e * (1.0 - i.to_f / count) / count }.reduce(:+)
63
+ { full: average, distances: dists, matches: dists.count(1.0), result: false }
64
+ end
41
65
 
42
- domain = case
43
- when em1.last == em2.last then 1 # exact domain match
44
- when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
45
- else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
46
- end
66
+ # similarity for phone numbers
67
+ def similarity_phone p1, p2
68
+ return { full: 1.0,
69
+ distances: [1.0],
70
+ result: true } if p1 == p2 # exact match
47
71
 
48
- name = case
49
- when em1.first == em2.first then 1 # exact match
50
- when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
51
- else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
52
- end
72
+ p1, p2 = [p1, p2].map { |p| p.split(/[,;]/) }
73
+ .map do |p|
74
+ p.map do |e|
75
+ phone = e.delete('^0-9')
76
+ phone = case phone.length
77
+ when 0..6 then phone
78
+ when 7 then "+3493#{phone}" # consider Barcelona
79
+ when 8..9 then "+34#{phone}" # consider Spain
80
+ else "+#{phone}"
81
+ end
82
+ # rubocop:disable Style/RescueModifier
83
+ Phoner::Phone.parse(phone) rescue nil # Phoner::CountryCodeError
84
+ # rubocop:enable Style/RescueModifier
85
+ end.compact
86
+ end
53
87
 
54
- full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
88
+ dists = p1.product(p2)
89
+ .reject do |(pp1, pp2)|
90
+ pp1.country_code != pp2.country_code ||
91
+ pp1.area_code != pp2.area_code ||
92
+ pp1.number[0...-2] != pp2.number[0...-2]
93
+ end.map do |(pp1, pp2)|
94
+ case
95
+ when pp1.number[-2..-1] == pp2.number[-2..-1] then 1.0
96
+ when pp1.number[-2] == pp2.number[-2] then 0.9
97
+ else 0.8
98
+ end
99
+ end.sort.reverse
55
100
 
56
- { jw: jw, full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
101
+ { full: dists.first || 0.0, distances: dists, result: dists.first && dists.first >= INEXACT_MATCH_COEFFICIENT }
102
+ end
103
+
104
+ # rubocop:disable Metrics/AbcSize
105
+ # similarity for emails
106
+ def similarity_email e1, e2
107
+ return { full: 1.0,
108
+ name: 1.0,
109
+ domain: 1.0,
110
+ result: true } if e1 == e2
111
+
112
+ em1, em2 = [e1, e2].map { |e| e.split '@' }
113
+ if em1.size != 2 || em2.size != 2
114
+ raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL
115
+ return JW::DUMMY
116
+ end
117
+
118
+ domain = case
119
+ when em1.last == em2.last then 1 # exact domain match
120
+ when [em1, em2].map { |e| e.last.split('.')[-2] }.reduce(:==) then INEXACT_MATCH_COEFFICIENT
121
+ else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.last, em2.last)
122
+ end
123
+ name = case
124
+ when em1.first == em2.first then 1 # exact match
125
+ when ![em1, em2].map { |e| e.first.scan(/[a-z]+/) }.reduce(:&).empty? then INEXACT_MATCH_COEFFICIENT
126
+ else INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(em1.first, em2.first)
127
+ end
128
+ full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
129
+ { full: full, name: name, domain: domain, result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
130
+ end
131
+ # rubocop:enable Metrics/AbcSize
57
132
  end
58
- # rubocop:enable Metrics/AbcSize
59
133
 
60
- private_constant :JW, :MalformedEmailError
61
- module_function :similarity
134
+ private_constant :JW
62
135
  end
@@ -1,3 +1,3 @@
1
1
  module Emiler
2
- VERSION = "0.2.3"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: emiler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksei Matiushkin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-27 00:00:00.000000000 Z
11
+ date: 2016-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: phone
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.2.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.2.3
69
83
  description: This library is kinda analogue of Jaro-Winkler distance between two emails.
70
84
  email:
71
85
  - aleksei.matiushkin@kantox.com