email_signature_parser 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,885 @@
1
+ # encoding: utf-8
2
+
3
+ module EmailSignatureParser
4
+
5
# Common ancestor of the errors raised by the parser, so callers can rescue
# them all at once.
class ParseError < StandardError
end

# Raised when the email body is missing or cannot be used for parsing.
class InvalidEmailError < ParseError
end

# Raised when the From value is missing or is not a usable sender.
class InvalidFromError < ParseError
end
8
+
9
+ class Parser
10
+
11
# First host label of the sites reported under links[:social_media].
# Frozen so the shared list cannot be mutated at runtime.
SOCIAL_MEDIA_URLS = %w[instagram x twitter facebook linkedin youtube tiktok bsky].freeze

# Maps an international calling code (digits only, no "+") to the country
# label reported for a phone number. Frozen: this is read-only lookup data.
PHONES_TO_COUNTRY = {
  "1"=>"US/CA", "7"=>"KZ", "20"=>"EG", "27"=>"ZA", "30"=>"GR", "31"=>"NL", "32"=>"BE", "33"=>"FR", "34"=>"ES", "36"=>"HU", "39"=>"IT",
  "40"=>"RO", "41"=>"CH", "43"=>"AT", "44"=>"GB", "45"=>"DK", "46"=>"SE", "47"=>"NO", "48"=>"PL", "49"=>"DE", "51"=>"PE", "52"=>"MX", "53"=>"CU",
  "54"=>"AR", "55"=>"BR", "56"=>"CL", "57"=>"CO", "58"=>"VE", "60"=>"MY", "61"=>"AU", "62"=>"ID", "63"=>"PH", "64"=>"NZ", "65"=>"SG", "66"=>"TH", "81"=>"JP",
  "82"=>"KR", "84"=>"VN", "86"=>"CN", "90"=>"TR", "91"=>"IN", "92"=>"PK", "93"=>"AF", "94"=>"LK", "95"=>"MM", "98"=>"IR", "211"=>"SS", "212"=>"MA", "213"=>"DZ",
  "216"=>"TN", "218"=>"LY", "220"=>"GM", "221"=>"SN", "222"=>"MR", "223"=>"ML", "224"=>"GN", "225"=>"CI", "226"=>"BF", "227"=>"NE", "228"=>"TG", "229"=>"BJ", "230"=>"MU",
  "231"=>"LR", "232"=>"SL", "233"=>"GH", "234"=>"NG", "235"=>"TD", "236"=>"CF", "237"=>"CM", "238"=>"CV", "239"=>"ST", "240"=>"GQ", "241"=>"GA", "242"=>"CG", "243"=>"CD",
  "244"=>"AO", "245"=>"GW", "246"=>"IO", "248"=>"SC", "249"=>"SD", "250"=>"RW", "251"=>"ET", "252"=>"SO", "253"=>"DJ", "254"=>"KE", "255"=>"TZ", "256"=>"UG", "257"=>"BI",
  "258"=>"MZ", "260"=>"ZM", "261"=>"MG", "262"=>"YT", "263"=>"ZW", "264"=>"NA", "265"=>"MW", "266"=>"LS", "267"=>"BW", "268"=>"SZ", "269"=>"KM", "290"=>"SH", "291"=>"ER",
  "297"=>"AW", "298"=>"FO", "299"=>"GL", "350"=>"GI", "351"=>"PT", "352"=>"LU", "353"=>"IE", "354"=>"IS", "355"=>"AL", "356"=>"MT", "357"=>"CY", "358"=>"FI", "359"=>"BG",
  "370"=>"LT", "371"=>"LV", "372"=>"EE", "373"=>"MD", "374"=>"AM", "375"=>"BY", "376"=>"AD", "377"=>"MC", "378"=>"SM", "379"=>"VA", "380"=>"UA", "381"=>"RS", "382"=>"ME",
  "385"=>"HR", "386"=>"SI", "387"=>"BA", "389"=>"MK", "420"=>"CZ", "421"=>"SK", "423"=>"LI", "500"=>"FK", "501"=>"BZ", "502"=>"GT", "503"=>"SV", "504"=>"HN", "505"=>"NI",
  "506"=>"CR", "507"=>"PA", "508"=>"PM", "509"=>"HT", "590"=>"MF", "591"=>"BO", "592"=>"GY", "593"=>"EC", "594"=>"GF", "595"=>"PY", "596"=>"MQ", "597"=>"SR", "598"=>"UY",
  "599"=>"SX", "670"=>"TL", "672"=>"NF", "673"=>"BN", "674"=>"NR", "675"=>"PG", "676"=>"TO", "677"=>"SB", "678"=>"VU", "679"=>"FJ", "680"=>"PW", "681"=>"WF", "682"=>"CK", "683"=>"NU",
  "685"=>"WS", "686"=>"KI", "687"=>"NC", "688"=>"TV", "689"=>"PF", "690"=>"TK", "691"=>"FM", "692"=>"MH", "850"=>"KP", "852"=>"HK", "853"=>"MO", "855"=>"KH", "856"=>"LA",
  "870"=>"PN", "880"=>"BD", "886"=>"TW", "960"=>"MV", "961"=>"LB", "962"=>"JO", "963"=>"SY", "964"=>"IQ", "965"=>"KW", "966"=>"SA", "967"=>"YE", "968"=>"OM", "970"=>"PS",
  "971"=>"AE", "972"=>"IL", "973"=>"BH", "974"=>"QA", "975"=>"BT", "976"=>"MN", "977"=>"NP", "992"=>"TJ", "993"=>"TM", "994"=>"AZ", "995"=>"GE", "996"=>"KG", "998"=>"UZ"
}.freeze
29
+
30
# Job title words (English list followed by the Spanish list), loaded once
# when the class body is evaluated. Frozen because the list is shared,
# read-only lookup data.
JOB_TITLES = (
  YAML.load_file(
    File.expand_path('../../../lib/email_signature_parser/data/job_titles/titles.yaml', __FILE__), symbolize_names: true
  ) + YAML.load_file(
    File.expand_path('../../../lib/email_signature_parser/data/job_titles/titles_es.yaml', __FILE__), symbolize_names: true
  )
).freeze

# Job acronyms, intersected with the words of each signature section.
JOB_ACRONYMS = YAML.load_file(
  File.expand_path('../../../lib/email_signature_parser/data/job_titles/acronyms.yaml', __FILE__), symbolize_names: true
).freeze
39
+
40
# Patterns whose presence in a body marks the email as a meeting invite.
MEETING_DOMAINS = [
  /zoom\./i,
  /teams\.microsoft\.com/i,
  /meet\.google\.com/i,
  /webex\.com/i,
  /gotomeeting\.com/i,
  /ringcentral\.com/i
].freeze

# Fragments of mailbox-provider domains. They are compared with
# String#include?, so a trailing dot covers every TLD of that provider.
COMMON_EMAIL_DOMAINS = [
  'gmail.com',
  'yahoo.', # includes yahoo.com, yahoo.co.uk, etc
  'hotmail.', # includes hotmail.com, hotmail.co.uk, etc
  'outlook.com',
  'aol.com',
  'icloud.com',
  'mail.com',
  'gmx.',
  'protonmail.com',
  'zoho.com'
].freeze

# Lower-case words that mark a signature section as a company name.
COMMON_COMPANY_ENDINGS = [
  # English suffixes
  'inc', 'incorporated', 'corp', 'corporation', 'company', 'co', 'ltd', 'limited',
  'llc', 'lp', 'limited partnership', 'plc', 'public limited company',
  'enterprises', 'group', 'holdings', 'international', 'intl', 'worldwide',
  'global', 'associates', 'partners', 'consulting', 'services', 'solutions',
  'systems', 'technologies', 'tech', 'industries', 'manufacturing', 'mfg',

  # Spanish suffixes
  'sa', 'sociedad anónima', 'sl', 'sociedad limitada', 'srl', 'sociedad de responsabilidad limitada',
  'sc', 'sociedad colectiva', 'scp', 'sociedad civil particular', 'scpp', 'sociedad civil público privada',
  'sad', 'sociedad anónima deportiva', 'sal', 'sociedad anónima laboral', 'sll', 'sociedad limitada laboral',
  'sau', 'sociedad anónima unipersonal', 'slu', 'sociedad limitada unipersonal',
  'sociedad', 'compañía', 'cia', 'empresa', 'corporación', 'grupo', 'holding',
  'internacional', 'mundial', 'global', 'asociados', 'socios', 'consultoría',
  'servicios', 'soluciones', 'sistemas', 'tecnologías', 'industrias', 'industrial',
  'ltda', 'limitada',
  # International/European suffixes (commonly used in Spanish-speaking countries)
  'gmbh', 'ag', 'bv', 'nv', 'oy', 'ab', 'sas', 'spa', 'pte', 'kg', 'kgaa',
].freeze

# A signature with more non-empty lines than this is discarded.
MAX_SIGNATURE_LINES = 12
# Lines this long end the signature; a longer joined address is rejected.
MAX_SENTENCE_LENGTH = 150
# Words must be shorter than this to be considered part of a name.
MAX_NAME_WORD_LENGTH = 15
# Accepted range for the number of digits in a phone number.
MIN_PHONE_DIGITS = 8
MAX_PHONE_DIGITS = 15
# Minimum similarity between signature text and the email domain.
COMPANY_NAME_SIMILARITY_THRESHOLD = 0.6
# Similarity a signature word must exceed to match the sender name.
NAME_SIMILARITY_THRESHOLD = 0.7
90
+
91
+
92
# Starts the parser with no signature lines and no name recovered from a
# signature.
def initialize
  @parsed_name_from_signature = ''
  @signature_sentences = []
end
96
+
97
# Tells whether +email_body+ matches any of the MEETING_DOMAINS patterns.
#
# @param email_body [String, nil] plain-text body to inspect
# @return [Boolean]
def meeting_email?(email_body)
  MEETING_DOMAINS.each do |pattern|
    return true if email_body =~ pattern
  end
  false
end
100
+
101
# Finds the sender's signature inside +email_body+ and stores its non-empty
# lines in @signature_sentences as hashes with :isSignature, :doc and :type.
#
# The signature starts at the last short line (after the first break in the
# body) that matches the sender name, or a combination of words that rebuilds
# the email prefix when no name is given. If neither is found, it starts
# after the closest break before the line containing the email address.
#
# Returns early, leaving @signature_sentences untouched, when the body has no
# break after its first content line or when no start line can be determined.
#
# @param name [String, nil] sender display name ("" or nil when unknown)
# @param email_address [String, nil] sender address
# @param email_body [String] plain-text body
# @raise [InvalidEmailError] when +email_body+ is nil or empty
def get_signature_sentences(name, email_address, email_body)
  if email_body.nil? || email_body.empty?
    raise InvalidEmailError, "No email body provided"
  end

  name = name.to_s
  email_address = email_address.to_s

  # "\r\n" must be tried first: splitting on "\r" and "\n" separately turns
  # every CRLF line ending into a phantom blank line.
  email_sentences = email_body.split(/\r\n|\t|\n|\r/).map { |s| s.strip }

  # Record every run of blank/separator lines that comes after some content.
  break_indexes = []
  pending_breaks = 0
  found_content = false
  email_sentences.each_with_index do |sentence, index|
    unless found_content
      found_content = true if sentence.split(" ").size > 2
      next
    end

    if sentence == "" || sentence.count('*') > 5 || sentence.count('-') > 5 || sentence.count('_') > 5
      # empty lines and separator rows are breaks
      pending_breaks += 1
    else
      break_indexes << index - 1 if pending_breaks >= 1
      pending_breaks = 0
    end
  end

  # No breaks or separators found, assume no signature
  return if break_indexes.empty?

  first_break_index = break_indexes.first

  sentences = []

  splitted_name = []
  email_prefix = ""
  if !name.empty?
    splitted_name = name.downcase.gsub(/[^\w\s]/, ' ').gsub(/\s+/, ' ').split(" ")
  elsif !email_address.empty?
    email_prefix = email_address.split("@").first.to_s.downcase
  end

  signature_start_index = -1
  found_email_index = -1
  found_name_index = -1
  email_sentences.each_with_index do |sentence, index|
    sentences << { isSignature: false, doc: nil, type: 'unknown' }
    # Lines with links, and lines before the first break, are never analyzed
    next if sentence.include?("http") || sentence.include?("www") || index < first_break_index

    downcased_sentence = sentence.downcase
    # Dots and @ are kept so usernames like name.surname34 are not split
    sentence_words = downcased_sentence.gsub(/[^\w\s\.\@]/, ' ').gsub(/\s+/, ' ').split(" ")

    if sentence_words.size < 8 && sentence_words.size > 0
      # Discard long words (cant be a name) and words with digits
      filtered_words = sentence_words.filter { |w| w.size < MAX_NAME_WORD_LENGTH && !w.match?(/\d/) }

      similarities = splitted_name.map do |name_word|
        filtered_words.map { |word| word.similarity(name_word) }.max
      end
      sim_max = similarities.max || 0

      if sim_max < NAME_SIMILARITY_THRESHOLD && !email_prefix.empty?
        # Try to rebuild the email prefix from the words of the line, using
        # the common patterns:
        #   {first}  {first}{last}  {first}.{last}  {f}{last}  {f}.{last}
        #   {first}{middle}{last}  {first}.{middle}.{last}
        #   {f}.{middle}.{last}  {f}{m}{last}
        filtered_words.each_with_index do |word, word_index|
          following = filtered_words[(word_index + 1)..]
          next unless word[0] == email_prefix[0] && following.size > 0

          second = following.first
          first_last_candidates = [
            word,                     # first@company
            word + second,            # firstlast@company
            word + "." + second,      # first.last@company
            word[0] + second,         # flast@company
            word[0] + "." + second,   # f.last@company
          ]

          if first_last_candidates.include?(email_prefix)
            sim_max = 1.0
            @parsed_name_from_signature = word.capitalize + " " + second.capitalize
          end

          next unless following.size > 1

          third = following[1]
          first_middle_last_candidates = [
            # There may be a middle name that is not used in the email
            word,                                   # first@company
            word + third,                           # firstlast@company
            word + "." + third,                     # first.last@company
            word[0] + third,                        # flast@company
            word[0] + "." + third,                  # f.last@company

            word + second + third,                  # firstmiddlelast@company
            word + "." + second + "." + third,      # first.middle.last@company
            word[0] + "." + second + "." + third,   # f.middle.last@company
            word[0] + second[0] + third             # fmlast@company
          ]

          if first_middle_last_candidates.include?(email_prefix)
            sim_max = 1.0
            @parsed_name_from_signature = word.capitalize + " " + second.capitalize + " " + third.capitalize
          end
        end
      end

      if sim_max > NAME_SIMILARITY_THRESHOLD
        signature_start_index = index
        found_name_index = index
      end
    end

    # An empty address is contained in every string, so it must not count
    if found_email_index == -1 && !email_address.empty? && downcased_sentence.include?(email_address)
      found_email_index = index
    end
  end

  if signature_start_index == -1
    # No name or email found, assume no signature
    return if found_email_index == -1

    # Start after the closest break that precedes the line with the email
    closest_break_index = break_indexes.select { |i| i < found_email_index }.max
    if !closest_break_index.nil? && (closest_break_index - found_email_index).abs < 8
      signature_start_index = closest_break_index + 1
    end

    # Without a start line the whole body would be flagged as signature
    return if signature_start_index == -1
  end

  signature_end_index = -1

  email_sentences.each_with_index do |sentence, index|
    next if index < signature_start_index

    sentence_no_links = sentence.gsub(/<a href(.*?)\/a>/, '')
    separator_or_long = sentence_no_links.count('*') > 5 || sentence_no_links.count('-') > 5 ||
                        sentence_no_links.size >= MAX_SENTENCE_LENGTH
    # Two blank lines in a row are a big break: assume end of signature
    big_break = sentence == "" && email_sentences[index + 1] == ""
    if separator_or_long || big_break
      signature_end_index = index
      break
    end
  end

  sentences.each_with_index do |sentence, index|
    sentence[:doc] = email_sentences[index].strip
    sentence[:type] = 'name' if index == found_name_index

    if index >= signature_start_index && (signature_end_index == -1 || index < signature_end_index)
      sentence[:isSignature] = true
    end
  end

  @signature_sentences = sentences.filter { |s| s[:isSignature] && !s[:doc].nil? && !s[:doc].empty? }
  if @signature_sentences.size > MAX_SIGNATURE_LINES
    # Unlikely to be a signature if it has more than 12 lines
    @signature_sentences = []
  end
end
299
+
300
# Builds the postal address found in the signature.
#
# Every sentence still typed 'unknown' is parsed with Postal::Parser and the
# labels it returns are counted. A sentence becomes part of the address when
# it is the first one containing a country, or when a city/state sentence is
# close to a street, country or postcode sentence. Matching sentences get
# their :type set to 'address'.
#
# @return [String] the address parts joined with ", ", or "" when none was
#   found or when the joined text is longer than MAX_SENTENCE_LENGTH (in that
#   case the sentences are reset to 'unknown')
def get_address
  city_or_state_indexes = []
  post_code_indexes = []
  country_index = -1
  street_indexes = []

  @signature_sentences.each_with_index do |sentence, index|
    next if sentence[:type] != 'unknown'

    # Only these labels are counted; any other label is ignored
    address_parts = {
      house: 0,
      house_number: 0,
      road: 0,
      postcode: 0,
      city_district: 0,
      city: 0,
      state_district: 0,
      state: 0,
      country_region: 0,
      country: 0
    }

    # remove links from the sentence before parsing
    possible_address = sentence[:doc].gsub(/<a href(.*?)\/a>/, '')
    parsed_address = Postal::Parser.parse_address(possible_address)

    parsed_address.each do |val|
      address_parts[val[:label]] += 1 if address_parts[val[:label]]
    end

    has_long_number = Utils.max_consecutive_digits(possible_address) > 9
    if has_long_number && (!address_parts[:country].positive? || !address_parts[:state].positive?)
      # looks like a number, not part of address, discard
      next
    end

    if index > 0 && (address_parts[:city] > 0 || address_parts[:state] > 0)
      city_or_state_indexes << index
    end
    if index > 0 && address_parts[:country] > 0 && country_index < 0
      country_index = index
      @signature_sentences[country_index][:type] = 'address'
    end
    if index > 0 && address_parts[:postcode] > 0
      post_code_indexes << index
    end

    if (address_parts[:road] > 0 && address_parts[:house] > 0) ||
       (address_parts[:road] > 0 && address_parts[:house_number] > 0) ||
       (address_parts[:house] > 0 && address_parts[:house_number] > 0)
      street_indexes << index
    end
  end

  city_or_state_indexes.each do |city_index|
    street_indexes.each do |street_index|
      if (city_index - street_index).abs < 2
        @signature_sentences[city_index][:type] = 'address'
        @signature_sentences[street_index][:type] = 'address'
      end
    end
    if country_index > 0 && (city_index - country_index).abs < 3
      @signature_sentences[city_index][:type] = 'address'
    end
    post_code_indexes.each do |post_code_index|
      if (city_index - post_code_index).abs < 3
        @signature_sentences[city_index][:type] = 'address'
        @signature_sentences[post_code_index][:type] = 'address'
      end
    end
  end

  address_sentences = @signature_sentences.filter { |sentence| sentence[:type] == 'address' }
  # Drop links, leading "Label:" prefixes and layout characters
  address_text = address_sentences.map { |sentence|
    sentence[:doc].gsub(/<a href(.*?)\/a>/, '').gsub(/.*?:/, ' ').gsub(/[\|><]/, ' ')
  }.join(", ")

  if address_text.size > MAX_SENTENCE_LENGTH
    address_sentences.each { |s| s[:type] = 'unknown' }
    return ""
  end

  # Collapse whitespace, then turn any run of commas (even when separated by
  # spaces, as left behind by an emptied part) into a single comma
  address_text = address_text.gsub(/\s+/, ' ').gsub(/(?:\s*,)+/, ',')

  return address_text.strip
end
389
+
390
# Extracts the phone numbers found in the signature, skipping its first line.
#
# A line may hold several numbers separated by "|", "●" or ",". For each one
# the text before (or, failing that, after) the number decides the type, an
# extension such as "ext. 12" or "x12" is split off, and the leading digits
# are looked up in PHONES_TO_COUNTRY. Lines that yield a candidate number get
# their :type set to 'phone'.
#
# @return [Array<Hash>] hashes with :type, :phone_number and, when known,
#   :country and :extension
def get_phones
  phones = []

  # common extension patterns: ext. 1234, x1234, x.1234, ext1234
  extension_regex = /\b(ext\.?|extension)[\.\:\s]*\d+|(?<!\w)x[\.\:\s]*\d+/i
  # One-letter abbreviations ("m:", "o:"...) only count when they are not the
  # tail of a longer word, so "Tlf:" is not a fax and "Teléfono:" is not an office.
  mobile_label = /(?<![[:alpha:]])[mc]:|mobile|movil|móvil|cell/
  office_label = /(?<![[:alpha:]])[ow]:|office|oficina|work|trabajo/
  fax_label = /(?<![[:alpha:]])f:|fax/
  direct_label = /(?<![[:alpha:]])d:/

  @signature_sentences[1..]&.each_with_index do |sentence, index|
    # Sometimes multiple phones are in the same line separated by | or ●
    # They can also be in the same line as Telephone 1123123 Mobile 123123123
    text = sentence[:doc].gsub(/<a href(.*?)\/a>/, '')

    phone_texts = text.scan(/(.*?\+?\d+(?:[\s\-\(\)\.]+\d+)*(?:\s*(?:\([^)]+\)|[A-Za-z][^,\n]*))?)|[\|●\,]/)
                      .flatten.compact.map { |pt| pt.gsub(/[\|●\,]/, '').strip }.reject(&:empty?)

    phone_texts.each do |phone_text|
      extension = nil
      extension_match = phone_text.match(extension_regex)
      if extension_match
        extension = extension_match[0].gsub(/[^\d]/, '').strip
        phone_text = phone_text.gsub(extension_regex, '').strip
      end

      max_consecutive_digits = Utils.max_consecutive_digits(phone_text)
      if max_consecutive_digits < MIN_PHONE_DIGITS || max_consecutive_digits > MAX_PHONE_DIGITS
        # Too few or too many consecutive digits to be a phone number, skip
        if !extension.nil? && !extension.empty? && phones.size > 0
          # They wrote the number like "123-123-124, ext. 1234". Assign the extension to the last phone found
          phones.last[:extension] = extension
        end
        next
      end

      # index counts from the second sentence, hence the +1
      @signature_sentences[index + 1][:type] = 'phone'

      phone_type = 'Phone'
      match_before_number = phone_text.match(/[^\d\+\(\)]*(?=[\d\+\(])/i)
      unless match_before_number.nil?
        label = match_before_number[0].downcase
        if label.match?(mobile_label)
          phone_type = 'Mobile'
        elsif label.match?(office_label)
          phone_type = 'Office'
        elsif label.match?(fax_label)
          phone_type = 'Fax'
        elsif (label.include?('direct') && label.include?("line")) || label.match?(direct_label)
          phone_type = 'Direct Line'
        end

        # Cut out exactly the matched label. Replacing its text everywhere
        # would also strip separators from the number when the label is just
        # "-" or ".".
        phone_text = match_before_number.pre_match + match_before_number.post_match
      end

      if phone_type == 'Phone'
        text_after_last_digit = phone_text.match(/\d[^\d]*$/)&.to_s&.gsub(/^\d/, '')&.downcase || ''
        if !text_after_last_digit.empty?
          if text_after_last_digit.include?('mobile') || text_after_last_digit.include?('movil') ||
             text_after_last_digit.include?('móvil') || text_after_last_digit.include?('cell')
            phone_type = 'Mobile'
          elsif text_after_last_digit.include?('office') || text_after_last_digit.include?('oficina') ||
                text_after_last_digit.include?('work') || text_after_last_digit.include?('trabajo')
            phone_type = 'Office'
          elsif text_after_last_digit.include?('fax')
            phone_type = 'Fax'
          elsif text_after_last_digit.include?('direct') && text_after_last_digit.include?("line")
            phone_type = 'Direct Line'
          end
        end
      end

      # Get digits including ., spaces, (, ), +
      digits = phone_text.gsub(/[^\+\d\s\.\(\)-]/, '').gsub(/(\+\s+\+)|(\++)/, '+').gsub(/\s+/, ' ').strip
      digit_count = digits.gsub(/[^\d]/, '').size

      if digit_count < MIN_PHONE_DIGITS || digit_count > MAX_PHONE_DIGITS
        # Too small or long to be a phone number, skip
        next
      end

      country_code = digits.match(/^\+?(\d{1,3})/)&.captures&.first
      country = PHONES_TO_COUNTRY[country_code]
      if country.nil? && !country_code.nil? && digits.start_with?('+')
        # "+34911234567" has no separator after the code, so the three
        # captured digits overshoot it: retry with shorter prefixes.
        (country_code.size - 1).downto(1) do |length|
          country = PHONES_TO_COUNTRY[country_code[0, length]]
          break unless country.nil?
        end
      end

      phones << {
        type: phone_type,
        phone_number: digits,
        country: country,
        extension: extension
      }.compact
    end
  end

  return phones
end
482
+
483
# Collects the href targets found in the signature, skipping its first line.
#
# mailto: links are ignored. A link whose first host label (after an
# optional "www.") equals one of SOCIAL_MEDIA_URLS is stored under
# :social_media keyed by that site; every other link is appended to :other.
# Links that do not contain "http" are prefixed with "https://".
#
# @return [Hash] { social_media: { site_symbol => url }, other: [url, ...] }
def get_links()
  parsed_links = {
    social_media: {},
    other: []
  }

  @signature_sentences[1..]&.each do |sentence|
    # Capture each href value directly. Reading $& after the scan would
    # return the last match for every element, duplicating the final link.
    links_in_sentence = sentence[:doc].scan(/href="(.*?)"/).flatten

    links_in_sentence.each do |link|
      next if link.include?('mailto:')

      host = link.downcase.match(/https?:\/\/(www\.)?([^\/]+)/)&.captures&.last || ""
      site = host.split('.').first # Remove TLD
      full_link = link.include?("http") ? link : "https://#{link}"

      if SOCIAL_MEDIA_URLS.include?(site)
        parsed_links[:social_media][site.to_sym] = full_link
      else
        parsed_links[:other] << full_link
      end
    end
  end

  return parsed_links
end
517
+
518
# Looks for the company name and job title in the first three signature
# lines that are still typed 'unknown' or 'name' and contain no "@", "www"
# or "http".
#
# Each line is split on commas. A section is taken as the company name when
# one of its words is in COMMON_COMPANY_ENDINGS or when it (or one of its
# words) is similar enough to the second-level domain of the email address;
# the domain is ignored for the providers in COMMON_EMAIL_DOMAINS. A section
# containing a JOB_TITLES word is reported as a title, and words found in
# JOB_ACRONYMS as acronyms. Lines matched through the domain, a title or an
# acronym are re-typed 'company_name_or_job_title'.
#
# @param email_address [String] sender address
# @return [Array(String, Hash)] company name ("" when unknown) and a hash
#   with :title or :titles and :acronym or :acronyms when found
def get_company_name_and_job_title(email_address)
  company_name = []
  parsed_titles = []
  acronyms_found = []

  email_domain = email_address.split("@").last&.downcase || ""
  company_domain = ""

  unless COMMON_EMAIL_DOMAINS.any? { |common_domain| email_domain.include?(common_domain) }
    begin
      company_domain = PublicSuffix.parse(email_domain).sld.to_s
    rescue StandardError
      # Best effort: an unparsable domain just disables domain matching.
      # Only StandardError is rescued so interrupts and fatal errors propagate.
    end
  end

  @signature_sentences[0..2]&.each do |sentence|
    next if sentence[:type] != 'unknown' && sentence[:type] != 'name'

    text = sentence[:doc].strip
    next if text.include?("@") || text.include?("www") || text.include?("http")

    # Re-types the line, unless it already has a more specific type
    mark_sentence = lambda do
      sentence[:type] = 'company_name_or_job_title' if sentence[:type] == 'unknown'
    end

    text.split(',').each do |section|
      # [[:word:]] keeps accented letters; plain \w is ASCII-only and would
      # split words such as "compañía" so they could never match.
      clean_sentence = section.gsub(/[^[:word:]\s]/, ' ').gsub(/\s+/, ' ').strip
      splitted_text = clean_sentence.split(' ')

      if splitted_text.any? { |word| COMMON_COMPANY_ENDINGS.include?(word.downcase) }
        # strip: sections after a comma start with a space
        company_name = [section.strip]
      end

      if company_name.size == 0 && !company_domain.empty?
        # First attempt: the whole section, without spaces, against the domain
        sim = clean_sentence.downcase.gsub(/\s+/, '').similarity(company_domain)
        if sim >= COMPANY_NAME_SIMILARITY_THRESHOLD
          company_name = [section.strip]
          mark_sentence.call
        end

        if company_name.size == 0
          # Second attempt: each word against the domain
          splitted_text.each do |word|
            sim = word.downcase.similarity(company_domain)
            if sim == 1.0
              company_name << word.strip
              mark_sentence.call
              break
            end

            if sim >= COMPANY_NAME_SIMILARITY_THRESHOLD
              if splitted_text.size <= 3
                # Short section: take all of it as the company name
                company_name = splitted_text
                mark_sentence.call
                break
              else
                company_name << word.strip
                mark_sentence.call
              end
            end
          end
        end

        # A section that named the company is not searched for titles
        next if company_name.size > 0
      end

      # Now try to find job titles in the section
      title_found = false
      splitted_text.map(&:downcase).each do |word|
        if JOB_TITLES.include?(word)
          title_found = true
          mark_sentence.call
        end
      end

      section_text = section.gsub(/\s+/, ' ').strip
      acronyms = section_text.split(' ') & JOB_ACRONYMS
      if acronyms.size > 0
        acronyms_found += acronyms
        mark_sentence.call
      end

      parsed_titles << section_text if title_found
    end
  end

  job_title = {}

  if acronyms_found.size == 1
    job_title[:acronym] = acronyms_found.first
  elsif acronyms_found.size > 1
    job_title[:acronyms] = acronyms_found
  end

  if parsed_titles.size == 1
    job_title[:title] = parsed_titles.first
  elsif parsed_titles.size > 1
    job_title[:titles] = parsed_titles
  end

  return [company_name.uniq.join(" "), job_title]
end
644
+
645
# Parses the signature of one message and returns everything found in it.
#
# When +name+ is nil or empty, the name is taken from the words that rebuilt
# the email prefix (if any) or else from a 2-3 word line among the first two
# signature lines that are still typed 'unknown'.
#
# @param name [String, nil] sender display name
# @param email_address [String] sender address
# @param email_body [String] plain-text body of the message
# @return [Hash] :name, :email_address, :address, :phones, :links,
#   :job_title, :text (signature lines joined with "\n") and :company_name
# @raise [InvalidEmailError] when +email_body+ is nil or empty
def parse_signature(name, email_address, email_body)
  # Reset all per-message state, so a name recovered from a previous message
  # cannot leak into this result.
  @signature_sentences = []
  @parsed_name_from_signature = ''

  get_signature_sentences(name.to_s, email_address, email_body)

  # The order matters: each step re-types the sentences it recognizes
  address = get_address()
  phones = get_phones()
  links = get_links()
  company_name, job_title = get_company_name_and_job_title(email_address)

  parsed_name = name

  if name.nil? || name.empty?
    if !@parsed_name_from_signature.empty?
      parsed_name = @parsed_name_from_signature
    else
      # Try to extract name from the first two sentences of the signature
      @signature_sentences[0..1].filter { |s| s[:type] == 'unknown' }.each do |sentence|
        sentence_words = sentence[:doc].split(" ")
        if sentence_words.size > 1 && sentence_words.size < 4 &&
           sentence_words.all? { |word| word.size < MAX_NAME_WORD_LENGTH }
          parsed_name = sentence_words.map { |n| n.capitalize }.join(" ")
        end
      end
    end
  end

  parsed_data = {
    name: parsed_name,
    email_address: email_address,
    address: address,
    phones: phones,
    links: links,
    job_title: job_title,
    text: @signature_sentences.map { |sentence| sentence[:doc] }.join("\n"),
    company_name: company_name,
  }

  return parsed_data
end
685
+
686
# Cuts +text+ into the messages of a thread, one per reply marker.
#
# A marker is either "from:"/"de:" followed by text containing an "@", or a
# row of five or more dashes followed by a line starting with "from:"/"de:"
# and a word (English and Spanish). Every message after the first starts at
# its marker; each piece is stripped.
#
# @param text [String] plain-text body
# @return [Array<String>] the messages in order of appearance
def split_in_threads(text)
  # (from|de):\s.*@+.+) matches from followed by an email address
  # (-{5,}\s*?^(from|de):\s+\w+) matches lines that start with ----- followed by from and a name
  marker_regex = /((from|de):\s.*@+.+)|(-{5,}\s*?^(from|de):\s+\w+)/i

  marker_offsets = []
  text.scan(marker_regex) { marker_offsets << Regexp.last_match.begin(0) }

  # Consecutive boundaries delimit every message except the trailing one
  boundaries = [0] + marker_offsets
  messages = boundaries.each_cons(2).map do |from_offset, to_offset|
    text[from_offset...to_offset].strip
  end

  tail_offset = boundaries.last
  messages << text[tail_offset..].strip if tail_offset < text.size

  messages
end
708
+
709
# Drops everything from the first forwarded-message marker onwards.
#
# A marker is five or more dashes followed, on the same line, by
# "forwarded", "reenviado" or "original" (case-insensitive).
#
# @param text [String, nil] plain-text message
# @return [String, nil] the stripped text before the marker; nil or "" is
#   returned unchanged
def remove_forwarded_content(text)
  return text if text.nil? || text.empty?

  # Regex for forwarded message markers. Handles English and Spanish.
  forwarded_regex = /-{5,}.*?(forwarded|reenviado|original)/i
  marker_offset = text.index(forwarded_regex)
  kept_text = marker_offset.nil? ? text : text[0...marker_offset]
  kept_text.strip
end
717
+
718
# Converts the HTML part of an email into plain text.
#
# Only the span from the first "<html" to the last "</html>" is parsed; it
# is fed to an HtmlTextParser handler through Ox's SAX HTML parser.
#
# @param raw_html [String] raw email content containing an <html> document
# @return the handler's parsed_text after postprocess
# @raise [InvalidEmailError] when no <html>...</html> span is present
def parse_email_html(raw_html)
  # Gets only the html part of the email
  html_document = raw_html[/<html.*<\/html>/im]
  raise InvalidEmailError, "No HTML content found" if html_document.nil?

  sax_options = {
    symbolize: true,
    skip: :skip_white,
    smart: true,
    mode: :generic,
    effort: :tolerant,
  }

  handler = HtmlTextParser.new
  Ox.sax_html(handler, StringIO.new(html_document), sax_options)
  handler.postprocess

  handler.parsed_text
end
741
+
742
# Splits a From value into display name and email address.
#
# Accepts 'Name <email@example.com>' (the name may be wrapped in quotes) or
# a bare address. A name containing a word longer than 15 characters is
# discarded, since it is usually an undecoded header such as
# "=?utf-8?B?...?=".
#
# @param from [String, nil] From header value
# @return [Array(String, String)] name ("" when absent or discarded) and
#   the downcased email address ("" when +from+ is nil or empty)
def extract_name_and_email(from)
  return "", "" if from.nil? || from.empty?

  name = ""
  if from.include?("<") && from.include?(">")
    # From likely to include name and email address Name <email@example.com>
    # Only the wrapping quotes are removed: an apostrophe inside the name
    # (O'Brien) is part of it.
    name = from.split("<").first.to_s.strip.gsub(/\A["']+|["']+\z/, '').strip
    email_address = (from.match(/<(.+?)>/)&.captures&.first || "").strip
  else
    email_address = from.strip
  end

  # Name likely to be invalid when it contains very long words
  name = "" if name.split(" ").any? { |n| n.size > 15 }

  return name, email_address.downcase
end
766
+
767
# Parses the signature of the most recent message in a raw email file.
#
# The file is read as binary and transcoded to UTF-8 using the first
# charset declared in it, then parsed with the Mail gem. The HTML part is
# preferred; the text part is used when there is no HTML part.
#
# @param file_path [String] path of the raw email file
# @return [Hash] the result of parse_signature plus :signature_datetime
#   (the message date as a string)
# @raise [InvalidFromError] when the From header is missing or empty, or
#   contains "reply" or "mailer-daemon"
# @raise [InvalidEmailError] when the email has no usable body, is
#   multipart without parts, or looks like a meeting invite
def parse_from_file(file_path)
  # Sometimes when there are special characters in the email, the mail gem fails to parse it correctly
  file_content = File.open(file_path, 'rb') { |f| f.read }
  encoding = file_content[/charset="([^"]+)"/, 1] || file_content[/charset=([^;\s]+)/, 1]
  if !encoding.nil? && !encoding.empty?
    begin
      file_content = file_content.force_encoding(encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
    rescue ArgumentError, EncodingError
      # ArgumentError: the declared charset is not a known encoding name.
      # Fallback: force UTF-8 and clean invalid bytes
      file_content = file_content.force_encoding('UTF-8').scrub('?')
    end
  end

  m = Mail.new(file_content)
  from = m.header["From"]&.field&.value
  if from.nil? || from.empty?
    raise InvalidFromError, "No From field in email"
  end

  if from.downcase.include?("reply") || from.downcase.include?("mailer-daemon")
    raise InvalidFromError, "No valid From address"
  end

  name, email_address = extract_name_and_email(from)

  text_part = ""

  if m.multipart?
    if m.parts.size > 0
      if !m.html_part.nil?
        # Prefer the charset declared by the HTML part; otherwise keep the
        # one found in the raw file
        if m.html_part.content_type_parameters && !m.html_part.content_type_parameters['charset'].nil? && !m.html_part.content_type_parameters['charset'].empty?
          encoding = m.html_part.content_type_parameters['charset']
        elsif !m.html_part.charset.nil? && !m.html_part.charset.empty?
          encoding = m.html_part.charset
        end

        html_part = m.html_part.decode_body
        begin
          html_part = html_part.force_encoding(encoding.nil? || encoding.empty? ? 'UTF-8' : encoding)
        rescue ArgumentError
          # Unknown charset name: treat the bytes as UTF-8
          html_part = html_part.force_encoding('UTF-8')
        end

        begin
          html_part = html_part.encode('UTF-8')
        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
          html_part = html_part.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
        end
        text_part = parse_email_html(html_part)
      else
        text_part = m.text_part ? m.text_part.decode_body : ""
      end
    else
      # If mail detected multipart, but no parts raise error
      raise InvalidEmailError, "Multipart email with no parts"
    end
  else
    raw_content = m.decode_body
    if raw_content.include?("<html")
      text_part = parse_email_html(raw_content)
    else
      text_part = raw_content
    end
  end

  if text_part.nil? || text_part.empty?
    raise InvalidEmailError, "No email body"
  end

  if meeting_email?(text_part)
    raise InvalidEmailError, "Meeting email detected, no signature to parse"
  end

  messages = split_in_threads(text_part)

  # Only parse the first message in the thread, as its the most recent one and the one sent by the from address
  signature = parse_signature(name, email_address.downcase, remove_forwarded_content(messages.first))
  signature[:signature_datetime] = m.date.to_s
  return signature
end
847
+
848
# Parses the signature of the most recent message of an HTML email.
#
# @param from [String] From value, 'Name <email>' or a bare address
# @param email_html_body [String] HTML body of the email
# @return [Hash] the result of parse_signature
# @raise [InvalidFromError] when +from+ is nil or empty
# @raise [InvalidEmailError] when the body is nil or empty, or the email
#   looks like a meeting invite
def parse_from_html(from, email_html_body)
  raise InvalidFromError, "No from provided" if from.nil? || from.empty?
  raise InvalidEmailError, "No email body provided" if email_html_body.nil? || email_html_body.empty?

  sender_name, sender_email = extract_name_and_email(from)
  plain_text = parse_email_html(email_html_body)

  raise InvalidEmailError, "Email appears to be a meeting invite" if meeting_email?(plain_text)

  # Only the first message of the thread is parsed
  latest_message = split_in_threads(plain_text).first
  parse_signature(sender_name, sender_email.downcase, remove_forwarded_content(latest_message))
end
869
+
870
# Parses the signature of the most recent message of a plain-text email.
#
# @param from [String] From value, 'Name <email>' or a bare address
# @param email_body [String] plain-text body of the email
# @return [Hash] the result of parse_signature
# @raise [InvalidFromError] when +from+ is nil or empty
# @raise [InvalidEmailError] when the body is nil or empty, or the email
#   looks like a meeting invite
def parse_from_text(from, email_body)
  if from.nil? || from.empty?
    raise InvalidFromError, "No from provided"
  end

  # Same validation as parse_from_html: without it a nil body fails later
  # with NoMethodError instead of a parser error.
  if email_body.nil? || email_body.empty?
    raise InvalidEmailError, "No email body provided"
  end

  if meeting_email?(email_body)
    raise InvalidEmailError, "Email appears to be a meeting invite"
  end

  name, email_address = extract_name_and_email(from)
  messages = split_in_threads(email_body)

  # Only the first message of the thread is parsed
  parse_signature(name, email_address.downcase, remove_forwarded_content(messages.first))
end
884
+ end
885
+ end