datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,781 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
# For type checkers only - these imports are always available during type checking
|
|
6
|
+
from pyspark.sql import Column
|
|
7
|
+
from pyspark.sql import functions as F
|
|
8
|
+
else:
|
|
9
|
+
# At runtime, handle missing PySpark gracefully
|
|
10
|
+
try:
|
|
11
|
+
from pyspark.sql import Column
|
|
12
|
+
from pyspark.sql import functions as F
|
|
13
|
+
except ImportError:
|
|
14
|
+
# PySpark is not installed - functions will fail at runtime if called
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
# Try local utils import first (for generated code)
|
|
19
|
+
from utils.primitives import PrimitiveRegistry
|
|
20
|
+
except ImportError:
|
|
21
|
+
# Fall back to installed datacompose package
|
|
22
|
+
from datacompose.operators.primitives import PrimitiveRegistry
|
|
23
|
+
|
|
24
|
+
emails = PrimitiveRegistry("emails")
|
|
25
|
+
|
|
26
|
+
# Common email domain typo mappings.
# Written as (correct domain, misspellings) groups and flattened into a
# typo -> correct lookup; iteration order follows the groups top to bottom.
DOMAIN_TYPO_MAPPINGS = {
    typo: correct
    for correct, typos in (
        # Gmail typos
        (
            "gmail.com",
            (
                "gmai.com",
                "gmial.com",
                "gmaill.com",
                "gmail.co",
                "gmail.cm",
                "gmal.com",
                "g-mail.com",
                "gmailcom",
            ),
        ),
        # Yahoo typos
        (
            "yahoo.com",
            ("yahooo.com", "yaho.com", "yahoo.co", "yahoo.cm", "yhoo.com"),
        ),
        ("ymail.com", ("ymail.co",)),
        # Hotmail/Outlook typos
        (
            "hotmail.com",
            (
                "hotmial.com",
                "hotmall.com",
                "hotmai.com",
                "hotmail.co",
                "hotmail.cm",
            ),
        ),
        ("hotmail.co.uk", ("hotmial.co.uk",)),
        ("outlook.com", ("outlok.com", "outlook.co", "outlookcom")),
        # AOL typos
        ("aol.com", ("aol.co", "aol.cm", "ao.com")),
        # ISP typos
        ("comcast.net", ("comcast.ent",)),
        ("verizon.net", ("verizon.ent",)),
        ("sbcglobal.net", ("sbcglobal.ent",)),
        ("att.net", ("att.ent",)),
        ("charter.net", ("charter.ent",)),
        ("cox.net", ("cox.ent",)),
    )
    for typo in typos
}
|
|
66
|
+
|
|
67
|
+
# TLD typo mappings.
# Keys and values both carry the leading dot; grouped by the TLD they correct to.
TLD_TYPO_MAPPINGS = {
    typo: correct
    for correct, typos in (
        (".com", (".cmo", ".ocm", ".con")),
        (".net", (".ent", ".nte", ".ten")),
        (".org", (".rg", ".rog")),
    )
    for typo in typos
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ============================================================================
|
|
81
|
+
# Core Email Extraction Functions
|
|
82
|
+
# ============================================================================
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@emails.register()
def extract_email(col: Column) -> Column:
    """
    Pull the first email-looking substring out of a text column.

    Args:
        col: Column of free text that may contain an email address

    Returns:
        Column holding the first match of the email pattern, or an empty
        string (null inputs are mapped to an empty string as well)
    """
    # local part, "@", dotted domain, then an alphabetic TLD of 2+ letters
    pattern = r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
    first_match = F.regexp_extract(col, pattern, 1)

    # Nulls are replaced explicitly so callers always get a string back.
    return F.when(col.isNull(), F.lit("")).otherwise(first_match)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@emails.register()
def extract_all_emails(col: Column) -> Column:
    """
    Extract all email addresses from text as an array.

    The text is split on runs of commas, semicolons and whitespace, and only
    the tokens that are, in full, an email address are kept.

    Args:
        col: Column containing text with potential email addresses

    Returns:
        Column with array of email addresses
    """
    # Anchored so a token must be an email address in its entirety.
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    # Built from Column functions rather than an interpolated SQL string:
    #   * formatting the private ``col._jc`` into F.expr() only round-trips
    #     for plain column references, not arbitrary expressions;
    #   * inside a SQL string literal the parser consumes backslashes by
    #     default, so ``\s`` and ``\.`` did not reach the regex engine intact.
    tokens = F.split(col, r"[,;\s]+")
    return F.filter(tokens, lambda token: token.rlike(email_pattern))
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@emails.register()
def extract_username(col: Column) -> Column:
    """
    Return the local part of an email address (the text before the "@").

    Args:
        col: Column containing email address

    Returns:
        Column with the leading run of non-"@" characters that is followed
        by an "@", or an empty string (also used for null input)
    """
    local_part = F.regexp_extract(col, r"^([^@]+)@", 1)
    return F.when(col.isNull(), F.lit("")).otherwise(local_part)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@emails.register()
def extract_domain(col: Column) -> Column:
    """
    Return the domain of an email address (the text after the final "@").

    Args:
        col: Column containing email address

    Returns:
        Column with the trailing run of non-"@" characters that follows an
        "@", or an empty string (also used for null input)
    """
    host = F.regexp_extract(col, r"@([^@]+)$", 1)
    return F.when(col.isNull(), F.lit("")).otherwise(host)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@emails.register()
def extract_domain_name(col: Column) -> Column:
    """
    Return the domain label that sits between the "@" and the next dot.

    Args:
        col: Column containing email address

    Returns:
        Column with the domain name (e.g., "gmail" from "user@gmail.com"),
        or an empty string for null input
    """
    # The character class excludes both "." and "@", so the capture stops
    # at the first dot after the "@".
    label = F.regexp_extract(col, r"@([^.@]+)\.", 1)
    return F.when(col.isNull(), F.lit("")).otherwise(label)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@emails.register()
def extract_tld(col: Column) -> Column:
    """
    Return the part of the domain that follows its first dot.

    Args:
        col: Column containing email address

    Returns:
        Column with TLD (e.g., "com", "co.uk"), or an empty string for
        null input
    """
    # Skip "@" plus the first domain label, then capture everything up to
    # the end of the string; this keeps multi-part suffixes such as
    # "co.uk" or "com.au" together.
    suffix = F.regexp_extract(col, r"@[^.@]+\.(.+)$", 1)
    return F.when(col.isNull(), F.lit("")).otherwise(suffix)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ============================================================================
|
|
195
|
+
# Email Validation Functions
|
|
196
|
+
# ============================================================================
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@emails.register()
def is_valid_email(col: Column, min_length: int = 6, max_length: int = 254) -> Column:
    """
    Decide whether a value is shaped like a valid email address.

    Args:
        col: Column containing email address
        min_length: Minimum length for valid email
        max_length: Maximum length for valid email

    Returns:
        Boolean Column; False for null input, otherwise True only when
        every check below passes
    """
    # Basic overall shape: local part, "@", dotted domain, alphabetic TLD
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    local_part = extract_username(col)
    total_length = F.length(col)

    checks = [
        col.rlike(email_pattern),
        total_length >= F.lit(min_length),
        total_length <= F.lit(max_length),
        F.length(local_part) <= F.lit(64),  # RFC 5321 username max length
        ~col.rlike(r"\.\."),  # no consecutive dots anywhere
        ~col.rlike(r"^\."),  # does not start with a dot
        ~local_part.rlike(r"\.$"),  # username does not end with a dot
        ~col.rlike(r"\.@"),  # no dot directly before the "@"
    ]

    # AND the checks together in the order listed.
    verdict = checks[0]
    for check in checks[1:]:
        verdict = verdict & check

    return F.when(col.isNull(), F.lit(False)).otherwise(verdict)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
@emails.register()
def is_valid_username(col: Column, min_length: int = 1, max_length: int = 64) -> Column:
    """
    Validate the username (local part) of an email address.

    Args:
        col: Column containing email address
        min_length: Minimum length for valid username (default 1)
        max_length: Maximum length for valid username (default 64 per RFC)

    Returns:
        Boolean Column that is True when the extracted username satisfies
        the length bounds and the dot-placement rules
    """
    local_part = extract_username(col)
    size = F.length(local_part)

    checks = [
        local_part.isNotNull(),
        size >= F.lit(min_length),
        size <= F.lit(max_length),
        ~local_part.rlike(r"^\."),  # does not start with a dot
        ~local_part.rlike(r"\.$"),  # does not end with a dot
        ~local_part.rlike(r"\.\."),  # no consecutive dots
    ]

    verdict = checks[0]
    for check in checks[1:]:
        verdict = verdict & check
    return verdict
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
@emails.register()
def is_valid_domain(col: Column) -> Column:
    """
    Validate the domain part of an email address.

    Args:
        col: Column containing email address

    Returns:
        Boolean Column that is True when the extracted domain is non-empty,
        at most 253 characters, matches the domain pattern and passes the
        hyphen/dot placement rules
    """
    host = extract_domain(col)
    size = F.length(host)

    checks = [
        host.isNotNull(),
        size > 0,
        size <= 253,
        host.rlike(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"),
        ~host.rlike(r"^-"),  # does not start with a hyphen
        ~host.rlike(r"-\."),  # no hyphen directly before a dot
        ~host.rlike(r"\.\."),  # no consecutive dots
    ]

    verdict = checks[0]
    for check in checks[1:]:
        verdict = verdict & check
    return verdict
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
@emails.register()
def has_plus_addressing(col: Column) -> Column:
    """
    Tell whether an email uses plus addressing (e.g., user+tag@gmail.com).

    Args:
        col: Column containing email address

    Returns:
        Boolean Column; True when a "+" appears before the first "@",
        False otherwise and for null input
    """
    plus_before_at = col.rlike(r"^[^@]*\+[^@]*@")
    return F.when(col.isNull(), F.lit(False)).otherwise(plus_before_at)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@emails.register()
def is_disposable_email(
    col: Column, disposable_domains: Optional[List[str]] = None
) -> Column:
    """
    Check if email is from a disposable email service.

    The email's domain is compared case-insensitively against the list.

    Args:
        col: Column containing email address
        disposable_domains: List of disposable domains to check against.
            ``None`` selects the built-in list; an explicit empty list
            means no domain is treated as disposable.

    Returns:
        Column with boolean indicating if email is disposable
    """
    # Common disposable email domains
    default_disposable = [
        "10minutemail.com",
        "guerrillamail.com",
        "mailinator.com",
        "temp-mail.org",
        "throwaway.email",
        "yopmail.com",
        "tempmail.com",
        "trashmail.com",
        "getnada.com",
    ]

    # ``is not None`` rather than ``or``: an explicit empty list must not be
    # silently replaced by the defaults (same convention as
    # is_corporate_email uses for its provider list).
    domains_to_check = (
        disposable_domains if disposable_domains is not None else default_disposable
    )
    if not domains_to_check:
        return F.lit(False)

    domain = F.lower(extract_domain(col))
    return domain.isin([candidate.lower() for candidate in domains_to_check])
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@emails.register()
def is_corporate_email(
    col: Column, free_providers: Optional[List[str]] = None
) -> Column:
    """
    Flag emails whose domain is not one of the known free email providers.

    Args:
        col: Column containing email address
        free_providers: Free email provider domains to exclude; when None
            the built-in list below is used

    Returns:
        Boolean Column; False when no domain can be extracted, otherwise
        True only if the domain differs from every listed provider

    Examples:
        # Built-in free provider list
        df.withColumn("is_corp", emails.is_corporate_email(F.col("email")))

        # Caller-supplied list of domains that should not count as corporate
        custom_free = ["company-internal.com", "contractor-email.com"]
        df.withColumn("is_corp", emails.is_corporate_email(F.col("email"), custom_free))
    """
    if free_providers is not None:
        excluded = free_providers
    else:
        # Common free email providers
        excluded = [
            "gmail.com",
            "yahoo.com",
            "hotmail.com",
            "outlook.com",
            "aol.com",
            "icloud.com",
            "mail.com",
            "protonmail.com",
            "ymail.com",
            "live.com",
            "msn.com",
            "me.com",
        ]

    host = extract_domain(col)

    # Domain must differ (case-insensitively) from every excluded provider.
    not_free = F.lit(True)
    for name in excluded:
        not_free = not_free & (F.lower(host) != name.lower())

    missing_domain = host.isNull() | (host == "")
    return F.when(missing_domain, F.lit(False)).otherwise(not_free)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# ============================================================================
|
|
383
|
+
# Email Cleaning Functions
|
|
384
|
+
# ============================================================================
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@emails.register()
def remove_whitespace(col: Column) -> Column:
    """
    Strip every whitespace character from an email address.

    Args:
        col: Column containing email address

    Returns:
        Column with all whitespace runs deleted; an empty string for
        null input
    """
    squeezed = F.regexp_replace(col, r"\s+", "")
    return F.when(col.isNull(), F.lit("")).otherwise(squeezed)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
@emails.register()
def lowercase_email(col: Column) -> Column:
    """
    Lower-case the whole email address (username and domain).

    Args:
        col: Column containing email address

    Returns:
        Column with the lowercased value; an empty string for null input
    """
    lowered = F.lower(col)
    return F.when(col.isNull(), F.lit("")).otherwise(lowered)
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
@emails.register()
def lowercase_domain(col: Column) -> Column:
    """
    Lower-case only the domain, leaving the username's case untouched.

    Args:
        col: Column containing email address

    Returns:
        Column with the address rebuilt as username + "@" + lowercased
        domain; values that are null or contain no "@" are returned as-is
    """
    not_an_address = col.isNull() | ~col.contains("@")
    rebuilt = F.concat(
        extract_username(col), F.lit("@"), F.lower(extract_domain(col))
    )
    return F.when(not_an_address, col).otherwise(rebuilt)
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
@emails.register()
def remove_plus_addressing(col: Column) -> Column:
    """
    Drop the "+tag" portion of an email (user+tag@gmail.com -> user@gmail.com).

    Args:
        col: Column containing email address

    Returns:
        Column with the text from "+" up to the "@" removed; an empty
        string for null input
    """
    # The "@" is captured and written back so only the "+tag" part goes away.
    without_tag = F.regexp_replace(col, r"\+[^@]*(@)", "$1")
    return F.when(col.isNull(), F.lit("")).otherwise(without_tag)
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
@emails.register()
def remove_dots_from_gmail(col: Column) -> Column:
    """
    Strip dots from the username of Gmail addresses (Gmail ignores them).

    Args:
        col: Column containing email address

    Returns:
        Column where gmail.com / googlemail.com addresses have the dots
        removed from the username; every other value is returned unchanged
    """
    local_part = extract_username(col)
    host = extract_domain(col)

    not_an_address = col.isNull() | ~col.contains("@")
    is_gmail = F.lower(host).isin(["gmail.com", "googlemail.com"])
    dotless = F.concat(F.regexp_replace(local_part, r"\.", ""), F.lit("@"), host)

    # Only Gmail-hosted addresses are rewritten.
    return F.when(not_an_address, col).when(is_gmail, dotless).otherwise(col)
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
@emails.register()
def fix_common_typos(
    col: Column,
    custom_mappings: Optional[Dict[str, str]] = None,
    custom_tld_mappings: Optional[Dict[str, str]] = None,
) -> Column:
    """
    Fix common domain typos in email addresses.

    Whole-domain typos are looked up first; TLD typos are then corrected at
    the end of the resulting domain. Both steps match case-insensitively.

    Args:
        col: Column containing email address
        custom_mappings: Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
        custom_tld_mappings: Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)

    Returns:
        Column with typos fixed; values that are null or contain no "@"
        are returned unchanged

    Examples:
        # Use default typo fixes
        df.withColumn("fixed", emails.fix_common_typos(F.col("email")))

        # Add custom domain typo mappings
        custom_domains = {
            "company.con": "company.com",
            "mycompany.co": "mycompany.com",
            "gmai.com": "gmail.com"  # Override default mapping
        }
        df.withColumn("fixed", emails.fix_common_typos(F.col("email"), custom_domains))

        # Add custom TLD mappings
        custom_tlds = {
            ".coom": ".com",
            ".nett": ".net"
        }
        df.withColumn("fixed", emails.fix_common_typos(
            F.col("email"),
            custom_tld_mappings=custom_tlds
        ))
    """
    domain = extract_domain(col)
    username = extract_username(col)
    domain_lower = F.lower(domain)

    # Combine default and custom mappings (custom entries win on key clashes)
    all_domain_mappings = {**DOMAIN_TYPO_MAPPINGS, **(custom_mappings or {})}
    all_tld_mappings = {**TLD_TYPO_MAPPINGS, **(custom_tld_mappings or {})}

    # Build case statement for whole-domain typo fixes
    fixed_domain = domain
    for typo, correct in all_domain_mappings.items():
        fixed_domain = F.when(
            domain_lower == typo.lower(), F.lit(correct)
        ).otherwise(fixed_domain)

    # Also fix TLD typos. "(?i)" makes the match case-insensitive so that
    # e.g. "EXAMPLE.CON" is corrected just like "example.con", in line with
    # the case-insensitive whole-domain lookup above.
    for typo, correct in all_tld_mappings.items():
        fixed_domain = F.regexp_replace(
            fixed_domain, "(?i)" + re.escape(typo) + r"$", correct
        )

    return F.when(col.isNull() | ~col.contains("@"), col).otherwise(
        F.concat(username, F.lit("@"), fixed_domain)
    )
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
# ============================================================================
|
|
538
|
+
# Email Standardization Functions
|
|
539
|
+
# ============================================================================
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
@emails.register()
def standardize_email(
    col: Column,
    lowercase: bool = True,
    remove_dots_gmail: bool = True,
    remove_plus: bool = False,
    fix_typos: bool = True,
) -> Column:
    """
    Apply standard email cleaning and normalization.

    Steps, in order: strip whitespace, lowercase (whole address, or the
    domain only when ``lowercase`` is False), fix domain typos, optionally
    remove plus addressing, optionally remove dots from Gmail usernames,
    then validate.

    Args:
        col: Column containing email address
        lowercase: Convert to lowercase
        remove_dots_gmail: Remove dots from Gmail addresses
        remove_plus: Remove plus addressing
        fix_typos: Fix common domain typos

    Returns:
        Column with standardized email, or an empty string when the cleaned
        value does not pass is_valid_email
    """
    result = remove_whitespace(col)

    # Lowercase before typo fixing: the TLD typo patterns are written in
    # lowercase, so running them on a not-yet-lowercased domain would let
    # inputs such as "USER@EXAMPLE.CON" through uncorrected.
    if lowercase:
        result = lowercase_email(result)
    else:
        # At least lowercase the domain
        result = lowercase_domain(result)

    if fix_typos:
        result = fix_common_typos(result)

    if remove_plus:
        result = remove_plus_addressing(result)

    if remove_dots_gmail:
        result = remove_dots_from_gmail(result)

    # Only return valid emails
    return F.when(is_valid_email(result), result).otherwise(F.lit(""))
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
@emails.register()
def normalize_gmail(col: Column) -> Column:
    """
    Normalize Gmail addresses only (dots and plus tags removed, lowercased).

    Args:
        col: Column containing email address

    Returns:
        Column where gmail.com / googlemail.com addresses are replaced by
        their standardize_email result; all other values pass through
        unchanged
    """
    is_gmail = F.lower(extract_domain(col)).isin(["gmail.com", "googlemail.com"])
    normalized = standardize_email(
        col, lowercase=True, remove_dots_gmail=True, remove_plus=True
    )
    return F.when(is_gmail, normalized).otherwise(col)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
@emails.register()
def get_canonical_email(col: Column) -> Column:
    """
    Produce the canonical form of an email address for deduplication.

    Delegates to standardize_email with every normalization switched on.

    Args:
        col: Column containing email address

    Returns:
        Column with canonical email form
    """
    every_option_on = dict(
        lowercase=True,
        remove_dots_gmail=True,
        remove_plus=True,
        fix_typos=True,
    )
    return standardize_email(col, **every_option_on)
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
# ============================================================================
|
|
623
|
+
# Email Information Extraction
|
|
624
|
+
# ============================================================================
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
@emails.register()
def extract_name_from_email(col: Column) -> Column:
    """
    Attempt to extract person's name from email username.
    E.g., john.smith@example.com -> "John Smith"

    Digits are dropped, a leading generic word (info, admin, support, ...)
    is removed when it stands on its own, separators become spaces and each
    word is capitalized.

    Args:
        col: Column containing email address

    Returns:
        Column with extracted name, or empty string when the result is not
        2-50 characters of letters and whitespace
    """
    username = extract_username(col)

    # Remove numbers
    cleaned = F.regexp_replace(username, r"[0-9]+", "")

    # Remove a generic leading word only when it is a whole token, i.e.
    # followed by a separator or the end of the username. Without the
    # lookahead, names that merely start with one of these words were
    # mangled ("hilary" -> "lary", "salesforce" -> "force").
    cleaned = F.regexp_replace(
        cleaned,
        r"^(info|admin|support|sales|contact|hello|hi|hey)(?=[._-]|$)",
        "",
    )

    # Replace separators with spaces
    name = F.regexp_replace(cleaned, r"[._-]+", " ")

    # Capitalize words
    name = F.initcap(F.trim(name))

    # Only return if it looks like a name (has letters, reasonable length)
    return F.when(
        (F.length(name) >= 2) & (F.length(name) <= 50) & name.rlike(r"^[A-Za-z\s]+$"),
        name,
    ).otherwise(F.lit(""))
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
@emails.register()
def get_email_provider(col: Column) -> Column:
    """
    Translate an email's domain into a provider display name.

    Args:
        col: Column containing email address

    Returns:
        Column with the provider name, or "Other" when the lowercased
        domain is not one of the known domains
    """
    host = F.lower(extract_domain(col))

    # Provider display name -> domains that belong to it
    known_providers = (
        ("Gmail", ("gmail.com", "googlemail.com")),
        ("Yahoo", ("yahoo.com", "ymail.com")),
        ("Hotmail", ("hotmail.com",)),
        ("Outlook", ("outlook.com", "live.com", "msn.com")),
        ("AOL", ("aol.com",)),
        ("iCloud", ("icloud.com", "me.com", "mac.com")),
        ("ProtonMail", ("protonmail.com", "proton.me")),
    )

    # Start from the fallback and wrap one WHEN per known domain around it.
    label = F.lit("Other")
    for provider, provider_domains in known_providers:
        for known_domain in provider_domains:
            label = F.when(host == known_domain, F.lit(provider)).otherwise(label)

    return label
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
@emails.register()
def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column:
    """
    Obscure an email address for privacy (e.g., joh***@gm***.com).

    Args:
        col: Column containing email address
        mask_char: Character to use for masking
        keep_chars: Number of characters to keep at start

    Returns:
        Column with the username and domain name masked and the TLD left
        readable; values that are null or contain no "@" are returned as-is
    """
    mask = mask_char * 3

    def _mask_part(part):
        # Keep the first keep_chars characters only when the part is longer
        # than that; shorter parts are replaced by the mask entirely.
        return F.when(
            F.length(part) > keep_chars,
            F.concat(F.substring(part, 1, keep_chars), F.lit(mask)),
        ).otherwise(F.lit(mask))

    hidden_user = _mask_part(extract_username(col))
    hidden_host = _mask_part(extract_domain_name(col))
    suffix = extract_tld(col)

    not_an_address = col.isNull() | ~col.contains("@")
    masked = F.concat(hidden_user, F.lit("@"), hidden_host, F.lit("."), suffix)
    return F.when(not_an_address, col).otherwise(masked)
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
# ============================================================================
|
|
736
|
+
# Email Filtering Functions
|
|
737
|
+
# ============================================================================
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
@emails.register()
def filter_valid_emails(col: Column) -> Column:
    """
    Keep the email when it passes is_valid_email; null it out otherwise.

    Args:
        col: Column containing email address

    Returns:
        Column with valid email or null
    """
    passes = is_valid_email(col)
    nothing = F.lit(None)
    return F.when(passes, col).otherwise(nothing)
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
@emails.register()
def filter_corporate_emails(col: Column) -> Column:
    """
    Keep the email when is_corporate_email accepts it; null it out otherwise.

    Args:
        col: Column containing email address

    Returns:
        Column with corporate email or null
    """
    looks_corporate = is_corporate_email(col)
    nothing = F.lit(None)
    return F.when(looks_corporate, col).otherwise(nothing)
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
@emails.register()
def filter_non_disposable_emails(col: Column) -> Column:
    """
    Keep the email unless it is null, empty, or from a disposable domain.

    Args:
        col: Column containing email address

    Returns:
        Column with non-disposable email or null
    """
    has_value = col.isNotNull() & (col != "")
    keep = has_value & ~is_disposable_email(col)
    return F.when(keep, col).otherwise(F.lit(None))
|