datacompose 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31) hide show
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,781 @@
1
+ import re
2
+ from typing import TYPE_CHECKING, Dict, List, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ # For type checkers only - these imports are always available during type checking
6
+ from pyspark.sql import Column
7
+ from pyspark.sql import functions as F
8
+ else:
9
+ # At runtime, handle missing PySpark gracefully
10
+ try:
11
+ from pyspark.sql import Column
12
+ from pyspark.sql import functions as F
13
+ except ImportError:
14
+ # PySpark is not installed - functions will fail at runtime if called
15
+ pass
16
+
17
+ try:
18
+ # Try local utils import first (for generated code)
19
+ from utils.primitives import PrimitiveRegistry
20
+ except ImportError:
21
+ # Fall back to installed datacompose package
22
+ from datacompose.operators.primitives import PrimitiveRegistry
23
+
24
# Registry instance named "emails"; every primitive defined below is attached
# to it through the @emails.register() decorator.
emails = PrimitiveRegistry("emails")
25
+
26
# Common email domain typo mappings (misspelled domain -> intended domain).
# Declared as (intended domain, known misspellings) groups and flattened into
# a dict; insertion order follows the order of the groups below.
DOMAIN_TYPO_MAPPINGS = {
    misspelled: intended
    for intended, misspellings in (
        # Gmail typos
        (
            "gmail.com",
            (
                "gmai.com",
                "gmial.com",
                "gmaill.com",
                "gmail.co",
                "gmail.cm",
                "gmal.com",
                "g-mail.com",
                "gmailcom",
            ),
        ),
        # Yahoo typos
        (
            "yahoo.com",
            ("yahooo.com", "yaho.com", "yahoo.co", "yahoo.cm", "yhoo.com"),
        ),
        ("ymail.com", ("ymail.co",)),
        # Hotmail/Outlook typos
        (
            "hotmail.com",
            (
                "hotmial.com",
                "hotmall.com",
                "hotmai.com",
                "hotmail.co",
                "hotmail.cm",
            ),
        ),
        ("hotmail.co.uk", ("hotmial.co.uk",)),
        ("outlook.com", ("outlok.com", "outlook.co", "outlookcom")),
        # AOL typos
        ("aol.com", ("aol.co", "aol.cm", "ao.com")),
        # ISP typos
        ("comcast.net", ("comcast.ent",)),
        ("verizon.net", ("verizon.ent",)),
        ("sbcglobal.net", ("sbcglobal.ent",)),
        ("att.net", ("att.ent",)),
        ("charter.net", ("charter.ent",)),
        ("cox.net", ("cox.ent",)),
    )
    for misspelled in misspellings
}
66
+
67
# TLD typo mappings (misspelled suffix -> intended suffix, leading dot included).
# Declared as (intended TLD, known misspellings) groups and flattened into a dict.
TLD_TYPO_MAPPINGS = {
    misspelled: intended
    for intended, misspellings in (
        (".com", (".cmo", ".ocm", ".con")),
        (".net", (".ent", ".nte", ".ten")),
        (".org", (".rg", ".rog")),
    )
    for misspelled in misspellings
}
78
+
79
+
80
+ # ============================================================================
81
+ # Core Email Extraction Functions
82
+ # ============================================================================
83
+
84
+
85
@emails.register()
def extract_email(col: Column) -> Column:
    """
    Pull the first email-looking substring out of free text.

    Args:
        col: Column of text that may contain an email address

    Returns:
        Column holding the first match, or an empty string when the input is
        null or nothing matches
    """
    # Loose pattern: local part, "@", dotted domain, alphabetic TLD (2+ chars)
    email_pattern = r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
    first_match = F.regexp_extract(col, email_pattern, 1)
    return F.when(col.isNotNull(), first_match).otherwise(F.lit(""))
101
+
102
+
103
@emails.register()
def extract_all_emails(col: Column) -> Column:
    """
    Extract all email addresses from text as an array.

    The text is tokenized on runs of commas, semicolons and whitespace, and
    only tokens that are entirely an email address are kept.

    The expression is built with the Column API instead of an interpolated
    SQL string. The SQL parser unescapes backslashes inside string literals by
    default, which would turn ``\\s`` into the letter ``s`` and ``\\.`` into
    "any character". Interpolating the column's string form also breaks for
    anything but a trivially named column.

    Note: relies on ``pyspark.sql.functions.filter`` (PySpark 3.1+).

    Args:
        col: Column containing text with potential email addresses

    Returns:
        Column with array of email addresses (null when the input is null)
    """
    # A token must match the whole pattern to count as an email address
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    # Collapse every delimiter run to a single space, then split on spaces
    tokens = F.split(F.regexp_replace(col, r"[,;\s]+", " "), " ")

    # Leading/trailing delimiters yield empty tokens; the filter drops them
    return F.filter(tokens, lambda token: token.rlike(email_pattern))
126
+
127
+
128
@emails.register()
def extract_username(col: Column) -> Column:
    """
    Return the local part (everything before the first "@") of an email.

    Args:
        col: Column containing email address

    Returns:
        Column with the local part, or an empty string for null input or
        input without a non-empty prefix before "@"
    """
    local_part = F.regexp_extract(col, r"^([^@]+)@", 1)
    return F.when(col.isNotNull(), local_part).otherwise(F.lit(""))
142
+
143
+
144
@emails.register()
def extract_domain(col: Column) -> Column:
    """
    Return the domain (everything after the last "@") of an email.

    Args:
        col: Column containing email address

    Returns:
        Column with the domain, or an empty string for null input or input
        with nothing after an "@"
    """
    domain_part = F.regexp_extract(col, r"@([^@]+)$", 1)
    return F.when(col.isNotNull(), domain_part).otherwise(F.lit(""))
158
+
159
+
160
@emails.register()
def extract_domain_name(col: Column) -> Column:
    """
    Return the first domain label, i.e. the text between "@" and the next dot.

    Args:
        col: Column containing email address

    Returns:
        Column with the domain name (e.g., "gmail" from "user@gmail.com"),
        or an empty string for null or non-matching input
    """
    first_label = F.regexp_extract(col, r"@([^.@]+)\.", 1)
    return F.when(col.isNotNull(), first_label).otherwise(F.lit(""))
174
+
175
+
176
@emails.register()
def extract_tld(col: Column) -> Column:
    """
    Return everything after the first dot of the domain.

    Because the capture runs to the end of the string, multi-part suffixes
    such as "co.uk" or "com.au" are returned whole.

    Args:
        col: Column containing email address

    Returns:
        Column with TLD (e.g., "com", "co.uk"), or an empty string for null
        or non-matching input
    """
    # Skip "@" plus the first domain label, then capture the remainder
    suffix = F.regexp_extract(col, r"@[^.@]+\.(.+)$", 1)
    return F.when(col.isNotNull(), suffix).otherwise(F.lit(""))
192
+
193
+
194
+ # ============================================================================
195
+ # Email Validation Functions
196
+ # ============================================================================
197
+
198
+
199
@emails.register()
def is_valid_email(col: Column, min_length: int = 6, max_length: int = 254) -> Column:
    """
    Decide whether a value looks like a well-formed email address.

    This is a pragmatic format check (pattern, length limits and dot
    placement), not a full RFC parser.

    Args:
        col: Column containing email address
        min_length: Minimum length for valid email
        max_length: Maximum length for valid email

    Returns:
        Column with boolean indicating validity (False for null input)
    """
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    local_part = extract_username(col)
    total_length = F.length(col)

    # Every rule must hold for the address to be considered valid
    rules = [
        col.rlike(email_pattern),
        total_length >= F.lit(min_length),
        total_length <= F.lit(max_length),
        F.length(local_part) <= F.lit(64),  # RFC 5321 username max length
        ~col.rlike(r"\.\."),  # No consecutive dots anywhere
        ~col.rlike(r"^\."),  # Doesn't start with dot
        ~local_part.rlike(r"\.$"),  # Username doesn't end with dot
        ~col.rlike(r"\.@"),  # No dot before @
    ]

    all_rules_hold = rules[0]
    for rule in rules[1:]:
        all_rules_hold = all_rules_hold & rule

    return F.when(col.isNull(), F.lit(False)).otherwise(all_rules_hold)
228
+
229
+
230
@emails.register()
def is_valid_username(col: Column, min_length: int = 1, max_length: int = 64) -> Column:
    """
    Validate the local part (text before "@") of an email address.

    Args:
        col: Column containing email address
        min_length: Minimum length for valid username (default 1)
        max_length: Maximum length for valid username (default 64 per RFC)

    Returns:
        Column with boolean indicating username validity
    """
    local_part = extract_username(col)
    local_length = F.length(local_part)

    rules = [
        local_part.isNotNull(),
        local_length >= F.lit(min_length),
        local_length <= F.lit(max_length),
        ~local_part.rlike(r"^\."),  # Doesn't start with dot
        ~local_part.rlike(r"\.$"),  # Doesn't end with dot
        ~local_part.rlike(r"\.\."),  # No consecutive dots
    ]

    verdict = rules[0]
    for rule in rules[1:]:
        verdict = verdict & rule
    return verdict
253
+
254
+
255
@emails.register()
def is_valid_domain(col: Column) -> Column:
    """
    Validate the domain part (text after "@") of an email address.

    Args:
        col: Column containing email address

    Returns:
        Column with boolean indicating domain validity
    """
    domain_part = extract_domain(col)
    domain_length = F.length(domain_part)

    rules = [
        domain_part.isNotNull(),
        domain_length > 0,
        domain_length <= 253,
        domain_part.rlike(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"),
        ~domain_part.rlike(r"^-"),  # Doesn't start with hyphen
        ~domain_part.rlike(r"-\."),  # No hyphen before dot
        ~domain_part.rlike(r"\.\."),  # No consecutive dots
    ]

    verdict = rules[0]
    for rule in rules[1:]:
        verdict = verdict & rule
    return verdict
277
+
278
+
279
@emails.register()
def has_plus_addressing(col: Column) -> Column:
    """
    Detect plus addressing (e.g., user+tag@gmail.com) in the local part.

    Args:
        col: Column containing email address

    Returns:
        Column with boolean indicating plus addressing usage (False for null)
    """
    # A "+" must appear before the first "@"
    plus_before_at = col.rlike(r"^[^@]*\+[^@]*@")
    return F.when(col.isNull(), F.lit(False)).otherwise(plus_before_at)
291
+
292
+
293
@emails.register()
def is_disposable_email(
    col: Column, disposable_domains: Optional[List[str]] = None
) -> Column:
    """
    Check if email is from a disposable email service.

    The comparison is case-insensitive on both the email's domain and the
    supplied domain list.

    Args:
        col: Column containing email address
        disposable_domains: List of disposable domains to check against.
            ``None`` selects the built-in list; an explicit list (even an
            empty one) replaces it, mirroring ``is_corporate_email``.

    Returns:
        Column with boolean indicating if email is disposable
    """
    # Common disposable email domains
    default_disposable = [
        "10minutemail.com",
        "guerrillamail.com",
        "mailinator.com",
        "temp-mail.org",
        "throwaway.email",
        "yopmail.com",
        "tempmail.com",
        "trashmail.com",
        "getnada.com",
    ]

    # Only fall back to the defaults when no list was supplied at all; an
    # empty list means "nothing is disposable" rather than "use defaults".
    domains_to_check = (
        disposable_domains if disposable_domains is not None else default_disposable
    )

    # Lowercase once instead of once per candidate domain
    domain = F.lower(extract_domain(col))

    # OR together one equality test per candidate domain
    conditions = F.lit(False)
    for disposable_domain in domains_to_check:
        conditions = conditions | (domain == disposable_domain.lower())

    return conditions
329
+
330
+
331
@emails.register()
def is_corporate_email(
    col: Column, free_providers: Optional[List[str]] = None
) -> Column:
    """
    Flag emails whose domain is not a known free email provider.

    Args:
        col: Column containing email address
        free_providers: Free-provider domains to test against. When given,
            this list is used instead of the built-in one.

    Returns:
        Column with boolean indicating if email is corporate (False when no
        domain can be extracted)

    Examples:
        # Use default free provider list
        df.withColumn("is_corp", emails.is_corporate_email(F.col("email")))

        # Check against a custom provider list
        custom_free = ["company-internal.com", "contractor-email.com"]
        df.withColumn("is_corp", emails.is_corporate_email(F.col("email"), custom_free))
    """
    if free_providers is not None:
        providers_to_check = free_providers
    else:
        # Common free email providers
        providers_to_check = [
            "gmail.com",
            "yahoo.com",
            "hotmail.com",
            "outlook.com",
            "aol.com",
            "icloud.com",
            "mail.com",
            "protonmail.com",
            "ymail.com",
            "live.com",
            "msn.com",
            "me.com",
        ]

    email_domain = extract_domain(col)
    lowered_domain = F.lower(email_domain)

    # Corporate means: differs from every free-provider domain
    not_free = F.lit(True)
    for free_domain in providers_to_check:
        not_free = not_free & (lowered_domain != free_domain.lower())

    missing_domain = email_domain.isNull() | (email_domain == "")
    return F.when(missing_domain, F.lit(False)).otherwise(not_free)
380
+
381
+
382
+ # ============================================================================
383
+ # Email Cleaning Functions
384
+ # ============================================================================
385
+
386
+
387
@emails.register()
def remove_whitespace(col: Column) -> Column:
    """
    Strip every whitespace character from an email address.

    Args:
        col: Column containing email address

    Returns:
        Column with whitespace removed (empty string for null input)
    """
    compacted = F.regexp_replace(col, r"\s+", "")
    return F.when(col.isNotNull(), compacted).otherwise(F.lit(""))
399
+
400
+
401
@emails.register()
def lowercase_email(col: Column) -> Column:
    """
    Lowercase the whole email address, local part included.

    Args:
        col: Column containing email address

    Returns:
        Column with lowercased email (empty string for null input)
    """
    lowered = F.lower(col)
    return F.when(col.isNotNull(), lowered).otherwise(F.lit(""))
413
+
414
+
415
@emails.register()
def lowercase_domain(col: Column) -> Column:
    """
    Lowercase only the domain, leaving the local part's case untouched.

    Args:
        col: Column containing email address

    Returns:
        Column with domain lowercased; values that are null or contain no
        "@" are returned unchanged
    """
    # Nothing to split when there is no "@" (or no value at all)
    not_an_address = col.isNull() | ~col.contains("@")

    rebuilt = F.concat(
        extract_username(col),
        F.lit("@"),
        F.lower(extract_domain(col)),
    )
    return F.when(not_an_address, col).otherwise(rebuilt)
432
+
433
+
434
@emails.register()
def remove_plus_addressing(col: Column) -> Column:
    """
    Drop the "+tag" portion of an email (e.g., user+tag@gmail.com -> user@gmail.com).

    Args:
        col: Column containing email address

    Returns:
        Column with plus addressing removed (empty string for null input)
    """
    # Remove from "+" up to the "@", putting the captured "@" back
    without_tag = F.regexp_replace(col, r"\+[^@]*(@)", "$1")
    return F.when(col.isNotNull(), without_tag).otherwise(F.lit(""))
448
+
449
+
450
@emails.register()
def remove_dots_from_gmail(col: Column) -> Column:
    """
    Strip dots from the local part of Gmail addresses (Gmail ignores them).

    Args:
        col: Column containing email address

    Returns:
        Column with dots removed from Gmail usernames; non-Gmail, null and
        "@"-less values are returned unchanged
    """
    local_part = extract_username(col)
    domain_part = extract_domain(col)

    not_an_address = col.isNull() | ~col.contains("@")
    is_gmail = F.lower(domain_part).isin(["gmail.com", "googlemail.com"])
    dotless = F.concat(
        F.regexp_replace(local_part, r"\.", ""), F.lit("@"), domain_part
    )

    # First matching branch wins: pass-through, then Gmail rewrite, else as-is
    return F.when(not_an_address, col).when(is_gmail, dotless).otherwise(col)
473
+
474
+
475
@emails.register()
def fix_common_typos(
    col: Column,
    custom_mappings: Optional[Dict[str, str]] = None,
    custom_tld_mappings: Optional[Dict[str, str]] = None,
) -> Column:
    """
    Fix common domain typos in email addresses.

    Whole-domain typos are matched case-insensitively against the domain, and
    TLD typos are matched case-insensitively against the end of the domain.
    TLD matching ignores case so that addresses which have not been
    lowercased yet (e.g. "USER@EXAMPLE.CON") are corrected the same way
    whole-domain typos are.

    Args:
        col: Column containing email address
        custom_mappings: Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
        custom_tld_mappings: Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)

    Returns:
        Column with typos fixed; values that are null or contain no "@" are
        returned unchanged

    Examples:
        # Use default typo fixes
        df.withColumn("fixed", emails.fix_common_typos(F.col("email")))

        # Add custom domain typo mappings
        custom_domains = {
            "company.con": "company.com",
            "mycompany.co": "mycompany.com",
            "gmai.com": "gmail.com"  # Override default mapping
        }
        df.withColumn("fixed", emails.fix_common_typos(F.col("email"), custom_domains))

        # Add custom TLD mappings
        custom_tlds = {
            ".coom": ".com",
            ".nett": ".net"
        }
        df.withColumn("fixed", emails.fix_common_typos(
            F.col("email"),
            custom_tld_mappings=custom_tlds
        ))
    """
    domain = extract_domain(col)
    username = extract_username(col)
    lowered_domain = F.lower(domain)

    # Combine default and custom mappings (custom entries win on key clashes)
    all_domain_mappings = {**DOMAIN_TYPO_MAPPINGS, **(custom_mappings or {})}
    all_tld_mappings = {**TLD_TYPO_MAPPINGS, **(custom_tld_mappings or {})}

    # Build a nested CASE expression covering every whole-domain typo
    fixed_domain = domain
    for typo, correct in all_domain_mappings.items():
        fixed_domain = F.when(
            lowered_domain == typo.lower(), F.lit(correct)
        ).otherwise(fixed_domain)

    # Also fix TLD typos. "(?i)" makes the suffix match case-insensitive;
    # the typo is escaped so its leading dot is matched literally, and "$"
    # anchors the match to the end of the domain.
    for typo, correct in all_tld_mappings.items():
        fixed_domain = F.regexp_replace(
            fixed_domain, "(?i)" + re.escape(typo) + r"$", correct
        )

    return F.when(col.isNull() | ~col.contains("@"), col).otherwise(
        F.concat(username, F.lit("@"), fixed_domain)
    )
535
+
536
+
537
+ # ============================================================================
538
+ # Email Standardization Functions
539
+ # ============================================================================
540
+
541
+
542
@emails.register()
def standardize_email(
    col: Column,
    lowercase: bool = True,
    remove_dots_gmail: bool = True,
    remove_plus: bool = False,
    fix_typos: bool = True,
) -> Column:
    """
    Run the standard email cleaning pipeline and keep only valid results.

    Steps, in order: strip whitespace, optionally fix domain typos, lowercase
    (whole address, or just the domain when ``lowercase`` is False),
    optionally drop plus addressing, optionally drop dots from Gmail local
    parts, then validate.

    Args:
        col: Column containing email address
        lowercase: Convert to lowercase
        remove_dots_gmail: Remove dots from Gmail addresses
        remove_plus: Remove plus addressing
        fix_typos: Fix common domain typos

    Returns:
        Column with standardized email, or an empty string when the cleaned
        value does not pass is_valid_email
    """
    cleaned = remove_whitespace(col)

    if fix_typos:
        cleaned = fix_common_typos(cleaned)

    # The domain is always lowercased; the local part only on request
    cleaned = lowercase_email(cleaned) if lowercase else lowercase_domain(cleaned)

    if remove_plus:
        cleaned = remove_plus_addressing(cleaned)

    if remove_dots_gmail:
        cleaned = remove_dots_from_gmail(cleaned)

    # Only return valid emails
    passes_validation = is_valid_email(cleaned)
    return F.when(passes_validation, cleaned).otherwise(F.lit(""))
582
+
583
+
584
@emails.register()
def normalize_gmail(col: Column) -> Column:
    """
    Normalize Gmail addresses (remove dots, plus addressing, lowercase).

    Args:
        col: Column containing email address

    Returns:
        Column with normalized Gmail address; addresses on any other domain
        are returned unchanged
    """
    is_gmail = F.lower(extract_domain(col)).isin(["gmail.com", "googlemail.com"])
    normalized = standardize_email(
        col, lowercase=True, remove_dots_gmail=True, remove_plus=True
    )
    return F.when(is_gmail, normalized).otherwise(col)
603
+
604
+
605
@emails.register()
def get_canonical_email(col: Column) -> Column:
    """
    Produce the canonical form of an email address for deduplication.

    Delegates to standardize_email with every normalization switched on.

    Args:
        col: Column containing email address

    Returns:
        Column with canonical email form
    """
    every_normalization = {
        "lowercase": True,
        "remove_dots_gmail": True,
        "remove_plus": True,
        "fix_typos": True,
    }
    return standardize_email(col, **every_normalization)
620
+
621
+
622
+ # ============================================================================
623
+ # Email Information Extraction
624
+ # ============================================================================
625
+
626
+
627
@emails.register()
def extract_name_from_email(col: Column) -> Column:
    """
    Attempt to extract person's name from email username.
    E.g., john.smith@example.com -> "John Smith"

    Digits are dropped, a leading generic mailbox word (info, admin, ...) is
    removed, separators become spaces and each word is capitalized.

    Args:
        col: Column containing email address

    Returns:
        Column with extracted name or empty string
    """
    username = extract_username(col)

    # Remove numbers
    cleaned = F.regexp_replace(username, r"[0-9]+", "")

    # Remove a leading generic mailbox word, but only when it is a whole
    # token (followed by a separator or the end of the username). Without the
    # lookahead, real names that merely start with one of these words would
    # be truncated, e.g. "hiroshi.tanaka" -> "Roshi Tanaka".
    cleaned = F.regexp_replace(
        cleaned,
        r"^(info|admin|support|sales|contact|hello|hi|hey)(?=[._-]|$)",
        "",
    )

    # Replace separators with spaces
    name = F.regexp_replace(cleaned, r"[._-]+", " ")

    # Capitalize words
    name = F.initcap(F.trim(name))

    # Only return if it looks like a name (has letters, reasonable length)
    return F.when(
        (F.length(name) >= 2) & (F.length(name) <= 50) & name.rlike(r"^[A-Za-z\s]+$"),
        name,
    ).otherwise(F.lit(""))
658
+
659
+
660
@emails.register()
def get_email_provider(col: Column) -> Column:
    """
    Translate an email's domain into a provider display name.

    Args:
        col: Column containing email address

    Returns:
        Column with provider name, or "Other" for domains not in the table
    """
    # Known domains and the provider each belongs to
    provider_by_domain = {
        "gmail.com": "Gmail",
        "googlemail.com": "Gmail",
        "yahoo.com": "Yahoo",
        "ymail.com": "Yahoo",
        "hotmail.com": "Hotmail",
        "outlook.com": "Outlook",
        "live.com": "Outlook",
        "msn.com": "Outlook",
        "aol.com": "AOL",
        "icloud.com": "iCloud",
        "me.com": "iCloud",
        "mac.com": "iCloud",
        "protonmail.com": "ProtonMail",
        "proton.me": "ProtonMail",
    }

    lowered_domain = F.lower(extract_domain(col))

    # Wrap the running expression in one more CASE branch per known domain
    provider_name = F.lit("Other")
    for known_domain, label in provider_by_domain.items():
        matches = lowered_domain == known_domain
        provider_name = F.when(matches, F.lit(label)).otherwise(provider_name)

    return provider_name
698
+
699
+
700
@emails.register()
def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column:
    """
    Mask email address for privacy (e.g., john.smith@gmail.com -> joh***@gma***.com).

    The local part and the first domain label are each reduced to their first
    ``keep_chars`` characters followed by three mask characters. The rest of
    the domain (after its first dot) is left readable.

    Args:
        col: Column containing email address
        mask_char: Character to use for masking
        keep_chars: Number of characters to keep at start

    Returns:
        Column with masked email; values that are null or contain no "@" are
        returned unchanged
    """
    placeholder = F.lit(mask_char * 3)

    def _mask(part):
        # Parts no longer than keep_chars are hidden completely
        visible = F.substring(part, 1, keep_chars)
        return F.when(
            F.length(part) > keep_chars, F.concat(visible, placeholder)
        ).otherwise(placeholder)

    masked_local = _mask(extract_username(col))
    masked_label = _mask(extract_domain_name(col))
    suffix = extract_tld(col)

    not_an_address = col.isNull() | ~col.contains("@")
    masked = F.concat(masked_local, F.lit("@"), masked_label, F.lit("."), suffix)
    return F.when(not_an_address, col).otherwise(masked)
733
+
734
+
735
+ # ============================================================================
736
+ # Email Filtering Functions
737
+ # ============================================================================
738
+
739
+
740
@emails.register()
def filter_valid_emails(col: Column) -> Column:
    """
    Keep the email when it passes is_valid_email, otherwise yield null.

    Args:
        col: Column containing email address

    Returns:
        Column with valid email or null
    """
    passes = is_valid_email(col)
    return F.when(passes, col).otherwise(F.lit(None))
752
+
753
+
754
@emails.register()
def filter_corporate_emails(col: Column) -> Column:
    """
    Keep the email when is_corporate_email accepts it, otherwise yield null.

    Args:
        col: Column containing email address

    Returns:
        Column with corporate email or null
    """
    corporate = is_corporate_email(col)
    return F.when(corporate, col).otherwise(F.lit(None))
766
+
767
+
768
@emails.register()
def filter_non_disposable_emails(col: Column) -> Column:
    """
    Keep the email unless it is missing, empty or from a disposable provider.

    Args:
        col: Column containing email address

    Returns:
        Column with non-disposable email or null
    """
    has_value = col.isNotNull() & (col != "")
    keep = has_value & ~is_disposable_email(col)
    return F.when(keep, col).otherwise(F.lit(None))