datacompose 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31) hide show
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,941 @@
1
+ import re
2
+ from typing import TYPE_CHECKING, Dict, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ # For type checkers only - these imports are always available during type checking
6
+ from pyspark.sql import Column
7
+ from pyspark.sql import functions as F
8
+ else:
9
+ # At runtime, handle missing PySpark gracefully
10
+ try:
11
+ from pyspark.sql import Column
12
+ from pyspark.sql import functions as F
13
+ except ImportError:
14
+ # PySpark is not installed - functions will fail at runtime if called
15
+ pass
16
+
17
+ try:
18
+ # Try local utils import first (for generated code)
19
+ from utils.primitives import PrimitiveRegistry
20
+ except ImportError:
21
+ # Fall back to installed datacompose package
22
+ from datacompose.operators.primitives import PrimitiveRegistry
23
+
24
+ phones = PrimitiveRegistry("phones")
25
+
26
# Letter -> digit lookup for a standard telephone keypad (uppercase keys only).
# Built from (digit, letters) groups so each key of the pad appears once.
PHONE_KEYPAD_MAPPING = {
    letter: digit
    for digit, letters in (
        ("2", "ABC"),
        ("3", "DEF"),
        ("4", "GHI"),
        ("5", "JKL"),
        ("6", "MNO"),
        ("7", "PQRS"),
        ("8", "TUV"),
        ("9", "WXYZ"),
    )
    for letter in letters
}
37
+
38
+
39
+ # ============================================================================
40
+ # Core Phone Number Extraction Functions
41
+ # ============================================================================
42
+
43
+
44
@phones.register()
def extract_phone_from_text(col: Column) -> Column:
    """
    Pull the first phone-number-looking substring out of free text.

    Args:
        col: Column containing text with potential phone numbers

    Returns:
        Column with the first regex match, or "" when the input is null
        or nothing matches
    """
    # Optional country code, optional parenthesised 3-digit area code, then
    # 3 + 4 digits separated by "-", "." or whitespace, with an optional
    # trailing "ext"/"x" extension.
    pattern = r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
    first_match = F.regexp_extract(col, pattern, 0)
    is_missing = col.isNull()
    return F.when(is_missing, F.lit("")).otherwise(first_match)
62
+
63
+
64
@phones.register()
def extract_all_phones_from_text(col: Column) -> Column:
    """
    Return the phone numbers found in text as an array column.

    Only the first match is returned: the array holds at most one element,
    because this implementation reuses extract_phone_from_text.

    Args:
        col: Column containing text with potential phone numbers

    Returns:
        Column with a one-element array holding the first phone number,
        or an empty array when none is found
    """
    phone = extract_phone_from_text(col)
    found = phone != ""
    single = F.array(phone)
    empty = F.array()
    return F.when(found, single).otherwise(empty)
85
+
86
+
87
@phones.register()
def extract_digits(col: Column) -> Column:
    """
    Strip every non-digit character from a phone number string.

    Args:
        col: Column containing phone number

    Returns:
        Column with digits only ("" for null input)
    """
    digits_only = F.regexp_replace(col, r"[^\d]", "")
    is_missing = col.isNull()
    return F.when(is_missing, F.lit("")).otherwise(digits_only)
101
+
102
+
103
@phones.register()
def extract_extension(col: Column) -> Column:
    """
    Return the digits of an "ext" extension suffix, if one is present.

    Only the lowercase marker "ext" (optionally followed by ".") is
    recognised.

    Args:
        col: Column containing phone number

    Returns:
        Column with the extension digits, or "" when absent or null
    """
    ext_pattern = r"ext\.?\s*(\d+)"
    captured = F.regexp_extract(col, ext_pattern, 1)
    ext_or_blank = F.when(col.rlike(ext_pattern), captured).otherwise("")
    return F.when(col.isNull(), F.lit("")).otherwise(ext_or_blank)
120
+
121
+
122
@phones.register()
def extract_country_code(col: Column) -> Column:
    """
    Extract country code from phone number.

    Resolution order:
      1. A leading "+1" or "+7" yields "1" / "7". These are the only
         single-digit E.164 country codes, so a greedy match must not
         swallow the digits that follow (e.g. "+15551234567" is "1",
         not "155").
      2. Any other leading "+" yields the first 1-3 digits after it.
         Without a country-code table this is a best-effort guess when
         no separator follows the code.
      3. An 11-digit number starting with "1" is treated as NANP ("1").

    Args:
        col: Column containing phone number

    Returns:
        Column with country code or empty string (also "" for null input)
    """
    digits = extract_digits(col)
    nanp_with_prefix = (F.length(digits) == 11) & digits.startswith("1")

    code = (
        # Single-digit zones must be checked before the greedy pattern.
        F.when(col.rlike(r"^\+[17]"), F.regexp_extract(col, r"^\+([17])", 1))
        .when(col.rlike(r"^\+\d{1,3}"), F.regexp_extract(col, r"^\+(\d{1,3})", 1))
        .when(nanp_with_prefix, F.lit("1"))
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(code)
149
+
150
+
151
@phones.register()
def extract_area_code(col: Column) -> Column:
    """
    Return the 3-digit area code of a 10- or 11-digit phone number.

    Args:
        col: Column containing phone number

    Returns:
        Column with area code, or "" for null input or any other digit count
    """
    digits = extract_digits(col)
    n_digits = F.length(digits)
    after_country_code = F.substring(digits, 2, 3)  # 11 digits: skip the first one
    leading_three = F.substring(digits, 1, 3)
    area = (
        F.when(n_digits == 11, after_country_code)
        .when(n_digits == 10, leading_three)
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(area)
173
+
174
+
175
@phones.register()
def extract_exchange(col: Column) -> Column:
    """
    Return the exchange (the 3 digits after the area code) of a 10- or
    11-digit phone number.

    Args:
        col: Column containing phone number

    Returns:
        Column with exchange, or "" for null input or any other digit count
    """
    digits = extract_digits(col)
    n_digits = F.length(digits)
    from_eleven = F.substring(digits, 5, 3)  # country digit + area code precede it
    from_ten = F.substring(digits, 4, 3)  # only the area code precedes it
    exchange = (
        F.when(n_digits == 11, from_eleven)
        .when(n_digits == 10, from_ten)
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(exchange)
197
+
198
+
199
@phones.register()
def extract_subscriber(col: Column) -> Column:
    """
    Return the subscriber number (last 4 digits) of a 10- or 11-digit
    phone number.

    Args:
        col: Column containing phone number

    Returns:
        Column with subscriber number, or "" for null input or any other
        digit count
    """
    digits = extract_digits(col)
    n_digits = F.length(digits)
    from_eleven = F.substring(digits, 8, 4)
    from_ten = F.substring(digits, 7, 4)
    subscriber = (
        F.when(n_digits == 11, from_eleven)
        .when(n_digits == 10, from_ten)
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(subscriber)
221
+
222
+
223
@phones.register()
def extract_local_number(col: Column) -> Column:
    """
    Return the local number (exchange followed by subscriber digits).

    Args:
        col: Column containing phone number

    Returns:
        Column with the concatenated local number, or "" when either part
        could not be extracted
    """
    first_part = extract_exchange(col)
    last_part = extract_subscriber(col)
    both_present = (first_part != "") & (last_part != "")
    joined = F.concat(first_part, last_part)
    return F.when(both_present, joined).otherwise("")
241
+
242
+
243
+ # ============================================================================
244
+ # Phone Number Validation Functions
245
+ # ============================================================================
246
+
247
+
248
@phones.register()
def is_valid_nanp(col: Column) -> Column:
    """
    Check whether a phone number fits the NANP (North American Numbering
    Plan) shape used by this module.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating NANP validity (False for null input)
    """
    digits = extract_digits(col)
    n_digits = F.length(digits)

    length_ok = n_digits.isin([10, 11])
    # Area code: first digit 2-9, then any two digits.
    area_ok = extract_area_code(col).rlike(r"^[2-9]\d{2}$")
    # Exchange: first digit 1-9, then any two digits.
    # NOTE(review): strict NANP numbering uses 2-9 here; this check is looser.
    exchange_ok = extract_exchange(col).rlike(r"^[1-9]\d{2}$")
    # Subscriber: any four digits.
    subscriber_ok = extract_subscriber(col).rlike(r"^\d{4}$")
    # An 11-digit number is only accepted when its leading digit is 1.
    prefix_ok = (n_digits == 10) | (digits.startswith("1"))

    all_ok = length_ok & area_ok & exchange_ok & subscriber_ok & prefix_ok
    return F.when(col.isNull(), F.lit(False)).otherwise(all_ok)
275
+
276
+
277
@phones.register()
def is_valid_international(col: Column, min_length: int = 7, max_length: int = 15) -> Column:
    """
    Check whether a phone number has a plausible international length.

    This is a length check on the digits only; no country-specific rules
    are applied.

    Args:
        col: Column containing phone number
        min_length: Minimum number of digits accepted (inclusive)
        max_length: Maximum number of digits accepted (inclusive)

    Returns:
        Column with boolean indicating potential international validity
        (False for null input)
    """
    digits = extract_digits(col)
    n_digits = F.length(digits)
    long_enough = n_digits >= min_length
    short_enough = n_digits <= max_length
    only_digits = digits.rlike(r"^\d+$")
    plausible = long_enough & short_enough & only_digits
    return F.when(col.isNull(), F.lit(False)).otherwise(plausible)
297
+
298
+
299
@phones.register()
def is_valid_phone(col: Column) -> Column:
    """
    Check whether a phone number passes either the NANP or the
    international validity check.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating validity
    """
    nanp_ok = is_valid_nanp(col)
    international_ok = is_valid_international(col)
    return nanp_ok | international_ok
311
+
312
+
313
@phones.register()
def is_toll_free(col: Column) -> Column:
    """
    Check whether the area code is one of the toll-free codes
    (800, 888, 877, 866, 855, 844, 833).

    Only the area code is inspected; the rest of the number is not validated.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating if toll-free (False for null input)
    """
    toll_free_codes = ["800", "888", "877", "866", "855", "844", "833"]
    matches = extract_area_code(col).isin(toll_free_codes)
    return F.when(col.isNull(), F.lit(False)).otherwise(matches)
331
+
332
+
333
@phones.register()
def is_premium_rate(col: Column) -> Column:
    """
    Check whether the area code is the premium-rate code 900.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating if premium rate (False for null input)
    """
    is_900 = extract_area_code(col) == "900"
    return F.when(col.isNull(), F.lit(False)).otherwise(is_900)
349
+
350
+
351
@phones.register()
def has_extension(col: Column) -> Column:
    """
    Check whether the phone number contains an "ext" extension suffix.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating presence of extension
        (False for null input)
    """
    mentions_ext = col.rlike(r"ext\.?\s*\d+")
    return F.when(col.isNull(), F.lit(False)).otherwise(mentions_ext)
365
+
366
+
367
+ # ============================================================================
368
+ # Phone Number Cleaning Functions
369
+ # ============================================================================
370
+
371
+
372
@phones.register()
def remove_non_digits(col: Column) -> Column:
    """
    Drop every non-digit character from the phone number.

    Delegates to extract_digits, so the two functions give the same result.

    Args:
        col: Column containing phone number

    Returns:
        Column with only digits
    """
    digits_only = extract_digits(col)
    return digits_only
384
+
385
+
386
@phones.register()
def remove_extension(col: Column) -> Column:
    """
    Delete the "ext" extension suffix from a phone number.

    Whitespace that preceded the extension is left in place.

    Args:
        col: Column containing phone number

    Returns:
        Column with extension removed ("" for null input)
    """
    without_ext = F.regexp_replace(col, r"ext\.?\s*\d+", "")
    is_missing = col.isNull()
    return F.when(is_missing, F.lit("")).otherwise(without_ext)
400
+
401
+
402
@phones.register()
def convert_letters_to_numbers(col: Column) -> Column:
    """
    Convert phone letters to numbers (e.g., 1-800-FLOWERS to 1-800-3569377).

    Both upper- and lowercase letters are mapped through
    PHONE_KEYPAD_MAPPING; every other character is left unchanged.

    Args:
        col: Column containing phone number with letters

    Returns:
        Column with letters converted to numbers ("" for null input)
    """
    # One character-for-character translate replaces a chain of 52 nested
    # regexp_replace calls. The result is the same because each letter maps
    # to a digit and digits are never themselves remapped.
    upper_letters = "".join(PHONE_KEYPAD_MAPPING)
    keypad_digits = "".join(PHONE_KEYPAD_MAPPING.values())
    matching = upper_letters + upper_letters.lower()
    replacement = keypad_digits * 2

    translated = F.translate(col, matching, replacement)
    return F.when(col.isNull(), F.lit("")).otherwise(translated)
421
+
422
+
423
@phones.register()
def normalize_separators(col: Column) -> Column:
    """
    Rewrite the separators in a phone number as single hyphens.

    Parentheses are removed, and runs of whitespace or dots become hyphens.

    Args:
        col: Column containing phone number

    Returns:
        Column with normalized separators ("" for null input)
    """
    # Applied in order; each step works on the output of the previous one.
    substitutions = (
        (r"\(", ""),  # drop opening parenthesis
        (r"\)", " "),  # closing parenthesis becomes a space to keep groups apart
        (r"[\s\.]+", "-"),  # runs of whitespace/dots -> hyphen
        (r"-+", "-"),  # collapse repeated hyphens
        (r"^-+|-+$", ""),  # trim hyphens at either end
    )
    normalized = col
    for pattern, replacement in substitutions:
        normalized = F.regexp_replace(normalized, pattern, replacement)

    return F.when(col.isNull(), F.lit("")).otherwise(normalized)
446
+
447
+
448
@phones.register()
def add_country_code(col: Column) -> Column:
    """
    Prefix a valid 10-digit NANP number with country code "1".

    Args:
        col: Column containing phone number

    Returns:
        Column with "1" + digits for valid 10-digit NANP numbers, the bare
        digits for anything else, and the input itself (null) for null input
    """
    digits = extract_digits(col)
    needs_prefix = (F.length(digits) == 10) & is_valid_nanp(col)
    prefixed = F.concat(F.lit("1"), digits)
    with_code = F.when(needs_prefix, prefixed).otherwise(digits)
    return F.when(col.isNull(), col).otherwise(with_code)
467
+
468
+
469
+ # ============================================================================
470
+ # Phone Number Formatting Functions
471
+ # ============================================================================
472
+
473
+
474
@phones.register()
def format_nanp(col: Column) -> Column:
    """
    Render a NANP phone number in hyphen format (XXX-XXX-XXXX).

    An "ext" extension, when present, is appended as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or "" when the number
        (without its extension) is not valid NANP
    """
    # Validation and splitting happen on the number with its extension removed.
    ext = extract_extension(col)
    bare = remove_extension(col)

    dashed = F.concat(
        extract_area_code(bare), F.lit("-"),
        extract_exchange(bare), F.lit("-"),
        extract_subscriber(bare),
    )
    dashed_with_ext = F.concat(dashed, F.lit(" ext. "), ext)
    formatted = F.when(ext != "", dashed_with_ext).otherwise(dashed)

    return F.when(is_valid_nanp(bare), formatted).otherwise(F.lit(""))
509
+
510
+
511
@phones.register()
def format_nanp_paren(col: Column) -> Column:
    """
    Render a NANP phone number in parenthesis format ((XXX) XXX-XXXX).

    An "ext" extension, when present, is appended as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or "" when the number
        (without its extension) is not valid NANP
    """
    # Validation and splitting happen on the number with its extension removed.
    ext = extract_extension(col)
    bare = remove_extension(col)

    area = extract_area_code(bare)
    exch = extract_exchange(bare)
    subs = extract_subscriber(bare)

    core = F.concat(F.lit("("), area, F.lit(") "), exch, F.lit("-"), subs)
    core_with_ext = F.concat(core, F.lit(" ext. "), ext)
    formatted = F.when(ext != "", core_with_ext).otherwise(core)

    return F.when(is_valid_nanp(bare), formatted).otherwise(F.lit(""))
545
+
546
+
547
@phones.register()
def format_nanp_dot(col: Column) -> Column:
    """
    Render a NANP phone number in dot format (XXX.XXX.XXXX).

    An "ext" extension, when present, is appended as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or "" when the number
        (without its extension) is not valid NANP
    """
    # Validation and splitting happen on the number with its extension removed.
    ext = extract_extension(col)
    bare = remove_extension(col)
    dot = F.lit(".")

    dotted = F.concat(
        extract_area_code(bare), dot,
        extract_exchange(bare), dot,
        extract_subscriber(bare),
    )
    has_ext = ext != ""
    formatted = F.when(
        has_ext, F.concat(dotted, F.lit(" ext. "), ext)
    ).otherwise(dotted)

    return F.when(is_valid_nanp(bare), formatted).otherwise(F.lit(""))
582
+
583
+
584
@phones.register()
def format_nanp_space(col: Column) -> Column:
    """
    Render a NANP phone number in space format (XXX XXX XXXX).

    An "ext" extension, when present, is appended as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or "" when the number
        (without its extension) is not valid NANP
    """
    # Validation and splitting happen on the number with its extension removed.
    ext = extract_extension(col)
    bare = remove_extension(col)
    gap = F.lit(" ")

    parts = [
        extract_area_code(bare), gap,
        extract_exchange(bare), gap,
        extract_subscriber(bare),
    ]
    spaced = F.concat(*parts)
    spaced_with_ext = F.concat(spaced, F.lit(" ext. "), ext)
    formatted = F.when(ext != "", spaced_with_ext).otherwise(spaced)

    return F.when(is_valid_nanp(bare), formatted).otherwise(F.lit(""))
619
+
620
+
621
@phones.register()
def format_international(col: Column) -> Column:
    """
    Format international phone number with country code.

    Output is "+<country code> <remaining digits>" when a country code can
    be extracted, the bare digits when the number has a plausible
    international length but no country code, and "" otherwise.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted international number
    """
    country_code = extract_country_code(col)
    digits = extract_digits(col)
    cc_length = F.length(country_code)

    # Strip the country code from the front of the digits. Column.substr is
    # used with Column arguments because the start position is computed per
    # row; F.substring only accepts a Column position from PySpark 4.0 on.
    # 999 is simply "to the end": longer than any digit string handled here.
    remaining_digits = F.when(
        (country_code != "") & (cc_length > 0) & digits.startswith(country_code),
        digits.substr(cc_length + 1, F.lit(999)),
    ).otherwise(digits)

    plausible = is_valid_international(col)
    return (
        F.when(
            plausible & (country_code != ""),
            F.concat(F.lit("+"), country_code, F.lit(" "), remaining_digits),
        )
        .when(plausible, digits)
        .otherwise(F.lit(""))
    )
650
+
651
+
652
@phones.register()
def format_e164(col: Column) -> Column:
    """
    Format phone number in E.164 format (+CCAAANNNNNNN) with default country code 1.

    Rules, in order, for numbers that pass is_valid_phone:
      - valid 10-digit NANP number: "+1" followed by the digits
      - valid 11-digit NANP number starting with 1: "+" followed by the digits
      - number with an extractable country code and plausible international
        length: "+" followed by the digits
    Everything else yields "".

    Args:
        col: Column containing phone number

    Returns:
        Column with E.164 formatted number or empty string
    """
    digits = extract_digits(col)
    country_code = extract_country_code(col)
    is_nanp = is_valid_nanp(col)
    n_digits = F.length(digits)

    e164 = (
        # 10-digit NANP: supply the default country code "1".
        F.when(
            (n_digits == 10) & is_nanp,
            F.concat(F.lit("+"), F.lit("1"), digits),
        )
        .when(
            (n_digits == 11) & digits.startswith("1") & is_nanp,
            F.concat(F.lit("+"), digits),
        )
        .when(
            (country_code != "") & is_valid_international(col),
            F.concat(F.lit("+"), digits),  # digits already include the country code
        )
        .otherwise(F.lit(""))
    )

    # Build E.164 format - only for valid phones
    return F.when(is_valid_phone(col), e164).otherwise(F.lit(""))
689
+
690
+
691
+ # ============================================================================
692
+ # Phone Number Standardization Functions
693
+ # ============================================================================
694
+
695
+
696
@phones.register()
def standardize_phone(col: Column) -> Column:
    """
    Standardize phone number with cleaning and NANP formatting.

    Letters are converted to keypad digits, and 10- or 11-digit numbers are
    rendered as XXX-XXX-XXXX (the leading digit of an 11-digit number is
    dropped). An "ext" extension is preserved as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in NANP format, or "" when
        the number does not have 10 or 11 digits
    """
    # Split the extension off the RAW input first. Converting letters first
    # would turn the marker "ext" into "398", so the extension would never
    # be detected and its digits would be counted as part of the number.
    extension = extract_extension(col)
    phone_no_ext = remove_extension(col)

    # Only now map vanity letters to digits and reduce to digits.
    digits = extract_digits(convert_letters_to_numbers(phone_no_ext))
    n_digits = F.length(digits)

    # Simple NANP formatting for 10 or 11 digit numbers
    result = F.when(
        n_digits == 10,
        F.concat(
            F.substring(digits, 1, 3), F.lit("-"),
            F.substring(digits, 4, 3), F.lit("-"),
            F.substring(digits, 7, 4),
        ),
    ).when(
        n_digits == 11,
        F.concat(
            F.substring(digits, 2, 3), F.lit("-"),
            F.substring(digits, 5, 3), F.lit("-"),
            F.substring(digits, 8, 4),
        ),
    ).otherwise(F.lit(""))

    # Add extension back if present
    return F.when(
        (extension != "") & (result != ""),
        F.concat(result, F.lit(" ext. "), extension),
    ).otherwise(result)
741
+
742
+
743
@phones.register()
def standardize_phone_e164(col: Column) -> Column:
    """
    Convert letters to keypad digits, then format the number as E.164.

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in E.164 format, or "" when
        the converted number is not a valid phone number
    """
    converted = convert_letters_to_numbers(col)
    as_e164 = format_e164(converted)
    is_ok = is_valid_phone(converted)
    return F.when(is_ok, as_e164).otherwise(F.lit(""))
762
+
763
+
764
@phones.register()
def standardize_phone_digits(col: Column) -> Column:
    """
    Convert letters to keypad digits and return the number as digits only.

    Args:
        col: Column containing phone number

    Returns:
        Column with digits only, or "" when the converted number is not a
        valid phone number
    """
    converted = convert_letters_to_numbers(col)
    digits_only = extract_digits(converted)
    is_ok = is_valid_phone(converted)
    return F.when(is_ok, digits_only).otherwise(F.lit(""))
783
+
784
+
785
@phones.register()
def clean_phone(col: Column) -> Column:
    """
    Clean a phone number into XXX-XXX-XXXX, returning null when it does
    not have 10 or 11 digits.

    Letters are converted to keypad digits first. The only validation is
    the digit count; for 11 digits the leading digit is dropped.

    Args:
        col: Column containing phone number

    Returns:
        Column with cleaned phone number or null
    """
    digits = extract_digits(convert_letters_to_numbers(col))
    n_digits = F.length(digits)
    hyphen = F.lit("-")

    ten_digit_form = F.concat(
        F.substring(digits, 1, 3), hyphen,
        F.substring(digits, 4, 3), hyphen,
        F.substring(digits, 7, 4),
    )
    # Same layout, shifted by one to skip the leading digit.
    eleven_digit_form = F.concat(
        F.substring(digits, 2, 3), hyphen,
        F.substring(digits, 5, 3), hyphen,
        F.substring(digits, 8, 4),
    )

    return (
        F.when(n_digits == 10, ten_digit_form)
        .when(n_digits == 11, eleven_digit_form)
        .otherwise(F.lit(None))
    )
818
+
819
+
820
+ # ============================================================================
821
+ # Phone Number Information Functions
822
+ # ============================================================================
823
+
824
+
825
@phones.register()
def get_phone_type(col: Column) -> Column:
    """
    Classify a phone number.

    The first matching rule wins, checked in this order: "toll-free",
    "premium", "standard" (valid NANP), "international", else "invalid".
    Null or empty input is "unknown".

    Args:
        col: Column containing phone number

    Returns:
        Column with phone type
    """
    classification = (
        F.when(is_toll_free(col), F.lit("toll-free"))
        .when(is_premium_rate(col), F.lit("premium"))
        .when(is_valid_nanp(col), F.lit("standard"))
        .when(is_valid_international(col), F.lit("international"))
        .otherwise(F.lit("invalid"))
    )
    is_blank = col.isNull() | (col == "")
    return F.when(is_blank, F.lit("unknown")).otherwise(classification)
843
+
844
+
845
@phones.register()
def get_region_from_area_code(col: Column) -> Column:
    """
    Map the area code to a region label.

    Only a handful of area codes are hard-coded here; a complete mapping
    would need a lookup table.

    Args:
        col: Column containing phone number

    Returns:
        Column with region or empty string
    """
    area_code = extract_area_code(col)

    # (area code, label) pairs, checked in this order.
    city_codes = (
        ("212", "New York, NY"),
        ("213", "Los Angeles, CA"),
        ("312", "Chicago, IL"),
        ("415", "San Francisco, CA"),
        ("202", "Washington, DC"),
    )
    first_code, first_label = city_codes[0]
    lookup = F.when(area_code == first_code, F.lit(first_label))
    for code, label in city_codes[1:]:
        lookup = lookup.when(area_code == code, F.lit(label))

    lookup = lookup.when(
        area_code.isin(["800", "888", "877", "866", "855", "844", "833"]),
        F.lit("Toll-Free"),
    )
    lookup = lookup.when(area_code == "900", F.lit("Premium"))
    return lookup.otherwise(F.lit(""))
869
+
870
+
871
@phones.register()
def mask_phone(col: Column) -> Column:
    """
    Hide all but the last 4 digits of a phone number (e.g., ***-***-1234).

    Args:
        col: Column containing phone number

    Returns:
        Column with the masked number for valid NANP input, the input
        unchanged when it is not valid NANP, and null for null or empty input
    """
    last_four = extract_subscriber(col)
    hidden = F.concat(
        F.lit("***"), F.lit("-"),
        F.lit("***"), F.lit("-"),
        last_four,
    )
    masked = F.when(is_valid_nanp(col), hidden).otherwise(col)

    is_blank = col.isNull() | (col == "")
    return F.when(is_blank, F.lit(None)).otherwise(masked)
895
+
896
+
897
+ # ============================================================================
898
+ # Phone Number Filtering Functions
899
+ # ============================================================================
900
+
901
+
902
@phones.register()
def filter_valid_phones(col: Column) -> Column:
    """
    Keep the phone number when is_valid_phone accepts it; otherwise null.

    Args:
        col: Column containing phone number

    Returns:
        Column with valid phone or null
    """
    keep = is_valid_phone(col)
    return F.when(keep, col).otherwise(F.lit(None))
914
+
915
+
916
@phones.register()
def filter_nanp_phones(col: Column) -> Column:
    """
    Keep the phone number when is_valid_nanp accepts it; otherwise null.

    Args:
        col: Column containing phone number

    Returns:
        Column with NANP phone or null
    """
    keep = is_valid_nanp(col)
    return F.when(keep, col).otherwise(F.lit(None))
928
+
929
+
930
@phones.register()
def filter_toll_free_phones(col: Column) -> Column:
    """
    Keep the phone number when is_toll_free accepts it; otherwise null.

    Args:
        col: Column containing phone number

    Returns:
        Column with toll-free phone or null
    """
    keep = is_toll_free(col)
    return F.when(keep, col).otherwise(F.lit(None))