datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,941 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, Optional
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
# For type checkers only - these imports are always available during type checking
|
|
6
|
+
from pyspark.sql import Column
|
|
7
|
+
from pyspark.sql import functions as F
|
|
8
|
+
else:
|
|
9
|
+
# At runtime, handle missing PySpark gracefully
|
|
10
|
+
try:
|
|
11
|
+
from pyspark.sql import Column
|
|
12
|
+
from pyspark.sql import functions as F
|
|
13
|
+
except ImportError:
|
|
14
|
+
# PySpark is not installed - functions will fail at runtime if called
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
# Try local utils import first (for generated code)
|
|
19
|
+
from utils.primitives import PrimitiveRegistry
|
|
20
|
+
except ImportError:
|
|
21
|
+
# Fall back to installed datacompose package
|
|
22
|
+
from datacompose.operators.primitives import PrimitiveRegistry
|
|
23
|
+
|
|
24
|
+
# Registry under which every primitive below is registered via @phones.register().
phones = PrimitiveRegistry("phones")
|
|
25
|
+
|
|
26
|
+
# Uppercase letter -> keypad digit lookup, used to translate vanity numbers.
# Built from the digit/letter groups so each key of the pad is spelled once.
PHONE_KEYPAD_MAPPING = {
    letter: digit
    for digit, letters in (
        ("2", "ABC"),
        ("3", "DEF"),
        ("4", "GHI"),
        ("5", "JKL"),
        ("6", "MNO"),
        ("7", "PQRS"),
        ("8", "TUV"),
        ("9", "WXYZ"),
    )
    for letter in letters
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ============================================================================
|
|
40
|
+
# Core Phone Number Extraction Functions
|
|
41
|
+
# ============================================================================
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@phones.register()
def extract_phone_from_text(col: Column) -> Column:
    """
    Pull the first phone-number-shaped substring out of free text.

    The pattern accepts an optional 1-3 digit prefix (optionally led by "+"),
    a 3-3-4 digit body whose groups may be split by "-", "." or whitespace
    (area code optionally in parentheses), and an optional trailing
    "ext"/"x" extension.

    Args:
        col: Column containing text with potential phone numbers

    Returns:
        Column with the matched text, or empty string for null input
        or when nothing matches
    """
    pattern = r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"

    # Group 0 is the whole match, i.e. the full phone number text.
    first_match = F.regexp_extract(col, pattern, 0)
    return F.when(col.isNull(), F.lit("")).otherwise(first_match)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@phones.register()
def extract_all_phones_from_text(col: Column) -> Column:
    """
    Return the phone numbers found in text as an array column.

    NOTE: only the first match is captured at present, so the result is
    either a one-element array or an empty array. Capturing every match
    would need a more elaborate regex strategy or a UDF.

    Args:
        col: Column containing text with potential phone numbers

    Returns:
        Column with an array holding the first phone number found,
        or an empty array when none is found
    """
    match = extract_phone_from_text(col)
    found = match != ""

    return F.when(found, F.array(match)).otherwise(F.array())
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@phones.register()
def extract_digits(col: Column) -> Column:
    """
    Strip every non-digit character from a phone number string.

    Args:
        col: Column containing phone number

    Returns:
        Column with the digits only; empty string for null input
    """
    digits_only = F.regexp_replace(col, r"[^\d]", "")
    return F.when(col.isNull(), F.lit("")).otherwise(digits_only)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@phones.register()
def extract_extension(col: Column) -> Column:
    """
    Return the digits of an "ext"-style extension, if the number has one.

    Only the lowercase marker "ext" (optionally followed by "." and
    whitespace) is recognised.

    Args:
        col: Column containing phone number

    Returns:
        Column with the extension digits, or empty string when the input
        is null or carries no extension
    """
    ext_pattern = r"ext\.?\s*(\d+)"

    ext_or_blank = F.when(
        col.rlike(ext_pattern),
        F.regexp_extract(col, ext_pattern, 1),
    ).otherwise("")

    return F.when(col.isNull(), F.lit("")).otherwise(ext_or_blank)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@phones.register()
def extract_country_code(col: Column) -> Column:
    """
    Extract country code from phone number.

    Resolution order:
      1. A leading "+1" or "+7" yields "1" / "7". These are the only
         one-digit country codes and no longer code starts with either
         digit, so "+15551234567" is read as "1" rather than "155".
      2. Any other leading "+" yields the next 1-3 digits. Without a
         country-code table the split between code and national number is
         a guess when no separator follows the code (e.g. "+442071234567"
         yields "442").
      3. An 11-digit number starting with "1" (NANP with trunk "1")
         yields "1".

    Args:
        col: Column containing phone number

    Returns:
        Column with country code or empty string
    """
    digits = extract_digits(col)

    single_digit_cc = r"^\+([17])"
    generic_cc = r"^\+(\d{1,3})"

    return F.when(col.isNull(), F.lit("")).otherwise(
        # Must be tested before the greedy generic pattern, which would
        # otherwise swallow national digits that follow "+1" / "+7".
        F.when(
            col.rlike(single_digit_cc),
            F.regexp_extract(col, single_digit_cc, 1)
        ).when(
            col.rlike(generic_cc),
            F.regexp_extract(col, generic_cc, 1)
        ).when(
            # NANP with leading 1 (11 digits total)
            (F.length(digits) == 11) & digits.startswith("1"),
            F.lit("1")
        ).otherwise("")
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@phones.register()
def extract_area_code(col: Column) -> Column:
    """
    Return the 3-digit area code of a 10- or 11-digit number.

    For 11-digit input the first digit is skipped as the country code;
    that digit is not checked here.

    Args:
        col: Column containing phone number

    Returns:
        Column with area code, or empty string for null input or any
        other digit count
    """
    digits = extract_digits(col)
    digit_count = F.length(digits)

    area = (
        F.when(digit_count == 11, F.substring(digits, 2, 3))
        .when(digit_count == 10, F.substring(digits, 1, 3))
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(area)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@phones.register()
def extract_exchange(col: Column) -> Column:
    """
    Return the exchange (the 3 digits after the area code) of a 10- or
    11-digit number.

    Args:
        col: Column containing phone number

    Returns:
        Column with exchange, or empty string for null input or any
        other digit count
    """
    digits = extract_digits(col)
    digit_count = F.length(digits)

    # Positions are 1-based; the 11-digit form is shifted by the leading digit.
    exchange = (
        F.when(digit_count == 11, F.substring(digits, 5, 3))
        .when(digit_count == 10, F.substring(digits, 4, 3))
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(exchange)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@phones.register()
def extract_subscriber(col: Column) -> Column:
    """
    Return the subscriber number (last 4 digits) of a 10- or 11-digit
    number.

    Args:
        col: Column containing phone number

    Returns:
        Column with subscriber number, or empty string for null input or
        any other digit count
    """
    digits = extract_digits(col)
    digit_count = F.length(digits)

    # Positions are 1-based; the 11-digit form is shifted by the leading digit.
    subscriber = (
        F.when(digit_count == 11, F.substring(digits, 8, 4))
        .when(digit_count == 10, F.substring(digits, 7, 4))
        .otherwise("")
    )
    return F.when(col.isNull(), F.lit("")).otherwise(subscriber)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@phones.register()
def extract_local_number(col: Column) -> Column:
    """
    Return the local number (exchange followed by subscriber digits).

    Args:
        col: Column containing phone number

    Returns:
        Column with the concatenated exchange and subscriber number, or
        empty string when either part is missing
    """
    exchange_part = extract_exchange(col)
    subscriber_part = extract_subscriber(col)

    both_present = (exchange_part != "") & (subscriber_part != "")
    return F.when(
        both_present, F.concat(exchange_part, subscriber_part)
    ).otherwise("")
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ============================================================================
|
|
244
|
+
# Phone Number Validation Functions
|
|
245
|
+
# ============================================================================
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@phones.register()
def is_valid_nanp(col: Column) -> Column:
    """
    Check whether a phone number has a valid NANP (North American
    Numbering Plan) shape.

    All of the following must hold:
      * 10 or 11 digits in total
      * area code starts with 2-9
      * exchange starts with 1-9
      * subscriber number is exactly 4 digits
      * an 11-digit number starts with "1"

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating NANP validity (False for null input)
    """
    digits = extract_digits(col)

    length_ok = (F.length(digits).isin([10, 11]))
    area_ok = (extract_area_code(col).rlike(r"^[2-9]\d{2}$"))
    exchange_ok = (extract_exchange(col).rlike(r"^[1-9]\d{2}$"))
    subscriber_ok = (extract_subscriber(col).rlike(r"^\d{4}$"))
    # Ten digits need no prefix; otherwise the leading digit must be "1".
    prefix_ok = ((F.length(digits) == 10) | (digits.startswith("1")))

    looks_like_nanp = length_ok & area_ok & exchange_ok & subscriber_ok & prefix_ok
    return F.when(col.isNull(), F.lit(False)).otherwise(looks_like_nanp)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
@phones.register()
def is_valid_international(col: Column, min_length: int = 7, max_length: int = 15) -> Column:
    """
    Check whether the digit count of a phone number is plausible for an
    international number.

    Only the number of digits is inspected; no country-specific rules
    are applied.

    Args:
        col: Column containing phone number
        min_length: Minimum digits for international number
        max_length: Maximum digits for international number

    Returns:
        Column with boolean indicating potential international validity
        (False for null input)
    """
    digits = extract_digits(col)
    digit_count = F.length(digits)

    within_bounds = (
        (digit_count >= min_length)
        & (digit_count <= max_length)
        & digits.rlike(r"^\d+$")
    )
    return F.when(col.isNull(), F.lit(False)).otherwise(within_bounds)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
@phones.register()
def is_valid_phone(col: Column) -> Column:
    """
    Check whether a phone number passes either the NANP check or the
    international check.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating validity
    """
    nanp_ok = is_valid_nanp(col)
    international_ok = is_valid_international(col)
    return nanp_ok | international_ok
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
@phones.register()
def is_toll_free(col: Column) -> Column:
    """
    Check whether the area code is one of the toll-free codes
    (800, 888, 877, 866, 855, 844, 833).

    Only the area code is inspected; the rest of the number is not
    validated here.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating if toll-free (False for null input)
    """
    toll_free_codes = ["800", "888", "877", "866", "855", "844", "833"]

    in_toll_free_set = extract_area_code(col).isin(toll_free_codes)
    return F.when(col.isNull(), F.lit(False)).otherwise(in_toll_free_set)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
@phones.register()
def is_premium_rate(col: Column) -> Column:
    """
    Check whether the area code is the premium-rate code 900.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating if premium rate (False for null input)
    """
    is_900 = extract_area_code(col) == "900"
    return F.when(col.isNull(), F.lit(False)).otherwise(is_900)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
@phones.register()
def has_extension(col: Column) -> Column:
    """
    Check whether the phone number contains an "ext"-style extension.

    Args:
        col: Column containing phone number

    Returns:
        Column with boolean indicating presence of extension
        (False for null input)
    """
    mentions_ext = col.rlike(r"ext\.?\s*\d+")
    return F.when(col.isNull(), F.lit(False)).otherwise(mentions_ext)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# ============================================================================
|
|
368
|
+
# Phone Number Cleaning Functions
|
|
369
|
+
# ============================================================================
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
@phones.register()
def remove_non_digits(col: Column) -> Column:
    """
    Drop every non-digit character from the phone number.

    Thin alias of ``extract_digits`` kept under a cleaning-oriented name.

    Args:
        col: Column containing phone number

    Returns:
        Column with only digits
    """
    digits_only = extract_digits(col)
    return digits_only
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
@phones.register()
def remove_extension(col: Column) -> Column:
    """
    Delete an "ext"-style extension (marker and digits) from the number.

    Text surrounding the extension, including separating whitespace, is
    left untouched.

    Args:
        col: Column containing phone number

    Returns:
        Column with extension removed; empty string for null input
    """
    without_ext = F.regexp_replace(col, r"ext\.?\s*\d+", "")
    return F.when(col.isNull(), F.lit("")).otherwise(without_ext)
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
@phones.register()
def convert_letters_to_numbers(col: Column) -> Column:
    """
    Convert phone letters to numbers (e.g., 1-800-FLOWERS to 1-800-3569377).

    Every ASCII letter, upper or lower case, is replaced by its keypad
    digit from ``PHONE_KEYPAD_MAPPING``; all other characters are kept.

    Args:
        col: Column containing phone number with letters

    Returns:
        Column with letters converted to numbers; empty string for null input
    """
    # One character-for-character translate pass replaces the previous
    # chain of 52 nested regexp_replace calls and yields the same string.
    upper_letters = "".join(PHONE_KEYPAD_MAPPING)
    keypad_digits = "".join(PHONE_KEYPAD_MAPPING.values())

    translated = F.translate(
        col,
        upper_letters + upper_letters.lower(),
        keypad_digits * 2,
    )

    return F.when(col.isNull(), F.lit("")).otherwise(translated)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
@phones.register()
def normalize_separators(col: Column) -> Column:
    """
    Rewrite the separators of a phone number as single hyphens.

    Parentheses are removed, runs of whitespace/dots become a hyphen,
    repeated hyphens are collapsed and hyphens at either end are trimmed.

    Args:
        col: Column containing phone number

    Returns:
        Column with normalized separators; empty string for null input
    """
    # Applied in order; each step works on the output of the previous one.
    rewrite_steps = (
        (r"\(", ""),          # drop opening parenthesis
        (r"\)", " "),         # closing parenthesis -> space to keep groups apart
        (r"[\s\.]+", "-"),    # whitespace / dots -> hyphen
        (r"-+", "-"),         # collapse repeated hyphens
        (r"^-+|-+$", ""),     # trim leading / trailing hyphens
    )

    normalized = col
    for pattern, replacement in rewrite_steps:
        normalized = F.regexp_replace(normalized, pattern, replacement)

    return F.when(col.isNull(), F.lit("")).otherwise(normalized)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
@phones.register()
def add_country_code(col: Column) -> Column:
    """
    Prefix country code "1" to a valid 10-digit NANP number.

    The result is always digits only: a valid 10-digit NANP number gains
    a leading "1", anything else is returned as its bare digits.

    Args:
        col: Column containing phone number

    Returns:
        Column with country code added if needed; null input stays null
    """
    digits = extract_digits(col)

    needs_prefix = (F.length(digits) == 10) & is_valid_nanp(col)
    prefixed_or_plain = F.when(
        needs_prefix, F.concat(F.lit("1"), digits)
    ).otherwise(digits)

    return F.when(col.isNull(), col).otherwise(prefixed_or_plain)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
# ============================================================================
|
|
470
|
+
# Phone Number Formatting Functions
|
|
471
|
+
# ============================================================================
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
@phones.register()
def format_nanp(col: Column) -> Column:
    """
    Render a NANP phone number in hyphen format (XXX-XXX-XXXX).

    An "ext" extension is set aside before validation and re-attached
    as " ext. N". A leading country digit is not part of the output.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or empty string when the
        number (without extension) is not valid NANP
    """
    ext = extract_extension(col)
    bare_number = remove_extension(col)

    hyphenated = F.concat(
        extract_area_code(bare_number), F.lit("-"),
        extract_exchange(bare_number), F.lit("-"),
        extract_subscriber(bare_number),
    )

    # Append the extension only when one was found.
    with_ext = F.when(
        (ext != ""),
        F.concat(hyphenated, F.lit(" ext. "), ext),
    ).otherwise(hyphenated)

    return F.when(is_valid_nanp(bare_number), with_ext).otherwise(F.lit(""))
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
@phones.register()
def format_nanp_paren(col: Column) -> Column:
    """
    Render a NANP phone number in parenthesis format ((XXX) XXX-XXXX).

    An "ext" extension is set aside before validation and re-attached
    as " ext. N". A leading country digit is not part of the output.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or empty string when the
        number (without extension) is not valid NANP
    """
    ext = extract_extension(col)
    bare_number = remove_extension(col)

    parenthesised = F.concat(
        F.lit("("), extract_area_code(bare_number), F.lit(") "),
        extract_exchange(bare_number), F.lit("-"),
        extract_subscriber(bare_number),
    )

    # Append the extension only when one was found.
    with_ext = F.when(
        (ext != ""),
        F.concat(parenthesised, F.lit(" ext. "), ext),
    ).otherwise(parenthesised)

    return F.when(is_valid_nanp(bare_number), with_ext).otherwise(F.lit(""))
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
@phones.register()
def format_nanp_dot(col: Column) -> Column:
    """
    Render a NANP phone number in dot format (XXX.XXX.XXXX).

    An "ext" extension is set aside before validation and re-attached
    as " ext. N". A leading country digit is not part of the output.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or empty string when the
        number (without extension) is not valid NANP
    """
    ext = extract_extension(col)
    bare_number = remove_extension(col)

    dotted = F.concat(
        extract_area_code(bare_number), F.lit("."),
        extract_exchange(bare_number), F.lit("."),
        extract_subscriber(bare_number),
    )

    # Append the extension only when one was found.
    with_ext = F.when(
        (ext != ""),
        F.concat(dotted, F.lit(" ext. "), ext),
    ).otherwise(dotted)

    return F.when(is_valid_nanp(bare_number), with_ext).otherwise(F.lit(""))
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
@phones.register()
def format_nanp_space(col: Column) -> Column:
    """
    Render a NANP phone number in space format (XXX XXX XXXX).

    An "ext" extension is set aside before validation and re-attached
    as " ext. N". A leading country digit is not part of the output.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted phone number, or empty string when the
        number (without extension) is not valid NANP
    """
    ext = extract_extension(col)
    bare_number = remove_extension(col)

    spaced = F.concat(
        extract_area_code(bare_number), F.lit(" "),
        extract_exchange(bare_number), F.lit(" "),
        extract_subscriber(bare_number),
    )

    # Append the extension only when one was found.
    with_ext = F.when(
        (ext != ""),
        F.concat(spaced, F.lit(" ext. "), ext),
    ).otherwise(spaced)

    return F.when(is_valid_nanp(bare_number), with_ext).otherwise(F.lit(""))
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
@phones.register()
def format_international(col: Column) -> Column:
    """
    Format international phone number with country code.

    When a country code is detected the output is "+<code> <rest>", where
    <rest> is the digit string with the leading country code removed.
    When the number passes the international check but no country code is
    detected, the bare digits are returned.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted international number, or empty string when
        the number fails the international check
    """
    country_code = extract_country_code(col)
    digits = extract_digits(col)

    has_country_code = country_code != ""

    # The start offset depends on the per-row country code length, so it is
    # a Column. Column.substr accepts Column arguments for start and length
    # (both must be the same kind), whereas F.substring's pos/len had to be
    # plain ints in older PySpark releases.
    national_digits = F.when(
        has_country_code & digits.startswith(country_code),
        digits.substr(F.length(country_code) + 1, F.lit(999))
    ).otherwise(digits)

    return F.when(
        is_valid_international(col) & has_country_code,
        F.concat(F.lit("+"), country_code, F.lit(" "), national_digits)
    ).when(
        is_valid_international(col),
        digits
    ).otherwise(F.lit(""))
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
@phones.register()
def format_e164(col: Column) -> Column:
    """
    Format phone number in E.164 format (+CCAAANNNNNNN) with default country code 1.

    Branches, in order, for numbers that pass ``is_valid_phone``:
      * valid 10-digit NANP            -> "+1" followed by the digits
      * valid 11-digit NANP ("1" lead) -> "+" followed by the digits
      * detected country code and passes the international check
                                       -> "+" followed by the digits
      * anything else                  -> empty string

    Args:
        col: Column containing phone number

    Returns:
        Column with E.164 formatted number, or empty string when the
        number is invalid or no branch applies
    """
    digits = extract_digits(col)
    country_code = extract_country_code(col)

    # Check if it's a valid NANP number first
    is_nanp = is_valid_nanp(col)

    # Build E.164 format - only for valid phones
    return F.when(
        is_valid_phone(col),
        F.when(
            (F.length(digits) == 10) & is_nanp,
            F.concat(F.lit("+"), F.lit("1"), digits)
        ).when(
            (F.length(digits) == 11) & digits.startswith("1") & is_nanp,
            F.concat(F.lit("+"), digits)
        ).when(
            (country_code != "") & is_valid_international(col),
            F.concat(F.lit("+"), digits)  # digits already includes country code
        ).otherwise(F.lit(""))
    ).otherwise(F.lit(""))
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
# ============================================================================
|
|
692
|
+
# Phone Number Standardization Functions
|
|
693
|
+
# ============================================================================
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
@phones.register()
def standardize_phone(col: Column) -> Column:
    """
    Standardize phone number with cleaning and NANP formatting.

    Steps:
      1. Split an "ext" extension off the raw input.
      2. Convert vanity letters in the remaining number to keypad digits.
      3. Format 10- or 11-digit results as XXX-XXX-XXXX (the first digit
         of an 11-digit number is dropped).
      4. Re-attach the extension as " ext. N".

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in NANP format, or empty
        string when the number does not have 10 or 11 digits
    """
    # The extension must be taken from the raw input: letter conversion
    # also maps "e", "x" and "t" to digits, which would turn the "ext"
    # marker into "398" and merge the extension into the number.
    extension = extract_extension(col)
    phone_no_ext = remove_extension(col)

    # Convert vanity letters only in the part that is the actual number
    cleaned = convert_letters_to_numbers(phone_no_ext)
    digits = extract_digits(cleaned)

    # Simple NANP formatting for 10 or 11 digit numbers
    result = F.when(
        F.length(digits) == 10,
        F.concat(
            F.substring(digits, 1, 3), F.lit("-"),
            F.substring(digits, 4, 3), F.lit("-"),
            F.substring(digits, 7, 4)
        )
    ).when(
        F.length(digits) == 11,
        F.concat(
            F.substring(digits, 2, 3), F.lit("-"),
            F.substring(digits, 5, 3), F.lit("-"),
            F.substring(digits, 8, 4)
        )
    ).otherwise(F.lit(""))

    # Add extension back if present
    final_result = F.when(
        (extension != "") & (result != ""),
        F.concat(result, F.lit(" ext. "), extension)
    ).otherwise(result)

    return final_result
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
@phones.register()
def standardize_phone_e164(col: Column) -> Column:
    """
    Convert vanity letters to digits and format the number as E.164.

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in E.164 format, or empty
        string when the letter-converted number is not valid
    """
    converted = convert_letters_to_numbers(col)
    as_e164 = format_e164(converted)

    # Invalid numbers are blanked out rather than passed through.
    return F.when(is_valid_phone(converted), as_e164).otherwise(F.lit(""))
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
@phones.register()
def standardize_phone_digits(col: Column) -> Column:
    """
    Convert vanity letters to digits and return the number as digits only.

    Args:
        col: Column containing phone number

    Returns:
        Column with digits only, or empty string when the
        letter-converted number is not valid
    """
    converted = convert_letters_to_numbers(col)
    digits_only = extract_digits(converted)

    # Invalid numbers are blanked out rather than passed through.
    return F.when(is_valid_phone(converted), digits_only).otherwise(F.lit(""))
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
@phones.register()
def clean_phone(col: Column) -> Column:
    """
    Convert vanity letters to digits and format the number as
    XXX-XXX-XXXX, yielding null when that is not possible.

    The only check applied is the digit count: 10 digits are formatted
    directly, 11 digits are formatted after dropping the first digit,
    and any other count produces null.

    Args:
        col: Column containing phone number

    Returns:
        Column with cleaned phone number or null
    """
    digits = extract_digits(convert_letters_to_numbers(col))
    digit_count = F.length(digits)

    ten_digit_form = F.concat(
        F.substring(digits, 1, 3), F.lit("-"),
        F.substring(digits, 4, 3), F.lit("-"),
        F.substring(digits, 7, 4),
    )
    # Same layout, shifted by one to skip the leading digit.
    eleven_digit_form = F.concat(
        F.substring(digits, 2, 3), F.lit("-"),
        F.substring(digits, 5, 3), F.lit("-"),
        F.substring(digits, 8, 4),
    )

    return (
        F.when(digit_count == 10, ten_digit_form)
        .when(digit_count == 11, eleven_digit_form)
        .otherwise(F.lit(None))
    )
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
# ============================================================================
|
|
821
|
+
# Phone Number Information Functions
|
|
822
|
+
# ============================================================================
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
@phones.register()
def get_phone_type(col: Column) -> Column:
    """
    Classify a phone number.

    The first matching rule wins, in this order: "toll-free", "premium",
    "standard" (valid NANP), "international", otherwise "invalid".
    Null or empty input is reported as "unknown".

    Args:
        col: Column containing phone number

    Returns:
        Column with phone type
    """
    is_blank = col.isNull() | (col == "")

    classified = (
        F.when(is_toll_free(col), F.lit("toll-free"))
        .when(is_premium_rate(col), F.lit("premium"))
        .when(is_valid_nanp(col), F.lit("standard"))
        .when(is_valid_international(col), F.lit("international"))
        .otherwise(F.lit("invalid"))
    )
    return F.when(is_blank, F.lit("unknown")).otherwise(classified)
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
@phones.register()
def get_region_from_area_code(col: Column) -> Column:
    """
    Map the area code to a region label.

    Simplified illustration covering a handful of area codes plus the
    toll-free and premium codes; a real implementation would use a
    lookup table.

    Args:
        col: Column containing phone number

    Returns:
        Column with region, or empty string for unlisted area codes
    """
    area_code = extract_area_code(col)

    # Remaining city codes, checked in this order after "212".
    other_cities = (
        ("213", "Los Angeles, CA"),
        ("312", "Chicago, IL"),
        ("415", "San Francisco, CA"),
        ("202", "Washington, DC"),
    )

    region = F.when(area_code == "212", F.lit("New York, NY"))
    for code, label in other_cities:
        region = region.when(area_code == code, F.lit(label))

    region = region.when(
        area_code.isin(["800", "888", "877", "866", "855", "844", "833"]),
        F.lit("Toll-Free"),
    )
    region = region.when(area_code == "900", F.lit("Premium"))

    return region.otherwise(F.lit(""))
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
@phones.register()
def mask_phone(col: Column) -> Column:
    """
    Mask a phone number, keeping only the last 4 digits (e.g., ***-***-1234).

    Only valid NANP numbers are masked; any other non-empty value is
    returned unchanged.

    Args:
        col: Column containing phone number

    Returns:
        Column with masked phone number; null for null or empty input
    """
    last_four = extract_subscriber(col)
    is_blank = col.isNull() | (col == "")

    # Area code and exchange are replaced by asterisks.
    masked_or_original = F.when(
        is_valid_nanp(col),
        F.concat(
            F.lit("***"), F.lit("-"),
            F.lit("***"), F.lit("-"),
            last_four,
        ),
    ).otherwise(col)

    return F.when(is_blank, F.lit(None)).otherwise(masked_or_original)
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
# ============================================================================
|
|
898
|
+
# Phone Number Filtering Functions
|
|
899
|
+
# ============================================================================
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
@phones.register()
def filter_valid_phones(col: Column) -> Column:
    """
    Keep the phone number when it is valid; replace it with null otherwise.

    Args:
        col: Column containing phone number

    Returns:
        Column with valid phone or null
    """
    keep = is_valid_phone(col)
    return F.when(keep, col).otherwise(F.lit(None))
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
@phones.register()
def filter_nanp_phones(col: Column) -> Column:
    """
    Keep the phone number when it is valid NANP; replace it with null
    otherwise.

    Args:
        col: Column containing phone number

    Returns:
        Column with NANP phone or null
    """
    keep = is_valid_nanp(col)
    return F.when(keep, col).otherwise(F.lit(None))
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
@phones.register()
def filter_toll_free_phones(col: Column) -> Column:
    """
    Keep the phone number when it is toll-free; replace it with null
    otherwise.

    Args:
        col: Column containing phone number

    Returns:
        Column with toll-free phone or null
    """
    keep = is_toll_free(col)
    return F.when(keep, col).otherwise(F.lit(None))
|