datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

@@ -1,3 +1,61 @@
1
+ """
2
+ Phone number transformation primitives for PySpark.
3
+
4
+ Preview Output:
5
+ +------------------------+----------------+--------+---------+------------+-------+---------+------------+
6
+ |phone_numbers |standardized |is_valid|area_code|local_number|has_ext|extension|is_toll_free|
7
+ +------------------------+----------------+--------+---------+------------+-------+---------+------------+
8
+ | (555) 123-4567 |(555) 123-4567 |true |555 |1234567 |false |null |false |
9
+ |+1-800-555-1234 |+1 800-555-1234 |true |800 |5551234 |false |null |true |
10
+ |555.123.4567 ext 890 |555.123.4567 |true |555 |1234567 |true |890 |false |
11
+ |123-45-67 |null |false |null |null |false |null |false |
12
+ |1-800-FLOWERS |1-800-356-9377 |true |800 |3569377 |false |null |true |
13
+ | 415 555 0123 |415-555-0123 |true |415 |5550123 |false |null |false |
14
+ +------------------------+----------------+--------+---------+------------+-------+---------+------------+
15
+
16
+ Usage Example:
17
+ from pyspark.sql import SparkSession
18
+ from pyspark.sql import functions as F
19
+ from transformers.pyspark.phone_numbers import phone_numbers
20
+
21
+ # Initialize Spark
22
+ spark = SparkSession.builder.appName("PhoneCleaning").getOrCreate()
23
+
24
+ # Create sample data
25
+ data = [
26
+ ("(555) 123-4567",),
27
+ ("+1-800-555-1234",),
28
+ ("555.123.4567 ext 890",),
29
+ ("123-45-67",),
30
+ ("1-800-FLOWERS",),
31
+ ]
32
+ df = spark.createDataFrame(data, ["phone_numbers"])
33
+
34
+ # Apply transformations
35
+ result_df = df.select(
36
+ F.col("phone_numbers"),
37
+ phone_numbers.standardize_phone_numbers(F.col("phone_numbers")).alias("standardized"),
38
+ phone_numbers.is_valid_phone_numbers(F.col("phone_numbers")).alias("is_valid"),
39
+ phone_numbers.extract_area_code(
40
+ phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
41
+ ).alias("area_code"),
42
+ phone_numbers.extract_local_number(
43
+ phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
44
+ ).alias("local_number"),
45
+ phone_numbers.has_extension(F.col("phone_numbers")).alias("has_ext"),
46
+ phone_numbers.extract_extension(F.col("phone_numbers")).alias("extension"),
47
+ phone_numbers.is_toll_free(
48
+ phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
49
+ ).alias("is_toll_free")
50
+ )
51
+
52
+ # Show results
53
+ result_df.show(truncate=False)
54
+
55
+ Installation:
56
+ datacompose add phone_numbers
57
+ """
58
+
1
59
  import re
2
60
  from typing import TYPE_CHECKING, Dict, Optional
3
61
 
@@ -16,23 +74,41 @@ else:
16
74
 
17
75
  try:
18
76
  # Try local utils import first (for generated code)
19
- from utils.primitives import PrimitiveRegistry
77
+ from utils.primitives import PrimitiveRegistry # type: ignore
20
78
  except ImportError:
21
79
  # Fall back to installed datacompose package
22
80
  from datacompose.operators.primitives import PrimitiveRegistry
23
81
 
24
- phones = PrimitiveRegistry("phones")
82
+ phone_numbers = PrimitiveRegistry("phone_numbers")
25
83
 
26
84
  # Phone keypad mapping for letter to number conversion
27
85
  PHONE_KEYPAD_MAPPING = {
28
- "A": "2", "B": "2", "C": "2",
29
- "D": "3", "E": "3", "F": "3",
30
- "G": "4", "H": "4", "I": "4",
31
- "J": "5", "K": "5", "L": "5",
32
- "M": "6", "N": "6", "O": "6",
33
- "P": "7", "Q": "7", "R": "7", "S": "7",
34
- "T": "8", "U": "8", "V": "8",
35
- "W": "9", "X": "9", "Y": "9", "Z": "9",
86
+ "A": "2",
87
+ "B": "2",
88
+ "C": "2",
89
+ "D": "3",
90
+ "E": "3",
91
+ "F": "3",
92
+ "G": "4",
93
+ "H": "4",
94
+ "I": "4",
95
+ "J": "5",
96
+ "K": "5",
97
+ "L": "5",
98
+ "M": "6",
99
+ "N": "6",
100
+ "O": "6",
101
+ "P": "7",
102
+ "Q": "7",
103
+ "R": "7",
104
+ "S": "7",
105
+ "T": "8",
106
+ "U": "8",
107
+ "V": "8",
108
+ "W": "9",
109
+ "X": "9",
110
+ "Y": "9",
111
+ "Z": "9",
36
112
  }
37
113
 
38
114
 
@@ -41,57 +117,56 @@ PHONE_KEYPAD_MAPPING = {
41
117
  # ============================================================================
42
118
 
43
119
 
44
- @phones.register()
45
- def extract_phone_from_text(col: Column) -> Column:
120
+ @phone_numbers.register()
121
+ def extract_phone_numbers_from_text(col: Column) -> Column:
46
122
  """
47
123
  Extract first phone number from text using regex patterns.
48
-
124
+
49
125
  Args:
50
126
  col: Column containing text with potential phone numbers
51
-
127
+
52
128
  Returns:
53
- Column with extracted phone number or empty string
129
+ Column with extracted phone number or empty string
54
130
  """
55
- # Comprehensive phone pattern that matches various formats
131
+ # Comprehensive phone number pattern that matches various formats
56
132
  # Handles: +1-555-123-4567, (555) 123-4567, 555.123.4567, 555-123-4567, etc.
57
- phone_pattern = r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
58
-
133
+ phone_numbers_pattern = (
134
+ r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
135
+ )
136
+
59
137
  return F.when(col.isNull(), F.lit("")).otherwise(
60
- F.regexp_extract(col, phone_pattern, 0)
138
+ F.regexp_extract(col, phone_numbers_pattern, 0)
61
139
  )
62
140
 
63
141
 
64
- @phones.register()
65
- def extract_all_phones_from_text(col: Column) -> Column:
142
+ @phone_numbers.register()
143
+ def extract_all_phone_numbers_from_text(col: Column) -> Column:
66
144
  """
67
145
  Extract all phone numbers from text as an array.
68
-
146
+
69
147
  Args:
70
148
  col: Column containing text with potential phone numbers
71
-
149
+
72
150
  Returns:
73
151
  Column with array of phone numbers
74
152
  """
75
- # For simplicity, we'll return an array with just the first phone found
153
+ # For simplicity, we'll return an array with just the first phone number found
76
154
  # A proper implementation would require more complex regex or UDF
77
155
  # This is a limitation of Spark SQL's regex capabilities
78
- first_phone = extract_phone_from_text(col)
79
-
156
+ first_phone_numbers = extract_phone_numbers_from_text(col)
157
+
80
158
  # Return array with single element or empty array
81
- return F.when(
82
- first_phone != "",
83
- F.array(first_phone)
84
- ).otherwise(F.array())
159
+ return F.when(first_phone_numbers != "", F.array(first_phone_numbers)).otherwise(F.array())
85
160
 
86
161
 
87
- @phones.register()
162
+ @phone_numbers.register()
88
163
  def extract_digits(col: Column) -> Column:
89
164
  """
90
165
  Extract only digits from phone number string.
91
-
166
+
92
167
  Args:
93
168
  col: Column containing phone number
94
-
169
+
95
170
  Returns:
96
171
  Column with only digits
97
172
  """
@@ -100,143 +175,131 @@ def extract_digits(col: Column) -> Column:
100
175
  )
101
176
 
102
177
 
103
- @phones.register()
178
+ @phone_numbers.register()
104
179
  def extract_extension(col: Column) -> Column:
105
180
  """
106
181
  Extract extension from phone number if present.
107
-
182
+
108
183
  Args:
109
184
  col: Column containing phone number
110
-
185
+
111
186
  Returns:
112
187
  Column with extension or empty string
113
188
  """
114
189
  return F.when(col.isNull(), F.lit("")).otherwise(
115
190
  F.when(
116
- col.rlike(r"ext\.?\s*(\d+)"),
117
- F.regexp_extract(col, r"ext\.?\s*(\d+)", 1)
191
+ col.rlike(r"ext\.?\s*(\d+)"), F.regexp_extract(col, r"ext\.?\s*(\d+)", 1)
118
192
  ).otherwise("")
119
193
  )
120
194
 
121
195
 
122
- @phones.register()
196
+ @phone_numbers.register()
123
197
  def extract_country_code(col: Column) -> Column:
124
198
  """
125
199
  Extract country code from phone number.
126
-
200
+
127
201
  Args:
128
202
  col: Column containing phone number
129
-
203
+
130
204
  Returns:
131
205
  Column with country code or empty string
132
206
  """
133
207
  digits = extract_digits(col)
134
-
208
+
135
209
  # Check for explicit country code with + prefix
136
210
  has_plus = col.contains("+")
137
-
211
+
138
212
  return F.when(col.isNull(), F.lit("")).otherwise(
139
213
  F.when(
140
214
  # Explicit country code with +
141
215
  has_plus & col.rlike(r"^\+(\d{1,3})"),
142
- F.regexp_extract(col, r"^\+(\d{1,3})", 1)
143
- ).when(
216
+ F.regexp_extract(col, r"^\+(\d{1,3})", 1),
217
+ )
218
+ .when(
144
219
  # NANP with leading 1 (11 digits total)
145
220
  (F.length(digits) == 11) & digits.startswith("1"),
146
- F.lit("1")
147
- ).otherwise("")
221
+ F.lit("1"),
222
+ )
223
+ .otherwise("")
148
224
  )
149
225
 
150
226
 
151
- @phones.register()
227
+ @phone_numbers.register()
152
228
  def extract_area_code(col: Column) -> Column:
153
229
  """
154
230
  Extract area code from NANP phone number.
155
-
231
+
156
232
  Args:
157
233
  col: Column containing phone number
158
-
234
+
159
235
  Returns:
160
236
  Column with area code or empty string
161
237
  """
162
238
  digits = extract_digits(col)
163
-
239
+
164
240
  return F.when(col.isNull(), F.lit("")).otherwise(
165
- F.when(
166
- F.length(digits) == 11,
167
- F.substring(digits, 2, 3) # Skip country code
168
- ).when(
169
- F.length(digits) == 10,
170
- F.substring(digits, 1, 3)
171
- ).otherwise("")
241
+ F.when(F.length(digits) == 11, F.substring(digits, 2, 3)) # Skip country code
242
+ .when(F.length(digits) == 10, F.substring(digits, 1, 3))
243
+ .otherwise("")
172
244
  )
173
245
 
174
246
 
175
- @phones.register()
247
+ @phone_numbers.register()
176
248
  def extract_exchange(col: Column) -> Column:
177
249
  """
178
250
  Extract exchange (first 3 digits of local number) from NANP phone number.
179
-
251
+
180
252
  Args:
181
253
  col: Column containing phone number
182
-
254
+
183
255
  Returns:
184
256
  Column with exchange or empty string
185
257
  """
186
258
  digits = extract_digits(col)
187
-
259
+
188
260
  return F.when(col.isNull(), F.lit("")).otherwise(
189
- F.when(
190
- F.length(digits) == 11,
191
- F.substring(digits, 5, 3)
192
- ).when(
193
- F.length(digits) == 10,
194
- F.substring(digits, 4, 3)
195
- ).otherwise("")
261
+ F.when(F.length(digits) == 11, F.substring(digits, 5, 3))
262
+ .when(F.length(digits) == 10, F.substring(digits, 4, 3))
263
+ .otherwise("")
196
264
  )
197
265
 
198
266
 
199
- @phones.register()
267
+ @phone_numbers.register()
200
268
  def extract_subscriber(col: Column) -> Column:
201
269
  """
202
270
  Extract subscriber number (last 4 digits) from NANP phone number.
203
-
271
+
204
272
  Args:
205
273
  col: Column containing phone number
206
-
274
+
207
275
  Returns:
208
276
  Column with subscriber number or empty string
209
277
  """
210
278
  digits = extract_digits(col)
211
-
279
+
212
280
  return F.when(col.isNull(), F.lit("")).otherwise(
213
- F.when(
214
- F.length(digits) == 11,
215
- F.substring(digits, 8, 4)
216
- ).when(
217
- F.length(digits) == 10,
218
- F.substring(digits, 7, 4)
219
- ).otherwise("")
281
+ F.when(F.length(digits) == 11, F.substring(digits, 8, 4))
282
+ .when(F.length(digits) == 10, F.substring(digits, 7, 4))
283
+ .otherwise("")
220
284
  )
221
285
 
222
286
 
223
- @phones.register()
287
+ @phone_numbers.register()
224
288
  def extract_local_number(col: Column) -> Column:
225
289
  """
226
290
  Extract local number (exchange + subscriber) from NANP phone number.
227
-
291
+
228
292
  Args:
229
293
  col: Column containing phone number
230
-
294
+
231
295
  Returns:
232
296
  Column with 7-digit local number or empty string
233
297
  """
234
298
  exchange = extract_exchange(col)
235
299
  subscriber = extract_subscriber(col)
236
-
300
+
237
301
  return F.when(
238
- (exchange != "") & (subscriber != ""),
239
- F.concat(exchange, subscriber)
302
+ (exchange != "") & (subscriber != ""), F.concat(exchange, subscriber)
240
303
  ).otherwise("")
241
304
 
242
305
 
@@ -245,14 +308,14 @@ def extract_local_number(col: Column) -> Column:
245
308
  # ============================================================================
246
309
 
247
310
 
248
- @phones.register()
311
+ @phone_numbers.register()
249
312
  def is_valid_nanp(col: Column) -> Column:
250
313
  """
251
314
  Check if phone number is valid NANP format (North American Numbering Plan).
252
-
315
+
253
316
  Args:
254
317
  col: Column containing phone number
255
-
318
+
256
319
  Returns:
257
320
  Column with boolean indicating NANP validity
258
321
  """
@@ -260,108 +323,108 @@ def is_valid_nanp(col: Column) -> Column:
260
323
  area_code = extract_area_code(col)
261
324
  exchange = extract_exchange(col)
262
325
  subscriber = extract_subscriber(col)
263
-
326
+
264
327
  return F.when(col.isNull(), F.lit(False)).otherwise(
265
- (F.length(digits).isin([10, 11])) &
328
+ (F.length(digits).isin([10, 11]))
329
+ &
266
330
  # Area code: 2-9 for first digit, 0-9 for second, 0-9 for third
267
- (area_code.rlike(r"^[2-9]\d{2}$")) &
331
+ (area_code.rlike(r"^[2-9]\d{2}$"))
332
+ &
268
333
  # Exchange: 2-9 for first digit (historically, now 1-9 is valid)
269
- (exchange.rlike(r"^[1-9]\d{2}$")) &
334
+ (exchange.rlike(r"^[1-9]\d{2}$"))
335
+ &
270
336
  # Subscriber: any 4 digits
271
- (subscriber.rlike(r"^\d{4}$")) &
337
+ (subscriber.rlike(r"^\d{4}$"))
338
+ &
272
339
  # If 11 digits, must start with 1
273
340
  ((F.length(digits) == 10) | (digits.startswith("1")))
274
341
  )
275
342
 
276
343
 
277
- @phones.register()
278
- def is_valid_international(col: Column, min_length: int = 7, max_length: int = 15) -> Column:
344
+ @phone_numbers.register()
345
+ def is_valid_international(
346
+ col: Column, min_length: int = 7, max_length: int = 15
347
+ ) -> Column:
279
348
  """
280
349
  Check if phone number could be valid international format.
281
-
350
+
282
351
  Args:
283
352
  col: Column containing phone number
284
353
  min_length: Minimum digits for international number
285
354
  max_length: Maximum digits for international number
286
-
355
+
287
356
  Returns:
288
357
  Column with boolean indicating potential international validity
289
358
  """
290
359
  digits = extract_digits(col)
291
-
360
+
292
361
  return F.when(col.isNull(), F.lit(False)).otherwise(
293
- (F.length(digits) >= min_length) &
294
- (F.length(digits) <= max_length) &
295
- digits.rlike(r"^\d+$")
362
+ (F.length(digits) >= min_length)
363
+ & (F.length(digits) <= max_length)
364
+ & digits.rlike(r"^\d+$")
296
365
  )
297
366
 
298
367
 
299
- @phones.register()
300
- def is_valid_phone(col: Column) -> Column:
368
+ @phone_numbers.register()
369
+ def is_valid_phone_numbers(col: Column) -> Column:
301
370
  """
302
371
  Check if phone number is valid (NANP or international).
303
-
372
+
304
373
  Args:
305
374
  col: Column containing phone number
306
-
375
+
307
376
  Returns:
308
377
  Column with boolean indicating validity
309
378
  """
310
379
  return is_valid_nanp(col) | is_valid_international(col)
311
380
 
312
381
 
313
- @phones.register()
382
+ @phone_numbers.register()
314
383
  def is_toll_free(col: Column) -> Column:
315
384
  """
316
385
  Check if phone number is toll-free (800, 888, 877, 866, 855, 844, 833).
317
-
386
+
318
387
  Args:
319
388
  col: Column containing phone number
320
-
389
+
321
390
  Returns:
322
391
  Column with boolean indicating if toll-free
323
392
  """
324
393
  area_code = extract_area_code(col)
325
-
394
+
326
395
  toll_free_codes = ["800", "888", "877", "866", "855", "844", "833"]
327
-
328
- return F.when(col.isNull(), F.lit(False)).otherwise(
329
- area_code.isin(toll_free_codes)
330
- )
396
+
397
+ return F.when(col.isNull(), F.lit(False)).otherwise(area_code.isin(toll_free_codes))
331
398
 
332
399
 
333
- @phones.register()
400
+ @phone_numbers.register()
334
401
  def is_premium_rate(col: Column) -> Column:
335
402
  """
336
403
  Check if phone number is premium rate (900).
337
-
404
+
338
405
  Args:
339
- col: Column containing phone number
340
-
406
+ col: Column containing phone number
407
+
341
408
  Returns:
342
409
  Column with boolean indicating if premium rate
343
410
  """
344
411
  area_code = extract_area_code(col)
345
-
346
- return F.when(col.isNull(), F.lit(False)).otherwise(
347
- area_code == "900"
348
- )
412
+
413
+ return F.when(col.isNull(), F.lit(False)).otherwise(area_code == "900")
349
414
 
350
415
 
351
- @phones.register()
416
+ @phone_numbers.register()
352
417
  def has_extension(col: Column) -> Column:
353
418
  """
354
419
  Check if phone number has an extension.
355
-
420
+
356
421
  Args:
357
422
  col: Column containing phone number
358
-
423
+
359
424
  Returns:
360
425
  Column with boolean indicating presence of extension
361
426
  """
362
- return F.when(col.isNull(), F.lit(False)).otherwise(
363
- col.rlike(r"ext\.?\s*\d+")
364
- )
427
+ return F.when(col.isNull(), F.lit(False)).otherwise(col.rlike(r"ext\.?\s*\d+"))
365
428
 
366
429
 
367
430
  # ============================================================================
@@ -369,28 +432,28 @@ def has_extension(col: Column) -> Column:
369
432
  # ============================================================================
370
433
 
371
434
 
372
- @phones.register()
435
+ @phone_numbers.register()
373
436
  def remove_non_digits(col: Column) -> Column:
374
437
  """
375
438
  Remove all non-digit characters from phone number.
376
-
439
+
377
440
  Args:
378
441
  col: Column containing phone number
379
-
442
+
380
443
  Returns:
381
444
  Column with only digits
382
445
  """
383
446
  return extract_digits(col)
384
447
 
385
448
 
386
- @phones.register()
449
+ @phone_numbers.register()
387
450
  def remove_extension(col: Column) -> Column:
388
451
  """
389
452
  Remove extension from phone number.
390
-
453
+
391
454
  Args:
392
455
  col: Column containing phone number
393
-
456
+
394
457
  Returns:
395
458
  Column with extension removed
396
459
  """
@@ -399,36 +462,36 @@ def remove_extension(col: Column) -> Column:
399
462
  )
400
463
 
401
464
 
402
- @phones.register()
465
+ @phone_numbers.register()
403
466
  def convert_letters_to_numbers(col: Column) -> Column:
404
467
  """
405
468
  Convert phone letters to numbers (e.g., 1-800-FLOWERS to 1-800-3569377).
406
-
469
+
407
470
  Args:
408
471
  col: Column containing phone number with letters
409
-
472
+
410
473
  Returns:
411
474
  Column with letters converted to numbers
412
475
  """
413
476
  result = col
414
-
477
+
415
478
  # Apply each letter-to-number mapping
416
479
  for letter, number in PHONE_KEYPAD_MAPPING.items():
417
480
  result = F.regexp_replace(result, letter, number)
418
481
  result = F.regexp_replace(result, letter.lower(), number)
419
-
482
+
420
483
  return F.when(col.isNull(), F.lit("")).otherwise(result)
421
484
 
422
485
 
423
- @phones.register()
486
+ @phone_numbers.register()
424
487
  def normalize_separators(col: Column) -> Column:
425
488
  """
426
489
  Normalize various separator styles to hyphens.
427
490
  Removes parentheses and replaces dots, spaces with hyphens.
428
-
491
+
429
492
  Args:
430
493
  col: Column containing phone number
431
-
494
+
432
495
  Returns:
433
496
  Column with normalized separators
434
497
  """
@@ -441,27 +504,26 @@ def normalize_separators(col: Column) -> Column:
441
504
  result = F.regexp_replace(result, r"-+", "-")
442
505
  # Remove leading/trailing hyphens
443
506
  result = F.regexp_replace(result, r"^-+|-+$", "")
444
-
507
+
445
508
  return F.when(col.isNull(), F.lit("")).otherwise(result)
446
509
 
447
510
 
448
- @phones.register()
511
+ @phone_numbers.register()
449
512
  def add_country_code(col: Column) -> Column:
450
513
  """
451
514
  Add country code "1" if not present (for NANP numbers).
452
-
515
+
453
516
  Args:
454
517
  col: Column containing phone number
455
-
518
+
456
519
  Returns:
457
520
  Column with country code added if needed
458
521
  """
459
522
  digits = extract_digits(col)
460
-
523
+
461
524
  return F.when(col.isNull(), col).otherwise(
462
525
  F.when(
463
- (F.length(digits) == 10) & is_valid_nanp(col),
464
- F.concat(F.lit("1"), digits)
526
+ (F.length(digits) == 10) & is_valid_nanp(col), F.concat(F.lit("1"), digits)
465
527
  ).otherwise(digits)
466
528
  )
467
529
 
@@ -471,220 +533,193 @@ def add_country_code(col: Column) -> Column:
471
533
  # ============================================================================
472
534
 
473
535
 
474
- @phones.register()
536
+ @phone_numbers.register()
475
537
  def format_nanp(col: Column) -> Column:
476
538
  """
477
539
  Format NANP phone number in standard hyphen format (XXX-XXX-XXXX).
478
-
540
+
479
541
  Args:
480
542
  col: Column containing phone number
481
-
543
+
482
544
  Returns:
483
545
  Column with formatted phone number
484
546
  """
485
547
  # Remove extension for validation but preserve it
486
548
  extension = extract_extension(col)
487
- phone_no_ext = remove_extension(col)
488
-
489
- area_code = extract_area_code(phone_no_ext)
490
- exchange = extract_exchange(phone_no_ext)
491
- subscriber = extract_subscriber(phone_no_ext)
492
-
493
- base_format = F.concat(
494
- area_code, F.lit("-"),
495
- exchange, F.lit("-"),
496
- subscriber
497
- )
498
-
549
+ phone_numbers_no_ext = remove_extension(col)
550
+
551
+ area_code = extract_area_code(phone_numbers_no_ext)
552
+ exchange = extract_exchange(phone_numbers_no_ext)
553
+ subscriber = extract_subscriber(phone_numbers_no_ext)
554
+
555
+ base_format = F.concat(area_code, F.lit("-"), exchange, F.lit("-"), subscriber)
556
+
499
557
  # Add extension if present
500
558
  formatted = F.when(
501
- (extension != ""),
502
- F.concat(base_format, F.lit(" ext. "), extension)
559
+ (extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
503
560
  ).otherwise(base_format)
504
-
505
- return F.when(
506
- is_valid_nanp(phone_no_ext),
507
- formatted
508
- ).otherwise(F.lit(""))
561
+
562
+ return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
509
563
 
510
564
 
511
- @phones.register()
565
+ @phone_numbers.register()
512
566
  def format_nanp_paren(col: Column) -> Column:
513
567
  """
514
568
  Format NANP phone number with parentheses ((XXX) XXX-XXXX).
515
-
569
+
516
570
  Args:
517
571
  col: Column containing phone number
518
-
572
+
519
573
  Returns:
520
574
  Column with formatted phone number
521
575
  """
522
576
  # Remove extension for validation but preserve it
523
577
  extension = extract_extension(col)
524
- phone_no_ext = remove_extension(col)
525
-
526
- area_code = extract_area_code(phone_no_ext)
527
- exchange = extract_exchange(phone_no_ext)
528
- subscriber = extract_subscriber(phone_no_ext)
529
-
578
+ phone_numbers_no_ext = remove_extension(col)
579
+
580
+ area_code = extract_area_code(phone_numbers_no_ext)
581
+ exchange = extract_exchange(phone_numbers_no_ext)
582
+ subscriber = extract_subscriber(phone_numbers_no_ext)
583
+
530
584
  base_format = F.concat(
531
- F.lit("("), area_code, F.lit(") "),
532
- exchange, F.lit("-"), subscriber
585
+ F.lit("("), area_code, F.lit(") "), exchange, F.lit("-"), subscriber
533
586
  )
534
-
587
+
535
588
  # Add extension if present
536
589
  formatted = F.when(
537
- (extension != ""),
538
- F.concat(base_format, F.lit(" ext. "), extension)
590
+ (extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
539
591
  ).otherwise(base_format)
540
-
541
- return F.when(
542
- is_valid_nanp(phone_no_ext),
543
- formatted
544
- ).otherwise(F.lit(""))
592
+
593
+ return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
545
594
 
546
595
 
547
- @phones.register()
596
+ @phone_numbers.register()
548
597
  def format_nanp_dot(col: Column) -> Column:
549
598
  """
550
599
  Format NANP phone number with dots (XXX.XXX.XXXX).
551
-
600
+
552
601
  Args:
553
602
  col: Column containing phone number
554
-
603
+
555
604
  Returns:
556
605
  Column with formatted phone number
557
606
  """
558
607
  # Remove extension for validation but preserve it
559
608
  extension = extract_extension(col)
560
- phone_no_ext = remove_extension(col)
561
-
562
- area_code = extract_area_code(phone_no_ext)
563
- exchange = extract_exchange(phone_no_ext)
564
- subscriber = extract_subscriber(phone_no_ext)
565
-
566
- base_format = F.concat(
567
- area_code, F.lit("."),
568
- exchange, F.lit("."),
569
- subscriber
570
- )
571
-
609
+ phone_numbers_no_ext = remove_extension(col)
610
+
611
+ area_code = extract_area_code(phone_numbers_no_ext)
612
+ exchange = extract_exchange(phone_numbers_no_ext)
613
+ subscriber = extract_subscriber(phone_numbers_no_ext)
614
+
615
+ base_format = F.concat(area_code, F.lit("."), exchange, F.lit("."), subscriber)
616
+
572
617
  # Add extension if present
573
618
  formatted = F.when(
574
- (extension != ""),
575
- F.concat(base_format, F.lit(" ext. "), extension)
619
+ (extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
576
620
  ).otherwise(base_format)
577
-
578
- return F.when(
579
- is_valid_nanp(phone_no_ext),
580
- formatted
581
- ).otherwise(F.lit(""))
621
+
622
+ return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
582
623
 
583
624
 
584
- @phones.register()
625
+ @phone_numbers.register()
585
626
  def format_nanp_space(col: Column) -> Column:
586
627
  """
587
628
  Format NANP phone number with spaces (XXX XXX XXXX).
588
-
629
+
589
630
  Args:
590
631
  col: Column containing phone number
591
-
632
+
592
633
  Returns:
593
634
  Column with formatted phone number
594
635
  """
595
636
  # Remove extension for validation but preserve it
596
637
  extension = extract_extension(col)
597
- phone_no_ext = remove_extension(col)
598
-
599
- area_code = extract_area_code(phone_no_ext)
600
- exchange = extract_exchange(phone_no_ext)
601
- subscriber = extract_subscriber(phone_no_ext)
602
-
603
- base_format = F.concat(
604
- area_code, F.lit(" "),
605
- exchange, F.lit(" "),
606
- subscriber
607
- )
608
-
638
+ phone_numbers_no_ext = remove_extension(col)
639
+
640
+ area_code = extract_area_code(phone_numbers_no_ext)
641
+ exchange = extract_exchange(phone_numbers_no_ext)
642
+ subscriber = extract_subscriber(phone_numbers_no_ext)
643
+
644
+ base_format = F.concat(area_code, F.lit(" "), exchange, F.lit(" "), subscriber)
645
+
609
646
  # Add extension if present
610
647
  formatted = F.when(
611
- (extension != ""),
612
- F.concat(base_format, F.lit(" ext. "), extension)
648
+ (extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
613
649
  ).otherwise(base_format)
614
-
615
- return F.when(
616
- is_valid_nanp(phone_no_ext),
617
- formatted
618
- ).otherwise(F.lit(""))
650
+
651
+ return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
619
652
 
620
653
 
621
- @phones.register()
654
+ @phone_numbers.register()
622
655
  def format_international(col: Column) -> Column:
623
656
  """
624
657
  Format international phone number with country code.
625
-
658
+
626
659
  Args:
627
660
  col: Column containing phone number
628
-
661
+
629
662
  Returns:
630
663
  Column with formatted international number
631
664
  """
632
665
  country_code = extract_country_code(col)
633
666
  digits = extract_digits(col)
634
-
667
+
635
668
  # For international numbers, if we have a country code, remove it from the beginning
636
669
  # Use F.substring with proper column references
637
670
  cc_length = F.length(country_code)
638
671
  remaining_digits = F.when(
639
672
  (country_code != "") & (cc_length > 0) & digits.startswith(country_code),
640
- F.substring(digits, cc_length + 1, 999)
673
+ F.substring(digits, cc_length + 1, 999),
641
674
  ).otherwise(digits)
642
-
643
- return F.when(
644
- is_valid_international(col) & (country_code != ""),
645
- F.concat(F.lit("+"), country_code, F.lit(" "), remaining_digits)
646
- ).when(
647
- is_valid_international(col),
648
- digits
649
- ).otherwise(F.lit(""))
675
+
676
+ return (
677
+ F.when(
678
+ is_valid_international(col) & (country_code != ""),
679
+ F.concat(F.lit("+"), country_code, F.lit(" "), remaining_digits),
680
+ )
681
+ .when(is_valid_international(col), digits)
682
+ .otherwise(F.lit(""))
683
+ )
650
684
 
651
685
 
652
- @phones.register()
686
+ @phone_numbers.register()
653
687
  def format_e164(col: Column) -> Column:
654
688
  """
655
689
  Format phone number in E.164 format (+CCAAANNNNNNN) with default country code 1.
656
-
690
+
657
691
  Args:
658
692
  col: Column containing phone number
659
-
693
+
660
694
  Returns:
661
695
  Column with E.164 formatted number
662
696
  """
663
697
  digits = extract_digits(col)
664
698
  country_code = extract_country_code(col)
665
-
699
+
666
700
  # Check if it's a valid NANP number first
667
701
  is_nanp = is_valid_nanp(col)
668
-
702
+
669
703
  # Use default country code "1" if not present and number is 10 digits NANP
670
704
  final_country = F.when(
671
- (country_code == "") & (F.length(digits) == 10) & is_nanp,
672
- F.lit("1")
705
+ (country_code == "") & (F.length(digits) == 10) & is_nanp, F.lit("1")
673
706
  ).otherwise(country_code)
674
-
707
+
675
708
  # Build E.164 format - only for valid phones
676
709
  return F.when(
677
- is_valid_phone(col),
710
+ is_valid_phone_numbers(col),
678
711
  F.when(
679
- (F.length(digits) == 10) & is_nanp,
680
- F.concat(F.lit("+"), F.lit("1"), digits)
681
- ).when(
712
+ (F.length(digits) == 10) & is_nanp, F.concat(F.lit("+"), F.lit("1"), digits)
713
+ )
714
+ .when(
682
715
  (F.length(digits) == 11) & digits.startswith("1") & is_nanp,
683
- F.concat(F.lit("+"), digits)
684
- ).when(
716
+ F.concat(F.lit("+"), digits),
717
+ )
718
+ .when(
685
719
  (country_code != "") & is_valid_international(col),
686
- F.concat(F.lit("+"), digits) # digits already includes country code
687
- ).otherwise(F.lit(""))
720
+ F.concat(F.lit("+"), digits), # digits already includes country code
721
+ )
722
+ .otherwise(F.lit("")),
688
723
  ).otherwise(F.lit(""))
689
724
 
690
725
 
@@ -693,127 +728,142 @@ def format_e164(col: Column) -> Column:
693
728
  # ============================================================================
694
729
 
695
730
 
696
- @phones.register()
697
- def standardize_phone(col: Column) -> Column:
731
+ @phone_numbers.register()
732
+ def standardize_phone_numbers(col: Column) -> Column:
698
733
  """
699
734
  Standardize phone number with cleaning and NANP formatting.
700
-
735
+
701
736
  Args:
702
737
  col: Column containing phone number
703
-
738
+
704
739
  Returns:
705
740
  Column with standardized phone number in NANP format
706
741
  """
707
742
  # Clean and convert letters in a simpler way
708
743
  cleaned = convert_letters_to_numbers(col)
709
-
744
+
710
745
  # Extract extension first
711
746
  extension = extract_extension(cleaned)
712
- phone_no_ext = remove_extension(cleaned)
713
-
747
+ phone_no_ext = remove_extension(cleaned)
748
+
714
749
  # Get digits and check validity
715
750
  digits = extract_digits(phone_no_ext)
716
-
751
+
717
752
  # Simple NANP formatting for valid 10 or 11 digit numbers
718
- result = F.when(
719
- F.length(digits) == 10,
720
- F.concat(
721
- F.substring(digits, 1, 3), F.lit("-"),
722
- F.substring(digits, 4, 3), F.lit("-"),
723
- F.substring(digits, 7, 4)
753
+ result = (
754
+ F.when(
755
+ F.length(digits) == 10,
756
+ F.concat(
757
+ F.substring(digits, 1, 3),
758
+ F.lit("-"),
759
+ F.substring(digits, 4, 3),
760
+ F.lit("-"),
761
+ F.substring(digits, 7, 4),
762
+ ),
724
763
  )
725
- ).when(
726
- F.length(digits) == 11,
727
- F.concat(
728
- F.substring(digits, 2, 3), F.lit("-"),
729
- F.substring(digits, 5, 3), F.lit("-"),
730
- F.substring(digits, 8, 4)
764
+ .when(
765
+ F.length(digits) == 11,
766
+ F.concat(
767
+ F.substring(digits, 2, 3),
768
+ F.lit("-"),
769
+ F.substring(digits, 5, 3),
770
+ F.lit("-"),
771
+ F.substring(digits, 8, 4),
772
+ ),
731
773
  )
732
- ).otherwise(F.lit(""))
733
-
774
+ .otherwise(F.lit(""))
775
+ )
776
+
734
777
  # Add extension back if present
735
778
  final_result = F.when(
736
- (extension != "") & (result != ""),
737
- F.concat(result, F.lit(" ext. "), extension)
779
+ (extension != "") & (result != ""), F.concat(result, F.lit(" ext. "), extension)
738
780
  ).otherwise(result)
739
-
781
+
740
782
  return final_result
741
783
 
742
784
 
743
- @phones.register()
744
- def standardize_phone_e164(col: Column) -> Column:
785
+ @phone_numbers.register()
786
+ def standardize_phone_numbers_e164(col: Column) -> Column:
745
787
  """
746
788
  Standardize phone number with cleaning and E.164 formatting.
747
-
789
+
748
790
  Args:
749
791
  col: Column containing phone number
750
-
792
+
751
793
  Returns:
752
794
  Column with standardized phone number in E.164 format
753
795
  """
754
796
  # Clean and convert letters
755
797
  cleaned = convert_letters_to_numbers(col)
756
-
798
+
757
799
  # Format as E.164
758
800
  result = format_e164(cleaned)
759
-
801
+
760
802
  # Only return valid phone numbers
761
- return F.when(is_valid_phone(cleaned), result).otherwise(F.lit(""))
803
+ return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
762
804
 
763
805
 
764
- @phones.register()
765
- def standardize_phone_digits(col: Column) -> Column:
806
+ @phone_numbers.register()
807
+ def standardize_phone_numbers_digits(col: Column) -> Column:
766
808
  """
767
809
  Standardize phone number and return digits only.
768
-
810
+
769
811
  Args:
770
812
  col: Column containing phone number
771
-
813
+
772
814
  Returns:
773
815
  Column with digits only
774
816
  """
775
817
  # Clean and convert letters
776
818
  cleaned = convert_letters_to_numbers(col)
777
-
819
+
778
820
  # Get digits only
779
821
  result = extract_digits(cleaned)
780
-
822
+
781
823
  # Only return valid phone numbers
782
- return F.when(is_valid_phone(cleaned), result).otherwise(F.lit(""))
824
+ return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
783
825
 
784
826
 
785
- @phones.register()
786
- def clean_phone(col: Column) -> Column:
827
+ @phone_numbers.register()
828
+ def clean_phone_numbers(col: Column) -> Column:
787
829
  """
788
830
  Clean and validate phone number, returning null for invalid numbers.
789
-
831
+
790
832
  Args:
791
833
  col: Column containing phone number
792
-
834
+
793
835
  Returns:
794
836
  Column with cleaned phone number or null
795
837
  """
796
838
  # Simple implementation to avoid deep nesting
797
839
  cleaned = convert_letters_to_numbers(col)
798
840
  digits = extract_digits(cleaned)
799
-
841
+
800
842
  # Simple validation and formatting
801
- result = F.when(
802
- F.length(digits) == 10,
803
- F.concat(
804
- F.substring(digits, 1, 3), F.lit("-"),
805
- F.substring(digits, 4, 3), F.lit("-"),
806
- F.substring(digits, 7, 4)
843
+ result = (
844
+ F.when(
845
+ F.length(digits) == 10,
846
+ F.concat(
847
+ F.substring(digits, 1, 3),
848
+ F.lit("-"),
849
+ F.substring(digits, 4, 3),
850
+ F.lit("-"),
851
+ F.substring(digits, 7, 4),
852
+ ),
807
853
  )
808
- ).when(
809
- F.length(digits) == 11,
810
- F.concat(
811
- F.substring(digits, 2, 3), F.lit("-"),
812
- F.substring(digits, 5, 3), F.lit("-"),
813
- F.substring(digits, 8, 4)
854
+ .when(
855
+ F.length(digits) == 11,
856
+ F.concat(
857
+ F.substring(digits, 2, 3),
858
+ F.lit("-"),
859
+ F.substring(digits, 5, 3),
860
+ F.lit("-"),
861
+ F.substring(digits, 8, 4),
862
+ ),
814
863
  )
815
- ).otherwise(F.lit(None))
816
-
864
+ .otherwise(F.lit(None))
865
+ )
866
+
817
867
  return result
818
868
 
819
869
 
@@ -822,14 +872,14 @@ def clean_phone(col: Column) -> Column:
822
872
  # ============================================================================
823
873
 
824
874
 
825
- @phones.register()
826
- def get_phone_type(col: Column) -> Column:
875
+ @phone_numbers.register()
876
+ def get_phone_numbers_type(col: Column) -> Column:
827
877
  """
828
878
  Get phone number type (toll-free, premium, standard, international).
829
-
879
+
830
880
  Args:
831
- col: Column containing phone number
832
-
881
+ col: Column containing phone number
882
+
833
883
  Returns:
834
884
  Column with phone type
835
885
  """
@@ -842,55 +892,55 @@ def get_phone_type(col: Column) -> Column:
842
892
  )
843
893
 
844
894
 
845
- @phones.register()
895
+ @phone_numbers.register()
846
896
  def get_region_from_area_code(col: Column) -> Column:
847
897
  """
848
898
  Get geographic region from area code (simplified - would need lookup table).
849
-
899
+
850
900
  Args:
851
901
  col: Column containing phone number
852
-
902
+
853
903
  Returns:
854
904
  Column with region or empty string
855
905
  """
856
906
  area_code = extract_area_code(col)
857
-
907
+
858
908
  # This is a simplified example - in practice you'd use a lookup table
859
909
  # Just showing structure for major area codes
860
- return F.when(area_code == "212", F.lit("New York, NY")).\
861
- when(area_code == "213", F.lit("Los Angeles, CA")).\
862
- when(area_code == "312", F.lit("Chicago, IL")).\
863
- when(area_code == "415", F.lit("San Francisco, CA")).\
864
- when(area_code == "202", F.lit("Washington, DC")).\
865
- when(area_code.isin(["800", "888", "877", "866", "855", "844", "833"]),
866
- F.lit("Toll-Free")).\
867
- when(area_code == "900", F.lit("Premium")).\
868
- otherwise(F.lit(""))
910
+ return (
911
+ F.when(area_code == "212", F.lit("New York, NY"))
912
+ .when(area_code == "213", F.lit("Los Angeles, CA"))
913
+ .when(area_code == "312", F.lit("Chicago, IL"))
914
+ .when(area_code == "415", F.lit("San Francisco, CA"))
915
+ .when(area_code == "202", F.lit("Washington, DC"))
916
+ .when(
917
+ area_code.isin(["800", "888", "877", "866", "855", "844", "833"]),
918
+ F.lit("Toll-Free"),
919
+ )
920
+ .when(area_code == "900", F.lit("Premium"))
921
+ .otherwise(F.lit(""))
922
+ )
869
923
 
870
924
 
871
- @phones.register()
872
- def mask_phone(col: Column) -> Column:
925
+ @phone_numbers.register()
926
+ def mask_phone_numbers(col: Column) -> Column:
873
927
  """
874
928
  Mask phone number for privacy keeping last 4 digits (e.g., ***-***-1234).
875
-
929
+
876
930
  Args:
877
931
  col: Column containing phone number
878
-
932
+
879
933
  Returns:
880
934
  Column with masked phone number
881
935
  """
882
936
  subscriber = extract_subscriber(col)
883
-
937
+
884
938
  # Mask area code and exchange, keep last 4 digits
885
939
  masked = F.when(
886
940
  is_valid_nanp(col),
887
- F.concat(
888
- F.lit("***"), F.lit("-"),
889
- F.lit("***"), F.lit("-"),
890
- subscriber
891
- )
941
+ F.concat(F.lit("***"), F.lit("-"), F.lit("***"), F.lit("-"), subscriber),
892
942
  ).otherwise(col)
893
-
943
+
894
944
  return F.when(col.isNull() | (col == ""), F.lit(None)).otherwise(masked)
895
945
 
896
946
 
@@ -899,43 +949,43 @@ def mask_phone(col: Column) -> Column:
899
949
  # ============================================================================
900
950
 
901
951
 
902
- @phones.register()
903
- def filter_valid_phones(col: Column) -> Column:
952
+ @phone_numbers.register()
953
+ def filter_valid_phone_numbers_numbers(col: Column) -> Column:
904
954
  """
905
- Return phone number only if valid, otherwise return null.
906
-
955
+ Return phone number only if valid, otherwise return null.
956
+
907
957
  Args:
908
958
  col: Column containing phone number
909
-
959
+
910
960
  Returns:
911
961
  Column with valid phone or null
912
962
  """
913
- return F.when(is_valid_phone(col), col).otherwise(F.lit(None))
963
+ return F.when(is_valid_phone_numbers(col), col).otherwise(F.lit(None))
914
964
 
915
965
 
916
- @phones.register()
917
- def filter_nanp_phones(col: Column) -> Column:
966
+ @phone_numbers.register()
967
+ def filter_nanp_phone_numbers_numbers(col: Column) -> Column:
918
968
  """
919
- Return phone number only if valid NANP, otherwise return null.
920
-
969
+ Return phone number only if valid NANP, otherwise return null.
970
+
921
971
  Args:
922
972
  col: Column containing phone number
923
-
973
+
924
974
  Returns:
925
975
  Column with NANP phone or null
926
976
  """
927
977
  return F.when(is_valid_nanp(col), col).otherwise(F.lit(None))
928
978
 
929
979
 
930
- @phones.register()
931
- def filter_toll_free_phones(col: Column) -> Column:
980
+ @phone_numbers.register()
981
+ def filter_toll_free_phone_numbers_numbers(col: Column) -> Column:
932
982
  """
933
983
  Return phone number only if toll-free, otherwise return null.
934
-
984
+
935
985
  Args:
936
986
  col: Column containing phone number
937
-
987
+
938
988
  Returns:
939
989
  Column with toll-free phone or null
940
990
  """
941
- return F.when(is_toll_free(col), col).otherwise(F.lit(None))
991
+ return F.when(is_toll_free(col), col).otherwise(F.lit(None))