datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/cli/commands/add.py +49 -21
- datacompose/cli/commands/init.py +35 -9
- datacompose/cli/commands/list.py +2 -2
- datacompose/cli/config.py +80 -0
- datacompose/cli/main.py +3 -3
- datacompose/generators/base.py +15 -14
- datacompose/generators/pyspark/generator.py +5 -10
- datacompose/operators/__init__.py +1 -1
- datacompose/operators/primitives.py +57 -19
- datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py +68 -13
- datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py +53 -1
- datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py +416 -366
- datacompose-0.2.6.0.dist-info/METADATA +94 -0
- datacompose-0.2.6.0.dist-info/RECORD +31 -0
- datacompose-0.2.4.1.dist-info/METADATA +0 -449
- datacompose-0.2.4.1.dist-info/RECORD +0 -30
- /datacompose/transformers/text/{clean_addresses → addresses}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_emails → emails}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/__init__.py +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/top_level.txt +0 -0
datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py
RENAMED
|
@@ -1,3 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Phone number transformation primitives for PySpark.
|
|
3
|
+
|
|
4
|
+
Preview Output:
|
|
5
|
+
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
6
|
+
|phone_numbers |standardized |is_valid|area_code|local_number|has_ext|extension|is_toll_free|
|
|
7
|
+
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
8
|
+
| (555) 123-4567 |(555) 123-4567 |true |555 |1234567 |false |null |false |
|
|
9
|
+
|+1-800-555-1234 |+1 800-555-1234 |true |800 |5551234 |false |null |true |
|
|
10
|
+
|555.123.4567 ext 890 |555.123.4567 |true |555 |1234567 |true |890 |false |
|
|
11
|
+
|123-45-67 |null |false |null |null |false |null |false |
|
|
12
|
+
|1-800-FLOWERS |1-800-356-9377 |true |800 |3569377 |false |null |true |
|
|
13
|
+
| 415 555 0123 |415-555-0123 |true |415 |5550123 |false |null |false |
|
|
14
|
+
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
15
|
+
|
|
16
|
+
Usage Example:
|
|
17
|
+
from pyspark.sql import SparkSession
|
|
18
|
+
from pyspark.sql import functions as F
|
|
19
|
+
from transformers.pyspark.phone_numbers import phone_numbers
|
|
20
|
+
|
|
21
|
+
# Initialize Spark
|
|
22
|
+
spark = SparkSession.builder.appName("PhoneCleaning").getOrCreate()
|
|
23
|
+
|
|
24
|
+
# Create sample data
|
|
25
|
+
data = [
|
|
26
|
+
("(555) 123-4567",),
|
|
27
|
+
("+1-800-555-1234",),
|
|
28
|
+
("555.123.4567 ext 890",),
|
|
29
|
+
("123-45-67",),
|
|
30
|
+
("1-800-FLOWERS",),
|
|
31
|
+
]
|
|
32
|
+
df = spark.createDataFrame(data, ["phone_numbers"])
|
|
33
|
+
|
|
34
|
+
# Apply transformations
|
|
35
|
+
result_df = df.select(
|
|
36
|
+
F.col("phone_numbers"),
|
|
37
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers")).alias("standardized"),
|
|
38
|
+
phone_numbers.is_valid_phone_numbers(F.col("phone_numbers")).alias("is_valid"),
|
|
39
|
+
phone_numbers.extract_area_code(
|
|
40
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
41
|
+
).alias("area_code"),
|
|
42
|
+
phone_numbers.extract_local_number(
|
|
43
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
44
|
+
).alias("local_number"),
|
|
45
|
+
phone_numbers.has_extension(F.col("phone_numbers")).alias("has_ext"),
|
|
46
|
+
phone_numbers.extract_extension(F.col("phone_numbers")).alias("extension"),
|
|
47
|
+
phone_numbers.is_toll_free(
|
|
48
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
49
|
+
).alias("is_toll_free")
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Show results
|
|
53
|
+
result_df.show(truncate=False)
|
|
54
|
+
|
|
55
|
+
Installation:
|
|
56
|
+
datacompose add phone_numbers
|
|
57
|
+
"""
|
|
58
|
+
|
|
1
59
|
import re
|
|
2
60
|
from typing import TYPE_CHECKING, Dict, Optional
|
|
3
61
|
|
|
@@ -16,23 +74,41 @@ else:
|
|
|
16
74
|
|
|
17
75
|
try:
|
|
18
76
|
# Try local utils import first (for generated code)
|
|
19
|
-
from utils.primitives import PrimitiveRegistry
|
|
77
|
+
from utils.primitives import PrimitiveRegistry # type: ignore
|
|
20
78
|
except ImportError:
|
|
21
79
|
# Fall back to installed datacompose package
|
|
22
80
|
from datacompose.operators.primitives import PrimitiveRegistry
|
|
23
81
|
|
|
24
|
-
|
|
82
|
+
phone_numbers = PrimitiveRegistry("phone_numbers")
|
|
25
83
|
|
|
26
84
|
# Phone keypad mapping for letter to number conversion
|
|
27
85
|
PHONE_KEYPAD_MAPPING = {
|
|
28
|
-
"A": "2",
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
"
|
|
86
|
+
"A": "2",
|
|
87
|
+
"B": "2",
|
|
88
|
+
"C": "2",
|
|
89
|
+
"D": "3",
|
|
90
|
+
"E": "3",
|
|
91
|
+
"F": "3",
|
|
92
|
+
"G": "4",
|
|
93
|
+
"H": "4",
|
|
94
|
+
"I": "4",
|
|
95
|
+
"J": "5",
|
|
96
|
+
"K": "5",
|
|
97
|
+
"L": "5",
|
|
98
|
+
"M": "6",
|
|
99
|
+
"N": "6",
|
|
100
|
+
"O": "6",
|
|
101
|
+
"P": "7",
|
|
102
|
+
"Q": "7",
|
|
103
|
+
"R": "7",
|
|
104
|
+
"S": "7",
|
|
105
|
+
"T": "8",
|
|
106
|
+
"U": "8",
|
|
107
|
+
"V": "8",
|
|
108
|
+
"W": "9",
|
|
109
|
+
"X": "9",
|
|
110
|
+
"Y": "9",
|
|
111
|
+
"Z": "9",
|
|
36
112
|
}
|
|
37
113
|
|
|
38
114
|
|
|
@@ -41,57 +117,56 @@ PHONE_KEYPAD_MAPPING = {
|
|
|
41
117
|
# ============================================================================
|
|
42
118
|
|
|
43
119
|
|
|
44
|
-
@
|
|
45
|
-
def
|
|
120
|
+
@phone_numbers.register()
|
|
121
|
+
def extract_phone_numbers_from_text(col: Column) -> Column:
|
|
46
122
|
"""
|
|
47
123
|
Extract first phone number from text using regex patterns.
|
|
48
|
-
|
|
124
|
+
|
|
49
125
|
Args:
|
|
50
126
|
col: Column containing text with potential phone numbers
|
|
51
|
-
|
|
127
|
+
|
|
52
128
|
Returns:
|
|
53
|
-
Column with extracted phone
|
|
129
|
+
Column with the first extracted phone number, or an empty string if none is found
|
|
54
130
|
"""
|
|
55
|
-
# Comprehensive
|
|
131
|
+
# Comprehensive phone number pattern that matches various formats
|
|
56
132
|
# Handles: +1-555-123-4567, (555) 123-4567, 555.123.4567, 555-123-4567, etc.
|
|
57
|
-
|
|
58
|
-
|
|
133
|
+
phone_numbers_pattern = (
|
|
134
|
+
r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
|
|
135
|
+
)
|
|
136
|
+
|
|
59
137
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
60
|
-
F.regexp_extract(col,
|
|
138
|
+
F.regexp_extract(col, phone_numbers_pattern, 0)
|
|
61
139
|
)
|
|
62
140
|
|
|
63
141
|
|
|
64
|
-
@
|
|
65
|
-
def
|
|
142
|
+
@phone_numbers.register()
|
|
143
|
+
def extract_all_phone_numbers_from_text(col: Column) -> Column:
|
|
66
144
|
"""
|
|
67
145
|
Extract all phone numbers from text as an array.
|
|
68
|
-
|
|
146
|
+
|
|
69
147
|
Args:
|
|
70
148
|
col: Column containing text with potential phone numbers
|
|
71
|
-
|
|
149
|
+
|
|
72
150
|
Returns:
|
|
73
151
|
Column with array of phone numbers
|
|
74
152
|
"""
|
|
75
|
-
# For simplicity, we'll return an array with just the first
|
|
153
|
+
# For simplicity, we'll return an array with just the first phone number found
|
|
76
154
|
# A proper implementation would require more complex regex or UDF
|
|
77
155
|
# This is a limitation of Spark SQL's regex capabilities
|
|
78
|
-
|
|
79
|
-
|
|
156
|
+
first_phone_numbers = extract_phone_numbers_from_text(col)
|
|
157
|
+
|
|
80
158
|
# Return array with single element or empty array
|
|
81
|
-
return F.when(
|
|
82
|
-
first_phone != "",
|
|
83
|
-
F.array(first_phone)
|
|
84
|
-
).otherwise(F.array())
|
|
159
|
+
return F.when(first_phone_numbers != "", F.array(first_phone_numbers)).otherwise(F.array())
|
|
85
160
|
|
|
86
161
|
|
|
87
|
-
@
|
|
162
|
+
@phone_numbers.register()
|
|
88
163
|
def extract_digits(col: Column) -> Column:
|
|
89
164
|
"""
|
|
90
165
|
Extract only digits from phone number string.
|
|
91
|
-
|
|
166
|
+
|
|
92
167
|
Args:
|
|
93
168
|
col: Column containing phone number
|
|
94
|
-
|
|
169
|
+
|
|
95
170
|
Returns:
|
|
96
171
|
Column with only digits
|
|
97
172
|
"""
|
|
@@ -100,143 +175,131 @@ def extract_digits(col: Column) -> Column:
|
|
|
100
175
|
)
|
|
101
176
|
|
|
102
177
|
|
|
103
|
-
@
|
|
178
|
+
@phone_numbers.register()
|
|
104
179
|
def extract_extension(col: Column) -> Column:
|
|
105
180
|
"""
|
|
106
181
|
Extract extension from phone number if present.
|
|
107
|
-
|
|
182
|
+
|
|
108
183
|
Args:
|
|
109
184
|
col: Column containing phone number
|
|
110
|
-
|
|
185
|
+
|
|
111
186
|
Returns:
|
|
112
187
|
Column with extension or empty string
|
|
113
188
|
"""
|
|
114
189
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
115
190
|
F.when(
|
|
116
|
-
col.rlike(r"ext\.?\s*(\d+)"),
|
|
117
|
-
F.regexp_extract(col, r"ext\.?\s*(\d+)", 1)
|
|
191
|
+
col.rlike(r"ext\.?\s*(\d+)"), F.regexp_extract(col, r"ext\.?\s*(\d+)", 1)
|
|
118
192
|
).otherwise("")
|
|
119
193
|
)
|
|
120
194
|
|
|
121
195
|
|
|
122
|
-
@
|
|
196
|
+
@phone_numbers.register()
|
|
123
197
|
def extract_country_code(col: Column) -> Column:
|
|
124
198
|
"""
|
|
125
199
|
Extract country code from phone number.
|
|
126
|
-
|
|
200
|
+
|
|
127
201
|
Args:
|
|
128
202
|
col: Column containing phone number
|
|
129
|
-
|
|
203
|
+
|
|
130
204
|
Returns:
|
|
131
205
|
Column with country code or empty string
|
|
132
206
|
"""
|
|
133
207
|
digits = extract_digits(col)
|
|
134
|
-
|
|
208
|
+
|
|
135
209
|
# Check for explicit country code with + prefix
|
|
136
210
|
has_plus = col.contains("+")
|
|
137
|
-
|
|
211
|
+
|
|
138
212
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
139
213
|
F.when(
|
|
140
214
|
# Explicit country code with +
|
|
141
215
|
has_plus & col.rlike(r"^\+(\d{1,3})"),
|
|
142
|
-
F.regexp_extract(col, r"^\+(\d{1,3})", 1)
|
|
143
|
-
)
|
|
216
|
+
F.regexp_extract(col, r"^\+(\d{1,3})", 1),
|
|
217
|
+
)
|
|
218
|
+
.when(
|
|
144
219
|
# NANP with leading 1 (11 digits total)
|
|
145
220
|
(F.length(digits) == 11) & digits.startswith("1"),
|
|
146
|
-
F.lit("1")
|
|
147
|
-
)
|
|
221
|
+
F.lit("1"),
|
|
222
|
+
)
|
|
223
|
+
.otherwise("")
|
|
148
224
|
)
|
|
149
225
|
|
|
150
226
|
|
|
151
|
-
@
|
|
227
|
+
@phone_numbers.register()
|
|
152
228
|
def extract_area_code(col: Column) -> Column:
|
|
153
229
|
"""
|
|
154
230
|
Extract area code from NANP phone number.
|
|
155
|
-
|
|
231
|
+
|
|
156
232
|
Args:
|
|
157
233
|
col: Column containing phone number
|
|
158
|
-
|
|
234
|
+
|
|
159
235
|
Returns:
|
|
160
236
|
Column with area code or empty string
|
|
161
237
|
"""
|
|
162
238
|
digits = extract_digits(col)
|
|
163
|
-
|
|
239
|
+
|
|
164
240
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
165
|
-
F.when(
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
).when(
|
|
169
|
-
F.length(digits) == 10,
|
|
170
|
-
F.substring(digits, 1, 3)
|
|
171
|
-
).otherwise("")
|
|
241
|
+
F.when(F.length(digits) == 11, F.substring(digits, 2, 3)) # Skip country code
|
|
242
|
+
.when(F.length(digits) == 10, F.substring(digits, 1, 3))
|
|
243
|
+
.otherwise("")
|
|
172
244
|
)
|
|
173
245
|
|
|
174
246
|
|
|
175
|
-
@
|
|
247
|
+
@phone_numbers.register()
|
|
176
248
|
def extract_exchange(col: Column) -> Column:
|
|
177
249
|
"""
|
|
178
250
|
Extract exchange (first 3 digits of local number) from NANP phone number.
|
|
179
|
-
|
|
251
|
+
|
|
180
252
|
Args:
|
|
181
253
|
col: Column containing phone number
|
|
182
|
-
|
|
254
|
+
|
|
183
255
|
Returns:
|
|
184
256
|
Column with exchange or empty string
|
|
185
257
|
"""
|
|
186
258
|
digits = extract_digits(col)
|
|
187
|
-
|
|
259
|
+
|
|
188
260
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
189
|
-
F.when(
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
).when(
|
|
193
|
-
F.length(digits) == 10,
|
|
194
|
-
F.substring(digits, 4, 3)
|
|
195
|
-
).otherwise("")
|
|
261
|
+
F.when(F.length(digits) == 11, F.substring(digits, 5, 3))
|
|
262
|
+
.when(F.length(digits) == 10, F.substring(digits, 4, 3))
|
|
263
|
+
.otherwise("")
|
|
196
264
|
)
|
|
197
265
|
|
|
198
266
|
|
|
199
|
-
@
|
|
267
|
+
@phone_numbers.register()
|
|
200
268
|
def extract_subscriber(col: Column) -> Column:
|
|
201
269
|
"""
|
|
202
270
|
Extract subscriber number (last 4 digits) from NANP phone number.
|
|
203
|
-
|
|
271
|
+
|
|
204
272
|
Args:
|
|
205
273
|
col: Column containing phone number
|
|
206
|
-
|
|
274
|
+
|
|
207
275
|
Returns:
|
|
208
276
|
Column with subscriber number or empty string
|
|
209
277
|
"""
|
|
210
278
|
digits = extract_digits(col)
|
|
211
|
-
|
|
279
|
+
|
|
212
280
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
213
|
-
F.when(
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
).when(
|
|
217
|
-
F.length(digits) == 10,
|
|
218
|
-
F.substring(digits, 7, 4)
|
|
219
|
-
).otherwise("")
|
|
281
|
+
F.when(F.length(digits) == 11, F.substring(digits, 8, 4))
|
|
282
|
+
.when(F.length(digits) == 10, F.substring(digits, 7, 4))
|
|
283
|
+
.otherwise("")
|
|
220
284
|
)
|
|
221
285
|
|
|
222
286
|
|
|
223
|
-
@
|
|
287
|
+
@phone_numbers.register()
|
|
224
288
|
def extract_local_number(col: Column) -> Column:
|
|
225
289
|
"""
|
|
226
290
|
Extract local number (exchange + subscriber) from NANP phone number.
|
|
227
|
-
|
|
291
|
+
|
|
228
292
|
Args:
|
|
229
293
|
col: Column containing phone number
|
|
230
|
-
|
|
294
|
+
|
|
231
295
|
Returns:
|
|
232
296
|
Column with 7-digit local number or empty string
|
|
233
297
|
"""
|
|
234
298
|
exchange = extract_exchange(col)
|
|
235
299
|
subscriber = extract_subscriber(col)
|
|
236
|
-
|
|
300
|
+
|
|
237
301
|
return F.when(
|
|
238
|
-
(exchange != "") & (subscriber != ""),
|
|
239
|
-
F.concat(exchange, subscriber)
|
|
302
|
+
(exchange != "") & (subscriber != ""), F.concat(exchange, subscriber)
|
|
240
303
|
).otherwise("")
|
|
241
304
|
|
|
242
305
|
|
|
@@ -245,14 +308,14 @@ def extract_local_number(col: Column) -> Column:
|
|
|
245
308
|
# ============================================================================
|
|
246
309
|
|
|
247
310
|
|
|
248
|
-
@
|
|
311
|
+
@phone_numbers.register()
|
|
249
312
|
def is_valid_nanp(col: Column) -> Column:
|
|
250
313
|
"""
|
|
251
314
|
Check if phone number is valid NANP format (North American Numbering Plan).
|
|
252
|
-
|
|
315
|
+
|
|
253
316
|
Args:
|
|
254
317
|
col: Column containing phone number
|
|
255
|
-
|
|
318
|
+
|
|
256
319
|
Returns:
|
|
257
320
|
Column with boolean indicating NANP validity
|
|
258
321
|
"""
|
|
@@ -260,108 +323,108 @@ def is_valid_nanp(col: Column) -> Column:
|
|
|
260
323
|
area_code = extract_area_code(col)
|
|
261
324
|
exchange = extract_exchange(col)
|
|
262
325
|
subscriber = extract_subscriber(col)
|
|
263
|
-
|
|
326
|
+
|
|
264
327
|
return F.when(col.isNull(), F.lit(False)).otherwise(
|
|
265
|
-
(F.length(digits).isin([10, 11]))
|
|
328
|
+
(F.length(digits).isin([10, 11]))
|
|
329
|
+
&
|
|
266
330
|
# Area code: 2-9 for first digit, 0-9 for second, 0-9 for third
|
|
267
|
-
(area_code.rlike(r"^[2-9]\d{2}$"))
|
|
331
|
+
(area_code.rlike(r"^[2-9]\d{2}$"))
|
|
332
|
+
&
|
|
268
333
|
# Exchange: first digit 1-9 is accepted here (historically restricted to 2-9)
|
|
269
|
-
(exchange.rlike(r"^[1-9]\d{2}$"))
|
|
334
|
+
(exchange.rlike(r"^[1-9]\d{2}$"))
|
|
335
|
+
&
|
|
270
336
|
# Subscriber: any 4 digits
|
|
271
|
-
(subscriber.rlike(r"^\d{4}$"))
|
|
337
|
+
(subscriber.rlike(r"^\d{4}$"))
|
|
338
|
+
&
|
|
272
339
|
# If 11 digits, must start with 1
|
|
273
340
|
((F.length(digits) == 10) | (digits.startswith("1")))
|
|
274
341
|
)
|
|
275
342
|
|
|
276
343
|
|
|
277
|
-
@
|
|
278
|
-
def is_valid_international(
|
|
344
|
+
@phone_numbers.register()
|
|
345
|
+
def is_valid_international(
|
|
346
|
+
col: Column, min_length: int = 7, max_length: int = 15
|
|
347
|
+
) -> Column:
|
|
279
348
|
"""
|
|
280
349
|
Check if phone number could be valid international format.
|
|
281
|
-
|
|
350
|
+
|
|
282
351
|
Args:
|
|
283
352
|
col: Column containing phone number
|
|
284
353
|
min_length: Minimum digits for international number
|
|
285
354
|
max_length: Maximum digits for international number
|
|
286
|
-
|
|
355
|
+
|
|
287
356
|
Returns:
|
|
288
357
|
Column with boolean indicating potential international validity
|
|
289
358
|
"""
|
|
290
359
|
digits = extract_digits(col)
|
|
291
|
-
|
|
360
|
+
|
|
292
361
|
return F.when(col.isNull(), F.lit(False)).otherwise(
|
|
293
|
-
(F.length(digits) >= min_length)
|
|
294
|
-
(F.length(digits) <= max_length)
|
|
295
|
-
digits.rlike(r"^\d+$")
|
|
362
|
+
(F.length(digits) >= min_length)
|
|
363
|
+
& (F.length(digits) <= max_length)
|
|
364
|
+
& digits.rlike(r"^\d+$")
|
|
296
365
|
)
|
|
297
366
|
|
|
298
367
|
|
|
299
|
-
@
|
|
300
|
-
def
|
|
368
|
+
@phone_numbers.register()
|
|
369
|
+
def is_valid_phone_numbers(col: Column) -> Column:
|
|
301
370
|
"""
|
|
302
371
|
Check if phone number is valid (NANP or international).
|
|
303
|
-
|
|
372
|
+
|
|
304
373
|
Args:
|
|
305
374
|
col: Column containing phone number
|
|
306
|
-
|
|
375
|
+
|
|
307
376
|
Returns:
|
|
308
377
|
Column with boolean indicating validity
|
|
309
378
|
"""
|
|
310
379
|
return is_valid_nanp(col) | is_valid_international(col)
|
|
311
380
|
|
|
312
381
|
|
|
313
|
-
@
|
|
382
|
+
@phone_numbers.register()
|
|
314
383
|
def is_toll_free(col: Column) -> Column:
|
|
315
384
|
"""
|
|
316
385
|
Check if phone number is toll-free (800, 888, 877, 866, 855, 844, 833).
|
|
317
|
-
|
|
386
|
+
|
|
318
387
|
Args:
|
|
319
388
|
col: Column containing phone number
|
|
320
|
-
|
|
389
|
+
|
|
321
390
|
Returns:
|
|
322
391
|
Column with boolean indicating if toll-free
|
|
323
392
|
"""
|
|
324
393
|
area_code = extract_area_code(col)
|
|
325
|
-
|
|
394
|
+
|
|
326
395
|
toll_free_codes = ["800", "888", "877", "866", "855", "844", "833"]
|
|
327
|
-
|
|
328
|
-
return F.when(col.isNull(), F.lit(False)).otherwise(
|
|
329
|
-
area_code.isin(toll_free_codes)
|
|
330
|
-
)
|
|
396
|
+
|
|
397
|
+
return F.when(col.isNull(), F.lit(False)).otherwise(area_code.isin(toll_free_codes))
|
|
331
398
|
|
|
332
399
|
|
|
333
|
-
@
|
|
400
|
+
@phone_numbers.register()
|
|
334
401
|
def is_premium_rate(col: Column) -> Column:
|
|
335
402
|
"""
|
|
336
403
|
Check if phone number is premium rate (900).
|
|
337
|
-
|
|
404
|
+
|
|
338
405
|
Args:
|
|
339
|
-
col: Column containing
|
|
340
|
-
|
|
406
|
+
col: Column containing phone number
|
|
407
|
+
|
|
341
408
|
Returns:
|
|
342
409
|
Column with boolean indicating if premium rate
|
|
343
410
|
"""
|
|
344
411
|
area_code = extract_area_code(col)
|
|
345
|
-
|
|
346
|
-
return F.when(col.isNull(), F.lit(False)).otherwise(
|
|
347
|
-
area_code == "900"
|
|
348
|
-
)
|
|
412
|
+
|
|
413
|
+
return F.when(col.isNull(), F.lit(False)).otherwise(area_code == "900")
|
|
349
414
|
|
|
350
415
|
|
|
351
|
-
@
|
|
416
|
+
@phone_numbers.register()
|
|
352
417
|
def has_extension(col: Column) -> Column:
|
|
353
418
|
"""
|
|
354
419
|
Check if phone number has an extension.
|
|
355
|
-
|
|
420
|
+
|
|
356
421
|
Args:
|
|
357
422
|
col: Column containing phone number
|
|
358
|
-
|
|
423
|
+
|
|
359
424
|
Returns:
|
|
360
425
|
Column with boolean indicating presence of extension
|
|
361
426
|
"""
|
|
362
|
-
return F.when(col.isNull(), F.lit(False)).otherwise(
|
|
363
|
-
col.rlike(r"ext\.?\s*\d+")
|
|
364
|
-
)
|
|
427
|
+
return F.when(col.isNull(), F.lit(False)).otherwise(col.rlike(r"ext\.?\s*\d+"))
|
|
365
428
|
|
|
366
429
|
|
|
367
430
|
# ============================================================================
|
|
@@ -369,28 +432,28 @@ def has_extension(col: Column) -> Column:
|
|
|
369
432
|
# ============================================================================
|
|
370
433
|
|
|
371
434
|
|
|
372
|
-
@
|
|
435
|
+
@phone_numbers.register()
|
|
373
436
|
def remove_non_digits(col: Column) -> Column:
|
|
374
437
|
"""
|
|
375
438
|
Remove all non-digit characters from phone number.
|
|
376
|
-
|
|
439
|
+
|
|
377
440
|
Args:
|
|
378
441
|
col: Column containing phone number
|
|
379
|
-
|
|
442
|
+
|
|
380
443
|
Returns:
|
|
381
444
|
Column with only digits
|
|
382
445
|
"""
|
|
383
446
|
return extract_digits(col)
|
|
384
447
|
|
|
385
448
|
|
|
386
|
-
@
|
|
449
|
+
@phone_numbers.register()
|
|
387
450
|
def remove_extension(col: Column) -> Column:
|
|
388
451
|
"""
|
|
389
452
|
Remove extension from phone number.
|
|
390
|
-
|
|
453
|
+
|
|
391
454
|
Args:
|
|
392
455
|
col: Column containing phone number
|
|
393
|
-
|
|
456
|
+
|
|
394
457
|
Returns:
|
|
395
458
|
Column with extension removed
|
|
396
459
|
"""
|
|
@@ -399,36 +462,36 @@ def remove_extension(col: Column) -> Column:
|
|
|
399
462
|
)
|
|
400
463
|
|
|
401
464
|
|
|
402
|
-
@
|
|
465
|
+
@phone_numbers.register()
|
|
403
466
|
def convert_letters_to_numbers(col: Column) -> Column:
|
|
404
467
|
"""
|
|
405
468
|
Convert phone letters to numbers (e.g., 1-800-FLOWERS to 1-800-3569377).
|
|
406
|
-
|
|
469
|
+
|
|
407
470
|
Args:
|
|
408
471
|
col: Column containing phone number with letters
|
|
409
|
-
|
|
472
|
+
|
|
410
473
|
Returns:
|
|
411
474
|
Column with letters converted to numbers
|
|
412
475
|
"""
|
|
413
476
|
result = col
|
|
414
|
-
|
|
477
|
+
|
|
415
478
|
# Apply each letter-to-number mapping
|
|
416
479
|
for letter, number in PHONE_KEYPAD_MAPPING.items():
|
|
417
480
|
result = F.regexp_replace(result, letter, number)
|
|
418
481
|
result = F.regexp_replace(result, letter.lower(), number)
|
|
419
|
-
|
|
482
|
+
|
|
420
483
|
return F.when(col.isNull(), F.lit("")).otherwise(result)
|
|
421
484
|
|
|
422
485
|
|
|
423
|
-
@
|
|
486
|
+
@phone_numbers.register()
|
|
424
487
|
def normalize_separators(col: Column) -> Column:
|
|
425
488
|
"""
|
|
426
489
|
Normalize various separator styles to hyphens.
|
|
427
490
|
Removes parentheses and replaces dots, spaces with hyphens.
|
|
428
|
-
|
|
491
|
+
|
|
429
492
|
Args:
|
|
430
493
|
col: Column containing phone number
|
|
431
|
-
|
|
494
|
+
|
|
432
495
|
Returns:
|
|
433
496
|
Column with normalized separators
|
|
434
497
|
"""
|
|
@@ -441,27 +504,26 @@ def normalize_separators(col: Column) -> Column:
|
|
|
441
504
|
result = F.regexp_replace(result, r"-+", "-")
|
|
442
505
|
# Remove leading/trailing hyphens
|
|
443
506
|
result = F.regexp_replace(result, r"^-+|-+$", "")
|
|
444
|
-
|
|
507
|
+
|
|
445
508
|
return F.when(col.isNull(), F.lit("")).otherwise(result)
|
|
446
509
|
|
|
447
510
|
|
|
448
|
-
@
|
|
511
|
+
@phone_numbers.register()
|
|
449
512
|
def add_country_code(col: Column) -> Column:
|
|
450
513
|
"""
|
|
451
514
|
Add country code "1" if not present (for NANP numbers).
|
|
452
|
-
|
|
515
|
+
|
|
453
516
|
Args:
|
|
454
517
|
col: Column containing phone number
|
|
455
|
-
|
|
518
|
+
|
|
456
519
|
Returns:
|
|
457
520
|
Column with country code added if needed
|
|
458
521
|
"""
|
|
459
522
|
digits = extract_digits(col)
|
|
460
|
-
|
|
523
|
+
|
|
461
524
|
return F.when(col.isNull(), col).otherwise(
|
|
462
525
|
F.when(
|
|
463
|
-
(F.length(digits) == 10) & is_valid_nanp(col),
|
|
464
|
-
F.concat(F.lit("1"), digits)
|
|
526
|
+
(F.length(digits) == 10) & is_valid_nanp(col), F.concat(F.lit("1"), digits)
|
|
465
527
|
).otherwise(digits)
|
|
466
528
|
)
|
|
467
529
|
|
|
@@ -471,220 +533,193 @@ def add_country_code(col: Column) -> Column:
|
|
|
471
533
|
# ============================================================================
|
|
472
534
|
|
|
473
535
|
|
|
474
|
-
@
|
|
536
|
+
@phone_numbers.register()
|
|
475
537
|
def format_nanp(col: Column) -> Column:
|
|
476
538
|
"""
|
|
477
539
|
Format NANP phone number in standard hyphen format (XXX-XXX-XXXX).
|
|
478
|
-
|
|
540
|
+
|
|
479
541
|
Args:
|
|
480
542
|
col: Column containing phone number
|
|
481
|
-
|
|
543
|
+
|
|
482
544
|
Returns:
|
|
483
545
|
Column with formatted phone number
|
|
484
546
|
"""
|
|
485
547
|
# Remove extension for validation but preserve it
|
|
486
548
|
extension = extract_extension(col)
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
area_code = extract_area_code(
|
|
490
|
-
exchange = extract_exchange(
|
|
491
|
-
subscriber = extract_subscriber(
|
|
492
|
-
|
|
493
|
-
base_format = F.concat(
|
|
494
|
-
|
|
495
|
-
exchange, F.lit("-"),
|
|
496
|
-
subscriber
|
|
497
|
-
)
|
|
498
|
-
|
|
549
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
550
|
+
|
|
551
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
552
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
553
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
554
|
+
|
|
555
|
+
base_format = F.concat(area_code, F.lit("-"), exchange, F.lit("-"), subscriber)
|
|
556
|
+
|
|
499
557
|
# Add extension if present
|
|
500
558
|
formatted = F.when(
|
|
501
|
-
(extension != ""),
|
|
502
|
-
F.concat(base_format, F.lit(" ext. "), extension)
|
|
559
|
+
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
503
560
|
).otherwise(base_format)
|
|
504
|
-
|
|
505
|
-
return F.when(
|
|
506
|
-
is_valid_nanp(phone_no_ext),
|
|
507
|
-
formatted
|
|
508
|
-
).otherwise(F.lit(""))
|
|
561
|
+
|
|
562
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
509
563
|
|
|
510
564
|
|
|
511
|
-
@
|
|
565
|
+
@phone_numbers.register()
|
|
512
566
|
def format_nanp_paren(col: Column) -> Column:
|
|
513
567
|
"""
|
|
514
568
|
Format NANP phone number with parentheses ((XXX) XXX-XXXX).
|
|
515
|
-
|
|
569
|
+
|
|
516
570
|
Args:
|
|
517
571
|
col: Column containing phone number
|
|
518
|
-
|
|
572
|
+
|
|
519
573
|
Returns:
|
|
520
574
|
Column with formatted phone number
|
|
521
575
|
"""
|
|
522
576
|
# Remove extension for validation but preserve it
|
|
523
577
|
extension = extract_extension(col)
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
area_code = extract_area_code(
|
|
527
|
-
exchange = extract_exchange(
|
|
528
|
-
subscriber = extract_subscriber(
|
|
529
|
-
|
|
578
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
579
|
+
|
|
580
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
581
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
582
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
583
|
+
|
|
530
584
|
base_format = F.concat(
|
|
531
|
-
F.lit("("), area_code, F.lit(") "),
|
|
532
|
-
exchange, F.lit("-"), subscriber
|
|
585
|
+
F.lit("("), area_code, F.lit(") "), exchange, F.lit("-"), subscriber
|
|
533
586
|
)
|
|
534
|
-
|
|
587
|
+
|
|
535
588
|
# Add extension if present
|
|
536
589
|
formatted = F.when(
|
|
537
|
-
(extension != ""),
|
|
538
|
-
F.concat(base_format, F.lit(" ext. "), extension)
|
|
590
|
+
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
539
591
|
).otherwise(base_format)
|
|
540
|
-
|
|
541
|
-
return F.when(
|
|
542
|
-
is_valid_nanp(phone_no_ext),
|
|
543
|
-
formatted
|
|
544
|
-
).otherwise(F.lit(""))
|
|
592
|
+
|
|
593
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
545
594
|
|
|
546
595
|
|
|
547
|
-
@
|
|
596
|
+
@phone_numbers.register()
|
|
548
597
|
def format_nanp_dot(col: Column) -> Column:
|
|
549
598
|
"""
|
|
550
599
|
Format NANP phone number with dots (XXX.XXX.XXXX).
|
|
551
|
-
|
|
600
|
+
|
|
552
601
|
Args:
|
|
553
602
|
col: Column containing phone number
|
|
554
|
-
|
|
603
|
+
|
|
555
604
|
Returns:
|
|
556
605
|
Column with formatted phone number
|
|
557
606
|
"""
|
|
558
607
|
# Remove extension for validation but preserve it
|
|
559
608
|
extension = extract_extension(col)
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
area_code = extract_area_code(
|
|
563
|
-
exchange = extract_exchange(
|
|
564
|
-
subscriber = extract_subscriber(
|
|
565
|
-
|
|
566
|
-
base_format = F.concat(
|
|
567
|
-
|
|
568
|
-
exchange, F.lit("."),
|
|
569
|
-
subscriber
|
|
570
|
-
)
|
|
571
|
-
|
|
609
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
610
|
+
|
|
611
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
612
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
613
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
614
|
+
|
|
615
|
+
base_format = F.concat(area_code, F.lit("."), exchange, F.lit("."), subscriber)
|
|
616
|
+
|
|
572
617
|
# Add extension if present
|
|
573
618
|
formatted = F.when(
|
|
574
|
-
(extension != ""),
|
|
575
|
-
F.concat(base_format, F.lit(" ext. "), extension)
|
|
619
|
+
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
576
620
|
).otherwise(base_format)
|
|
577
|
-
|
|
578
|
-
return F.when(
|
|
579
|
-
is_valid_nanp(phone_no_ext),
|
|
580
|
-
formatted
|
|
581
|
-
).otherwise(F.lit(""))
|
|
621
|
+
|
|
622
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
582
623
|
|
|
583
624
|
|
|
584
|
-
@
|
|
625
|
+
@phone_numbers.register()
|
|
585
626
|
def format_nanp_space(col: Column) -> Column:
|
|
586
627
|
"""
|
|
587
628
|
Format NANP phone number with spaces (XXX XXX XXXX).
|
|
588
|
-
|
|
629
|
+
|
|
589
630
|
Args:
|
|
590
631
|
col: Column containing phone number
|
|
591
|
-
|
|
632
|
+
|
|
592
633
|
Returns:
|
|
593
634
|
Column with formatted phone number
|
|
594
635
|
"""
|
|
595
636
|
# Remove extension for validation but preserve it
|
|
596
637
|
extension = extract_extension(col)
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
area_code = extract_area_code(
|
|
600
|
-
exchange = extract_exchange(
|
|
601
|
-
subscriber = extract_subscriber(
|
|
602
|
-
|
|
603
|
-
base_format = F.concat(
|
|
604
|
-
|
|
605
|
-
exchange, F.lit(" "),
|
|
606
|
-
subscriber
|
|
607
|
-
)
|
|
608
|
-
|
|
638
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
639
|
+
|
|
640
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
641
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
642
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
643
|
+
|
|
644
|
+
base_format = F.concat(area_code, F.lit(" "), exchange, F.lit(" "), subscriber)
|
|
645
|
+
|
|
609
646
|
# Add extension if present
|
|
610
647
|
formatted = F.when(
|
|
611
|
-
(extension != ""),
|
|
612
|
-
F.concat(base_format, F.lit(" ext. "), extension)
|
|
648
|
+
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
613
649
|
).otherwise(base_format)
|
|
614
|
-
|
|
615
|
-
return F.when(
|
|
616
|
-
is_valid_nanp(phone_no_ext),
|
|
617
|
-
formatted
|
|
618
|
-
).otherwise(F.lit(""))
|
|
650
|
+
|
|
651
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
619
652
|
|
|
620
653
|
|
|
621
|
-
@
|
|
654
|
+
@phone_numbers.register()
def format_international(col: Column) -> Column:
    """
    Format international phone number with country code.

    Numbers that pass is_valid_international and have a non-empty extracted
    country code are rendered as "+<country code> <remaining digits>".
    Numbers that pass validation without a country code are returned as
    their bare digits. Everything else yields an empty string.

    Args:
        col: Column containing phone number

    Returns:
        Column with formatted international number
    """
    country_code = extract_country_code(col)
    digits = extract_digits(col)

    has_country_code = country_code != ""
    is_international = is_valid_international(col)

    # Drop the country code from the front of the digit string when present.
    # The start position is itself a Column, so Column.substr is used rather
    # than F.substring: F.substring only accepts Column positions from
    # PySpark 4.0 onward, while Column.substr takes a (Column, Column) pair
    # on earlier releases as well.
    cc_length = F.length(country_code)
    remaining_digits = F.when(
        has_country_code & (cc_length > 0) & digits.startswith(country_code),
        digits.substr(cc_length + 1, F.lit(999)),
    ).otherwise(digits)

    return (
        F.when(
            is_international & has_country_code,
            F.concat(F.lit("+"), country_code, F.lit(" "), remaining_digits),
        )
        .when(is_international, digits)
        .otherwise(F.lit(""))
    )
|
|
650
684
|
|
|
651
685
|
|
|
652
|
-
@
|
|
686
|
+
@phone_numbers.register()
def format_e164(col: Column) -> Column:
    """
    Format phone number in E.164 format (+CCAAANNNNNNN) with default country code 1.

    Branches, evaluated in order for numbers that pass is_valid_phone_numbers:
      * 10 digits and valid NANP        -> "+1" followed by the digits
      * 11 digits starting with "1" and
        valid NANP                      -> "+" followed by the digits
      * non-empty country code and
        valid international             -> "+" followed by the digits
    Anything else yields an empty string.

    Args:
        col: Column containing phone number

    Returns:
        Column with E.164 formatted number
    """
    digits = extract_digits(col)
    country_code = extract_country_code(col)
    digit_count = F.length(digits)

    # Check if it's a valid NANP number first
    is_nanp = is_valid_nanp(col)

    e164 = (
        # Bare 10-digit NANP number: prepend the default country code "1"
        F.when(
            (digit_count == 10) & is_nanp,
            F.concat(F.lit("+"), F.lit("1"), digits),
        )
        .when(
            (digit_count == 11) & digits.startswith("1") & is_nanp,
            F.concat(F.lit("+"), digits),
        )
        .when(
            (country_code != "") & is_valid_international(col),
            F.concat(F.lit("+"), digits),  # digits already includes country code
        )
        .otherwise(F.lit(""))
    )

    # Build E.164 format - only for valid phones
    return F.when(is_valid_phone_numbers(col), e164).otherwise(F.lit(""))
|
|
689
724
|
|
|
690
725
|
|
|
@@ -693,127 +728,142 @@ def format_e164(col: Column) -> Column:
|
|
|
693
728
|
# ============================================================================
|
|
694
729
|
|
|
695
730
|
|
|
696
|
-
@
|
|
697
|
-
def
|
|
731
|
+
@phone_numbers.register()
def standardize_phone_numbers(col: Column) -> Column:
    """
    Standardize phone number with cleaning and NANP formatting.

    The input is passed through convert_letters_to_numbers, its extension is
    split off, and the remaining digits are formatted as XXX-XXX-XXXX:
      * 10 digits                     -> formatted as-is
      * 11 digits starting with "1"   -> leading "1" dropped, then formatted
    Any other digit string yields an empty string. A non-empty extension is
    appended as " ext. <extension>" when a formatted number was produced.

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in NANP format
    """
    # Clean and convert letters in a simpler way
    cleaned = convert_letters_to_numbers(col)

    # Extract extension first
    extension = extract_extension(cleaned)
    phone_no_ext = remove_extension(cleaned)

    digits = extract_digits(phone_no_ext)
    digit_count = F.length(digits)

    result = (
        F.when(
            digit_count == 10,
            F.concat(
                F.substring(digits, 1, 3),
                F.lit("-"),
                F.substring(digits, 4, 3),
                F.lit("-"),
                F.substring(digits, 7, 4),
            ),
        )
        .when(
            # Only strip the first digit when it really is the NANP country
            # code; otherwise an arbitrary leading digit would be discarded
            # and a different number returned.
            (digit_count == 11) & digits.startswith("1"),
            F.concat(
                F.substring(digits, 2, 3),
                F.lit("-"),
                F.substring(digits, 5, 3),
                F.lit("-"),
                F.substring(digits, 8, 4),
            ),
        )
        .otherwise(F.lit(""))
    )

    # Add extension back if present
    final_result = F.when(
        (extension != "") & (result != ""),
        F.concat(result, F.lit(" ext. "), extension),
    ).otherwise(result)

    return final_result
|
|
741
783
|
|
|
742
784
|
|
|
743
|
-
@
|
|
744
|
-
def
|
|
785
|
+
@phone_numbers.register()
def standardize_phone_numbers_e164(col: Column) -> Column:
    """
    Clean a phone number and render it in E.164 format.

    The input is passed through convert_letters_to_numbers and then through
    format_e164.

    Args:
        col: Column containing phone number

    Returns:
        Column with standardized phone number in E.164 format, or an empty
        string when the cleaned value does not pass is_valid_phone_numbers
    """
    normalized = convert_letters_to_numbers(col)
    passes_validation = is_valid_phone_numbers(normalized)

    # Invalid numbers collapse to "" rather than a partial E.164 string
    return F.when(passes_validation, format_e164(normalized)).otherwise(F.lit(""))
|
|
762
804
|
|
|
763
805
|
|
|
764
|
-
@
|
|
765
|
-
def
|
|
806
|
+
@phone_numbers.register()
def standardize_phone_numbers_digits(col: Column) -> Column:
    """
    Clean a phone number and reduce it to its digits.

    The input is passed through convert_letters_to_numbers and then through
    extract_digits.

    Args:
        col: Column containing phone number

    Returns:
        Column with digits only, or an empty string when the cleaned value
        does not pass is_valid_phone_numbers
    """
    normalized = convert_letters_to_numbers(col)
    digits_only = extract_digits(normalized)
    passes_validation = is_valid_phone_numbers(normalized)

    # Invalid numbers collapse to "" instead of returning stray digits
    return F.when(passes_validation, digits_only).otherwise(F.lit(""))
|
|
783
825
|
|
|
784
826
|
|
|
785
|
-
@
|
|
786
|
-
def
|
|
827
|
+
@phone_numbers.register()
def clean_phone_numbers(col: Column) -> Column:
    """
    Clean and validate phone number, returning null for invalid numbers.

    The input is passed through convert_letters_to_numbers and reduced to
    digits, which are then formatted as XXX-XXX-XXXX:
      * 10 digits                     -> formatted as-is
      * 11 digits starting with "1"   -> leading "1" dropped, then formatted
    Any other digit string yields null.

    Args:
        col: Column containing phone number

    Returns:
        Column with cleaned phone number or null
    """
    # Simple implementation to avoid deep nesting
    cleaned = convert_letters_to_numbers(col)
    digits = extract_digits(cleaned)
    digit_count = F.length(digits)

    result = (
        F.when(
            digit_count == 10,
            F.concat(
                F.substring(digits, 1, 3),
                F.lit("-"),
                F.substring(digits, 4, 3),
                F.lit("-"),
                F.substring(digits, 7, 4),
            ),
        )
        .when(
            # Only strip the first digit when it really is the NANP country
            # code; otherwise an arbitrary leading digit would be discarded
            # and a different number returned.
            (digit_count == 11) & digits.startswith("1"),
            F.concat(
                F.substring(digits, 2, 3),
                F.lit("-"),
                F.substring(digits, 5, 3),
                F.lit("-"),
                F.substring(digits, 8, 4),
            ),
        )
        .otherwise(F.lit(None))
    )

    return result
|
|
818
868
|
|
|
819
869
|
|
|
@@ -822,14 +872,14 @@ def clean_phone(col: Column) -> Column:
|
|
|
822
872
|
# ============================================================================
|
|
823
873
|
|
|
824
874
|
|
|
825
|
-
@
|
|
826
|
-
def
|
|
875
|
+
@phone_numbers.register()
|
|
876
|
+
def get_phone_numbers_type(col: Column) -> Column:
|
|
827
877
|
"""
|
|
828
878
|
Get phone number type (toll-free, premium, standard, international).
|
|
829
|
-
|
|
879
|
+
|
|
830
880
|
Args:
|
|
831
|
-
col: Column containing
|
|
832
|
-
|
|
881
|
+
col: Column containing phone number
|
|
882
|
+
|
|
833
883
|
Returns:
|
|
834
884
|
Column with phone type
|
|
835
885
|
"""
|
|
@@ -842,55 +892,55 @@ def get_phone_type(col: Column) -> Column:
|
|
|
842
892
|
)
|
|
843
893
|
|
|
844
894
|
|
|
845
|
-
@
|
|
895
|
+
@phone_numbers.register()
def get_region_from_area_code(col: Column) -> Column:
    """
    Map a phone number's area code to a region label.

    Only a handful of area codes are covered; this is a simplified example
    and a real implementation would use a lookup table.

    Args:
        col: Column containing phone number

    Returns:
        Column with region or empty string
    """
    area_code = extract_area_code(col)

    # Checked in this order; the first matching entry wins
    city_by_code = (
        ("212", "New York, NY"),
        ("213", "Los Angeles, CA"),
        ("312", "Chicago, IL"),
        ("415", "San Francisco, CA"),
        ("202", "Washington, DC"),
    )
    toll_free_codes = ["800", "888", "877", "866", "855", "844", "833"]

    first_code, first_city = city_by_code[0]
    region = F.when(area_code == first_code, F.lit(first_city))
    for code, city in city_by_code[1:]:
        region = region.when(area_code == code, F.lit(city))

    region = region.when(area_code.isin(toll_free_codes), F.lit("Toll-Free"))
    region = region.when(area_code == "900", F.lit("Premium"))

    # Unknown area codes map to an empty string
    return region.otherwise(F.lit(""))
|
|
869
923
|
|
|
870
924
|
|
|
871
|
-
@
|
|
872
|
-
def
|
|
925
|
+
@phone_numbers.register()
def mask_phone_numbers(col: Column) -> Column:
    """
    Mask a phone number for privacy, keeping the subscriber part
    (e.g., ***-***-1234).

    Null and empty inputs yield null. Values that do not pass is_valid_nanp
    are returned unchanged, i.e. unmasked.

    Args:
        col: Column containing phone number

    Returns:
        Column with masked phone number
    """
    subscriber_part = extract_subscriber(col)

    # Area code and exchange are replaced by asterisks
    masked_nanp = F.concat(
        F.lit("***"),
        F.lit("-"),
        F.lit("***"),
        F.lit("-"),
        subscriber_part,
    )
    masked_or_original = F.when(is_valid_nanp(col), masked_nanp).otherwise(col)

    is_missing = col.isNull() | (col == "")
    return F.when(is_missing, F.lit(None)).otherwise(masked_or_original)
|
|
895
945
|
|
|
896
946
|
|
|
@@ -899,43 +949,43 @@ def mask_phone(col: Column) -> Column:
|
|
|
899
949
|
# ============================================================================
|
|
900
950
|
|
|
901
951
|
|
|
902
|
-
@
|
|
903
|
-
def
|
|
952
|
+
@phone_numbers.register()
def filter_valid_phone_numbers_numbers(col: Column) -> Column:
    """
    Return phone number only if valid, otherwise return null.

    Args:
        col: Column containing phone number

    Returns:
        Column with valid phone or null
    """
    passes_validation = is_valid_phone_numbers(col)
    null_value = F.lit(None)

    # Values that pass validation are kept unchanged
    return F.when(passes_validation, col).otherwise(null_value)
|
|
914
964
|
|
|
915
965
|
|
|
916
|
-
@
|
|
917
|
-
def
|
|
966
|
+
@phone_numbers.register()
def filter_nanp_phone_numbers_numbers(col: Column) -> Column:
    """
    Return phone number only if valid NANP, otherwise return null.

    Args:
        col: Column containing phone number

    Returns:
        Column with NANP phone or null
    """
    passes_nanp = is_valid_nanp(col)
    null_value = F.lit(None)

    # Values that pass the NANP check are kept unchanged
    return F.when(passes_nanp, col).otherwise(null_value)
|
|
928
978
|
|
|
929
979
|
|
|
930
|
-
@
|
|
931
|
-
def
|
|
980
|
+
@phone_numbers.register()
def filter_toll_free_phone_numbers_numbers(col: Column) -> Column:
    """
    Keep a phone number only when it is toll-free; otherwise return null.

    Args:
        col: Column containing phone number

    Returns:
        Column with toll-free phone or null
    """
    toll_free = is_toll_free(col)
    null_value = F.lit(None)

    # Values that pass the toll-free check are kept unchanged
    return F.when(toll_free, col).otherwise(null_value)
|