datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datacompose might be problematic.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py
@@ -0,0 +1,1967 @@

import re
from typing import TYPE_CHECKING, Dict, List, Optional

if TYPE_CHECKING:
    # For type checkers only - these imports are always available during type checking
    from pyspark.sql import Column
    from pyspark.sql import functions as F
else:
    # At runtime, handle missing PySpark gracefully
    try:
        from pyspark.sql import Column
        from pyspark.sql import functions as F
    except ImportError:
        # PySpark is not installed - functions will fail at runtime if called
        pass

try:
    # Try local utils import first (for generated code)
    from utils.primitives import PrimitiveRegistry
except ImportError:
    # Fall back to installed datacompose package
    from datacompose.operators.primitives import PrimitiveRegistry

addresses = PrimitiveRegistry("addresses")
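# Illustrative usage sketch (assumes a SparkSession and a DataFrame `df` with
# an "address" column; the `addresses.<name>` access style is the one shown in
# the docstrings below):
#
#   from pyspark.sql import SparkSession, functions as F
#   spark = SparkSession.builder.getOrCreate()
#   df = spark.createDataFrame([("123 N Main St, Springfield, IL 62704",)], ["address"])
#   df.select(addresses.extract_street_number(F.col("address"))).show()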

# US State mappings - comprehensive list including territories
# These are mutable to allow extension
US_STATES = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY",
    # US Territories and DC
    "DISTRICT OF COLUMBIA": "DC",
    "PUERTO RICO": "PR",
    "VIRGIN ISLANDS": "VI",
    "GUAM": "GU",
    "AMERICAN SAMOA": "AS",
    "NORTHERN MARIANA ISLANDS": "MP",
}

# Reverse mapping: abbreviation to full name
STATE_ABBREV = {
    "AL": "ALABAMA",
    "AK": "ALASKA",
    "AZ": "ARIZONA",
    "AR": "ARKANSAS",
    "CA": "CALIFORNIA",
    "CO": "COLORADO",
    "CT": "CONNECTICUT",
    "DE": "DELAWARE",
    "FL": "FLORIDA",
    "GA": "GEORGIA",
    "HI": "HAWAII",
    "ID": "IDAHO",
    "IL": "ILLINOIS",
    "IN": "INDIANA",
    "IA": "IOWA",
    "KS": "KANSAS",
    "KY": "KENTUCKY",
    "LA": "LOUISIANA",
    "ME": "MAINE",
    "MD": "MARYLAND",
    "MA": "MASSACHUSETTS",
    "MI": "MICHIGAN",
    "MN": "MINNESOTA",
    "MS": "MISSISSIPPI",
    "MO": "MISSOURI",
    "MT": "MONTANA",
    "NE": "NEBRASKA",
    "NV": "NEVADA",
    "NH": "NEW HAMPSHIRE",
    "NJ": "NEW JERSEY",
    "NM": "NEW MEXICO",
    "NY": "NEW YORK",
    "NC": "NORTH CAROLINA",
    "ND": "NORTH DAKOTA",
    "OH": "OHIO",
    "OK": "OKLAHOMA",
    "OR": "OREGON",
    "PA": "PENNSYLVANIA",
    "RI": "RHODE ISLAND",
    "SC": "SOUTH CAROLINA",
    "SD": "SOUTH DAKOTA",
    "TN": "TENNESSEE",
    "TX": "TEXAS",
    "UT": "UTAH",
    "VT": "VERMONT",
    "VA": "VIRGINIA",
    "WA": "WASHINGTON",
    "WV": "WEST VIRGINIA",
    "WI": "WISCONSIN",
    "WY": "WYOMING",
    # US Territories and DC
    "DC": "DISTRICT OF COLUMBIA",
    "PR": "PUERTO RICO",
    "VI": "VIRGIN ISLANDS",
    "GU": "GUAM",
    "AS": "AMERICAN SAMOA",
    "MP": "NORTHERN MARIANA ISLANDS",
}

# Custom cities that users want to recognize
# Users can add to this list for better city extraction
CUSTOM_CITIES = set()

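# Invariant sketch (illustrative): the two mappings above are maintained as
# inverses of each other, which a quick check can confirm:
#
#   assert all(STATE_ABBREV[abbr] == name for name, abbr in US_STATES.items())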
def add_custom_state(full_name: str, abbreviation: str) -> None:
    """Add a custom state or region to the state mappings.

    This allows extending the address parser to handle non-US states/provinces.

    Args:
        full_name: Full name of the state/province (e.g., "ONTARIO")
        abbreviation: Two-letter abbreviation (e.g., "ON")

    Example:
        # Add Canadian provinces
        add_custom_state("ONTARIO", "ON")
        add_custom_state("QUEBEC", "QC")
        add_custom_state("BRITISH COLUMBIA", "BC")
    """
    full_name_upper = full_name.upper()
    abbrev_upper = abbreviation.upper()

    US_STATES[full_name_upper] = abbrev_upper
    STATE_ABBREV[abbrev_upper] = full_name_upper


def add_custom_city(city_name: str) -> None:
    """Add a custom city name to improve city extraction.

    This is useful for cities that might be ambiguous or hard to extract.

    Args:
        city_name: Name of the city to add

    Example:
        # Add cities that might be confused with other words
        add_custom_city("Reading")  # Could be confused with the verb
        add_custom_city("Mobile")   # Could be confused with the adjective
    """
    CUSTOM_CITIES.add(city_name.upper())


def remove_custom_state(identifier: str) -> None:
    """Remove a custom state from the mappings.

    Args:
        identifier: Either the full name or abbreviation of the state to remove
    """
    identifier_upper = identifier.upper()

    # Check if it's an abbreviation
    if identifier_upper in STATE_ABBREV:
        full_name = STATE_ABBREV[identifier_upper]
        del STATE_ABBREV[identifier_upper]
        if full_name in US_STATES:
            del US_STATES[full_name]
    # Check if it's a full name
    elif identifier_upper in US_STATES:
        abbrev = US_STATES[identifier_upper]
        del US_STATES[identifier_upper]
        if abbrev in STATE_ABBREV:
            del STATE_ABBREV[abbrev]


def remove_custom_city(city_name: str) -> None:
    """Remove a custom city from the set.

    Args:
        city_name: Name of the city to remove
    """
    CUSTOM_CITIES.discard(city_name.upper())

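# Illustrative round trip with the helpers above (module-level mutation, so it
# must happen before the column expressions are built):
#
#   add_custom_state("ONTARIO", "ON")   # extract_state can now map Ontario -> ON
#   add_custom_city("Reading")          # extract_city will match it literally
#   remove_custom_state("ON")           # restores the US-only mappings
#   remove_custom_city("Reading")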
@addresses.register()
def extract_street_number(col: Column) -> Column:
    """Extract street/house number from address.

    Extracts the numeric portion at the beginning of an address.
    Handles various formats: 123, 123A, 123-125, etc.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted street number or empty string

    Example:
        df.select(addresses.extract_street_number(F.col("address")))
        # "123 Main St" -> "123"
        # "123A Oak Ave" -> "123A"
        # "123-125 Elm St" -> "123-125"
    """
    # Pattern to match house/building numbers at the start (after optional whitespace)
    # Matches: 123, 123A, 123-125, 123½, etc.
    pattern = r"^\s*(\d+[\w\-/]*)\b"
    result = F.regexp_extract(col, pattern, 1)
    # Return empty string for null results
    return F.when(result.isNull() | (col.isNull()), F.lit("")).otherwise(result)


@addresses.register()
def extract_street_prefix(col: Column) -> Column:
    """Extract directional prefix from street address.

    Extracts directional prefixes like N, S, E, W, NE, NW, SE, SW.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted street prefix or empty string

    Example:
        df.select(addresses.extract_street_prefix(F.col("address")))
        # "123 N Main St" -> "N"
        # "456 South Oak Ave" -> "South"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Remove house number first (after trimming leading whitespace)
    without_number = F.regexp_replace(col, r"^\s*\d+[\w\-/]*\s*", "")

    # Pattern for directional prefixes - case insensitive
    # Capture the prefix including optional period
    prefix_pattern = r"^(?i)(North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\b"

    result = F.regexp_extract(without_number, prefix_pattern, 1)
    return F.when(result.isNull(), F.lit("")).otherwise(result)


@addresses.register()
def extract_street_name(col: Column) -> Column:
    """Extract street name from address.

    Extracts the main street name, excluding number, prefix, and suffix.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted street name or empty string

    Example:
        df.select(addresses.extract_street_name(F.col("address")))
        # "123 N Main Street" -> "Main"
        # "456 Oak Avenue" -> "Oak"
        # "789 Martin Luther King Jr Blvd" -> "Martin Luther King Jr"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Common street suffixes to identify end of street name
    # Using abbreviated forms from the YAML config
    suffixes = [
        "Street",
        "St",
        "Avenue",
        "Ave",
        "Road",
        "Rd",
        "Boulevard",
        "Blvd",
        "Drive",
        "Dr",
        "Lane",
        "Ln",
        "Court",
        "Ct",
        "Place",
        "Pl",
        "Circle",
        "Cir",
        "Trail",
        "Trl",
        "Parkway",
        "Pkwy",
        "Highway",
        "Hwy",
        "Way",
        "Terrace",
        "Ter",
        "Plaza",
        "Plz",
        "Square",
        "Sq",
        "Loop",
        "Crescent",
        "Cres",
    ]

    # Remove house number only if followed by more text (not just a suffix)
    # This preserves numbered streets like "5th Avenue" while removing "123 Main St"
    # Check if we have a pattern like "number word suffix" vs just "number suffix"
    # Trim leading whitespace first
    trimmed_col = F.trim(col)
    without_number = F.when(
        # If it's just a numbered street (e.g., "5th Avenue", "1st Street")
        trimmed_col.rlike(r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"),
        trimmed_col  # Keep as is - it's a numbered street name
    ).otherwise(
        # Otherwise remove the house number
        F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
    )

    # Remove directional prefix - case insensitive
    # Include full directional words and abbreviations
    prefix_pattern = (
        r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
    )
    without_prefix = F.regexp_replace(without_number, prefix_pattern, "")

    # Extract everything before the street suffix - case insensitive
    suffix_pattern = r"^(?i)(.+?)\s+(?:" + "|".join(suffixes) + r")\b"
    street_name = F.regexp_extract(without_prefix, suffix_pattern, 1)

    # If no suffix found, try to extract before comma or end
    street_name = F.when(street_name != "", street_name).otherwise(
        F.regexp_extract(without_prefix, r"^([^,]+?)(?:\s*,|\s*$)", 1)
    )

    return F.trim(street_name)

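# Combined decomposition sketch using the three extractors above (assumes `df`
# with an "address" column):
#
#   df.select(
#       addresses.extract_street_number(F.col("address")).alias("number"),
#       addresses.extract_street_prefix(F.col("address")).alias("prefix"),
#       addresses.extract_street_name(F.col("address")).alias("name"),
#   )
#   # "123 N Main Street" -> number="123", prefix="N", name="Main"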
@addresses.register()
def extract_street_suffix(col: Column) -> Column:
    """Extract street type/suffix from address.

    Extracts street type like Street, Avenue, Road, Boulevard, etc.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted street suffix or empty string

    Example:
        df.select(addresses.extract_street_suffix(F.col("address")))
        # "123 Main Street" -> "Street"
        # "456 Oak Ave" -> "Ave"
        # "789 Elm Boulevard" -> "Boulevard"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Comprehensive list of street suffixes (both full and abbreviated)
    suffixes = [
        "Street",
        "St",
        "Avenue",
        "Ave",
        "Road",
        "Rd",
        "Boulevard",
        "Blvd",
        "Drive",
        "Dr",
        "Lane",
        "Ln",
        "Court",
        "Ct",
        "Place",
        "Pl",
        "Circle",
        "Cir",
        "Trail",
        "Trl",
        "Parkway",
        "Pkwy",
        "Highway",
        "Hwy",
        "Way",
        "Terrace",
        "Ter",
        "Plaza",
        "Plz",
        "Square",
        "Sq",
        "Loop",
        "Crescent",
        "Cres",
        "Alley",
        "Aly",
    ]

    # Build pattern to match the LAST suffix in the string
    # This handles cases like "St. James Place" where we want "Place" not "St"
    suffix_pattern = r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"

    # Extract the last matching suffix - case insensitive
    suffix_pattern_ci = r"(?i)" + suffix_pattern
    result = F.regexp_extract(col, suffix_pattern_ci, 1)
    return F.when(result.isNull(), F.lit("")).otherwise(result)


@addresses.register()
def extract_full_street(col: Column) -> Column:
    """Extract complete street address (number + prefix + name + suffix).

    Extracts everything before apartment/suite and city/state/zip.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted street address or empty string

    Example:
        df.select(addresses.extract_full_street(F.col("address")))
        # "123 N Main St, Apt 4B, New York, NY" -> "123 N Main St"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Remove apartment/suite information - case insensitive
    apt_pattern = r"\s*,?\s*(?i)(?:Apt|Apartment|Unit|Suite|Ste|#)\s*[\w\-]+\b"
    without_apt = F.regexp_replace(col, apt_pattern, "")

    # Extract everything before the first comma (usually street part)
    street = F.regexp_extract(without_apt, r"^([^,]+)", 1)

    # If no comma, try to extract before city/state pattern
    # Look for pattern like "Street City" or "Street State ZIP"
    street = F.when(
        street == "",
        F.regexp_extract(
            col, r"^(.+?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s*,?\s*[A-Z]{2}\s+\d{5}", 1
        ),
    ).otherwise(street)

    return F.trim(street)

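# Design note: the negative lookahead in extract_street_suffix keeps only the
# LAST suffix token, so a street named after a saint parses as intended:
#
#   df.select(addresses.extract_street_suffix(F.lit("100 St. James Place")))
#   # -> "Place" (not "St")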
@addresses.register()
def standardize_street_prefix(
    col: Column, custom_mappings: Optional[Dict[str, str]] = None
) -> Column:
    """Standardize street directional prefixes to abbreviated form.

    Converts all variations to standard USPS abbreviations:
    North/N/N. → N, South/S/S. → S, etc.

    Args:
        col: Column containing street prefix
        custom_mappings: Optional dict of custom prefix mappings (case insensitive)

    Returns:
        Column with standardized prefix (always abbreviated per USPS standards)

    Example:
        df.select(addresses.standardize_street_prefix(F.col("prefix")))
        # "North" -> "N"
        # "south" -> "S"
        # "NorthEast" -> "NE"
    """
    # Mapping based on YAML config prefixes (lines 806-814)
    prefix_map = {
        "NORTH": "N",
        "N.": "N",
        "N": "N",
        "SOUTH": "S",
        "S.": "S",
        "S": "S",
        "EAST": "E",
        "E.": "E",
        "E": "E",
        "WEST": "W",
        "W.": "W",
        "W": "W",
        "NORTHEAST": "NE",
        "NE.": "NE",
        "NE": "NE",
        "NORTHWEST": "NW",
        "NW.": "NW",
        "NW": "NW",
        "SOUTHEAST": "SE",
        "SE.": "SE",
        "SE": "SE",
        "SOUTHWEST": "SW",
        "SW.": "SW",
        "SW": "SW",
    }

    # Convert to uppercase for matching
    upper_col = F.upper(F.trim(col))

    # Apply custom mappings first if provided
    result = col
    if custom_mappings:
        for original, standard in custom_mappings.items():
            result = F.when(
                upper_col == F.upper(F.lit(original)), F.lit(standard)
            ).otherwise(result)
        return result

    # Apply default mapping
    result = F.lit("")
    for original, standard in prefix_map.items():
        result = F.when(upper_col == original, F.lit(standard)).otherwise(result)

    return result

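# Behavior note (as written above): passing custom_mappings makes
# standardize_street_prefix return after applying only those mappings,
# bypassing the built-in USPS table entirely:
#
#   addresses.standardize_street_prefix(F.col("p"), custom_mappings={"NORTE": "N"})
#   # "NORTE" -> "N"; other values pass through unchanged in this mode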
@addresses.register()
def standardize_street_suffix(
    col: Column, custom_mappings: Optional[Dict[str, str]] = None
) -> Column:
    """Standardize street type/suffix to USPS abbreviated form.

    Converts all variations to standard USPS abbreviations per the config:
    Street/St/St. → St, Avenue/Ave/Av → Ave, Boulevard → Blvd, etc.

    Args:
        col: Column containing street suffix
        custom_mappings: Optional dict of custom suffix mappings (case insensitive)

    Returns:
        Column with standardized suffix (always abbreviated per USPS standards)

    Example:
        df.select(addresses.standardize_street_suffix(F.col("suffix")))
        # "Street" -> "St"
        # "avenue" -> "Ave"
        # "BOULEVARD" -> "Blvd"
    """
    # Based on YAML config suffixes mapping (lines 824-965)
    # This is a subset of the most common ones
    suffix_map = {
        "STREET": "St",
        "ST": "St",
        "ST.": "St",
        "STR": "St",
        "AVENUE": "Ave",
        "AVE": "Ave",
        "AVE.": "Ave",
        "AV": "Ave",
        "AVEN": "Ave",
        "ROAD": "Rd",
        "RD": "Rd",
        "RD.": "Rd",
        "BOULEVARD": "Blvd",
        "BLVD": "Blvd",
        "BLVD.": "Blvd",
        "BOUL": "Blvd",
        "DRIVE": "Dr",
        "DR": "Dr",
        "DR.": "Dr",
        "DRV": "Dr",
        "DRIV": "Dr",
        "LANE": "Ln",
        "LN": "Ln",
        "LN.": "Ln",
        "COURT": "Ct",
        "CT": "Ct",
        "CT.": "Ct",
        "CRT": "Ct",
        "PLACE": "Pl",
        "PL": "Pl",
        "PL.": "Pl",
        "PLC": "Pl",
        "CIRCLE": "Cir",
        "CIR": "Cir",
        "CIR.": "Cir",
        "CIRC": "Cir",
        "TRAIL": "Trl",
        "TRL": "Trl",
        "TRL.": "Trl",
        "TR": "Trl",
        "PARKWAY": "Pkwy",
        "PKWY": "Pkwy",
        "PKY": "Pkwy",
        "PWAY": "Pkwy",
        "HIGHWAY": "Hwy",
        "HWY": "Hwy",
        "HWY.": "Hwy",
        "HIWAY": "Hwy",
        "WAY": "Way",
        "WY": "Way",
        "TERRACE": "Ter",
        "TER": "Ter",
        "TER.": "Ter",
        "TERR": "Ter",
        "PLAZA": "Plz",
        "PLZ": "Plz",
        "PLZ.": "Plz",
        "PLZA": "Plz",
        "SQUARE": "Sq",
        "SQ": "Sq",
        "SQ.": "Sq",
        "SQR": "Sq",
        "LOOP": "Loop",
        "LP": "Loop",
        "CRESCENT": "Cres",
        "CRES": "Cres",
        "CRES.": "Cres",
        "CRSC": "Cres",
        "ALLEY": "Aly",
        "ALY": "Aly",
        "ALY.": "Aly",
        "ALLY": "Aly",
    }

    # Handle nulls - return empty string for null input
    if col is None:
        return F.lit("")
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Convert to uppercase for matching
    upper_col = F.upper(F.trim(col))

    # Start with the original column
    result = col

    # Apply custom mappings first if provided (they take precedence)
    if custom_mappings:
        for original, standard in custom_mappings.items():
            result = F.when(
                upper_col == F.upper(F.lit(original)), F.lit(standard)
            ).otherwise(result)

    # Then apply standard mappings for anything not already mapped
    # Need to check if result has changed to avoid overwriting custom mappings
    for original, standard in suffix_map.items():
        # Only apply if not already mapped by custom mappings
        if custom_mappings and original.upper() in [k.upper() for k in custom_mappings.keys()]:
            continue
        result = F.when(upper_col == original, F.lit(standard)).otherwise(result)

    return result

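# Contrast with standardize_street_prefix: here custom mappings take
# precedence but the built-in table still applies to everything else:
#
#   addresses.standardize_street_suffix(F.col("s"), custom_mappings={"STRASSE": "St"})
#   # "STRASSE" -> "St" (custom), "AVENUE" -> "Ave" (built-in)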
@addresses.register()
def extract_apartment_number(col: Column) -> Column:
    """Extract apartment/unit number from address.

    Extracts apartment, suite, unit, or room numbers including:
    Apt 5B, Suite 200, Unit 12, #4A, Rm 101, etc.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted apartment/unit number or empty string

    Example:
        df.select(addresses.extract_apartment_number(F.col("address")))
        # "123 Main St Apt 5B" -> "5B"
        # "456 Oak Ave Suite 200" -> "200"
        # "789 Elm St #4A" -> "4A"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Patterns for different unit types - case insensitive
    # Matches: Apt, Apartment, Suite, Ste, Unit, Room, Rm, # followed by alphanumeric
    # Updated to handle fractions (1/2, 3½), decimals (12.5), parentheses and other special cases
    apt_pattern = r"(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*(\(?[A-Z0-9\-/½¼¾\.]+\)?)"

    result = F.regexp_extract(col, apt_pattern, 1)

    # If no unit type found, check for trailing numbers (e.g., "123 Main St 456")
    if_no_result = F.when(
        result == "", F.regexp_extract(col, r"\s+(\d+[A-Z]?)\s*$", 1)
    ).otherwise(result)

    return F.when(if_no_result.isNull(), F.lit("")).otherwise(if_no_result)


@addresses.register()
def extract_floor(col: Column) -> Column:
    """Extract floor number from address.

    Extracts floor information like:
    5th Floor, Floor 2, Fl 3, Level 4, etc.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted floor number or empty string

    Example:
        df.select(addresses.extract_floor(F.col("address")))
        # "123 Main St, 5th Floor" -> "5"
        # "456 Oak Ave, Floor 2" -> "2"
        # "789 Elm St, Level 3" -> "3"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern for floor information - case insensitive
    # Matches: 1st Floor, 2nd Floor, 3rd Floor, 4th-99th Floor, Floor 1, Fl. 2, Level 3
    # Updated to handle abbreviated forms like "31st Fl"
    floor_pattern = r"(?i)(?:(\d+)(?:st|nd|rd|th)?\s*(?:Floor|Fl\.?)|Floor\s*(\d+)|Fl\.?\s*(\d+)|Level\s*(\d+))"

    # Extract from any of the capture groups
    floor1 = F.regexp_extract(col, floor_pattern, 1)
    floor2 = F.regexp_extract(col, floor_pattern, 2)
    floor3 = F.regexp_extract(col, floor_pattern, 3)
    floor4 = F.regexp_extract(col, floor_pattern, 4)

    # Return the first non-empty match
    result = F.when(floor1 != "", floor1).otherwise(
        F.when(floor2 != "", floor2).otherwise(
            F.when(floor3 != "", floor3).otherwise(floor4)
        )
    )

    return F.when(result.isNull() | (result == ""), F.lit("")).otherwise(result)

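# Group-coalescing note: floor_pattern defines four alternative capture groups
# ("5th Floor", "Floor 2", "Fl 3", "Level 4"); regexp_extract returns "" for a
# group that did not participate in the match, so the when-chain above picks
# the first non-empty group:
#
#   addresses.extract_floor(F.lit("456 Oak Ave, 31st Fl"))  # -> "31"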
@addresses.register()
def extract_building(col: Column) -> Column:
    """Extract building name or identifier from address.

    Extracts building information like:
    Building A, Tower 2, Complex B, Block C, etc.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted building identifier or empty string

    Example:
        df.select(addresses.extract_building(F.col("address")))
        # "123 Main St, Building A" -> "A"
        # "456 Oak Ave, Tower 2" -> "2"
        # "789 Elm St, Complex North" -> "North"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern for building information - case insensitive
    # Matches: Building A, Bldg 2, Tower B, Complex 3, Block C, Wing D, Blg B
    # Updated to handle multi-word names but stop at commas or other building indicators
    building_pattern = r"(?i)(?:Building|Bldg\.?|Blg|Tower|Complex|Block|Wing)\s+([A-Z0-9]+(?:\s+[A-Z0-9]+)?)"

    # Stop capturing if we hit another building indicator (Floor, Suite, etc.)
    result_raw = F.regexp_extract(col, building_pattern, 1)

    # Clean up - remove anything after Floor, Suite, Apt, etc.
    result = F.regexp_replace(
        result_raw,
        r"(?i)\s+(?:Floor|Fl\.?|Suite|Ste\.?|Apt\.?|Apartment|Unit|Room|Rm\.?).*",
        "",
    )

    return F.when(result.isNull() | (result == ""), F.lit("")).otherwise(result)


@addresses.register()
def extract_unit_type(col: Column) -> Column:
    """Extract the type of unit (Apt, Suite, Unit, etc.) from address.

    Args:
        col: Column containing address text

    Returns:
        Column with unit type or empty string

    Example:
        df.select(addresses.extract_unit_type(F.col("address")))
        # "123 Main St Apt 5B" -> "Apt"
        # "456 Oak Ave Suite 200" -> "Suite"
        # "789 Elm St Unit 12" -> "Unit"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to extract unit type - case insensitive
    unit_type_pattern = r"(?i)(Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)"

    result = F.regexp_extract(col, unit_type_pattern, 1)

    # Clean up the result (remove periods, standardize case)
    result = F.when(
        result != "", F.initcap(F.regexp_replace(result, r"\.", ""))
    ).otherwise("")

    return F.when(result.isNull(), F.lit("")).otherwise(result)

@addresses.register()
def standardize_unit_type(
    col: Column, custom_mappings: Optional[Dict[str, str]] = None
) -> Column:
    """Standardize unit type to common abbreviations.

    Converts all variations to standard abbreviations:
    Apartment/Apt. → Apt, Suite → Ste, Room → Rm, etc.

    Args:
        col: Column containing unit type
        custom_mappings: Optional dict of custom unit type mappings

    Returns:
        Column with standardized unit type

    Example:
        df.select(addresses.standardize_unit_type(F.col("unit_type")))
        # "Apartment" -> "Apt"
        # "Suite" -> "Ste"
        # "Room" -> "Rm"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Standard mappings for unit types
    unit_map = {
        "APARTMENT": "Apt",
        "APT.": "Apt",
        "APT": "Apt",
        "SUITE": "Ste",
        "STE.": "Ste",
        "STE": "Ste",
        "UNIT": "Unit",
        "ROOM": "Rm",
        "RM.": "Rm",
        "RM": "Rm",
        "FLOOR": "Fl",
        "FL.": "Fl",
        "FL": "Fl",
        "BUILDING": "Bldg",
        "BLDG.": "Bldg",
        "BLDG": "Bldg",
        "#": "#",
        "NUMBER": "#",
        "NO.": "#",
        "NO": "#",
    }

    # Convert to uppercase for matching
    upper_col = F.upper(F.trim(col))

    # Apply custom mappings first if provided
    result = col
    if custom_mappings:
        for original, standard in custom_mappings.items():
            result = F.when(
                upper_col == F.upper(F.lit(original)), F.lit(standard)
            ).otherwise(result)

    # Then apply standard mappings for anything not custom mapped
    for original, standard in unit_map.items():
        result = F.when(upper_col == original, F.lit(standard)).otherwise(result)

    return result

@addresses.register()
def extract_secondary_address(col: Column) -> Column:
    """Extract complete secondary address information (unit type + number).

    Combines unit type and number into standard format:
    "Apt 5B", "Ste 200", "Unit 12", etc.

    Args:
        col: Column containing address text

    Returns:
        Column with complete secondary address or empty string

    Example:
        df.select(addresses.extract_secondary_address(F.col("address")))
        # "123 Main St Apt 5B" -> "Apt 5B"
        # "456 Oak Ave, Suite 200" -> "Suite 200"
        # "789 Elm St" -> ""
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to extract complete secondary address - case insensitive
    secondary_pattern = (
        r"(?i)((?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+)"
    )

    result = F.regexp_extract(col, secondary_pattern, 1)
    return F.when(result.isNull(), F.lit("")).otherwise(result)


@addresses.register()
def has_apartment(col: Column) -> Column:
    """Check if address contains apartment/unit information.

    Args:
        col: Column containing address text

    Returns:
        Column with boolean indicating presence of apartment/unit

    Example:
        df.select(addresses.has_apartment(F.col("address")))
        # "123 Main St Apt 5B" -> True
        # "456 Oak Ave" -> False
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Check for apartment/unit patterns
    apt_pattern = (
        r"(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+"
    )

    # Return boolean
    return F.when(F.regexp_extract(col, apt_pattern, 0) != "", F.lit(True)).otherwise(
        F.lit(False)
    )

@addresses.register()
def remove_secondary_address(col: Column) -> Column:
    """Remove apartment/suite/unit information from address.

    Removes secondary address components to get clean street address.

    Args:
        col: Column containing address text

    Returns:
        Column with secondary address removed

    Example:
        df.select(addresses.remove_secondary_address(F.col("address")))
        # "123 Main St Apt 5B" -> "123 Main St"
        # "456 Oak Ave, Suite 200" -> "456 Oak Ave"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to match secondary address components - case insensitive
    # Include optional comma and spaces before
    secondary_pattern = (
        r",?\s*(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+\b"
    )

    # Remove the pattern and clean up extra spaces
    result = F.regexp_replace(col, secondary_pattern, "")
    result = F.regexp_replace(result, r"\s+", " ")  # Clean multiple spaces
    result = F.trim(result)

    return result

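# Illustrative street/secondary split using the three primitives above:
#
#   df.select(
#       addresses.remove_secondary_address(F.col("address")).alias("street"),
#       addresses.extract_secondary_address(F.col("address")).alias("secondary"),
#       addresses.has_apartment(F.col("address")).alias("has_unit"),
#   )
#   # "123 Main St Apt 5B" -> street="123 Main St", secondary="Apt 5B", has_unit=True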
def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
    """Format unit type and number into standard secondary address.

    Note: This is a helper function, not registered with addresses primitive.
    Use it directly with two columns.

    Args:
        unit_type: Column containing unit type (Apt, Suite, etc.)
        unit_number: Column containing unit number (5B, 200, etc.)

    Returns:
        Column with formatted secondary address

    Example:
        from datacompose.transformers.text.clean_addresses.pyspark.pyspark_udf import format_secondary_address
        df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
        # -> "Apt 5B"
    """
    # Standardize the unit type first
    std_type = standardize_unit_type(unit_type)

    # Combine type and number, handling nulls
    result = F.when(
        (std_type.isNotNull() & (std_type != ""))
        & (unit_number.isNotNull() & (unit_number != "")),
        F.concat_ws(" ", std_type, unit_number),
    ).otherwise(F.lit(""))

    return result

@addresses.register()
def extract_zip_code(col: Column) -> Column:  # type: ignore
    """Extract US ZIP code (5-digit or ZIP+4 format) from text.

    Returns empty string for null/invalid inputs.
    """
    extracted = F.regexp_extract(col, r"\b(\d{5}(?:-\d{4})?)\b", 1)
    # Return empty string instead of null for consistency
    return F.when(extracted.isNull(), F.lit("")).otherwise(extracted)


@addresses.register()
def validate_zip_code(col: Column) -> Column:
    """Validate if a ZIP code is in correct US format.

    Validates:
    - 5-digit format (e.g., "12345")
    - ZIP+4 format (e.g., "12345-6789")
    - Not all zeros (except "00000" which is technically valid)
    - Within valid range (00001-99999 for base ZIP)

    Args:
        col (Column): Column containing ZIP codes to validate

    Returns:
        Column: Boolean column indicating if ZIP code is valid
    """
    # Check if the column matches valid ZIP code pattern
    is_valid_format = F.regexp_extract(col, r"^(\d{5}(?:-\d{4})?)$", 1) != ""

    # Additional validation: not empty/null
    is_not_empty = (col.isNotNull()) & (F.trim(col) != "")

    # Combined validation
    return is_valid_format & is_not_empty


@addresses.register()
def is_valid_zip_code(col: Column) -> "Column":
    """Alias for validate_zip_code for consistency.

    Args:
        col (Column): Column containing ZIP codes to validate

    Returns:
        Column: Boolean column indicating if ZIP code is valid
    """
    return validate_zip_code(col)

@addresses.register()
def standardize_zip_code(col: Column):
    """Standardize ZIP code format.

    - Removes extra spaces
    - Ensures proper dash placement for ZIP+4
    - Returns empty string for invalid formats

    Args:
        col (Column): Column containing ZIP codes to standardize

    Returns:
        Column: Standardized ZIP code or empty string if invalid
    """
    # First extract the ZIP code
    extracted = extract_zip_code(col)

    # Then validate it
    is_valid = validate_zip_code(extracted)

    # Return standardized version or empty string
    return F.when(is_valid, extracted).otherwise(F.lit(""))


@addresses.register()
def get_zip_code_type(col: Column):
    """Determine the type of ZIP code.

    Args:
        col (Column): Column containing ZIP codes

    Returns:
        Column: String column with values: "standard", "plus4", "invalid", or "empty"
    """
    # Check patterns
    is_standard = F.regexp_extract(col, r"^(\d{5})$", 1) != ""
    is_plus4 = F.regexp_extract(col, r"^(\d{5}-\d{4})$", 1) != ""
    is_empty = (col.isNull()) | (F.trim(col) == "")

    return (
        F.when(is_plus4, F.lit("plus4"))
        .when(is_standard, F.lit("standard"))
        .when(is_empty, F.lit("empty"))
        .otherwise(F.lit("invalid"))
    )


@addresses.register()
def split_zip_code(col: Column):
    """Split ZIP+4 code into base and extension components.

    Args:
        col (Column): Column containing ZIP codes

    Returns:
        Column: Struct with 'base' and 'extension' fields
    """
    # Extract base ZIP (first 5 digits)
    base_zip = F.regexp_extract(col, r"^(\d{5})", 1)

    # Extract extension (4 digits after dash, if present)
    extension = F.regexp_extract(col, r"^\d{5}-(\d{4})$", 1)

    # Return as struct
    return F.struct(
        base_zip.alias("base"),
        F.when(extension != "", extension).otherwise(F.lit(None)).alias("extension"),
    )

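# Struct-access sketch for split_zip_code (field names per the struct above):
#
#   parts = addresses.split_zip_code(F.col("zip"))
#   df.select(parts.getField("base").alias("zip5"),
#             parts.getField("extension").alias("plus4"))
#   # "12345-6789" -> zip5="12345", plus4="6789"; "12345" -> plus4=null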
@addresses.register()
def extract_city(col: Column, custom_cities: Optional[List] = None) -> Column:
    """Extract city name from US address text.

    Extracts city by finding text before state abbreviation or ZIP code.
    Handles various formats including comma-separated and multi-word cities.

    Args:
        col: Column containing address text
        custom_cities: Optional list of custom city names to recognize (case-insensitive)

    Returns:
        Column with extracted city name or empty string if not found

    Example:
        # Direct usage
        df.select(addresses.extract_city(F.col("address")))

        # With custom cities
        df.select(addresses.extract_city(F.col("address"), custom_cities=["Reading", "Mobile"]))

        # Pre-configured
        extract_city_custom = addresses.extract_city(custom_cities=["Reading", "Mobile"])
        df.select(extract_city_custom(F.col("address")))
    """
    # For city extraction, match both abbreviations and full state names
    # But prioritize abbreviations to avoid false matches
    state_abbrevs_only = list(STATE_ABBREV.keys())
    # Add common full state names for city extraction
    common_full_states = [
        "California",
        "New York",
        "Texas",
        "Florida",
        "Pennsylvania",
        "Illinois",
        "Ohio",
        "Georgia",
        "North Carolina",
        "Michigan",
        "New Jersey",
        "Virginia",
        "Washington",
        "Massachusetts",
        "Arizona",
        "Tennessee",
        "Indiana",
        "Missouri",
        "Maryland",
        "Wisconsin",
        "Colorado",
        "Minnesota",
        "South Carolina",
        "Alabama",
        "Louisiana",
        "Kentucky",
        "Oregon",
        "Oklahoma",
        "Connecticut",
        "Utah",
        "Iowa",
        "Nevada",
        "Arkansas",
        "Mississippi",
        "Kansas",
        "New Mexico",
        "Nebraska",
        "Idaho",
        "West Virginia",
        "Hawaii",
        "New Hampshire",
        "Maine",
        "Montana",
        "Rhode Island",
        "Delaware",
        "South Dakota",
        "North Dakota",
        "Alaska",
        "Vermont",
        "Wyoming",
        "District of Columbia",
        "Puerto Rico",
    ]

    # Combine abbreviations and full names for pattern
    all_state_patterns = state_abbrevs_only + [s.upper() for s in common_full_states]

    # Check for custom cities if provided
    custom_city_result = F.lit("")

    # Use provided custom_cities parameter, or fall back to module-level CUSTOM_CITIES
    cities_to_check = (
        custom_cities if custom_cities is not None else list(CUSTOM_CITIES)
    )

    if cities_to_check:
        # Create a single regex pattern for all custom cities
        # Sort by length (longest first) to match multi-word cities first
        sorted_custom_cities = sorted(cities_to_check, key=len, reverse=True)
        # Ensure cities are strings and uppercase for comparison
        sorted_custom_cities = [str(city).upper() for city in sorted_custom_cities]
        # Build pattern with all custom cities as alternatives
        custom_pattern = (
            r"(?i)\b(?:"
            + "|".join(re.escape(city) for city in sorted_custom_cities)
            + r")\b"
        )
        custom_city_result = F.regexp_extract(col, custom_pattern, 0)

    # Pattern to extract city before a proper state
    # First pattern: try to match city that comes after a comma and before state
    # "anything, City, State" - captures "City"
    city_after_comma_pattern = (
        r"(?i),\s*([^,]+?)\s*,\s*(?:" + "|".join(all_state_patterns) + r")\b"
    )

    # Second pattern: match city at start before state (no street address)
    # "City, State" or "City State ZIP"
    city_at_start_pattern = (
        r"(?i)^([^,]+?)(?:\s*,\s*(?:" + "|".join(all_state_patterns) + r")\b|"
        r"\s+(?:" + "|".join(state_abbrevs_only) + r")\s+\d{5})"
    )

    # Try to extract city using both patterns - prefer after comma (more specific)
    city_after_comma = F.regexp_extract(col, city_after_comma_pattern, 1)
    city_at_start = F.regexp_extract(col, city_at_start_pattern, 1)
    city = F.when(city_after_comma != "", city_after_comma).otherwise(city_at_start)

    # If no state found, try to extract before ZIP code only
    city_from_zip = F.regexp_extract(col, r"^(.+?)\s*(?:,\s*)?\d{5}(?:-\d{4})?\s*$", 1)

    # Use custom city if found, otherwise use regular extraction
    result = F.when(custom_city_result != "", F.initcap(custom_city_result)).otherwise(
        F.coalesce(city, city_from_zip, F.lit(""))
    )
    result = F.trim(F.regexp_replace(result, r"[,\s]+$", ""))

    # Handle case where we might have captured too much (e.g., street info)
    # If result contains common street suffixes, try to extract just the city part
    street_indicators = [
        "Street",
        "St",
        "Avenue",
        "Ave",
        "Road",
        "Rd",
        "Boulevard",
        "Blvd",
        "Drive",
        "Dr",
        "Lane",
        "Ln",
        "Court",
        "Ct",
        "Place",
        "Pl",
    ]
    street_pattern = r"(?i)\b(?:" + "|".join(street_indicators) + r")\b.*?,\s*(.+)$"

    # If we find street indicators, extract what comes after the last comma
    city_after_street = F.regexp_extract(result, street_pattern, 1)

    return F.when(city_after_street != "", city_after_street).otherwise(result)

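# Build-time note: custom_cities is folded into the regex when the Column
# expression is constructed, so it is a Python-side (not per-row) parameter.
# Per the docstring, a pre-configured variant can be created once and reused:
#
#   extract_city_reading = addresses.extract_city(custom_cities=["Reading"])
#   df.select(extract_city_reading(F.col("address")))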
@addresses.register()
def extract_state(col: Column, custom_states: Optional[Dict] = None) -> Column:
    """Extract and standardize state to 2-letter abbreviation.

    Handles both full state names and abbreviations, case-insensitive.
    Returns standardized 2-letter uppercase abbreviation.

    Args:
        col: Column containing address text with state information
        custom_states: Optional dict mapping full state names to abbreviations
            e.g., {"ONTARIO": "ON", "QUEBEC": "QC"}

    Returns:
        Column with 2-letter state abbreviation or empty string if not found

    Example:
        # Direct usage
        df.select(addresses.extract_state(F.col("address")))

        # With custom states (e.g., Canadian provinces)
        canadian_provinces = {"ONTARIO": "ON", "QUEBEC": "QC", "BRITISH COLUMBIA": "BC"}
        df.select(addresses.extract_state(F.col("address"), custom_states=canadian_provinces))
    """
    # Build combined state mappings
    states_map = US_STATES.copy()
    abbrev_map = STATE_ABBREV.copy()

    # Add custom states if provided
    if custom_states:
        for full_name, abbrev in custom_states.items():
            full_name_upper = str(full_name).upper()
            abbrev_upper = str(abbrev).upper()
            states_map[full_name_upper] = abbrev_upper
            abbrev_map[abbrev_upper] = full_name_upper

    # Create comprehensive state pattern
    all_states = list(states_map.keys()) + list(abbrev_map.keys())

    # Pattern to match state names/abbreviations
    # Look for states that appear before ZIP/postal code or at end of string
    # Support both US ZIP codes (12345) and Canadian postal codes (A1B 2C3)
    state_pattern = (
        r"(?i)\b("
        + "|".join(all_states)
        + r")\b(?:\s+(?:\d{5}(?:-\d{4})?|[A-Z]\d[A-Z]\s*\d[A-Z]\d))?(?:\s*$)"
    )

    # Extract the state (case-insensitive)
    extracted = F.upper(F.regexp_extract(col, state_pattern, 1))

    # Check if it's already a valid abbreviation (including custom ones)
    is_abbrev = extracted.isin(list(abbrev_map.keys()))

    # If it's an abbreviation, return it; otherwise check if it's a full name
    result = F.when(is_abbrev, extracted)

    # Map full state names to abbreviations (including custom ones)
    for full_name, abbrev in states_map.items():
        result = result.when(extracted == full_name, F.lit(abbrev))

    # Default to empty string if no match
    result = result.otherwise(F.lit(""))

    return result

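# Postal-code note: the state pattern above also tolerates a trailing Canadian
# postal code (A1B 2C3), so custom provinces can be matched in place
# (illustrative address):
#
#   provinces = {"ONTARIO": "ON", "QUEBEC": "QC"}
#   df.select(addresses.extract_state(F.col("address"), custom_states=provinces))
#   # "99 Bay St, Toronto, Ontario M5J 2R8" -> "ON"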
@addresses.register()
def validate_city(
    col: Column,
    known_cities: Optional[List] = None,
    min_length: int = 2,
    max_length: int = 50,
) -> Column:
    """Validate if a city name appears valid.

    Validates:
    - Not empty/null
    - Within reasonable length bounds
    - Contains valid characters (letters, spaces, hyphens, apostrophes, periods)
    - Optionally: matches a list of known cities

    Args:
        col: Column containing city names to validate
        known_cities: Optional list of valid city names to check against
        min_length: Minimum valid city name length (default 2)
        max_length: Maximum valid city name length (default 50)

    Returns:
        Boolean column indicating if city name is valid

    Example:
        # Basic validation
        df.select(addresses.validate_city(F.col("city")))

        # Validate against known cities
        us_cities = ["New York", "Los Angeles", "Chicago", ...]
        df.select(addresses.validate_city(F.col("city"), known_cities=us_cities))
    """
    # Clean the input
    cleaned = F.trim(col)

    # Basic validation: not empty
    not_empty = (cleaned.isNotNull()) & (cleaned != "")

    # Length validation
    length_valid = (F.length(cleaned) >= min_length) & (F.length(cleaned) <= max_length)

    # Character validation: letters, spaces, hyphens, apostrophes, periods, and numbers
    # Allow: St. Louis, O'Fallon, Winston-Salem, 29 Palms, etc.
    char_pattern = r"^[A-Za-z0-9\s\-'.]+$"
    chars_valid = F.regexp_extract(cleaned, char_pattern, 0) != ""

    # Combine basic validations
    basic_valid = not_empty & length_valid & chars_valid

    # If known cities provided, check against them
    if known_cities:
        # Normalize for comparison
        cleaned_upper = F.upper(cleaned)
        known_cities_upper = [str(city).upper() for city in known_cities]
        in_known_list = cleaned_upper.isin(known_cities_upper)
        return basic_valid & in_known_list

    return basic_valid

@addresses.register()
def validate_state(col: Column) -> Column:
    """Validate if state code is a valid US state abbreviation.

    Checks against list of valid US state abbreviations including territories.

    Args:
        col: Column containing state codes to validate

    Returns:
        Boolean column indicating if state code is valid
    """
    # Convert to uppercase for comparison
    upper_col = F.upper(F.trim(col))

    # Check if it's a valid abbreviation
    valid_abbrevs = list(STATE_ABBREV.keys())

    # Also check if it's a valid full state name
    valid_full_names = list(US_STATES.keys())

    return (upper_col.isin(valid_abbrevs)) | (upper_col.isin(valid_full_names))

1460
|
+
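# Usage sketch (illustrative only, not part of the original module):
# validate_state accepts both abbreviations and full names.
def _demo_validate_state():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("tx",), ("Texas",), ("ZZ",)], ["state"])
    # Expect: true, true, false
    df.select(addresses.validate_state(F.col("state")).alias("is_valid")).show()

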
@addresses.register()
def standardize_city(col: Column, custom_mappings: Optional[Dict] = None) -> Column:
    """Standardize city name formatting.

    - Trims whitespace
    - Normalizes internal spacing
    - Applies title case (with special handling for common patterns)
    - Optionally applies custom city name mappings

    Args:
        col: Column containing city names to standardize
        custom_mappings: Optional dict for city name corrections/standardization
            e.g., {"ST LOUIS": "St. Louis", "NEWYORK": "New York"}

    Returns:
        Column with standardized city names

    Example:
        # Basic standardization
        df.select(addresses.standardize_city(F.col("city")))

        # With custom mappings for common variations
        city_mappings = {
            "NYC": "New York",
            "LA": "Los Angeles",
            "SF": "San Francisco",
            "STLOUIS": "St. Louis"
        }
        df.select(addresses.standardize_city(F.col("city"), custom_mappings=city_mappings))
    """
    # Clean and normalize whitespace
    result = F.trim(F.regexp_replace(col, r"\s+", " "))

    # Apply custom mappings if provided
    mapped = F.lit(None)
    if custom_mappings:
        # Normalize mapping keys to uppercase for comparison
        normalized_mappings = {str(k).upper(): v for k, v in custom_mappings.items()}

        # Uppercase the input once so each comparison is case-insensitive
        upper_result = F.upper(result)

        # Apply each mapping
        for original, replacement in normalized_mappings.items():
            mapped = F.when(upper_result == original, F.lit(replacement)).otherwise(
                mapped
            )

    # If a mapping was applied, use it; otherwise apply standard formatting
    result = F.when(mapped.isNotNull(), mapped).otherwise(
        # Apply intelligent title case
        F.initcap(result)
    )

    # Fix common patterns that initcap doesn't handle well
    # Only apply these if we didn't use a custom mapping
    result = F.when(
        mapped.isNull(),
        F.regexp_replace(
            F.regexp_replace(
                F.regexp_replace(result, r"\bSt\b", "St."), r"\bFt\b", "Ft."
            ),
            r"\bMt\b",
            "Mt.",
        ),
    ).otherwise(result)

    return result


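# Usage sketch (illustrative only, not part of the original module):
# standardize_city with and without a custom mapping; sample rows are
# hypothetical.
def _demo_standardize_city():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("  st   louis ",), ("NYC",)], ["city"])
    # "st louis" -> "St. Louis" (initcap plus the St -> St. fix);
    # "NYC" -> "New York" via the custom mapping
    df.select(
        addresses.standardize_city(
            F.col("city"), custom_mappings={"NYC": "New York"}
        ).alias("city_std")
    ).show()

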
@addresses.register()
def standardize_state(col: Column) -> Column:
    """Convert state to standard 2-letter format.

    Converts full names to abbreviations and ensures uppercase.

    Args:
        col: Column containing state names or abbreviations

    Returns:
        Column with standardized 2-letter state codes
    """
    # Use extract_state which already does the standardization
    return extract_state(col)


@addresses.register()
def get_state_name(col: Column) -> Column:
    """Convert state abbreviation to full name.

    Args:
        col: Column containing 2-letter state abbreviations

    Returns:
        Column with full state names (title case) or empty string if invalid
    """
    # Convert to uppercase for lookup
    upper_col = F.upper(F.trim(col))

    # Start with empty string as default
    result = F.lit("")

    # Map each abbreviation to its full name
    for abbrev, full_name in STATE_ABBREV.items():
        result = F.when(upper_col == abbrev, F.lit(full_name.title())).otherwise(result)

    return result


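# Usage sketch (illustrative only, not part of the original module):
# mapping abbreviations back to full names.
def _demo_get_state_name():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("ny",), ("CA",), ("XX",)], ["abbrev"])
    # Expect: "New York", "California", "" (unknown code)
    df.select(addresses.get_state_name(F.col("abbrev")).alias("state")).show()

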
# Common country names and their variations
COUNTRIES = {
    # North America
    "USA": [
        "USA",
        "US",
        "U.S.A.",
        "U.S.",
        "United States",
        "United States of America",
        "America",
    ],
    "Canada": ["Canada", "CA", "CAN"],
    "Mexico": ["Mexico", "MX", "MEX"],
    # Europe
    "United Kingdom": [
        "UK",
        "U.K.",
        "United Kingdom",
        "Great Britain",
        "GB",
        "GBR",
        "England",
    ],
    "Germany": ["Germany", "DE", "DEU", "Deutschland"],
    "France": ["France", "FR", "FRA"],
    "Italy": ["Italy", "IT", "ITA", "Italia"],
    "Spain": ["Spain", "ES", "ESP", "España"],
    "Netherlands": ["Netherlands", "NL", "NLD", "Holland"],
    "Belgium": ["Belgium", "BE", "BEL"],
    "Switzerland": ["Switzerland", "CH", "CHE", "Swiss"],
    "Austria": ["Austria", "AT", "AUT"],
    "Poland": ["Poland", "PL", "POL"],
    "Sweden": ["Sweden", "SE", "SWE"],
    "Norway": ["Norway", "NO", "NOR"],
    "Denmark": ["Denmark", "DK", "DNK"],
    "Finland": ["Finland", "FI", "FIN"],
    "Ireland": ["Ireland", "IE", "IRL"],
    "Portugal": ["Portugal", "PT", "PRT"],
    "Greece": ["Greece", "GR", "GRC"],
    # Asia
    "China": ["China", "CN", "CHN", "PRC", "People's Republic of China"],
    "Japan": ["Japan", "JP", "JPN"],
    "India": ["India", "IN", "IND"],
    "South Korea": ["South Korea", "Korea", "KR", "KOR", "Republic of Korea"],
    "Singapore": ["Singapore", "SG", "SGP"],
    "Thailand": ["Thailand", "TH", "THA"],
    "Malaysia": ["Malaysia", "MY", "MYS"],
    "Indonesia": ["Indonesia", "ID", "IDN"],
    "Philippines": ["Philippines", "PH", "PHL"],
    "Vietnam": ["Vietnam", "VN", "VNM"],
    # Oceania
    "Australia": ["Australia", "AU", "AUS"],
    "New Zealand": ["New Zealand", "NZ", "NZL"],
    # South America
    "Brazil": ["Brazil", "BR", "BRA", "Brasil"],
    "Argentina": ["Argentina", "AR", "ARG"],
    "Chile": ["Chile", "CL", "CHL"],
    "Colombia": ["Colombia", "CO", "COL"],
    "Peru": ["Peru", "PE", "PER"],
    # Middle East
    "Israel": ["Israel", "IL", "ISR"],
    "Saudi Arabia": ["Saudi Arabia", "SA", "SAU", "KSA"],
    "UAE": ["UAE", "United Arab Emirates", "AE", "ARE"],
    # Africa
    "South Africa": ["South Africa", "ZA", "ZAF", "RSA"],
    "Egypt": ["Egypt", "EG", "EGY"],
    "Nigeria": ["Nigeria", "NG", "NGA"],
    "Kenya": ["Kenya", "KE", "KEN"],
}

# Create reverse mapping for quick lookups
COUNTRY_LOOKUP = {}
for standard_name, variations in COUNTRIES.items():
    for variation in variations:
        COUNTRY_LOOKUP[variation.upper()] = standard_name


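# Illustrative spot checks (not part of the original module) on what the
# reverse lookup contains:
def _demo_country_lookup():  # pragma: no cover - illustrative only
    assert COUNTRY_LOOKUP["DEUTSCHLAND"] == "Germany"
    assert COUNTRY_LOOKUP["U.K."] == "United Kingdom"
    assert COUNTRY_LOOKUP["US"] == "USA"

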
@addresses.register()
def extract_country(col: Column) -> Column:
    """Extract country from address.

    Extracts country names from addresses, handling common variations
    and abbreviations. Returns standardized country name.

    Note: short codes can be ambiguous; e.g. a trailing "CA" is treated as
    Canada even though it is also the abbreviation for California.

    Args:
        col: Column containing address text with potential country

    Returns:
        Column with extracted country name or empty string

    Example:
        df.select(addresses.extract_country(F.col("address")))
        # "123 Main St, New York, USA" -> "USA"
        # "456 Oak Ave, Toronto, Canada" -> "Canada"
        # "789 Elm St, London, UK" -> "United Kingdom"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Start with empty result
    result = F.lit("")

    # Check for country at the end of the address (most common).
    # Sort variations by length; each when() wraps the previous expression,
    # so conditions added later are evaluated first. The word-boundary and
    # end-of-string anchors below prevent partial matches either way.
    sorted_variations = sorted(
        COUNTRY_LOOKUP.items(), key=lambda x: len(x[0]), reverse=True
    )

    # Pattern to match country at the end, possibly after comma
    for variation, standard in sorted_variations:
        # Check if the address ends with this country variation
        # Use word boundary to avoid partial matches
        pattern = rf"(?:,\s*)?\b{re.escape(variation)}\.?\s*$"
        result = F.when(F.upper(col).rlike(pattern), F.lit(standard)).otherwise(result)

    return result


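# Usage sketch (illustrative only, not part of the original module):
# extract_country on hypothetical rows.
def _demo_extract_country():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("789 Elm St, London, UK",), ("12 Rue X, Paris",)], ["address"]
    )
    # Expect: "United Kingdom", "" (no recognized country suffix)
    df.select(addresses.extract_country(F.col("address")).alias("country")).show()

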
@addresses.register()
def has_country(col: Column) -> Column:
    """Check if address contains country information.

    Args:
        col: Column containing address text

    Returns:
        Column with boolean indicating presence of country

    Example:
        df.select(addresses.has_country(F.col("address")))
        # "123 Main St, USA" -> True
        # "456 Oak Ave" -> False
    """
    return extract_country(col) != ""


@addresses.register()
def remove_country(col: Column) -> Column:
    """Remove country from address.

    Removes country information from the end of addresses.

    Args:
        col: Column containing address text

    Returns:
        Column with country removed

    Example:
        df.select(addresses.remove_country(F.col("address")))
        # "123 Main St, New York, USA" -> "123 Main St, New York"
        # "456 Oak Ave, Toronto, Canada" -> "456 Oak Ave, Toronto"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    result = col

    # Sort variations by length (longest first) so e.g. "United States of
    # America" is removed before "America" could leave a partial remnant
    sorted_variations = sorted(COUNTRY_LOOKUP.keys(), key=len, reverse=True)

    # Remove each country variation
    for variation in sorted_variations:
        # Pattern to match country at the end with optional comma and spaces
        # Note: PySpark's regexp_replace uses Java regex, which has different syntax
        # Escape the variation for regex
        escaped = re.escape(variation)
        # Build pattern for case-insensitive matching at end of string.
        # The \b word boundary keeps short codes from clipping word endings
        # (e.g. "SA" must not strip the tail of "Tulsa").
        pattern = f"(?i),?\\s*\\b{escaped}\\.?\\s*$"
        result = F.regexp_replace(result, pattern, "")

    # Clean up any trailing commas or spaces
    result = F.regexp_replace(result, r",?\s*$", "")

    return result


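# Usage sketch (illustrative only, not part of the original module):
# remove_country leaves the rest of the address intact; the word boundary
# keeps "Tulsa" from losing its "sa".
def _demo_remove_country():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("456 Oak Ave, Toronto, Canada",), ("99 Elm St, Tulsa",)], ["address"]
    )
    # Expect: "456 Oak Ave, Toronto", "99 Elm St, Tulsa"
    df.select(addresses.remove_country(F.col("address")).alias("addr")).show(
        truncate=False
    )

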
@addresses.register()
def standardize_country(col: Column, custom_mappings: Optional[Dict] = None) -> Column:
    """Standardize country name to consistent format.

    Converts various country representations to standard names.

    Args:
        col: Column containing country name or abbreviation
        custom_mappings: Optional dict of custom country mappings;
            these take precedence over the built-in mappings

    Returns:
        Column with standardized country name

    Example:
        df.select(addresses.standardize_country(F.col("country")))
        # "US" -> "USA"
        # "U.K." -> "United Kingdom"
        # "Deutschland" -> "Germany"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Clean and normalize
    upper_col = F.upper(F.trim(col))

    # Apply standard mappings first
    result = col
    for variation, standard in COUNTRY_LOOKUP.items():
        result = F.when(upper_col == variation, F.lit(standard)).otherwise(result)

    # Layer custom mappings on last: each when() wraps the previous
    # expression and is evaluated first, so custom entries win for any
    # overlapping keys
    if custom_mappings:
        for original, standard in custom_mappings.items():
            result = F.when(
                upper_col == F.upper(F.lit(original)), F.lit(standard)
            ).otherwise(result)

    return result


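# Usage sketch (illustrative only, not part of the original module):
# standard and custom mappings together; "Blighty" is a made-up custom key.
def _demo_standardize_country():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("Deutschland",), ("Blighty",)], ["country"])
    # Expect: "Germany" (built-in), "United Kingdom" (custom entry)
    df.select(
        addresses.standardize_country(
            F.col("country"), custom_mappings={"Blighty": "United Kingdom"}
        ).alias("country_std")
    ).show()

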
@addresses.register()
def extract_po_box(col: Column) -> Column:
    """Extract PO Box number from address.

    Extracts PO Box, P.O. Box, POB, Post Office Box numbers.
    Handles various formats including with/without periods and spaces.

    Args:
        col: Column containing address text

    Returns:
        Column with extracted PO Box number or empty string

    Example:
        df.select(addresses.extract_po_box(F.col("address")))
        # "PO Box 123" -> "123"
        # "P.O. Box 456" -> "456"
        # "POB 789" -> "789"
        # "Post Office Box 1011" -> "1011"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to match various PO Box formats
    # Matches: PO Box, P.O. Box, POB, Post Office Box, etc.
    # Captures the box number (numeric, alphanumeric, or with dashes and special chars)
    # POB must be followed by space and start with number or #
    po_box_pattern = r"(?i)(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)"

    result = F.regexp_extract(col, po_box_pattern, 1)
    return F.when(result.isNull(), F.lit("")).otherwise(result)


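# Usage sketch (illustrative only, not part of the original module): the
# lookahead after POB means a bare "POB" followed by a word is left alone.
def _demo_extract_po_box():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("P.O. Box 456",), ("POB 789",), ("POB Street",)], ["address"]
    )
    # Expect: "456", "789", "" (lookahead rejects "POB Street")
    df.select(addresses.extract_po_box(F.col("address")).alias("box")).show()

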
@addresses.register()
def has_po_box(col: Column) -> Column:
    """Check if address contains PO Box.

    Args:
        col: Column containing address text

    Returns:
        Column with boolean indicating presence of PO Box

    Example:
        df.select(addresses.has_po_box(F.col("address")))
        # "PO Box 123" -> True
        # "123 Main St" -> False
    """
    return extract_po_box(col) != ""


@addresses.register()
def is_po_box_only(col: Column) -> Column:
    """Check if address is ONLY a PO Box (no street address).

    Args:
        col: Column containing address text

    Returns:
        Column with boolean indicating if address is PO Box only

    Example:
        df.select(addresses.is_po_box_only(F.col("address")))
        # "PO Box 123" -> True
        # "123 Main St, PO Box 456" -> False
        # "PO Box 789, New York, NY" -> True
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Check if it has a PO Box
    has_box = has_po_box(col)

    # Check if it has a street number (indicating a street address)
    # Pattern to detect street numbers at the beginning
    street_pattern = r"^\d+\s+[A-Za-z]"
    has_street = F.regexp_extract(col, street_pattern, 0) != ""

    # It's PO Box only if it has a PO Box but no street address
    return has_box & ~has_street


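# Usage sketch (illustrative only, not part of the original module): the
# street-number heuristic only looks at the start of the string, so a
# leading PO Box with a trailing city still counts as PO-Box-only.
def _demo_is_po_box_only():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("PO Box 789, New York, NY",), ("123 Main St, PO Box 456",)], ["address"]
    )
    # Expect: true, false
    df.select(addresses.is_po_box_only(F.col("address")).alias("box_only")).show()

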
@addresses.register()
def remove_po_box(col: Column) -> Column:
    """Remove PO Box from address.

    Removes PO Box information while preserving other address components.

    Args:
        col: Column containing address text

    Returns:
        Column with PO Box removed

    Example:
        df.select(addresses.remove_po_box(F.col("address")))
        # "123 Main St, PO Box 456" -> "123 Main St"
        # "PO Box 789, New York, NY" -> "New York, NY"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to match various PO Box formats with optional comma
    po_box_pattern = r"(?i),?\s*(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)\s*,?"

    # Remove the PO Box
    result = F.regexp_replace(col, po_box_pattern, ",")

    # Clean up any leading/trailing commas or spaces
    result = F.regexp_replace(result, r"^\s*,\s*", "")  # Leading comma
    result = F.regexp_replace(result, r",?\s*$", "")  # Trailing comma/space
    result = F.regexp_replace(result, r",\s*,+", ",")  # Multiple commas to single
    result = F.regexp_replace(result, r"\s+", " ")  # Multiple spaces to single

    return F.trim(result)


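# Usage sketch (illustrative only, not part of the original module):
# remove_po_box keeps the surrounding components and tidies the punctuation.
def _demo_remove_po_box():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("PO Box 789, New York, NY",)], ["address"])
    # Expect: "New York, NY"
    df.select(addresses.remove_po_box(F.col("address")).alias("addr")).show()

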
@addresses.register()
def standardize_po_box(col: Column) -> Column:
    """Standardize PO Box format to consistent representation.

    Converts various PO Box formats to standard "PO Box XXXX" format.

    Args:
        col: Column containing PO Box text

    Returns:
        Column with standardized PO Box format

    Example:
        df.select(addresses.standardize_po_box(F.col("po_box")))
        # "P.O. Box 123" -> "PO Box 123"
        # "POB 456" -> "PO Box 456"
        # "Post Office Box 789" -> "PO Box 789"
        # "123 Main St" -> "123 Main St" (no change if no PO Box)
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Extract the PO Box number
    box_number = extract_po_box(col)

    # If we found a PO Box, replace it with standard format.
    # Note: passing a Column as the regexp_replace replacement is supported
    # in newer PySpark (3.4+); older versions accept only a string here.
    result = F.when(
        box_number != "",
        F.regexp_replace(
            col,
            r"(?i)(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)",
            F.concat(F.lit("PO Box "), box_number),
        ),
    ).otherwise(col)

    return result


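# Usage sketch (illustrative only, not part of the original module):
# normalizing mixed PO Box spellings.
def _demo_standardize_po_box():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("P.O. Box 123",), ("POB 456",), ("123 Main St",)], ["po_box"]
    )
    # Expect: "PO Box 123", "PO Box 456", "123 Main St" (untouched)
    df.select(addresses.standardize_po_box(F.col("po_box")).alias("std")).show()

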
@addresses.register()
def extract_private_mailbox(col: Column) -> Column:
    """Extract private mailbox (PMB) number from address.

    Extracts PMB or Private Mail Box numbers, commonly used with
    commercial mail receiving agencies (like UPS Store).

    Args:
        col: Column containing address text

    Returns:
        Column with extracted PMB number or empty string

    Example:
        df.select(addresses.extract_private_mailbox(F.col("address")))
        # "123 Main St PMB 456" -> "456"
        # "789 Oak Ave #101 PMB 12" -> "12"
    """
    # Handle nulls
    col = F.when(col.isNull(), F.lit("")).otherwise(col)

    # Pattern to match PMB (Private Mail Box)
    pmb_pattern = r"(?i)(?:PMB|Private\s+Mail\s+Box)\s+([A-Z0-9\-]+)"

    result = F.regexp_extract(col, pmb_pattern, 1)
    return F.when(result.isNull(), F.lit("")).otherwise(result)


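# Usage sketch (illustrative only, not part of the original module): PMB
# extraction alongside a unit number.
def _demo_extract_private_mailbox():  # pragma: no cover - illustrative only
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("789 Oak Ave #101 PMB 12",)], ["address"])
    # Expect: "12" (the PMB number, not the unit number)
    df.select(addresses.extract_private_mailbox(F.col("address")).alias("pmb")).show()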