datacompose-0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (31)
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
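The substance of the release is the generated PySpark primitives file shown in the diff below (datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py). For orientation, a minimal usage sketch, assuming a local SparkSession, an illustrative column name, and an import path that depends on where datacompose writes the generated module:

    from pyspark.sql import SparkSession, functions as F
    # Illustrative import path - the generated file's location is project-specific
    from pyspark_primitives import addresses

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("123 N Main St, Apt 4B, Springfield, IL 62704",)], ["address"]
    )
    df.select(
        addresses.extract_street_number(F.col("address")).alias("number"),
        addresses.extract_city(F.col("address")).alias("city"),
        addresses.extract_state(F.col("address")).alias("state"),
        addresses.extract_zip_code(F.col("address")).alias("zip"),
    ).show()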
@@ -0,0 +1,1967 @@
+ import re
+ from typing import TYPE_CHECKING, Dict, List, Optional
+
+ if TYPE_CHECKING:
+     # For type checkers only - these imports are always available during type checking
+     from pyspark.sql import Column
+     from pyspark.sql import functions as F
+ else:
+     # At runtime, handle missing PySpark gracefully
+     try:
+         from pyspark.sql import Column
+         from pyspark.sql import functions as F
+     except ImportError:
+         # PySpark is not installed - functions will fail at runtime if called
+         pass
+
+ try:
+     # Try local utils import first (for generated code)
+     from utils.primitives import PrimitiveRegistry
+ except ImportError:
+     # Fall back to installed datacompose package
+     from datacompose.operators.primitives import PrimitiveRegistry
+
+ addresses = PrimitiveRegistry("addresses")
+
+ # US State mappings - comprehensive list including territories
+ # These are mutable to allow extension
+ US_STATES = {
+     "ALABAMA": "AL",
+     "ALASKA": "AK",
+     "ARIZONA": "AZ",
+     "ARKANSAS": "AR",
+     "CALIFORNIA": "CA",
+     "COLORADO": "CO",
+     "CONNECTICUT": "CT",
+     "DELAWARE": "DE",
+     "FLORIDA": "FL",
+     "GEORGIA": "GA",
+     "HAWAII": "HI",
+     "IDAHO": "ID",
+     "ILLINOIS": "IL",
+     "INDIANA": "IN",
+     "IOWA": "IA",
+     "KANSAS": "KS",
+     "KENTUCKY": "KY",
+     "LOUISIANA": "LA",
+     "MAINE": "ME",
+     "MARYLAND": "MD",
+     "MASSACHUSETTS": "MA",
+     "MICHIGAN": "MI",
+     "MINNESOTA": "MN",
+     "MISSISSIPPI": "MS",
+     "MISSOURI": "MO",
+     "MONTANA": "MT",
+     "NEBRASKA": "NE",
+     "NEVADA": "NV",
+     "NEW HAMPSHIRE": "NH",
+     "NEW JERSEY": "NJ",
+     "NEW MEXICO": "NM",
+     "NEW YORK": "NY",
+     "NORTH CAROLINA": "NC",
+     "NORTH DAKOTA": "ND",
+     "OHIO": "OH",
+     "OKLAHOMA": "OK",
+     "OREGON": "OR",
+     "PENNSYLVANIA": "PA",
+     "RHODE ISLAND": "RI",
+     "SOUTH CAROLINA": "SC",
+     "SOUTH DAKOTA": "SD",
+     "TENNESSEE": "TN",
+     "TEXAS": "TX",
+     "UTAH": "UT",
+     "VERMONT": "VT",
+     "VIRGINIA": "VA",
+     "WASHINGTON": "WA",
+     "WEST VIRGINIA": "WV",
+     "WISCONSIN": "WI",
+     "WYOMING": "WY",
+     # US Territories and DC
+     "DISTRICT OF COLUMBIA": "DC",
+     "PUERTO RICO": "PR",
+     "VIRGIN ISLANDS": "VI",
+     "GUAM": "GU",
+     "AMERICAN SAMOA": "AS",
+     "NORTHERN MARIANA ISLANDS": "MP",
+ }
+
+ # Reverse mapping: abbreviation to full name
+ STATE_ABBREV = {
+     "AL": "ALABAMA",
+     "AK": "ALASKA",
+     "AZ": "ARIZONA",
+     "AR": "ARKANSAS",
+     "CA": "CALIFORNIA",
+     "CO": "COLORADO",
+     "CT": "CONNECTICUT",
+     "DE": "DELAWARE",
+     "FL": "FLORIDA",
+     "GA": "GEORGIA",
+     "HI": "HAWAII",
+     "ID": "IDAHO",
+     "IL": "ILLINOIS",
+     "IN": "INDIANA",
+     "IA": "IOWA",
+     "KS": "KANSAS",
+     "KY": "KENTUCKY",
+     "LA": "LOUISIANA",
+     "ME": "MAINE",
+     "MD": "MARYLAND",
+     "MA": "MASSACHUSETTS",
+     "MI": "MICHIGAN",
+     "MN": "MINNESOTA",
+     "MS": "MISSISSIPPI",
+     "MO": "MISSOURI",
+     "MT": "MONTANA",
+     "NE": "NEBRASKA",
+     "NV": "NEVADA",
+     "NH": "NEW HAMPSHIRE",
+     "NJ": "NEW JERSEY",
+     "NM": "NEW MEXICO",
+     "NY": "NEW YORK",
+     "NC": "NORTH CAROLINA",
+     "ND": "NORTH DAKOTA",
+     "OH": "OHIO",
+     "OK": "OKLAHOMA",
+     "OR": "OREGON",
+     "PA": "PENNSYLVANIA",
+     "RI": "RHODE ISLAND",
+     "SC": "SOUTH CAROLINA",
+     "SD": "SOUTH DAKOTA",
+     "TN": "TENNESSEE",
+     "TX": "TEXAS",
+     "UT": "UTAH",
+     "VT": "VERMONT",
+     "VA": "VIRGINIA",
+     "WA": "WASHINGTON",
+     "WV": "WEST VIRGINIA",
+     "WI": "WISCONSIN",
+     "WY": "WYOMING",
+     # US Territories and DC
+     "DC": "DISTRICT OF COLUMBIA",
+     "PR": "PUERTO RICO",
+     "VI": "VIRGIN ISLANDS",
+     "GU": "GUAM",
+     "AS": "AMERICAN SAMOA",
+     "MP": "NORTHERN MARIANA ISLANDS",
+ }
+
+ # Custom cities that users want to recognize
+ # Users can add to this list for better city extraction
+ CUSTOM_CITIES = set()
+
+
+ def add_custom_state(full_name: str, abbreviation: str) -> None:
+     """Add a custom state or region to the state mappings.
+
+     This allows extending the address parser to handle non-US states/provinces.
+
+     Args:
+         full_name: Full name of the state/province (e.g., "ONTARIO")
+         abbreviation: Two-letter abbreviation (e.g., "ON")
+
+     Example:
+         # Add Canadian provinces
+         add_custom_state("ONTARIO", "ON")
+         add_custom_state("QUEBEC", "QC")
+         add_custom_state("BRITISH COLUMBIA", "BC")
+     """
+     full_name_upper = full_name.upper()
+     abbrev_upper = abbreviation.upper()
+
+     US_STATES[full_name_upper] = abbrev_upper
+     STATE_ABBREV[abbrev_upper] = full_name_upper
+
+
+ def add_custom_city(city_name: str) -> None:
+     """Add a custom city name to improve city extraction.
+
+     This is useful for cities that might be ambiguous or hard to extract.
+
+     Args:
+         city_name: Name of the city to add
+
+     Example:
+         # Add cities that might be confused with other words
+         add_custom_city("Reading")  # Could be confused with the verb
+         add_custom_city("Mobile")  # Could be confused with the adjective
+     """
+     CUSTOM_CITIES.add(city_name.upper())
+
+
+ def remove_custom_state(identifier: str) -> None:
+     """Remove a custom state from the mappings.
+
+     Args:
+         identifier: Either the full name or abbreviation of the state to remove
+     """
+     identifier_upper = identifier.upper()
+
+     # Check if it's an abbreviation
+     if identifier_upper in STATE_ABBREV:
+         full_name = STATE_ABBREV[identifier_upper]
+         del STATE_ABBREV[identifier_upper]
+         if full_name in US_STATES:
+             del US_STATES[full_name]
+     # Check if it's a full name
+     elif identifier_upper in US_STATES:
+         abbrev = US_STATES[identifier_upper]
+         del US_STATES[identifier_upper]
+         if abbrev in STATE_ABBREV:
+             del STATE_ABBREV[abbrev]
+
+
+ def remove_custom_city(city_name: str) -> None:
+     """Remove a custom city from the set.
+
+     Args:
+         city_name: Name of the city to remove
+     """
+     CUSTOM_CITIES.discard(city_name.upper())
+
+
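The state and city tables above are module-level and mutable by design: extract_state copies US_STATES/STATE_ABBREV and extract_city falls back to CUSTOM_CITIES at the moment a column expression is built, so the helpers take effect for anything constructed afterwards. A minimal sketch (the Canadian entry is illustrative, reusing the df from the sketch above):

    add_custom_state("ONTARIO", "ON")
    add_custom_city("Reading")

    # Expressions built after these calls pick up the additions
    df.select(
        addresses.extract_state(F.col("address")),  # now also recognizes "Ontario"/"ON"
        addresses.extract_city(F.col("address")),   # now prefers "Reading" as a city name
    )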
+ @addresses.register()
+ def extract_street_number(col: Column) -> Column:
+     """Extract street/house number from address.
+
+     Extracts the numeric portion at the beginning of an address.
+     Handles various formats: 123, 123A, 123-125, etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted street number or empty string
+
+     Example:
+         df.select(addresses.extract_street_number(F.col("address")))
+         # "123 Main St" -> "123"
+         # "123A Oak Ave" -> "123A"
+         # "123-125 Elm St" -> "123-125"
+     """
+     # Pattern to match house/building numbers at the start (after optional whitespace)
+     # Matches: 123, 123A, 123-125, 123½, etc.
+     pattern = r"^\s*(\d+[\w\-/]*)\b"
+     result = F.regexp_extract(col, pattern, 1)
+     # Return empty string for null results
+     return F.when(result.isNull() | (col.isNull()), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def extract_street_prefix(col: Column) -> Column:
+     """Extract directional prefix from street address.
+
+     Extracts directional prefixes like N, S, E, W, NE, NW, SE, SW.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted street prefix or empty string
+
+     Example:
+         df.select(addresses.extract_street_prefix(F.col("address")))
+         # "123 N Main St" -> "N"
+         # "456 South Oak Ave" -> "South"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Remove house number first (after trimming leading whitespace)
+     without_number = F.regexp_replace(col, r"^\s*\d+[\w\-/]*\s*", "")
+
+     # Pattern for directional prefixes - case insensitive
+     # Capture the prefix including optional period
+     prefix_pattern = r"^(?i)(North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\b"
+
+     result = F.regexp_extract(without_number, prefix_pattern, 1)
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def extract_street_name(col: Column) -> Column:
+     """Extract street name from address.
+
+     Extracts the main street name, excluding number, prefix, and suffix.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted street name or empty string
+
+     Example:
+         df.select(addresses.extract_street_name(F.col("address")))
+         # "123 N Main Street" -> "Main"
+         # "456 Oak Avenue" -> "Oak"
+         # "789 Martin Luther King Jr Blvd" -> "Martin Luther King Jr"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Common street suffixes to identify end of street name
+     # Using abbreviated forms from the YAML config
+     suffixes = [
+         "Street",
+         "St",
+         "Avenue",
+         "Ave",
+         "Road",
+         "Rd",
+         "Boulevard",
+         "Blvd",
+         "Drive",
+         "Dr",
+         "Lane",
+         "Ln",
+         "Court",
+         "Ct",
+         "Place",
+         "Pl",
+         "Circle",
+         "Cir",
+         "Trail",
+         "Trl",
+         "Parkway",
+         "Pkwy",
+         "Highway",
+         "Hwy",
+         "Way",
+         "Terrace",
+         "Ter",
+         "Plaza",
+         "Plz",
+         "Square",
+         "Sq",
+         "Loop",
+         "Crescent",
+         "Cres",
+     ]
+
+     # Remove house number only if followed by more text (not just a suffix)
+     # This preserves numbered streets like "5th Avenue" while removing "123 Main St"
+     # Check if we have a pattern like "number word suffix" vs just "number suffix"
+     # Trim leading whitespace first
+     trimmed_col = F.trim(col)
+     without_number = F.when(
+         # If it's just a numbered street (e.g., "5th Avenue", "1st Street")
+         trimmed_col.rlike(r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"),
+         trimmed_col  # Keep as is - it's a numbered street name
+     ).otherwise(
+         # Otherwise remove the house number
+         F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
+     )
+
+     # Remove directional prefix - case insensitive
+     # Include full directional words and abbreviations
+     prefix_pattern = (
+         r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
+     )
+     without_prefix = F.regexp_replace(without_number, prefix_pattern, "")
+
+     # Extract everything before the street suffix - case insensitive
+     suffix_pattern = r"^(?i)(.+?)\s+(?:" + "|".join(suffixes) + r")\b"
+     street_name = F.regexp_extract(without_prefix, suffix_pattern, 1)
+
+     # If no suffix found, try to extract before comma or end
+     street_name = F.when(street_name != "", street_name).otherwise(
+         F.regexp_extract(without_prefix, r"^([^,]+?)(?:\s*,|\s*$)", 1)
+     )
+
+     return F.trim(street_name)
+
+
+ @addresses.register()
+ def extract_street_suffix(col: Column) -> Column:
+     """Extract street type/suffix from address.
+
+     Extracts street type like Street, Avenue, Road, Boulevard, etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted street suffix or empty string
+
+     Example:
+         df.select(addresses.extract_street_suffix(F.col("address")))
+         # "123 Main Street" -> "Street"
+         # "456 Oak Ave" -> "Ave"
+         # "789 Elm Boulevard" -> "Boulevard"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Comprehensive list of street suffixes (both full and abbreviated)
+     suffixes = [
+         "Street",
+         "St",
+         "Avenue",
+         "Ave",
+         "Road",
+         "Rd",
+         "Boulevard",
+         "Blvd",
+         "Drive",
+         "Dr",
+         "Lane",
+         "Ln",
+         "Court",
+         "Ct",
+         "Place",
+         "Pl",
+         "Circle",
+         "Cir",
+         "Trail",
+         "Trl",
+         "Parkway",
+         "Pkwy",
+         "Highway",
+         "Hwy",
+         "Way",
+         "Terrace",
+         "Ter",
+         "Plaza",
+         "Plz",
+         "Square",
+         "Sq",
+         "Loop",
+         "Crescent",
+         "Cres",
+         "Alley",
+         "Aly",
+     ]
+
+     # Build pattern to match the LAST suffix in the string
+     # This handles cases like "St. James Place" where we want "Place" not "St"
+     suffix_pattern = r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
+
+     # Extract the last matching suffix - case insensitive
+     suffix_pattern_ci = r"(?i)" + suffix_pattern
+     result = F.regexp_extract(col, suffix_pattern_ci, 1)
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def extract_full_street(col: Column) -> Column:
+     """Extract complete street address (number + prefix + name + suffix).
+
+     Extracts everything before apartment/suite and city/state/zip.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted street address or empty string
+
+     Example:
+         df.select(addresses.extract_full_street(F.col("address")))
+         # "123 N Main St, Apt 4B, New York, NY" -> "123 N Main St"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Remove apartment/suite information - case insensitive
+     apt_pattern = r"\s*,?\s*(?i)(?:Apt|Apartment|Unit|Suite|Ste|#)\s*[\w\-]+\b"
+     without_apt = F.regexp_replace(col, apt_pattern, "")
+
+     # Extract everything before the first comma (usually street part)
+     street = F.regexp_extract(without_apt, r"^([^,]+)", 1)
+
+     # If no comma, try to extract before city/state pattern
+     # Look for pattern like "Street City" or "Street State ZIP"
+     street = F.when(
+         street == "",
+         F.regexp_extract(
+             col, r"^(.+?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s*,?\s*[A-Z]{2}\s+\d{5}", 1
+         ),
+     ).otherwise(street)
+
+     return F.trim(street)
+
+
+ @addresses.register()
+ def standardize_street_prefix(
+     col: Column, custom_mappings: Optional[Dict[str, str]] = None
+ ) -> Column:
+     """Standardize street directional prefixes to abbreviated form.
+
+     Converts all variations to standard USPS abbreviations:
+     North/N/N. → N, South/S/S. → S, etc.
+
+     Args:
+         col: Column containing street prefix
+         custom_mappings: Optional dict of custom prefix mappings (case insensitive)
+
+     Returns:
+         Column with standardized prefix (always abbreviated per USPS standards)
+
+     Example:
+         df.select(addresses.standardize_street_prefix(F.col("prefix")))
+         # "North" -> "N"
+         # "south" -> "S"
+         # "NorthEast" -> "NE"
+     """
+     # Mapping based on YAML config prefixes (lines 806-814)
+     prefix_map = {
+         "NORTH": "N",
+         "N.": "N",
+         "N": "N",
+         "SOUTH": "S",
+         "S.": "S",
+         "S": "S",
+         "EAST": "E",
+         "E.": "E",
+         "E": "E",
+         "WEST": "W",
+         "W.": "W",
+         "W": "W",
+         "NORTHEAST": "NE",
+         "NE.": "NE",
+         "NE": "NE",
+         "NORTHWEST": "NW",
+         "NW.": "NW",
+         "NW": "NW",
+         "SOUTHEAST": "SE",
+         "SE.": "SE",
+         "SE": "SE",
+         "SOUTHWEST": "SW",
+         "SW.": "SW",
+         "SW": "SW",
+     }
+
+     # Convert to uppercase for matching
+     upper_col = F.upper(F.trim(col))
+
+     # Apply custom mappings first if provided
+     result = col
+     if custom_mappings:
+         for original, standard in custom_mappings.items():
+             result = F.when(
+                 upper_col == F.upper(F.lit(original)), F.lit(standard)
+             ).otherwise(result)
+         return result
+
+     # Apply default mapping
+     result = F.lit("")
+     for original, standard in prefix_map.items():
+         result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
+
+     return result
+
+
+ @addresses.register()
+ def standardize_street_suffix(
+     col: Column, custom_mappings: Optional[Dict[str, str]] = None
+ ) -> Column:
+     """Standardize street type/suffix to USPS abbreviated form.
+
+     Converts all variations to standard USPS abbreviations per the config:
+     Street/St/St. → St, Avenue/Ave/Av → Ave, Boulevard → Blvd, etc.
+
+     Args:
+         col: Column containing street suffix
+         custom_mappings: Optional dict of custom suffix mappings (case insensitive)
+
+     Returns:
+         Column with standardized suffix (always abbreviated per USPS standards)
+
+     Example:
+         df.select(addresses.standardize_street_suffix(F.col("suffix")))
+         # "Street" -> "St"
+         # "avenue" -> "Ave"
+         # "BOULEVARD" -> "Blvd"
+     """
+     # Based on YAML config suffixes mapping (lines 824-965)
+     # This is a subset of the most common ones
+     suffix_map = {
+         "STREET": "St",
+         "ST": "St",
+         "ST.": "St",
+         "STR": "St",
+         "AVENUE": "Ave",
+         "AVE": "Ave",
+         "AVE.": "Ave",
+         "AV": "Ave",
+         "AVEN": "Ave",
+         "ROAD": "Rd",
+         "RD": "Rd",
+         "RD.": "Rd",
+         "BOULEVARD": "Blvd",
+         "BLVD": "Blvd",
+         "BLVD.": "Blvd",
+         "BOUL": "Blvd",
+         "DRIVE": "Dr",
+         "DR": "Dr",
+         "DR.": "Dr",
+         "DRV": "Dr",
+         "DRIV": "Dr",
+         "LANE": "Ln",
+         "LN": "Ln",
+         "LN.": "Ln",
+         "COURT": "Ct",
+         "CT": "Ct",
+         "CT.": "Ct",
+         "CRT": "Ct",
+         "PLACE": "Pl",
+         "PL": "Pl",
+         "PL.": "Pl",
+         "PLC": "Pl",
+         "CIRCLE": "Cir",
+         "CIR": "Cir",
+         "CIR.": "Cir",
+         "CIRC": "Cir",
+         "TRAIL": "Trl",
+         "TRL": "Trl",
+         "TRL.": "Trl",
+         "TR": "Trl",
+         "PARKWAY": "Pkwy",
+         "PKWY": "Pkwy",
+         "PKY": "Pkwy",
+         "PWAY": "Pkwy",
+         "HIGHWAY": "Hwy",
+         "HWY": "Hwy",
+         "HWY.": "Hwy",
+         "HIWAY": "Hwy",
+         "WAY": "Way",
+         "WY": "Way",
+         "TERRACE": "Ter",
+         "TER": "Ter",
+         "TER.": "Ter",
+         "TERR": "Ter",
+         "PLAZA": "Plz",
+         "PLZ": "Plz",
+         "PLZ.": "Plz",
+         "PLZA": "Plz",
+         "SQUARE": "Sq",
+         "SQ": "Sq",
+         "SQ.": "Sq",
+         "SQR": "Sq",
+         "LOOP": "Loop",
+         "LP": "Loop",
+         "CRESCENT": "Cres",
+         "CRES": "Cres",
+         "CRES.": "Cres",
+         "CRSC": "Cres",
+         "ALLEY": "Aly",
+         "ALY": "Aly",
+         "ALY.": "Aly",
+         "ALLY": "Aly",
+     }
+
+     # Handle nulls - return empty string for null input
+     if col is None:
+         return F.lit("")
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Convert to uppercase for matching
+     upper_col = F.upper(F.trim(col))
+
+     # Start with the original column
+     result = col
+
+     # Apply custom mappings first if provided (they take precedence)
+     if custom_mappings:
+         for original, standard in custom_mappings.items():
+             result = F.when(
+                 upper_col == F.upper(F.lit(original)), F.lit(standard)
+             ).otherwise(result)
+
+     # Then apply standard mappings for anything not already mapped
+     # Need to check if result has changed to avoid overwriting custom mappings
+     for original, standard in suffix_map.items():
+         # Only apply if not already mapped by custom mappings
+         if custom_mappings and original.upper() in [k.upper() for k in custom_mappings.keys()]:
+             continue
+         result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
+
+     return result
+
+
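Custom mappings passed to standardize_street_suffix win over the built-in table: built-in entries whose keys also appear (case-insensitively) in custom_mappings are skipped. An illustrative override that keeps "Street" spelled out while other suffixes still abbreviate:

    df.select(
        addresses.standardize_street_suffix(
            F.col("suffix"),
            custom_mappings={"Street": "Street", "St": "Street"},
        )
    )
    # "Street" -> "Street" (custom mapping wins)
    # "Avenue" -> "Ave"    (built-in default still applies)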
+ @addresses.register()
+ def extract_apartment_number(col: Column) -> Column:
+     """Extract apartment/unit number from address.
+
+     Extracts apartment, suite, unit, or room numbers including:
+     Apt 5B, Suite 200, Unit 12, #4A, Rm 101, etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted apartment/unit number or empty string
+
+     Example:
+         df.select(addresses.extract_apartment_number(F.col("address")))
+         # "123 Main St Apt 5B" -> "5B"
+         # "456 Oak Ave Suite 200" -> "200"
+         # "789 Elm St #4A" -> "4A"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Patterns for different unit types - case insensitive
+     # Matches: Apt, Apartment, Suite, Ste, Unit, Room, Rm, # followed by alphanumeric
+     # Updated to handle fractions (1/2, 3½), decimals (12.5), parentheses and other special cases
+     apt_pattern = r"(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*(\(?[A-Z0-9\-/½¼¾\.]+\)?)"
+
+     result = F.regexp_extract(col, apt_pattern, 1)
+
+     # If no unit type found, check for trailing numbers (e.g., "123 Main St 456")
+     if_no_result = F.when(
+         result == "", F.regexp_extract(col, r"\s+(\d+[A-Z]?)\s*$", 1)
+     ).otherwise(result)
+
+     return F.when(if_no_result.isNull(), F.lit("")).otherwise(if_no_result)
+
+
+ @addresses.register()
+ def extract_floor(col: Column) -> Column:
+     """Extract floor number from address.
+
+     Extracts floor information like:
+     5th Floor, Floor 2, Fl 3, Level 4, etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted floor number or empty string
+
+     Example:
+         df.select(addresses.extract_floor(F.col("address")))
+         # "123 Main St, 5th Floor" -> "5"
+         # "456 Oak Ave, Floor 2" -> "2"
+         # "789 Elm St, Level 3" -> "3"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern for floor information - case insensitive
+     # Matches: 1st Floor, 2nd Floor, 3rd Floor, 4th-99th Floor, Floor 1, Fl. 2, Level 3
+     # Updated to handle abbreviated forms like "31st Fl"
+     floor_pattern = r"(?i)(?:(\d+)(?:st|nd|rd|th)?\s*(?:Floor|Fl\.?)|Floor\s*(\d+)|Fl\.?\s*(\d+)|Level\s*(\d+))"
+
+     # Extract from any of the capture groups
+     floor1 = F.regexp_extract(col, floor_pattern, 1)
+     floor2 = F.regexp_extract(col, floor_pattern, 2)
+     floor3 = F.regexp_extract(col, floor_pattern, 3)
+     floor4 = F.regexp_extract(col, floor_pattern, 4)
+
+     # Return the first non-empty match
+     result = F.when(floor1 != "", floor1).otherwise(
+         F.when(floor2 != "", floor2).otherwise(
+             F.when(floor3 != "", floor3).otherwise(floor4)
+         )
+     )
+
+     return F.when(result.isNull() | (result == ""), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def extract_building(col: Column) -> Column:
+     """Extract building name or identifier from address.
+
+     Extracts building information like:
+     Building A, Tower 2, Complex B, Block C, etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted building identifier or empty string
+
+     Example:
+         df.select(addresses.extract_building(F.col("address")))
+         # "123 Main St, Building A" -> "A"
+         # "456 Oak Ave, Tower 2" -> "2"
+         # "789 Elm St, Complex North" -> "North"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern for building information - case insensitive
+     # Matches: Building A, Bldg 2, Tower B, Complex 3, Block C, Wing D, Blg B
+     # Updated to handle multi-word names but stop at commas or other building indicators
+     building_pattern = r"(?i)(?:Building|Bldg\.?|Blg|Tower|Complex|Block|Wing)\s+([A-Z0-9]+(?:\s+[A-Z0-9]+)?)"
+
+     # Stop capturing if we hit another building indicator (Floor, Suite, etc.)
+     result_raw = F.regexp_extract(col, building_pattern, 1)
+
+     # Clean up - remove anything after Floor, Suite, Apt, etc.
+     result = F.regexp_replace(
+         result_raw,
+         r"(?i)\s+(?:Floor|Fl\.?|Suite|Ste\.?|Apt\.?|Apartment|Unit|Room|Rm\.?).*",
+         "",
+     )
+
+     return F.when(result.isNull() | (result == ""), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def extract_unit_type(col: Column) -> Column:
+     """Extract the type of unit (Apt, Suite, Unit, etc.) from address.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with unit type or empty string
+
+     Example:
+         df.select(addresses.extract_unit_type(F.col("address")))
+         # "123 Main St Apt 5B" -> "Apt"
+         # "456 Oak Ave Suite 200" -> "Suite"
+         # "789 Elm St Unit 12" -> "Unit"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to extract unit type - case insensitive
+     unit_type_pattern = r"(?i)(Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)"
+
+     result = F.regexp_extract(col, unit_type_pattern, 1)
+
+     # Clean up the result (remove periods, standardize case)
+     result = F.when(
+         result != "", F.initcap(F.regexp_replace(result, r"\.", ""))
+     ).otherwise("")
+
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def standardize_unit_type(
+     col: Column, custom_mappings: Optional[Dict[str, str]] = None
+ ) -> Column:
+     """Standardize unit type to common abbreviations.
+
+     Converts all variations to standard abbreviations:
+     Apartment/Apt. → Apt, Suite → Ste, Room → Rm, etc.
+
+     Args:
+         col: Column containing unit type
+         custom_mappings: Optional dict of custom unit type mappings
+
+     Returns:
+         Column with standardized unit type
+
+     Example:
+         df.select(addresses.standardize_unit_type(F.col("unit_type")))
+         # "Apartment" -> "Apt"
+         # "Suite" -> "Ste"
+         # "Room" -> "Rm"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Standard mappings for unit types
+     unit_map = {
+         "APARTMENT": "Apt",
+         "APT.": "Apt",
+         "APT": "Apt",
+         "SUITE": "Ste",
+         "STE.": "Ste",
+         "STE": "Ste",
+         "UNIT": "Unit",
+         "ROOM": "Rm",
+         "RM.": "Rm",
+         "RM": "Rm",
+         "FLOOR": "Fl",
+         "FL.": "Fl",
+         "FL": "Fl",
+         "BUILDING": "Bldg",
+         "BLDG.": "Bldg",
+         "BLDG": "Bldg",
+         "#": "#",
+         "NUMBER": "#",
+         "NO.": "#",
+         "NO": "#",
+     }
+
+     # Convert to uppercase for matching
+     upper_col = F.upper(F.trim(col))
+
+     # Apply custom mappings first if provided
+     result = col
+     if custom_mappings:
+         for original, standard in custom_mappings.items():
+             result = F.when(
+                 upper_col == F.upper(F.lit(original)), F.lit(standard)
+             ).otherwise(result)
+
+     # Then apply standard mappings for anything not custom mapped
+     for original, standard in unit_map.items():
+         result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
+
+     return result
+
+
+ @addresses.register()
+ def extract_secondary_address(col: Column) -> Column:
+     """Extract complete secondary address information (unit type + number).
+
+     Combines unit type and number into standard format:
+     "Apt 5B", "Ste 200", "Unit 12", etc.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with complete secondary address or empty string
+
+     Example:
+         df.select(addresses.extract_secondary_address(F.col("address")))
+         # "123 Main St Apt 5B" -> "Apt 5B"
+         # "456 Oak Ave, Suite 200" -> "Suite 200"
+         # "789 Elm St" -> ""
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to extract complete secondary address - case insensitive
+     secondary_pattern = (
+         r"(?i)((?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+)"
+     )
+
+     result = F.regexp_extract(col, secondary_pattern, 1)
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
+
+
+ @addresses.register()
+ def has_apartment(col: Column) -> Column:
+     """Check if address contains apartment/unit information.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with boolean indicating presence of apartment/unit
+
+     Example:
+         df.select(addresses.has_apartment(F.col("address")))
+         # "123 Main St Apt 5B" -> True
+         # "456 Oak Ave" -> False
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Check for apartment/unit patterns
+     apt_pattern = (
+         r"(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+"
+     )
+
+     # Return boolean
+     return F.when(F.regexp_extract(col, apt_pattern, 0) != "", F.lit(True)).otherwise(
+         F.lit(False)
+     )
+
+
+ @addresses.register()
+ def remove_secondary_address(col: Column) -> Column:
+     """Remove apartment/suite/unit information from address.
+
+     Removes secondary address components to get clean street address.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with secondary address removed
+
+     Example:
+         df.select(addresses.remove_secondary_address(F.col("address")))
+         # "123 Main St Apt 5B" -> "123 Main St"
+         # "456 Oak Ave, Suite 200" -> "456 Oak Ave"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to match secondary address components - case insensitive
+     # Include optional comma and spaces before
+     secondary_pattern = (
+         r",?\s*(?i)(?:Apt\.?|Apartment|Suite|Ste\.?|Unit|Room|Rm\.?|#)\s*[A-Z0-9\-]+\b"
+     )
+
+     # Remove the pattern and clean up extra spaces
+     result = F.regexp_replace(col, secondary_pattern, "")
+     result = F.regexp_replace(result, r"\s+", " ")  # Clean multiple spaces
+     result = F.trim(result)
+
+     return result
+
+
+ def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
+     """Format unit type and number into standard secondary address.
+
+     Note: This is a helper function, not registered with addresses primitive.
+     Use it directly with two columns.
+
+     Args:
+         unit_type: Column containing unit type (Apt, Suite, etc.)
+         unit_number: Column containing unit number (5B, 200, etc.)
+
+     Returns:
+         Column with formatted secondary address
+
+     Example:
+         from datacompose.transformers.text.clean_addresses.pyspark.pyspark_primitives import format_secondary_address
+         df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
+         # -> "Apt 5B"
+     """
+     # Standardize the unit type first
+     std_type = standardize_unit_type(unit_type)
+
+     # Combine type and number, handling nulls
+     result = F.when(
+         (std_type.isNotNull() & (std_type != ""))
+         & (unit_number.isNotNull() & (unit_number != "")),
+         F.concat_ws(" ", std_type, unit_number),
+     ).otherwise(F.lit(""))
+
+     return result
+
+
+ @addresses.register()
+ def extract_zip_code(col: Column) -> Column:  # type: ignore
+     """Extract US ZIP code (5-digit or ZIP+4 format) from text.
+
+     Returns empty string for null/invalid inputs.
+     """
+     extracted = F.regexp_extract(col, r"\b(\d{5}(?:-\d{4})?)\b", 1)
+     # Return empty string instead of null for consistency
+     return F.when(extracted.isNull(), F.lit("")).otherwise(extracted)
+
+
+ @addresses.register()
+ def validate_zip_code(col: Column) -> Column:
+     """Validate if a ZIP code is in correct US format.
+
+     Validates:
+     - 5-digit format (e.g., "12345")
+     - ZIP+4 format (e.g., "12345-6789")
+     - Not all zeros (except "00000" which is technically valid)
+     - Within valid range (00001-99999 for base ZIP)
+
+     Args:
+         col (Column): Column containing ZIP codes to validate
+
+     Returns:
+         Column: Boolean column indicating if ZIP code is valid
+     """
+     # Check if the column matches valid ZIP code pattern
+     is_valid_format = F.regexp_extract(col, r"^(\d{5}(?:-\d{4})?)$", 1) != ""
+
+     # Additional validation: not empty/null
+     is_not_empty = (col.isNotNull()) & (F.trim(col) != "")
+
+     # Combined validation
+     return is_valid_format & is_not_empty
+
+
+ @addresses.register()
+ def is_valid_zip_code(col: Column) -> "Column":
+     """Alias for validate_zip_code for consistency.
+
+     Args:
+         col (Column): Column containing ZIP codes to validate
+
+     Returns:
+         Column: Boolean column indicating if ZIP code is valid
+     """
+     return validate_zip_code(col)
+
+
+ @addresses.register()
+ def standardize_zip_code(col: Column):
+     """Standardize ZIP code format.
+
+     - Removes extra spaces
+     - Ensures proper dash placement for ZIP+4
+     - Returns empty string for invalid formats
+
+     Args:
+         col (Column): Column containing ZIP codes to standardize
+
+     Returns:
+         Column: Standardized ZIP code or empty string if invalid
+     """
+     # First extract the ZIP code
+     extracted = extract_zip_code(col)
+
+     # Then validate it
+     is_valid = validate_zip_code(extracted)
+
+     # Return standardized version or empty string
+     return F.when(is_valid, extracted).otherwise(F.lit(""))
+
+
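Note the division of labor here: validate_zip_code matches the whole value (the pattern is anchored with ^...$), while standardize_zip_code first runs extract_zip_code and validates only the extracted piece, so it also cleans ZIPs embedded in longer strings. A minimal sketch, assuming spark and the addresses registry from the earlier examples:

    df = spark.createDataFrame(
        [("Springfield, IL 62704-1234",), ("no zip here",)], ["raw"]
    )
    df.select(
        addresses.extract_zip_code(F.col("raw")).alias("zip"),          # "62704-1234", ""
        addresses.validate_zip_code(F.col("raw")).alias("whole_valid"), # False (not a bare ZIP), False
        addresses.standardize_zip_code(F.col("raw")).alias("std"),      # "62704-1234", ""
    )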
+ @addresses.register()
+ def get_zip_code_type(col: Column):
+     """Determine the type of ZIP code.
+
+     Args:
+         col (Column): Column containing ZIP codes
+
+     Returns:
+         Column: String column with values: "standard", "plus4", "invalid", or "empty"
+     """
+     # Check patterns
+     is_standard = F.regexp_extract(col, r"^(\d{5})$", 1) != ""
+     is_plus4 = F.regexp_extract(col, r"^(\d{5}-\d{4})$", 1) != ""
+     is_empty = (col.isNull()) | (F.trim(col) == "")
+
+     return (
+         F.when(is_plus4, F.lit("plus4"))
+         .when(is_standard, F.lit("standard"))
+         .when(is_empty, F.lit("empty"))
+         .otherwise(F.lit("invalid"))
+     )
+
+
+ @addresses.register()
+ def split_zip_code(col: Column):
+     """Split ZIP+4 code into base and extension components.
+
+     Args:
+         col (Column): Column containing ZIP codes
+
+     Returns:
+         Column: Struct with 'base' and 'extension' fields
+     """
+     # Extract base ZIP (first 5 digits)
+     base_zip = F.regexp_extract(col, r"^(\d{5})", 1)
+
+     # Extract extension (4 digits after dash, if present)
+     extension = F.regexp_extract(col, r"^\d{5}-(\d{4})$", 1)
+
+     # Return as struct
+     return F.struct(
+         base_zip.alias("base"),
+         F.when(extension != "", extension).otherwise(F.lit(None)).alias("extension"),
+     )
+
+
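Because split_zip_code returns a struct column, the parts come out with ordinary field access:

    parts = addresses.split_zip_code(F.col("zip"))
    df.select(
        parts.getField("base").alias("zip5"),
        parts.getField("extension").alias("plus4"),  # null when there is no +4 extension
    )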
+ @addresses.register()
+ def extract_city(col: Column, custom_cities: Optional[List] = None) -> Column:
+     """Extract city name from US address text.
+
+     Extracts city by finding text before state abbreviation or ZIP code.
+     Handles various formats including comma-separated and multi-word cities.
+
+     Args:
+         col: Column containing address text
+         custom_cities: Optional list of custom city names to recognize (case-insensitive)
+
+     Returns:
+         Column with extracted city name or empty string if not found
+
+     Example:
+         # Direct usage
+         df.select(addresses.extract_city(F.col("address")))
+
+         # With custom cities
+         df.select(addresses.extract_city(F.col("address"), custom_cities=["Reading", "Mobile"]))
+
+         # Pre-configured
+         extract_city_custom = addresses.extract_city(custom_cities=["Reading", "Mobile"])
+         df.select(extract_city_custom(F.col("address")))
+     """
+     # For city extraction, match both abbreviations and full state names
+     # But prioritize abbreviations to avoid false matches
+     state_abbrevs_only = list(STATE_ABBREV.keys())
+     # Add common full state names for city extraction
+     common_full_states = [
+         "California",
+         "New York",
+         "Texas",
+         "Florida",
+         "Pennsylvania",
+         "Illinois",
+         "Ohio",
+         "Georgia",
+         "North Carolina",
+         "Michigan",
+         "New Jersey",
+         "Virginia",
+         "Washington",
+         "Massachusetts",
+         "Arizona",
+         "Tennessee",
+         "Indiana",
+         "Missouri",
+         "Maryland",
+         "Wisconsin",
+         "Colorado",
+         "Minnesota",
+         "South Carolina",
+         "Alabama",
+         "Louisiana",
+         "Kentucky",
+         "Oregon",
+         "Oklahoma",
+         "Connecticut",
+         "Utah",
+         "Iowa",
+         "Nevada",
+         "Arkansas",
+         "Mississippi",
+         "Kansas",
+         "New Mexico",
+         "Nebraska",
+         "Idaho",
+         "West Virginia",
+         "Hawaii",
+         "New Hampshire",
+         "Maine",
+         "Montana",
+         "Rhode Island",
+         "Delaware",
+         "South Dakota",
+         "North Dakota",
+         "Alaska",
+         "Vermont",
+         "Wyoming",
+         "District of Columbia",
+         "Puerto Rico",
+     ]
+
+     # Combine abbreviations and full names for pattern
+     all_state_patterns = state_abbrevs_only + [s.upper() for s in common_full_states]
+
+     # Check for custom cities if provided
+     custom_city_result = F.lit("")
+
+     # Use provided custom_cities parameter, or fall back to module-level CUSTOM_CITIES
+     cities_to_check = (
+         custom_cities if custom_cities is not None else list(CUSTOM_CITIES)
+     )
+
+     if cities_to_check:
+         # Create a single regex pattern for all custom cities
+         # Sort by length (longest first) to match multi-word cities first
+         sorted_custom_cities = sorted(cities_to_check, key=len, reverse=True)
+         # Ensure cities are strings and uppercase for comparison
+         sorted_custom_cities = [str(city).upper() for city in sorted_custom_cities]
+         # Build pattern with all custom cities as alternatives
+         custom_pattern = (
+             r"(?i)\b(?:"
+             + "|".join(re.escape(city) for city in sorted_custom_cities)
+             + r")\b"
+         )
+         custom_city_result = F.regexp_extract(col, custom_pattern, 0)
+
+     # Pattern to extract city before a proper state
+     # First pattern: try to match city that comes after a comma and before state
+     # "anything, City, State" - captures "City"
+     city_after_comma_pattern = (
+         r"(?i),\s*([^,]+?)\s*,\s*(?:" + "|".join(all_state_patterns) + r")\b"
+     )
+
+     # Second pattern: match city at start before state (no street address)
+     # "City, State" or "City State ZIP"
+     city_at_start_pattern = (
+         r"(?i)^([^,]+?)(?:\s*,\s*(?:" + "|".join(all_state_patterns) + r")\b|"
+         r"\s+(?:" + "|".join(state_abbrevs_only) + r")\s+\d{5})"
+     )
+
+     # Try to extract city using both patterns - prefer after comma (more specific)
+     city_after_comma = F.regexp_extract(col, city_after_comma_pattern, 1)
+     city_at_start = F.regexp_extract(col, city_at_start_pattern, 1)
+     city = F.when(city_after_comma != "", city_after_comma).otherwise(city_at_start)
+
+     # If no state found, try to extract before ZIP code only
+     city_from_zip = F.regexp_extract(col, r"^(.+?)\s*(?:,\s*)?\d{5}(?:-\d{4})?\s*$", 1)
+
+     # Use custom city if found, otherwise use regular extraction
+     result = F.when(custom_city_result != "", F.initcap(custom_city_result)).otherwise(
+         F.coalesce(city, city_from_zip, F.lit(""))
+     )
+     result = F.trim(F.regexp_replace(result, r"[,\s]+$", ""))
+
+     # Handle case where we might have captured too much (e.g., street info)
+     # If result contains common street suffixes, try to extract just the city part
+     street_indicators = [
+         "Street",
+         "St",
+         "Avenue",
+         "Ave",
+         "Road",
+         "Rd",
+         "Boulevard",
+         "Blvd",
+         "Drive",
+         "Dr",
+         "Lane",
+         "Ln",
+         "Court",
+         "Ct",
+         "Place",
+         "Pl",
+     ]
+     street_pattern = r"(?i)\b(?:" + "|".join(street_indicators) + r")\b.*?,\s*(.+)$"
+
+     # If we find street indicators, extract what comes after the last comma
+     city_after_street = F.regexp_extract(result, street_pattern, 1)
+
+     return F.when(city_after_street != "", city_after_street).otherwise(result)
+
+
+ @addresses.register()
+ def extract_state(col: Column, custom_states: Optional[Dict] = None) -> Column:
+     """Extract and standardize state to 2-letter abbreviation.
+
+     Handles both full state names and abbreviations, case-insensitive.
+     Returns standardized 2-letter uppercase abbreviation.
+
+     Args:
+         col: Column containing address text with state information
+         custom_states: Optional dict mapping full state names to abbreviations
+             e.g., {"ONTARIO": "ON", "QUEBEC": "QC"}
+
+     Returns:
+         Column with 2-letter state abbreviation or empty string if not found
+
+     Example:
+         # Direct usage
+         df.select(addresses.extract_state(F.col("address")))
+
+         # With custom states (e.g., Canadian provinces)
+         canadian_provinces = {"ONTARIO": "ON", "QUEBEC": "QC", "BRITISH COLUMBIA": "BC"}
+         df.select(addresses.extract_state(F.col("address"), custom_states=canadian_provinces))
+     """
+     # Build combined state mappings
+     states_map = US_STATES.copy()
+     abbrev_map = STATE_ABBREV.copy()
+
+     # Add custom states if provided
+     if custom_states:
+         for full_name, abbrev in custom_states.items():
+             full_name_upper = str(full_name).upper()
+             abbrev_upper = str(abbrev).upper()
+             states_map[full_name_upper] = abbrev_upper
+             abbrev_map[abbrev_upper] = full_name_upper
+
+     # Create comprehensive state pattern
+     all_states = list(states_map.keys()) + list(abbrev_map.keys())
+
+     # Pattern to match state names/abbreviations
+     # Look for states that appear before ZIP/postal code or at end of string
+     # Support both US ZIP codes (12345) and Canadian postal codes (A1B 2C3)
+     state_pattern = (
+         r"(?i)\b("
+         + "|".join(all_states)
+         + r")\b(?:\s+(?:\d{5}(?:-\d{4})?|[A-Z]\d[A-Z]\s*\d[A-Z]\d))?(?:\s*$)"
+     )
+
+     # Extract the state (case-insensitive)
+     extracted = F.upper(F.regexp_extract(col, state_pattern, 1))
+
+     # Check if it's already a valid abbreviation (including custom ones)
+     is_abbrev = extracted.isin(list(abbrev_map.keys()))
+
+     # If it's an abbreviation, return it; otherwise check if it's a full name
+     result = F.when(is_abbrev, extracted)
+
+     # Map full state names to abbreviations (including custom ones)
+     for full_name, abbrev in states_map.items():
+         result = result.when(extracted == full_name, F.lit(abbrev))
+
+     # Default to empty string if no match
+     result = result.otherwise(F.lit(""))
+
+     return result
+
+
+ @addresses.register()
+ def validate_city(
+     col: Column,
+     known_cities: Optional[List] = None,
+     min_length: int = 2,
+     max_length: int = 50,
+ ) -> Column:
+     """Validate if a city name appears valid.
+
+     Validates:
+     - Not empty/null
+     - Within reasonable length bounds
+     - Contains valid characters (letters, spaces, hyphens, apostrophes, periods)
+     - Optionally: matches a list of known cities
+
+     Args:
+         col: Column containing city names to validate
+         known_cities: Optional list of valid city names to check against
+         min_length: Minimum valid city name length (default 2)
+         max_length: Maximum valid city name length (default 50)
+
+     Returns:
+         Boolean column indicating if city name is valid
+
+     Example:
+         # Basic validation
+         df.select(addresses.validate_city(F.col("city")))
+
+         # Validate against known cities
+         us_cities = ["New York", "Los Angeles", "Chicago", ...]
+         df.select(addresses.validate_city(F.col("city"), known_cities=us_cities))
+     """
+     # Clean the input
+     cleaned = F.trim(col)
+
+     # Basic validation: not empty
+     not_empty = (cleaned.isNotNull()) & (cleaned != "")
+
+     # Length validation
+     length_valid = (F.length(cleaned) >= min_length) & (F.length(cleaned) <= max_length)
+
+     # Character validation: letters, spaces, hyphens, apostrophes, periods, and numbers
+     # Allow: St. Louis, O'Fallon, Winston-Salem, 29 Palms, etc.
+     char_pattern = r"^[A-Za-z0-9\s\-'.]+$"
+     chars_valid = F.regexp_extract(cleaned, char_pattern, 0) != ""
+
+     # Combine basic validations
+     basic_valid = not_empty & length_valid & chars_valid
+
+     # If known cities provided, check against them
+     if known_cities:
+         # Normalize for comparison
+         cleaned_upper = F.upper(cleaned)
+         known_cities_upper = [str(city).upper() for city in known_cities]
+         in_known_list = cleaned_upper.isin(known_cities_upper)
+         return basic_valid & in_known_list
+
+     return basic_valid
+
+
+ @addresses.register()
+ def validate_state(col: Column) -> Column:
+     """Validate if state code is a valid US state abbreviation.
+
+     Checks against list of valid US state abbreviations including territories.
+
+     Args:
+         col: Column containing state codes to validate
+
+     Returns:
+         Boolean column indicating if state code is valid
+     """
+     # Convert to uppercase for comparison
+     upper_col = F.upper(F.trim(col))
+
+     # Check if it's a valid abbreviation
+     valid_abbrevs = list(STATE_ABBREV.keys())
+
+     # Also check if it's a valid full state name
+     valid_full_names = list(US_STATES.keys())
+
+     return (upper_col.isin(valid_abbrevs)) | (upper_col.isin(valid_full_names))
+
+
+ @addresses.register()
+ def standardize_city(col: Column, custom_mappings: Optional[Dict] = None) -> Column:
+     """Standardize city name formatting.
+
+     - Trims whitespace
+     - Normalizes internal spacing
+     - Applies title case (with special handling for common patterns)
+     - Optionally applies custom city name mappings
+
+     Args:
+         col: Column containing city names to standardize
+         custom_mappings: Optional dict for city name corrections/standardization
+             e.g., {"ST LOUIS": "St. Louis", "NEWYORK": "New York"}
+
+     Returns:
+         Column with standardized city names
+
+     Example:
+         # Basic standardization
+         df.select(addresses.standardize_city(F.col("city")))
+
+         # With custom mappings for common variations
+         city_mappings = {
+             "NYC": "New York",
+             "LA": "Los Angeles",
+             "SF": "San Francisco",
+             "STLOUIS": "St. Louis"
+         }
+         df.select(addresses.standardize_city(F.col("city"), custom_mappings=city_mappings))
+     """
+     # Clean and normalize whitespace
+     result = F.trim(F.regexp_replace(col, r"\s+", " "))
+
+     # Apply custom mappings if provided
+     mapped = F.lit(None)
+     if custom_mappings:
+         # Normalize mappings to uppercase for comparison
+         normalized_mappings = {str(k).upper(): v for k, v in custom_mappings.items()}
+
+         # Start with the original result
+         upper_result = F.upper(result)
+
+         # Apply each mapping
+         for original, replacement in normalized_mappings.items():
+             mapped = F.when(upper_result == original, F.lit(replacement)).otherwise(
+                 mapped
+             )
+
+     # If a mapping was applied, use it; otherwise apply standard formatting
+     result = F.when(mapped.isNotNull(), mapped).otherwise(
+         # Apply intelligent title case
+         F.initcap(result)
+     )
+
+     # Fix common patterns that initcap doesn't handle well
+     # Only apply these if we didn't use a custom mapping
+     result = F.when(
+         mapped.isNull(),
+         F.regexp_replace(
+             F.regexp_replace(
+                 F.regexp_replace(result, r"\bSt\b", "St."), r"\bFt\b", "Ft."
+             ),
+             r"\bMt\b",
+             "Mt.",
+         ),
+     ).otherwise(result)
+
+     return result
+
+
+ @addresses.register()
+ def standardize_state(col: Column) -> Column:
+     """Convert state to standard 2-letter format.
+
+     Converts full names to abbreviations and ensures uppercase.
+
+     Args:
+         col: Column containing state names or abbreviations
+
+     Returns:
+         Column with standardized 2-letter state codes
+     """
+     # Use extract_state which already does the standardization
+     return extract_state(col)
+
+
+ @addresses.register()
+ def get_state_name(col: Column) -> Column:
+     """Convert state abbreviation to full name.
+
+     Args:
+         col: Column containing 2-letter state abbreviations
+
+     Returns:
+         Column with full state names (title case) or empty string if invalid
+     """
+     # Convert to uppercase for lookup
+     upper_col = F.upper(F.trim(col))
+
+     # Start with empty string as default
+     result = F.lit("")
+
+     # Map each abbreviation to its full name
+     for abbrev, full_name in STATE_ABBREV.items():
+         result = F.when(upper_col == abbrev, F.lit(full_name.title())).otherwise(result)
+
+     return result
+
+
+ # Common country names and their variations
+ COUNTRIES = {
+     # North America
+     "USA": [
+         "USA",
+         "US",
+         "U.S.A.",
+         "U.S.",
+         "United States",
+         "United States of America",
+         "America",
+     ],
+     "Canada": ["Canada", "CA", "CAN"],
+     "Mexico": ["Mexico", "MX", "MEX"],
+     # Europe
+     "United Kingdom": [
+         "UK",
+         "U.K.",
+         "United Kingdom",
+         "Great Britain",
+         "GB",
+         "GBR",
+         "England",
+     ],
+     "Germany": ["Germany", "DE", "DEU", "Deutschland"],
+     "France": ["France", "FR", "FRA"],
+     "Italy": ["Italy", "IT", "ITA", "Italia"],
+     "Spain": ["Spain", "ES", "ESP", "España"],
+     "Netherlands": ["Netherlands", "NL", "NLD", "Holland"],
+     "Belgium": ["Belgium", "BE", "BEL"],
+     "Switzerland": ["Switzerland", "CH", "CHE", "Swiss"],
+     "Austria": ["Austria", "AT", "AUT"],
+     "Poland": ["Poland", "PL", "POL"],
+     "Sweden": ["Sweden", "SE", "SWE"],
+     "Norway": ["Norway", "NO", "NOR"],
+     "Denmark": ["Denmark", "DK", "DNK"],
+     "Finland": ["Finland", "FI", "FIN"],
+     "Ireland": ["Ireland", "IE", "IRL"],
+     "Portugal": ["Portugal", "PT", "PRT"],
+     "Greece": ["Greece", "GR", "GRC"],
+     # Asia
+     "China": ["China", "CN", "CHN", "PRC", "People's Republic of China"],
+     "Japan": ["Japan", "JP", "JPN"],
+     "India": ["India", "IN", "IND"],
+     "South Korea": ["South Korea", "Korea", "KR", "KOR", "Republic of Korea"],
+     "Singapore": ["Singapore", "SG", "SGP"],
+     "Thailand": ["Thailand", "TH", "THA"],
+     "Malaysia": ["Malaysia", "MY", "MYS"],
+     "Indonesia": ["Indonesia", "ID", "IDN"],
+     "Philippines": ["Philippines", "PH", "PHL"],
+     "Vietnam": ["Vietnam", "VN", "VNM"],
+     # Oceania
+     "Australia": ["Australia", "AU", "AUS"],
+     "New Zealand": ["New Zealand", "NZ", "NZL"],
+     # South America
+     "Brazil": ["Brazil", "BR", "BRA", "Brasil"],
+     "Argentina": ["Argentina", "AR", "ARG"],
+     "Chile": ["Chile", "CL", "CHL"],
+     "Colombia": ["Colombia", "CO", "COL"],
+     "Peru": ["Peru", "PE", "PER"],
+     # Middle East
+     "Israel": ["Israel", "IL", "ISR"],
+     "Saudi Arabia": ["Saudi Arabia", "SA", "SAU", "KSA"],
+     "UAE": ["UAE", "United Arab Emirates", "AE", "ARE"],
+     # Africa
+     "South Africa": ["South Africa", "ZA", "ZAF", "RSA"],
+     "Egypt": ["Egypt", "EG", "EGY"],
+     "Nigeria": ["Nigeria", "NG", "NGA"],
+     "Kenya": ["Kenya", "KE", "KEN"],
+ }
+
+ # Create reverse mapping for quick lookups
+ COUNTRY_LOOKUP = {}
+ for standard_name, variations in COUNTRIES.items():
+     for variation in variations:
+         COUNTRY_LOOKUP[variation.upper()] = standard_name
+
+
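The reverse table keys every variation, uppercased, to its standard name, so driver-side lookups are plain dict reads:

    COUNTRY_LOOKUP["U.K."]         # -> "United Kingdom"
    COUNTRY_LOOKUP["DEUTSCHLAND"]  # -> "Germany"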
1647
+ @addresses.register()
1648
+ def extract_country(col: Column) -> Column:
1649
+ """Extract country from address.
1650
+
1651
+ Extracts country names from addresses, handling common variations
1652
+ and abbreviations. Returns standardized country name.
1653
+
1654
+ Args:
1655
+ col: Column containing address text with potential country
1656
+
1657
+ Returns:
1658
+ Column with extracted country name or empty string
1659
+
1660
+ Example:
1661
+ df.select(addresses.extract_country(F.col("address")))
1662
+ # "123 Main St, New York, USA" -> "USA"
1663
+ # "456 Oak Ave, Toronto, Canada" -> "Canada"
1664
+ # "789 Elm St, London, UK" -> "United Kingdom"
1665
+ """
1666
+ # Handle nulls
1667
+ col = F.when(col.isNull(), F.lit("")).otherwise(col)
1668
+
1669
+ # Start with empty result
1670
+ result = F.lit("")
1671
+
1672
+ # Check for country at the end of the address (most common)
1673
+ # Sort variations by length (longest first) to avoid partial matches
1674
+ sorted_variations = sorted(
1675
+ COUNTRY_LOOKUP.items(), key=lambda x: len(x[0]), reverse=True
1676
+ )
1677
+
1678
+ # Pattern to match country at the end, possibly after comma
1679
+ for variation, standard in sorted_variations:
1680
+ # Check if the address ends with this country variation
1681
+ # Use word boundary to avoid partial matches
1682
+ pattern = rf"(?:,\s*)?\b{re.escape(variation)}\.?\s*$"
1683
+ result = F.when(F.upper(col).rlike(pattern), F.lit(standard)).otherwise(result)
1684
+
1685
+ return result
1686
+
1687
+
+ @addresses.register()
+ def has_country(col: Column) -> Column:
+     """Check if address contains country information.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with boolean indicating presence of country
+
+     Example:
+         df.select(addresses.has_country(F.col("address")))
+         # "123 Main St, USA" -> True
+         # "456 Oak Ave" -> False
+     """
+     return extract_country(col) != ""
+
+
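+ # Because this returns a boolean Column, it also works directly as a filter
+ # predicate (illustrative, assuming a DataFrame `df` with an `address` column):
+ #
+ #     international = df.filter(addresses.has_country(F.col("address")))
+
+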
+ @addresses.register()
+ def remove_country(col: Column) -> Column:
+     """Remove country from address.
+
+     Removes country information from the end of addresses.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with country removed
+
+     Example:
+         df.select(addresses.remove_country(F.col("address")))
+         # "123 Main St, New York, USA" -> "123 Main St, New York"
+         # "456 Oak Ave, Toronto, Canada" -> "456 Oak Ave, Toronto"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     result = col
+
+     # Sort variations by length (longest first) so e.g. "United Arab Emirates"
+     # is removed before the shorter "AE" pattern is tried
+     sorted_variations = sorted(COUNTRY_LOOKUP.keys(), key=len, reverse=True)
+
+     # Remove each country variation
+     for variation in sorted_variations:
+         # Pattern to match the country at the end with optional comma and spaces.
+         # Note: PySpark's regexp_replace uses Java regex; (?i) enables
+         # case-insensitive matching. Escape the variation for regex.
+         escaped = re.escape(variation)
+         # The \b word boundary keeps a short code from eating the tail of a
+         # longer word (without it, "IN" would strip the end of "AUSTIN")
+         pattern = f"(?i),?\\s*\\b{escaped}\\.?\\s*$"
+         result = F.regexp_replace(result, pattern, "")
+
+     # Clean up any trailing commas or spaces
+     result = F.regexp_replace(result, r",?\s*$", "")
+
+     return result
+
+
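+ # Usage sketch (illustrative): remove_country pairs naturally with
+ # extract_country to split one column into two.
+ #
+ #     df.select(
+ #         addresses.remove_country(F.col("address")).alias("local_part"),
+ #         addresses.extract_country(F.col("address")).alias("country"),
+ #     )
+ #
+ # "123 Main St, New York, USA" -> ("123 Main St, New York", "USA")
+
+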
+ @addresses.register()
+ def standardize_country(col: Column, custom_mappings: Optional[dict] = None) -> Column:
+     """Standardize country name to a consistent format.
+
+     Converts various country representations to standard names.
+
+     Args:
+         col: Column containing country name or abbreviation
+         custom_mappings: Optional dict of custom country mappings; these
+             take precedence over the built-in mappings
+
+     Returns:
+         Column with standardized country name
+
+     Example:
+         df.select(addresses.standardize_country(F.col("country")))
+         # "US" -> "USA"
+         # "U.K." -> "United Kingdom"
+         # "Deutschland" -> "Germany"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Clean and normalize
+     upper_col = F.upper(F.trim(col))
+
+     # Chained F.when().otherwise() expressions evaluate the condition added
+     # LAST first, so apply the standard mappings first...
+     result = col
+     for variation, standard in COUNTRY_LOOKUP.items():
+         result = F.when(upper_col == variation, F.lit(standard)).otherwise(result)
+
+     # ...then layer custom mappings on top so they override the built-ins
+     if custom_mappings:
+         for original, standard in custom_mappings.items():
+             result = F.when(
+                 upper_col == F.upper(F.lit(original)), F.lit(standard)
+             ).otherwise(result)
+
+     return result
+
+
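+ # Usage sketch for custom mappings (illustrative; the override shown is a
+ # hypothetical value, not part of the built-in tables):
+ #
+ #     df.select(
+ #         addresses.standardize_country(
+ #             F.col("country"),
+ #             custom_mappings={"Holland": "Netherlands"},
+ #         )
+ #     )
+ #
+ # Custom entries win over built-ins because they are applied last.
+
+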
+ @addresses.register()
+ def extract_po_box(col: Column) -> Column:
+     """Extract PO Box number from address.
+
+     Extracts PO Box, P.O. Box, POB, and Post Office Box numbers.
+     Handles various formats, with and without periods and spaces.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted PO Box number or empty string
+
+     Example:
+         df.select(addresses.extract_po_box(F.col("address")))
+         # "PO Box 123" -> "123"
+         # "P.O. Box 456" -> "456"
+         # "POB 789" -> "789"
+         # "Post Office Box 1011" -> "1011"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to match various PO Box formats:
+     # PO Box, P.O. Box, POB, Post Office Box, etc.
+     # Captures the box number (numeric, alphanumeric, or with dashes, "#", "/").
+     # "POB" alone is ambiguous, so the lookahead requires it to be followed by
+     # a space and then a digit or "#" before it counts as a PO Box.
+     po_box_pattern = r"(?i)(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)"
+
+     # regexp_extract returns "" when the pattern does not match, so the null
+     # guard below is only a defensive fallback
+     result = F.regexp_extract(col, po_box_pattern, 1)
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
+
+
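+ # Usage sketch (illustrative): the capture group drops the "PO Box" label,
+ # so only the box identifier comes back.
+ #
+ #     df.select(addresses.extract_po_box(F.col("address")).alias("box"))
+ #
+ # "P.O. Box 456, Springfield" -> "456"
+
+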
+ @addresses.register()
+ def has_po_box(col: Column) -> Column:
+     """Check if address contains a PO Box.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with boolean indicating presence of PO Box
+
+     Example:
+         df.select(addresses.has_po_box(F.col("address")))
+         # "PO Box 123" -> True
+         # "123 Main St" -> False
+     """
+     return extract_po_box(col) != ""
+
+
+ @addresses.register()
+ def is_po_box_only(col: Column) -> Column:
+     """Check if address is ONLY a PO Box (no street address).
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with boolean indicating if address is PO Box only
+
+     Example:
+         df.select(addresses.is_po_box_only(F.col("address")))
+         # "PO Box 123" -> True
+         # "123 Main St, PO Box 456" -> False
+         # "PO Box 789, New York, NY" -> True
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Check if it has a PO Box
+     has_box = has_po_box(col)
+
+     # Check if it has a street number (indicating a street address)
+     # Pattern to detect street numbers at the beginning
+     street_pattern = r"^\d+\s+[A-Za-z]"
+     has_street = F.regexp_extract(col, street_pattern, 0) != ""
+
+     # It's PO Box only if it has a PO Box but no street address
+     return has_box & ~has_street
+
+
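+ # Usage sketch (illustrative): flag rows whose address is a box only, which
+ # often cannot receive courier or freight deliveries.
+ #
+ #     df.withColumn("po_box_only", addresses.is_po_box_only(F.col("address")))
+
+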
+ @addresses.register()
+ def remove_po_box(col: Column) -> Column:
+     """Remove PO Box from address.
+
+     Removes PO Box information while preserving other address components.
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with PO Box removed
+
+     Example:
+         df.select(addresses.remove_po_box(F.col("address")))
+         # "123 Main St, PO Box 456" -> "123 Main St"
+         # "PO Box 789, New York, NY" -> "New York, NY"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to match various PO Box formats with optional surrounding commas
+     po_box_pattern = r"(?i),?\s*(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)\s*,?"
+
+     # Replace the PO Box (and its commas) with a single comma, then tidy up
+     result = F.regexp_replace(col, po_box_pattern, ",")
+
+     # Clean up any leading/trailing commas or spaces
+     result = F.regexp_replace(result, r"^\s*,\s*", "")  # Leading comma
+     result = F.regexp_replace(result, r",?\s*$", "")  # Trailing comma/space
+     result = F.regexp_replace(result, r",\s*,+", ",")  # Collapse repeated commas
+     result = F.regexp_replace(result, r"\s+", " ")  # Collapse repeated spaces
+
+     return F.trim(result)
+
+
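+ # Usage sketch (illustrative): split an address into its street portion and
+ # its box number in a single projection.
+ #
+ #     df.select(
+ #         addresses.remove_po_box(F.col("address")).alias("street_part"),
+ #         addresses.extract_po_box(F.col("address")).alias("po_box"),
+ #     )
+ #
+ # "123 Main St, PO Box 456" -> ("123 Main St", "456")
+
+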
+ @addresses.register()
+ def standardize_po_box(col: Column) -> Column:
+     """Standardize PO Box format to a consistent representation.
+
+     Converts various PO Box formats to the standard "PO Box XXXX" format.
+
+     Args:
+         col: Column containing PO Box text
+
+     Returns:
+         Column with standardized PO Box format
+
+     Example:
+         df.select(addresses.standardize_po_box(F.col("po_box")))
+         # "P.O. Box 123" -> "PO Box 123"
+         # "POB 456" -> "PO Box 456"
+         # "Post Office Box 789" -> "PO Box 789"
+         # "123 Main St" -> "123 Main St" (no change if no PO Box)
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Extract the PO Box number (used only to decide whether to rewrite)
+     box_number = extract_po_box(col)
+
+     # If we found a PO Box, rewrite it in the standard format. "$1" is the
+     # Java-regex backreference to the captured box number; a plain string
+     # replacement keeps this compatible with Spark versions whose
+     # regexp_replace does not accept Column replacements.
+     result = F.when(
+         box_number != "",
+         F.regexp_replace(
+             col,
+             r"(?i)(?:P\.?\s?O\.?\s?Box|POB(?=\s+[#0-9])|Post\s+Office\s+Box)\s+(#?[A-Z0-9\-/]+)",
+             "PO Box $1",
+         ),
+     ).otherwise(col)
+
+     return result
+
+
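+ # Usage sketch (illustrative):
+ #
+ #     df.withColumn("address", addresses.standardize_po_box(F.col("address")))
+ #
+ # "P.O. Box 123, Springfield" -> "PO Box 123, Springfield"
+
+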
+ @addresses.register()
+ def extract_private_mailbox(col: Column) -> Column:
+     """Extract private mailbox (PMB) number from address.
+
+     Extracts PMB or Private Mail Box numbers, commonly used with
+     commercial mail receiving agencies (such as a UPS Store).
+
+     Args:
+         col: Column containing address text
+
+     Returns:
+         Column with extracted PMB number or empty string
+
+     Example:
+         df.select(addresses.extract_private_mailbox(F.col("address")))
+         # "123 Main St PMB 456" -> "456"
+         # "789 Oak Ave #101 PMB 12" -> "12"
+     """
+     # Handle nulls
+     col = F.when(col.isNull(), F.lit("")).otherwise(col)
+
+     # Pattern to match PMB (Private Mail Box) followed by its identifier
+     pmb_pattern = r"(?i)(?:PMB|Private\s+Mail\s+Box)\s+([A-Z0-9\-]+)"
+
+     result = F.regexp_extract(col, pmb_pattern, 1)
+     return F.when(result.isNull(), F.lit("")).otherwise(result)
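+
+
+ if __name__ == "__main__":
+     # Minimal smoke-test sketch (illustrative): requires a local PySpark
+     # installation, and is guarded under __main__ so importing this module
+     # stays side-effect free.
+     from pyspark.sql import SparkSession
+
+     spark = SparkSession.builder.master("local[1]").appName("addresses-demo").getOrCreate()
+     demo = spark.createDataFrame(
+         [
+             ("123 Main St, New York, USA",),
+             ("PO Box 789, New York, NY",),
+             ("123 Main St PMB 456",),
+         ],
+         ["address"],
+     )
+     demo.select(
+         "address",
+         addresses.extract_country(F.col("address")).alias("country"),
+         addresses.extract_po_box(F.col("address")).alias("po_box"),
+         addresses.extract_private_mailbox(F.col("address")).alias("pmb"),
+     ).show(truncate=False)
+     spark.stop()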