datacompose 0.2.5.2__py3-none-any.whl → 0.2.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/cli/commands/init.py +1 -1
- datacompose/operators/__init__.py +1 -1
- datacompose/operators/primitives.py +57 -19
- datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +10 -10
- datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +14 -14
- datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +59 -59
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/METADATA +1 -1
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/RECORD +13 -13
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/WHEEL +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.1.dist-info}/top_level.txt +0 -0
datacompose/cli/__init__.py
CHANGED
datacompose/cli/commands/init.py
CHANGED
|
@@ -380,7 +380,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
|
|
|
380
380
|
# Check if config already exists
|
|
381
381
|
if config_path.exists() and not force:
|
|
382
382
|
print(error(f"Configuration file already exists: {config_path}"))
|
|
383
|
-
print(dim("Use --force to overwrite"))
|
|
383
|
+
print(dim("Use datacompose init --force to overwrite"))
|
|
384
384
|
return 1
|
|
385
385
|
|
|
386
386
|
try:
|
|
@@ -16,9 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
18
18
|
try:
|
|
19
|
-
from pyspark.sql import Column
|
|
19
|
+
from pyspark.sql import Column
|
|
20
|
+
from pyspark.sql import functions as F
|
|
20
21
|
except ImportError:
|
|
21
|
-
|
|
22
|
+
logging.debug("PySpark not available")
|
|
23
|
+
|
|
24
|
+
# Set up module logger
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
class SmartPrimitive:
|
|
@@ -120,11 +124,15 @@ class PrimitiveRegistry:
|
|
|
120
124
|
self._primitives = {}
|
|
121
125
|
self._conditionals = {}
|
|
122
126
|
|
|
123
|
-
def register(
|
|
127
|
+
def register(
|
|
128
|
+
self, name: Optional[str] = None, is_conditional: Optional[bool] = None
|
|
129
|
+
):
|
|
124
130
|
"""Decorator to register a function as a SmartPrimitive in this namespace.
|
|
125
131
|
|
|
126
132
|
Args:
|
|
127
133
|
name: Optional name for the primitive (defaults to function name)
|
|
134
|
+
is_conditional: Optional flag to mark as conditional. If None, auto-detects
|
|
135
|
+
based on function name patterns.
|
|
128
136
|
|
|
129
137
|
Returns:
|
|
130
138
|
Decorator function that wraps the target function as a SmartPrimitive
|
|
@@ -139,7 +147,29 @@ class PrimitiveRegistry:
|
|
|
139
147
|
def decorator(func: Callable):
|
|
140
148
|
primitive_name = name or func.__name__
|
|
141
149
|
|
|
142
|
-
if
|
|
150
|
+
# Auto-detect conditional if not explicitly specified
|
|
151
|
+
if is_conditional is None:
|
|
152
|
+
# Check common naming patterns for conditional functions
|
|
153
|
+
conditional_patterns = [
|
|
154
|
+
"is_",
|
|
155
|
+
"has_",
|
|
156
|
+
"needs_",
|
|
157
|
+
"should_",
|
|
158
|
+
"can_",
|
|
159
|
+
"contains_",
|
|
160
|
+
"matches_",
|
|
161
|
+
"equals_",
|
|
162
|
+
"starts_with_",
|
|
163
|
+
"ends_with_",
|
|
164
|
+
]
|
|
165
|
+
is_conditional_auto = any(
|
|
166
|
+
primitive_name.startswith(pattern)
|
|
167
|
+
for pattern in conditional_patterns
|
|
168
|
+
)
|
|
169
|
+
else:
|
|
170
|
+
is_conditional_auto = is_conditional
|
|
171
|
+
|
|
172
|
+
if is_conditional_auto:
|
|
143
173
|
self._conditionals[primitive_name] = SmartPrimitive(
|
|
144
174
|
func, primitive_name
|
|
145
175
|
)
|
|
@@ -217,9 +247,17 @@ class PrimitiveRegistry:
|
|
|
217
247
|
pipeline.__doc__ = func.__doc__
|
|
218
248
|
return pipeline
|
|
219
249
|
|
|
250
|
+
# Auto-detect ALL namespace instances from func.__globals__
|
|
251
|
+
# This allows using multiple namespaces without explicitly passing them
|
|
252
|
+
for var_name, var_value in func.__globals__.items():
|
|
253
|
+
if isinstance(var_value, PrimitiveRegistry):
|
|
254
|
+
# Found a namespace instance
|
|
255
|
+
if var_name not in namespaces:
|
|
256
|
+
namespaces[var_name] = var_value
|
|
257
|
+
|
|
220
258
|
# Try to get the function as a string and parse it
|
|
221
259
|
try:
|
|
222
|
-
compiler = PipelineCompiler(namespaces, debug)
|
|
260
|
+
compiler = PipelineCompiler(namespaces, debug, func.__globals__)
|
|
223
261
|
pipeline = compiler.compile(func)
|
|
224
262
|
|
|
225
263
|
if debug and pipeline.steps:
|
|
@@ -270,7 +308,11 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
270
308
|
method_name = node.value.func.attr
|
|
271
309
|
namespace = (
|
|
272
310
|
namespaces.get(namespace_name) if namespace_name else None
|
|
273
|
-
) or (
|
|
311
|
+
) or (
|
|
312
|
+
func.__globals__.get(namespace_name)
|
|
313
|
+
if namespace_name
|
|
314
|
+
else None
|
|
315
|
+
)
|
|
274
316
|
if namespace and hasattr(namespace, method_name):
|
|
275
317
|
method = getattr(namespace, method_name)
|
|
276
318
|
|
|
@@ -312,16 +354,6 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
312
354
|
return pipeline
|
|
313
355
|
|
|
314
356
|
|
|
315
|
-
try:
|
|
316
|
-
from pyspark.sql import Column
|
|
317
|
-
from pyspark.sql import functions as F
|
|
318
|
-
except ImportError:
|
|
319
|
-
logging.debug("PySpark not available")
|
|
320
|
-
|
|
321
|
-
# Set up module logger
|
|
322
|
-
logger = logging.getLogger(__name__)
|
|
323
|
-
|
|
324
|
-
|
|
325
357
|
@dataclass
|
|
326
358
|
class CompiledStep:
|
|
327
359
|
"""A compiled pipeline step"""
|
|
@@ -452,9 +484,15 @@ class StablePipeline:
|
|
|
452
484
|
|
|
453
485
|
|
|
454
486
|
class PipelineCompiler:
|
|
455
|
-
def __init__(
|
|
487
|
+
def __init__(
|
|
488
|
+
self,
|
|
489
|
+
namespaces: Dict[str, Any],
|
|
490
|
+
debug: bool = False,
|
|
491
|
+
func_globals: Optional[Dict] = None,
|
|
492
|
+
):
|
|
456
493
|
self.namespaces = namespaces
|
|
457
494
|
self.debug = debug
|
|
495
|
+
self.func_globals = func_globals or {}
|
|
458
496
|
|
|
459
497
|
def compile(self, func: Callable) -> StablePipeline:
|
|
460
498
|
try:
|
|
@@ -530,7 +568,7 @@ class PipelineCompiler:
|
|
|
530
568
|
|
|
531
569
|
namespace = (
|
|
532
570
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
533
|
-
) or (
|
|
571
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
534
572
|
if namespace and hasattr(namespace, method_name):
|
|
535
573
|
method = getattr(namespace, method_name)
|
|
536
574
|
|
|
@@ -552,7 +590,7 @@ class PipelineCompiler:
|
|
|
552
590
|
|
|
553
591
|
namespace = (
|
|
554
592
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
555
|
-
) or (
|
|
593
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
556
594
|
if namespace and hasattr(namespace, method_name):
|
|
557
595
|
method = getattr(namespace, method_name)
|
|
558
596
|
|
|
@@ -544,7 +544,7 @@ def standardize_street_prefix(
|
|
|
544
544
|
|
|
545
545
|
Args:
|
|
546
546
|
col: Column containing street prefix
|
|
547
|
-
custom_mappings
|
|
547
|
+
custom_mappings (Optional): Dict of custom prefix mappings (case insensitive)
|
|
548
548
|
|
|
549
549
|
Returns:
|
|
550
550
|
Column with standardized prefix (always abbreviated per USPS standards)
|
|
@@ -614,7 +614,7 @@ def standardize_street_suffix(
|
|
|
614
614
|
|
|
615
615
|
Args:
|
|
616
616
|
col: Column containing street suffix
|
|
617
|
-
custom_mappings
|
|
617
|
+
custom_mappings (Optional): Dict of custom suffix mappings (case insensitive)
|
|
618
618
|
|
|
619
619
|
Returns:
|
|
620
620
|
Column with standardized suffix (always abbreviated per USPS standards)
|
|
@@ -896,7 +896,7 @@ def standardize_unit_type(
|
|
|
896
896
|
|
|
897
897
|
Args:
|
|
898
898
|
col: Column containing unit type
|
|
899
|
-
custom_mappings
|
|
899
|
+
custom_mappings (Optional): Dict of custom unit type mappings
|
|
900
900
|
|
|
901
901
|
Returns:
|
|
902
902
|
Column with standardized unit type
|
|
@@ -1206,7 +1206,7 @@ def extract_city(col: Column, custom_cities: Optional[List] = None) -> Column:
|
|
|
1206
1206
|
|
|
1207
1207
|
Args:
|
|
1208
1208
|
col: Column containing address text
|
|
1209
|
-
custom_cities
|
|
1209
|
+
custom_cities (Optional): List of custom city names to recognize (case-insensitive)
|
|
1210
1210
|
|
|
1211
1211
|
Returns:
|
|
1212
1212
|
Column with extracted city name or empty string if not found
|
|
@@ -1371,7 +1371,7 @@ def extract_state(col: Column, custom_states: Optional[Dict] = None) -> Column:
|
|
|
1371
1371
|
|
|
1372
1372
|
Args:
|
|
1373
1373
|
col: Column containing address text with state information
|
|
1374
|
-
custom_states
|
|
1374
|
+
custom_states (Optional): Dict mapping full state names to abbreviations
|
|
1375
1375
|
e.g., {"ONTARIO": "ON", "QUEBEC": "QC"}
|
|
1376
1376
|
|
|
1377
1377
|
Returns:
|
|
@@ -1445,9 +1445,9 @@ def validate_city(
|
|
|
1445
1445
|
|
|
1446
1446
|
Args:
|
|
1447
1447
|
col: Column containing city names to validate
|
|
1448
|
-
known_cities
|
|
1449
|
-
min_length: Minimum valid city name length (default 2)
|
|
1450
|
-
max_length: Maximum valid city name length (default 50)
|
|
1448
|
+
known_cities (Optional): List of valid city names to check against
|
|
1449
|
+
min_length (Optional): Minimum valid city name length (default 2)
|
|
1450
|
+
max_length (Optional): Maximum valid city name length (default 50)
|
|
1451
1451
|
|
|
1452
1452
|
Returns:
|
|
1453
1453
|
Boolean column indicating if city name is valid
|
|
@@ -1523,7 +1523,7 @@ def standardize_city(col: Column, custom_mappings: Optional[Dict] = None) -> Col
|
|
|
1523
1523
|
|
|
1524
1524
|
Args:
|
|
1525
1525
|
col: Column containing city names to standardize
|
|
1526
|
-
custom_mappings
|
|
1526
|
+
custom_mappings (Optional): Dict for city name corrections/standardization
|
|
1527
1527
|
e.g., {"ST LOUIS": "St. Louis", "NEWYORK": "New York"}
|
|
1528
1528
|
|
|
1529
1529
|
Returns:
|
|
@@ -1807,7 +1807,7 @@ def standardize_country(col: Column, custom_mappings: Optional[dict] = None) ->
|
|
|
1807
1807
|
|
|
1808
1808
|
Args:
|
|
1809
1809
|
col: Column containing country name or abbreviation
|
|
1810
|
-
custom_mappings
|
|
1810
|
+
custom_mappings (Optional): Dict of custom country mappings
|
|
1811
1811
|
|
|
1812
1812
|
Returns:
|
|
1813
1813
|
Column with standardized country name
|
|
@@ -255,8 +255,8 @@ def is_valid_email(col: Column, min_length: int = 6, max_length: int = 254) -> C
|
|
|
255
255
|
|
|
256
256
|
Args:
|
|
257
257
|
col: Column containing email address
|
|
258
|
-
min_length: Minimum length for valid email
|
|
259
|
-
max_length: Maximum length for valid email
|
|
258
|
+
min_length (Optional): Minimum length for valid email (default 6)
|
|
259
|
+
max_length (Optional): Maximum length for valid email (default 254)
|
|
260
260
|
|
|
261
261
|
Returns:
|
|
262
262
|
Column with boolean indicating validity
|
|
@@ -286,8 +286,8 @@ def is_valid_username(col: Column, min_length: int = 1, max_length: int = 64) ->
|
|
|
286
286
|
|
|
287
287
|
Args:
|
|
288
288
|
col: Column containing email address
|
|
289
|
-
min_length: Minimum length for valid username (default 1)
|
|
290
|
-
max_length: Maximum length for valid username (default 64 per RFC)
|
|
289
|
+
min_length (Optional): Minimum length for valid username (default 1)
|
|
290
|
+
max_length (Optional): Maximum length for valid username (default 64 per RFC)
|
|
291
291
|
|
|
292
292
|
Returns:
|
|
293
293
|
Column with boolean indicating username validity
|
|
@@ -351,7 +351,7 @@ def is_disposable_email(
|
|
|
351
351
|
|
|
352
352
|
Args:
|
|
353
353
|
col: Column containing email address
|
|
354
|
-
disposable_domains: List of disposable domains to check against
|
|
354
|
+
disposable_domains (Optional): List of disposable domains to check against
|
|
355
355
|
|
|
356
356
|
Returns:
|
|
357
357
|
Column with boolean indicating if email is disposable
|
|
@@ -389,7 +389,7 @@ def is_corporate_email(
|
|
|
389
389
|
|
|
390
390
|
Args:
|
|
391
391
|
col: Column containing email address
|
|
392
|
-
free_providers: List of free email provider domains to check against
|
|
392
|
+
free_providers (Optional): List of free email provider domains to check against
|
|
393
393
|
|
|
394
394
|
Returns:
|
|
395
395
|
Column with boolean indicating if email is corporate
|
|
@@ -535,8 +535,8 @@ def fix_common_typos(
|
|
|
535
535
|
|
|
536
536
|
Args:
|
|
537
537
|
col: Column containing email address
|
|
538
|
-
custom_mappings: Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
|
|
539
|
-
custom_tld_mappings: Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
|
|
538
|
+
custom_mappings (Optional): Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
|
|
539
|
+
custom_tld_mappings (Optional): Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
|
|
540
540
|
|
|
541
541
|
Returns:
|
|
542
542
|
Column with typos fixed
|
|
@@ -604,10 +604,10 @@ def standardize_email(
|
|
|
604
604
|
|
|
605
605
|
Args:
|
|
606
606
|
col: Column containing email address
|
|
607
|
-
lowercase: Convert to lowercase
|
|
608
|
-
remove_dots_gmail: Remove dots from Gmail addresses
|
|
609
|
-
remove_plus: Remove plus addressing
|
|
610
|
-
fix_typos: Fix common domain typos
|
|
607
|
+
lowercase (Optional): Convert to lowercase (default True)
|
|
608
|
+
remove_dots_gmail (Optional): Remove dots from Gmail addresses (default True)
|
|
609
|
+
remove_plus (Optional): Remove plus addressing (default False)
|
|
610
|
+
fix_typos (Optional): Fix common domain typos (default True)
|
|
611
611
|
|
|
612
612
|
Returns:
|
|
613
613
|
Column with standardized email
|
|
@@ -756,8 +756,8 @@ def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column
|
|
|
756
756
|
|
|
757
757
|
Args:
|
|
758
758
|
col: Column containing email address
|
|
759
|
-
mask_char: Character to use for masking
|
|
760
|
-
keep_chars: Number of characters to keep at start
|
|
759
|
+
mask_char (Optional): Character to use for masking (default "*")
|
|
760
|
+
keep_chars (Optional): Number of characters to keep at start (default 3)
|
|
761
761
|
|
|
762
762
|
Returns:
|
|
763
763
|
Column with masked email
|
|
@@ -3,7 +3,7 @@ Phone number transformation primitives for PySpark.
|
|
|
3
3
|
|
|
4
4
|
Preview Output:
|
|
5
5
|
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
6
|
-
|
|
|
6
|
+
|phone_numbers |standardized |is_valid|area_code|local_number|has_ext|extension|is_toll_free|
|
|
7
7
|
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
8
8
|
| (555) 123-4567 |(555) 123-4567 |true |555 |1234567 |false |null |false |
|
|
9
9
|
|+1-800-555-1234 |+1 800-555-1234 |true |800 |5551234 |false |null |true |
|
|
@@ -29,23 +29,23 @@ data = [
|
|
|
29
29
|
("123-45-67",),
|
|
30
30
|
("1-800-FLOWERS",),
|
|
31
31
|
]
|
|
32
|
-
df = spark.createDataFrame(data, ["
|
|
32
|
+
df = spark.createDataFrame(data, ["phone_numbers"])
|
|
33
33
|
|
|
34
34
|
# Apply transformations
|
|
35
35
|
result_df = df.select(
|
|
36
|
-
F.col("
|
|
37
|
-
phone_numbers.
|
|
38
|
-
phone_numbers.
|
|
36
|
+
F.col("phone_numbers"),
|
|
37
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers")).alias("standardized"),
|
|
38
|
+
phone_numbers.is_valid_phone_numbers(F.col("phone_numbers")).alias("is_valid"),
|
|
39
39
|
phone_numbers.extract_area_code(
|
|
40
|
-
phone_numbers.
|
|
40
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
41
41
|
).alias("area_code"),
|
|
42
42
|
phone_numbers.extract_local_number(
|
|
43
|
-
phone_numbers.
|
|
43
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
44
44
|
).alias("local_number"),
|
|
45
|
-
phone_numbers.has_extension(F.col("
|
|
46
|
-
phone_numbers.extract_extension(F.col("
|
|
45
|
+
phone_numbers.has_extension(F.col("phone_numbers")).alias("has_ext"),
|
|
46
|
+
phone_numbers.extract_extension(F.col("phone_numbers")).alias("extension"),
|
|
47
47
|
phone_numbers.is_toll_free(
|
|
48
|
-
phone_numbers.
|
|
48
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
49
49
|
).alias("is_toll_free")
|
|
50
50
|
)
|
|
51
51
|
|
|
@@ -118,7 +118,7 @@ PHONE_KEYPAD_MAPPING = {
|
|
|
118
118
|
|
|
119
119
|
|
|
120
120
|
@phone_numbers.register()
|
|
121
|
-
def
|
|
121
|
+
def extract_phone_numbers_from_text(col: Column) -> Column:
|
|
122
122
|
"""
|
|
123
123
|
Extract first phone number from text using regex patterns.
|
|
124
124
|
|
|
@@ -126,21 +126,21 @@ def extract_phone_from_text(col: Column) -> Column:
|
|
|
126
126
|
col: Column containing text with potential phone numbers
|
|
127
127
|
|
|
128
128
|
Returns:
|
|
129
|
-
Column with extracted phone
|
|
129
|
+
Column with extracted phone number or empty string
|
|
130
130
|
"""
|
|
131
|
-
# Comprehensive
|
|
131
|
+
# Comprehensive phone number pattern that matches various formats
|
|
132
132
|
# Handles: +1-555-123-4567, (555) 123-4567, 555.123.4567, 555-123-4567, etc.
|
|
133
|
-
|
|
133
|
+
phone_numbers_pattern = (
|
|
134
134
|
r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
|
|
135
135
|
)
|
|
136
136
|
|
|
137
137
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
138
|
-
F.regexp_extract(col,
|
|
138
|
+
F.regexp_extract(col, phone_numbers_pattern, 0)
|
|
139
139
|
)
|
|
140
140
|
|
|
141
141
|
|
|
142
142
|
@phone_numbers.register()
|
|
143
|
-
def
|
|
143
|
+
def extract_all_phone_numbers_from_text(col: Column) -> Column:
|
|
144
144
|
"""
|
|
145
145
|
Extract all phone numbers from text as an array.
|
|
146
146
|
|
|
@@ -150,13 +150,13 @@ def extract_all_phones_from_text(col: Column) -> Column:
|
|
|
150
150
|
Returns:
|
|
151
151
|
Column with array of phone numbers
|
|
152
152
|
"""
|
|
153
|
-
# For simplicity, we'll return an array with just the first
|
|
153
|
+
# For simplicity, we'll return an array with just the first phone number found
|
|
154
154
|
# A proper implementation would require more complex regex or UDF
|
|
155
155
|
# This is a limitation of Spark SQL's regex capabilities
|
|
156
|
-
|
|
156
|
+
first_phone_numbers = extract_phone_numbers_from_text(col)
|
|
157
157
|
|
|
158
158
|
# Return array with single element or empty array
|
|
159
|
-
return F.when(
|
|
159
|
+
return F.when(first_phone_numbers != "", F.array(first_phone_numbers)).otherwise(F.array())
|
|
160
160
|
|
|
161
161
|
|
|
162
162
|
@phone_numbers.register()
|
|
@@ -350,8 +350,8 @@ def is_valid_international(
|
|
|
350
350
|
|
|
351
351
|
Args:
|
|
352
352
|
col: Column containing phone number
|
|
353
|
-
min_length: Minimum digits for international number
|
|
354
|
-
max_length: Maximum digits for international number
|
|
353
|
+
min_length (Optional): Minimum digits for international number (default 7)
|
|
354
|
+
max_length (Optional): Maximum digits for international number (default 15)
|
|
355
355
|
|
|
356
356
|
Returns:
|
|
357
357
|
Column with boolean indicating potential international validity
|
|
@@ -366,7 +366,7 @@ def is_valid_international(
|
|
|
366
366
|
|
|
367
367
|
|
|
368
368
|
@phone_numbers.register()
|
|
369
|
-
def
|
|
369
|
+
def is_valid_phone_numbers(col: Column) -> Column:
|
|
370
370
|
"""
|
|
371
371
|
Check if phone number is valid (NANP or international).
|
|
372
372
|
|
|
@@ -403,7 +403,7 @@ def is_premium_rate(col: Column) -> Column:
|
|
|
403
403
|
Check if phone number is premium rate (900).
|
|
404
404
|
|
|
405
405
|
Args:
|
|
406
|
-
col: Column containing
|
|
406
|
+
col: Column containing phone number
|
|
407
407
|
|
|
408
408
|
Returns:
|
|
409
409
|
Column with boolean indicating if premium rate
|
|
@@ -546,11 +546,11 @@ def format_nanp(col: Column) -> Column:
|
|
|
546
546
|
"""
|
|
547
547
|
# Remove extension for validation but preserve it
|
|
548
548
|
extension = extract_extension(col)
|
|
549
|
-
|
|
549
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
550
550
|
|
|
551
|
-
area_code = extract_area_code(
|
|
552
|
-
exchange = extract_exchange(
|
|
553
|
-
subscriber = extract_subscriber(
|
|
551
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
552
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
553
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
554
554
|
|
|
555
555
|
base_format = F.concat(area_code, F.lit("-"), exchange, F.lit("-"), subscriber)
|
|
556
556
|
|
|
@@ -559,7 +559,7 @@ def format_nanp(col: Column) -> Column:
|
|
|
559
559
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
560
560
|
).otherwise(base_format)
|
|
561
561
|
|
|
562
|
-
return F.when(is_valid_nanp(
|
|
562
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
563
563
|
|
|
564
564
|
|
|
565
565
|
@phone_numbers.register()
|
|
@@ -575,11 +575,11 @@ def format_nanp_paren(col: Column) -> Column:
|
|
|
575
575
|
"""
|
|
576
576
|
# Remove extension for validation but preserve it
|
|
577
577
|
extension = extract_extension(col)
|
|
578
|
-
|
|
578
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
579
579
|
|
|
580
|
-
area_code = extract_area_code(
|
|
581
|
-
exchange = extract_exchange(
|
|
582
|
-
subscriber = extract_subscriber(
|
|
580
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
581
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
582
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
583
583
|
|
|
584
584
|
base_format = F.concat(
|
|
585
585
|
F.lit("("), area_code, F.lit(") "), exchange, F.lit("-"), subscriber
|
|
@@ -590,7 +590,7 @@ def format_nanp_paren(col: Column) -> Column:
|
|
|
590
590
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
591
591
|
).otherwise(base_format)
|
|
592
592
|
|
|
593
|
-
return F.when(is_valid_nanp(
|
|
593
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
594
594
|
|
|
595
595
|
|
|
596
596
|
@phone_numbers.register()
|
|
@@ -606,11 +606,11 @@ def format_nanp_dot(col: Column) -> Column:
|
|
|
606
606
|
"""
|
|
607
607
|
# Remove extension for validation but preserve it
|
|
608
608
|
extension = extract_extension(col)
|
|
609
|
-
|
|
609
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
610
610
|
|
|
611
|
-
area_code = extract_area_code(
|
|
612
|
-
exchange = extract_exchange(
|
|
613
|
-
subscriber = extract_subscriber(
|
|
611
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
612
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
613
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
614
614
|
|
|
615
615
|
base_format = F.concat(area_code, F.lit("."), exchange, F.lit("."), subscriber)
|
|
616
616
|
|
|
@@ -619,7 +619,7 @@ def format_nanp_dot(col: Column) -> Column:
|
|
|
619
619
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
620
620
|
).otherwise(base_format)
|
|
621
621
|
|
|
622
|
-
return F.when(is_valid_nanp(
|
|
622
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
623
623
|
|
|
624
624
|
|
|
625
625
|
@phone_numbers.register()
|
|
@@ -635,11 +635,11 @@ def format_nanp_space(col: Column) -> Column:
|
|
|
635
635
|
"""
|
|
636
636
|
# Remove extension for validation but preserve it
|
|
637
637
|
extension = extract_extension(col)
|
|
638
|
-
|
|
638
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
639
639
|
|
|
640
|
-
area_code = extract_area_code(
|
|
641
|
-
exchange = extract_exchange(
|
|
642
|
-
subscriber = extract_subscriber(
|
|
640
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
641
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
642
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
643
643
|
|
|
644
644
|
base_format = F.concat(area_code, F.lit(" "), exchange, F.lit(" "), subscriber)
|
|
645
645
|
|
|
@@ -648,7 +648,7 @@ def format_nanp_space(col: Column) -> Column:
|
|
|
648
648
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
649
649
|
).otherwise(base_format)
|
|
650
650
|
|
|
651
|
-
return F.when(is_valid_nanp(
|
|
651
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
652
652
|
|
|
653
653
|
|
|
654
654
|
@phone_numbers.register()
|
|
@@ -707,7 +707,7 @@ def format_e164(col: Column) -> Column:
|
|
|
707
707
|
|
|
708
708
|
# Build E.164 format - only for valid phones
|
|
709
709
|
return F.when(
|
|
710
|
-
|
|
710
|
+
is_valid_phone_numbers(col),
|
|
711
711
|
F.when(
|
|
712
712
|
(F.length(digits) == 10) & is_nanp, F.concat(F.lit("+"), F.lit("1"), digits)
|
|
713
713
|
)
|
|
@@ -729,7 +729,7 @@ def format_e164(col: Column) -> Column:
|
|
|
729
729
|
|
|
730
730
|
|
|
731
731
|
@phone_numbers.register()
|
|
732
|
-
def
|
|
732
|
+
def standardize_phone_numbers(col: Column) -> Column:
|
|
733
733
|
"""
|
|
734
734
|
Standardize phone number with cleaning and NANP formatting.
|
|
735
735
|
|
|
@@ -783,7 +783,7 @@ def standardize_phone(col: Column) -> Column:
|
|
|
783
783
|
|
|
784
784
|
|
|
785
785
|
@phone_numbers.register()
|
|
786
|
-
def
|
|
786
|
+
def standardize_phone_numbers_e164(col: Column) -> Column:
|
|
787
787
|
"""
|
|
788
788
|
Standardize phone number with cleaning and E.164 formatting.
|
|
789
789
|
|
|
@@ -800,11 +800,11 @@ def standardize_phone_e164(col: Column) -> Column:
|
|
|
800
800
|
result = format_e164(cleaned)
|
|
801
801
|
|
|
802
802
|
# Only return valid phone numbers
|
|
803
|
-
return F.when(
|
|
803
|
+
return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
|
|
804
804
|
|
|
805
805
|
|
|
806
806
|
@phone_numbers.register()
|
|
807
|
-
def
|
|
807
|
+
def standardize_phone_numbers_digits(col: Column) -> Column:
|
|
808
808
|
"""
|
|
809
809
|
Standardize phone number and return digits only.
|
|
810
810
|
|
|
@@ -821,11 +821,11 @@ def standardize_phone_digits(col: Column) -> Column:
|
|
|
821
821
|
result = extract_digits(cleaned)
|
|
822
822
|
|
|
823
823
|
# Only return valid phone numbers
|
|
824
|
-
return F.when(
|
|
824
|
+
return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
|
|
825
825
|
|
|
826
826
|
|
|
827
827
|
@phone_numbers.register()
|
|
828
|
-
def
|
|
828
|
+
def clean_phone_numbers(col: Column) -> Column:
|
|
829
829
|
"""
|
|
830
830
|
Clean and validate phone number, returning null for invalid numbers.
|
|
831
831
|
|
|
@@ -873,12 +873,12 @@ def clean_phone(col: Column) -> Column:
|
|
|
873
873
|
|
|
874
874
|
|
|
875
875
|
@phone_numbers.register()
|
|
876
|
-
def
|
|
876
|
+
def get_phone_numbers_type(col: Column) -> Column:
|
|
877
877
|
"""
|
|
878
878
|
Get phone number type (toll-free, premium, standard, international).
|
|
879
879
|
|
|
880
880
|
Args:
|
|
881
|
-
col: Column containing
|
|
881
|
+
col: Column containing phone number
|
|
882
882
|
|
|
883
883
|
Returns:
|
|
884
884
|
Column with phone type
|
|
@@ -923,7 +923,7 @@ def get_region_from_area_code(col: Column) -> Column:
|
|
|
923
923
|
|
|
924
924
|
|
|
925
925
|
@phone_numbers.register()
|
|
926
|
-
def
|
|
926
|
+
def mask_phone_numbers(col: Column) -> Column:
|
|
927
927
|
"""
|
|
928
928
|
Mask phone number for privacy keeping last 4 digits (e.g., ***-***-1234).
|
|
929
929
|
|
|
@@ -950,9 +950,9 @@ def mask_phone(col: Column) -> Column:
|
|
|
950
950
|
|
|
951
951
|
|
|
952
952
|
@phone_numbers.register()
|
|
953
|
-
def
|
|
953
|
+
def filter_valid_phone_numbers_numbers(col: Column) -> Column:
|
|
954
954
|
"""
|
|
955
|
-
Return
|
|
955
|
+
Return phone number only if valid, otherwise return null.
|
|
956
956
|
|
|
957
957
|
Args:
|
|
958
958
|
col: Column containing phone number
|
|
@@ -960,13 +960,13 @@ def filter_valid_phones(col: Column) -> Column:
|
|
|
960
960
|
Returns:
|
|
961
961
|
Column with valid phone or null
|
|
962
962
|
"""
|
|
963
|
-
return F.when(
|
|
963
|
+
return F.when(is_valid_phone_numbers(col), col).otherwise(F.lit(None))
|
|
964
964
|
|
|
965
965
|
|
|
966
966
|
@phone_numbers.register()
|
|
967
|
-
def
|
|
967
|
+
def filter_nanp_phone_numbers_numbers(col: Column) -> Column:
|
|
968
968
|
"""
|
|
969
|
-
Return
|
|
969
|
+
Return phone number only if valid NANP, otherwise return null.
|
|
970
970
|
|
|
971
971
|
Args:
|
|
972
972
|
col: Column containing phone number
|
|
@@ -978,7 +978,7 @@ def filter_nanp_phones(col: Column) -> Column:
|
|
|
978
978
|
|
|
979
979
|
|
|
980
980
|
@phone_numbers.register()
|
|
981
|
-
def
|
|
981
|
+
def filter_toll_free_phone_numbers_numbers(col: Column) -> Column:
|
|
982
982
|
"""
|
|
983
983
|
Return phone number only if toll-free, otherwise return null.
|
|
984
984
|
|
|
@@ -1,31 +1,31 @@
|
|
|
1
1
|
datacompose/__init__.py,sha256=kbQEzEvheAMsm3-MT4nWSuX42fjrfgDWoR8WZmC_WX4,34
|
|
2
|
-
datacompose/cli/__init__.py,sha256=
|
|
2
|
+
datacompose/cli/__init__.py,sha256=UuHpU3QypWZC9NEjI89BNUW_PcRfMUyJbZDKJvAm1hQ,109
|
|
3
3
|
datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,1756
|
|
4
4
|
datacompose/cli/config.py,sha256=vvY6xGgIdybUuujdPrDI_CsSUUD9CEfODG8Kem4jqVQ,2353
|
|
5
5
|
datacompose/cli/main.py,sha256=d87hG1nxDVNTRZVi7ctNQQ06lwe5KbLGJMChlHtR3Kc,1343
|
|
6
6
|
datacompose/cli/validation.py,sha256=8WMZ9wtPgFk9eBgMS_wtkncFz_-BmH4E8V57tjp3YoI,2526
|
|
7
7
|
datacompose/cli/commands/__init__.py,sha256=Bu58UsnkGRbVFS92U2Px_KxlUPrdlbSY6wlvP6tet2o,38
|
|
8
8
|
datacompose/cli/commands/add.py,sha256=N4tqtOEXmaP6vpO4ZfhFH6qBznmFOYRU3y9TEvc2da0,7707
|
|
9
|
-
datacompose/cli/commands/init.py,sha256=
|
|
9
|
+
datacompose/cli/commands/init.py,sha256=44QZEyqjAXOahfbFNBZAnEq2JCzT6UDB0GqC-fmyOko,18003
|
|
10
10
|
datacompose/cli/commands/list.py,sha256=mXihUMrnwLUoIG-FpNb8-XJ0VZfh0v3exHq1m_Mrprg,3855
|
|
11
11
|
datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwLSevZw,48
|
|
12
12
|
datacompose/generators/base.py,sha256=EgpHwaaSxAP1Ygq5Wtyq4ez-wG0oPwDEbiKgLsEilD0,6761
|
|
13
13
|
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
14
14
|
datacompose/generators/pyspark/generator.py,sha256=be4GATA8rmLAg4_wZ3Ox3vC3up_OXMOajjIUJQrDQ10,1735
|
|
15
|
-
datacompose/operators/__init__.py,sha256=
|
|
16
|
-
datacompose/operators/primitives.py,sha256=
|
|
15
|
+
datacompose/operators/__init__.py,sha256=Eacc0JDCzeuTeLGO_N9Nz9pOc1D3_6BxEcpCibzrpz8,588
|
|
16
|
+
datacompose/operators/primitives.py,sha256=FxhtgP7aizKsnNBgh5oTqwc9m8QSjLTpRoG5zu6rFns,23615
|
|
17
17
|
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
datacompose/transformers/discovery.py,sha256=JgMtmd8PkhmwLqS17NeKSv9MRneY9tdOsOK32A6luOQ,7048
|
|
19
19
|
datacompose/transformers/text/__init__.py,sha256=Mq0UmgYlBV8T18IkvHAS1TImEzWyGciCqxaCv324hFQ,36
|
|
20
20
|
datacompose/transformers/text/addresses/__init__.py,sha256=l5TItGrGBn69Mlq0CaRGJa-SwpyuUEYWvG5N26s3Pco,39
|
|
21
|
-
datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=
|
|
21
|
+
datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=u6vu5wevFc_15jYbIfM50rrEfRtJn_P-8MKEtceX08k,62427
|
|
22
22
|
datacompose/transformers/text/emails/__init__.py,sha256=snZLOJsxrPDOi8gIISlRxc6YlskKxUyu0NnOZCE5cIU,34
|
|
23
|
-
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=
|
|
23
|
+
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=TSuUrILHHK3SjGC-J7YICgzQqj-FOYcaXyfhRaEg4d4,24052
|
|
24
24
|
datacompose/transformers/text/phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=
|
|
26
|
-
datacompose-0.2.
|
|
27
|
-
datacompose-0.2.
|
|
28
|
-
datacompose-0.2.
|
|
29
|
-
datacompose-0.2.
|
|
30
|
-
datacompose-0.2.
|
|
31
|
-
datacompose-0.2.
|
|
25
|
+
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=O9wjGi9S_0sp7xkycMGUccp1XSgln9XJ9Mfs25FVyQU,28903
|
|
26
|
+
datacompose-0.2.6.1.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
27
|
+
datacompose-0.2.6.1.dist-info/METADATA,sha256=geD3OWaiLYbprfVhuf0wEXdX54o6Jei3frYnmAnFhhQ,4352
|
|
28
|
+
datacompose-0.2.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
+
datacompose-0.2.6.1.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
30
|
+
datacompose-0.2.6.1.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
31
|
+
datacompose-0.2.6.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|