datacompose 0.2.5.2__py3-none-any.whl → 0.2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/operators/__init__.py +1 -1
- datacompose/operators/primitives.py +57 -19
- datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +57 -57
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/METADATA +1 -1
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/RECORD +10 -10
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/WHEEL +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.5.2.dist-info → datacompose-0.2.6.0.dist-info}/top_level.txt +0 -0
datacompose/cli/__init__.py
CHANGED
|
@@ -16,9 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
18
18
|
try:
|
|
19
|
-
from pyspark.sql import Column
|
|
19
|
+
from pyspark.sql import Column
|
|
20
|
+
from pyspark.sql import functions as F
|
|
20
21
|
except ImportError:
|
|
21
|
-
|
|
22
|
+
logging.debug("PySpark not available")
|
|
23
|
+
|
|
24
|
+
# Set up module logger
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
class SmartPrimitive:
|
|
@@ -120,11 +124,15 @@ class PrimitiveRegistry:
|
|
|
120
124
|
self._primitives = {}
|
|
121
125
|
self._conditionals = {}
|
|
122
126
|
|
|
123
|
-
def register(
|
|
127
|
+
def register(
|
|
128
|
+
self, name: Optional[str] = None, is_conditional: Optional[bool] = None
|
|
129
|
+
):
|
|
124
130
|
"""Decorator to register a function as a SmartPrimitive in this namespace.
|
|
125
131
|
|
|
126
132
|
Args:
|
|
127
133
|
name: Optional name for the primitive (defaults to function name)
|
|
134
|
+
is_conditional: Optional flag to mark as conditional. If None, auto-detects
|
|
135
|
+
based on function name patterns.
|
|
128
136
|
|
|
129
137
|
Returns:
|
|
130
138
|
Decorator function that wraps the target function as a SmartPrimitive
|
|
@@ -139,7 +147,29 @@ class PrimitiveRegistry:
|
|
|
139
147
|
def decorator(func: Callable):
|
|
140
148
|
primitive_name = name or func.__name__
|
|
141
149
|
|
|
142
|
-
if
|
|
150
|
+
# Auto-detect conditional if not explicitly specified
|
|
151
|
+
if is_conditional is None:
|
|
152
|
+
# Check common naming patterns for conditional functions
|
|
153
|
+
conditional_patterns = [
|
|
154
|
+
"is_",
|
|
155
|
+
"has_",
|
|
156
|
+
"needs_",
|
|
157
|
+
"should_",
|
|
158
|
+
"can_",
|
|
159
|
+
"contains_",
|
|
160
|
+
"matches_",
|
|
161
|
+
"equals_",
|
|
162
|
+
"starts_with_",
|
|
163
|
+
"ends_with_",
|
|
164
|
+
]
|
|
165
|
+
is_conditional_auto = any(
|
|
166
|
+
primitive_name.startswith(pattern)
|
|
167
|
+
for pattern in conditional_patterns
|
|
168
|
+
)
|
|
169
|
+
else:
|
|
170
|
+
is_conditional_auto = is_conditional
|
|
171
|
+
|
|
172
|
+
if is_conditional_auto:
|
|
143
173
|
self._conditionals[primitive_name] = SmartPrimitive(
|
|
144
174
|
func, primitive_name
|
|
145
175
|
)
|
|
@@ -217,9 +247,17 @@ class PrimitiveRegistry:
|
|
|
217
247
|
pipeline.__doc__ = func.__doc__
|
|
218
248
|
return pipeline
|
|
219
249
|
|
|
250
|
+
# Auto-detect ALL namespace instances from func.__globals__
|
|
251
|
+
# This allows using multiple namespaces without explicitly passing them
|
|
252
|
+
for var_name, var_value in func.__globals__.items():
|
|
253
|
+
if isinstance(var_value, PrimitiveRegistry):
|
|
254
|
+
# Found a namespace instance
|
|
255
|
+
if var_name not in namespaces:
|
|
256
|
+
namespaces[var_name] = var_value
|
|
257
|
+
|
|
220
258
|
# Try to get the function as a string and parse it
|
|
221
259
|
try:
|
|
222
|
-
compiler = PipelineCompiler(namespaces, debug)
|
|
260
|
+
compiler = PipelineCompiler(namespaces, debug, func.__globals__)
|
|
223
261
|
pipeline = compiler.compile(func)
|
|
224
262
|
|
|
225
263
|
if debug and pipeline.steps:
|
|
@@ -270,7 +308,11 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
270
308
|
method_name = node.value.func.attr
|
|
271
309
|
namespace = (
|
|
272
310
|
namespaces.get(namespace_name) if namespace_name else None
|
|
273
|
-
) or (
|
|
311
|
+
) or (
|
|
312
|
+
func.__globals__.get(namespace_name)
|
|
313
|
+
if namespace_name
|
|
314
|
+
else None
|
|
315
|
+
)
|
|
274
316
|
if namespace and hasattr(namespace, method_name):
|
|
275
317
|
method = getattr(namespace, method_name)
|
|
276
318
|
|
|
@@ -312,16 +354,6 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
312
354
|
return pipeline
|
|
313
355
|
|
|
314
356
|
|
|
315
|
-
try:
|
|
316
|
-
from pyspark.sql import Column
|
|
317
|
-
from pyspark.sql import functions as F
|
|
318
|
-
except ImportError:
|
|
319
|
-
logging.debug("PySpark not available")
|
|
320
|
-
|
|
321
|
-
# Set up module logger
|
|
322
|
-
logger = logging.getLogger(__name__)
|
|
323
|
-
|
|
324
|
-
|
|
325
357
|
@dataclass
|
|
326
358
|
class CompiledStep:
|
|
327
359
|
"""A compiled pipeline step"""
|
|
@@ -452,9 +484,15 @@ class StablePipeline:
|
|
|
452
484
|
|
|
453
485
|
|
|
454
486
|
class PipelineCompiler:
|
|
455
|
-
def __init__(
|
|
487
|
+
def __init__(
|
|
488
|
+
self,
|
|
489
|
+
namespaces: Dict[str, Any],
|
|
490
|
+
debug: bool = False,
|
|
491
|
+
func_globals: Optional[Dict] = None,
|
|
492
|
+
):
|
|
456
493
|
self.namespaces = namespaces
|
|
457
494
|
self.debug = debug
|
|
495
|
+
self.func_globals = func_globals or {}
|
|
458
496
|
|
|
459
497
|
def compile(self, func: Callable) -> StablePipeline:
|
|
460
498
|
try:
|
|
@@ -530,7 +568,7 @@ class PipelineCompiler:
|
|
|
530
568
|
|
|
531
569
|
namespace = (
|
|
532
570
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
533
|
-
) or (
|
|
571
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
534
572
|
if namespace and hasattr(namespace, method_name):
|
|
535
573
|
method = getattr(namespace, method_name)
|
|
536
574
|
|
|
@@ -552,7 +590,7 @@ class PipelineCompiler:
|
|
|
552
590
|
|
|
553
591
|
namespace = (
|
|
554
592
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
555
|
-
) or (
|
|
593
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
556
594
|
if namespace and hasattr(namespace, method_name):
|
|
557
595
|
method = getattr(namespace, method_name)
|
|
558
596
|
|
|
@@ -3,7 +3,7 @@ Phone number transformation primitives for PySpark.
|
|
|
3
3
|
|
|
4
4
|
Preview Output:
|
|
5
5
|
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
6
|
-
|
|
|
6
|
+
|phone_numbers |standardized |is_valid|area_code|local_number|has_ext|extension|is_toll_free|
|
|
7
7
|
+------------------------+----------------+--------+---------+------------+-------+---------+------------+
|
|
8
8
|
| (555) 123-4567 |(555) 123-4567 |true |555 |1234567 |false |null |false |
|
|
9
9
|
|+1-800-555-1234 |+1 800-555-1234 |true |800 |5551234 |false |null |true |
|
|
@@ -29,23 +29,23 @@ data = [
|
|
|
29
29
|
("123-45-67",),
|
|
30
30
|
("1-800-FLOWERS",),
|
|
31
31
|
]
|
|
32
|
-
df = spark.createDataFrame(data, ["
|
|
32
|
+
df = spark.createDataFrame(data, ["phone_numbers"])
|
|
33
33
|
|
|
34
34
|
# Apply transformations
|
|
35
35
|
result_df = df.select(
|
|
36
|
-
F.col("
|
|
37
|
-
phone_numbers.
|
|
38
|
-
phone_numbers.
|
|
36
|
+
F.col("phone_numbers"),
|
|
37
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers")).alias("standardized"),
|
|
38
|
+
phone_numbers.is_valid_phone_numbers(F.col("phone_numbers")).alias("is_valid"),
|
|
39
39
|
phone_numbers.extract_area_code(
|
|
40
|
-
phone_numbers.
|
|
40
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
41
41
|
).alias("area_code"),
|
|
42
42
|
phone_numbers.extract_local_number(
|
|
43
|
-
phone_numbers.
|
|
43
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
44
44
|
).alias("local_number"),
|
|
45
|
-
phone_numbers.has_extension(F.col("
|
|
46
|
-
phone_numbers.extract_extension(F.col("
|
|
45
|
+
phone_numbers.has_extension(F.col("phone_numbers")).alias("has_ext"),
|
|
46
|
+
phone_numbers.extract_extension(F.col("phone_numbers")).alias("extension"),
|
|
47
47
|
phone_numbers.is_toll_free(
|
|
48
|
-
phone_numbers.
|
|
48
|
+
phone_numbers.standardize_phone_numbers(F.col("phone_numbers"))
|
|
49
49
|
).alias("is_toll_free")
|
|
50
50
|
)
|
|
51
51
|
|
|
@@ -118,7 +118,7 @@ PHONE_KEYPAD_MAPPING = {
|
|
|
118
118
|
|
|
119
119
|
|
|
120
120
|
@phone_numbers.register()
|
|
121
|
-
def
|
|
121
|
+
def extract_phone_numbers_from_text(col: Column) -> Column:
|
|
122
122
|
"""
|
|
123
123
|
Extract first phone number from text using regex patterns.
|
|
124
124
|
|
|
@@ -126,21 +126,21 @@ def extract_phone_from_text(col: Column) -> Column:
|
|
|
126
126
|
col: Column containing text with potential phone numbers
|
|
127
127
|
|
|
128
128
|
Returns:
|
|
129
|
-
Column with extracted phone
|
|
129
|
+
Column with extracted phone number or empty string
|
|
130
130
|
"""
|
|
131
|
-
# Comprehensive
|
|
131
|
+
# Comprehensive phone number pattern that matches various formats
|
|
132
132
|
# Handles: +1-555-123-4567, (555) 123-4567, 555.123.4567, 555-123-4567, etc.
|
|
133
|
-
|
|
133
|
+
phone_numbers_pattern = (
|
|
134
134
|
r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(\s*(ext|x)\.?\s*\d+)?"
|
|
135
135
|
)
|
|
136
136
|
|
|
137
137
|
return F.when(col.isNull(), F.lit("")).otherwise(
|
|
138
|
-
F.regexp_extract(col,
|
|
138
|
+
F.regexp_extract(col, phone_numbers_pattern, 0)
|
|
139
139
|
)
|
|
140
140
|
|
|
141
141
|
|
|
142
142
|
@phone_numbers.register()
|
|
143
|
-
def
|
|
143
|
+
def extract_all_phone_numbers_from_text(col: Column) -> Column:
|
|
144
144
|
"""
|
|
145
145
|
Extract all phone numbers from text as an array.
|
|
146
146
|
|
|
@@ -150,13 +150,13 @@ def extract_all_phones_from_text(col: Column) -> Column:
|
|
|
150
150
|
Returns:
|
|
151
151
|
Column with array of phone numbers
|
|
152
152
|
"""
|
|
153
|
-
# For simplicity, we'll return an array with just the first
|
|
153
|
+
# For simplicity, we'll return an array with just the first phone number found
|
|
154
154
|
# A proper implementation would require more complex regex or UDF
|
|
155
155
|
# This is a limitation of Spark SQL's regex capabilities
|
|
156
|
-
|
|
156
|
+
first_phone_numbers = extract_phone_numbers_from_text(col)
|
|
157
157
|
|
|
158
158
|
# Return array with single element or empty array
|
|
159
|
-
return F.when(
|
|
159
|
+
return F.when(first_phone_numbers != "", F.array(first_phone_numbers)).otherwise(F.array())
|
|
160
160
|
|
|
161
161
|
|
|
162
162
|
@phone_numbers.register()
|
|
@@ -366,7 +366,7 @@ def is_valid_international(
|
|
|
366
366
|
|
|
367
367
|
|
|
368
368
|
@phone_numbers.register()
|
|
369
|
-
def
|
|
369
|
+
def is_valid_phone_numbers(col: Column) -> Column:
|
|
370
370
|
"""
|
|
371
371
|
Check if phone number is valid (NANP or international).
|
|
372
372
|
|
|
@@ -403,7 +403,7 @@ def is_premium_rate(col: Column) -> Column:
|
|
|
403
403
|
Check if phone number is premium rate (900).
|
|
404
404
|
|
|
405
405
|
Args:
|
|
406
|
-
col: Column containing
|
|
406
|
+
col: Column containing phone number
|
|
407
407
|
|
|
408
408
|
Returns:
|
|
409
409
|
Column with boolean indicating if premium rate
|
|
@@ -546,11 +546,11 @@ def format_nanp(col: Column) -> Column:
|
|
|
546
546
|
"""
|
|
547
547
|
# Remove extension for validation but preserve it
|
|
548
548
|
extension = extract_extension(col)
|
|
549
|
-
|
|
549
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
550
550
|
|
|
551
|
-
area_code = extract_area_code(
|
|
552
|
-
exchange = extract_exchange(
|
|
553
|
-
subscriber = extract_subscriber(
|
|
551
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
552
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
553
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
554
554
|
|
|
555
555
|
base_format = F.concat(area_code, F.lit("-"), exchange, F.lit("-"), subscriber)
|
|
556
556
|
|
|
@@ -559,7 +559,7 @@ def format_nanp(col: Column) -> Column:
|
|
|
559
559
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
560
560
|
).otherwise(base_format)
|
|
561
561
|
|
|
562
|
-
return F.when(is_valid_nanp(
|
|
562
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
563
563
|
|
|
564
564
|
|
|
565
565
|
@phone_numbers.register()
|
|
@@ -575,11 +575,11 @@ def format_nanp_paren(col: Column) -> Column:
|
|
|
575
575
|
"""
|
|
576
576
|
# Remove extension for validation but preserve it
|
|
577
577
|
extension = extract_extension(col)
|
|
578
|
-
|
|
578
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
579
579
|
|
|
580
|
-
area_code = extract_area_code(
|
|
581
|
-
exchange = extract_exchange(
|
|
582
|
-
subscriber = extract_subscriber(
|
|
580
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
581
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
582
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
583
583
|
|
|
584
584
|
base_format = F.concat(
|
|
585
585
|
F.lit("("), area_code, F.lit(") "), exchange, F.lit("-"), subscriber
|
|
@@ -590,7 +590,7 @@ def format_nanp_paren(col: Column) -> Column:
|
|
|
590
590
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
591
591
|
).otherwise(base_format)
|
|
592
592
|
|
|
593
|
-
return F.when(is_valid_nanp(
|
|
593
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
594
594
|
|
|
595
595
|
|
|
596
596
|
@phone_numbers.register()
|
|
@@ -606,11 +606,11 @@ def format_nanp_dot(col: Column) -> Column:
|
|
|
606
606
|
"""
|
|
607
607
|
# Remove extension for validation but preserve it
|
|
608
608
|
extension = extract_extension(col)
|
|
609
|
-
|
|
609
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
610
610
|
|
|
611
|
-
area_code = extract_area_code(
|
|
612
|
-
exchange = extract_exchange(
|
|
613
|
-
subscriber = extract_subscriber(
|
|
611
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
612
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
613
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
614
614
|
|
|
615
615
|
base_format = F.concat(area_code, F.lit("."), exchange, F.lit("."), subscriber)
|
|
616
616
|
|
|
@@ -619,7 +619,7 @@ def format_nanp_dot(col: Column) -> Column:
|
|
|
619
619
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
620
620
|
).otherwise(base_format)
|
|
621
621
|
|
|
622
|
-
return F.when(is_valid_nanp(
|
|
622
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
623
623
|
|
|
624
624
|
|
|
625
625
|
@phone_numbers.register()
|
|
@@ -635,11 +635,11 @@ def format_nanp_space(col: Column) -> Column:
|
|
|
635
635
|
"""
|
|
636
636
|
# Remove extension for validation but preserve it
|
|
637
637
|
extension = extract_extension(col)
|
|
638
|
-
|
|
638
|
+
phone_numbers_no_ext = remove_extension(col)
|
|
639
639
|
|
|
640
|
-
area_code = extract_area_code(
|
|
641
|
-
exchange = extract_exchange(
|
|
642
|
-
subscriber = extract_subscriber(
|
|
640
|
+
area_code = extract_area_code(phone_numbers_no_ext)
|
|
641
|
+
exchange = extract_exchange(phone_numbers_no_ext)
|
|
642
|
+
subscriber = extract_subscriber(phone_numbers_no_ext)
|
|
643
643
|
|
|
644
644
|
base_format = F.concat(area_code, F.lit(" "), exchange, F.lit(" "), subscriber)
|
|
645
645
|
|
|
@@ -648,7 +648,7 @@ def format_nanp_space(col: Column) -> Column:
|
|
|
648
648
|
(extension != ""), F.concat(base_format, F.lit(" ext. "), extension)
|
|
649
649
|
).otherwise(base_format)
|
|
650
650
|
|
|
651
|
-
return F.when(is_valid_nanp(
|
|
651
|
+
return F.when(is_valid_nanp(phone_numbers_no_ext), formatted).otherwise(F.lit(""))
|
|
652
652
|
|
|
653
653
|
|
|
654
654
|
@phone_numbers.register()
|
|
@@ -707,7 +707,7 @@ def format_e164(col: Column) -> Column:
|
|
|
707
707
|
|
|
708
708
|
# Build E.164 format - only for valid phones
|
|
709
709
|
return F.when(
|
|
710
|
-
|
|
710
|
+
is_valid_phone_numbers(col),
|
|
711
711
|
F.when(
|
|
712
712
|
(F.length(digits) == 10) & is_nanp, F.concat(F.lit("+"), F.lit("1"), digits)
|
|
713
713
|
)
|
|
@@ -729,7 +729,7 @@ def format_e164(col: Column) -> Column:
|
|
|
729
729
|
|
|
730
730
|
|
|
731
731
|
@phone_numbers.register()
|
|
732
|
-
def
|
|
732
|
+
def standardize_phone_numbers(col: Column) -> Column:
|
|
733
733
|
"""
|
|
734
734
|
Standardize phone number with cleaning and NANP formatting.
|
|
735
735
|
|
|
@@ -783,7 +783,7 @@ def standardize_phone(col: Column) -> Column:
|
|
|
783
783
|
|
|
784
784
|
|
|
785
785
|
@phone_numbers.register()
|
|
786
|
-
def
|
|
786
|
+
def standardize_phone_numbers_e164(col: Column) -> Column:
|
|
787
787
|
"""
|
|
788
788
|
Standardize phone number with cleaning and E.164 formatting.
|
|
789
789
|
|
|
@@ -800,11 +800,11 @@ def standardize_phone_e164(col: Column) -> Column:
|
|
|
800
800
|
result = format_e164(cleaned)
|
|
801
801
|
|
|
802
802
|
# Only return valid phone numbers
|
|
803
|
-
return F.when(
|
|
803
|
+
return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
|
|
804
804
|
|
|
805
805
|
|
|
806
806
|
@phone_numbers.register()
|
|
807
|
-
def
|
|
807
|
+
def standardize_phone_numbers_digits(col: Column) -> Column:
|
|
808
808
|
"""
|
|
809
809
|
Standardize phone number and return digits only.
|
|
810
810
|
|
|
@@ -821,11 +821,11 @@ def standardize_phone_digits(col: Column) -> Column:
|
|
|
821
821
|
result = extract_digits(cleaned)
|
|
822
822
|
|
|
823
823
|
# Only return valid phone numbers
|
|
824
|
-
return F.when(
|
|
824
|
+
return F.when(is_valid_phone_numbers(cleaned), result).otherwise(F.lit(""))
|
|
825
825
|
|
|
826
826
|
|
|
827
827
|
@phone_numbers.register()
|
|
828
|
-
def
|
|
828
|
+
def clean_phone_numbers(col: Column) -> Column:
|
|
829
829
|
"""
|
|
830
830
|
Clean and validate phone number, returning null for invalid numbers.
|
|
831
831
|
|
|
@@ -873,12 +873,12 @@ def clean_phone(col: Column) -> Column:
|
|
|
873
873
|
|
|
874
874
|
|
|
875
875
|
@phone_numbers.register()
|
|
876
|
-
def
|
|
876
|
+
def get_phone_numbers_type(col: Column) -> Column:
|
|
877
877
|
"""
|
|
878
878
|
Get phone number type (toll-free, premium, standard, international).
|
|
879
879
|
|
|
880
880
|
Args:
|
|
881
|
-
col: Column containing
|
|
881
|
+
col: Column containing phone number
|
|
882
882
|
|
|
883
883
|
Returns:
|
|
884
884
|
Column with phone type
|
|
@@ -923,7 +923,7 @@ def get_region_from_area_code(col: Column) -> Column:
|
|
|
923
923
|
|
|
924
924
|
|
|
925
925
|
@phone_numbers.register()
|
|
926
|
-
def
|
|
926
|
+
def mask_phone_numbers(col: Column) -> Column:
|
|
927
927
|
"""
|
|
928
928
|
Mask phone number for privacy keeping last 4 digits (e.g., ***-***-1234).
|
|
929
929
|
|
|
@@ -950,9 +950,9 @@ def mask_phone(col: Column) -> Column:
|
|
|
950
950
|
|
|
951
951
|
|
|
952
952
|
@phone_numbers.register()
|
|
953
|
-
def
|
|
953
|
+
def filter_valid_phone_numbers_numbers(col: Column) -> Column:
|
|
954
954
|
"""
|
|
955
|
-
Return
|
|
955
|
+
Return phone number only if valid, otherwise return null.
|
|
956
956
|
|
|
957
957
|
Args:
|
|
958
958
|
col: Column containing phone number
|
|
@@ -960,13 +960,13 @@ def filter_valid_phones(col: Column) -> Column:
|
|
|
960
960
|
Returns:
|
|
961
961
|
Column with valid phone or null
|
|
962
962
|
"""
|
|
963
|
-
return F.when(
|
|
963
|
+
return F.when(is_valid_phone_numbers(col), col).otherwise(F.lit(None))
|
|
964
964
|
|
|
965
965
|
|
|
966
966
|
@phone_numbers.register()
|
|
967
|
-
def
|
|
967
|
+
def filter_nanp_phone_numbers_numbers(col: Column) -> Column:
|
|
968
968
|
"""
|
|
969
|
-
Return
|
|
969
|
+
Return phone number only if valid NANP, otherwise return null.
|
|
970
970
|
|
|
971
971
|
Args:
|
|
972
972
|
col: Column containing phone number
|
|
@@ -978,7 +978,7 @@ def filter_nanp_phones(col: Column) -> Column:
|
|
|
978
978
|
|
|
979
979
|
|
|
980
980
|
@phone_numbers.register()
|
|
981
|
-
def
|
|
981
|
+
def filter_toll_free_phone_numbers_numbers(col: Column) -> Column:
|
|
982
982
|
"""
|
|
983
983
|
Return phone number only if toll-free, otherwise return null.
|
|
984
984
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
datacompose/__init__.py,sha256=kbQEzEvheAMsm3-MT4nWSuX42fjrfgDWoR8WZmC_WX4,34
|
|
2
|
-
datacompose/cli/__init__.py,sha256=
|
|
2
|
+
datacompose/cli/__init__.py,sha256=UuHpU3QypWZC9NEjI89BNUW_PcRfMUyJbZDKJvAm1hQ,109
|
|
3
3
|
datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,1756
|
|
4
4
|
datacompose/cli/config.py,sha256=vvY6xGgIdybUuujdPrDI_CsSUUD9CEfODG8Kem4jqVQ,2353
|
|
5
5
|
datacompose/cli/main.py,sha256=d87hG1nxDVNTRZVi7ctNQQ06lwe5KbLGJMChlHtR3Kc,1343
|
|
@@ -12,8 +12,8 @@ datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwL
|
|
|
12
12
|
datacompose/generators/base.py,sha256=EgpHwaaSxAP1Ygq5Wtyq4ez-wG0oPwDEbiKgLsEilD0,6761
|
|
13
13
|
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
14
14
|
datacompose/generators/pyspark/generator.py,sha256=be4GATA8rmLAg4_wZ3Ox3vC3up_OXMOajjIUJQrDQ10,1735
|
|
15
|
-
datacompose/operators/__init__.py,sha256=
|
|
16
|
-
datacompose/operators/primitives.py,sha256=
|
|
15
|
+
datacompose/operators/__init__.py,sha256=Eacc0JDCzeuTeLGO_N9Nz9pOc1D3_6BxEcpCibzrpz8,588
|
|
16
|
+
datacompose/operators/primitives.py,sha256=FxhtgP7aizKsnNBgh5oTqwc9m8QSjLTpRoG5zu6rFns,23615
|
|
17
17
|
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
datacompose/transformers/discovery.py,sha256=JgMtmd8PkhmwLqS17NeKSv9MRneY9tdOsOK32A6luOQ,7048
|
|
19
19
|
datacompose/transformers/text/__init__.py,sha256=Mq0UmgYlBV8T18IkvHAS1TImEzWyGciCqxaCv324hFQ,36
|
|
@@ -22,10 +22,10 @@ datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=gf_
|
|
|
22
22
|
datacompose/transformers/text/emails/__init__.py,sha256=snZLOJsxrPDOi8gIISlRxc6YlskKxUyu0NnOZCE5cIU,34
|
|
23
23
|
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=zQpntW4-RsDv5C1TWp0put10UyUEamP1BxvVYbr2Q58,23785
|
|
24
24
|
datacompose/transformers/text/phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=
|
|
26
|
-
datacompose-0.2.
|
|
27
|
-
datacompose-0.2.
|
|
28
|
-
datacompose-0.2.
|
|
29
|
-
datacompose-0.2.
|
|
30
|
-
datacompose-0.2.
|
|
31
|
-
datacompose-0.2.
|
|
25
|
+
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=PhyC4GEHJiUVHNulRDEjLUoZgnlBnxN9VKzLr802QrI,28856
|
|
26
|
+
datacompose-0.2.6.0.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
27
|
+
datacompose-0.2.6.0.dist-info/METADATA,sha256=n5GlJ73W6LwLojmtIMLX_rqX7qCBd60lzcQo-iEzUEM,4352
|
|
28
|
+
datacompose-0.2.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
+
datacompose-0.2.6.0.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
30
|
+
datacompose-0.2.6.0.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
31
|
+
datacompose-0.2.6.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|