datacompose 0.2.5.2__tar.gz → 0.2.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/CHANGELOG.md +41 -0
- {datacompose-0.2.5.2/datacompose.egg-info → datacompose-0.2.6.1}/PKG-INFO +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/__init__.py +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/init.py +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/operators/__init__.py +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/operators/primitives.py +57 -19
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +10 -10
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +14 -14
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +59 -59
- {datacompose-0.2.5.2 → datacompose-0.2.6.1/datacompose.egg-info}/PKG-INFO +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/SOURCES.txt +3 -8
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/pyproject.toml +1 -1
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_config.py +56 -57
- datacompose-0.2.6.1/tests/unit/operators/test_compose_conditions.py +594 -0
- datacompose-0.2.6.1/tests/unit/operators/test_conditional_auto_detection.py +192 -0
- datacompose-0.2.6.1/tests/unit/operators/test_conditional_core.py +733 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_conditional_real_world.py +142 -7
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_operators.py +9 -9
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_primitives_complete.py +67 -65
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +69 -50
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +42 -18
- datacompose-0.2.5.2/tests/unit/operators/conditional_tests_common.py +0 -26
- datacompose-0.2.5.2/tests/unit/operators/conftest.py +0 -61
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_complex_logic.py +0 -200
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_data_driven.py +0 -117
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_edge_cases.py +0 -150
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_error_handling.py +0 -67
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_parameters.py +0 -94
- datacompose-0.2.5.2/tests/unit/operators/test_conditional_performance.py +0 -106
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/LICENSE +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/MANIFEST.in +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/README.md +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/colors.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/add.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/list.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/config.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/main.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/validation.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/base.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/pyspark/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/pyspark/generator.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/discovery.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/addresses/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/emails/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/phone_numbers/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/dependency_links.txt +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/entry_points.txt +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/requires.txt +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/top_level.txt +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/setup.cfg +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_end_to_end.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_full_workflow.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_generated_imports.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_command.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_command_complete.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_default_target.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_validation.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_init_command.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_init_command_complete.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_list_command.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_main.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_main_complete.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_validation_complete.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/test_base_generator.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/test_spark_generator.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/__init__.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/test_discovery.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/common/test_common.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_email_extraction.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_email_optimized.py +0 -0
- {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/yaml_specs/__init__.py +0 -0
|
@@ -7,6 +7,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.2.6.0] - 2025-08-24
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- **Automatic Conditional Detection**: Smart detection of conditional operators based on naming patterns
|
|
14
|
+
- Functions starting with `is_`, `has_`, `needs_`, `should_`, `can_`, `contains_`, `matches_`, `equals_`, `starts_with_`, `ends_with_` are automatically detected as conditionals
|
|
15
|
+
- Eliminates need for explicit `is_conditional=True` in most cases
|
|
16
|
+
- Explicit override still available when needed via `is_conditional` parameter
|
|
17
|
+
- **Phone Number Processing Pipeline**: Complete phone number validation and formatting example
|
|
18
|
+
- Letter-to-number conversion (e.g., 1-800-FLOWERS)
|
|
19
|
+
- NANP validation and formatting
|
|
20
|
+
- Toll-free number detection
|
|
21
|
+
- E.164 and parentheses formatting
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
- **Conditional Operator Registration**: `is_conditional` parameter is now optional; when omitted, conditionals are auto-detected from the function name
|
|
25
|
+
- **Test Organization**: Consolidated conditional tests into three focused files:
|
|
26
|
+
- `test_conditional_core.py` - Core functionality, logic, errors, parameters, and performance
|
|
27
|
+
- `test_conditional_real_world.py` - Real-world pipeline scenarios
|
|
28
|
+
- `test_conditional_auto_detection.py` - Auto-detection feature tests
|
|
29
|
+
|
|
30
|
+
### Fixed
|
|
31
|
+
- **Phone Number Validation**: Updated NANP validation to be more flexible for testing scenarios
|
|
32
|
+
|
|
33
|
+
## [0.2.5.3] - 2025-08-23
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
- **Compose Decorator Enhancement**: Auto-detection of PrimitiveRegistry instances in function globals
|
|
37
|
+
- Compose decorator now automatically discovers all namespace instances without explicit passing
|
|
38
|
+
- Improved namespace resolution using the function's global scope instead of module globals
|
|
39
|
+
- Better support for multiple namespaces in composed functions
|
|
40
|
+
|
|
41
|
+
### Fixed
|
|
42
|
+
- **Namespace Resolution**: Fixed global namespace lookups to use the decorated function's own globals
|
|
43
|
+
- PipelineCompiler now correctly resolves namespaces from the decorated function's scope
|
|
44
|
+
- Fallback compose mode uses function globals for namespace discovery
|
|
45
|
+
- Prevents namespace resolution errors when registries are defined in different modules
|
|
46
|
+
|
|
47
|
+
### Changed
|
|
48
|
+
- **Phone Number Tests**: Updated test imports and formatting for phone number primitives
|
|
49
|
+
- **Test Organization**: Added comprehensive conditional composition tests
|
|
50
|
+
|
|
10
51
|
## [0.2.5.2] - 2025-08-22
|
|
11
52
|
|
|
12
53
|
### Fixed
|
|
@@ -380,7 +380,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
|
|
|
380
380
|
# Check if config already exists
|
|
381
381
|
if config_path.exists() and not force:
|
|
382
382
|
print(error(f"Configuration file already exists: {config_path}"))
|
|
383
|
-
print(dim("Use --force to overwrite"))
|
|
383
|
+
print(dim("Use datacompose init --force to overwrite"))
|
|
384
384
|
return 1
|
|
385
385
|
|
|
386
386
|
try:
|
|
@@ -16,9 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
18
18
|
try:
|
|
19
|
-
from pyspark.sql import Column
|
|
19
|
+
from pyspark.sql import Column
|
|
20
|
+
from pyspark.sql import functions as F
|
|
20
21
|
except ImportError:
|
|
21
|
-
|
|
22
|
+
logging.debug("PySpark not available")
|
|
23
|
+
|
|
24
|
+
# Set up module logger
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
class SmartPrimitive:
|
|
@@ -120,11 +124,15 @@ class PrimitiveRegistry:
|
|
|
120
124
|
self._primitives = {}
|
|
121
125
|
self._conditionals = {}
|
|
122
126
|
|
|
123
|
-
def register(
|
|
127
|
+
def register(
|
|
128
|
+
self, name: Optional[str] = None, is_conditional: Optional[bool] = None
|
|
129
|
+
):
|
|
124
130
|
"""Decorator to register a function as a SmartPrimitive in this namespace.
|
|
125
131
|
|
|
126
132
|
Args:
|
|
127
133
|
name: Optional name for the primitive (defaults to function name)
|
|
134
|
+
is_conditional: Optional flag to mark as conditional. If None, auto-detects
|
|
135
|
+
based on function name patterns.
|
|
128
136
|
|
|
129
137
|
Returns:
|
|
130
138
|
Decorator function that wraps the target function as a SmartPrimitive
|
|
@@ -139,7 +147,29 @@ class PrimitiveRegistry:
|
|
|
139
147
|
def decorator(func: Callable):
|
|
140
148
|
primitive_name = name or func.__name__
|
|
141
149
|
|
|
142
|
-
if
|
|
150
|
+
# Auto-detect conditional if not explicitly specified
|
|
151
|
+
if is_conditional is None:
|
|
152
|
+
# Check common naming patterns for conditional functions
|
|
153
|
+
conditional_patterns = [
|
|
154
|
+
"is_",
|
|
155
|
+
"has_",
|
|
156
|
+
"needs_",
|
|
157
|
+
"should_",
|
|
158
|
+
"can_",
|
|
159
|
+
"contains_",
|
|
160
|
+
"matches_",
|
|
161
|
+
"equals_",
|
|
162
|
+
"starts_with_",
|
|
163
|
+
"ends_with_",
|
|
164
|
+
]
|
|
165
|
+
is_conditional_auto = any(
|
|
166
|
+
primitive_name.startswith(pattern)
|
|
167
|
+
for pattern in conditional_patterns
|
|
168
|
+
)
|
|
169
|
+
else:
|
|
170
|
+
is_conditional_auto = is_conditional
|
|
171
|
+
|
|
172
|
+
if is_conditional_auto:
|
|
143
173
|
self._conditionals[primitive_name] = SmartPrimitive(
|
|
144
174
|
func, primitive_name
|
|
145
175
|
)
|
|
@@ -217,9 +247,17 @@ class PrimitiveRegistry:
|
|
|
217
247
|
pipeline.__doc__ = func.__doc__
|
|
218
248
|
return pipeline
|
|
219
249
|
|
|
250
|
+
# Auto-detect ALL namespace instances from func.__globals__
|
|
251
|
+
# This allows using multiple namespaces without explicitly passing them
|
|
252
|
+
for var_name, var_value in func.__globals__.items():
|
|
253
|
+
if isinstance(var_value, PrimitiveRegistry):
|
|
254
|
+
# Found a namespace instance
|
|
255
|
+
if var_name not in namespaces:
|
|
256
|
+
namespaces[var_name] = var_value
|
|
257
|
+
|
|
220
258
|
# Try to get the function as a string and parse it
|
|
221
259
|
try:
|
|
222
|
-
compiler = PipelineCompiler(namespaces, debug)
|
|
260
|
+
compiler = PipelineCompiler(namespaces, debug, func.__globals__)
|
|
223
261
|
pipeline = compiler.compile(func)
|
|
224
262
|
|
|
225
263
|
if debug and pipeline.steps:
|
|
@@ -270,7 +308,11 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
270
308
|
method_name = node.value.func.attr
|
|
271
309
|
namespace = (
|
|
272
310
|
namespaces.get(namespace_name) if namespace_name else None
|
|
273
|
-
) or (
|
|
311
|
+
) or (
|
|
312
|
+
func.__globals__.get(namespace_name)
|
|
313
|
+
if namespace_name
|
|
314
|
+
else None
|
|
315
|
+
)
|
|
274
316
|
if namespace and hasattr(namespace, method_name):
|
|
275
317
|
method = getattr(namespace, method_name)
|
|
276
318
|
|
|
@@ -312,16 +354,6 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
|
|
|
312
354
|
return pipeline
|
|
313
355
|
|
|
314
356
|
|
|
315
|
-
try:
|
|
316
|
-
from pyspark.sql import Column
|
|
317
|
-
from pyspark.sql import functions as F
|
|
318
|
-
except ImportError:
|
|
319
|
-
logging.debug("PySpark not available")
|
|
320
|
-
|
|
321
|
-
# Set up module logger
|
|
322
|
-
logger = logging.getLogger(__name__)
|
|
323
|
-
|
|
324
|
-
|
|
325
357
|
@dataclass
|
|
326
358
|
class CompiledStep:
|
|
327
359
|
"""A compiled pipeline step"""
|
|
@@ -452,9 +484,15 @@ class StablePipeline:
|
|
|
452
484
|
|
|
453
485
|
|
|
454
486
|
class PipelineCompiler:
|
|
455
|
-
def __init__(
|
|
487
|
+
def __init__(
|
|
488
|
+
self,
|
|
489
|
+
namespaces: Dict[str, Any],
|
|
490
|
+
debug: bool = False,
|
|
491
|
+
func_globals: Optional[Dict] = None,
|
|
492
|
+
):
|
|
456
493
|
self.namespaces = namespaces
|
|
457
494
|
self.debug = debug
|
|
495
|
+
self.func_globals = func_globals or {}
|
|
458
496
|
|
|
459
497
|
def compile(self, func: Callable) -> StablePipeline:
|
|
460
498
|
try:
|
|
@@ -530,7 +568,7 @@ class PipelineCompiler:
|
|
|
530
568
|
|
|
531
569
|
namespace = (
|
|
532
570
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
533
|
-
) or (
|
|
571
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
534
572
|
if namespace and hasattr(namespace, method_name):
|
|
535
573
|
method = getattr(namespace, method_name)
|
|
536
574
|
|
|
@@ -552,7 +590,7 @@ class PipelineCompiler:
|
|
|
552
590
|
|
|
553
591
|
namespace = (
|
|
554
592
|
self.namespaces.get(namespace_name) if namespace_name else None
|
|
555
|
-
) or (
|
|
593
|
+
) or (self.func_globals.get(namespace_name) if namespace_name else None)
|
|
556
594
|
if namespace and hasattr(namespace, method_name):
|
|
557
595
|
method = getattr(namespace, method_name)
|
|
558
596
|
|
|
@@ -544,7 +544,7 @@ def standardize_street_prefix(
|
|
|
544
544
|
|
|
545
545
|
Args:
|
|
546
546
|
col: Column containing street prefix
|
|
547
|
-
custom_mappings
|
|
547
|
+
custom_mappings (Optional): Dict of custom prefix mappings (case insensitive)
|
|
548
548
|
|
|
549
549
|
Returns:
|
|
550
550
|
Column with standardized prefix (always abbreviated per USPS standards)
|
|
@@ -614,7 +614,7 @@ def standardize_street_suffix(
|
|
|
614
614
|
|
|
615
615
|
Args:
|
|
616
616
|
col: Column containing street suffix
|
|
617
|
-
custom_mappings
|
|
617
|
+
custom_mappings (Optional): Dict of custom suffix mappings (case insensitive)
|
|
618
618
|
|
|
619
619
|
Returns:
|
|
620
620
|
Column with standardized suffix (always abbreviated per USPS standards)
|
|
@@ -896,7 +896,7 @@ def standardize_unit_type(
|
|
|
896
896
|
|
|
897
897
|
Args:
|
|
898
898
|
col: Column containing unit type
|
|
899
|
-
custom_mappings
|
|
899
|
+
custom_mappings (Optional): Dict of custom unit type mappings
|
|
900
900
|
|
|
901
901
|
Returns:
|
|
902
902
|
Column with standardized unit type
|
|
@@ -1206,7 +1206,7 @@ def extract_city(col: Column, custom_cities: Optional[List] = None) -> Column:
|
|
|
1206
1206
|
|
|
1207
1207
|
Args:
|
|
1208
1208
|
col: Column containing address text
|
|
1209
|
-
custom_cities
|
|
1209
|
+
custom_cities (Optional): List of custom city names to recognize (case-insensitive)
|
|
1210
1210
|
|
|
1211
1211
|
Returns:
|
|
1212
1212
|
Column with extracted city name or empty string if not found
|
|
@@ -1371,7 +1371,7 @@ def extract_state(col: Column, custom_states: Optional[Dict] = None) -> Column:
|
|
|
1371
1371
|
|
|
1372
1372
|
Args:
|
|
1373
1373
|
col: Column containing address text with state information
|
|
1374
|
-
custom_states
|
|
1374
|
+
custom_states (Optional): Dict mapping full state names to abbreviations
|
|
1375
1375
|
e.g., {"ONTARIO": "ON", "QUEBEC": "QC"}
|
|
1376
1376
|
|
|
1377
1377
|
Returns:
|
|
@@ -1445,9 +1445,9 @@ def validate_city(
|
|
|
1445
1445
|
|
|
1446
1446
|
Args:
|
|
1447
1447
|
col: Column containing city names to validate
|
|
1448
|
-
known_cities
|
|
1449
|
-
min_length: Minimum valid city name length (default 2)
|
|
1450
|
-
max_length: Maximum valid city name length (default 50)
|
|
1448
|
+
known_cities (Optional): List of valid city names to check against
|
|
1449
|
+
min_length (Optional): Minimum valid city name length (default 2)
|
|
1450
|
+
max_length (Optional): Maximum valid city name length (default 50)
|
|
1451
1451
|
|
|
1452
1452
|
Returns:
|
|
1453
1453
|
Boolean column indicating if city name is valid
|
|
@@ -1523,7 +1523,7 @@ def standardize_city(col: Column, custom_mappings: Optional[Dict] = None) -> Col
|
|
|
1523
1523
|
|
|
1524
1524
|
Args:
|
|
1525
1525
|
col: Column containing city names to standardize
|
|
1526
|
-
custom_mappings
|
|
1526
|
+
custom_mappings (Optional): Dict for city name corrections/standardization
|
|
1527
1527
|
e.g., {"ST LOUIS": "St. Louis", "NEWYORK": "New York"}
|
|
1528
1528
|
|
|
1529
1529
|
Returns:
|
|
@@ -1807,7 +1807,7 @@ def standardize_country(col: Column, custom_mappings: Optional[dict] = None) ->
|
|
|
1807
1807
|
|
|
1808
1808
|
Args:
|
|
1809
1809
|
col: Column containing country name or abbreviation
|
|
1810
|
-
custom_mappings
|
|
1810
|
+
custom_mappings (Optional): Dict of custom country mappings
|
|
1811
1811
|
|
|
1812
1812
|
Returns:
|
|
1813
1813
|
Column with standardized country name
|
|
@@ -255,8 +255,8 @@ def is_valid_email(col: Column, min_length: int = 6, max_length: int = 254) -> C
|
|
|
255
255
|
|
|
256
256
|
Args:
|
|
257
257
|
col: Column containing email address
|
|
258
|
-
min_length: Minimum length for valid email
|
|
259
|
-
max_length: Maximum length for valid email
|
|
258
|
+
min_length (Optional): Minimum length for valid email (default 6)
|
|
259
|
+
max_length (Optional): Maximum length for valid email (default 254)
|
|
260
260
|
|
|
261
261
|
Returns:
|
|
262
262
|
Column with boolean indicating validity
|
|
@@ -286,8 +286,8 @@ def is_valid_username(col: Column, min_length: int = 1, max_length: int = 64) ->
|
|
|
286
286
|
|
|
287
287
|
Args:
|
|
288
288
|
col: Column containing email address
|
|
289
|
-
min_length: Minimum length for valid username (default 1)
|
|
290
|
-
max_length: Maximum length for valid username (default 64 per RFC)
|
|
289
|
+
min_length (Optional): Minimum length for valid username (default 1)
|
|
290
|
+
max_length (Optional): Maximum length for valid username (default 64 per RFC)
|
|
291
291
|
|
|
292
292
|
Returns:
|
|
293
293
|
Column with boolean indicating username validity
|
|
@@ -351,7 +351,7 @@ def is_disposable_email(
|
|
|
351
351
|
|
|
352
352
|
Args:
|
|
353
353
|
col: Column containing email address
|
|
354
|
-
disposable_domains: List of disposable domains to check against
|
|
354
|
+
disposable_domains (Optional): List of disposable domains to check against
|
|
355
355
|
|
|
356
356
|
Returns:
|
|
357
357
|
Column with boolean indicating if email is disposable
|
|
@@ -389,7 +389,7 @@ def is_corporate_email(
|
|
|
389
389
|
|
|
390
390
|
Args:
|
|
391
391
|
col: Column containing email address
|
|
392
|
-
free_providers: List of free email provider domains to check against
|
|
392
|
+
free_providers (Optional): List of free email provider domains to check against
|
|
393
393
|
|
|
394
394
|
Returns:
|
|
395
395
|
Column with boolean indicating if email is corporate
|
|
@@ -535,8 +535,8 @@ def fix_common_typos(
|
|
|
535
535
|
|
|
536
536
|
Args:
|
|
537
537
|
col: Column containing email address
|
|
538
|
-
custom_mappings: Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
|
|
539
|
-
custom_tld_mappings: Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
|
|
538
|
+
custom_mappings (Optional): Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
|
|
539
|
+
custom_tld_mappings (Optional): Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
|
|
540
540
|
|
|
541
541
|
Returns:
|
|
542
542
|
Column with typos fixed
|
|
@@ -604,10 +604,10 @@ def standardize_email(
|
|
|
604
604
|
|
|
605
605
|
Args:
|
|
606
606
|
col: Column containing email address
|
|
607
|
-
lowercase: Convert to lowercase
|
|
608
|
-
remove_dots_gmail: Remove dots from Gmail addresses
|
|
609
|
-
remove_plus: Remove plus addressing
|
|
610
|
-
fix_typos: Fix common domain typos
|
|
607
|
+
lowercase (Optional): Convert to lowercase (default True)
|
|
608
|
+
remove_dots_gmail (Optional): Remove dots from Gmail addresses (default True)
|
|
609
|
+
remove_plus (Optional): Remove plus addressing (default False)
|
|
610
|
+
fix_typos (Optional): Fix common domain typos (default True)
|
|
611
611
|
|
|
612
612
|
Returns:
|
|
613
613
|
Column with standardized email
|
|
@@ -756,8 +756,8 @@ def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column
|
|
|
756
756
|
|
|
757
757
|
Args:
|
|
758
758
|
col: Column containing email address
|
|
759
|
-
mask_char: Character to use for masking
|
|
760
|
-
keep_chars: Number of characters to keep at start
|
|
759
|
+
mask_char (Optional): Character to use for masking (default "*")
|
|
760
|
+
keep_chars (Optional): Number of characters to keep at start (default 3)
|
|
761
761
|
|
|
762
762
|
Returns:
|
|
763
763
|
Column with masked email
|