datacompose 0.2.5.2__tar.gz → 0.2.6.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. More details are linked from the original diff page.

Files changed (101)
  1. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/CHANGELOG.md +41 -0
  2. {datacompose-0.2.5.2/datacompose.egg-info → datacompose-0.2.6.1}/PKG-INFO +1 -1
  3. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/__init__.py +1 -1
  4. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/init.py +1 -1
  5. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/operators/__init__.py +1 -1
  6. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/operators/primitives.py +57 -19
  7. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +10 -10
  8. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +14 -14
  9. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +59 -59
  10. {datacompose-0.2.5.2 → datacompose-0.2.6.1/datacompose.egg-info}/PKG-INFO +1 -1
  11. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/SOURCES.txt +3 -8
  12. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/pyproject.toml +1 -1
  13. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_config.py +56 -57
  14. datacompose-0.2.6.1/tests/unit/operators/test_compose_conditions.py +594 -0
  15. datacompose-0.2.6.1/tests/unit/operators/test_conditional_auto_detection.py +192 -0
  16. datacompose-0.2.6.1/tests/unit/operators/test_conditional_core.py +733 -0
  17. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_conditional_real_world.py +142 -7
  18. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_operators.py +9 -9
  19. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/operators/test_primitives_complete.py +67 -65
  20. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +69 -50
  21. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +42 -18
  22. datacompose-0.2.5.2/tests/unit/operators/conditional_tests_common.py +0 -26
  23. datacompose-0.2.5.2/tests/unit/operators/conftest.py +0 -61
  24. datacompose-0.2.5.2/tests/unit/operators/test_conditional_complex_logic.py +0 -200
  25. datacompose-0.2.5.2/tests/unit/operators/test_conditional_data_driven.py +0 -117
  26. datacompose-0.2.5.2/tests/unit/operators/test_conditional_edge_cases.py +0 -150
  27. datacompose-0.2.5.2/tests/unit/operators/test_conditional_error_handling.py +0 -67
  28. datacompose-0.2.5.2/tests/unit/operators/test_conditional_parameters.py +0 -94
  29. datacompose-0.2.5.2/tests/unit/operators/test_conditional_performance.py +0 -106
  30. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/LICENSE +0 -0
  31. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/MANIFEST.in +0 -0
  32. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/README.md +0 -0
  33. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/__init__.py +0 -0
  34. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/colors.py +0 -0
  35. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/__init__.py +0 -0
  36. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/add.py +0 -0
  37. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/commands/list.py +0 -0
  38. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/config.py +0 -0
  39. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/main.py +0 -0
  40. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/cli/validation.py +0 -0
  41. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/__init__.py +0 -0
  42. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/base.py +0 -0
  43. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/pyspark/__init__.py +0 -0
  44. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/generators/pyspark/generator.py +0 -0
  45. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/__init__.py +0 -0
  46. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/discovery.py +0 -0
  47. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/__init__.py +0 -0
  48. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/addresses/__init__.py +0 -0
  49. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/emails/__init__.py +0 -0
  50. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose/transformers/text/phone_numbers/__init__.py +0 -0
  51. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/dependency_links.txt +0 -0
  52. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/entry_points.txt +0 -0
  53. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/requires.txt +0 -0
  54. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/datacompose.egg-info/top_level.txt +0 -0
  55. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/setup.cfg +0 -0
  56. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/__init__.py +0 -0
  57. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/__init__.py +0 -0
  58. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_end_to_end.py +0 -0
  59. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_full_workflow.py +0 -0
  60. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/integration/test_generated_imports.py +0 -0
  61. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
  62. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
  63. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/__init__.py +0 -0
  64. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/__init__.py +0 -0
  65. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/__init__.py +0 -0
  66. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
  67. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  68. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
  69. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/__init__.py +0 -0
  70. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
  71. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
  72. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  73. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
  74. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_command.py +0 -0
  75. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_command_complete.py +0 -0
  76. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_default_target.py +0 -0
  77. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_add_validation.py +0 -0
  78. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_init_command.py +0 -0
  79. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_init_command_complete.py +0 -0
  80. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_list_command.py +0 -0
  81. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_main.py +0 -0
  82. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_main_complete.py +0 -0
  83. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/cli/test_validation_complete.py +0 -0
  84. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/__init__.py +0 -0
  85. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/test_base_generator.py +0 -0
  86. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/generators/test_spark_generator.py +0 -0
  87. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/__init__.py +0 -0
  88. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/test_discovery.py +0 -0
  89. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/common/test_common.py +0 -0
  90. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +0 -0
  91. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +0 -0
  92. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +0 -0
  93. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +0 -0
  94. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +0 -0
  95. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +0 -0
  96. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +0 -0
  97. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +0 -0
  98. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +0 -0
  99. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_email_extraction.py +0 -0
  100. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/unit/transformers/text/test_emails/test_email_optimized.py +0 -0
  101. {datacompose-0.2.5.2 → datacompose-0.2.6.1}/tests/yaml_specs/__init__.py +0 -0
@@ -7,6 +7,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.2.6.0] - 2025-08-24
11
+
12
+ ### Added
13
+ - **Automatic Conditional Detection**: Smart detection of conditional operators based on naming patterns
14
+ - Functions starting with `is_`, `has_`, `needs_`, `should_`, `can_`, `contains_`, `matches_`, `equals_`, `starts_with_`, `ends_with_` are automatically detected as conditionals
15
+ - Eliminates need for explicit `is_conditional=True` in most cases
16
+ - Explicit override still available when needed via `is_conditional` parameter
17
+ - **Phone Number Processing Pipeline**: Complete phone number validation and formatting example
18
+ - Letter-to-number conversion (1-800-FLOWERS)
19
+ - NANP validation and formatting
20
+ - Toll-free number detection
21
+ - E.164 and parentheses formatting
22
+
23
+ ### Changed
24
+ - **Conditional Operator Registration**: `is_conditional` parameter now optional with smart defaults
25
+ - **Test Organization**: Consolidated conditional tests into three focused files:
26
+ - `test_conditional_core.py` - Core functionality, logic, errors, parameters, and performance
27
+ - `test_conditional_real_world.py` - Real-world pipeline scenarios
28
+ - `test_conditional_auto_detection.py` - Auto-detection feature tests
29
+
30
+ ### Fixed
31
+ - **Phone Number Validation**: Updated NANP validation to be more flexible for testing scenarios
32
+
33
+ ## [0.2.5.3] - 2025-08-23
34
+
35
+ ### Added
36
+ - **Compose Decorator Enhancement**: Auto-detection of PrimitiveRegistry instances in function globals
37
+ - Compose decorator now automatically discovers all namespace instances without explicit passing
38
+ - Improved namespace resolution using function's global scope instead of module globals
39
+ - Better support for multiple namespaces in composed functions
40
+
41
+ ### Fixed
42
+ - **Namespace Resolution**: Fixed global namespace lookups to use function's own globals
43
+ - PipelineCompiler now correctly resolves namespaces from the decorated function's scope
44
+ - Fallback compose mode uses function globals for namespace discovery
45
+ - Prevents namespace resolution errors when registries are defined in different modules
46
+
47
+ ### Changed
48
+ - **Phone Number Tests**: Updated test imports and formatting for phone number primitives
49
+ - **Test Organization**: Added comprehensive conditional composition tests
50
+
10
51
  ## [0.2.5.2] - 2025-08-22
11
52
 
12
53
  ### Fixed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datacompose
3
- Version: 0.2.5.2
3
+ Version: 0.2.6.1
4
4
  Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
5
5
  Author: Datacompose Contributors
6
6
  Maintainer: Datacompose Contributors
@@ -2,4 +2,4 @@
2
2
  Datacompose CLI - Command-line interface for generating data cleaning UDFs.
3
3
  """
4
4
 
5
- __version__ = "0.2.4"
5
+ __version__ = "0.2.6.0"
@@ -380,7 +380,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
380
380
  # Check if config already exists
381
381
  if config_path.exists() and not force:
382
382
  print(error(f"Configuration file already exists: {config_path}"))
383
- print(dim("Use --force to overwrite"))
383
+ print(dim("Use datacompose init --force to overwrite"))
384
384
  return 1
385
385
 
386
386
  try:
@@ -18,4 +18,4 @@ __all__ = [
18
18
  "PrimitiveRegistry",
19
19
  ]
20
20
 
21
- __version__ = "0.2.4"
21
+ __version__ = "0.2.6.0"
@@ -16,9 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
18
  try:
19
- from pyspark.sql import Column # type: ignore
19
+ from pyspark.sql import Column
20
+ from pyspark.sql import functions as F
20
21
  except ImportError:
21
- pass
22
+ logging.debug("PySpark not available")
23
+
24
+ # Set up module logger
25
+ logger = logging.getLogger(__name__)
22
26
 
23
27
 
24
28
  class SmartPrimitive:
@@ -120,11 +124,15 @@ class PrimitiveRegistry:
120
124
  self._primitives = {}
121
125
  self._conditionals = {}
122
126
 
123
- def register(self, name: Optional[str] = None, is_conditional: bool = False):
127
+ def register(
128
+ self, name: Optional[str] = None, is_conditional: Optional[bool] = None
129
+ ):
124
130
  """Decorator to register a function as a SmartPrimitive in this namespace.
125
131
 
126
132
  Args:
127
133
  name: Optional name for the primitive (defaults to function name)
134
+ is_conditional: Optional flag to mark as conditional. If None, auto-detects
135
+ based on function name patterns.
128
136
 
129
137
  Returns:
130
138
  Decorator function that wraps the target function as a SmartPrimitive
@@ -139,7 +147,29 @@ class PrimitiveRegistry:
139
147
  def decorator(func: Callable):
140
148
  primitive_name = name or func.__name__
141
149
 
142
- if is_conditional:
150
+ # Auto-detect conditional if not explicitly specified
151
+ if is_conditional is None:
152
+ # Check common naming patterns for conditional functions
153
+ conditional_patterns = [
154
+ "is_",
155
+ "has_",
156
+ "needs_",
157
+ "should_",
158
+ "can_",
159
+ "contains_",
160
+ "matches_",
161
+ "equals_",
162
+ "starts_with_",
163
+ "ends_with_",
164
+ ]
165
+ is_conditional_auto = any(
166
+ primitive_name.startswith(pattern)
167
+ for pattern in conditional_patterns
168
+ )
169
+ else:
170
+ is_conditional_auto = is_conditional
171
+
172
+ if is_conditional_auto:
143
173
  self._conditionals[primitive_name] = SmartPrimitive(
144
174
  func, primitive_name
145
175
  )
@@ -217,9 +247,17 @@ class PrimitiveRegistry:
217
247
  pipeline.__doc__ = func.__doc__
218
248
  return pipeline
219
249
 
250
+ # Auto-detect ALL namespace instances from func.__globals__
251
+ # This allows using multiple namespaces without explicitly passing them
252
+ for var_name, var_value in func.__globals__.items():
253
+ if isinstance(var_value, PrimitiveRegistry):
254
+ # Found a namespace instance
255
+ if var_name not in namespaces:
256
+ namespaces[var_name] = var_value
257
+
220
258
  # Try to get the function as a string and parse it
221
259
  try:
222
- compiler = PipelineCompiler(namespaces, debug)
260
+ compiler = PipelineCompiler(namespaces, debug, func.__globals__)
223
261
  pipeline = compiler.compile(func)
224
262
 
225
263
  if debug and pipeline.steps:
@@ -270,7 +308,11 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
270
308
  method_name = node.value.func.attr
271
309
  namespace = (
272
310
  namespaces.get(namespace_name) if namespace_name else None
273
- ) or (globals().get(namespace_name) if namespace_name else None)
311
+ ) or (
312
+ func.__globals__.get(namespace_name)
313
+ if namespace_name
314
+ else None
315
+ )
274
316
  if namespace and hasattr(namespace, method_name):
275
317
  method = getattr(namespace, method_name)
276
318
 
@@ -312,16 +354,6 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
312
354
  return pipeline
313
355
 
314
356
 
315
- try:
316
- from pyspark.sql import Column
317
- from pyspark.sql import functions as F
318
- except ImportError:
319
- logging.debug("PySpark not available")
320
-
321
- # Set up module logger
322
- logger = logging.getLogger(__name__)
323
-
324
-
325
357
  @dataclass
326
358
  class CompiledStep:
327
359
  """A compiled pipeline step"""
@@ -452,9 +484,15 @@ class StablePipeline:
452
484
 
453
485
 
454
486
  class PipelineCompiler:
455
- def __init__(self, namespaces: Dict[str, Any], debug: bool = False):
487
+ def __init__(
488
+ self,
489
+ namespaces: Dict[str, Any],
490
+ debug: bool = False,
491
+ func_globals: Optional[Dict] = None,
492
+ ):
456
493
  self.namespaces = namespaces
457
494
  self.debug = debug
495
+ self.func_globals = func_globals or {}
458
496
 
459
497
  def compile(self, func: Callable) -> StablePipeline:
460
498
  try:
@@ -530,7 +568,7 @@ class PipelineCompiler:
530
568
 
531
569
  namespace = (
532
570
  self.namespaces.get(namespace_name) if namespace_name else None
533
- ) or (globals().get(namespace_name) if namespace_name else None)
571
+ ) or (self.func_globals.get(namespace_name) if namespace_name else None)
534
572
  if namespace and hasattr(namespace, method_name):
535
573
  method = getattr(namespace, method_name)
536
574
 
@@ -552,7 +590,7 @@ class PipelineCompiler:
552
590
 
553
591
  namespace = (
554
592
  self.namespaces.get(namespace_name) if namespace_name else None
555
- ) or (globals().get(namespace_name) if namespace_name else None)
593
+ ) or (self.func_globals.get(namespace_name) if namespace_name else None)
556
594
  if namespace and hasattr(namespace, method_name):
557
595
  method = getattr(namespace, method_name)
558
596
 
@@ -544,7 +544,7 @@ def standardize_street_prefix(
544
544
 
545
545
  Args:
546
546
  col: Column containing street prefix
547
- custom_mappings: Optional dict of custom prefix mappings (case insensitive)
547
+ custom_mappings (Optional): Dict of custom prefix mappings (case insensitive)
548
548
 
549
549
  Returns:
550
550
  Column with standardized prefix (always abbreviated per USPS standards)
@@ -614,7 +614,7 @@ def standardize_street_suffix(
614
614
 
615
615
  Args:
616
616
  col: Column containing street suffix
617
- custom_mappings: Optional dict of custom suffix mappings (case insensitive)
617
+ custom_mappings (Optional): Dict of custom suffix mappings (case insensitive)
618
618
 
619
619
  Returns:
620
620
  Column with standardized suffix (always abbreviated per USPS standards)
@@ -896,7 +896,7 @@ def standardize_unit_type(
896
896
 
897
897
  Args:
898
898
  col: Column containing unit type
899
- custom_mappings: Optional dict of custom unit type mappings
899
+ custom_mappings (Optional): Dict of custom unit type mappings
900
900
 
901
901
  Returns:
902
902
  Column with standardized unit type
@@ -1206,7 +1206,7 @@ def extract_city(col: Column, custom_cities: Optional[List] = None) -> Column:
1206
1206
 
1207
1207
  Args:
1208
1208
  col: Column containing address text
1209
- custom_cities: Optional list of custom city names to recognize (case-insensitive)
1209
+ custom_cities (Optional): List of custom city names to recognize (case-insensitive)
1210
1210
 
1211
1211
  Returns:
1212
1212
  Column with extracted city name or empty string if not found
@@ -1371,7 +1371,7 @@ def extract_state(col: Column, custom_states: Optional[Dict] = None) -> Column:
1371
1371
 
1372
1372
  Args:
1373
1373
  col: Column containing address text with state information
1374
- custom_states: Optional dict mapping full state names to abbreviations
1374
+ custom_states (Optional): Dict mapping full state names to abbreviations
1375
1375
  e.g., {"ONTARIO": "ON", "QUEBEC": "QC"}
1376
1376
 
1377
1377
  Returns:
@@ -1445,9 +1445,9 @@ def validate_city(
1445
1445
 
1446
1446
  Args:
1447
1447
  col: Column containing city names to validate
1448
- known_cities: Optional list of valid city names to check against
1449
- min_length: Minimum valid city name length (default 2)
1450
- max_length: Maximum valid city name length (default 50)
1448
+ known_cities (Optional): List of valid city names to check against
1449
+ min_length (Optional): Minimum valid city name length (default 2)
1450
+ max_length (Optional): Maximum valid city name length (default 50)
1451
1451
 
1452
1452
  Returns:
1453
1453
  Boolean column indicating if city name is valid
@@ -1523,7 +1523,7 @@ def standardize_city(col: Column, custom_mappings: Optional[Dict] = None) -> Col
1523
1523
 
1524
1524
  Args:
1525
1525
  col: Column containing city names to standardize
1526
- custom_mappings: Optional dict for city name corrections/standardization
1526
+ custom_mappings (Optional): Dict for city name corrections/standardization
1527
1527
  e.g., {"ST LOUIS": "St. Louis", "NEWYORK": "New York"}
1528
1528
 
1529
1529
  Returns:
@@ -1807,7 +1807,7 @@ def standardize_country(col: Column, custom_mappings: Optional[dict] = None) ->
1807
1807
 
1808
1808
  Args:
1809
1809
  col: Column containing country name or abbreviation
1810
- custom_mappings: Optional dict of custom country mappings
1810
+ custom_mappings (Optional): Dict of custom country mappings
1811
1811
 
1812
1812
  Returns:
1813
1813
  Column with standardized country name
@@ -255,8 +255,8 @@ def is_valid_email(col: Column, min_length: int = 6, max_length: int = 254) -> C
255
255
 
256
256
  Args:
257
257
  col: Column containing email address
258
- min_length: Minimum length for valid email
259
- max_length: Maximum length for valid email
258
+ min_length (Optional): Minimum length for valid email (default 6)
259
+ max_length (Optional): Maximum length for valid email (default 254)
260
260
 
261
261
  Returns:
262
262
  Column with boolean indicating validity
@@ -286,8 +286,8 @@ def is_valid_username(col: Column, min_length: int = 1, max_length: int = 64) ->
286
286
 
287
287
  Args:
288
288
  col: Column containing email address
289
- min_length: Minimum length for valid username (default 1)
290
- max_length: Maximum length for valid username (default 64 per RFC)
289
+ min_length (Optional): Minimum length for valid username (default 1)
290
+ max_length (Optional): Maximum length for valid username (default 64 per RFC)
291
291
 
292
292
  Returns:
293
293
  Column with boolean indicating username validity
@@ -351,7 +351,7 @@ def is_disposable_email(
351
351
 
352
352
  Args:
353
353
  col: Column containing email address
354
- disposable_domains: List of disposable domains to check against
354
+ disposable_domains (Optional): List of disposable domains to check against
355
355
 
356
356
  Returns:
357
357
  Column with boolean indicating if email is disposable
@@ -389,7 +389,7 @@ def is_corporate_email(
389
389
 
390
390
  Args:
391
391
  col: Column containing email address
392
- free_providers: List of free email provider domains to check against
392
+ free_providers (Optional): List of free email provider domains to check against
393
393
 
394
394
  Returns:
395
395
  Column with boolean indicating if email is corporate
@@ -535,8 +535,8 @@ def fix_common_typos(
535
535
 
536
536
  Args:
537
537
  col: Column containing email address
538
- custom_mappings: Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
539
- custom_tld_mappings: Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
538
+ custom_mappings (Optional): Additional domain mappings to apply (extends DOMAIN_TYPO_MAPPINGS)
539
+ custom_tld_mappings (Optional): Additional TLD mappings to apply (extends TLD_TYPO_MAPPINGS)
540
540
 
541
541
  Returns:
542
542
  Column with typos fixed
@@ -604,10 +604,10 @@ def standardize_email(
604
604
 
605
605
  Args:
606
606
  col: Column containing email address
607
- lowercase: Convert to lowercase
608
- remove_dots_gmail: Remove dots from Gmail addresses
609
- remove_plus: Remove plus addressing
610
- fix_typos: Fix common domain typos
607
+ lowercase (Optional): Convert to lowercase (default True)
608
+ remove_dots_gmail (Optional): Remove dots from Gmail addresses (default True)
609
+ remove_plus (Optional): Remove plus addressing (default False)
610
+ fix_typos (Optional): Fix common domain typos (default True)
611
611
 
612
612
  Returns:
613
613
  Column with standardized email
@@ -756,8 +756,8 @@ def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column
756
756
 
757
757
  Args:
758
758
  col: Column containing email address
759
- mask_char: Character to use for masking
760
- keep_chars: Number of characters to keep at start
759
+ mask_char (Optional): Character to use for masking (default "*")
760
+ keep_chars (Optional): Number of characters to keep at start (default 3)
761
761
 
762
762
  Returns:
763
763
  Column with masked email