datacompose 0.2.4.1__py3-none-any.whl → 0.2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -7,6 +7,7 @@ from pathlib import Path
  import click

  from datacompose.cli.colors import dim, error, highlight, info, success
+ from datacompose.cli.config import ConfigLoader
  from datacompose.cli.validation import validate_platform, validate_type_for_platform
  from datacompose.transformers.discovery import TransformerDiscovery

@@ -85,28 +86,48 @@ _MODULE_DIR = Path(__file__).parent
  @click.option(
  "--target",
  "-t",
- default="pyspark",
+ default=None,
  shell_complete=complete_target,
- help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Default: pyspark",
+ help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Uses default from datacompose.json if not specified",
  )
  @click.option(
  "--type",
  shell_complete=complete_type,
  help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
  )
- @click.option("--output", "-o", help="Output directory (default: build/{target})")
  @click.option(
- "--template-dir",
- default="src/transformers/templates",
- help="Directory containing templates (default: src/transformers/templates)",
+ "--output",
+ "-o",
+ help="Output directory (default: from config or transformers/{target})",
  )
  @click.option("--verbose", "-v", is_flag=True, help="Verbose output")
  @click.pass_context
- def add(ctx, transformer, target, type, output, template_dir, verbose):
+ def add(ctx, transformer, target, type, output, verbose):
  """Add UDFs for transformers.

- TRANSFORMER: Transformer to add UDF for (e.g., 'clean_emails')
+ TRANSFORMER: Transformer to add UDF for (e.g., 'emails')
  """
+ # Load config to get default target if not specified
+ config = ConfigLoader.load_config()
+
+ if target is None:
+ # Try to get default target from config
+ target = ConfigLoader.get_default_target(config)
+ if target is None:
+ print(
+ error(
+ "Error: No target specified and no default target found in datacompose.json"
+ )
+ )
+ print(
+ info(
+ "Please specify a target with --target or run 'datacompose init' to set up defaults"
+ )
+ )
+ ctx.exit(1)
+ elif verbose:
+ print(dim(f"Using default target from config: {target}"))
+
  # Initialize discovery for validation
  discovery = TransformerDiscovery()

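Illustrative usage after this change (a sketch, not taken from the package docs): with a datacompose.json produced by `datacompose init`, the `--target` flag becomes optional.

    # assumes a project whose datacompose.json sets "default_target": "pyspark"
    datacompose add emails                # target resolved from config -> pyspark
    datacompose add emails -t postgres    # an explicit --target/-t still overrides the config
    datacompose add emails                # no config and no flag: prints the error above and exits 1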
@@ -119,12 +140,12 @@ def add(ctx, transformer, target, type, output, template_dir, verbose):
  ctx.exit(1)

  # Combine target and type into generator reference
- exit_code = _run_add(transformer, target, output, template_dir, verbose)
+ exit_code = _run_add(transformer, target, output, verbose)
  if exit_code != 0:
  ctx.exit(exit_code)


- def _run_add(transformer, target, output, template_dir, verbose) -> int:
+ def _run_add(transformer, target, output, verbose) -> int:
  """Execute the add command."""
  # Initialize discovery
  discovery = TransformerDiscovery()
@@ -135,9 +156,7 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
  if not transformer_path:
  print(error(f"Error: Transformer not found: {transformer}"))
  print(
- info(
- f"Available transformers: {', '.join(discovery.list_transformers())}"
- )
+ info(f"Available transformers: {', '.join(discovery.list_transformers())}")
  )
  return 1
  else:
@@ -154,18 +173,28 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
  print(info(f"Available generators: {', '.join(discovery.list_generators())}"))
  return 1

- # Determine output directory - no platform subdirectory needed
+ # Determine output directory
  if not output:
- output_dir = f"build/{transformer_name}"
+ # Try to get output from config first
+ config = ConfigLoader.load_config()
+ config_output = ConfigLoader.get_target_output(config, target)
+ if config_output:
+ # Config output already includes 'transformers/pyspark', so use it directly
+ output_dir = config_output
+ else:
+ output_dir = f"transformers/{target}"
  else:
- output_dir = f"{output}/{transformer_name}"
+ output_dir = output

  try:
  # Create generator instance
+ # Note: template_dir is required by base class but not used by current generators
  generator = generator_class(
- template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
+ template_dir=Path("."), # Placeholder - not actually used
+ output_dir=Path(output_dir),
+ verbose=verbose,
  )
-
+
  # Generate the UDF
  result = generator.generate(
  transformer_name, force=False, transformer_dir=transformer_dir
@@ -178,14 +207,14 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
  print(dim(f" Hash: {result.get('hash', 'N/A')}"))
  else:
  print(success(f"✓ UDF generated: {result['output_path']}"))
- if result.get('test_path'):
+ if result.get("test_path"):
  print(success(f"✓ Test created: {result['test_path']}"))
  print(highlight(f"Function name: {result['function_name']}"))
  if verbose:
  print(dim(f" Target: {target}"))
  print(highlight("\nGenerated package contents:"))
  print(f" - UDF code: {result['output_path']}")
- if result.get('test_path'):
+ if result.get("test_path"):
  print(f" - Test file: {result['test_path']}")

  return 0
@@ -197,4 +226,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:

  traceback.print_exc()
  return 1
-
@@ -18,10 +18,11 @@ from datacompose.cli.colors import dim, error, highlight, info, success

  DEFAULT_CONFIG = {
  "version": "1.0",
+ "default_target": "pyspark",
  "aliases": {"utils": "./src/utils"},
  "targets": {
  "pyspark": {
- "output": "./build",
+ "output": "./transformers/pyspark",
  }
  },
  }
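For reference, the DEFAULT_CONFIG above corresponds to a datacompose.json roughly like the following (an illustrative rendering of the dict, not a file shipped in the wheel):

    {
      "version": "1.0",
      "default_target": "pyspark",
      "aliases": {"utils": "./src/utils"},
      "targets": {
        "pyspark": {
          "output": "./transformers/pyspark"
        }
      }
    }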
@@ -57,7 +58,7 @@ class InitCommand:
  def get_config_template(template_name: str) -> Dict[str, Any]:
  """Get configuration template by name."""
  if template_name == "minimal":
- return {"version": "1.0", "targets": {"pyspark": {"output": "./build"}}}
+ return {"version": "1.0", "default_target": "pyspark", "targets": {"pyspark": {"output": "./transformers/pyspark"}}}
  elif template_name == "advanced":
  config = DEFAULT_CONFIG.copy()
  config.update(
@@ -65,10 +66,10 @@ class InitCommand:
  "style": "custom",
  "aliases": {
  "utils": "./src/utils",
- "build": "./build",
+ "transformers": "./transformers",
  },
  "include": ["src/**/*"],
- "exclude": ["__pycache__", "build", "*.pyc", ".pytest_cache"],
+ "exclude": ["__pycache__", "transformers", "*.pyc", ".pytest_cache"],
  "testing": {"framework": "pytest", "test_dir": "./tests"},
  }
  )
@@ -184,7 +185,7 @@ class InitCommand:

  # Select targets with multi-select
  available_targets = {
- "pyspark": {"output": "./build/pyspark", "name": "PySpark (Apache Spark)"},
+ "pyspark": {"output": "./transformers/pyspark", "name": "PySpark (Apache Spark)"},
  }

  selected_targets = InitCommand.prompt_for_targets(available_targets)
@@ -199,6 +200,31 @@ class InitCommand:

  # Update targets with user selections
  config["targets"] = selected_targets
+
+ # Set default target to the first selected target (or only target if single)
+ target_keys = list(selected_targets.keys())
+ if len(target_keys) == 1:
+ config["default_target"] = target_keys[0]
+ elif len(target_keys) > 1:
+ # Ask user to select default target
+ print(highlight("\nSelect Default Target"))
+ print(dim("Which platform should be used by default when running 'datacompose add'?\n"))
+ for i, key in enumerate(target_keys, 1):
+ print(f" {i}. {key}")
+ print()
+
+ while True:
+ choice = input(f"Select default target (1-{len(target_keys)}): ").strip()
+ try:
+ choice_idx = int(choice) - 1
+ if 0 <= choice_idx < len(target_keys):
+ config["default_target"] = target_keys[choice_idx]
+ print(dim(f"Default target set to: {target_keys[choice_idx]}\n"))
+ break
+ else:
+ print(error("Invalid selection. Please try again."))
+ except ValueError:
+ print(error("Please enter a number."))

  print() # Add spacing
  return config
@@ -403,11 +429,11 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
  "2. Source your shell config or restart terminal for tab completion"
  )
  print(
- "3. Add your first transformer: datacompose add clean_emails --target pyspark"
+ "3. Add your first transformer: datacompose add emails"
  )
  else:
  print(
- "2. Add your first transformer: datacompose add clean_emails --target pyspark"
+ "2. Add your first transformer: datacompose add emails"
  )
  if not skip_completion:
  print(
@@ -419,7 +445,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
  print(success("✓ Tab completion configured"))
  print(
  highlight(
- "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+ "\nRun 'datacompose add emails' to get started"
  )
  )
  print(
@@ -430,7 +456,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
  else:
  print(
  highlight(
- "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+ "\nRun 'datacompose add emails' to get started"
  )
  )
  if not skip_completion and not yes:
@@ -95,7 +95,7 @@ class ListCommand:
  print(f" • {transformer_name}")

  print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
- print("Example: datacompose add clean_emails --target pyspark")
+ print("Example: datacompose add emails --target pyspark")
  return 0

  @staticmethod
@@ -114,5 +114,5 @@ class ListCommand:
  print(f" • {gen_type} ({gen_class.__name__})")

  print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
- print("Example: datacompose add clean_emails --target pyspark")
+ print("Example: datacompose add emails --target pyspark")
  return 0
datacompose/cli/config.py ADDED
@@ -0,0 +1,80 @@
+ """
+ Configuration management for Datacompose CLI.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+
+ class ConfigLoader:
+ """Load and manage Datacompose configuration."""
+
+ DEFAULT_CONFIG_FILE = "datacompose.json"
+
+ @staticmethod
+ def load_config(config_path: Optional[Path] = None) -> Optional[Dict[str, Any]]:
+ """Load configuration from datacompose.json.
+
+ Args:
+ config_path: Optional path to config file. Defaults to ./datacompose.json
+
+ Returns:
+ Config dictionary or None if not found
+ """
+ if config_path is None:
+ config_path = Path(ConfigLoader.DEFAULT_CONFIG_FILE)
+
+ if not config_path.exists():
+ return None
+
+ try:
+ with open(config_path, 'r') as f:
+ return json.load(f)
+ except (json.JSONDecodeError, IOError):
+ return None
+
+ @staticmethod
+ def get_default_target(config: Optional[Dict[str, Any]] = None) -> Optional[str]:
+ """Get the default target from config.
+
+ Args:
+ config: Optional config dict. If None, will load from file.
+
+ Returns:
+ Default target name or None
+ """
+ if config is None:
+ config = ConfigLoader.load_config()
+
+ if not config:
+ return None
+
+ # Check for explicit default_target setting
+ if "default_target" in config:
+ return config["default_target"]
+
+ # Otherwise use the first target if only one exists
+ targets = config.get("targets", {})
+ if len(targets) == 1:
+ return list(targets.keys())[0]
+
+ return None
+
+ @staticmethod
+ def get_target_output(config: Optional[Dict[str, Any]], target: str) -> Optional[str]:
+ """Get the output directory for a specific target.
+
+ Args:
+ config: Config dictionary
+ target: Target name
+
+ Returns:
+ Output directory path or None
+ """
+ if not config:
+ return None
+
+ targets = config.get("targets", {})
+ target_config = targets.get(target, {})
+ return target_config.get("output")
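A minimal sketch of how the new ConfigLoader is consumed by the add command (method names and behavior are taken from the code above; the concrete values in the comments are only examples):

    from datacompose.cli.config import ConfigLoader

    # Reads ./datacompose.json by default; returns None if the file is missing or not valid JSON
    config = ConfigLoader.load_config()

    # An explicit "default_target" key wins; otherwise the sole entry under "targets" is used
    target = ConfigLoader.get_default_target(config)  # e.g. "pyspark"

    # Per-target output directory, e.g. "./transformers/pyspark", or None if unset
    output_dir = ConfigLoader.get_target_output(config, target)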
datacompose/cli/main.py CHANGED
@@ -25,9 +25,9 @@ def cli(ctx):
  """Generate data cleaning UDFs for various platforms.

  Examples:
- datacompose init
- datacompose add clean_emails --target pyspark
- datacompose add clean_emails --target snowflake --output sql/udfs/
+ datacompose init # Set up project with default target
+ datacompose add emails # Uses default target from config
+ datacompose add emails --target snowflake --output sql/udfs/
  datacompose list targets
  """
  pass
@@ -105,14 +105,14 @@ class BaseGenerator(ABC):

  def _ensure_init_files(self, output_path: Path):
  """Ensure __init__.py files exist to make directories importable."""
- # Get all directories from build down to the target directory
+ # Get all directories from transformers down to the target directory
  path_parts = output_path.parts

- # Find the build directory index
+ # Find the transformers directory index
  try:
- build_index = path_parts.index("build")
+ transformers_index = path_parts.index("transformers")
  except ValueError:
- # No build directory found, just create init for immediate parent
+ # No transformers directory found, just create init for immediate parent
  init_file = output_path.parent / "__init__.py"
  if not init_file.exists():
  init_file.touch()
@@ -120,9 +120,9 @@ class BaseGenerator(ABC):
  print(f"Created {init_file}")
  return

- # Create __init__.py files for build and all subdirectories leading to output
+ # Create __init__.py files for transformers and all subdirectories leading to output
  for i in range(
- build_index, len(path_parts) - 1
+ transformers_index, len(path_parts) - 1
  ): # -1 to exclude the file itself
  dir_path = Path(*path_parts[: i + 1])
  init_file = dir_path / "__init__.py"
@@ -133,18 +133,19 @@ class BaseGenerator(ABC):


  def _copy_utils_files(self, output_path: Path):
- """Copy utility files like primitives.py to the build root directory."""
- # Find the build directory root
+ """Copy utility files like primitives.py to the transformers directory."""
+ # Find the transformers directory root
  path_parts = output_path.parts
  try:
- build_index = path_parts.index("build")
- build_root = Path(*path_parts[:build_index + 1])
+ transformers_index = path_parts.index("transformers")
+ transformers_root = Path(*path_parts[:transformers_index + 1])
  except ValueError:
- # Fallback to parent directory if no 'build' in path
- build_root = output_path.parent.parent
+ # Fallback to parent directory if no 'transformers' in path
+ transformers_root = output_path.parent.parent

- # Create utils directory at build root
- utils_dir = build_root / "utils"
+ # Create utils directory in the same directory as the generated files
+ # This puts it at transformers/pyspark/utils
+ utils_dir = output_path.parent / "utils"
  utils_dir.mkdir(parents=True, exist_ok=True)

  # Create __init__.py in utils directory
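Taken together with the generator change below, a run such as `datacompose add emails` would now lay files out roughly like this (an illustrative sketch; exact contents depend on the transformer):

    transformers/
        __init__.py
        pyspark/
            __init__.py
            emails.py            # generated primitives (previously email_primitives.py under build/)
            utils/
                __init__.py
                primitives.py    # copied helper, imported as utils.primitives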
@@ -39,13 +39,8 @@ class SparkPandasUDFGenerator(BaseGenerator):

  def _get_output_filename(self, transformer_name: str) -> str:
  """Get the output filename for PySpark primitives."""
- # Map transformer names to their primitive namespace names
- name_mapping = {
- "clean_emails": "email_primitives",
- "clean_addresses": "address_primitives",
- "clean_phone_numbers": "phone_primitives"
- }
-
- # Use mapped name if available, otherwise fall back to transformer_name
- output_name = name_mapping.get(transformer_name, f"{transformer_name}_primitives")
- return f"{output_name}.py"
+ # Use the transformer name directly as the filename
+ # emails -> emails.py
+ # addresses -> addresses.py
+ # phone_numbers -> phone_numbers.py
+ return f"{transformer_name}.py"
@@ -1,3 +1,54 @@
+ """
+ Address transformation primitives for PySpark.
+
+ Preview Output:
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
+ |address |street_number|street_name|city |state|zip |
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
+ | 123 Main St, New York, NY 10001 |123 |Main |New York |NY |10001 |
+ |456 oak ave apt 5b, los angeles, ca 90001 |456 |Oak |Los Angeles|CA |90001 |
+ |789 ELM STREET CHICAGO IL 60601 |789 |Elm |Chicago |IL |60601 |
+ |321 pine rd. suite 100,, boston massachusetts|321 |Pine |Boston |MA |null |
+ |PO Box 789, Atlanta, GA 30301 |null |null |Atlanta |GA |30301 |
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
+
+ Usage Example:
+ from pyspark.sql import SparkSession
+ from pyspark.sql import functions as F
+ from transformers.pyspark.addresses import addresses
+
+ # Initialize Spark
+ spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
+
+ # Create sample data
+ data = [
+ ("123 Main St, New York, NY 10001",),
+ ("456 Oak Ave Apt 5B, Los Angeles, CA 90001",),
+ ("789 Elm Street, Chicago, IL 60601",),
+ ("321 Pine Road Suite 100, Boston, MA 02101",),
+ ]
+ df = spark.createDataFrame(data, ["address"])
+
+ # Extract and standardize address components
+ result_df = df.select(
+ F.col("address"),
+ addresses.extract_street_number(F.col("address")).alias("street_number"),
+ addresses.extract_street_name(F.col("address")).alias("street_name"),
+ addresses.extract_city(F.col("address")).alias("city"),
+ addresses.extract_state(F.col("address")).alias("state"),
+ addresses.extract_zip_code(F.col("address")).alias("zip")
+ )
+
+ # Show results
+ result_df.show(truncate=False)
+
+ # Filter to valid addresses
+ valid_addresses = result_df.filter(addresses.validate_zip_code(F.col("zip")))
+
+ Installation:
+ datacompose add addresses
+ """
+
  import re
  from typing import TYPE_CHECKING, Dict, List, Optional

@@ -16,7 +67,7 @@ else:

  try:
  # Try local utils import first (for generated code)
- from utils.primitives import PrimitiveRegistry
+ from utils.primitives import PrimitiveRegistry # type: ignore
  except ImportError:
  # Fall back to installed datacompose package
  from datacompose.operators.primitives import PrimitiveRegistry
@@ -345,8 +396,10 @@ def extract_street_name(col: Column) -> Column:
  trimmed_col = F.trim(col)
  without_number = F.when(
  # If it's just a numbered street (e.g., "5th Avenue", "1st Street")
- trimmed_col.rlike(r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"),
- trimmed_col # Keep as is - it's a numbered street name
+ trimmed_col.rlike(
+ r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"
+ ),
+ trimmed_col, # Keep as is - it's a numbered street name
  ).otherwise(
  # Otherwise remove the house number
  F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
@@ -354,9 +407,7 @@ def extract_street_name(col: Column) -> Column:

  # Remove directional prefix - case insensitive
  # Include full directional words and abbreviations
- prefix_pattern = (
- r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
- )
+ prefix_pattern = r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
  without_prefix = F.regexp_replace(without_number, prefix_pattern, "")

  # Extract everything before the street suffix - case insensitive
@@ -434,8 +485,10 @@ def extract_street_suffix(col: Column) -> Column:

  # Build pattern to match the LAST suffix in the string
  # This handles cases like "St. James Place" where we want "Place" not "St"
- suffix_pattern = r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
-
+ suffix_pattern = (
+ r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
+ )
+
  # Extract the last matching suffix - case insensitive
  suffix_pattern_ci = r"(?i)" + suffix_pattern
  result = F.regexp_extract(col, suffix_pattern_ci, 1)
@@ -653,25 +706,27 @@ def standardize_street_suffix(
  if col is None:
  return F.lit("")
  col = F.when(col.isNull(), F.lit("")).otherwise(col)
-
+
  # Convert to uppercase for matching
  upper_col = F.upper(F.trim(col))

  # Start with the original column
  result = col
-
+
  # Apply custom mappings first if provided (they take precedence)
  if custom_mappings:
  for original, standard in custom_mappings.items():
  result = F.when(
  upper_col == F.upper(F.lit(original)), F.lit(standard)
  ).otherwise(result)
-
+
  # Then apply standard mappings for anything not already mapped
  # Need to check if result has changed to avoid overwriting custom mappings
  for original, standard in suffix_map.items():
  # Only apply if not already mapped by custom mappings
- if custom_mappings and original.upper() in [k.upper() for k in custom_mappings.keys()]:
+ if custom_mappings and original.upper() in [
+ k.upper() for k in custom_mappings.keys()
+ ]:
  continue
  result = F.when(upper_col == original, F.lit(standard)).otherwise(result)

@@ -1005,7 +1060,7 @@ def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
  Column with formatted secondary address

  Example:
- from datacompose.transformers.text.clean_addresses.pyspark.pyspark_udf import format_secondary_address
+ from datacompose.transformers.text.addresses.pyspark.pyspark_udf import format_secondary_address
  df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
  # -> "Apt 5B"
  """
@@ -1,3 +1,55 @@
+ """
+ Email transformation primitives for PySpark.
+
+ Preview Output:
+ +---------------------------+----------------------+-------------+----------------+--------+
+ |email |standardized |username |domain |is_valid|
+ +---------------------------+----------------------+-------------+----------------+--------+
+ | John.Doe@Gmail.COM |john.doe@gmail.com |john.doe |gmail.com |true |
+ |JANE.SMITH@OUTLOOK.COM |jane.smith@outlook.com|jane.smith |outlook.com |true |
+ | info@company-name.org |info@company-name.org |info |company-name.org|true |
+ |invalid.email@ |null |null |null |false |
+ |user+tag@domain.co.uk |user+tag@domain.co.uk |user+tag |domain.co.uk |true |
+ |bad email@test.com |null |null |null |false |
+ +---------------------------+----------------------+-------------+----------------+--------+
+
+ Usage Example:
+ from pyspark.sql import SparkSession
+ from pyspark.sql import functions as F
+ from transformers.pyspark.emails import emails
+
+ # Initialize Spark
+ spark = SparkSession.builder.appName("EmailCleaning").getOrCreate()
+
+ # Create sample data
+ data = [
+ ("john.doe@gmail.com",),
+ ("JANE.SMITH@OUTLOOK.COM",),
+ ("info@company-name.org",),
+ ("invalid.email@",),
+ ("user+tag@domain.co.uk",),
+ ]
+ df = spark.createDataFrame(data, ["email"])
+
+ # Extract and validate email components
+ result_df = df.select(
+ F.col("email"),
+ emails.standardize_email(F.col("email")).alias("standardized"),
+ emails.extract_username(F.col("email")).alias("username"),
+ emails.extract_domain(F.col("email")).alias("domain"),
+ emails.is_valid_email(F.col("email")).alias("is_valid")
+ )
+
+ # Show results
+ result_df.show(truncate=False)
+
+ # Filter to valid emails only
+ valid_emails = result_df.filter(F.col("is_valid") == True)
+
+ Installation:
+ datacompose add emails
+ """
+
  import re
  from typing import TYPE_CHECKING, Dict, List, Optional

@@ -16,7 +68,7 @@ else:

  try:
  # Try local utils import first (for generated code)
- from utils.primitives import PrimitiveRegistry
+ from utils.primitives import PrimitiveRegistry # type: ignore
  except ImportError:
  # Fall back to installed datacompose package
  from datacompose.operators.primitives import PrimitiveRegistry