PyPI - datacompose - Versions diffs - 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl - Mend

datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datacompose might be problematic. Click here for more details.

Files changed (24)

datacompose/cli/__init__.py CHANGED Viewed

@@ -2,4 +2,4 @@
 Datacompose CLI - Command-line interface for generating data cleaning UDFs.
 """
-__version__ = "0.2.4"
+__version__ = "0.2.6.0"

datacompose/cli/commands/add.py CHANGED Viewed

@@ -7,6 +7,7 @@ from pathlib import Path
 import click
 from datacompose.cli.colors import dim, error, highlight, info, success
+from datacompose.cli.config import ConfigLoader
 from datacompose.cli.validation import validate_platform, validate_type_for_platform
 from datacompose.transformers.discovery import TransformerDiscovery
@@ -85,28 +86,48 @@ _MODULE_DIR = Path(__file__).parent
 @click.option(
     "--target",
     "-t",
-    default="pyspark",
+    default=None,
     shell_complete=complete_target,
-    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Default: pyspark",
+    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Uses default from datacompose.json if not specified",
 )
 @click.option(
     "--type",
     shell_complete=complete_type,
     help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
 )
-@click.option("--output", "-o", help="Output directory (default: build/{target})")
 @click.option(
-    "--template-dir",
-    default="src/transformers/templates",
-    help="Directory containing templates (default: src/transformers/templates)",
+    "--output",
+    "-o",
+    help="Output directory (default: from config or transformers/{target})",
 )
 @click.option("--verbose", "-v", is_flag=True, help="Verbose output")
 @click.pass_context
-def add(ctx, transformer, target, type, output, template_dir, verbose):
+def add(ctx, transformer, target, type, output, verbose):
     """Add UDFs for transformers.
-    TRANSFORMER: Transformer to add UDF for (e.g., 'clean_emails')
+    TRANSFORMER: Transformer to add UDF for (e.g., 'emails')
     """
+    # Load config to get default target if not specified
+    config = ConfigLoader.load_config()
+    if target is None:
+        # Try to get default target from config
+        target = ConfigLoader.get_default_target(config)
+        if target is None:
+            print(
+                error(
+                    "Error: No target specified and no default target found in datacompose.json"
+                )
+            )
+            print(
+                info(
+                    "Please specify a target with --target or run 'datacompose init' to set up defaults"
+                )
+            )
+            ctx.exit(1)
+        elif verbose:
+            print(dim(f"Using default target from config: {target}"))
     # Initialize discovery for validation
     discovery = TransformerDiscovery()
@@ -119,12 +140,12 @@ def add(ctx, transformer, target, type, output, template_dir, verbose):
         ctx.exit(1)
     # Combine target and type into generator reference
-    exit_code = _run_add(transformer, target, output, template_dir, verbose)
+    exit_code = _run_add(transformer, target, output, verbose)
     if exit_code != 0:
         ctx.exit(exit_code)
-def _run_add(transformer, target, output, template_dir, verbose) -> int:
+def _run_add(transformer, target, output, verbose) -> int:
     """Execute the add command."""
     # Initialize discovery
     discovery = TransformerDiscovery()
@@ -135,9 +156,7 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
     if not transformer_path:
         print(error(f"Error: Transformer not found: {transformer}"))
         print(
-            info(
-                f"Available transformers: {', '.join(discovery.list_transformers())}"
-            )
+            info(f"Available transformers: {', '.join(discovery.list_transformers())}")
         )
         return 1
     else:
@@ -154,18 +173,28 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
         print(info(f"Available generators: {', '.join(discovery.list_generators())}"))
         return 1
-    # Determine output directory - no platform subdirectory needed
+    # Determine output directory
     if not output:
-        output_dir = f"build/{transformer_name}"
+        # Try to get output from config first
+        config = ConfigLoader.load_config()
+        config_output = ConfigLoader.get_target_output(config, target)
+        if config_output:
+            # Config output already includes 'transformers/pyspark', so use it directly
+            output_dir = config_output
+        else:
+            output_dir = f"transformers/{target}"
     else:
-        output_dir = f"{output}/{transformer_name}"
+        output_dir = output
     try:
         # Create generator instance
+        # Note: template_dir is required by base class but not used by current generators
         generator = generator_class(
-            template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
+            template_dir=Path("."),  # Placeholder - not actually used
+            output_dir=Path(output_dir),
+            verbose=verbose,
         )
         # Generate the UDF
         result = generator.generate(
             transformer_name, force=False, transformer_dir=transformer_dir
@@ -178,14 +207,14 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
                 print(dim(f"   Hash: {result.get('hash', 'N/A')}"))
         else:
             print(success(f"✓ UDF generated: {result['output_path']}"))
-            if result.get('test_path'):
+            if result.get("test_path"):
                 print(success(f"✓ Test created: {result['test_path']}"))
             print(highlight(f"Function name: {result['function_name']}"))
             if verbose:
                 print(dim(f"   Target: {target}"))
                 print(highlight("\nGenerated package contents:"))
                 print(f"  - UDF code: {result['output_path']}")
-                if result.get('test_path'):
+                if result.get("test_path"):
                     print(f"  - Test file: {result['test_path']}")
         return 0
@@ -197,4 +226,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
             traceback.print_exc()
         return 1

datacompose/cli/commands/init.py CHANGED Viewed

@@ -18,10 +18,11 @@ from datacompose.cli.colors import dim, error, highlight, info, success
 DEFAULT_CONFIG = {
     "version": "1.0",
+    "default_target": "pyspark",
     "aliases": {"utils": "./src/utils"},
     "targets": {
         "pyspark": {
-            "output": "./build",
+            "output": "./transformers/pyspark",
         }
     },
 }
@@ -57,7 +58,7 @@ class InitCommand:
     def get_config_template(template_name: str) -> Dict[str, Any]:
         """Get configuration template by name."""
         if template_name == "minimal":
-            return {"version": "1.0", "targets": {"pyspark": {"output": "./build"}}}
+            return {"version": "1.0", "default_target": "pyspark", "targets": {"pyspark": {"output": "./transformers/pyspark"}}}
         elif template_name == "advanced":
             config = DEFAULT_CONFIG.copy()
             config.update(
@@ -65,10 +66,10 @@ class InitCommand:
                     "style": "custom",
                     "aliases": {
                         "utils": "./src/utils",
-                        "build": "./build",
+                        "transformers": "./transformers",
                     },
                     "include": ["src/**/*"],
-                    "exclude": ["__pycache__", "build", "*.pyc", ".pytest_cache"],
+                    "exclude": ["__pycache__", "transformers", "*.pyc", ".pytest_cache"],
                     "testing": {"framework": "pytest", "test_dir": "./tests"},
                 }
             )
@@ -184,7 +185,7 @@ class InitCommand:
         # Select targets with multi-select
         available_targets = {
-            "pyspark": {"output": "./build/pyspark", "name": "PySpark (Apache Spark)"},
+            "pyspark": {"output": "./transformers/pyspark", "name": "PySpark (Apache Spark)"},
         }
         selected_targets = InitCommand.prompt_for_targets(available_targets)
@@ -199,6 +200,31 @@ class InitCommand:
         # Update targets with user selections
         config["targets"] = selected_targets
+        # Set default target to the first selected target (or only target if single)
+        target_keys = list(selected_targets.keys())
+        if len(target_keys) == 1:
+            config["default_target"] = target_keys[0]
+        elif len(target_keys) > 1:
+            # Ask user to select default target
+            print(highlight("\nSelect Default Target"))
+            print(dim("Which platform should be used by default when running 'datacompose add'?\n"))
+            for i, key in enumerate(target_keys, 1):
+                print(f"  {i}. {key}")
+            print()
+            while True:
+                choice = input(f"Select default target (1-{len(target_keys)}): ").strip()
+                try:
+                    choice_idx = int(choice) - 1
+                    if 0 <= choice_idx < len(target_keys):
+                        config["default_target"] = target_keys[choice_idx]
+                        print(dim(f"Default target set to: {target_keys[choice_idx]}\n"))
+                        break
+                    else:
+                        print(error("Invalid selection. Please try again."))
+                except ValueError:
+                    print(error("Please enter a number."))
         print()  # Add spacing
         return config
@@ -403,11 +429,11 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
                     "2. Source your shell config or restart terminal for tab completion"
                 )
                 print(
-                    "3. Add your first transformer: datacompose add clean_emails --target pyspark"
+                    "3. Add your first transformer: datacompose add emails"
                 )
             else:
                 print(
-                    "2. Add your first transformer: datacompose add clean_emails --target pyspark"
+                    "2. Add your first transformer: datacompose add emails"
                 )
                 if not skip_completion:
                     print(
@@ -419,7 +445,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
                 print(success("✓ Tab completion configured"))
                 print(
                     highlight(
-                        "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+                        "\nRun 'datacompose add emails' to get started"
                     )
                 )
                 print(
@@ -430,7 +456,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
             else:
                 print(
                     highlight(
-                        "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+                        "\nRun 'datacompose add emails' to get started"
                     )
                 )
                 if not skip_completion and not yes:

datacompose/cli/commands/list.py CHANGED Viewed

@@ -95,7 +95,7 @@ class ListCommand:
                 print(f"    • {transformer_name}")
         print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-        print("Example: datacompose add clean_emails --target pyspark")
+        print("Example: datacompose add emails --target pyspark")
         return 0
     @staticmethod
@@ -114,5 +114,5 @@ class ListCommand:
                 print(f"    • {gen_type} ({gen_class.__name__})")
         print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-        print("Example: datacompose add clean_emails --target pyspark")
+        print("Example: datacompose add emails --target pyspark")
         return 0

datacompose/cli/config.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""
+Configuration management for Datacompose CLI.
+"""
+import json
+from pathlib import Path
+from typing import Any, Dict, Optional
+class ConfigLoader:
+    """Load and manage Datacompose configuration."""
+    DEFAULT_CONFIG_FILE = "datacompose.json"
+    @staticmethod
+    def load_config(config_path: Optional[Path] = None) -> Optional[Dict[str, Any]]:
+        """Load configuration from datacompose.json.
+        Args:
+            config_path: Optional path to config file. Defaults to ./datacompose.json
+        Returns:
+            Config dictionary or None if not found
+        """
+        if config_path is None:
+            config_path = Path(ConfigLoader.DEFAULT_CONFIG_FILE)
+        if not config_path.exists():
+            return None
+        try:
+            with open(config_path, 'r') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+    @staticmethod
+    def get_default_target(config: Optional[Dict[str, Any]] = None) -> Optional[str]:
+        """Get the default target from config.
+        Args:
+            config: Optional config dict. If None, will load from file.
+        Returns:
+            Default target name or None
+        """
+        if config is None:
+            config = ConfigLoader.load_config()
+        if not config:
+            return None
+        # Check for explicit default_target setting
+        if "default_target" in config:
+            return config["default_target"]
+        # Otherwise use the first target if only one exists
+        targets = config.get("targets", {})
+        if len(targets) == 1:
+            return list(targets.keys())[0]
+        return None
+    @staticmethod
+    def get_target_output(config: Optional[Dict[str, Any]], target: str) -> Optional[str]:
+        """Get the output directory for a specific target.
+        Args:
+            config: Config dictionary
+            target: Target name
+        Returns:
+            Output directory path or None
+        """
+        if not config:
+            return None
+        targets = config.get("targets", {})
+        target_config = targets.get(target, {})
+        return target_config.get("output")

datacompose/cli/main.py CHANGED Viewed

@@ -25,9 +25,9 @@ def cli(ctx):
     """Generate data cleaning UDFs for various platforms.
     Examples:
-      datacompose init
-      datacompose add clean_emails --target pyspark
-      datacompose add clean_emails --target snowflake --output sql/udfs/
+      datacompose init                  # Set up project with default target
+      datacompose add emails            # Uses default target from config
+      datacompose add emails --target snowflake --output sql/udfs/
       datacompose list targets
     """
     pass

datacompose/generators/base.py CHANGED Viewed

@@ -105,14 +105,14 @@ class BaseGenerator(ABC):
     def _ensure_init_files(self, output_path: Path):
         """Ensure __init__.py files exist to make directories importable."""
-        # Get all directories from build down to the target directory
+        # Get all directories from transformers down to the target directory
         path_parts = output_path.parts
-        # Find the build directory index
+        # Find the transformers directory index
         try:
-            build_index = path_parts.index("build")
+            transformers_index = path_parts.index("transformers")
         except ValueError:
-            # No build directory found, just create init for immediate parent
+            # No transformers directory found, just create init for immediate parent
             init_file = output_path.parent / "__init__.py"
             if not init_file.exists():
                 init_file.touch()
@@ -120,9 +120,9 @@ class BaseGenerator(ABC):
                     print(f"Created {init_file}")
             return
-        # Create __init__.py files for build and all subdirectories leading to output
+        # Create __init__.py files for transformers and all subdirectories leading to output
         for i in range(
-            build_index, len(path_parts) - 1
+            transformers_index, len(path_parts) - 1
         ):  # -1 to exclude the file itself
             dir_path = Path(*path_parts[: i + 1])
             init_file = dir_path / "__init__.py"
@@ -133,18 +133,19 @@ class BaseGenerator(ABC):
     def _copy_utils_files(self, output_path: Path):
-        """Copy utility files like primitives.py to the build root directory."""
-        # Find the build directory root
+        """Copy utility files like primitives.py to the transformers directory."""
+        # Find the transformers directory root
         path_parts = output_path.parts
         try:
-            build_index = path_parts.index("build")
-            build_root = Path(*path_parts[:build_index + 1])
+            transformers_index = path_parts.index("transformers")
+            transformers_root = Path(*path_parts[:transformers_index + 1])
         except ValueError:
-            # Fallback to parent directory if no 'build' in path
-            build_root = output_path.parent.parent
+            # Fallback to parent directory if no 'transformers' in path
+            transformers_root = output_path.parent.parent
-        # Create utils directory at build root
-        utils_dir = build_root / "utils"
+        # Create utils directory in the same directory as the generated files
+        # This puts it at transformers/pyspark/utils
+        utils_dir = output_path.parent / "utils"
         utils_dir.mkdir(parents=True, exist_ok=True)
         # Create __init__.py in utils directory

datacompose/generators/pyspark/generator.py CHANGED Viewed

@@ -39,13 +39,8 @@ class SparkPandasUDFGenerator(BaseGenerator):
     def _get_output_filename(self, transformer_name: str) -> str:
         """Get the output filename for PySpark primitives."""
-        # Map transformer names to their primitive namespace names
-        name_mapping = {
-            "clean_emails": "email_primitives",
-            "clean_addresses": "address_primitives",
-            "clean_phone_numbers": "phone_primitives"
-        }
-        # Use mapped name if available, otherwise fall back to transformer_name
-        output_name = name_mapping.get(transformer_name, f"{transformer_name}_primitives")
-        return f"{output_name}.py"
+        # Use the transformer name directly as the filename
+        # emails -> emails.py
+        # addresses -> addresses.py
+        # phone_numbers -> phone_numbers.py
+        return f"{transformer_name}.py"

datacompose/operators/__init__.py CHANGED Viewed

@@ -18,4 +18,4 @@ __all__ = [
     "PrimitiveRegistry",
 ]
-__version__ = "0.2.4"
+__version__ = "0.2.6.0"

datacompose/operators/primitives.py CHANGED Viewed

@@ -16,9 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
 logger = logging.getLogger(__name__)
 try:
-    from pyspark.sql import Column  # type: ignore
+    from pyspark.sql import Column
+    from pyspark.sql import functions as F
 except ImportError:
-    pass
+    logging.debug("PySpark not available")
+# Set up module logger
+logger = logging.getLogger(__name__)
 class SmartPrimitive:
@@ -120,11 +124,15 @@ class PrimitiveRegistry:
         self._primitives = {}
         self._conditionals = {}
-    def register(self, name: Optional[str] = None, is_conditional: bool = False):
+    def register(
+        self, name: Optional[str] = None, is_conditional: Optional[bool] = None
+    ):
         """Decorator to register a function as a SmartPrimitive in this namespace.
         Args:
             name: Optional name for the primitive (defaults to function name)
+            is_conditional: Optional flag to mark as conditional. If None, auto-detects
+                          based on function name patterns.
         Returns:
             Decorator function that wraps the target function as a SmartPrimitive
@@ -139,7 +147,29 @@ class PrimitiveRegistry:
         def decorator(func: Callable):
             primitive_name = name or func.__name__
-            if is_conditional:
+            # Auto-detect conditional if not explicitly specified
+            if is_conditional is None:
+                # Check common naming patterns for conditional functions
+                conditional_patterns = [
+                    "is_",
+                    "has_",
+                    "needs_",
+                    "should_",
+                    "can_",
+                    "contains_",
+                    "matches_",
+                    "equals_",
+                    "starts_with_",
+                    "ends_with_",
+                ]
+                is_conditional_auto = any(
+                    primitive_name.startswith(pattern)
+                    for pattern in conditional_patterns
+                )
+            else:
+                is_conditional_auto = is_conditional
+            if is_conditional_auto:
                 self._conditionals[primitive_name] = SmartPrimitive(
                     func, primitive_name
                 )
@@ -217,9 +247,17 @@ class PrimitiveRegistry:
                 pipeline.__doc__ = func.__doc__
                 return pipeline
+            # Auto-detect ALL namespace instances from func.__globals__
+            # This allows using multiple namespaces without explicitly passing them
+            for var_name, var_value in func.__globals__.items():
+                if isinstance(var_value, PrimitiveRegistry):
+                    # Found a namespace instance
+                    if var_name not in namespaces:
+                        namespaces[var_name] = var_value
             # Try to get the function as a string and parse it
             try:
-                compiler = PipelineCompiler(namespaces, debug)
+                compiler = PipelineCompiler(namespaces, debug, func.__globals__)
                 pipeline = compiler.compile(func)
                 if debug and pipeline.steps:
@@ -270,7 +308,11 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
                         method_name = node.value.func.attr
                         namespace = (
                             namespaces.get(namespace_name) if namespace_name else None
-                        ) or (globals().get(namespace_name) if namespace_name else None)
+                        ) or (
+                            func.__globals__.get(namespace_name)
+                            if namespace_name
+                            else None
+                        )
                         if namespace and hasattr(namespace, method_name):
                             method = getattr(namespace, method_name)
@@ -312,16 +354,6 @@ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable
         return pipeline
-try:
-    from pyspark.sql import Column
-    from pyspark.sql import functions as F
-except ImportError:
-    logging.debug("PySpark not available")
-# Set up module logger
-logger = logging.getLogger(__name__)
 @dataclass
 class CompiledStep:
     """A compiled pipeline step"""
@@ -452,9 +484,15 @@ class StablePipeline:
 class PipelineCompiler:
-    def __init__(self, namespaces: Dict[str, Any], debug: bool = False):
+    def __init__(
+        self,
+        namespaces: Dict[str, Any],
+        debug: bool = False,
+        func_globals: Optional[Dict] = None,
+    ):
         self.namespaces = namespaces
         self.debug = debug
+        self.func_globals = func_globals or {}
     def compile(self, func: Callable) -> StablePipeline:
         try:
@@ -530,7 +568,7 @@ class PipelineCompiler:
             namespace = (
                 self.namespaces.get(namespace_name) if namespace_name else None
-            ) or (globals().get(namespace_name) if namespace_name else None)
+            ) or (self.func_globals.get(namespace_name) if namespace_name else None)
             if namespace and hasattr(namespace, method_name):
                 method = getattr(namespace, method_name)
@@ -552,7 +590,7 @@ class PipelineCompiler:
             namespace = (
                 self.namespaces.get(namespace_name) if namespace_name else None
-            ) or (globals().get(namespace_name) if namespace_name else None)
+            ) or (self.func_globals.get(namespace_name) if namespace_name else None)
             if namespace and hasattr(namespace, method_name):
                 method = getattr(namespace, method_name)

datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl

Potentially problematic release.

datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl