PyPI - easy-data-loader - Versions diffs - 0.1.0__py3-none-any.whl - Mend

easy-data-loader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

easy_data_loader/__init__.py +11 -0
easy_data_loader/cli.py +302 -0
easy_data_loader/config_loader.py +184 -0
easy_data_loader/custom_exceptions.py +21 -0
easy_data_loader/database_connector.py +190 -0
easy_data_loader/database_operations.py +129 -0
easy_data_loader/driver_detector.py +46 -0
easy_data_loader/file_operations.py +146 -0
easy_data_loader/log.py +90 -0
easy_data_loader/models.py +168 -0
easy_data_loader/orchestrator.py +59 -0
easy_data_loader/pipeline.py +169 -0
easy_data_loader/pipeline_base.py +121 -0
easy_data_loader/procedure_pipeline.py +56 -0
easy_data_loader-0.1.0.dist-info/METADATA +52 -0
easy_data_loader-0.1.0.dist-info/RECORD +20 -0
easy_data_loader-0.1.0.dist-info/WHEEL +5 -0
easy_data_loader-0.1.0.dist-info/entry_points.txt +2 -0
easy_data_loader-0.1.0.dist-info/licenses/LICENSE +21 -0
easy_data_loader-0.1.0.dist-info/top_level.txt +1 -0

easy_data_loader/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# Package version, followed by the re-exports that form the public API.
+__version__ = "0.1.0"
+from .pipeline import LoadPipeline
+from .procedure_pipeline import ProcedurePipeline
+from .orchestrator import OrchestratorPipeline
+from .models import (
+    BasePipelineDefinition,
+    ProcedureDefinition,
+    ColumnDefinition,
+    OrchestratorDefinition,
+)
+# Names exposed by `from easy_data_loader import *`: runners first, then models.
+__all__ = [
+    "LoadPipeline",
+    "ProcedurePipeline",
+    "OrchestratorPipeline",
+    "BasePipelineDefinition",
+    "ProcedureDefinition",
+    "ColumnDefinition",
+    "OrchestratorDefinition",
+]

easy_data_loader/cli.py ADDED Viewed

@@ -0,0 +1,302 @@
+import click
+import os
+from pathlib import Path
+# Integrated templates written to disk by the `init` command.
+# Pipelines and resources are resolved by file name: "<name>.py" under
+# config/pipelines and "<name>.env" under config/resources (resource files must
+# start with "database_" or "file_"). Every name referenced inside the templates
+# therefore matches one of the files that `init` creates:
+#   config/pipelines/pipeline_example.py, procedure_example.py, orchestrator_example.py
+#   config/resources/database_example.env, file_example.env
+PIPELINE_TEMPLATE = """
+from easy_data_loader.pipeline import LoadPipeline
+from easy_data_loader.models import BasePipelineDefinition, ColumnDefinition
+import pandas as pd
+from sqlalchemy.types import DATETIME, INT, DECIMAL, NVARCHAR
+example_pipeline = BasePipelineDefinition(
+    pipeline_name="pipeline_example", # pipeline name to be used when initializing the Pipeline object
+    # source name represented by the file having the source settings -> corresponds to a file in the config/resources folder
+    source = "file_example",
+    # "dbo.FactSales" ## if the source is a database then source sql defines the table name or a custom query to be executed
+    # source_sql = "SELECT TOP 100 * FROM dbo.FactSales"
+    # destination name represented by the file having the destination settings -> corresponds to a file in the config/resources folder
+    destination="database_example",
+    # if the destination is a database then here we define the destination table name
+    destination_table="dbo.LargeSalesData",
+    # columns definition if we are sending data to a database table
+    columns={
+        "transaction_id": ColumnDefinition(target_name="new_transaction_id", data_type=INT()),
+        "date": ColumnDefinition(target_name="sales_date", data_type=DATETIME()),
+        "customer_id": ColumnDefinition(target_name="id_customer", data_type=INT()),
+        "product_category" : ColumnDefinition(target_name="category_of_product", data_type=NVARCHAR(100)),
+        "units_sold" : ColumnDefinition(target_name="units", data_type=INT()),
+        "unit_price" : ColumnDefinition(target_name="price", data_type=DECIMAL(6,2)),
+        "raw_notes" : ColumnDefinition(target_name="notes", data_type=NVARCHAR(100))
+    },
+    # different parameters passed to the write functions
+    write_parameters={"if_exists" : "replace", "index" : False},
+    # different parameters passed to the read function
+    read_parameters={"sep" : ";"}
+)
+def add_timestamp(df):
+    # Adding an audit column during load
+    df['insert_timestamp'] = pd.Timestamp.now()
+    return df
+example_pipeline.transform = add_timestamp
+"""
+# Sample stored-procedure pipeline; `resource` names config/resources/database_example.env.
+PROCEDURE_TEMPLATE = """
+from easy_data_loader.models import ProcedureDefinition
+example_procedure = ProcedureDefinition(
+    pipeline_name="procedure_example",
+    resource="database_example",
+    procedures=[
+        ("dbo.sp_UpdateSales", {"year": 2024}),
+        ("dbo.sp_ArchiveOldData", {})
+    ]
+)
+"""
+# Sample orchestrator; the listed pipelines are the two sample pipeline files above.
+ORCHESTRATOR_TEMPLATE = """
+from easy_data_loader.models import OrchestratorDefinition
+example_orchestrator = OrchestratorDefinition(
+    orchestrator_name="orchestrator_example",
+    pipelines=[
+        "pipeline_example",
+        "procedure_example"
+    ],
+    fail_fast=True
+)
+"""
+# Sample database resource, written to config/resources/database_example.env.
+DATABASE_ENV = """
+# database resource definition
+CONN_SERVER_TYPE=MSSQL
+CONN_SERVER=.
+CONN_DATABASE=test_database
+CONN_USERNAME=my_user
+CONN_PASSWORD=my_password
+CONN_PORT=1433
+"""
+# Sample file resource, written to config/resources/file_example.env.
+FILE_ENV = """
+# file resource definition
+FILE_TYPE=CSV
+FOLDER_PATH=./data/imports
+FILE_NAME=large_sales_data
+"""
+# Sample entry script, written to main.py in the project root.
+MAIN = """
+from easy_data_loader.pipeline import LoadPipeline
+# Run an ETL pipeline
+LoadPipeline(pipeline_name="pipeline_example").run()
+# Run a procedure pipeline
+# from easy_data_loader.procedure_pipeline import ProcedurePipeline
+# ProcedurePipeline(pipeline_name="procedure_example").run()
+"""
+@click.group()
+def main():
+    """Easy Data Loader CLI - ETL instrument between files and databases"""
+    # Root command group: it does no work itself, the subcommands below
+    # register on it through @main.command().
+@main.command()
+def init():
+    """Initialize folder structure and sample files"""
+    root = Path.cwd()
+    # Both configuration folders must exist before any sample file is written.
+    for rel_dir in ('config/resources', 'config/pipelines'):
+        (root / rel_dir).mkdir(parents=True, exist_ok=True)
+    # Relative path -> template text for every sample file of the scaffold.
+    scaffold = {
+        "config/pipelines/pipeline_example.py": PIPELINE_TEMPLATE,
+        "config/pipelines/procedure_example.py": PROCEDURE_TEMPLATE,
+        "config/pipelines/orchestrator_example.py": ORCHESTRATOR_TEMPLATE,
+        "config/resources/database_example.env": DATABASE_ENV,
+        "config/resources/file_example.env": FILE_ENV,
+        "main.py": MAIN,
+    }
+    for rel_path, template in scaffold.items():
+        target = root / rel_path
+        if target.exists():
+            # Existing files are left untouched and only reported.
+            click.echo(f"Skipped: {rel_path} (already exists)")
+            continue
+        target.write_text(template, encoding="utf-8")
+        click.echo(f"Created: {rel_path}")
+    click.echo("\nProject initialized successfully!")
+@main.command()
+def list():
+    """List all discovered resources and pipelines"""
+    from .config_loader import Configuration
+    config = Configuration()
+    # (header, discovery callable) pairs; each callable runs only after its
+    # header has been printed, so the output order never changes.
+    sections = (
+        ("--- Discovered Resources ---", config.get_all_resources),
+        ("\n--- Discovered Pipelines ---", config.get_all_pipelines),
+    )
+    for header, discover in sections:
+        click.echo(header)
+        # Iterating the returned mapping yields the discovered names.
+        for entry in discover():
+            click.echo(f" - {entry}")
+@main.command()
+@click.argument('resource_name')
+@click.argument('table_name')
+def inspect_db(resource_name, table_name):
+    """Inspect a database table and generate ColumnDefinition code"""
+    from .config_loader import Configuration
+    from .database_connector import CONNECTOR_FACTORY
+    from .database_operations import DatabaseOperations
+    from .models import ConnectionSettings
+    settings = Configuration().get_resource(resource_name)
+    # Only database resources can be inspected; file resources are rejected.
+    if not isinstance(settings, ConnectionSettings):
+        click.echo(f"Error: Resource '{resource_name}' is not a database connection.")
+        return
+    # Pick the connector class registered for this server type, then inspect
+    # the table through the engine it provides.
+    connector_cls = CONNECTOR_FACTORY[settings.conn_server_type]
+    engine = connector_cls(settings).get_engine()
+    table_schema = DatabaseOperations(engine).inspect_table(table_name)
+    if not table_schema:
+        click.echo(f"No columns found for table '{table_name}'.")
+        return
+    # Emit a snippet that can be pasted into a pipeline definition.
+    click.echo(f"\n# Suggested Column definitions for {table_name}:")
+    click.echo("columns={")
+    for column, sql_type in table_schema.items():
+        click.echo(f'    "{column}": ColumnDefinition(target_name="{column}", data_type={sql_type}),')
+    click.echo("}")
+@main.command()
+def run_all():
+    """Run all discovered pipelines and show status summary"""
+    from .config_loader import Configuration
+    from .pipeline import LoadPipeline
+    from .procedure_pipeline import ProcedurePipeline
+    from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
+    config = Configuration()
+    discovered = config.get_all_pipelines()
+    if not discovered:
+        click.echo("No pipelines discovered.")
+        return
+    def _launch(pipeline_name, definition):
+        # Choose the runner from the concrete type of the loaded definition.
+        if isinstance(definition, BasePipelineDefinition):
+            return LoadPipeline(pipeline_name).run()
+        if isinstance(definition, ProcedureDefinition):
+            return ProcedurePipeline(pipeline_name).run()
+        if isinstance(definition, OrchestratorDefinition):
+            # Imported lazily, only once an orchestrator is actually met.
+            from .orchestrator import OrchestratorPipeline
+            return OrchestratorPipeline(pipeline_name).run()
+        # Unrecognised definition type: reported as a failure.
+        return False
+    statuses = {}
+    click.echo(f"🚀 Running {len(discovered)} discovered pipelines...\n")
+    for pipeline_name in discovered:
+        click.echo(f"Pipeline: {pipeline_name} ... ", nl=False)
+        try:
+            succeeded = _launch(pipeline_name, config.get_pipeline(pipeline_name))
+            statuses[pipeline_name] = "SUCCESS" if succeeded else "FAILED"
+        except Exception as exc:
+            # One broken pipeline must not stop the others from running.
+            statuses[pipeline_name] = f"ERROR: {str(exc)}"
+        click.echo(statuses[pipeline_name])
+    # Summary table, printed after every pipeline has been attempted.
+    click.echo("\n" + "=" * 40)
+    click.echo(f"{'PIPELINE':<25} | {'STATUS'}")
+    click.echo("-" * 40)
+    for pipeline_name, outcome in statuses.items():
+        click.echo(f"{pipeline_name:<25} | {outcome}")
+@main.command()
+@click.argument('orchestrator_name')
+def run_orchestrator(orchestrator_name):
+    """Run a specific orchestrator by name"""
+    # Exit status: 0 when the orchestrator reports success, 1 when it reports
+    # failure or raises. Without a non-zero status, shells, schedulers and CI
+    # jobs would treat a failed run as successful.
+    from .orchestrator import OrchestratorPipeline
+    try:
+        success = OrchestratorPipeline(orchestrator_name).run()
+    except Exception as e:
+        click.echo(f"💥 Error: {str(e)}")
+        # SystemExit is not an Exception subclass, so it leaves this handler
+        # and ends the process with status 1.
+        raise SystemExit(1)
+    if success:
+        click.echo(f"✅ Orchestrator '{orchestrator_name}' completed successfully.")
+        return
+    click.echo(f"❌ Orchestrator '{orchestrator_name}' failed.")
+    raise SystemExit(1)
+@main.command()
+def validate_resources():
+    """Validate all configured resources"""
+    from .config_loader import Configuration
+    from .database_connector import CONNECTOR_FACTORY
+    from .models import ConnectionSettings, FileSettings
+    config = Configuration()
+    resources = config.get_all_resources()
+    if not resources:
+        click.echo("No resources found.")
+        return
+    def _probe(resource):
+        # Return the status text of a healthy resource; raise when the check fails.
+        if isinstance(resource, ConnectionSettings):
+            # Building the connector is the check itself: per the author's note
+            # the connector tests the connection in __init__ (not visible here).
+            CONNECTOR_FACTORY[resource.conn_server_type](resource)
+            return "OK (Connected)"
+        if isinstance(resource, FileSettings):
+            # A file resource is valid when its folder exists.
+            if not resource.folder_path.exists():
+                raise ValueError(f"Path does not exist: {resource.folder_path}")
+            return "OK (Path Exists)"
+        return "UNKNOWN TYPE"
+    click.echo(f"🔍 Validating {len(resources)} resources...\n")
+    results = {}
+    for name, resource in resources.items():
+        click.echo(f"Resource: {name} ... ", nl=False)
+        try:
+            outcome = _probe(resource)
+        except Exception as exc:
+            # Any failure is recorded so the remaining resources still get checked.
+            outcome = f"FAILED: {str(exc)}"
+        results[name] = outcome
+        click.echo(outcome)
+    # Summary table, printed after every resource has been checked.
+    click.echo("\n" + "=" * 60)
+    click.echo(f"{'RESOURCE':<30} | {'STATUS'}")
+    click.echo("-" * 60)
+    for name, status in results.items():
+        click.echo(f"{name:<30} | {status}")
+# Invoke the CLI group when this module is executed as a script.
+if __name__ == "__main__":
+    main()

easy_data_loader/config_loader.py ADDED Viewed

@@ -0,0 +1,184 @@
+import importlib.util
+from pathlib import Path
+from typing import Any, Dict, Union
+from types import ModuleType
+from dotenv import dotenv_values
+from .log import LoggedComponent
+from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition, ConnectionSettings, FileSettings, ResourceConfig
+class Configuration(LoggedComponent):
+    """
+    Configuration manager with lazy loading capabilities.
+    Resources and pipelines are loaded only when requested and are then cached
+    in ``self.resources`` / ``self.pipelines``.
+    This class implements the Singleton pattern: every ``Configuration(...)``
+    call returns the same instance and only the first call initializes it, so a
+    ``config_dir`` passed to a later call is ignored.
+    """
+    _instance = None
+    _initialized = False
+    def __new__(cls, *args, **kwargs):
+        # Create the shared instance on first use and hand it back afterwards.
+        if cls._instance is None:
+            cls._instance = super(Configuration, cls).__new__(cls)
+        return cls._instance
+    def __init__(self, config_dir: str = "./config"):
+        """
+        Initialize the shared instance on the first call; later calls are no-ops.
+        Args:
+            config_dir: Folder holding the ``resources`` and ``pipelines``
+                sub-folders. Only honoured by the first call.
+        """
+        if not self._initialized:
+            super().__init__()
+            self.config_dir = Path(config_dir)
+            # Caches keyed by file stem (file name without extension).
+            self.resources : Dict[str, ResourceConfig] = {}
+            self.pipelines : Dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]] = {}
+            self.logger.debug(f"Initializing configuration from directory: {config_dir}")
+            # Set on the instance, shadowing the class-level False.
+            self._initialized = True
+    def _load_env_file(self, env_file: Path) -> ResourceConfig:
+        """
+        Build the settings object for one resource ``.env`` file.
+        The file-name prefix selects the settings class: ``database_`` gives
+        ConnectionSettings, ``file_`` gives FileSettings. Any other prefix is
+        reported through ``log_and_raise`` as a ValueError.
+        """
+        self.logger.debug(f"Loading environment file: {env_file}")
+        env_file_name = env_file.stem
+        if env_file_name.startswith('database_'):
+            return ConnectionSettings(_env_file=[env_file]) # type: ignore
+        if env_file_name.startswith('file_'):
+            return FileSettings(_env_file=[env_file]) # type: ignore
+        self.log_and_raise(ValueError,
+            f"Failed to load env file: {env_file.name}. "
+            f"Resource files must start with 'database_' or 'file_' prefix."
+            )
+    def _import_module(self, module_file: Path) -> ModuleType:
+        """
+        Dynamically import a Python module from a file path.
+        The module is executed, so any top-level code in the file runs. An
+        ImportError is reported through ``log_and_raise`` when no spec or no
+        loader can be obtained for the file.
+        """
+        self.logger.debug(f"Importing configuration from {module_file}")
+        spec = importlib.util.spec_from_file_location(module_file.stem, module_file)
+        if spec is None:
+            self.log_and_raise(
+                ImportError,
+                f"Could not create module spec for {module_file}",
+                file_path=str(module_file),
+            )
+        if spec.loader is not None:
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            self.logger.debug(
+                f"Succesfully imported configuration from {str(module_file)}"
+            )
+            return module
+        else:
+            self.log_and_raise(
+                ImportError,
+                f"Module spec has no loader {module_file}",
+                file_path=str(module_file),
+            )
+    def get_resource(self, resource_name: str) -> ResourceConfig:
+        """
+        Retrieve a resource (database connection or file settings) by name.
+        Uses lazy loading: checks memory first, then attempts to load
+        ``<config_dir>/resources/<resource_name>.env``.
+        A ValueError is reported through ``log_and_raise`` when no such file
+        exists; errors raised while loading the file are logged and re-raised.
+        """
+        self.logger.debug(f"Retrieving connection by name: {resource_name}")
+        # 1. Check if already loaded
+        if resource_name in self.resources:
+            return self.resources[resource_name]
+        # 2. Try to load from file
+        resource_file = self.config_dir / "resources" / f"{resource_name}.env"
+        if resource_file.exists():
+            try:
+                resource = self._load_env_file(resource_file)
+                self.resources[resource_name] = resource
+                self.logger.info(f"Lazily loaded resource: {resource_name}")
+                return resource
+            except Exception as e:
+                self.log_exception(e, f"Failed to load resource: {resource_name}")
+                raise
+        # 3. Not found
+        self.log_and_raise(ValueError,
+            f"Resource not found: {resource_name}. "
+            f"Checked path: {resource_file}"
+        )
+    def get_pipeline(self, pipeline_name: str) -> Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]:
+        """
+        Retrieve a pipeline definition by name.
+        Uses lazy loading: checks memory first, then attempts to import
+        ``<config_dir>/pipelines/<pipeline_name>.py`` and returns the first
+        module attribute, in ``dir()`` order, that is a pipeline, procedure or
+        orchestrator definition.
+        A ValueError is reported through ``log_and_raise`` when the file does
+        not exist or contains no definition; import errors are logged and
+        re-raised.
+        """
+        self.logger.debug(f"Retriving pipeline definition by name: {pipeline_name}")
+        # 1. Check if already loaded
+        if pipeline_name in self.pipelines:
+            return self.pipelines[pipeline_name]
+        # 2. Try to load from file
+        pipeline_file = self.config_dir / "pipelines" / f"{pipeline_name}.py"
+        if pipeline_file.exists():
+            try:
+                config_module = self._import_module(pipeline_file)
+                # Find the definition in the module
+                for attr_name in dir(config_module):
+                    attr = getattr(config_module, attr_name)
+                    if isinstance(attr, (BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition)):
+                        self.pipelines[pipeline_name] = attr
+                        self.logger.info(f"Lazily loaded pipeline: {pipeline_name}")
+                        return attr
+            except Exception as e:
+                self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
+                raise
+            # The file imported cleanly but holds no definition object. Report
+            # that instead of the misleading "not found" message below. This
+            # sits outside the try block so it is not logged twice.
+            self.log_and_raise(ValueError,
+                               f"No pipeline definition found in file: {pipeline_file}. "
+                               f"The module must define a BasePipelineDefinition, "
+                               f"ProcedureDefinition or OrchestratorDefinition instance.")
+        # 3. Not found
+        self.log_and_raise(ValueError,
+                           f"Pipeline not found: {pipeline_name}. "
+                           f"Checked path: {pipeline_file}")
+    def get_all_resources(self) -> dict[str, ResourceConfig]:
+        """
+        Retrieve all resources.
+        Scans the resources directory for ``*.env`` files and loads those not
+        cached yet. A file that fails to load is logged as a warning and
+        skipped, so one broken resource does not hide the others.
+        """
+        resources_dir = self.config_dir / "resources"
+        if not resources_dir.exists():
+            return self.resources
+        # Discover all .env files
+        for env_file in resources_dir.glob("*.env"):
+            # Simple logging to debug discovery
+            self.logger.debug(f"Found potential resource file: {env_file.name}")
+            resource_name = env_file.stem
+            if resource_name not in self.resources:
+                try:
+                    self.resources[resource_name] = self._load_env_file(env_file)
+                except Exception as e:
+                    self.logger.warning(f"Failed to load resource {resource_name}: {e}")
+        return self.resources
+    def get_all_pipelines(self) -> dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]]:
+        """
+        Retrieve all pipelines (load, procedure and orchestrator definitions).
+        Scans the pipelines directory for ``*.py`` files and loads those not
+        cached yet. A file that fails to load is logged as a warning and
+        skipped.
+        """
+        pipelines_dir = self.config_dir / "pipelines"
+        if not pipelines_dir.exists():
+            return self.pipelines
+        for pipeline_file in pipelines_dir.glob("*.py"):
+            self.logger.debug(f"Found potential pipeline file: {pipeline_file.name}")
+            pipeline_name = pipeline_file.stem
+            if pipeline_name not in self.pipelines:
+                # get_pipeline stores the definition in self.pipelines on success
+                try:
+                    self.get_pipeline(pipeline_name)
+                except Exception as e:
+                    self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")
+        return self.pipelines

easy_data_loader/custom_exceptions.py ADDED Viewed

@@ -0,0 +1,21 @@
+class DriverNotFoundException(Exception):
+    """Error carrying a message that defaults to 'ODBC driver not available'."""
+    def __init__(self, message: str = "ODBC driver not available"):
+        super().__init__(message)
+        # Also kept as an attribute so handlers can read the text directly.
+        self.message = message
+class EngineTestException(Exception):
+    """Error carrying a message that defaults to 'Engine has not passed connection test'."""
+    def __init__(self, message: str = "Engine has not passed connection test"):
+        super().__init__(message)
+        # Also kept as an attribute so handlers can read the text directly.
+        self.message = message
+class DatabaseOperationException(Exception):
+    """
+    Error describing a database operation and the reason it failed.
+    Attributes:
+        operation: The operation label exactly as supplied by the caller.
+        message: The error text; this is also what ``str(exc)`` returns.
+    """
+    def __init__(self, operation: str, message: str):
+        # Keep the operation label available to handlers and log formatters;
+        # str(exc) and exc.args still carry only the message.
+        self.operation = operation
+        self.message = message
+        super().__init__(self.message)
+class InvalidFileException(Exception):
+    """Error carrying a message that defaults to 'The provided file is invalid or corrupted'."""
+    def __init__(self, message: str = "The provided file is invalid or corrupted"):
+        super().__init__(message)
+        # Also kept as an attribute so handlers can read the text directly.
+        self.message = message