PyPI - easy-data-loader - Versions diffs - 0.1.0__tar.gz - Mend

easy-data-loader 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

easy_data_loader-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Bojoi Gabriel
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

easy_data_loader-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,52 @@
+Metadata-Version: 2.4
+Name: easy_data_loader
+Version: 0.1.0
+Summary: Data transfer utilities between files and databases
+Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: click>=8.3.0
+Requires-Dist: ipykernel
+Requires-Dist: openpyxl>=3.1.5
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pyarrow>=22.0.0
+Requires-Dist: pydantic>=2.12.5
+Requires-Dist: pydantic-settings>=2.12.0
+Requires-Dist: pyodbc>=5.2.0
+Requires-Dist: python-dotenv>=1.1.1
+Requires-Dist: sqlalchemy>=2.0.43
+Dynamic: license-file
+# Easy Data Loader 🚀
+**Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various data sources (CSV, Excel, Parquet) and SQL databases (MSSQL, PostgreSQL, and others).
+## ✨ Key Features
+- **Declarative Configuration**: Manage connections and pipelines through simple Python files and `.env` resources.
+- **Integrated CLI**: Initialize a standardized project structure with a single command.
+- **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
+- **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
+- **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
+---
+## 📦 Installation
+Install directly via `pip` or `uv`:
+```bash
+pip install easy_data_loader
+```
+## 🚀 Getting Started
+1. Initialize a new project structure to generate template configurations:
+   ```bash
+   easy-loader init
+   ```
+2. Review the generated `config/` folders for sample resources and pipelines.
+3. Run all discovered pipelines across the active configurations:
+   ```bash
+   easy-loader run-all
+   ```

easy_data_loader-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,32 @@
+# Easy Data Loader 🚀
+**Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various data sources (CSV, Excel, Parquet) and SQL databases (MSSQL, PostgreSQL, and others).
+## ✨ Key Features
+- **Declarative Configuration**: Manage connections and pipelines through simple Python files and `.env` resources.
+- **Integrated CLI**: Initialize a standardized project structure with a single command.
+- **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
+- **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
+- **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
+---
+## 📦 Installation
+Install directly via `pip` or `uv`:
+```bash
+pip install easy_data_loader
+```
+## 🚀 Getting Started
+1. Initialize a new project structure to generate template configurations:
+   ```bash
+   easy-loader init
+   ```
+2. Review the generated `config/` folders for sample resources and pipelines.
+3. Run all discovered pipelines across the active configurations:
+   ```bash
+   easy-loader run-all
+   ```

easy_data_loader-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,37 @@
+[project]
+name = "easy_data_loader"
+version = "0.1.0"
+description ="Data transfer utilities between files and databases"
+authors = [
+    {name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com"}
+]
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "click>=8.3.0",
+    "ipykernel",
+    "openpyxl>=3.1.5",
+    "pandas>=2.3.3",
+    "pyarrow>=22.0.0",
+    "pydantic>=2.12.5",
+    "pydantic-settings>=2.12.0",
+    "pyodbc>=5.2.0",
+    "python-dotenv>=1.1.1",
+    "sqlalchemy>=2.0.43",
+]
+[dependency-groups]
+dev = [
+    "ipykernel>=7.1.0",
+    "pytest>=8.4.2",
+]
+[project.scripts]
+easy-loader = "easy_data_loader.cli:main"
+[tool.setuptools.packages.find]
+where = ["src"]
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"

easy_data_loader-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

easy_data_loader-0.1.0/src/easy_data_loader/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+__version__ = "0.1.0"
+from .pipeline import LoadPipeline
+from .procedure_pipeline import ProcedurePipeline
+from .orchestrator import OrchestratorPipeline
+from .models import BasePipelineDefinition, ProcedureDefinition, ColumnDefinition, OrchestratorDefinition
+__all__ = [
+    "LoadPipeline", "ProcedurePipeline", "OrchestratorPipeline",
+    "BasePipelineDefinition", "ProcedureDefinition", "ColumnDefinition", "OrchestratorDefinition"
+]

easy_data_loader-0.1.0/src/easy_data_loader/cli.py ADDED Viewed

@@ -0,0 +1,302 @@
+import click
+import os
+from pathlib import Path
+# Integrated templates
+# Sample ETL pipeline definition; `init` writes it to config/pipelines/pipeline_example.py.
+# NOTE(review): the sample refers to resources "example_file" and "example_database",
+# while `init` creates file_example.env and database_example.env -- confirm these resolve.
+PIPELINE_TEMPLATE = """
+from easy_data_loader.pipeline import LoadPipeline
+from easy_data_loader.models import BasePipelineDefinition, ColumnDefinition
+import pandas as pd
+from sqlalchemy.types import DATETIME, INT, DECIMAL, NVARCHAR
+example_pipeline = BasePipelineDefinition(
+    pipeline_name="test_pipeline", # pipeline name to be used when initializing the Pipeline object
+    # source name represented by the file having the source settings -> corresponds to a file in the config/resources folder
+    source = "example_file",
+    # "dbo.FactSales" ## if the source is a database then source sql defines the table name or a custom query to be executed
+    # source_sql = "SELECT TOP 100 * FROM dbo.FactSales"
+    # destination name represented by the file having the destination settings -> corresponds to a file in the config/resources folder
+    destination="example_database",
+    # if the destination is a database then here we define the destination table name
+    destination_table="dbo.LargeSalesData",
+    # columns definition if we are sending data to a database table
+    columns={
+        "transaction_id": ColumnDefinition(target_name="new_transaction_id", data_type=INT()),
+        "date": ColumnDefinition(target_name="sales_date", data_type=DATETIME()),
+        "customer_id": ColumnDefinition(target_name="id_customer", data_type=INT()),
+        "product_category" : ColumnDefinition(target_name="category_of_product", data_type=NVARCHAR(100)),
+        "units_sold" : ColumnDefinition(target_name="units", data_type=INT()),
+        "unit_price" : ColumnDefinition(target_name="price", data_type=DECIMAL(6,2)),
+        "raw_notes" : ColumnDefinition(target_name="notes", data_type=NVARCHAR(100))
+    },
+    # different parameters passed to the write functions
+    write_parameters={"if_exists" : "replace", "index" : False},
+    # different parameters passed to the read function
+    read_parameters={"sep" : ";"}
+)
+def add_timestamp(df):
+    # Adding an audit column during load
+    df['insert_timestamp'] = pd.Timestamp.now()
+    return df
+example_pipeline.transform = add_timestamp
+"""
+# Sample stored-procedure pipeline; `init` writes it to config/pipelines/procedure_example.py.
+PROCEDURE_TEMPLATE = """
+from easy_data_loader.models import ProcedureDefinition
+example_procedure = ProcedureDefinition(
+    pipeline_name="example_procedure",
+    resource="example_database",
+    procedures=[
+        ("dbo.sp_UpdateSales", {"year": 2024}),
+        ("dbo.sp_ArchiveOldData", {})
+    ]
+)
+"""
+# Sample orchestrator definition; `init` writes it to config/pipelines/orchestrator_example.py.
+# NOTE(review): it lists "example_pipeline" / "example_procedure", while the sample files
+# are named pipeline_example.py / procedure_example.py -- confirm how names are resolved.
+ORCHESTRATOR_TEMPLATE = """
+from easy_data_loader.models import OrchestratorDefinition
+example_orchestrator = OrchestratorDefinition(
+    orchestrator_name="example_orchestrator",
+    pipelines=[
+        "example_pipeline",
+        "example_procedure"
+    ],
+    fail_fast=True
+)
+"""
+# Sample database resource; `init` writes it to config/resources/database_example.env.
+DATABASE_ENV = """
+# database resource definition
+CONN_SERVER_TYPE=MSSQL
+CONN_SERVER=.
+CONN_DATABASE=test_database
+CONN_USERNAME=my_user
+CONN_PASSWORD=my_password
+CONN_PORT=1433
+"""
+# Sample file resource; `init` writes it to config/resources/file_example.env.
+FILE_ENV = """
+# file resource definition
+FILE_TYPE=CSV
+FOLDER_PATH=./data/imports
+FILE_NAME=large_sales_data
+"""
+# Sample entry-point script; `init` writes it to main.py in the current directory.
+# NOTE(review): it requests "example_pipeline", but the generated pipeline file is
+# pipeline_example.py -- confirm this lookup succeeds on a freshly initialized project.
+MAIN = """
+from easy_data_loader.pipeline import LoadPipeline
+# Run an ETL pipeline
+LoadPipeline(pipeline_name="example_pipeline").run()
+# Run a procedure pipeline
+# from easy_data_loader.procedure_pipeline import ProcedurePipeline
+# ProcedurePipeline(pipeline_name="example_procedure").run()
+"""
+@click.group()
+def main():
+    """Easy Data Loader CLI - ETL instrument between files and databases"""
+    pass
+@main.command()
+def init():
+    """Initialize folder structure and sample files"""
+    base_path = Path.cwd()
+    # folders
+    folders = ['config/resources', 'config/pipelines']
+    for folder in folders:
+        (base_path / folder).mkdir(parents=True, exist_ok=True)
+    # Example files to create
+    files = {
+        "config/pipelines/pipeline_example.py": PIPELINE_TEMPLATE,
+        "config/pipelines/procedure_example.py": PROCEDURE_TEMPLATE,
+        "config/pipelines/orchestrator_example.py": ORCHESTRATOR_TEMPLATE,
+        "config/resources/database_example.env": DATABASE_ENV,
+        "config/resources/file_example.env": FILE_ENV,
+        "main.py" : MAIN,
+    }
+    for name, content in files.items():
+        file_path = base_path / name
+        if not file_path.exists():
+            with open(file_path, "w", encoding="utf-8") as f:
+                f.write(content)
+            click.echo(f"Created: {name}")
+        else:
+            click.echo(f"Skipped: {name} (already exists)")
+    click.echo("\nProject initialized successfully!")
+@main.command()
+def list():
+    """List all discovered resources and pipelines"""
+    from .config_loader import Configuration
+    config = Configuration()
+    click.echo("--- Discovered Resources ---")
+    for name in config.get_all_resources():
+        click.echo(f" - {name}")
+    click.echo("\n--- Discovered Pipelines ---")
+    for name in config.get_all_pipelines():
+        click.echo(f" - {name}")
+@main.command()
+@click.argument('resource_name')
+@click.argument('table_name')
+def inspect_db(resource_name, table_name):
+    """Inspect a database table and generate ColumnDefinition code"""
+    from .config_loader import Configuration
+    from .database_connector import CONNECTOR_FACTORY
+    from .database_operations import DatabaseOperations
+    from .models import ConnectionSettings
+    config = Configuration()
+    resource = config.get_resource(resource_name)
+    if not isinstance(resource, ConnectionSettings):
+        click.echo(f"Error: Resource '{resource_name}' is not a database connection.")
+        return
+    # Initialize connector and ops
+    connector = CONNECTOR_FACTORY[resource.conn_server_type](resource)
+    ops = DatabaseOperations(connector.get_engine())
+    schema = ops.inspect_table(table_name)
+    if not schema:
+        click.echo(f"No columns found for table '{table_name}'.")
+        return
+    click.echo(f"\n# Suggested Column definitions for {table_name}:")
+    click.echo("columns={")
+    for col, dtype in schema.items():
+        click.echo(f'    "{col}": ColumnDefinition(target_name="{col}", data_type={dtype}),')
+    click.echo("}")
+@main.command()
+def run_all():
+    """Run all discovered pipelines and show status summary"""
+    from .config_loader import Configuration
+    from .pipeline import LoadPipeline
+    from .procedure_pipeline import ProcedurePipeline
+    from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
+    config = Configuration()
+    pipelines = config.get_all_pipelines()
+    if not pipelines:
+        click.echo("No pipelines discovered.")
+        return
+    results = {}
+    click.echo(f"🚀 Running {len(pipelines)} discovered pipelines...\n")
+    for name in pipelines:
+        click.echo(f"Pipeline: {name} ... ", nl=False)
+        try:
+            definition = config.get_pipeline(name)
+            if isinstance(definition, BasePipelineDefinition):
+                success = LoadPipeline(name).run()
+            elif isinstance(definition, ProcedureDefinition):
+                success = ProcedurePipeline(name).run()
+            elif isinstance(definition, OrchestratorDefinition):
+                from .orchestrator import OrchestratorPipeline
+                success = OrchestratorPipeline(name).run()
+            else:
+                success = False
+            results[name] = "SUCCESS" if success else "FAILED"
+        except Exception as e:
+            results[name] = f"ERROR: {str(e)}"
+        click.echo(results[name])
+    click.echo("\n" + "=" * 40)
+    click.echo(f"{'PIPELINE':<25} | {'STATUS'}")
+    click.echo("-" * 40)
+    for name, status in results.items():
+        click.echo(f"{name:<25} | {status}")
+@main.command()
+@click.argument('orchestrator_name')
+def run_orchestrator(orchestrator_name):
+    """Run a specific orchestrator by name"""
+    from .orchestrator import OrchestratorPipeline
+    try:
+        success = OrchestratorPipeline(orchestrator_name).run()
+        if success:
+            click.echo(f"✅ Orchestrator '{orchestrator_name}' completed successfully.")
+        else:
+            click.echo(f"❌ Orchestrator '{orchestrator_name}' failed.")
+    except Exception as e:
+        click.echo(f"💥 Error: {str(e)}")
+@main.command()
+def validate_resources():
+    """Validate all configured resources"""
+    from .config_loader import Configuration
+    from .database_connector import CONNECTOR_FACTORY
+    from .models import ConnectionSettings, FileSettings
+    config = Configuration()
+    resources = config.get_all_resources()
+    if not resources:
+        click.echo("No resources found.")
+        return
+    click.echo(f"🔍 Validating {len(resources)} resources...\n")
+    results = {}
+    for name, resource in resources.items():
+        click.echo(f"Resource: {name} ... ", nl=False)
+        try:
+            if isinstance(resource, ConnectionSettings):
+                # Validate Database Connection
+                connector = CONNECTOR_FACTORY[resource.conn_server_type](resource)
+                # The connector tests connection in __init__, so if we are here it passed
+                results[name] = "OK (Connected)"
+            elif isinstance(resource, FileSettings):
+                # Validate File Path
+                if resource.folder_path.exists():
+                     results[name] = "OK (Path Exists)"
+                else:
+                     raise ValueError(f"Path does not exist: {resource.folder_path}")
+            else:
+                 results[name] = "UNKNOWN TYPE"
+        except Exception as e:
+            results[name] = f"FAILED: {str(e)}"
+        click.echo(results[name])
+    click.echo("\n" + "=" * 60)
+    click.echo(f"{'RESOURCE':<30} | {'STATUS'}")
+    click.echo("-" * 60)
+    for name, status in results.items():
+        click.echo(f"{name:<30} | {status}")
+if __name__ == "__main__":
+    main()

easy_data_loader-0.1.0/src/easy_data_loader/config_loader.py ADDED Viewed

@@ -0,0 +1,184 @@
+import importlib.util
+from pathlib import Path
+from typing import Any, Dict, Union
+from types import ModuleType
+from dotenv import dotenv_values
+from .log import LoggedComponent
+from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition, ConnectionSettings, FileSettings, ResourceConfig
+class Configuration(LoggedComponent):
+    """
+    Configuration manager with lazy loading capabilities.
+    Resources and pipelines are loaded only when requested.
+    This class implements the Singleton pattern.
+    """
+    _instance = None
+    _initialized = False
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(Configuration, cls).__new__(cls)
+        return cls._instance
+    def __init__(self, config_dir: str = "./config"):
+        if not self._initialized:
+            super().__init__()
+            self.config_dir = Path(config_dir)
+            self.resources : Dict[str, ResourceConfig] = {}
+            self.pipelines : Dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]] = {}
+            self.logger.debug(f"Initializing configuration from directory: {config_dir}")
+            self._initialized = True
+    def _load_env_file(self, env_file: Path) -> ResourceConfig:
+        """Load environment variables from a specific file"""
+        self.logger.debug(f"Loading environment file: {env_file}")
+        env_file_name = env_file.stem
+        if env_file_name.startswith('database_'):
+            return ConnectionSettings(_env_file=[env_file]) # type: ignore
+        if env_file_name.startswith('file_'):
+            return FileSettings(_env_file=[env_file]) # type: ignore
+        self.log_and_raise(ValueError,
+            f"Failed to load env file: {env_file.name}. "
+            f"Resource files must start with 'database_' or 'file_' prefix."
+            )
+    def _import_module(self, module_file: Path) -> ModuleType:
+        """Dynamically import a Python module from a file path"""
+        self.logger.debug(f"Importing configuration from {module_file}")
+        spec = importlib.util.spec_from_file_location(module_file.stem, module_file)
+        if spec is None:
+            self.log_and_raise(
+                ImportError,
+                f"Could not create module spec for {module_file}",
+                file_path=str(module_file),
+            )
+        if spec.loader is not None:
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            self.logger.debug(
+                f"Succesfully imported configuration from {str(module_file)}"
+            )
+            return module
+        else:
+            self.log_and_raise(
+                ImportError,
+                f"Module spec has no loader {module_file}",
+                file_path=str(module_file),
+            )
+    def get_resource(self, resource_name: str) -> ResourceConfig:
+        """
+        Retrieve a connection by name.
+        Uses lazy loading: checks memory first, then attempts to load from file.
+        """
+        self.logger.debug(f"Retrieving connection by name: {resource_name}")
+        # 1. Check if already loaded
+        if resource_name in self.resources:
+            return self.resources[resource_name]
+        # 2. Try to load from file
+        resource_file = self.config_dir / "resources" / f"{resource_name}.env"
+        if resource_file.exists():
+            try:
+                resource = self._load_env_file(resource_file)
+                self.resources[resource_name] = resource
+                self.logger.info(f"Lazily loaded resource: {resource_name}")
+                return resource
+            except Exception as e:
+                self.log_exception(e, f"Failed to load resource: {resource_name}")
+                raise
+        # 3. Not found
+        self.log_and_raise(ValueError,
+             f"Resource not found: {resource_name}. "
+             f"Checked path: {resource_file}"
+        )
+    def get_pipeline(self, pipeline_name: str) -> Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]:
+        """
+        Retrieve a pipeline definition by name.
+        Uses lazy loading: checks memory first, then attempts to load from file.
+        """
+        self.logger.debug(f"Retriving pipeline definition by name: {pipeline_name}")
+        # 1. Check if already loaded
+        if pipeline_name in self.pipelines:
+            return self.pipelines[pipeline_name]
+        # 2. Try to load from file
+        pipeline_file = self.config_dir / "pipelines" / f"{pipeline_name}.py"
+        if pipeline_file.exists():
+            try:
+                config_module = self._import_module(pipeline_file)
+                # Find the definition in the module
+                for attr_name in dir(config_module):
+                    attr = getattr(config_module, attr_name)
+                    if isinstance(attr, (BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition)):
+                        self.pipelines[pipeline_name] = attr
+                        self.logger.info(f"Lazily loaded pipeline: {pipeline_name}")
+                        return attr
+            except Exception as e:
+                 self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
+                 raise
+        # 3. Not found
+        self.log_and_raise(ValueError,
+                           f"Pipeline not found: {pipeline_name}. "
+                           f"Checked path: {pipeline_file}")
+    def get_all_resources(self) -> dict[str, ResourceConfig]:
+        """
+        Retrieve all connections.
+        Scans the resources directory if not all loaded.
+        """
+        resources_dir = self.config_dir / "resources"
+        if not resources_dir.exists():
+            return self.resources
+        # Discover all .env files
+        for env_file in resources_dir.glob("*.env"):
+             # Simple logging to debug discovery
+             self.logger.debug(f"Found potential resource file: {env_file.name}")
+             resource_name = env_file.stem
+             if resource_name not in self.resources:
+                 try:
+                     self.resources[resource_name] = self._load_env_file(env_file)
+                 except Exception as e:
+                     self.logger.warning(f"Failed to load resource {resource_name}: {e}")
+        return self.resources
+    def get_all_pipelines(self) -> dict[str, BasePipelineDefinition]:
+        """
+        Retrieve all pipelines.
+        Scans the pipelines directory if not all loaded.
+        """
+        pipelines_dir = self.config_dir / "pipelines"
+        if not pipelines_dir.exists():
+            return self.pipelines
+        for pipeline_file in pipelines_dir.glob("*.py"):
+            self.logger.debug(f"Found potential pipeline file: {pipeline_file.name}")
+            pipeline_name = pipeline_file.stem
+            if pipeline_name not in self.pipelines:
+                 # Helper to trigger lazy load
+                 try:
+                    self.get_pipeline(pipeline_name)
+                 except Exception as e:
+                     self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")
+        return self.pipelines

easy_data_loader-0.1.0/src/easy_data_loader/custom_exceptions.py ADDED Viewed

@@ -0,0 +1,21 @@
+class DriverNotFoundException(Exception):
+    def __init__(self, message: str = "ODBC driver not available"):
+        self.message = message
+        super().__init__(self.message)
+class EngineTestException(Exception):
+    def __init__(self, message: str = "Engine has not passed connection test"):
+        self.message = message
+        super().__init__(self.message)
+class DatabaseOperationException(Exception):
+    def __init__(self, operation: str, message: str):
+        self.message = message
+        super().__init__(self.message)
+class InvalidFileException(Exception):
+    def __init__(self, message: str = "The provided file is invalid or corrupted"):
+        self.message = message
+        super().__init__(self.message)