PyPI - easy-data-loader - Versions diffs - 0.1.2__tar.gz → 0.1.4__tar.gz - Mend

easy-data-loader 0.1.2.tar.gz → 0.1.4.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{easy_data_loader-0.1.2/src/easy_data_loader.egg-info → easy_data_loader-0.1.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easy_data_loader
-Version: 0.1.2
+Version: 0.1.4
 Summary: Data transfer utilities between files and databases
 Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
 Classifier: Development Status :: 3 - Alpha
@@ -8,14 +8,17 @@ Classifier: Intended Audience :: Developers
 Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.13
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click>=8.3.0
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.3.3
+Requires-Dist: psycopg2-binary>=2.9.11
 Requires-Dist: pyarrow>=22.0.0
 Requires-Dist: pydantic>=2.12.5
 Requires-Dist: pydantic-settings>=2.12.0

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/pyproject.toml RENAMED Viewed

@@ -1,14 +1,15 @@
 [project]
 name = "easy_data_loader"
-version = "0.1.2"
+version = "0.1.4"
 description = "Data transfer utilities between files and databases"
 authors = [{ name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com" }]
 readme = "README.md"
-requires-python = ">=3.13"
+requires-python = ">=3.11"
 dependencies = [
     "click>=8.3.0",
     "openpyxl>=3.1.5",
     "pandas>=2.3.3",
+    "psycopg2-binary>=2.9.11",
     "pyarrow>=22.0.0",
     "pydantic>=2.12.5",
     "pydantic-settings>=2.12.0",
@@ -22,6 +23,8 @@ classifiers = [
     "Topic :: Database",
     "Topic :: Scientific/Engineering :: Information Analysis",
     "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
     "Operating System :: OS Independent",
 ]

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/custom_exceptions.py RENAMED Viewed

@@ -20,3 +20,9 @@ class InvalidFileException(Exception):
     def __init__(self, message: str = "The provided file is invalid or corrupted"):
         self.message = message
         super().__init__(self.message)
+class PipelineValidationError(Exception):
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(self.message)

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_connector.py RENAMED Viewed

@@ -220,8 +220,109 @@ class SQLiteDatabaseConnector(LoggedComponent, DatabaseConnector):
         self.engine.dispose()
+class PostgresDatabaseConnector(LoggedComponent, DatabaseConnector):
+    """Database connector to a PostgreSQL database"""
+    def __init__(self, config: ConnectionSettings):
+        super().__init__()
+        self.config = config
+        self.engine = self._create_engine(self.config)
+        self._test_engine()
+    def _build_connection_string(self) -> str:
+        """Define the connection string from the given configuration"""
+        if not isinstance(self.config, ServerBasedConnectionSettings):
+            self.log_and_raise(
+                ValueError,
+                "PostgresDatabaseConnector requires ServerBasedConnectionSettings",
+            )
+        # Validate minimum parameters
+        if not all(
+            [self.config.conn_server, self.config.conn_database, self.config.conn_port]
+        ):
+            self.log_and_raise(
+                ValueError,
+                "Postgres connection configuration is missing host, port or database name",
+            )
+        # encode special characters in password and username
+        user = (
+            urllib.parse.quote_plus(self.config.conn_username)
+            if self.config.conn_username
+            else ""
+        )
+        password = (
+            urllib.parse.quote_plus(self.config.conn_password)
+            if self.config.conn_password
+            else ""
+        )
+        host = self.config.conn_server
+        port = self.config.conn_port
+        database = self.config.conn_database
+        # Standard SQLAlchemy format: postgresql+psycopg2://user:password@host:port/dbname
+        if user and password:
+            return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
+        # trusted connection
+        return f"postgresql+psycopg2://{user}@{host}:{port}/{database}"
+    def _create_engine(self, settings: ConnectionSettings) -> Engine:
+        """Create a sqlalchemy engine using the provided configuration"""
+        connection_string = self._build_connection_string()
+        try:
+            engine = create_engine(
+                connection_string,
+                pool_size=5,
+                max_overflow=10,
+                pool_timeout=30,
+                pool_recycle=3600,
+                echo=self.is_debug_enabled,
+            )
+            return engine
+        except Exception as e:
+            self.log_exception(
+                e, "Could not create Postgres engine from connection string"
+            )
+            raise
+    def get_engine(self) -> Engine:
+        """Return the engine to be used"""
+        return self.engine
+    def _test_engine(self) -> bool:
+        """Test if the connection works"""
+        if not isinstance(self.config, ServerBasedConnectionSettings):
+            self.log_and_raise(
+                ValueError,
+                "PostgresDatabaseConnector requires ServerBasedConnectionSettings",
+            )
+        try:
+            with self.engine.connect() as conn:
+                conn.execute(text("SELECT 1"))
+            self.logger.debug(
+                f"Postgres connection test successful: {self.config.conn_server} - {self.config.conn_database}"
+            )
+            return True
+        except Exception as e:
+            self.log_and_raise(
+                EngineTestException,
+                "Postgres connection test failed",
+                exception=str(e),
+            )
+            return False
+    def _dispose_engine(self) -> None:
+        self.logger.debug("Disposing of Postgres engine")
+        self.engine.dispose()
 CONNECTOR_FACTORY: dict[ServerType, type] = {
     ServerType.MSSQL: SqlServerDatabaseConnector,
     ServerType.SQLITE: SQLiteDatabaseConnector,
-    # ServerType.POSTGRESQL: PostgresDatabaseConnector
+    ServerType.POSTGRESQL: PostgresDatabaseConnector,
 }

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_operations.py RENAMED Viewed

@@ -2,7 +2,7 @@ from typing import Dict
 import pandas as pd
 from pandas import DataFrame
-from sqlalchemy import Column, MetaData, Table, inspect, text
+from sqlalchemy import Column, MetaData, Table, TextClause, inspect, text
 from sqlalchemy.engine import Engine
 from sqlalchemy.types import TypeEngine
 from sqlalchemy.schema import SchemaItem
@@ -74,12 +74,36 @@ class DatabaseOperations(LoggedComponent):
             self.log_exception(e, f"Failed to create table {table_name}")
             return False
+    def _build_procedure_sql(
+        self, dialect: str, procedure_name: str, params: dict
+    ) -> TextClause:
+        """Build dialect-specific SQL for calling a stored procedure."""
+        params_keys = list(params.keys())
+        if dialect == "mssql":
+            if params_keys:
+                params_str = ", ".join([f":{k}" for k in params_keys])
+                return text(f"EXEC {procedure_name} {params_str};")
+            return text(f"EXEC {procedure_name};")
+        elif dialect == "postgresql":
+            params_str = ", ".join([f":{k}" for k in params_keys])
+            return text(f"CALL {procedure_name}({params_str});")
+        elif dialect == "sqlite":
+            raise NotImplementedError("SQLite does not support stored procedures.")
+        else:
+            raise NotImplementedError(
+                f"Stored procedure execution is not supported for dialect: {dialect}"
+            )
     def execute_stored_procedure(self, procedure_name: str, **kwargs) -> bool:
-        """Execute a stored procedure using the engine connection"""
+        """Execute a stored procedure using the engine connection.
+        Generates dialect-appropriate SQL based on the engine's dialect.
+        """
         self.logger.info(f"Executing stored procedure: {procedure_name}")
         try:
-            params_str = ", ".join([f":{k}" for k in kwargs.keys()])
-            sql = text(f"EXEC {procedure_name} {params_str}")
+            dialect = self.engine.dialect.name
+            sql = self._build_procedure_sql(dialect, procedure_name, kwargs)
             with self.engine.begin() as conn:
                 # self.engine.begin() is a context manager for the transaction
@@ -87,12 +111,14 @@ class DatabaseOperations(LoggedComponent):
                 conn.execute(sql, kwargs)
             return True
         except Exception as e:
-            self.log_exception(e, f"Failed to execute procedure {procedure_name}")
+            self.logger.error(
+                f"Failed to execute procedure {procedure_name} | Error: {str(e)}"
+            )
             raise
     def write_audit(self, table_name: str, entry: AuditEntry):
         """Write an audit entry to the database, ensuring the table exists first."""
-        self.logger.debug(f"Writing audit entry for execution: {entry.execution_id}")
+        self.logger.debug(f"Writing audit entry for execution: {entry.pipeline_id}")
         try:
             from sqlalchemy import (
@@ -108,7 +134,8 @@ class DatabaseOperations(LoggedComponent):
             metadata = MetaData()
             columns: list[SchemaItem] = [
-                Column("execution_id", String(50), primary_key=True),
+                Column("pipeline_id", String(36), primary_key=True),
+                Column("orchestrator_id", String(36)),
                 Column("pipeline_name", String(100)),
                 Column("status", String(50)),
                 Column("input_rows", Integer),

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/models.py RENAMED Viewed

@@ -42,8 +42,8 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
     conn_server: str
     conn_database: str
-    conn_username: str
-    conn_password: str
+    conn_username: str | None = None
+    conn_password: str | None = None
     conn_port: Annotated[int, Field(default=1433, ge=1, le=65535)]
     model_config = SettingsConfigDict(extra="ignore")
@@ -62,6 +62,12 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
             raise ValueError(
                 "Both username and password must be provided for authentication"
             )
+        # If the username and password are captured as empty strings nullify them
+        # which will lead the connection to be created as trusted connection for Sql Server
+        if self.conn_username == "" and self.conn_password == "":
+            self.conn_username = None
+            self.conn_password = None
         return self
@@ -128,6 +134,7 @@ class BasePipelineDefinition(BaseModel):
     destination_table: Optional[str] = None
     audit: Optional[str] = None  # resource name for the audit database
     validator: Optional[Any] = None  # pydantic model class for validation
+    validation_fail: bool = False
     # mapping of source columns to destination columns together with datatypes
     columns: Dict[str, ColumnDefinition] = Field(default_factory=dict)
@@ -195,7 +202,8 @@ class ProcedureDefinition(BaseModel):
 class AuditEntry(BaseModel):
     """Model representing an entry in the audit table"""
-    execution_id: str
+    orchestrator_id: Optional[str] = None
+    pipeline_id: str
     pipeline_name: str
     status: str
     input_rows: int = 0

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/orchestrator.py RENAMED Viewed

@@ -1,4 +1,4 @@
-import uuid
+from uuid import uuid4
 from .config_loader import Configuration
 from .log import LoggedComponent
@@ -17,6 +17,7 @@ class OrchestratorPipeline(LoggedComponent):
     def __init__(self, orchestrator_name: str):
         super().__init__()
+        self.orchestrator_id = str(uuid4())
         self.config = Configuration()
         definition = self.config.get_pipeline(orchestrator_name)
@@ -26,11 +27,10 @@ class OrchestratorPipeline(LoggedComponent):
             )
         self.definition: OrchestratorDefinition = definition
-        self.batch_id = str(uuid.uuid4())
     def run(self) -> bool:
         self.logger.info(
-            f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Batch: {self.batch_id}) ==="
+            f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Orchestrator: {self.orchestrator_id}) ==="
         )
         success = True
@@ -44,9 +44,13 @@ class OrchestratorPipeline(LoggedComponent):
             # Instantiate and run
             if isinstance(p_def, BasePipelineDefinition):
-                p_success = LoadPipeline(pipeline_name).run()
+                p_success = LoadPipeline(
+                    pipeline_name, orchestrator_id=self.orchestrator_id
+                ).run()
             elif isinstance(p_def, ProcedureDefinition):
-                p_success = ProcedurePipeline(pipeline_name).run()
+                p_success = ProcedurePipeline(
+                    pipeline_name, orchestrator_id=self.orchestrator_id
+                ).run()
             elif isinstance(p_def, OrchestratorDefinition):
                 self.logger.error(
                     f"[{self.definition.orchestrator_name}] -> Nested orchestrators are not supported."

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/pipeline.py RENAMED Viewed

@@ -14,6 +14,7 @@ from .models import (
     FileBasedConnectionSettings,
     FileType,
 )
+from .custom_exceptions import PipelineValidationError
 from .pipeline_base import BasePipeline
@@ -24,7 +25,8 @@ class LoadPipeline(BasePipeline):
     Inherits shared logic from BasePipeline.
     """
-    def __init__(self, pipeline_name: str):
+    def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
         # Load definition from config
         definition = Configuration().get_pipeline(pipeline_name)
         if not isinstance(definition, BasePipelineDefinition):
@@ -32,7 +34,7 @@ class LoadPipeline(BasePipeline):
                 f"Pipeline '{pipeline_name}' is not a LoadPipeline definition."
             )
-        super().__init__(definition)
+        super().__init__(definition, orchestrator_id=orchestrator_id)
         self.definition: BasePipelineDefinition = definition
         # initialize components
@@ -95,6 +97,12 @@ class LoadPipeline(BasePipeline):
             # 2. TRANSFORM & VALIDATE
             df = self._transform_step(df)
             self.output_rows = len(df)
+            if df.empty:
+                self.logger.warning(
+                    "No valid data remaining after validation! Pipeline will stop gracefully."
+                )
+                self._log_audit("SUCCESS")
+                return True
             # 3. LOAD
             load_success = self._load_step(df, inferred_dtypes)
@@ -111,6 +119,13 @@ class LoadPipeline(BasePipeline):
             )
             return True
+        except PipelineValidationError as e:
+            self.error_details = str(e)
+            self.logger.error(
+                f"Critical pipeline error - {self.definition.pipeline_name}: {str(e)}"
+            )
+            self._log_audit("FAILED")
+            return False
         except Exception as e:
             self.error_details = str(e)
             self.log_exception(
@@ -192,18 +207,52 @@ class LoadPipeline(BasePipeline):
                 f"Starting data validation against model: {self.definition.validator.__name__}"
             )
             validated_rows = []
+            failed_rows = []
             # Optimization: iterrows is slow because it creates a Series for each row.
             # Converting to a list of dicts first is much faster for iteration.
             records = df.to_dict(orient="records")
+            from pydantic import ValidationError
             for i, record in enumerate(records):
                 try:
                     # Use model_validate for individual row validation
                     model_inst = self.definition.validator.model_validate(record)
                     validated_rows.append(model_inst.model_dump())
+                except ValidationError as ve:
+                    # Format Pydantic validation error to be human friendly
+                    errors_list = []
+                    for err in ve.errors():
+                        loc_str = " -> ".join(str(item) for item in err.get("loc", []))
+                        msg = err.get("msg", "Validation error")
+                        errors_list.append(f"{loc_str}: {msg}")
+                    err_msg = ", ".join(errors_list)
+                    failed_rows.append((i, record, err_msg))
                 except Exception as e:
-                    self.logger.warning(f"Row {i} failed validation: {str(e)}")
+                    failed_rows.append((i, record, str(e)))
+            if failed_rows:
+                total_failed = len(failed_rows)
+                total_records = len(records)
+                if self.definition.validation_fail:
+                    first_errors = "; ".join(
+                        [f"Row {idx}: {err}" for idx, _, err in failed_rows[:5]]
+                    )
+                    if len(failed_rows) > 5:
+                        first_errors += f" ... and {len(failed_rows) - 5} more errors"
+                    self.log_and_raise(
+                        PipelineValidationError,
+                        f"Validation failed for {total_failed} out of {total_records} rows: {first_errors}",
+                    )
+                else:
+                    self.logger.warning(
+                        f"Validation failed for {total_failed} out of {total_records} rows."
+                    )
+                    for idx, record, err in failed_rows:
+                        self.logger.debug(
+                            f"Row {idx} failed validation: {err}. Row data: {record}"
+                        )
             df = pd.DataFrame(validated_rows)

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/pipeline_base.py RENAMED Viewed

@@ -1,6 +1,6 @@
-import uuid
 from abc import ABC, abstractmethod
 from typing import List, Optional, Union
+from uuid import uuid4
 import pandas as pd
@@ -27,10 +27,15 @@ class BasePipeline(LoggedComponent, ABC):
     - Shared resource cleanup
     """
-    def __init__(self, definition: Union[BasePipelineDefinition, ProcedureDefinition]):
+    def __init__(
+        self,
+        definition: Union[BasePipelineDefinition, ProcedureDefinition],
+        orchestrator_id: Optional[str] = None,
+    ):
         super().__init__()
+        self.orchestrator_id = orchestrator_id
+        self.pipeline_id = str(uuid4())
         self.definition = definition
-        self.execution_id = str(uuid.uuid4())
         self.config = Configuration()
         self._active_connectors: List[DatabaseConnector] = []
@@ -113,7 +118,8 @@ class BasePipeline(LoggedComponent, ABC):
                 self.logger.warning(f"Could not read file metadata for audit: {e}")
         entry = AuditEntry(
-            execution_id=self.execution_id,
+            orchestrator_id=self.orchestrator_id,
+            pipeline_id=self.pipeline_id,
             pipeline_name=self.definition.pipeline_name,
             status=status,
             input_rows=self.input_rows,

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/procedure_pipeline.py RENAMED Viewed

@@ -1,3 +1,5 @@
+from typing import Optional
 from .config_loader import Configuration
 from .models import (
     ProcedureDefinition,
@@ -12,7 +14,7 @@ class ProcedurePipeline(BasePipeline):
     Pipeline specialized in executing database stored procedures.
     """
-    def __init__(self, pipeline_name: str):
+    def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
         # Load definition from config
         definition = Configuration().get_pipeline(pipeline_name)
         if not isinstance(definition, ProcedureDefinition):
@@ -20,7 +22,7 @@ class ProcedurePipeline(BasePipeline):
                 f"Pipeline '{pipeline_name}' is not a ProcedureDefinition."
             )
-        super().__init__(definition)
+        super().__init__(definition, orchestrator_id=orchestrator_id)
         self.definition: ProcedureDefinition = definition
         # Set up primary database connection
@@ -61,8 +63,8 @@ class ProcedurePipeline(BasePipeline):
         except Exception as e:
             self.error_details = str(e)
-            self.log_exception(
-                e, f"Error in Procedure Pipeline: {self.definition.pipeline_name}"
+            self.logger.error(
+                f"Procedure Pipeline failed: {self.definition.pipeline_name} | Error: {str(e)}"
             )
             # Set audit details for failure too

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4/src/easy_data_loader.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easy_data_loader
-Version: 0.1.2
+Version: 0.1.4
 Summary: Data transfer utilities between files and databases
 Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
 Classifier: Development Status :: 3 - Alpha
@@ -8,14 +8,17 @@ Classifier: Intended Audience :: Developers
 Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.13
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click>=8.3.0
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.3.3
+Requires-Dist: psycopg2-binary>=2.9.11
 Requires-Dist: pyarrow>=22.0.0
 Requires-Dist: pydantic>=2.12.5
 Requires-Dist: pydantic-settings>=2.12.0

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/SOURCES.txt RENAMED Viewed

@@ -23,4 +23,6 @@ src/easy_data_loader.egg-info/entry_points.txt
 src/easy_data_loader.egg-info/requires.txt
 src/easy_data_loader.egg-info/top_level.txt
 tests/test_data_inference.py
-tests/test_imports.py
+tests/test_imports.py
+tests/test_orchestrator.py
+tests/test_validation.py

{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/requires.txt RENAMED Viewed

@@ -1,6 +1,7 @@
 click>=8.3.0
 openpyxl>=3.1.5
 pandas>=2.3.3
+psycopg2-binary>=2.9.11
 pyarrow>=22.0.0
 pydantic>=2.12.5
 pydantic-settings>=2.12.0

easy_data_loader-0.1.4/tests/test_orchestrator.py ADDED Viewed

@@ -0,0 +1,134 @@
+import os
+import tempfile
+import sqlite3
+import pytest
+from easy_data_loader.config_loader import Configuration
+from easy_data_loader.models import (
+    BasePipelineDefinition,
+    OrchestratorDefinition,
+    FileBasedConnectionSettings,
+    ServerType,
+)
+from easy_data_loader.orchestrator import OrchestratorPipeline
+@pytest.fixture(autouse=True)
+def reset_configuration_singleton():
+    """Reset the Configuration singleton before and after each test."""
+    Configuration._instance = None
+    Configuration._initialized = False
+    yield
+    Configuration._instance = None
+    Configuration._initialized = False
+def test_orchestrator_passes_orchestrator_id_and_writes_audit():
+    # Create temporary file paths for our sqlite databases
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_db_path = os.path.join(tmpdir, "src.db")
+        dst_db_path = os.path.join(tmpdir, "dst.db")
+        audit_db_path = os.path.join(tmpdir, "audit.db")
+        # Create tables and put some data in the source database
+        conn_src = sqlite3.connect(src_db_path)
+        cursor_src = conn_src.cursor()
+        cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
+        cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
+        cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bob')")
+        conn_src.commit()
+        conn_src.close()
+        # Set up resources in Configuration
+        config = Configuration()
+        src_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=src_db_path
+        )
+        dst_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
+        )
+        audit_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
+        )
+        config.resources["src_db"] = src_resource
+        config.resources["dst_db"] = dst_resource
+        config.resources["audit_db"] = audit_resource
+        # Define 2 pipelines that copy data
+        pipeline1_def = BasePipelineDefinition(
+            pipeline_name="pipeline1",
+            source="src_db",
+            source_sql="SELECT * FROM source_table",
+            destination="dst_db",
+            destination_table="target_table1",
+            audit="audit_db",
+            write_parameters={"if_exists": "replace", "index": False},
+        )
+        pipeline2_def = BasePipelineDefinition(
+            pipeline_name="pipeline2",
+            source="src_db",
+            source_sql="SELECT * FROM source_table",
+            destination="dst_db",
+            destination_table="target_table2",
+            audit="audit_db",
+            write_parameters={"if_exists": "replace", "index": False},
+        )
+        # Define orchestrator
+        orchestrator_def = OrchestratorDefinition(
+            orchestrator_name="my_orchestrator",
+            pipelines=["pipeline1", "pipeline2"],
+            fail_fast=True,
+        )
+        config.pipelines["pipeline1"] = pipeline1_def
+        config.pipelines["pipeline2"] = pipeline2_def
+        config.pipelines["my_orchestrator"] = orchestrator_def
+        # Run the orchestrator
+        orchestrator = OrchestratorPipeline("my_orchestrator")
+        assert orchestrator.orchestrator_id is not None
+        success = orchestrator.run()
+        assert success is True
+        # Verify target tables were created and populated in destination
+        conn_dst = sqlite3.connect(dst_db_path)
+        cursor_dst = conn_dst.cursor()
+        cursor_dst.execute("SELECT COUNT(*) FROM target_table1")
+        assert cursor_dst.fetchone()[0] == 2
+        cursor_dst.execute("SELECT COUNT(*) FROM target_table2")
+        assert cursor_dst.fetchone()[0] == 2
+        conn_dst.close()
+        # Verify audit records in the audit database
+        conn_audit = sqlite3.connect(audit_db_path)
+        cursor_audit = conn_audit.cursor()
+        # Read the execution_audit table columns and rows
+        cursor_audit.execute(
+            "SELECT pipeline_id, orchestrator_id, pipeline_name, status FROM execution_audit"
+        )
+        rows = cursor_audit.fetchall()
+        assert len(rows) == 2
+        # Row 1 (pipeline1)
+        assert rows[0][1] == orchestrator.orchestrator_id
+        assert rows[0][2] == "pipeline1"
+        assert rows[0][3] == "SUCCESS"
+        # Row 2 (pipeline2)
+        assert rows[1][1] == orchestrator.orchestrator_id
+        assert rows[1][2] == "pipeline2"
+        assert rows[1][3] == "SUCCESS"
+        # Make sure they have distinct pipeline_ids but identical orchestrator_ids
+        assert rows[0][0] != rows[1][0]
+        conn_audit.close()

easy_data_loader-0.1.4/tests/test_validation.py ADDED Viewed

@@ -0,0 +1,219 @@
+import os
+import tempfile
+import sqlite3
+import pytest
+from pydantic import BaseModel, Field
+from easy_data_loader.config_loader import Configuration
+from easy_data_loader.models import (
+    BasePipelineDefinition,
+    FileBasedConnectionSettings,
+    ServerType,
+)
+from easy_data_loader.pipeline import LoadPipeline
+class ValidationTestModel(BaseModel):
+    """Row schema handed to the pipelines below as their ``validator``."""
+    # Integer row identifier.
+    id: int
+    # Names shorter than three characters fail validation (min_length=3).
+    name: str = Field(min_length=3)
+@pytest.fixture(autouse=True)
+def reset_configuration_singleton():
+    """Reset the Configuration singleton before and after each test."""
+    Configuration._instance = None
+    Configuration._initialized = False
+    yield
+    Configuration._instance = None
+    Configuration._initialized = False
+def test_validation_fail_false_keeps_valid_rows():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_db_path = os.path.join(tmpdir, "src.db")
+        dst_db_path = os.path.join(tmpdir, "dst.db")
+        audit_db_path = os.path.join(tmpdir, "audit.db")
+        # Create source data:
+        # - Row 1: Valid (name length 5 >= 3)
+        # - Row 2: Invalid (name length 2 < 3)
+        # - Row 3: Valid (name length 3 >= 3)
+        conn_src = sqlite3.connect(src_db_path)
+        cursor_src = conn_src.cursor()
+        cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
+        cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
+        cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
+        cursor_src.execute("INSERT INTO source_table VALUES (3, 'Charlie')")
+        conn_src.commit()
+        conn_src.close()
+        config = Configuration()
+        src_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=src_db_path
+        )
+        dst_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
+        )
+        audit_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
+        )
+        config.resources["src_db"] = src_resource
+        config.resources["dst_db"] = dst_resource
+        config.resources["audit_db"] = audit_resource
+        pipeline_def = BasePipelineDefinition(
+            pipeline_name="test_pipeline",
+            source="src_db",
+            source_sql="SELECT * FROM source_table",
+            destination="dst_db",
+            destination_table="target_table",
+            audit="audit_db",
+            validator=ValidationTestModel,
+            validation_fail=False,
+            write_parameters={"if_exists": "replace", "index": False},
+        )
+        config.pipelines["test_pipeline"] = pipeline_def
+        # Run pipeline
+        pipeline = LoadPipeline("test_pipeline")
+        success = pipeline.run()
+        # Should be successful because validation_fail=False
+        assert success is True
+        # Verify destination table only has valid rows (1 and 3)
+        conn_dst = sqlite3.connect(dst_db_path)
+        cursor_dst = conn_dst.cursor()
+        cursor_dst.execute("SELECT id, name FROM target_table ORDER BY id")
+        rows = cursor_dst.fetchall()
+        conn_dst.close()
+        assert len(rows) == 2
+        assert rows[0] == (1, "Alice")
+        assert rows[1] == (3, "Charlie")
+def test_validation_fail_true_fails_pipeline():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_db_path = os.path.join(tmpdir, "src.db")
+        dst_db_path = os.path.join(tmpdir, "dst.db")
+        audit_db_path = os.path.join(tmpdir, "audit.db")
+        # Create source data with an invalid row
+        conn_src = sqlite3.connect(src_db_path)
+        cursor_src = conn_src.cursor()
+        cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
+        cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
+        cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
+        conn_src.commit()
+        conn_src.close()
+        config = Configuration()
+        src_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=src_db_path
+        )
+        dst_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
+        )
+        audit_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
+        )
+        config.resources["src_db"] = src_resource
+        config.resources["dst_db"] = dst_resource
+        config.resources["audit_db"] = audit_resource
+        pipeline_def = BasePipelineDefinition(
+            pipeline_name="test_pipeline",
+            source="src_db",
+            source_sql="SELECT * FROM source_table",
+            destination="dst_db",
+            destination_table="target_table",
+            audit="audit_db",
+            validator=ValidationTestModel,
+            validation_fail=True,
+            write_parameters={"if_exists": "replace", "index": False},
+        )
+        config.pipelines["test_pipeline"] = pipeline_def
+        # Run pipeline
+        pipeline = LoadPipeline("test_pipeline")
+        success = pipeline.run()
+        # Should fail because validation_fail=True
+        assert success is False
+        assert pipeline.error_details is not None
+        assert "Validation failed" in pipeline.error_details
+        assert "Row 1:" in pipeline.error_details  # Row 2 (0-indexed row 1) failed
+def test_validation_fail_false_graceful_stop_when_all_rows_invalid():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_db_path = os.path.join(tmpdir, "src.db")
+        dst_db_path = os.path.join(tmpdir, "dst.db")
+        audit_db_path = os.path.join(tmpdir, "audit.db")
+        # Create source data with ONLY invalid rows
+        conn_src = sqlite3.connect(src_db_path)
+        cursor_src = conn_src.cursor()
+        cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
+        cursor_src.execute("INSERT INTO source_table VALUES (1, 'Bo')")
+        cursor_src.execute("INSERT INTO source_table VALUES (2, 'Co')")
+        conn_src.commit()
+        conn_src.close()
+        config = Configuration()
+        src_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=src_db_path
+        )
+        dst_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
+        )
+        audit_resource = FileBasedConnectionSettings(
+            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
+        )
+        config.resources["src_db"] = src_resource
+        config.resources["dst_db"] = dst_resource
+        config.resources["audit_db"] = audit_resource
+        pipeline_def = BasePipelineDefinition(
+            pipeline_name="test_pipeline",
+            source="src_db",
+            source_sql="SELECT * FROM source_table",
+            destination="dst_db",
+            destination_table="target_table",
+            audit="audit_db",
+            validator=ValidationTestModel,
+            validation_fail=False,
+            write_parameters={"if_exists": "replace", "index": False},
+        )
+        config.pipelines["test_pipeline"] = pipeline_def
+        # Run pipeline
+        pipeline = LoadPipeline("test_pipeline")
+        success = pipeline.run()
+        # Should stop gracefully and return True because validation_fail=False
+        assert success is True
+        assert pipeline.output_rows == 0
+        # Destination table should NOT have been loaded or have any data
+        conn_dst = sqlite3.connect(dst_db_path)
+        cursor_dst = conn_dst.cursor()
+        # The table should not exist or be empty
+        try:
+            cursor_dst.execute("SELECT COUNT(*) FROM target_table")
+            count = cursor_dst.fetchone()[0]
+            assert count == 0
+        except sqlite3.OperationalError:
+            # Table not even created, which is also correct/graceful
+            pass
+        conn_dst.close()