easy-data-loader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ from pydantic import BaseModel, Field, ValidationError, model_validator, ConfigDict
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+ from typing import Optional, TypeAlias, Annotated
4
+ from typing_extensions import Annotated, Any, Union, Dict, List
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from sqlalchemy.types import TypeEngine
8
+ from enum import Enum
9
+
10
+
11
class ServerType(Enum):
    """Database server kinds a connection resource can be declared as."""

    # Each member's value is its own name as an upper-case string.
    MSSQL = 'MSSQL'
    POSTGRESQL = 'POSTGRESQL'
    SQLITE = 'SQLITE'
15
+
16
class FileType(Enum):
    """File formats a file resource can be declared as."""

    # Each member's value is its own name as an upper-case string.
    CSV = 'CSV'
    EXCEL = 'EXCEL'
    ORC = 'ORC'
    PARQUET = 'PARQUET'
21
+
22
class ConnectionSettings(BaseSettings):
    """Settings shared by every database connection resource.

    Keys that do not correspond to a declared field are ignored
    (``extra='ignore'``).
    """

    model_config = SettingsConfigDict(extra='ignore')

    conn_server_type: ServerType = ServerType.MSSQL
    conn_server: str = "."
    conn_database: str
    conn_username: str | None = None
    conn_password: str | None = None
    # Port is constrained to the 1..65535 range and defaults to 1433.
    conn_port: Annotated[int, Field(default=1433, ge=1, le=65535)]

    @model_validator(mode='after')
    def check_auth(self) -> 'ConnectionSettings':
        """Reject a username without a password and vice versa.

        Raises:
            ValueError: when exactly one of the two credentials is set.
        """
        has_username = bool(self.conn_username)
        has_password = bool(self.conn_password)
        if has_username != has_password:
            raise ValueError("Both username and password must be provided for authentication")
        return self
39
+
40
class FileSettings(BaseSettings):
    """Settings describing a file-based resource (a folder plus a file selector)."""

    file_type: FileType = FileType.CSV
    folder_path: Path
    file_name: Optional[str] = None
    file_pattern: Optional[str] = None

    @model_validator(mode='after')
    def validate_model(self) -> 'FileSettings':
        """Check that the folder exists and that name and pattern are not both set.

        Raises:
            ValueError: when ``folder_path`` does not exist, or when both
                ``file_name`` and ``file_pattern`` are provided.
        """
        folder = self.folder_path
        if not folder.exists():
            raise ValueError(f'Root folder path: {folder} - does not exist')

        # A file is selected either by its exact name or by a pattern that
        # matches part of the name; giving both is rejected.
        both_given = bool(self.file_pattern) and bool(self.file_name)
        if both_given:
            raise ValueError('Please define a specific file name or a file pattern, not both.')
        return self
57
+
58
class ColumnDefinition(BaseModel):
    """Describes how one source column maps onto a database table column."""

    # TypeEngine is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Column name in the SQL table; None means no rename is requested.
    target_name: Optional[str] = None
    # SQLAlchemy data type for the column; None means no explicit type.
    data_type: Optional[TypeEngine] = None
64
+
65
+
66
class BasePipelineDefinition(BaseModel):
    """Declarative description of a load pipeline.

    Serves both user-written and auto-generated pipelines. Extra attributes
    are accepted (``extra='allow'``) and non-pydantic types are permitted in
    annotations (``arbitrary_types_allowed=True``).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True, extra='allow')

    pipeline_name: str = 'generic_pipeline'

    source: str
    source_sql: Optional[str] = None  # either a table name or a SQL query
    destination: str
    destination_table: Optional[str] = None
    audit: Optional[str] = None  # resource name of the audit database
    validator: Optional[Any] = None  # pydantic model class used for validation

    # Source column name -> destination column name and data type.
    columns: Dict[str, ColumnDefinition] = Field(default_factory=dict)

    read_parameters: Dict[str, Any] = Field(default_factory=dict)
    write_parameters: Dict[str, Any] = Field(default_factory=dict)

    def file_pre_process(self, file_path: Path) -> Path:
        """
        Hook invoked to prepare a file before it is read
        (for example extracting an archive or renaming the file).

        The default implementation returns ``file_path`` unchanged.
        """
        return file_path

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Hook for transforming the data before upload.

        Meant to be overridden in hand-written pipeline definitions; the
        default implementation returns ``df`` unchanged.
        """
        return df

    def get_sql_columns(self) -> Dict[str, ColumnDefinition]:
        """
        Return the column definitions declared on this pipeline.

        When no mapping is declared the result is empty and the values have
        to be populated by reading the SQL table columns.
        """
        return self.columns

    def get_rename_map(self) -> Dict[str, str]:
        """Return ``{source column: target name}`` for columns that declare a target name."""
        return {
            src_name: spec.target_name
            for src_name, spec in self.columns.items()
            if spec.target_name
        }

    def get_dtype_map(self) -> Dict[str, TypeEngine]:
        """Return ``{column: data type}`` for columns that declare a data type.

        The key is the target name when one is set, otherwise the source name.
        """
        return {
            (spec.target_name or src_name): spec.data_type
            for src_name, spec in self.columns.items()
            if spec.data_type
        }
126
+
127
+
128
+ ResourceConfig: TypeAlias = Annotated[
129
+ Union[ConnectionSettings, FileSettings],
130
+ 'Hybrid resource configuration: it can be a file or database to serve as a source or destination'
131
+ ]
132
+
133
+
134
class ProcedureDefinition(BaseModel):
    """Describes a pipeline whose work is a list of stored procedure calls."""

    pipeline_name: str
    audit: Optional[str] = None
    resource: str  # name of the database resource
    # Each item is (procedure name, parameter dict or None).
    procedures: List[tuple[str, Optional[Dict[str, Any]]]]
141
+
142
+
143
class AuditEntry(BaseModel):
    """One row of the audit table, describing a single pipeline execution."""

    # pd.Timestamp is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Identification and outcome.
    execution_id: str
    pipeline_name: str
    status: str

    # Row counts.
    input_rows: int = 0
    output_rows: int = 0

    # Resource names.
    source_name: Optional[str] = None
    destination_name: Optional[str] = None

    # Source file metadata.
    file_name: Optional[str] = None
    file_path: Optional[str] = None
    file_size_bytes: Optional[int] = None
    file_last_modified: Optional[pd.Timestamp] = None

    error_details: Optional[str] = None

    # Stored procedure details.
    sp_name: Optional[str] = None
    sp_parameters: Optional[Dict[str, Any]] = None

    # Defaults to the moment the entry is created.
    timestamp: pd.Timestamp = Field(default_factory=pd.Timestamp.now)
162
+
163
+
164
class OrchestratorDefinition(BaseModel):
    """Describes an orchestrator: an ordered chain of pipelines."""

    orchestrator_name: str
    # Names of the pipelines to run, in execution order.
    pipelines: List[str]
    # When True, execution stops at the first pipeline that fails.
    fail_fast: bool = True
@@ -0,0 +1,59 @@
1
+ from typing import List
2
+ from .log import LoggedComponent
3
+ from .models import OrchestratorDefinition, BasePipelineDefinition, ProcedureDefinition
4
+ from .config_loader import Configuration
5
+ from .pipeline import LoadPipeline
6
+ from .procedure_pipeline import ProcedurePipeline
7
+ import uuid
8
+
9
class OrchestratorPipeline(LoggedComponent):
    """Runs, in order, the pipelines listed in an OrchestratorDefinition."""

    def __init__(self, orchestrator_name: str):
        """Load the orchestrator definition and assign a batch id.

        Args:
            orchestrator_name: Name under which the orchestrator is registered
                in the configuration.

        A definition that is not an ``OrchestratorDefinition`` is reported
        through ``log_and_raise`` with ``ValueError``.
        """
        super().__init__()
        self.config = Configuration()
        definition = self.config.get_pipeline(orchestrator_name)

        if not isinstance(definition, OrchestratorDefinition):
            self.log_and_raise(ValueError, f"'{orchestrator_name}' is not an orchestrator.")

        self.definition: OrchestratorDefinition = definition
        self.batch_id = str(uuid.uuid4())

    def _run_pipeline(self, pipeline_name: str) -> bool:
        """Look up, build and run one child pipeline; return whether it succeeded.

        Lookup and construction can raise (for instance a pipeline constructor
        rejecting its definition). Such errors are logged and reported as a
        failed step so that ``fail_fast`` stays the only thing deciding whether
        the remaining pipelines run.
        """
        prefix = f"[{self.definition.orchestrator_name}] ->"
        try:
            p_def = self.config.get_pipeline(pipeline_name)

            if isinstance(p_def, BasePipelineDefinition):
                # optionally pass batch_id down if we update BasePipeline later
                return LoadPipeline(pipeline_name).run()
            if isinstance(p_def, ProcedureDefinition):
                return ProcedurePipeline(pipeline_name).run()
            if isinstance(p_def, OrchestratorDefinition):
                self.logger.error(f"{prefix} Nested orchestrators are not supported.")
                return False

            self.logger.error(f"{prefix} Unknown pipeline type for '{pipeline_name}'")
            return False
        except Exception as e:
            self.log_exception(e, f"{prefix} Pipeline could not be executed: {pipeline_name}")
            return False

    def run(self) -> bool:
        """Run every configured pipeline in sequence.

        Returns:
            True when all pipelines succeeded, False when at least one failed.
            With ``fail_fast`` enabled the first failure stops the chain.
        """
        name = self.definition.orchestrator_name
        self.logger.info(f"=== Starting Orchestrator: {name} (Batch: {self.batch_id}) ===")

        success = True
        for pipeline_name in self.definition.pipelines:
            self.logger.info(f"[{name}] -> Triggering pipeline: {pipeline_name}")

            if self._run_pipeline(pipeline_name):
                self.logger.info(f"[{name}] -> Pipeline succeeded: {pipeline_name}")
                continue

            success = False
            self.logger.error(f"[{name}] -> Pipeline failed: {pipeline_name}")
            if self.definition.fail_fast:
                self.logger.error(f"[{name}] -> Fail fast enabled. Stopping orchestrator.")
                break

        status = "SUCCESS" if success else "FAILED"
        self.logger.info(f"=== Orchestrator {name} finished with status: {status} ===")
        return success
@@ -0,0 +1,169 @@
1
+ from typing import Optional, List, Any
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from pydantic import BaseModel
5
+
6
+ from .pipeline_base import BasePipeline
7
+ from .models import ConnectionSettings, FileSettings, BasePipelineDefinition
8
+ from .file_operations import FileOperations
9
+ from .database_operations import DatabaseOperations
10
+ from .config_loader import Configuration
11
+
12
class LoadPipeline(BasePipeline):
    """
    Runs one extract -> transform -> load pipeline identified by name.

    The pipeline definition and its source/destination resources are read
    from the Configuration instance. Auditing and connector cleanup are
    inherited from BasePipeline.
    """

    def __init__(self, pipeline_name: str):
        """Load the definition for ``pipeline_name`` and set up its resources.

        Raises:
            ValueError: when the configured definition is not a
                BasePipelineDefinition.
        """
        # Load definition from config
        definition = Configuration().get_pipeline(pipeline_name)
        if not isinstance(definition, BasePipelineDefinition):
            raise ValueError(f"Pipeline '{pipeline_name}' is not a LoadPipeline definition.")

        super().__init__(definition)
        self.definition: BasePipelineDefinition = definition

        # Exactly which of these are populated depends on the resource types.
        self.src_db_ops: Optional[DatabaseOperations] = None
        self.dst_db_ops: Optional[DatabaseOperations] = None
        self.src_file_ops: Optional[FileOperations] = None
        self.dst_file_ops: Optional[FileOperations] = None
        self.source_file_path: Optional[Path] = None  # For auditing

        try:
            self._initialize_components()
        except Exception:
            # The base constructor has already registered the audit connector;
            # run() (and therefore its cleanup) will never be reached, so
            # release what was opened before propagating the error.
            self._cleanup()
            raise

    def _initialize_components(self):
        """Create the source and destination operation objects from their resource types."""
        source_resource = self.config.get_resource(self.definition.source)
        destination_resource = self.config.get_resource(self.definition.destination)

        # Source: file folder or database connection.
        if isinstance(source_resource, FileSettings):
            self.src_file_ops = FileOperations(source_resource)
            # Store resource names for auditing
            self.source_name = source_resource.folder_path.name
        elif isinstance(source_resource, ConnectionSettings):
            self.src_db_ops = self._setup_db_operations(source_resource)
            self.source_name = source_resource.conn_database

        # Destination: file folder or database connection.
        if isinstance(destination_resource, FileSettings):
            self.dst_file_ops = FileOperations(destination_resource)
            self.destination_name = destination_resource.folder_path.name
        elif isinstance(destination_resource, ConnectionSettings):
            self.dst_db_ops = self._setup_db_operations(destination_resource)
            self.destination_name = destination_resource.conn_database

    def run(self) -> bool:
        """Execute the extract, transform and load steps.

        Returns:
            True when the data was loaded, False when no data was found, the
            load step failed, or any step raised. Connectors are disposed in
            every case.
        """
        self.logger.info(f">>> Starting Load Pipeline: {self.definition.pipeline_name} <<<")

        try:
            # 1. EXTRACT
            df = self._extract_step()
            if df.empty:
                self.logger.warning("No data found! Pipeline will stop!")
                return False

            self.input_rows = len(df)

            # 2. TRANSFORM & VALIDATE
            df = self._transform_step(df)
            self.output_rows = len(df)

            # 3. LOAD
            load_success = self._load_step(df)
            if not load_success:
                self.logger.error(f'>>> Pipeline {self.definition.pipeline_name} failed to load during the LOAD step.')
                self._log_audit("FAILED")
                return False

            self._log_audit("SUCCESS")
            self.logger.info(f">>> Pipeline {self.definition.pipeline_name} finished successfully <<<")
            return True

        except Exception as e:
            self.error_details = str(e)
            self.log_exception(e, f"Critical pipeline error - {self.definition.pipeline_name}")
            self._log_audit("FAILED")
            return False
        finally:
            self._cleanup()

    def _extract_step(self) -> pd.DataFrame:
        """Read the source data.

        Returns:
            The extracted frame, or an empty frame when no usable source is
            configured (a database source without ``source_sql`` counts as
            unusable). Exceptions from the read are logged and re-raised.
        """
        try:
            if self.src_db_ops:  # DB source
                if self.definition.source_sql:
                    return self.src_db_ops.read_data(self.definition.source_sql, **self.definition.read_parameters)

            if self.src_file_ops:  # File source
                self.src_file_ops._apply_file_preprocessor(self.definition.file_pre_process)
                df = self.src_file_ops.read_file(**self.definition.read_parameters)
                self.source_file_path = self.src_file_ops.file_path  # Ensure we have the path for auditing
                return df

            self.logger.error('No valid source configured for the Pipeline')

        except Exception as e:
            self.log_exception(e, 'Extraction step failed')
            raise

        return pd.DataFrame()

    def _transform_step(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the definition's transform hook, column renames and optional validation.

        When a validator is configured, rows that fail validation are logged
        and left out of the returned frame.
        """
        # 1. Pipeline hook transformation
        df = self.definition.transform(df)

        # 2. Renaming columns (can also be handled by Pydantic aliases)
        rename_map = self.definition.get_rename_map()
        if rename_map:
            df.rename(columns=rename_map, inplace=True)
            self.logger.info(f"Columns renamed: {list(rename_map.values())}")

        # 3. Optional Pydantic Validation
        if self.definition.validator:
            self.logger.info(f"Starting data validation against model: {self.definition.validator.__name__}")
            validated_rows = []

            # Iterating plain dicts avoids building a Series per row, which is
            # what makes iterrows slow.
            records = df.to_dict(orient='records')

            for i, record in enumerate(records):
                try:
                    # Use model_validate for individual row validation
                    model_inst = self.definition.validator.model_validate(record)
                    validated_rows.append(model_inst.model_dump())
                except Exception as e:
                    self.logger.warning(f"Row {i} failed validation: {str(e)}")

            df = pd.DataFrame(validated_rows)

        return df

    def _load_step(self, df: pd.DataFrame) -> bool:
        """Write ``df`` to the configured destination.

        Returns:
            True when the write call completed, False when it raised or when
            no usable destination exists (a database destination requires
            ``destination_table``).
        """
        try:
            if self.dst_db_ops and self.definition.destination_table:  # DB destination
                dtype_map = self.definition.get_dtype_map()
                self.dst_db_ops.write_to_table(
                    table_name=self.definition.destination_table,
                    df=df,
                    dtype=dtype_map,
                    **self.definition.write_parameters,
                )
                return True
            elif self.dst_file_ops:  # File destination
                self.dst_file_ops.write_file(df, **self.definition.write_parameters)
                return True

            # Mirror the extract step: say why nothing was written.
            self.logger.error('No valid destination configured for the Pipeline')
            return False
        except Exception as e:
            self.log_exception(e, 'Error writing to destination')
            return False
169
+
@@ -0,0 +1,121 @@
1
+ import uuid
2
+ from datetime import datetime
3
+ import pandas as pd
4
+ from abc import ABC, abstractmethod
5
+ from typing import List, Optional, Union
6
+
7
+ from .log import LoggedComponent
8
+ from .models import ConnectionSettings, AuditEntry, BasePipelineDefinition, ProcedureDefinition, ServerType
9
+ from .database_connector import DatabaseConnector, CONNECTOR_FACTORY
10
+ from .database_operations import DatabaseOperations
11
+ from .config_loader import Configuration
12
+
13
class BasePipeline(LoggedComponent, ABC):
    """
    Abstract base class for all pipeline types.
    Handles:
    - Execution ID generation
    - Audit resource initialization
    - Shared resource cleanup
    """

    def __init__(self, definition: Union[BasePipelineDefinition, ProcedureDefinition]):
        """Store the definition, create an execution id and open the audit resource."""
        super().__init__()
        self.definition = definition
        self.execution_id = str(uuid.uuid4())
        self.config = Configuration()

        # Every connector created through _setup_db_operations is tracked
        # here so that _cleanup can dispose of it.
        self._active_connectors: List[DatabaseConnector] = []
        self.audit_db_ops: Optional[DatabaseOperations] = None

        # Metadata for auditing
        self.input_rows = 0
        self.output_rows = 0
        self.error_details: Optional[str] = None
        self.start_time = pd.Timestamp.now()

        self._initialize_audit_resource()

    def _initialize_audit_resource(self):
        """Initialize the database operations used for auditing.

        Uses the resource named by ``definition.audit`` when set (it must be
        a database connection); otherwise falls back to a SQLite database at
        ``audit_logs.db``.
        """
        audit_resource_name = self.definition.audit

        if audit_resource_name:
            # External audit database
            resource = self.config.get_resource(audit_resource_name)
            if not isinstance(resource, ConnectionSettings):
                self.log_and_raise(ValueError, f"Audit resource '{audit_resource_name}' must be a database connection.")
            self.audit_db_ops = self._setup_db_operations(resource)
        else:
            # SQLite Fallback
            self.logger.info("No audit resource defined. Falling back to SQLite audit_logs.db")
            sqlite_settings = ConnectionSettings(
                conn_server_type=ServerType.SQLITE,
                conn_server="audit_logs.db",
                conn_database="audit_logs"  # Not used by SQLite but required by model
            )
            self.audit_db_ops = self._setup_db_operations(sqlite_settings)

    def _setup_db_operations(self, settings: ConnectionSettings) -> DatabaseOperations:
        """Create a connector for ``settings``, track it for cleanup and return its operations.

        An unsupported server type is reported through ``log_and_raise`` with
        ``ValueError``.
        """
        server_type = settings.conn_server_type

        if server_type not in CONNECTOR_FACTORY:
            self.log_and_raise(ValueError, f"Unsupported server type: {server_type}")

        connector = CONNECTOR_FACTORY[server_type](settings)
        self._active_connectors.append(connector)
        return DatabaseOperations(connector.get_engine())

    def _cleanup(self):
        """Dispose of all active database connectors.

        A failure while disposing one connector is logged and does not
        prevent the remaining connectors from being disposed.
        """
        for connector in self._active_connectors:
            try:
                connector._dispose_engine()
            except Exception as e:
                self.logger.warning(f"Failed to dispose database connector: {e}")
        self.logger.debug("Pipeline cleanup completed.")

    def _log_audit(self, status: str):
        """Write the execution audit entry to the audit database.

        Auditing is best effort: a failure to build or write the entry is
        logged and never raised to the caller.
        """
        if not self.audit_db_ops:
            return

        # Collect the optional file metadata first so that the entry can be
        # built (and validated) in a single constructor call.
        file_meta = {}
        source_file = getattr(self, 'source_file_path', None)
        if source_file:
            try:
                stat = source_file.stat()
                file_meta = {
                    "file_name": source_file.name,
                    "file_path": str(source_file),
                    "file_size_bytes": stat.st_size,
                    "file_last_modified": pd.Timestamp(stat.st_mtime, unit='s'),
                }
            except OSError as e:
                self.logger.warning(f"Could not read file metadata for audit: {e}")

        try:
            # Built inside the try so that a validation error in the entry
            # is handled like any other audit failure.
            entry = AuditEntry(
                execution_id=self.execution_id,
                pipeline_name=self.definition.pipeline_name,
                status=status,
                input_rows=self.input_rows,
                output_rows=self.output_rows,
                source_name=getattr(self, 'source_name', None),
                destination_name=getattr(self, 'destination_name', None),
                sp_name=getattr(self, 'sp_name', None),
                sp_parameters=getattr(self, 'sp_parameters', None),
                timestamp=pd.Timestamp.now(),
                error_details=self.error_details,
                **file_meta,
            )
            self.audit_db_ops.write_audit("execution_audit", entry)
        except Exception as e:
            self.logger.error(f"Failed to write audit log: {str(e)}")

    @abstractmethod
    def run(self) -> bool:
        """Main execution logic to be implemented by child classes."""
        pass
@@ -0,0 +1,56 @@
1
+ from typing import Dict, Any
2
+ from .pipeline_base import BasePipeline
3
+ from .models import ProcedureDefinition, ConnectionSettings
4
+ from .config_loader import Configuration
5
+
6
class ProcedurePipeline(BasePipeline):
    """
    Pipeline specialized in executing database stored procedures.
    """

    def __init__(self, pipeline_name: str):
        """Load the definition for ``pipeline_name`` and connect to its database resource.

        Raises:
            ValueError: when the configured definition is not a
                ProcedureDefinition. A resource that is not a database
                connection is reported through ``log_and_raise`` with
                ``ValueError``.
        """
        # Load definition from config
        definition = Configuration().get_pipeline(pipeline_name)
        if not isinstance(definition, ProcedureDefinition):
            raise ValueError(f"Pipeline '{pipeline_name}' is not a ProcedureDefinition.")

        super().__init__(definition)
        self.definition: ProcedureDefinition = definition

        try:
            # Set up primary database connection
            resource = self.config.get_resource(self.definition.resource)
            if not isinstance(resource, ConnectionSettings):
                self.log_and_raise(ValueError, f"Resource '{self.definition.resource}' must be a database connection.")

            self.db_ops = self._setup_db_operations(resource)
        except Exception:
            # The base constructor has already registered the audit connector;
            # run() (and its cleanup) will never be reached, so release what
            # was opened before propagating the error.
            self._cleanup()
            raise

    def run(self) -> bool:
        """Execute all procedures in the definition.

        Returns:
            True when every procedure call completed, False when one raised.
            An audit entry is written and connectors are disposed either way.
        """
        self.logger.info(f">>> Starting Procedure Pipeline: {self.definition.pipeline_name} <<<")

        # Audit details are the same for success and failure, so set them once.
        self.sp_name = ", ".join([p[0] for p in self.definition.procedures])
        self.sp_parameters = {p[0]: p[1] for p in self.definition.procedures}

        try:
            for proc_name, params in self.definition.procedures:
                params = params or {}
                self.logger.info(f"Running procedure: {proc_name} with params: {params}")
                self.db_ops.execute_stored_procedure(proc_name, **params)

            self._log_audit("SUCCESS")
            self.logger.info(f">>> Procedure Pipeline {self.definition.pipeline_name} finished successfully <<<")
            return True

        except Exception as e:
            self.error_details = str(e)
            self.log_exception(e, f"Error in Procedure Pipeline: {self.definition.pipeline_name}")
            self._log_audit("FAILED")
            return False
        finally:
            self._cleanup()
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: easy_data_loader
3
+ Version: 0.1.0
4
+ Summary: Data transfer utilities between files and databases
5
+ Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: click>=8.3.0
10
+ Requires-Dist: ipykernel
11
+ Requires-Dist: openpyxl>=3.1.5
12
+ Requires-Dist: pandas>=2.3.3
13
+ Requires-Dist: pyarrow>=22.0.0
14
+ Requires-Dist: pydantic>=2.12.5
15
+ Requires-Dist: pydantic-settings>=2.12.0
16
+ Requires-Dist: pyodbc>=5.2.0
17
+ Requires-Dist: python-dotenv>=1.1.1
18
+ Requires-Dist: sqlalchemy>=2.0.43
19
+ Dynamic: license-file
20
+
21
+ # Easy Data Loader 🚀
22
+
23
+ **Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various data sources (CSV, Excel, Parquet) and SQL databases (MSSQL, PostgreSQL, and others).
24
+
25
+ ## ✨ Key Features
26
+ - **Declarative Configuration**: Manage connections and pipelines through simple Python files and `.env` resources.
27
+ - **Integrated CLI**: Initialize a standardized project structure with a single command.
28
+ - **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
29
+ - **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
30
+ - **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
31
+
32
+ ---
33
+
34
+ ## 📦 Installation
35
+
36
+ Install directly via `pip` or `uv`:
37
+
38
+ ```bash
39
+ pip install easy_data_loader
40
+ ```
41
+
42
+ ## 🚀 Getting Started
43
+
44
+ 1. Initialize a new project structure to generate template configurations:
45
+ ```bash
46
+ easy-loader init
47
+ ```
48
+ 2. Review the generated `config/` folders for sample resources and pipelines.
49
+ 3. Run all discovered pipelines across the active configurations:
50
+ ```bash
51
+ easy-loader run_all
52
+ ```
@@ -0,0 +1,20 @@
1
+ easy_data_loader/__init__.py,sha256=kUkTXzuQYPFTU3pBvGODVgYrF6V6XGiXrSHpEQ81uZY,439
2
+ easy_data_loader/cli.py,sha256=W-JIEMB3abQpD5N8vvOd8LB3_Jd-N_tyooRGku_qNmE,9893
3
+ easy_data_loader/config_loader.py,sha256=KsXNN5UHr6GA8oxu_sJ0K3mHCI5UOuulQat3Jt-o0vk,7254
4
+ easy_data_loader/custom_exceptions.py,sha256=nF-0ENJOPTtyrv_UO_kHSvVM3tfFXQmLHqXvADTzW8E,735
5
+ easy_data_loader/database_connector.py,sha256=K879FpeIrcdE_NCcA44rW6CDMgB3tPbp_t6V_aFdKlI,6784
6
+ easy_data_loader/database_operations.py,sha256=oAm4LRC84KLqeFlzHz0OdsS32y-TVad2x_UHMGCMI0o,5280
7
+ easy_data_loader/driver_detector.py,sha256=L8H7IPEvDD7eb183wQH2eFwcmOuW_oxy7tNxGH-Kmlo,1280
8
+ easy_data_loader/file_operations.py,sha256=Eu4H6q6CLAQe-zE12ZBJ1m_A9zp9m1UDKYGChwGBD-s,5820
9
+ easy_data_loader/log.py,sha256=yIfvOhqnHzq8TogdvK6Si1LVsbI9QXsw0EDRIdEluS4,2825
10
+ easy_data_loader/models.py,sha256=yY1z6WcWqKyBefO4br9f88WeiNdKQEbBUqWDNr2sUok,5551
11
+ easy_data_loader/orchestrator.py,sha256=myDCM1Jmd1Soj_AvIuCZgi_uPAZvSpyJMKROb2u2tVc,2861
12
+ easy_data_loader/pipeline.py,sha256=UxjpZ9Q4F1uzlUIoO9r5a4y6B6lexhVKWwHorbRSenQ,6020
13
+ easy_data_loader/pipeline_base.py,sha256=qnb8dFEgQThFPT6rF9ywFzhdtFs1tB9wy7H6VDPBoRA,4201
14
+ easy_data_loader/procedure_pipeline.py,sha256=vYtYaIshnr34KSF_PeraUM1bWHFUYVcT9L2s2gLTMIQ,2075
15
+ easy_data_loader-0.1.0.dist-info/licenses/LICENSE,sha256=zMCK-7LtJpSF-qQw0A1-t9a6N3F49e36vI3bh1Klvlk,1070
16
+ easy_data_loader-0.1.0.dist-info/METADATA,sha256=sv99uwRor2gXojTwrUvH1dWNuQR_gLuO12Bq_HOcl54,1856
17
+ easy_data_loader-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
18
+ easy_data_loader-0.1.0.dist-info/entry_points.txt,sha256=o0rWYa-lvUmVnX_rBRTcr1UQRG8aJWDdA_BdDhX-pug,58
19
+ easy_data_loader-0.1.0.dist-info/top_level.txt,sha256=YsMeo8e9snRg9aEw7rSdLL0zFuaGCJOEzAh7TiJkloc,17
20
+ easy_data_loader-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ easy-loader = easy_data_loader.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Bojoi Gabriel
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ easy_data_loader