easy-data-loader 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easy_data_loader/__init__.py +11 -0
- easy_data_loader/cli.py +302 -0
- easy_data_loader/config_loader.py +184 -0
- easy_data_loader/custom_exceptions.py +21 -0
- easy_data_loader/database_connector.py +190 -0
- easy_data_loader/database_operations.py +129 -0
- easy_data_loader/driver_detector.py +46 -0
- easy_data_loader/file_operations.py +146 -0
- easy_data_loader/log.py +90 -0
- easy_data_loader/models.py +168 -0
- easy_data_loader/orchestrator.py +59 -0
- easy_data_loader/pipeline.py +169 -0
- easy_data_loader/pipeline_base.py +121 -0
- easy_data_loader/procedure_pipeline.py +56 -0
- easy_data_loader-0.1.0.dist-info/METADATA +52 -0
- easy_data_loader-0.1.0.dist-info/RECORD +20 -0
- easy_data_loader-0.1.0.dist-info/WHEEL +5 -0
- easy_data_loader-0.1.0.dist-info/entry_points.txt +2 -0
- easy_data_loader-0.1.0.dist-info/licenses/LICENSE +21 -0
- easy_data_loader-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field, ValidationError, model_validator, ConfigDict
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
from typing import Optional, TypeAlias, Annotated
|
|
4
|
+
from typing_extensions import Annotated, Any, Union, Dict, List
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from sqlalchemy.types import TypeEngine
|
|
8
|
+
from enum import Enum
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ServerType(Enum):
    """Database server kinds that a connection can target."""
    # Each member's value is its own upper-case name.
    MSSQL = 'MSSQL'
    POSTGRESQL = 'POSTGRESQL'
    SQLITE = 'SQLITE'
|
|
15
|
+
|
|
16
|
+
class FileType(Enum):
    """File formats that a file resource can declare."""
    # Each member's value is its own upper-case name.
    CSV = 'CSV'
    EXCEL = 'EXCEL'
    ORC = 'ORC'
    PARQUET = 'PARQUET'
|
|
21
|
+
|
|
22
|
+
class ConnectionSettings(BaseSettings):
    """Settings describing a single database connection.

    Keys that do not match a declared field are ignored (``extra='ignore'``).
    Credentials are optional, but a username and a password must always be
    supplied together.
    """
    model_config = SettingsConfigDict(extra='ignore')

    conn_server_type: ServerType = ServerType.MSSQL
    conn_server: str = "."
    conn_database: str
    conn_username: str | None = None
    conn_password: str | None = None
    # Defaults to 1433 and must be a valid TCP port number.
    conn_port: Annotated[int, Field(default=1433, ge=1, le=65535)]

    @model_validator(mode='after')
    def check_auth(self) -> 'ConnectionSettings':
        """Reject configurations where only one of username/password is set.

        Raises:
            ValueError: if exactly one of the two credentials is provided.
        """
        has_username = bool(self.conn_username)
        has_password = bool(self.conn_password)
        # A mismatch means exactly one of the two credentials was given.
        if has_username != has_password:
            raise ValueError("Both username and password must be provided for authentication")
        return self
|
|
39
|
+
|
|
40
|
+
class FileSettings(BaseSettings):
    """Settings describing a file-based resource.

    The file inside ``folder_path`` may be selected by an exact ``file_name``
    or by a ``file_pattern``, but not by both at once.
    """
    file_type: FileType = FileType.CSV
    folder_path: Path
    file_name: Optional[str] = None
    file_pattern: Optional[str] = None

    @model_validator(mode='after')
    def validate_model(self) -> 'FileSettings':
        """Check that the folder exists and that only one file selector is set.

        Raises:
            ValueError: if ``folder_path`` does not exist, or if both
                ``file_name`` and ``file_pattern`` are provided.
        """
        folder_missing = not self.folder_path.exists()
        if folder_missing:
            raise ValueError(f'Root folder path: {self.folder_path} - does not exist')

        # Either the exact file name is known, or only a part of it (pattern).
        both_selectors_given = bool(self.file_name and self.file_pattern)
        if both_selectors_given:
            raise ValueError('Please define a specific file name or a file pattern, not both.')

        return self
|
|
57
|
+
|
|
58
|
+
class ColumnDefinition(BaseModel):
    """Describes how one source column maps onto a database table column."""
    # arbitrary_types_allowed lets pydantic accept SQLAlchemy TypeEngine values.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Column name in the SQL table; when unset no rename is produced for it.
    target_name: Optional[str] = None
    # SQLAlchemy data type for the column; when unset no dtype is produced for it.
    data_type: Optional[TypeEngine] = None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class BasePipelineDefinition(BaseModel):
    """Declarative description of a load pipeline.

    Serves both hand-written pipeline definitions and autogenerated ones.
    ``file_pre_process`` and ``transform`` are hooks that return their input
    unchanged here and are meant to be overridden.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        extra='allow',
    )
    pipeline_name: str = 'generic_pipeline'

    source: str
    source_sql: Optional[str] = None  # either a table name or a SQL query
    destination: str
    destination_table: Optional[str] = None
    audit: Optional[str] = None  # name of the resource used as audit database
    validator: Optional[Any] = None  # pydantic model class used for row validation

    # Source column name -> destination column definition (name and data type).
    columns: Dict[str, ColumnDefinition] = Field(default_factory=dict)

    read_parameters: Dict[str, Any] = Field(default_factory=dict)
    write_parameters: Dict[str, Any] = Field(default_factory=dict)

    def file_pre_process(self, file_path: Path) -> Path:
        """
        Hook invoked on the source file before it is read
        (e.g. unpack an archive or rename the file).
        The default implementation returns ``file_path`` untouched.
        """
        return file_path

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Hook for transforming the data before it is uploaded.
        The default implementation returns ``df`` untouched.
        """
        return df

    def get_sql_columns(self) -> Dict[str, ColumnDefinition]:
        """
        Return the column definitions declared on this pipeline.
        When no mapping was declared the result is empty and the caller
        has to obtain the columns from the SQL table instead.
        """
        return self.columns

    def get_rename_map(self) -> Dict[str, str]:
        """Return source name -> target name for every column that declares a target name."""
        return {
            source_name: definition.target_name
            for source_name, definition in self.columns.items()
            if definition.target_name
        }

    def get_dtype_map(self) -> Dict[str, TypeEngine]:
        """Return column name -> SQL data type for every column that declares a data type.

        The key is the target name when one is set, otherwise the source name.
        """
        return {
            (definition.target_name or source_name): definition.data_type
            for source_name, definition in self.columns.items()
            if definition.data_type
        }
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Alias for "any resource": either database connection settings or file settings.
# The string inside Annotated is descriptive metadata attached to the alias.
ResourceConfig: TypeAlias = Annotated[
    Union[ConnectionSettings, FileSettings],
    'Hybrid resource configuration: it can be a file or database to serve as a source or destination'
]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class ProcedureDefinition(BaseModel):
    """Model representing a pipeline that executes stored procedures"""
    pipeline_name: str
    # Name of the audit resource; optional.
    audit: Optional[str] = None
    # Name of the database resource the procedures are run against.
    resource: str
    # Ordered list of (procedure_name, parameter_dict); the dict may be None.
    procedures: List[tuple[str, Optional[Dict[str, Any]]]]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class AuditEntry(BaseModel):
    """Model representing an entry in the audit table"""
    # Identification of the run being audited.
    execution_id: str
    pipeline_name: str
    status: str
    # Row counts; both default to 0.
    input_rows: int = 0
    output_rows: int = 0
    source_name: Optional[str] = None
    destination_name: Optional[str] = None
    # Metadata of the source file, when there is one.
    file_name: Optional[str] = None
    file_path: Optional[str] = None
    file_size_bytes: Optional[int] = None
    file_last_modified: Optional[pd.Timestamp] = None
    error_details: Optional[str] = None
    # Stored-procedure details, when there are any.
    sp_name: Optional[str] = None
    sp_parameters: Optional[Dict[str, Any]] = None
    # Defaults to the time the entry is created.
    timestamp: pd.Timestamp = Field(default_factory=pd.Timestamp.now)

    # arbitrary_types_allowed lets pydantic accept pd.Timestamp field values.
    model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class OrchestratorDefinition(BaseModel):
    """Model representing an orchestrator that chains multiple pipelines"""
    orchestrator_name: str
    # Names of the pipelines to run, in sequence.
    pipelines: List[str]
    # When True, execution stops at the first pipeline that fails.
    fail_fast: bool = True
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from .log import LoggedComponent
|
|
3
|
+
from .models import OrchestratorDefinition, BasePipelineDefinition, ProcedureDefinition
|
|
4
|
+
from .config_loader import Configuration
|
|
5
|
+
from .pipeline import LoadPipeline
|
|
6
|
+
from .procedure_pipeline import ProcedurePipeline
|
|
7
|
+
import uuid
|
|
8
|
+
|
|
9
|
+
class OrchestratorPipeline(LoggedComponent):
    """Executes a chain of pipelines defined by an OrchestratorDefinition"""

    def __init__(self, orchestrator_name: str):
        """Load the orchestrator definition by name and assign a batch id.

        Raises:
            ValueError: (via ``log_and_raise``) if the named definition is not
                an ``OrchestratorDefinition``.
        """
        super().__init__()
        self.config = Configuration()
        definition = self.config.get_pipeline(orchestrator_name)

        if not isinstance(definition, OrchestratorDefinition):
            self.log_and_raise(ValueError, f"'{orchestrator_name}' is not an orchestrator.")

        self.definition: OrchestratorDefinition = definition
        self.batch_id = str(uuid.uuid4())

    def _run_pipeline(self, pipeline_name: str) -> bool:
        """Look up, build and run one child pipeline; return True on success.

        Any exception raised while resolving or constructing the child pipeline
        is logged and reported as a failure, so that the ``fail_fast`` setting
        of the orchestrator decides whether the chain continues.
        """
        tag = f"[{self.definition.orchestrator_name}]"
        try:
            p_def = self.config.get_pipeline(pipeline_name)

            if isinstance(p_def, BasePipelineDefinition):
                # optionally pass batch_id down if we update BasePipeline later
                return bool(LoadPipeline(pipeline_name).run())
            if isinstance(p_def, ProcedureDefinition):
                return bool(ProcedurePipeline(pipeline_name).run())
        except Exception as e:
            # Constructors open connections and validate resources, so they can fail.
            self.log_exception(e, f"{tag} -> Could not run pipeline '{pipeline_name}'")
            return False

        if isinstance(p_def, OrchestratorDefinition):
            self.logger.error(f"{tag} -> Nested orchestrators are not supported.")
        else:
            self.logger.error(f"{tag} -> Unknown pipeline type for '{pipeline_name}'")
        return False

    def run(self) -> bool:
        """Run every configured pipeline in order.

        Returns:
            True only if all pipelines succeeded. With ``fail_fast`` enabled the
            chain stops at the first failure; otherwise the remaining pipelines
            still run and the overall result is False.
        """
        self.logger.info(f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Batch: {self.batch_id}) ===")

        success = True
        for pipeline_name in self.definition.pipelines:
            self.logger.info(f"[{self.definition.orchestrator_name}] -> Triggering pipeline: {pipeline_name}")

            if self._run_pipeline(pipeline_name):
                self.logger.info(f"[{self.definition.orchestrator_name}] -> Pipeline succeeded: {pipeline_name}")
                continue

            success = False
            self.logger.error(f"[{self.definition.orchestrator_name}] -> Pipeline failed: {pipeline_name}")
            if self.definition.fail_fast:
                self.logger.error(f"[{self.definition.orchestrator_name}] -> Fail fast enabled. Stopping orchestrator.")
                break

        status = "SUCCESS" if success else "FAILED"
        self.logger.info(f"=== Orchestrator {self.definition.orchestrator_name} finished with status: {status} ===")
        return success
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from typing import Optional, List, Any
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from .pipeline_base import BasePipeline
|
|
7
|
+
from .models import ConnectionSettings, FileSettings, BasePipelineDefinition
|
|
8
|
+
from .file_operations import FileOperations
|
|
9
|
+
from .database_operations import DatabaseOperations
|
|
10
|
+
from .config_loader import Configuration
|
|
11
|
+
|
|
12
|
+
class LoadPipeline(BasePipeline):
    """
    Executes a specific ETL pipeline by name.
    It retrieves definitions and resources from the Configuration instance.
    Inherits shared logic (audit, connector cleanup) from BasePipeline.
    """

    def __init__(self, pipeline_name: str):
        """Load the named definition and set up source/destination components.

        Raises:
            ValueError: if the named definition is not a ``BasePipelineDefinition``.
        """
        # Load definition from config
        definition = Configuration().get_pipeline(pipeline_name)
        if not isinstance(definition, BasePipelineDefinition):
            raise ValueError(f"Pipeline '{pipeline_name}' is not a LoadPipeline definition.")

        super().__init__(definition)
        self.definition: BasePipelineDefinition = definition

        # initialize components
        self.src_db_ops: Optional[DatabaseOperations] = None
        self.dst_db_ops: Optional[DatabaseOperations] = None
        self.src_file_ops: Optional[FileOperations] = None
        self.dst_file_ops: Optional[FileOperations] = None
        self.source_file_path: Optional[Path] = None  # For auditing

        # Always defined so the audit step never depends on which branch ran.
        self.source_name: Optional[str] = None
        self.destination_name: Optional[str] = None

        try:
            self._initialize_components()
        except Exception:
            # The audit connector (and possibly the source one) already exists;
            # dispose them instead of leaking when setup fails half-way.
            self._cleanup()
            raise

    def _initialize_components(self):
        """Dynamically initialize the components based on their type and definition"""

        source_resource = self.config.get_resource(self.definition.source)
        destination_resource = self.config.get_resource(self.definition.destination)

        # Source: file or database. The resource name is kept for auditing.
        if isinstance(source_resource, FileSettings):
            self.src_file_ops = FileOperations(source_resource)
            self.source_name = source_resource.folder_path.name
        elif isinstance(source_resource, ConnectionSettings):
            self.src_db_ops = self._setup_db_operations(source_resource)
            self.source_name = source_resource.conn_database

        # Destination: file or database.
        if isinstance(destination_resource, FileSettings):
            self.dst_file_ops = FileOperations(destination_resource)
            self.destination_name = destination_resource.folder_path.name
        elif isinstance(destination_resource, ConnectionSettings):
            self.dst_db_ops = self._setup_db_operations(destination_resource)
            self.destination_name = destination_resource.conn_database

    def run(self) -> bool:
        """Executes the entire ETL flow.

        Returns:
            True when data was extracted, transformed and loaded; False when
            nothing was extracted, the load failed, or any step raised.
            Connectors are disposed in every case.
        """
        self.logger.info(f">>> Starting Load Pipeline: {self.definition.pipeline_name} <<<")

        try:
            # 1. EXTRACT
            df = self._extract_step()
            if df.empty:
                # NOTE(review): no audit entry is written for an empty extract.
                self.logger.warning("No data found! Pipeline will stop!")
                return False

            self.input_rows = len(df)

            # 2. TRANSFORM & VALIDATE
            df = self._transform_step(df)
            self.output_rows = len(df)

            # 3. LOAD
            load_success = self._load_step(df)
            if not load_success:
                self.logger.error(f'>>> Pipeline {self.definition.pipeline_name} failed to load during the LOAD step.')
                self._log_audit("FAILED")
                return False

            self._log_audit("SUCCESS")
            self.logger.info(f">>> Pipeline {self.definition.pipeline_name} finished successfully <<<")
            return True

        except Exception as e:
            self.error_details = str(e)
            self.log_exception(e, f"Critical pipeline error - {self.definition.pipeline_name}")
            self._log_audit("FAILED")
            return False
        finally:
            self._cleanup()

    def _extract_step(self) -> pd.DataFrame:
        """Handles extraction logic based on source type.

        Returns an empty DataFrame when no usable source is configured.
        Exceptions raised while reading are logged and re-raised.
        """
        try:
            if self.src_db_ops:  # DB source
                if self.definition.source_sql:
                    return self.src_db_ops.read_data(self.definition.source_sql, **self.definition.read_parameters)

            if self.src_file_ops:  # File source
                self.src_file_ops._apply_file_preprocessor(self.definition.file_pre_process)
                df = self.src_file_ops.read_file(**self.definition.read_parameters)
                self.source_file_path = self.src_file_ops.file_path  # Ensure we have the path for auditing
                return df

            self.logger.error('No valid source configured for the Pipeline')

        except Exception as e:
            self.log_exception(e, 'Extraction step failed')
            raise

        return pd.DataFrame()

    def _transform_step(self, df: pd.DataFrame) -> pd.DataFrame:
        """Applies transformations and optional Pydantic validation.

        Rows that fail validation are logged as warnings and left out of the
        returned frame.
        """

        # 1. Pipeline hook transformation
        df = self.definition.transform(df)

        # 2. Renaming columns (can also be handled by Pydantic aliases)
        rename_map = self.definition.get_rename_map()
        if rename_map:
            # Rebind instead of inplace=True so the frame handed back by the
            # user hook is never mutated behind its back.
            df = df.rename(columns=rename_map)
            self.logger.info(f"Columns renamed: {list(rename_map.values())}")

        # 3. Optional Pydantic Validation
        if self.definition.validator:
            self.logger.info(f"Starting data validation against model: {self.definition.validator.__name__}")
            validated_rows = []

            # A list of dicts iterates much faster than iterrows, which
            # builds a Series per row.
            records = df.to_dict(orient='records')

            for i, record in enumerate(records):
                try:
                    # Use model_validate for individual row validation
                    model_inst = self.definition.validator.model_validate(record)
                    validated_rows.append(model_inst.model_dump())
                except Exception as e:
                    self.logger.warning(f"Row {i} failed validation: {str(e)}")

            df = pd.DataFrame(validated_rows)

        return df

    def _load_step(self, df: pd.DataFrame) -> bool:
        """Handles loading logic based on destination type.

        Returns:
            True when the data was written; False when no destination is usable
            or writing raised. In both failure cases ``error_details`` is set so
            the audit entry written by ``run`` explains the failure.
        """
        try:
            if self.dst_db_ops and self.definition.destination_table:  # DB destination
                dtype_map = self.definition.get_dtype_map()
                self.dst_db_ops.write_to_table(
                    table_name=self.definition.destination_table,
                    df=df,
                    dtype=dtype_map,
                    **self.definition.write_parameters,
                )
                return True
            if self.dst_file_ops:  # File destination
                self.dst_file_ops.write_file(df, **self.definition.write_parameters)
                return True

            # Neither a database table nor a file destination is available.
            self.error_details = 'No valid destination configured for the Pipeline'
            self.logger.error(self.error_details)
            return False
        except Exception as e:
            # Keep the message: run() only sees False and would audit a blank error.
            self.error_details = str(e)
            self.log_exception(e, 'Error writing to destination')
            return False
|
|
169
|
+
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from .log import LoggedComponent
|
|
8
|
+
from .models import ConnectionSettings, AuditEntry, BasePipelineDefinition, ProcedureDefinition, ServerType
|
|
9
|
+
from .database_connector import DatabaseConnector, CONNECTOR_FACTORY
|
|
10
|
+
from .database_operations import DatabaseOperations
|
|
11
|
+
from .config_loader import Configuration
|
|
12
|
+
|
|
13
|
+
class BasePipeline(LoggedComponent, ABC):
    """
    Abstract base class for all pipeline types.
    Handles:
    - Execution ID generation
    - Audit resource initialization
    - Shared resource cleanup
    """

    def __init__(self, definition: Union[BasePipelineDefinition, ProcedureDefinition]):
        """Store the definition, create an execution id and set up auditing."""
        super().__init__()
        self.definition = definition
        self.execution_id = str(uuid.uuid4())
        self.config = Configuration()

        # Every connector created through _setup_db_operations is tracked here
        # so that _cleanup can dispose it.
        self._active_connectors: List[DatabaseConnector] = []
        self.audit_db_ops: Optional[DatabaseOperations] = None

        # Metadata for auditing
        self.input_rows = 0
        self.output_rows = 0
        self.error_details: Optional[str] = None
        self.start_time = pd.Timestamp.now()

        self._initialize_audit_resource()

    def _initialize_audit_resource(self):
        """Initialize the database operations for auditing.

        Uses the resource named by ``definition.audit`` when set, otherwise a
        SQLite connection pointing at ``audit_logs.db``.

        Raises:
            ValueError: (via ``log_and_raise``) if the audit resource is not a
                database connection.
        """
        audit_resource_name = self.definition.audit

        if audit_resource_name:
            # External audit database
            resource = self.config.get_resource(audit_resource_name)
            if not isinstance(resource, ConnectionSettings):
                self.log_and_raise(ValueError, f"Audit resource '{audit_resource_name}' must be a database connection.")
            self.audit_db_ops = self._setup_db_operations(resource)
        else:
            # SQLite Fallback
            self.logger.info("No audit resource defined. Falling back to SQLite audit_logs.db")
            sqlite_settings = ConnectionSettings(
                conn_server_type=ServerType.SQLITE,
                conn_server="audit_logs.db",
                conn_database="audit_logs"  # Not used by SQLite but required by model
            )
            self.audit_db_ops = self._setup_db_operations(sqlite_settings)

    def _setup_db_operations(self, settings: ConnectionSettings) -> DatabaseOperations:
        """Helper to create the connector, store it for cleanup, and return the Ops.

        Raises:
            ValueError: (via ``log_and_raise``) if the server type has no
                registered connector.
        """
        server_type = settings.conn_server_type

        if server_type not in CONNECTOR_FACTORY:
            self.log_and_raise(ValueError, f"Unsupported server type: {server_type}")

        connector = CONNECTOR_FACTORY[server_type](settings)
        self._active_connectors.append(connector)
        return DatabaseOperations(connector.get_engine())

    def _cleanup(self):
        """Dispose of all active database connectors.

        A connector that fails to dispose is logged and skipped so the
        remaining ones are still released; the tracking list is emptied so a
        repeated call does not dispose the same connector twice.
        """
        for connector in self._active_connectors:
            try:
                connector._dispose_engine()
            except Exception as e:
                self.logger.warning(f"Failed to dispose database connector: {str(e)}")
        self._active_connectors.clear()
        self.logger.debug("Pipeline cleanup completed.")

    def _log_audit(self, status: str):
        """Write the execution audit entry to the audit database.

        Auditing is best effort: a failure to build or write the entry is
        logged and never propagated to the pipeline.
        """
        if not self.audit_db_ops:
            return

        # File metadata is collected first and passed at construction time,
        # rather than being assigned onto the entry afterwards.
        file_meta = {}
        source_file = getattr(self, 'source_file_path', None)
        if source_file:
            try:
                stat = source_file.stat()
                file_meta = {
                    "file_name": source_file.name,
                    "file_path": str(source_file),
                    "file_size_bytes": stat.st_size,
                    "file_last_modified": pd.Timestamp(stat.st_mtime, unit='s'),
                }
            except OSError as e:
                self.logger.warning(f"Could not read file metadata for audit: {e}")

        try:
            # Built inside the try block: a validation error on the entry must
            # not crash a pipeline whose real work has already finished.
            entry = AuditEntry(
                execution_id=self.execution_id,
                pipeline_name=self.definition.pipeline_name,
                status=status,
                input_rows=self.input_rows,
                output_rows=self.output_rows,
                source_name=getattr(self, 'source_name', None),
                destination_name=getattr(self, 'destination_name', None),
                sp_name=getattr(self, 'sp_name', None),
                sp_parameters=getattr(self, 'sp_parameters', None),
                timestamp=pd.Timestamp.now(),
                error_details=self.error_details,
                **file_meta,
            )
            self.audit_db_ops.write_audit("execution_audit", entry)
        except Exception as e:
            self.logger.error(f"Failed to write audit log: {str(e)}")

    @abstractmethod
    def run(self) -> bool:
        """Main execution logic to be implemented by child classes."""
        pass
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from typing import Dict, Any
|
|
2
|
+
from .pipeline_base import BasePipeline
|
|
3
|
+
from .models import ProcedureDefinition, ConnectionSettings
|
|
4
|
+
from .config_loader import Configuration
|
|
5
|
+
|
|
6
|
+
class ProcedurePipeline(BasePipeline):
    """
    Pipeline specialized in executing database stored procedures.
    """

    def __init__(self, pipeline_name: str):
        """Load the named definition and connect to its database resource.

        Raises:
            ValueError: if the named definition is not a ``ProcedureDefinition``,
                or (via ``log_and_raise``) if its resource is not a database
                connection.
        """
        # Load definition from config
        definition = Configuration().get_pipeline(pipeline_name)
        if not isinstance(definition, ProcedureDefinition):
            raise ValueError(f"Pipeline '{pipeline_name}' is not a ProcedureDefinition.")

        super().__init__(definition)
        self.definition: ProcedureDefinition = definition

        try:
            # Set up primary database connection
            resource = self.config.get_resource(self.definition.resource)
            if not isinstance(resource, ConnectionSettings):
                self.log_and_raise(ValueError, f"Resource '{self.definition.resource}' must be a database connection.")

            self.db_ops = self._setup_db_operations(resource)
        except Exception:
            # The audit connector was already created by the base class;
            # dispose it instead of leaking when setup fails.
            self._cleanup()
            raise

    def run(self) -> bool:
        """Execute all procedures in the definition.

        Returns:
            True when every procedure ran; False as soon as one raises.
            An audit entry is written and connectors are disposed either way.
        """
        self.logger.info(f">>> Starting Procedure Pipeline: {self.definition.pipeline_name} <<<")

        # Audit details are the same for success and failure, so set them once.
        # Procedures listed more than once share one key in sp_parameters.
        self.sp_name = ", ".join(name for name, _ in self.definition.procedures)
        self.sp_parameters = {name: params for name, params in self.definition.procedures}

        try:
            for proc_name, params in self.definition.procedures:
                params = params or {}
                self.logger.info(f"Running procedure: {proc_name} with params: {params}")
                self.db_ops.execute_stored_procedure(proc_name, **params)

            self._log_audit("SUCCESS")
            self.logger.info(f">>> Procedure Pipeline {self.definition.pipeline_name} finished successfully <<<")
            return True

        except Exception as e:
            self.error_details = str(e)
            self.log_exception(e, f"Error in Procedure Pipeline: {self.definition.pipeline_name}")
            self._log_audit("FAILED")
            return False
        finally:
            self._cleanup()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easy_data_loader
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data transfer utilities between files and databases
|
|
5
|
+
Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: click>=8.3.0
|
|
10
|
+
Requires-Dist: ipykernel
|
|
11
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
12
|
+
Requires-Dist: pandas>=2.3.3
|
|
13
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
14
|
+
Requires-Dist: pydantic>=2.12.5
|
|
15
|
+
Requires-Dist: pydantic-settings>=2.12.0
|
|
16
|
+
Requires-Dist: pyodbc>=5.2.0
|
|
17
|
+
Requires-Dist: python-dotenv>=1.1.1
|
|
18
|
+
Requires-Dist: sqlalchemy>=2.0.43
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# Easy Data Loader 🚀
|
|
22
|
+
|
|
23
|
+
**Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various file formats (CSV, Excel, ORC, Parquet) and SQL databases (MSSQL, PostgreSQL, SQLite, and others).
|
|
24
|
+
|
|
25
|
+
## ✨ Key Features
|
|
26
|
+
- **Declarative Configuration**: Manage connections and pipelines through simple Python files and `.env` resources.
|
|
27
|
+
- **Integrated CLI**: Initialize a standardized project structure with a single command.
|
|
28
|
+
- **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
|
|
29
|
+
- **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
|
|
30
|
+
- **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 📦 Installation
|
|
35
|
+
|
|
36
|
+
Install directly via `pip` or `uv`:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install easy_data_loader
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## 🚀 Getting Started
|
|
43
|
+
|
|
44
|
+
1. Initialize a new project structure to generate template configurations:
|
|
45
|
+
```bash
|
|
46
|
+
easy-loader init
|
|
47
|
+
```
|
|
48
|
+
2. Review the generated `config/` folders for sample resources and pipelines.
|
|
49
|
+
3. Run all discovered pipelines across the active configurations:
|
|
50
|
+
```bash
|
|
51
|
+
easy-loader run_all
|
|
52
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
easy_data_loader/__init__.py,sha256=kUkTXzuQYPFTU3pBvGODVgYrF6V6XGiXrSHpEQ81uZY,439
|
|
2
|
+
easy_data_loader/cli.py,sha256=W-JIEMB3abQpD5N8vvOd8LB3_Jd-N_tyooRGku_qNmE,9893
|
|
3
|
+
easy_data_loader/config_loader.py,sha256=KsXNN5UHr6GA8oxu_sJ0K3mHCI5UOuulQat3Jt-o0vk,7254
|
|
4
|
+
easy_data_loader/custom_exceptions.py,sha256=nF-0ENJOPTtyrv_UO_kHSvVM3tfFXQmLHqXvADTzW8E,735
|
|
5
|
+
easy_data_loader/database_connector.py,sha256=K879FpeIrcdE_NCcA44rW6CDMgB3tPbp_t6V_aFdKlI,6784
|
|
6
|
+
easy_data_loader/database_operations.py,sha256=oAm4LRC84KLqeFlzHz0OdsS32y-TVad2x_UHMGCMI0o,5280
|
|
7
|
+
easy_data_loader/driver_detector.py,sha256=L8H7IPEvDD7eb183wQH2eFwcmOuW_oxy7tNxGH-Kmlo,1280
|
|
8
|
+
easy_data_loader/file_operations.py,sha256=Eu4H6q6CLAQe-zE12ZBJ1m_A9zp9m1UDKYGChwGBD-s,5820
|
|
9
|
+
easy_data_loader/log.py,sha256=yIfvOhqnHzq8TogdvK6Si1LVsbI9QXsw0EDRIdEluS4,2825
|
|
10
|
+
easy_data_loader/models.py,sha256=yY1z6WcWqKyBefO4br9f88WeiNdKQEbBUqWDNr2sUok,5551
|
|
11
|
+
easy_data_loader/orchestrator.py,sha256=myDCM1Jmd1Soj_AvIuCZgi_uPAZvSpyJMKROb2u2tVc,2861
|
|
12
|
+
easy_data_loader/pipeline.py,sha256=UxjpZ9Q4F1uzlUIoO9r5a4y6B6lexhVKWwHorbRSenQ,6020
|
|
13
|
+
easy_data_loader/pipeline_base.py,sha256=qnb8dFEgQThFPT6rF9ywFzhdtFs1tB9wy7H6VDPBoRA,4201
|
|
14
|
+
easy_data_loader/procedure_pipeline.py,sha256=vYtYaIshnr34KSF_PeraUM1bWHFUYVcT9L2s2gLTMIQ,2075
|
|
15
|
+
easy_data_loader-0.1.0.dist-info/licenses/LICENSE,sha256=zMCK-7LtJpSF-qQw0A1-t9a6N3F49e36vI3bh1Klvlk,1070
|
|
16
|
+
easy_data_loader-0.1.0.dist-info/METADATA,sha256=sv99uwRor2gXojTwrUvH1dWNuQR_gLuO12Bq_HOcl54,1856
|
|
17
|
+
easy_data_loader-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
18
|
+
easy_data_loader-0.1.0.dist-info/entry_points.txt,sha256=o0rWYa-lvUmVnX_rBRTcr1UQRG8aJWDdA_BdDhX-pug,58
|
|
19
|
+
easy_data_loader-0.1.0.dist-info/top_level.txt,sha256=YsMeo8e9snRg9aEw7rSdLL0zFuaGCJOEzAh7TiJkloc,17
|
|
20
|
+
easy_data_loader-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Bojoi Gabriel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
easy_data_loader
|