easy-data-loader 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easy_data_loader/__init__.py +11 -0
- easy_data_loader/cli.py +302 -0
- easy_data_loader/config_loader.py +184 -0
- easy_data_loader/custom_exceptions.py +21 -0
- easy_data_loader/database_connector.py +190 -0
- easy_data_loader/database_operations.py +129 -0
- easy_data_loader/driver_detector.py +46 -0
- easy_data_loader/file_operations.py +146 -0
- easy_data_loader/log.py +90 -0
- easy_data_loader/models.py +168 -0
- easy_data_loader/orchestrator.py +59 -0
- easy_data_loader/pipeline.py +169 -0
- easy_data_loader/pipeline_base.py +121 -0
- easy_data_loader/procedure_pipeline.py +56 -0
- easy_data_loader-0.1.0.dist-info/METADATA +52 -0
- easy_data_loader-0.1.0.dist-info/RECORD +20 -0
- easy_data_loader-0.1.0.dist-info/WHEEL +5 -0
- easy_data_loader-0.1.0.dist-info/entry_points.txt +2 -0
- easy_data_loader-0.1.0.dist-info/licenses/LICENSE +21 -0
- easy_data_loader-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import urllib.parse
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
|
|
4
|
+
import pyodbc
|
|
5
|
+
from sqlalchemy import create_engine, text
|
|
6
|
+
from sqlalchemy.engine import Engine
|
|
7
|
+
|
|
8
|
+
from .custom_exceptions import EngineTestException
|
|
9
|
+
from .driver_detector import SqlServerDriverDetector
|
|
10
|
+
from .log import LoggedComponent
|
|
11
|
+
from .models import ConnectionSettings, ServerType
|
|
12
|
+
|
|
13
|
+
pyodbc.pooling = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DatabaseConnector(ABC):
    """Contract that every database-specific connector has to fulfil.

    A concrete connector builds a connection string, creates an engine from
    it, tests that engine and hands it out to callers.
    """

    @abstractmethod
    def get_engine(self) -> Engine:
        """Return the database engine held by this connector."""

    @abstractmethod
    def _build_connection_string(self) -> str:
        """Return the database-specific connection string used to create the engine."""

    @abstractmethod
    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Create the engine used to interact with the database."""

    @abstractmethod
    def _test_engine(self) -> bool:
        """Test that the connection behind the engine works."""

    @abstractmethod
    def _dispose_engine(self) -> None:
        """Dispose of the engine."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SqlServerDatabaseConnector(LoggedComponent, DatabaseConnector):
    """Database connector to a SQL Server database.

    On construction the connector creates a SQLAlchemy engine from the given
    settings and immediately runs ``SELECT 1`` against it, so an unusable
    configuration is reported right away.
    """

    def __init__(self, config: ConnectionSettings):
        super().__init__()
        self.driver_detector = SqlServerDriverDetector()
        self.config = config
        self.engine = self._create_engine(self.config)
        self._test_engine()

    @staticmethod
    def _quote_odbc_value(value) -> str:
        """Brace-quote an ODBC attribute value when it would otherwise be mis-parsed.

        A value that contains ``;`` or starts with ``{`` is wrapped in braces,
        with every ``}`` inside it doubled. Other values are returned unchanged.
        """
        token = str(value)
        if ";" in token or token.startswith("{"):
            token = "{%s}" % token.replace("}", "}}")
        return token

    def _build_connection_string(self) -> str:
        """Build the SQLAlchemy URL (``mssql+pyodbc`` with ``odbc_connect``) from the configuration.

        Returns:
            The URL whose ``odbc_connect`` parameter holds the URL-encoded ODBC
            connection string.

        Raises:
            ValueError: if no driver or database is available, or if only one
                of username / password is set.
        """
        driver = self.driver_detector.select_preferred_driver()
        cfg = self.config

        if driver is None or cfg.conn_database is None:
            self.log_and_raise(ValueError, "Connection configuration is not valid")

        self.logger.debug(f"Sql driver found: {driver}")

        # Only append the port when one is configured; otherwise the string
        # would contain the literal text "None" after the server name.
        server = cfg.conn_server
        if cfg.conn_port is not None:
            server = f"{server},{cfg.conn_port}"

        parts = [
            f"DRIVER={driver}",
            f"SERVER={server}",
            f"DATABASE={cfg.conn_database}",
        ]

        has_username = cfg.conn_username is not None
        has_password = cfg.conn_password is not None
        if has_username and has_password:
            # Credentials may contain ';' or a leading '{', which would break
            # the attribute list unless the value is brace-quoted.
            parts.append(f"UID={self._quote_odbc_value(cfg.conn_username)}")
            parts.append(f"PWD={self._quote_odbc_value(cfg.conn_password)}")
        elif not has_username and not has_password:
            # No credentials at all: fall back to a trusted connection.
            parts.append("Trusted_Connection=Yes")
        else:
            self.log_and_raise(
                ValueError,
                "Credentials definition is not valid. Please check username or password!",
            )

        if driver == "ODBC Driver 18 for SQL Server":
            # NOTE(review): this turns off server certificate validation for
            # Driver 18 - presumably to keep self-signed servers working;
            # confirm this is acceptable for production targets.
            parts.append("TrustServerCertificate=Yes")

        parts.append("APP=SqlDataLoader")
        params = urllib.parse.quote_plus(";".join(parts))

        return f"mssql+pyodbc:///?odbc_connect={params}"

    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Create a SQLAlchemy engine using the connector's configuration.

        ``settings`` is accepted to satisfy the ``DatabaseConnector`` interface;
        the connection string itself is built from ``self.config``.

        Raises:
            ValueError: if no connection string could be built.
            Exception: whatever ``create_engine`` raises is logged and re-raised.
        """
        connection_string = self._build_connection_string()
        if not connection_string:
            self.log_and_raise(
                ValueError, "Connection string could not be created from configuration"
            )

        try:
            return create_engine(
                connection_string,
                fast_executemany=True,
                pool_size=5,
                max_overflow=10,
                pool_timeout=30,
                pool_recycle=3600,
                echo=self.is_debug_enabled,
            )
        except Exception as e:
            self.log_exception(e, "Could not create engine from connection string")
            raise

    def get_engine(self) -> Engine:
        """Return the engine to be used"""
        return self.engine

    def _test_engine(self) -> bool:
        """Test if the connection works by executing ``SELECT 1``.

        Returns:
            True when the statement ran successfully.

        Raises:
            EngineTestException: if connecting or executing fails.
        """
        try:
            with self.engine.connect() as conn:
                conn.execute(text("SELECT 1"))
                self.logger.debug(f"Connection test successful: {self.config.conn_server} - {self.config.conn_database}")
                return True
        except Exception as e:
            # log_and_raise always raises, so nothing after it can run.
            self.log_and_raise(
                EngineTestException,
                "Connection test failed",
                exception=str(e),
            )

    def _dispose_engine(self) -> None:
        """Dispose of the engine."""
        self.logger.debug('Disposing of engine')
        return self.engine.dispose()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class SQLiteDatabaseConnector(LoggedComponent, DatabaseConnector):
    """Connector for a SQLite database.

    The ``conn_server`` setting is used as the path of the database file.
    The engine is created and tested with ``SELECT 1`` during construction.
    """

    def __init__(self, config: ConnectionSettings):
        super().__init__()
        self.config = config
        self.engine = self._create_engine(self.config)
        self._test_engine()

    def _build_connection_string(self) -> str:
        """Return the ``sqlite:///`` URL for the configured file path.

        Raises:
            ValueError: if ``conn_server`` is empty.
        """
        if not self.config.conn_server:
            self.log_and_raise(ValueError, "Connection configuration is not valid. SQLite requires a file path in conn_server.")

        # SQLite has no server: conn_server carries the file path instead.
        return f"sqlite:///{self.config.conn_server}"

    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Create the SQLAlchemy engine for the SQLite file.

        Any error raised by ``create_engine`` is logged and re-raised.
        """
        url = self._build_connection_string()
        try:
            return create_engine(url, echo=self.is_debug_enabled)
        except Exception as exc:
            self.log_exception(exc, "Could not create engine from connection string")
            raise

    def get_engine(self) -> Engine:
        """Hand out the engine created at construction time."""
        return self.engine

    def _test_engine(self) -> bool:
        """Run ``SELECT 1`` to verify the connection.

        Returns True on success; raises EngineTestException on any failure.
        """
        try:
            with self.engine.connect() as connection:
                connection.execute(text("SELECT 1"))
                self.logger.info(f"Connection test successful: {self.config.conn_server} - {self.config.conn_database}")
        except Exception as exc:
            # log_and_raise never returns normally.
            self.log_and_raise(
                EngineTestException,
                "Connection test failed",
                exception=str(exc),
            )
        return True

    def _dispose_engine(self) -> None:
        """Dispose of the engine."""
        self.logger.debug('Disposing of engine')
        return self.engine.dispose()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Maps each supported server type to the connector class that handles it.
CONNECTOR_FACTORY = {
    ServerType.MSSQL: SqlServerDatabaseConnector,
    ServerType.SQLITE: SQLiteDatabaseConnector,
    # TODO: ServerType.POSTGRESQL: PostgresDatabaseConnector
    # (no PostgresDatabaseConnector is defined in this file)
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from sqlalchemy import inspect, text, Table, Column, MetaData
|
|
2
|
+
from sqlalchemy.engine import Engine
|
|
3
|
+
from sqlalchemy.types import TypeEngine
|
|
4
|
+
from pandas import DataFrame
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
|
+
from .log import LoggedComponent
|
|
8
|
+
|
|
9
|
+
class DatabaseOperations(LoggedComponent):
    """Component responsible for all SQL interactions using SQLAlchemy engines"""

    def __init__(self, engine: Engine):
        super().__init__()
        self.engine = engine
        # Created once here and reused by inspect_table().
        self._inspector = inspect(self.engine)

    def write_to_table(self, table_name: str, df: DataFrame, **kwargs) -> bool:
        """Write a dataframe to a specified table in the database.

        Extra keyword arguments are forwarded to ``DataFrame.to_sql``.

        Returns:
            True when the write succeeded.

        Raises:
            Exception: any error from ``to_sql`` is logged and re-raised.
        """
        self.logger.info(f"Writing {len(df)} rows to table: {table_name}")
        try:
            df.to_sql(table_name, con=self.engine, **kwargs)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to write to table {table_name}")
            raise

    def read_data(self, sql: str, **kwargs) -> DataFrame:
        """Run ``sql`` against the engine and return the result as a dataframe.

        Extra keyword arguments are forwarded to ``pandas.read_sql``.

        Raises:
            Exception: any error from ``read_sql`` is logged and re-raised.
        """
        self.logger.debug("Reading data.")
        try:
            return pd.read_sql(sql, con=self.engine, **kwargs)
        except Exception as e:
            self.log_exception(e, "Failed to read data")
            raise

    def inspect_table(self, table_name: str) -> dict:
        """Read the metadata (columns and types) of a specified table.

        Returns:
            ``{column_name: type_as_string}``. An empty dict is returned when
            the table has no columns or when inspection fails; failures are
            logged, not raised.
        """
        self.logger.debug(f"Inspecting metadata for table: {table_name}")
        try:
            columns = self._inspector.get_columns(table_name)
            if not columns:
                self.logger.warning(f"Table {table_name} not found or has no columns.")
                return {}

            return {col['name']: str(col['type']) for col in columns}
        except Exception as e:
            self.log_exception(e, f"Failed to inspect table {table_name}")
            return {}

    def create_table(self, table_name: str, schema: Dict[str, TypeEngine]) -> bool:
        """
        Create a table dynamically.
        schema: {'col_name': sqlalchemy_type}

        Returns:
            True on success, False when creation failed (the error is logged,
            not raised).
        """
        self.logger.info(f"Creating table: {table_name}")
        try:
            metadata = MetaData()
            columns = [Column(name, col_type) for name, col_type in schema.items()]
            # Constructing the Table registers it on `metadata`; the object
            # itself is not needed afterwards.
            Table(table_name, metadata, *columns)

            metadata.create_all(self.engine)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to create table {table_name}")
            return False

    def execute_stored_procedure(self, procedure_name: str, **kwargs) -> bool:
        """Execute a stored procedure using the engine connection.

        Each keyword argument becomes a bind parameter. The placeholders are
        listed after the procedure name in keyword order, without
        ``@name =`` prefixes.

        Returns:
            True when the call succeeded.

        Raises:
            Exception: any execution error is logged and re-raised.
        """
        self.logger.info(f"Executing stored procedure: {procedure_name}")
        try:
            params_str = ", ".join(f":{k}" for k in kwargs)
            # NOTE(review): procedure_name is interpolated into the statement
            # text; it must come from trusted configuration, never user input.
            sql = text(f"EXEC {procedure_name} {params_str}")

            # engine.begin() wraps the call in a transaction that is committed
            # when the block exits without an error.
            with self.engine.begin() as conn:
                conn.execute(sql, kwargs)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to execute procedure {procedure_name}")
            raise

    def write_audit(self, table_name: str, entry: 'AuditEntry') -> None:
        """Write an audit entry to the database, ensuring the table exists first.

        The audit table is declared and ``create_all`` is called on every
        invocation before the entry is appended.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        self.logger.debug(f"Writing audit entry for execution: {entry.execution_id}")

        try:
            # Local imports: these names are only needed for the audit table.
            import json
            from sqlalchemy import BigInteger, DateTime, Integer, String

            metadata = MetaData()
            Table(
                table_name,
                metadata,
                Column("execution_id", String(50), primary_key=True),
                Column("pipeline_name", String(100)),
                Column("status", String(50)),
                Column("input_rows", Integer),
                Column("output_rows", Integer),
                Column("source_name", String(255)),
                Column("destination_name", String(255)),
                Column("file_name", String(255)),
                Column("file_path", String(500)),
                Column("file_size_bytes", BigInteger),
                Column("file_last_modified", DateTime),
                Column("sp_name", String(255)),
                Column("sp_parameters", String),
                Column("timestamp", DateTime),
                Column("error_details", String),
            )
            metadata.create_all(self.engine)

            data = entry.model_dump()
            # Serialize dictionary fields for storage
            sp_parameters = data.get("sp_parameters")
            if sp_parameters:
                try:
                    data["sp_parameters"] = json.dumps(sp_parameters, default=str)
                except Exception:
                    # Best effort: fall back to the plain string form, but do
                    # not swallow KeyboardInterrupt / SystemExit as a bare
                    # ``except:`` would.
                    data["sp_parameters"] = str(sp_parameters)

            df = pd.DataFrame([data])
            df.to_sql(table_name, con=self.engine, if_exists="append", index=False)

        except Exception as e:
            self.log_exception(e, f"Failed to write audit entry to {table_name}")
            raise
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
import pyodbc
|
|
4
|
+
|
|
5
|
+
from .custom_exceptions import DriverNotFoundException
|
|
6
|
+
from .log import LoggedComponent
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DriverDetector(ABC):
    """Interface for components that discover and choose database drivers."""

    @abstractmethod
    def select_preferred_driver(self) -> str:
        """Return the best driver among the available ones."""

    @abstractmethod
    def get_available_drivers(self) -> list[str]:
        """Return the list of available drivers."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SqlServerDriverDetector(LoggedComponent, DriverDetector):
    """Chooses the SQL Server ODBC driver to use from the drivers pyodbc reports."""

    def __init__(self):
        super().__init__()
        # Ordered by preference: the first entry that is available wins.
        self._preferred_drivers = [
            "ODBC Driver 18 for SQL Server",
            "ODBC Driver 17 for SQL Server",
        ]

    def get_available_drivers(self) -> list[str]:
        """Get all available ODBC drivers."""
        return pyodbc.drivers()

    def select_preferred_driver(self) -> str:
        """Select the first detected driver in order of preference.

        Returns:
            The name of the first preferred driver that is available.

        Raises:
            DriverNotFoundException: if none of the preferred drivers is
                available. The available and preferred lists are logged.
        """
        available_drivers = self.get_available_drivers()

        for driver in self._preferred_drivers:
            if driver in available_drivers:
                return driver

        # Pass an explicit message so neither the log line nor the raised
        # exception ends up with an empty description.
        self.log_and_raise(
            DriverNotFoundException,
            "No supported SQL Server ODBC driver found",
            available_drivers=available_drivers,
            preferred_drivers=self._preferred_drivers,
        )
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Callable, Dict, Optional
|
|
4
|
+
from .log import LoggedComponent
|
|
5
|
+
from .models import FileType, FileSettings
|
|
6
|
+
|
|
7
|
+
class FileOperations(LoggedComponent):
    """Handles all file operations"""

    def __init__(self, settings: FileSettings):
        super().__init__()
        self.settings = settings
        self.file_type = self.settings.file_type
        # Filled in by read_file() or _apply_file_preprocessor() once a file is located.
        self.file_path: Optional[Path] = None

        # Reader used for each file type.
        self._readers: Dict[FileType, Callable] = {
            FileType.CSV: pd.read_csv,
            FileType.EXCEL: pd.read_excel,
            FileType.PARQUET: pd.read_parquet,
            FileType.ORC: pd.read_orc,
        }

        # Writer used for each file type.
        self._writers: Dict[FileType, Callable] = {
            FileType.CSV: self._write_csv,
            FileType.EXCEL: self._write_excel,
            FileType.PARQUET: self._write_parquet,
            FileType.ORC: self._write_orc,
        }

    def _find_file(self) -> Path:
        """
        Identifies the file based on pattern (latest) or explicit name.

        With a pattern, the most recently modified matching file in the
        folder is returned. With a name, the suffix is replaced by the
        lower-cased file type value and the file must exist.

        Raises:
            ValueError: if nothing matches, the named file does not exist, or
                neither a pattern nor a name is configured.
        """
        if self.settings.file_pattern:
            # glob() can also match directories; only regular files qualify
            # as the "latest file".
            files = [
                candidate
                for candidate in self.settings.folder_path.glob(self.settings.file_pattern)
                if candidate.is_file()
            ]
            if not files:
                self.log_and_raise(ValueError, f'No files found by pattern {self.settings.file_pattern} inside folder {self.settings.folder_path}')
            return max(files, key=lambda f: f.stat().st_mtime)

        if self.settings.file_name:
            file = (self.settings.folder_path / self.settings.file_name).with_suffix(f'.{self.settings.file_type.value.lower()}')
            if not file.exists():
                self.log_and_raise(ValueError, 'Error finding file, please check the file definition.')
            return file

        self.log_and_raise(ValueError, 'Error finding file by name or pattern')

    def read_file(self, **kwargs) -> pd.DataFrame:
        """
        Entrypoint for reading files to a pandas DataFrame.
        Depending on the file type it will delegate to the corresponding reader.

        The file is located first unless ``self.file_path`` is already set.
        Keyword arguments are forwarded to the reader. Read errors are logged
        and re-raised.
        """
        if not self.file_path:
            self.file_path = self._find_file()

        try:
            self.logger.info(f'Reading file: {self.file_path.name}')
            return self._readers[self.file_type](self.file_path, **kwargs)
        except Exception as e:
            self.log_exception(e, f'Failed to read {self.file_path.name} into a dataframe')
            raise

    def write_file(self, df: pd.DataFrame, **kwargs) -> Path:
        """
        Ensures a valid path exists and writes the dataframe.
        If file_name is provided, it uses it (and overwrites).
        If file_pattern is provided, it constructs a name using the pattern and a timestamp.

        Keyword arguments are forwarded to the writer.

        Returns:
            The path that was written.

        Raises:
            ValueError: if neither file_name nor file_pattern is configured.
            Exception: write errors are logged and re-raised.
        """
        output_path = self._construct_output_path()
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.touch(exist_ok=True)

            self.logger.info(f"Writing to file: {output_path.name}")
            self._writers[self.file_type](df, output_path, **kwargs)

            return output_path
        except Exception as e:
            self.log_exception(e, f'Failed to write to: {output_path}')
            raise

    def _construct_output_path(self) -> Path:
        """
        Construct the output file path based on the settings.
        If the file name is defined the output path will be fixed,
        if the file pattern is defined the file name will have a timestamp appended.

        Raises:
            ValueError: if neither a file name nor a file pattern is configured.
        """
        file_extension = f'.{self.settings.file_type.value.lower()}'

        if self.settings.file_name:
            return (self.settings.folder_path / self.settings.file_name).with_suffix(file_extension)

        if not self.settings.file_pattern:
            # Without this guard the file would be named "None_<timestamp>".
            self.log_and_raise(ValueError, 'Error constructing output path: file name or pattern is required')

        # NOTE(review): the pattern is used verbatim in the file name - confirm
        # output patterns never contain glob characters such as '*'.
        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        return self.settings.folder_path / f"{self.settings.file_pattern}_{timestamp}{file_extension}"

    def _apply_file_preprocessor(self, preprocessor_func: Optional[Callable[[Path], Path]]) -> None:
        """
        Executes a custom function passed from the BasePipelineDefinition
        and records the resulting file path in ``self.file_path``.

        The located file is kept as ``self.file_path`` when no function is
        given, when the function does not return a ``Path``, or when it
        raises (the failure is logged, not re-raised).
        """
        current_path = self._find_file()
        self.logger.info(f'Current file path is: {current_path}')

        if preprocessor_func is None:
            self.file_path = current_path
            return

        try:
            self.logger.info(f"Running pre-processor on: {current_path}")
            # run pre-processor function and record the path - old or new
            processed_path = preprocessor_func(current_path)

            if isinstance(processed_path, Path):
                self.file_path = processed_path
                self.logger.debug(f'File path updated to: {self.file_path}')
            else:
                self.logger.warning('Pre-processor function did not return a new Path')
                self.file_path = current_path

        except Exception as e:
            self.log_exception(e, f'Pre-processor failed for : {current_path}')
            # In case of failure, keep the original path so the pipeline can try to proceed
            self.file_path = current_path

    def _write_csv(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal CSV writer"""
        df.to_csv(file_path, **kwargs)

    def _write_excel(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal Excel writer"""
        df.to_excel(file_path, **kwargs)

    def _write_parquet(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal Parquet writer"""
        df.to_parquet(file_path, **kwargs)

    def _write_orc(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal Orc writer"""
        df.to_orc(file_path, **kwargs)
|
|
145
|
+
|
|
146
|
+
|
easy_data_loader/log.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import logging.handlers
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AppLogger:
    """Singleton that configures the root logger once per process.

    The first instantiation replaces the root logger's handlers with a
    console handler and a rotating file handler; later instantiations return
    the same object and leave the configuration untouched.
    """

    # Log file location and rotation limits used by _setup_logging().
    LOG_DIR = "logs"
    LOG_FILE = "application.log"
    MAX_BYTES = 10 * 1024 * 1024
    BACKUP_COUNT = 5

    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every AppLogger() call; configure logging only once.
        if not self._initialized:
            self._setup_logging()
            AppLogger._initialized = True

    def _setup_logging(self):
        """Install the console and rotating-file handlers on the root logger."""
        log_dir = Path(self.LOG_DIR)
        log_dir.mkdir(parents=True, exist_ok=True)

        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        root_logger.handlers.clear()

        # Console
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

        # File - explicit UTF-8 so non-ASCII messages do not depend on the
        # platform's default encoding.
        file_handler = logging.handlers.RotatingFileHandler(
            log_dir / self.LOG_FILE,
            maxBytes=self.MAX_BYTES,
            backupCount=self.BACKUP_COUNT,
            encoding="utf-8",
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)

    def get_logger(self, name: str) -> logging.Logger:
        """Return the standard-library logger registered under ``name``."""
        return logging.getLogger(name)

    def set_level(self, level: str):
        """Change log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)

        The name is upper-cased and looked up on the ``logging`` module, so an
        unknown name raises ``AttributeError``. The level is applied to the
        root logger and to each of its handlers.
        """
        log_level = getattr(logging, level.upper())
        root_logger = logging.getLogger()
        root_logger.setLevel(log_level)

        # Update all handlers
        for handler in root_logger.handlers:
            handler.setLevel(log_level)

    @property
    def is_debug(self) -> bool:
        """True when the root logger is enabled for DEBUG."""
        return logging.getLogger().isEnabledFor(logging.DEBUG)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class LoggedComponent:
    """Base class providing logging functionality to all components"""

    def __init__(self):
        self.log = AppLogger()
        # One logger per concrete class, named after that class.
        self.logger = self.log.get_logger(self.__class__.__name__)

    @staticmethod
    def _format_context(context: dict) -> str:
        """Render context items as ``key=value`` pairs joined by ``", "``."""
        return ", ".join(f"{k}={v}" for k, v in context.items())

    def log_and_raise(self, exception_class, message: str = "", **context):
        """Log error message and raise exception with optional context.

        Args:
            exception_class: exception type to raise; it is called with
                ``message`` as its only argument.
            message: text that is logged and passed to the exception.
            **context: extra key/value pairs appended to the log line only.

        Raises:
            exception_class: always.
        """
        if context:
            self.logger.error(f"{message} | Context: {self._format_context(context)}")
        else:
            self.logger.error(message)

        raise exception_class(message)

    def log_exception(self, exc: Exception, message: str = "", **context):
        """Log a caught exception with optional context.

        Args:
            exc: the exception to log; its traceback is attached to the record.
            message: log text; defaults to ``"Exception occurred: <exc>"``.
            **context: extra key/value pairs appended to the log line.
        """
        log_msg = message or f"Exception occurred: {str(exc)}"
        if context:
            log_msg += f" | Context: {self._format_context(context)}"

        # Pass the exception itself so its traceback is logged even when this
        # is called outside the ``except`` block that caught it.
        exc_info = exc if isinstance(exc, BaseException) else True
        self.logger.error(log_msg, exc_info=exc_info)

    @property
    def is_debug_enabled(self) -> bool:
        """Whether debug logging is enabled, as reported by the application logger."""
        return self.log.is_debug
|