easy-data-loader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ import urllib.parse
2
+ from abc import ABC, abstractmethod
3
+
4
+ import pyodbc
5
+ from sqlalchemy import create_engine, text
6
+ from sqlalchemy.engine import Engine
7
+
8
+ from .custom_exceptions import EngineTestException
9
+ from .driver_detector import SqlServerDriverDetector
10
+ from .log import LoggedComponent
11
+ from .models import ConnectionSettings, ServerType
12
+
13
+ pyodbc.pooling = False
14
+
15
+
16
class DatabaseConnector(ABC):
    """Interface that every database connector implementation must provide."""

    @abstractmethod
    def _build_connection_string(self) -> str:
        """Return the database-specific connection string used to create the engine."""
        ...

    @abstractmethod
    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Build the engine used to interact with the database."""
        ...

    @abstractmethod
    def get_engine(self) -> Engine:
        """Return the database engine held by this connector."""
        ...

    @abstractmethod
    def _test_engine(self) -> bool:
        """Check that the engine can actually reach the database."""
        ...

    @abstractmethod
    def _dispose_engine(self) -> None:
        """Release the resources held by the engine."""
        ...
42
+
43
+
44
+
45
class SqlServerDatabaseConnector(LoggedComponent, DatabaseConnector):
    """Database connector to a SQL Server database.

    Constructing the connector selects an ODBC driver, creates the SQLAlchemy
    engine and immediately verifies it with a ``SELECT 1`` round trip.
    """

    def __init__(self, config: ConnectionSettings):
        """Create and test the engine described by *config*.

        Args:
            config: Connection settings (server, port, database, credentials).

        Raises:
            ValueError: If the settings cannot be turned into a connection string.
            EngineTestException: If the connection test fails.
        """
        super().__init__()
        self.driver_detector = SqlServerDriverDetector()
        self.config = config
        self.engine = self._create_engine(self.config)
        self._test_engine()

    @staticmethod
    def _escape_odbc_value(value) -> str:
        """Brace-quote an ODBC attribute value when it contains special characters.

        Values holding ``;``, ``{`` or ``}`` (or surrounding whitespace) would
        otherwise be split or misparsed by the driver. Inside braces a closing
        brace is escaped by doubling it. Plain values are returned unchanged.
        """
        raw = str(value)
        if raw == raw.strip() and not any(ch in raw for ch in ";{}"):
            return raw
        return "{" + raw.replace("}", "}}") + "}"

    def _build_connection_string(self) -> str:
        """Build the SQLAlchemy URL wrapping an ODBC connection string.

        Returns:
            A ``mssql+pyodbc`` URL whose ``odbc_connect`` parameter carries the
            URL-quoted ODBC connection string.

        Raises:
            ValueError: If driver, server or database is missing, or if only
                one of username/password is supplied.
        """
        driver = self.driver_detector.select_preferred_driver()
        cfg = self.config

        if driver is None or cfg.conn_server is None or cfg.conn_database is None:
            self.log_and_raise(ValueError, "Connection configuration is not valid")
        self.logger.debug(f"Sql driver found: {driver}")

        # Only append the port when one is configured; "host,None" is never valid.
        server = str(cfg.conn_server)
        if cfg.conn_port is not None:
            server = f"{server},{cfg.conn_port}"

        parts = [
            f"DRIVER={driver}",
            f"SERVER={server}",
            f"DATABASE={self._escape_odbc_value(cfg.conn_database)}",
        ]

        has_username = cfg.conn_username is not None
        has_password = cfg.conn_password is not None
        if has_username and has_password:
            parts.append(f"UID={self._escape_odbc_value(cfg.conn_username)}")
            parts.append(f"PWD={self._escape_odbc_value(cfg.conn_password)}")
        elif not has_username and not has_password:
            # No credentials at all: use integrated (Windows) authentication.
            parts.append("Trusted_Connection=Yes")
        else:
            self.log_and_raise(
                ValueError,
                "Credentials definition is not valid. Please check username or password!",
            )

        if driver == "ODBC Driver 18 for SQL Server":
            parts.append("TrustServerCertificate=Yes")

        parts.append("APP=SqlDataLoader")
        params = urllib.parse.quote_plus(";".join(parts))

        return f"mssql+pyodbc:///?odbc_connect={params}"

    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Create a SQLAlchemy engine using the connector configuration.

        Args:
            settings: Accepted for interface compatibility; the connection
                string is built from ``self.config``.

        Returns:
            The pooled engine.

        Raises:
            ValueError: If no connection string could be built.
            Exception: Whatever ``create_engine`` raises, after logging it.
        """
        connection_string = self._build_connection_string()
        if not connection_string:
            self.log_and_raise(
                ValueError, "Connection string could not be created from configuration"
            )

        try:
            return create_engine(
                connection_string,
                fast_executemany=True,
                pool_size=5,
                max_overflow=10,
                pool_timeout=30,
                pool_recycle=3600,
                echo=self.is_debug_enabled,
            )
        except Exception as e:
            self.log_exception(e, "Could not create engine from connection string")
            raise

    def get_engine(self) -> Engine:
        """Return the engine to be used."""
        return self.engine

    def _test_engine(self) -> bool:
        """Test that the connection works by running ``SELECT 1``.

        Returns:
            True when the round trip succeeds.

        Raises:
            EngineTestException: If connecting or executing the probe fails.
        """
        try:
            with self.engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            self.logger.debug(f"Connection test successful: {self.config.conn_server} - {self.config.conn_database}")
            return True
        except Exception as e:
            self.log_and_raise(
                EngineTestException,
                "Connection test failed",
                exception=str(e),
            )
            # Defensive fallback; log_and_raise is expected to raise.
            return False

    def _dispose_engine(self) -> None:
        """Dispose of the engine held by this connector."""
        self.logger.debug('Disposing of engine')
        return self.engine.dispose()
130
+
131
+
132
class SQLiteDatabaseConnector(LoggedComponent, DatabaseConnector):
    """Connector that talks to a SQLite database file through SQLAlchemy."""

    def __init__(self, config: ConnectionSettings):
        """Store *config*, build the engine and verify it right away."""
        super().__init__()
        self.config = config
        self.engine = self._create_engine(self.config)
        self._test_engine()

    def _build_connection_string(self) -> str:
        """Return the SQLite URL; ``conn_server`` is treated as the file path."""
        db_path = self.config.conn_server
        if not db_path:
            self.log_and_raise(ValueError, "Connection configuration is not valid. SQLite requires a file path in conn_server.")
        return f"sqlite:///{db_path}"

    def _create_engine(self, settings: ConnectionSettings) -> Engine:
        """Build the SQLAlchemy engine; failures are logged and re-raised."""
        url = self._build_connection_string()
        try:
            # SQLite needs no pooling options, only the URL and the echo flag.
            return create_engine(url, echo=self.is_debug_enabled)
        except Exception as err:
            self.log_exception(err, "Could not create engine from connection string")
            raise

    def get_engine(self) -> Engine:
        """Hand back the engine created at construction time."""
        return self.engine

    def _test_engine(self) -> bool:
        """Run ``SELECT 1`` and report success, raising EngineTestException on failure."""
        try:
            with self.engine.connect() as connection:
                connection.execute(text("SELECT 1"))
            self.logger.info(f"Connection test successful: {self.config.conn_server} - {self.config.conn_database}")
            return True
        except Exception as err:
            self.log_and_raise(
                EngineTestException,
                "Connection test failed",
                exception=str(err),
            )
            return False

    def _dispose_engine(self) -> None:
        """Dispose of the engine held by this connector."""
        self.logger.debug('Disposing of engine')
        return self.engine.dispose()
184
+
185
+
186
# Registry mapping each supported server type to the connector class that handles it.
CONNECTOR_FACTORY = {
    ServerType.MSSQL: SqlServerDatabaseConnector,
    ServerType.SQLITE: SQLiteDatabaseConnector,
    # Not registered yet:
    # ServerType.POSTGRESQL: PostgresDatabaseConnector
}
@@ -0,0 +1,129 @@
1
+ from sqlalchemy import inspect, text, Table, Column, MetaData
2
+ from sqlalchemy.engine import Engine
3
+ from sqlalchemy.types import TypeEngine
4
+ from pandas import DataFrame
5
+ import pandas as pd
6
+ from typing import Dict, Any, Optional
7
+ from .log import LoggedComponent
8
+
9
class DatabaseOperations(LoggedComponent):
    """Component responsible for all SQL interactions using SQLAlchemy engines."""

    def __init__(self, engine: Engine):
        """Keep *engine* and create an inspector for metadata lookups."""
        super().__init__()
        self.engine = engine
        self._inspector = inspect(self.engine)

    def write_to_table(self, table_name: str, df: DataFrame, **kwargs) -> bool:
        """Write a dataframe to a table in the database.

        Args:
            table_name: Target table.
            df: Data to write.
            **kwargs: Forwarded to ``DataFrame.to_sql``.

        Returns:
            True when the write succeeds.

        Raises:
            Exception: Whatever ``to_sql`` raises, after logging it.
        """
        self.logger.info(f"Writing {len(df)} rows to table: {table_name}")
        try:
            df.to_sql(table_name, con=self.engine, **kwargs)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to write to table {table_name}")
            raise

    def read_data(self, sql: str, **kwargs) -> DataFrame:
        """Run *sql* and return the result as a dataframe.

        Args:
            sql: Query (or table name) passed to ``pandas.read_sql``.
            **kwargs: Forwarded to ``pandas.read_sql``.

        Raises:
            Exception: Whatever ``read_sql`` raises, after logging it.
        """
        self.logger.debug("Reading data.")
        try:
            return pd.read_sql(sql, con=self.engine, **kwargs)
        except Exception as e:
            self.log_exception(e, "Failed to read data")
            raise

    def inspect_table(self, table_name: str) -> dict:
        """Read the metadata (column names and types) of a table.

        Returns:
            ``{column_name: type_as_string}``; an empty dict when the table has
            no columns or the lookup fails (the failure is logged, not raised).
        """
        self.logger.debug(f"Inspecting metadata for table: {table_name}")
        try:
            columns = self._inspector.get_columns(table_name)
            if not columns:
                self.logger.warning(f"Table {table_name} not found or has no columns.")
                return {}

            return {col['name']: str(col['type']) for col in columns}
        except Exception as e:
            self.log_exception(e, f"Failed to inspect table {table_name}")
            return {}

    def create_table(self, table_name: str, schema: Dict[str, TypeEngine]) -> bool:
        """Create a table dynamically.

        Args:
            table_name: Name of the table to create.
            schema: ``{'col_name': sqlalchemy_type}``.

        Returns:
            True on success, False when creation fails (the failure is logged).
        """
        self.logger.info(f"Creating table: {table_name}")
        try:
            metadata = MetaData()
            columns = [Column(name, col_type) for name, col_type in schema.items()]
            # Constructing the Table registers it on `metadata`; no reference is needed.
            Table(table_name, metadata, *columns)

            metadata.create_all(self.engine)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to create table {table_name}")
            return False

    def execute_stored_procedure(self, procedure_name: str, **kwargs) -> bool:
        """Execute a stored procedure inside a transaction.

        Args:
            procedure_name: Procedure to execute.
            **kwargs: Parameter values; they are bound in keyword order as
                positional arguments of the ``EXEC`` statement.

        Returns:
            True when the procedure ran and the transaction was committed.

        Raises:
            Exception: Whatever the execution raises, after logging it.
        """
        self.logger.info(f"Executing stored procedure: {procedure_name}")
        try:
            params_str = ", ".join(f":{k}" for k in kwargs)
            # NOTE(review): procedure_name and the kwargs keys are interpolated
            # into the SQL text; they must come from trusted configuration.
            sql = text(f"EXEC {procedure_name} {params_str}")

            # engine.begin() commits on normal exit of the block.
            with self.engine.begin() as conn:
                conn.execute(sql, kwargs)
            return True
        except Exception as e:
            self.log_exception(e, f"Failed to execute procedure {procedure_name}")
            raise

    def write_audit(self, table_name: str, entry: 'AuditEntry') -> None:
        """Write an audit entry to the database, ensuring the table exists first.

        Args:
            table_name: Audit table name; created if missing.
            entry: Audit record exposing ``execution_id`` and ``model_dump()``.

        Raises:
            Exception: Whatever table creation or the insert raises, after logging it.
        """
        self.logger.debug(f"Writing audit entry for execution: {entry.execution_id}")

        try:
            import json

            from sqlalchemy import BigInteger, DateTime, Integer, String

            metadata = MetaData()
            # Registering the Table on `metadata` is what create_all() acts on.
            Table(
                table_name,
                metadata,
                Column("execution_id", String(50), primary_key=True),
                Column("pipeline_name", String(100)),
                Column("status", String(50)),
                Column("input_rows", Integer),
                Column("output_rows", Integer),
                Column("source_name", String(255)),
                Column("destination_name", String(255)),
                Column("file_name", String(255)),
                Column("file_path", String(500)),
                Column("file_size_bytes", BigInteger),
                Column("file_last_modified", DateTime),
                Column("sp_name", String(255)),
                Column("sp_parameters", String),
                Column("timestamp", DateTime),
                Column("error_details", String),
            )
            metadata.create_all(self.engine)

            data = entry.model_dump()
            # Serialize dictionary fields for storage.
            if data.get("sp_parameters"):
                try:
                    data["sp_parameters"] = json.dumps(data["sp_parameters"], default=str)
                except (TypeError, ValueError, RecursionError):
                    # Best effort: fall back to the plain string representation.
                    data["sp_parameters"] = str(data["sp_parameters"])

            pd.DataFrame([data]).to_sql(
                table_name, con=self.engine, if_exists="append", index=False
            )

        except Exception as e:
            self.log_exception(e, f"Failed to write audit entry to {table_name}")
            raise
@@ -0,0 +1,46 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import pyodbc
4
+
5
+ from .custom_exceptions import DriverNotFoundException
6
+ from .log import LoggedComponent
7
+
8
+
9
class DriverDetector(ABC):
    """Interface for components that discover and choose database drivers."""

    @abstractmethod
    def get_available_drivers(self) -> list[str]:
        """Return the names of the drivers that are available."""
        ...

    @abstractmethod
    def select_preferred_driver(self) -> str:
        """Return the best driver among the available ones."""
        ...
19
+
20
+
21
class SqlServerDriverDetector(LoggedComponent, DriverDetector):
    """Detects which SQL Server ODBC driver to use, in order of preference."""

    def __init__(self):
        super().__init__()
        # Most preferred first.
        self._preferred_drivers = [
            "ODBC Driver 18 for SQL Server",
            "ODBC Driver 17 for SQL Server",
        ]

    def get_available_drivers(self) -> list[str]:
        """Get all available ODBC drivers as reported by pyodbc."""
        return pyodbc.drivers()

    def select_preferred_driver(self) -> str:
        """Select the first detected driver in order of preference.

        Returns:
            The name of the first preferred driver that is available.

        Raises:
            DriverNotFoundException: If none of the preferred drivers is available.
        """
        available_drivers = self.get_available_drivers()

        for driver in self._preferred_drivers:
            if driver in available_drivers:
                return driver

        # Give the exception a real message instead of an empty string.
        self.log_and_raise(
            DriverNotFoundException,
            "No supported SQL Server ODBC driver was found",
            available_drivers=available_drivers,
            preferred_drivers=self._preferred_drivers,
        )
@@ -0,0 +1,146 @@
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from typing import Callable, Dict, Optional
4
+ from .log import LoggedComponent
5
+ from .models import FileType, FileSettings
6
+
7
class FileOperations(LoggedComponent):
    """Handles all file operations (locating, reading and writing data files)."""

    def __init__(self, settings: FileSettings):
        """Store *settings* and register the reader/writer for each file type."""
        super().__init__()
        self.settings = settings
        self.file_type = self.settings.file_type
        # Resolved lazily by read_file() or _apply_file_preprocessor().
        self.file_path: Optional[Path] = None

        # Mapping of file type to the pandas reader.
        self._readers: Dict[FileType, Callable] = {
            FileType.CSV: pd.read_csv,
            FileType.EXCEL: pd.read_excel,
            FileType.PARQUET: pd.read_parquet,
            FileType.ORC: pd.read_orc,
        }

        # Mapping of file type to the internal writer.
        self._writers: Dict[FileType, Callable] = {
            FileType.CSV: self._write_csv,
            FileType.EXCEL: self._write_excel,
            FileType.PARQUET: self._write_parquet,
            FileType.ORC: self._write_orc,
        }

    def _find_file(self) -> Path:
        """Identify the input file by pattern (latest match) or explicit name.

        Returns:
            The most recently modified file matching ``file_pattern``, or the
            file named ``file_name`` with the file type's extension.

        Raises:
            ValueError: If nothing matches, the named file does not exist, or
                neither a pattern nor a name is configured.
        """
        if self.settings.file_pattern:
            # Directories can match a glob too; only regular files are readable.
            files = [
                candidate
                for candidate in self.settings.folder_path.glob(self.settings.file_pattern)
                if candidate.is_file()
            ]
            if not files:
                self.log_and_raise(ValueError, f'No files found by pattern {self.settings.file_pattern} inside folder {self.settings.folder_path}')
            return max(files, key=lambda f: f.stat().st_mtime)

        if self.settings.file_name:
            file = (self.settings.folder_path / self.settings.file_name).with_suffix(f'.{self.settings.file_type.value.lower()}')
            if not file.exists():
                self.log_and_raise(ValueError, 'Error finding file, please check the file definition.')
            return file

        self.log_and_raise(ValueError, 'Error finding file by name or pattern')

    def read_file(self, **kwargs) -> pd.DataFrame:
        """Read the configured file into a pandas DataFrame.

        The reader is chosen by file type. The file is located first when no
        path has been resolved yet.

        Args:
            **kwargs: Forwarded to the pandas reader.

        Raises:
            ValueError: If the file cannot be located.
            Exception: Whatever the reader raises, after logging it.
        """
        if not self.file_path:
            self.file_path = self._find_file()

        try:
            self.logger.info(f'Reading file: {self.file_path.name}')
            return self._readers[self.file_type](self.file_path, **kwargs)
        except Exception as e:
            self.log_exception(e, f'Failed to read {self.file_path.name} into a dataframe')
            raise

    def write_file(self, df: pd.DataFrame, **kwargs) -> Path:
        """Ensure a valid path exists and write the dataframe.

        If ``file_name`` is set it is used (and overwritten). Otherwise the
        name is built from ``file_pattern`` plus a timestamp.

        Args:
            df: Data to write.
            **kwargs: Forwarded to the pandas writer.

        Returns:
            The path that was written.

        Raises:
            ValueError: If no output path can be constructed.
            Exception: Whatever directory creation or the writer raises, after logging it.
        """
        output_path = self._construct_output_path()
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.touch(exist_ok=True)

            self.logger.info(f"Writing to file: {output_path.name}")
            self._writers[self.file_type](df, output_path, **kwargs)

            return output_path
        except Exception as e:
            self.log_exception(e, f'Failed to write to: {output_path}')
            raise

    def _construct_output_path(self) -> Path:
        """Construct the output file path based on the settings.

        With ``file_name`` the path is fixed. With ``file_pattern`` a timestamp
        is appended to the pattern.

        Raises:
            ValueError: If neither ``file_name`` nor ``file_pattern`` is set.
        """
        file_extension = f'.{self.settings.file_type.value.lower()}'

        if self.settings.file_name:
            return (self.settings.folder_path / self.settings.file_name).with_suffix(file_extension)

        if not self.settings.file_pattern:
            # Without this guard the file would be named "None_<timestamp>".
            self.log_and_raise(ValueError, 'Cannot build output path: neither file_name nor file_pattern is set')

        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        return self.settings.folder_path / f"{self.settings.file_pattern}_{timestamp}{file_extension}"

    def _apply_file_preprocessor(self, preprocessor_func: Optional[Callable[[Path], Path]]):
        """Run an optional pre-processor on the located file and record the path.

        Args:
            preprocessor_func: Callable receiving the current path and returning
                the path to read from, or None to skip pre-processing.

        The original path is kept when the function is None, returns something
        that is not a ``Path``, or raises (the failure is logged, not re-raised).
        """
        current_path = self._find_file()
        self.logger.info(f'Current file path is: {current_path}')

        if preprocessor_func is None:
            self.file_path = current_path
            return

        try:
            self.logger.info(f"Running pre-processor on: {current_path}")
            # Run the pre-processor and record the resulting path, old or new.
            processed_path = preprocessor_func(current_path)

            if isinstance(processed_path, Path):
                self.file_path = processed_path
                self.logger.debug(f'File path updated to: {self.file_path}')
            else:
                self.logger.warning('Pre-processor function did not return a new Path')
                self.file_path = current_path

        except Exception as e:
            self.log_exception(e, f'Pre-processor failed for : {current_path}')
            # On failure keep the original path so the pipeline can try to proceed.
            self.file_path = current_path

    def _write_csv(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal CSV writer."""
        df.to_csv(file_path, **kwargs)

    def _write_excel(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal Excel writer."""
        df.to_excel(file_path, **kwargs)

    def _write_parquet(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal Parquet writer."""
        df.to_parquet(file_path, **kwargs)

    def _write_orc(self, df: pd.DataFrame, file_path: Path, **kwargs):
        """Internal ORC writer."""
        df.to_orc(file_path, **kwargs)
145
+
146
+
@@ -0,0 +1,90 @@
1
+ import logging
2
+ import logging.handlers
3
+ from pathlib import Path
4
+
5
+
6
class AppLogger:
    """Process-wide singleton that configures the root logger once.

    The first instantiation replaces the root logger's handlers with a console
    handler and a rotating file handler writing to ``logs/application.log``.
    """

    _instance = None
    _initialized = False

    def __new__(cls):
        # Always hand back the same instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every AppLogger() call; configure logging only once.
        if not self._initialized:
            self._setup_logging()
            AppLogger._initialized = True

    def _setup_logging(self):
        """Configure the root logger with console and rotating-file handlers."""
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)

        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        root_logger.handlers.clear()

        # Console
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

        # File: explicit UTF-8 so non-ASCII messages do not depend on the
        # platform default encoding.
        file_handler = logging.handlers.RotatingFileHandler(
            log_dir / "application.log",
            maxBytes=10 * 1024 * 1024,
            backupCount=5,
            encoding="utf-8",
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)

    def get_logger(self, name: str) -> logging.Logger:
        """Return the logger registered under *name*."""
        return logging.getLogger(name)

    def set_level(self, level: str):
        """Change the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL).

        Args:
            level: Level name, case-insensitive.

        Raises:
            AttributeError: If *level* is not an attribute of the logging module.
        """
        log_level = getattr(logging, level.upper())
        root_logger = logging.getLogger()
        root_logger.setLevel(log_level)

        # Keep every handler in step with the root logger.
        for handler in root_logger.handlers:
            handler.setLevel(log_level)

    @property
    def is_debug(self) -> bool:
        """Whether the root logger currently emits DEBUG records."""
        return logging.getLogger().isEnabledFor(logging.DEBUG)
60
+
61
+
62
class LoggedComponent:
    """Base class providing logging functionality to all components."""

    def __init__(self):
        self.log = AppLogger()
        # One logger per concrete class, named after it.
        self.logger = self.log.get_logger(self.__class__.__name__)

    @staticmethod
    def _format_context(context: dict) -> str:
        """Render context key/value pairs as ``k1=v1, k2=v2``."""
        return ", ".join(f"{k}={v}" for k, v in context.items())

    def log_and_raise(self, exception_class, message: str = "", **context):
        """Log an error message and raise an exception built from it.

        Args:
            exception_class: Exception type to raise; called with *message*.
            message: Text used for both the log record and the exception.
            **context: Extra key/value pairs appended to the log record only.

        Raises:
            exception_class: Always.
        """
        if context:
            self.logger.error(f"{message} | Context: {self._format_context(context)}")
        else:
            self.logger.error(message)

        raise exception_class(message)

    def log_exception(self, exc: Exception, message: str = "", **context):
        """Log a caught exception with optional context.

        Args:
            exc: The exception to report; its traceback is attached to the record.
            message: Log text; defaults to a generic message containing ``str(exc)``.
            **context: Extra key/value pairs appended to the log record.
        """
        log_msg = message or f"Exception occurred: {str(exc)}"
        if context:
            log_msg += f" | Context: {self._format_context(context)}"

        # Pass the exception itself so it is recorded even when this is called
        # outside the ``except`` block that caught it.
        self.logger.error(log_msg, exc_info=exc)

    @property
    def is_debug_enabled(self) -> bool:
        """Whether debug logging is currently enabled."""
        return self.log.is_debug