easy-data-loader 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {easy_data_loader-0.1.2/src/easy_data_loader.egg-info → easy_data_loader-0.1.3}/PKG-INFO +2 -1
  2. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/pyproject.toml +2 -1
  3. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/custom_exceptions.py +6 -0
  4. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/database_connector.py +102 -1
  5. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/database_operations.py +34 -7
  6. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/models.py +11 -3
  7. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/orchestrator.py +9 -5
  8. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/pipeline.py +52 -3
  9. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/pipeline_base.py +10 -4
  10. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/procedure_pipeline.py +6 -4
  11. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3/src/easy_data_loader.egg-info}/PKG-INFO +2 -1
  12. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader.egg-info/SOURCES.txt +3 -1
  13. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader.egg-info/requires.txt +1 -0
  14. easy_data_loader-0.1.3/tests/test_orchestrator.py +134 -0
  15. easy_data_loader-0.1.3/tests/test_validation.py +219 -0
  16. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/LICENSE +0 -0
  17. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/README.md +0 -0
  18. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/setup.cfg +0 -0
  19. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/__init__.py +0 -0
  20. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/cli.py +0 -0
  21. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/config_loader.py +0 -0
  22. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/data_inferrence.py +0 -0
  23. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/driver_detector.py +0 -0
  24. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/file_operations.py +0 -0
  25. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader/log.py +0 -0
  26. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader.egg-info/dependency_links.txt +0 -0
  27. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader.egg-info/entry_points.txt +0 -0
  28. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/src/easy_data_loader.egg-info/top_level.txt +0 -0
  29. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/tests/test_data_inference.py +0 -0
  30. {easy_data_loader-0.1.2 → easy_data_loader-0.1.3}/tests/test_imports.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easy_data_loader
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Data transfer utilities between files and databases
5
5
  Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -16,6 +16,7 @@ License-File: LICENSE
16
16
  Requires-Dist: click>=8.3.0
17
17
  Requires-Dist: openpyxl>=3.1.5
18
18
  Requires-Dist: pandas>=2.3.3
19
+ Requires-Dist: psycopg2-binary>=2.9.11
19
20
  Requires-Dist: pyarrow>=22.0.0
20
21
  Requires-Dist: pydantic>=2.12.5
21
22
  Requires-Dist: pydantic-settings>=2.12.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "easy_data_loader"
3
- version = "0.1.2"
3
+ version = "0.1.3"
4
4
  description = "Data transfer utilities between files and databases"
5
5
  authors = [{ name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com" }]
6
6
  readme = "README.md"
@@ -9,6 +9,7 @@ dependencies = [
9
9
  "click>=8.3.0",
10
10
  "openpyxl>=3.1.5",
11
11
  "pandas>=2.3.3",
12
+ "psycopg2-binary>=2.9.11",
12
13
  "pyarrow>=22.0.0",
13
14
  "pydantic>=2.12.5",
14
15
  "pydantic-settings>=2.12.0",
@@ -20,3 +20,9 @@ class InvalidFileException(Exception):
20
20
  def __init__(self, message: str = "The provided file is invalid or corrupted"):
21
21
  self.message = message
22
22
  super().__init__(self.message)
23
+
24
+
25
+ class PipelineValidationError(Exception):
26
+ def __init__(self, message: str):
27
+ self.message = message
28
+ super().__init__(self.message)
@@ -220,8 +220,109 @@ class SQLiteDatabaseConnector(LoggedComponent, DatabaseConnector):
220
220
  self.engine.dispose()
221
221
 
222
222
 
223
+ class PostgresDatabaseConnector(LoggedComponent, DatabaseConnector):
224
+ """Database connector to a PostgreSQL database"""
225
+
226
+ def __init__(self, config: ConnectionSettings):
227
+ super().__init__()
228
+ self.config = config
229
+ self.engine = self._create_engine(self.config)
230
+ self._test_engine()
231
+
232
+ def _build_connection_string(self) -> str:
233
+ """Define the connection string from the given configuration"""
234
+ if not isinstance(self.config, ServerBasedConnectionSettings):
235
+ self.log_and_raise(
236
+ ValueError,
237
+ "PostgresDatabaseConnector requires ServerBasedConnectionSettings",
238
+ )
239
+
240
+ # Validate minimum parameters
241
+ if not all(
242
+ [self.config.conn_server, self.config.conn_database, self.config.conn_port]
243
+ ):
244
+ self.log_and_raise(
245
+ ValueError,
246
+ "Postgres connection configuration is missing host, port or database name",
247
+ )
248
+
249
+ # encode special characters in password and username
250
+ user = (
251
+ urllib.parse.quote_plus(self.config.conn_username)
252
+ if self.config.conn_username
253
+ else ""
254
+ )
255
+ password = (
256
+ urllib.parse.quote_plus(self.config.conn_password)
257
+ if self.config.conn_password
258
+ else ""
259
+ )
260
+
261
+ host = self.config.conn_server
262
+ port = self.config.conn_port
263
+ database = self.config.conn_database
264
+
265
+ # Standard SQLAlchemy format: postgresql+psycopg2://user:password@host:port/dbname
266
+ if user and password:
267
+ return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
268
+
269
+ # trusted connection
270
+ return f"postgresql+psycopg2://{user}@{host}:{port}/{database}"
271
+
272
+ def _create_engine(self, settings: ConnectionSettings) -> Engine:
273
+ """Create a sqlalchemy engine using the provided configuration"""
274
+ connection_string = self._build_connection_string()
275
+
276
+ try:
277
+ engine = create_engine(
278
+ connection_string,
279
+ pool_size=5,
280
+ max_overflow=10,
281
+ pool_timeout=30,
282
+ pool_recycle=3600,
283
+ echo=self.is_debug_enabled,
284
+ )
285
+ return engine
286
+ except Exception as e:
287
+ self.log_exception(
288
+ e, "Could not create Postgres engine from connection string"
289
+ )
290
+ raise
291
+
292
+ def get_engine(self) -> Engine:
293
+ """Return the engine to be used"""
294
+ return self.engine
295
+
296
+ def _test_engine(self) -> bool:
297
+ """Test if the connection works"""
298
+ if not isinstance(self.config, ServerBasedConnectionSettings):
299
+ self.log_and_raise(
300
+ ValueError,
301
+ "PostgresDatabaseConnector requires ServerBasedConnectionSettings",
302
+ )
303
+
304
+ try:
305
+ with self.engine.connect() as conn:
306
+ conn.execute(text("SELECT 1"))
307
+ self.logger.debug(
308
+ f"Postgres connection test successful: {self.config.conn_server} - {self.config.conn_database}"
309
+ )
310
+ return True
311
+ except Exception as e:
312
+ self.log_and_raise(
313
+ EngineTestException,
314
+ "Postgres connection test failed",
315
+ exception=str(e),
316
+ )
317
+ return False
318
+
319
+ def _dispose_engine(self) -> None:
320
+ self.logger.debug("Disposing of Postgres engine")
321
+ self.engine.dispose()
322
+
323
+
223
324
  CONNECTOR_FACTORY: dict[ServerType, type] = {
224
325
  ServerType.MSSQL: SqlServerDatabaseConnector,
225
326
  ServerType.SQLITE: SQLiteDatabaseConnector,
226
- # ServerType.POSTGRESQL: PostgresDatabaseConnector
327
+ ServerType.POSTGRESQL: PostgresDatabaseConnector,
227
328
  }
@@ -2,7 +2,7 @@ from typing import Dict
2
2
 
3
3
  import pandas as pd
4
4
  from pandas import DataFrame
5
- from sqlalchemy import Column, MetaData, Table, inspect, text
5
+ from sqlalchemy import Column, MetaData, Table, TextClause, inspect, text
6
6
  from sqlalchemy.engine import Engine
7
7
  from sqlalchemy.types import TypeEngine
8
8
  from sqlalchemy.schema import SchemaItem
@@ -74,12 +74,36 @@ class DatabaseOperations(LoggedComponent):
74
74
  self.log_exception(e, f"Failed to create table {table_name}")
75
75
  return False
76
76
 
77
+ def _build_procedure_sql(
78
+ self, dialect: str, procedure_name: str, params: dict
79
+ ) -> TextClause:
80
+ """Build dialect-specific SQL for calling a stored procedure."""
81
+ params_keys = list(params.keys())
82
+
83
+ if dialect == "mssql":
84
+ if params_keys:
85
+ params_str = ", ".join([f":{k}" for k in params_keys])
86
+ return text(f"EXEC {procedure_name} {params_str};")
87
+ return text(f"EXEC {procedure_name};")
88
+ elif dialect == "postgresql":
89
+ params_str = ", ".join([f":{k}" for k in params_keys])
90
+ return text(f"CALL {procedure_name}({params_str});")
91
+ elif dialect == "sqlite":
92
+ raise NotImplementedError("SQLite does not support stored procedures.")
93
+ else:
94
+ raise NotImplementedError(
95
+ f"Stored procedure execution is not supported for dialect: {dialect}"
96
+ )
97
+
77
98
  def execute_stored_procedure(self, procedure_name: str, **kwargs) -> bool:
78
- """Execute a stored procedure using the engine connection"""
99
+ """Execute a stored procedure using the engine connection.
100
+
101
+ Generates dialect-appropriate SQL based on the engine's dialect.
102
+ """
79
103
  self.logger.info(f"Executing stored procedure: {procedure_name}")
80
104
  try:
81
- params_str = ", ".join([f":{k}" for k in kwargs.keys()])
82
- sql = text(f"EXEC {procedure_name} {params_str}")
105
+ dialect = self.engine.dialect.name
106
+ sql = self._build_procedure_sql(dialect, procedure_name, kwargs)
83
107
 
84
108
  with self.engine.begin() as conn:
85
109
  # self.engine.begin() is a context manager for the transaction
@@ -87,12 +111,14 @@ class DatabaseOperations(LoggedComponent):
87
111
  conn.execute(sql, kwargs)
88
112
  return True
89
113
  except Exception as e:
90
- self.log_exception(e, f"Failed to execute procedure {procedure_name}")
114
+ self.logger.error(
115
+ f"Failed to execute procedure {procedure_name} | Error: {str(e)}"
116
+ )
91
117
  raise
92
118
 
93
119
  def write_audit(self, table_name: str, entry: AuditEntry):
94
120
  """Write an audit entry to the database, ensuring the table exists first."""
95
- self.logger.debug(f"Writing audit entry for execution: {entry.execution_id}")
121
+ self.logger.debug(f"Writing audit entry for execution: {entry.pipeline_id}")
96
122
 
97
123
  try:
98
124
  from sqlalchemy import (
@@ -108,7 +134,8 @@ class DatabaseOperations(LoggedComponent):
108
134
  metadata = MetaData()
109
135
 
110
136
  columns: list[SchemaItem] = [
111
- Column("execution_id", String(50), primary_key=True),
137
+ Column("pipeline_id", String(36), primary_key=True),
138
+ Column("orchestrator_id", String(36)),
112
139
  Column("pipeline_name", String(100)),
113
140
  Column("status", String(50)),
114
141
  Column("input_rows", Integer),
@@ -42,8 +42,8 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
42
42
 
43
43
  conn_server: str
44
44
  conn_database: str
45
- conn_username: str
46
- conn_password: str
45
+ conn_username: str | None = None
46
+ conn_password: str | None = None
47
47
  conn_port: Annotated[int, Field(default=1433, ge=1, le=65535)]
48
48
 
49
49
  model_config = SettingsConfigDict(extra="ignore")
@@ -62,6 +62,12 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
62
62
  raise ValueError(
63
63
  "Both username and password must be provided for authentication"
64
64
  )
65
+
66
+ # If the username and password are captured as empty strings nullify them
67
+ # which will lead the connection to be created as trusted connection for Sql Server
68
+ if self.conn_username == "" and self.conn_password == "":
69
+ self.conn_username = None
70
+ self.conn_password = None
65
71
  return self
66
72
 
67
73
 
@@ -128,6 +134,7 @@ class BasePipelineDefinition(BaseModel):
128
134
  destination_table: Optional[str] = None
129
135
  audit: Optional[str] = None # resource name for the audit database
130
136
  validator: Optional[Any] = None # pydantic model class for validation
137
+ validation_fail: bool = False
131
138
 
132
139
  # mapping of source columns to destination columns together with datatypes
133
140
  columns: Dict[str, ColumnDefinition] = Field(default_factory=dict)
@@ -195,7 +202,8 @@ class ProcedureDefinition(BaseModel):
195
202
  class AuditEntry(BaseModel):
196
203
  """Model representing an entry in the audit table"""
197
204
 
198
- execution_id: str
205
+ orchestrator_id: Optional[str] = None
206
+ pipeline_id: str
199
207
  pipeline_name: str
200
208
  status: str
201
209
  input_rows: int = 0
@@ -1,4 +1,4 @@
1
- import uuid
1
+ from uuid import uuid4
2
2
 
3
3
  from .config_loader import Configuration
4
4
  from .log import LoggedComponent
@@ -17,6 +17,7 @@ class OrchestratorPipeline(LoggedComponent):
17
17
 
18
18
  def __init__(self, orchestrator_name: str):
19
19
  super().__init__()
20
+ self.orchestrator_id = str(uuid4())
20
21
  self.config = Configuration()
21
22
  definition = self.config.get_pipeline(orchestrator_name)
22
23
 
@@ -26,11 +27,10 @@ class OrchestratorPipeline(LoggedComponent):
26
27
  )
27
28
 
28
29
  self.definition: OrchestratorDefinition = definition
29
- self.batch_id = str(uuid.uuid4())
30
30
 
31
31
  def run(self) -> bool:
32
32
  self.logger.info(
33
- f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Batch: {self.batch_id}) ==="
33
+ f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Orchestrator: {self.orchestrator_id}) ==="
34
34
  )
35
35
 
36
36
  success = True
@@ -44,9 +44,13 @@ class OrchestratorPipeline(LoggedComponent):
44
44
 
45
45
  # Instantiate and run
46
46
  if isinstance(p_def, BasePipelineDefinition):
47
- p_success = LoadPipeline(pipeline_name).run()
47
+ p_success = LoadPipeline(
48
+ pipeline_name, orchestrator_id=self.orchestrator_id
49
+ ).run()
48
50
  elif isinstance(p_def, ProcedureDefinition):
49
- p_success = ProcedurePipeline(pipeline_name).run()
51
+ p_success = ProcedurePipeline(
52
+ pipeline_name, orchestrator_id=self.orchestrator_id
53
+ ).run()
50
54
  elif isinstance(p_def, OrchestratorDefinition):
51
55
  self.logger.error(
52
56
  f"[{self.definition.orchestrator_name}] -> Nested orchestrators are not supported."
@@ -14,6 +14,7 @@ from .models import (
14
14
  FileBasedConnectionSettings,
15
15
  FileType,
16
16
  )
17
+ from .custom_exceptions import PipelineValidationError
17
18
  from .pipeline_base import BasePipeline
18
19
 
19
20
 
@@ -24,7 +25,8 @@ class LoadPipeline(BasePipeline):
24
25
  Inherits shared logic from BasePipeline.
25
26
  """
26
27
 
27
- def __init__(self, pipeline_name: str):
28
+ def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
29
+
28
30
  # Load definition from config
29
31
  definition = Configuration().get_pipeline(pipeline_name)
30
32
  if not isinstance(definition, BasePipelineDefinition):
@@ -32,7 +34,7 @@ class LoadPipeline(BasePipeline):
32
34
  f"Pipeline '{pipeline_name}' is not a LoadPipeline definition."
33
35
  )
34
36
 
35
- super().__init__(definition)
37
+ super().__init__(definition, orchestrator_id=orchestrator_id)
36
38
  self.definition: BasePipelineDefinition = definition
37
39
 
38
40
  # initialize components
@@ -95,6 +97,12 @@ class LoadPipeline(BasePipeline):
95
97
  # 2. TRANSFORM & VALIDATE
96
98
  df = self._transform_step(df)
97
99
  self.output_rows = len(df)
100
+ if df.empty:
101
+ self.logger.warning(
102
+ "No valid data remaining after validation! Pipeline will stop gracefully."
103
+ )
104
+ self._log_audit("SUCCESS")
105
+ return True
98
106
 
99
107
  # 3. LOAD
100
108
  load_success = self._load_step(df, inferred_dtypes)
@@ -111,6 +119,13 @@ class LoadPipeline(BasePipeline):
111
119
  )
112
120
  return True
113
121
 
122
+ except PipelineValidationError as e:
123
+ self.error_details = str(e)
124
+ self.logger.error(
125
+ f"Critical pipeline error - {self.definition.pipeline_name}: {str(e)}"
126
+ )
127
+ self._log_audit("FAILED")
128
+ return False
114
129
  except Exception as e:
115
130
  self.error_details = str(e)
116
131
  self.log_exception(
@@ -192,18 +207,52 @@ class LoadPipeline(BasePipeline):
192
207
  f"Starting data validation against model: {self.definition.validator.__name__}"
193
208
  )
194
209
  validated_rows = []
210
+ failed_rows = []
195
211
 
196
212
  # Optimization: iterrows is slow because it creates a Series for each row.
197
213
  # Converting to a list of dicts first is much faster for iteration.
198
214
  records = df.to_dict(orient="records")
199
215
 
216
+ from pydantic import ValidationError
217
+
200
218
  for i, record in enumerate(records):
201
219
  try:
202
220
  # Use model_validate for individual row validation
203
221
  model_inst = self.definition.validator.model_validate(record)
204
222
  validated_rows.append(model_inst.model_dump())
223
+ except ValidationError as ve:
224
+ # Format Pydantic validation error to be human friendly
225
+ errors_list = []
226
+ for err in ve.errors():
227
+ loc_str = " -> ".join(str(item) for item in err.get("loc", []))
228
+ msg = err.get("msg", "Validation error")
229
+ errors_list.append(f"{loc_str}: {msg}")
230
+ err_msg = ", ".join(errors_list)
231
+ failed_rows.append((i, record, err_msg))
205
232
  except Exception as e:
206
- self.logger.warning(f"Row {i} failed validation: {str(e)}")
233
+ failed_rows.append((i, record, str(e)))
234
+
235
+ if failed_rows:
236
+ total_failed = len(failed_rows)
237
+ total_records = len(records)
238
+ if self.definition.validation_fail:
239
+ first_errors = "; ".join(
240
+ [f"Row {idx}: {err}" for idx, _, err in failed_rows[:5]]
241
+ )
242
+ if len(failed_rows) > 5:
243
+ first_errors += f" ... and {len(failed_rows) - 5} more errors"
244
+ self.log_and_raise(
245
+ PipelineValidationError,
246
+ f"Validation failed for {total_failed} out of {total_records} rows: {first_errors}",
247
+ )
248
+ else:
249
+ self.logger.warning(
250
+ f"Validation failed for {total_failed} out of {total_records} rows."
251
+ )
252
+ for idx, record, err in failed_rows:
253
+ self.logger.debug(
254
+ f"Row {idx} failed validation: {err}. Row data: {record}"
255
+ )
207
256
 
208
257
  df = pd.DataFrame(validated_rows)
209
258
 
@@ -1,6 +1,6 @@
1
- import uuid
2
1
  from abc import ABC, abstractmethod
3
2
  from typing import List, Optional, Union
3
+ from uuid import uuid4
4
4
 
5
5
  import pandas as pd
6
6
 
@@ -27,10 +27,15 @@ class BasePipeline(LoggedComponent, ABC):
27
27
  - Shared resource cleanup
28
28
  """
29
29
 
30
- def __init__(self, definition: Union[BasePipelineDefinition, ProcedureDefinition]):
30
+ def __init__(
31
+ self,
32
+ definition: Union[BasePipelineDefinition, ProcedureDefinition],
33
+ orchestrator_id: Optional[str] = None,
34
+ ):
31
35
  super().__init__()
36
+ self.orchestrator_id = orchestrator_id
37
+ self.pipeline_id = str(uuid4())
32
38
  self.definition = definition
33
- self.execution_id = str(uuid.uuid4())
34
39
  self.config = Configuration()
35
40
 
36
41
  self._active_connectors: List[DatabaseConnector] = []
@@ -113,7 +118,8 @@ class BasePipeline(LoggedComponent, ABC):
113
118
  self.logger.warning(f"Could not read file metadata for audit: {e}")
114
119
 
115
120
  entry = AuditEntry(
116
- execution_id=self.execution_id,
121
+ orchestrator_id=self.orchestrator_id,
122
+ pipeline_id=self.pipeline_id,
117
123
  pipeline_name=self.definition.pipeline_name,
118
124
  status=status,
119
125
  input_rows=self.input_rows,
@@ -1,3 +1,5 @@
1
+ from typing import Optional
2
+
1
3
  from .config_loader import Configuration
2
4
  from .models import (
3
5
  ProcedureDefinition,
@@ -12,7 +14,7 @@ class ProcedurePipeline(BasePipeline):
12
14
  Pipeline specialized in executing database stored procedures.
13
15
  """
14
16
 
15
- def __init__(self, pipeline_name: str):
17
+ def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
16
18
  # Load definition from config
17
19
  definition = Configuration().get_pipeline(pipeline_name)
18
20
  if not isinstance(definition, ProcedureDefinition):
@@ -20,7 +22,7 @@ class ProcedurePipeline(BasePipeline):
20
22
  f"Pipeline '{pipeline_name}' is not a ProcedureDefinition."
21
23
  )
22
24
 
23
- super().__init__(definition)
25
+ super().__init__(definition, orchestrator_id=orchestrator_id)
24
26
  self.definition: ProcedureDefinition = definition
25
27
 
26
28
  # Set up primary database connection
@@ -61,8 +63,8 @@ class ProcedurePipeline(BasePipeline):
61
63
 
62
64
  except Exception as e:
63
65
  self.error_details = str(e)
64
- self.log_exception(
65
- e, f"Error in Procedure Pipeline: {self.definition.pipeline_name}"
66
+ self.logger.error(
67
+ f"Procedure Pipeline failed: {self.definition.pipeline_name} | Error: {str(e)}"
66
68
  )
67
69
 
68
70
  # Set audit details for failure too
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easy_data_loader
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Data transfer utilities between files and databases
5
5
  Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -16,6 +16,7 @@ License-File: LICENSE
16
16
  Requires-Dist: click>=8.3.0
17
17
  Requires-Dist: openpyxl>=3.1.5
18
18
  Requires-Dist: pandas>=2.3.3
19
+ Requires-Dist: psycopg2-binary>=2.9.11
19
20
  Requires-Dist: pyarrow>=22.0.0
20
21
  Requires-Dist: pydantic>=2.12.5
21
22
  Requires-Dist: pydantic-settings>=2.12.0
@@ -23,4 +23,6 @@ src/easy_data_loader.egg-info/entry_points.txt
23
23
  src/easy_data_loader.egg-info/requires.txt
24
24
  src/easy_data_loader.egg-info/top_level.txt
25
25
  tests/test_data_inference.py
26
- tests/test_imports.py
26
+ tests/test_imports.py
27
+ tests/test_orchestrator.py
28
+ tests/test_validation.py
@@ -1,6 +1,7 @@
1
1
  click>=8.3.0
2
2
  openpyxl>=3.1.5
3
3
  pandas>=2.3.3
4
+ psycopg2-binary>=2.9.11
4
5
  pyarrow>=22.0.0
5
6
  pydantic>=2.12.5
6
7
  pydantic-settings>=2.12.0
@@ -0,0 +1,134 @@
1
+ import os
2
+ import tempfile
3
+ import sqlite3
4
+ import pytest
5
+
6
+ from easy_data_loader.config_loader import Configuration
7
+ from easy_data_loader.models import (
8
+ BasePipelineDefinition,
9
+ OrchestratorDefinition,
10
+ FileBasedConnectionSettings,
11
+ ServerType,
12
+ )
13
+ from easy_data_loader.orchestrator import OrchestratorPipeline
14
+
15
+
16
+ @pytest.fixture(autouse=True)
17
+ def reset_configuration_singleton():
18
+ """Reset the Configuration singleton before and after each test."""
19
+ Configuration._instance = None
20
+ Configuration._initialized = False
21
+ yield
22
+ Configuration._instance = None
23
+ Configuration._initialized = False
24
+
25
+
26
+ def test_orchestrator_passes_orchestrator_id_and_writes_audit():
27
+ # Create temporary file paths for our sqlite databases
28
+ with tempfile.TemporaryDirectory() as tmpdir:
29
+ src_db_path = os.path.join(tmpdir, "src.db")
30
+ dst_db_path = os.path.join(tmpdir, "dst.db")
31
+ audit_db_path = os.path.join(tmpdir, "audit.db")
32
+
33
+ # Create tables and put some data in the source database
34
+ conn_src = sqlite3.connect(src_db_path)
35
+ cursor_src = conn_src.cursor()
36
+ cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
37
+ cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
38
+ cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bob')")
39
+ conn_src.commit()
40
+ conn_src.close()
41
+
42
+ # Set up resources in Configuration
43
+ config = Configuration()
44
+
45
+ src_resource = FileBasedConnectionSettings(
46
+ conn_server_type=ServerType.SQLITE, file_path=src_db_path
47
+ )
48
+ dst_resource = FileBasedConnectionSettings(
49
+ conn_server_type=ServerType.SQLITE, file_path=dst_db_path
50
+ )
51
+ audit_resource = FileBasedConnectionSettings(
52
+ conn_server_type=ServerType.SQLITE, file_path=audit_db_path
53
+ )
54
+
55
+ config.resources["src_db"] = src_resource
56
+ config.resources["dst_db"] = dst_resource
57
+ config.resources["audit_db"] = audit_resource
58
+
59
+ # Define 2 pipelines that copy data
60
+ pipeline1_def = BasePipelineDefinition(
61
+ pipeline_name="pipeline1",
62
+ source="src_db",
63
+ source_sql="SELECT * FROM source_table",
64
+ destination="dst_db",
65
+ destination_table="target_table1",
66
+ audit="audit_db",
67
+ write_parameters={"if_exists": "replace", "index": False},
68
+ )
69
+
70
+ pipeline2_def = BasePipelineDefinition(
71
+ pipeline_name="pipeline2",
72
+ source="src_db",
73
+ source_sql="SELECT * FROM source_table",
74
+ destination="dst_db",
75
+ destination_table="target_table2",
76
+ audit="audit_db",
77
+ write_parameters={"if_exists": "replace", "index": False},
78
+ )
79
+
80
+ # Define orchestrator
81
+ orchestrator_def = OrchestratorDefinition(
82
+ orchestrator_name="my_orchestrator",
83
+ pipelines=["pipeline1", "pipeline2"],
84
+ fail_fast=True,
85
+ )
86
+
87
+ config.pipelines["pipeline1"] = pipeline1_def
88
+ config.pipelines["pipeline2"] = pipeline2_def
89
+ config.pipelines["my_orchestrator"] = orchestrator_def
90
+
91
+ # Run the orchestrator
92
+ orchestrator = OrchestratorPipeline("my_orchestrator")
93
+ assert orchestrator.orchestrator_id is not None
94
+
95
+ success = orchestrator.run()
96
+ assert success is True
97
+
98
+ # Verify target tables were created and populated in destination
99
+ conn_dst = sqlite3.connect(dst_db_path)
100
+ cursor_dst = conn_dst.cursor()
101
+
102
+ cursor_dst.execute("SELECT COUNT(*) FROM target_table1")
103
+ assert cursor_dst.fetchone()[0] == 2
104
+
105
+ cursor_dst.execute("SELECT COUNT(*) FROM target_table2")
106
+ assert cursor_dst.fetchone()[0] == 2
107
+
108
+ conn_dst.close()
109
+
110
+ # Verify audit records in the audit database
111
+ conn_audit = sqlite3.connect(audit_db_path)
112
+ cursor_audit = conn_audit.cursor()
113
+
114
+ # Read the execution_audit table columns and rows
115
+ cursor_audit.execute(
116
+ "SELECT pipeline_id, orchestrator_id, pipeline_name, status FROM execution_audit"
117
+ )
118
+ rows = cursor_audit.fetchall()
119
+ assert len(rows) == 2
120
+
121
+ # Row 1 (pipeline1)
122
+ assert rows[0][1] == orchestrator.orchestrator_id
123
+ assert rows[0][2] == "pipeline1"
124
+ assert rows[0][3] == "SUCCESS"
125
+
126
+ # Row 2 (pipeline2)
127
+ assert rows[1][1] == orchestrator.orchestrator_id
128
+ assert rows[1][2] == "pipeline2"
129
+ assert rows[1][3] == "SUCCESS"
130
+
131
+ # Make sure they have distinct pipeline_ids but identical orchestrator_ids
132
+ assert rows[0][0] != rows[1][0]
133
+
134
+ conn_audit.close()
@@ -0,0 +1,219 @@
1
+ import os
2
+ import tempfile
3
+ import sqlite3
4
+ import pytest
5
+ from pydantic import BaseModel, Field
6
+
7
+ from easy_data_loader.config_loader import Configuration
8
+ from easy_data_loader.models import (
9
+ BasePipelineDefinition,
10
+ FileBasedConnectionSettings,
11
+ ServerType,
12
+ )
13
+ from easy_data_loader.pipeline import LoadPipeline
14
+
15
+
16
+ class ValidationTestModel(BaseModel):
17
+ id: int
18
+ name: str = Field(min_length=3)
19
+
20
+
21
+ @pytest.fixture(autouse=True)
22
+ def reset_configuration_singleton():
23
+ """Reset the Configuration singleton before and after each test."""
24
+ Configuration._instance = None
25
+ Configuration._initialized = False
26
+ yield
27
+ Configuration._instance = None
28
+ Configuration._initialized = False
29
+
30
+
31
+ def test_validation_fail_false_keeps_valid_rows():
32
+ with tempfile.TemporaryDirectory() as tmpdir:
33
+ src_db_path = os.path.join(tmpdir, "src.db")
34
+ dst_db_path = os.path.join(tmpdir, "dst.db")
35
+ audit_db_path = os.path.join(tmpdir, "audit.db")
36
+
37
+ # Create source data:
38
+ # - Row 1: Valid (name length 5 >= 3)
39
+ # - Row 2: Invalid (name length 2 < 3)
40
+ # - Row 3: Valid (name length 3 >= 3)
41
+ conn_src = sqlite3.connect(src_db_path)
42
+ cursor_src = conn_src.cursor()
43
+ cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
44
+ cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
45
+ cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
46
+ cursor_src.execute("INSERT INTO source_table VALUES (3, 'Charlie')")
47
+ conn_src.commit()
48
+ conn_src.close()
49
+
50
+ config = Configuration()
51
+
52
+ src_resource = FileBasedConnectionSettings(
53
+ conn_server_type=ServerType.SQLITE, file_path=src_db_path
54
+ )
55
+ dst_resource = FileBasedConnectionSettings(
56
+ conn_server_type=ServerType.SQLITE, file_path=dst_db_path
57
+ )
58
+ audit_resource = FileBasedConnectionSettings(
59
+ conn_server_type=ServerType.SQLITE, file_path=audit_db_path
60
+ )
61
+
62
+ config.resources["src_db"] = src_resource
63
+ config.resources["dst_db"] = dst_resource
64
+ config.resources["audit_db"] = audit_resource
65
+
66
+ pipeline_def = BasePipelineDefinition(
67
+ pipeline_name="test_pipeline",
68
+ source="src_db",
69
+ source_sql="SELECT * FROM source_table",
70
+ destination="dst_db",
71
+ destination_table="target_table",
72
+ audit="audit_db",
73
+ validator=ValidationTestModel,
74
+ validation_fail=False,
75
+ write_parameters={"if_exists": "replace", "index": False},
76
+ )
77
+
78
+ config.pipelines["test_pipeline"] = pipeline_def
79
+
80
+ # Run pipeline
81
+ pipeline = LoadPipeline("test_pipeline")
82
+ success = pipeline.run()
83
+
84
+ # Should be successful because validation_fail=False
85
+ assert success is True
86
+
87
+ # Verify destination table only has valid rows (1 and 3)
88
+ conn_dst = sqlite3.connect(dst_db_path)
89
+ cursor_dst = conn_dst.cursor()
90
+ cursor_dst.execute("SELECT id, name FROM target_table ORDER BY id")
91
+ rows = cursor_dst.fetchall()
92
+ conn_dst.close()
93
+
94
+ assert len(rows) == 2
95
+ assert rows[0] == (1, "Alice")
96
+ assert rows[1] == (3, "Charlie")
97
+
98
+
99
+ def test_validation_fail_true_fails_pipeline():
100
+ with tempfile.TemporaryDirectory() as tmpdir:
101
+ src_db_path = os.path.join(tmpdir, "src.db")
102
+ dst_db_path = os.path.join(tmpdir, "dst.db")
103
+ audit_db_path = os.path.join(tmpdir, "audit.db")
104
+
105
+ # Create source data with an invalid row
106
+ conn_src = sqlite3.connect(src_db_path)
107
+ cursor_src = conn_src.cursor()
108
+ cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
109
+ cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
110
+ cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
111
+ conn_src.commit()
112
+ conn_src.close()
113
+
114
+ config = Configuration()
115
+
116
+ src_resource = FileBasedConnectionSettings(
117
+ conn_server_type=ServerType.SQLITE, file_path=src_db_path
118
+ )
119
+ dst_resource = FileBasedConnectionSettings(
120
+ conn_server_type=ServerType.SQLITE, file_path=dst_db_path
121
+ )
122
+ audit_resource = FileBasedConnectionSettings(
123
+ conn_server_type=ServerType.SQLITE, file_path=audit_db_path
124
+ )
125
+
126
+ config.resources["src_db"] = src_resource
127
+ config.resources["dst_db"] = dst_resource
128
+ config.resources["audit_db"] = audit_resource
129
+
130
+ pipeline_def = BasePipelineDefinition(
131
+ pipeline_name="test_pipeline",
132
+ source="src_db",
133
+ source_sql="SELECT * FROM source_table",
134
+ destination="dst_db",
135
+ destination_table="target_table",
136
+ audit="audit_db",
137
+ validator=ValidationTestModel,
138
+ validation_fail=True,
139
+ write_parameters={"if_exists": "replace", "index": False},
140
+ )
141
+
142
+ config.pipelines["test_pipeline"] = pipeline_def
143
+
144
+ # Run pipeline
145
+ pipeline = LoadPipeline("test_pipeline")
146
+ success = pipeline.run()
147
+
148
+ # Should fail because validation_fail=True
149
+ assert success is False
150
+ assert pipeline.error_details is not None
151
+ assert "Validation failed" in pipeline.error_details
152
+ assert "Row 1:" in pipeline.error_details # Row 2 (0-indexed row 1) failed
153
+
154
+
155
+ def test_validation_fail_false_graceful_stop_when_all_rows_invalid():
156
+ with tempfile.TemporaryDirectory() as tmpdir:
157
+ src_db_path = os.path.join(tmpdir, "src.db")
158
+ dst_db_path = os.path.join(tmpdir, "dst.db")
159
+ audit_db_path = os.path.join(tmpdir, "audit.db")
160
+
161
+ # Create source data with ONLY invalid rows
162
+ conn_src = sqlite3.connect(src_db_path)
163
+ cursor_src = conn_src.cursor()
164
+ cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
165
+ cursor_src.execute("INSERT INTO source_table VALUES (1, 'Bo')")
166
+ cursor_src.execute("INSERT INTO source_table VALUES (2, 'Co')")
167
+ conn_src.commit()
168
+ conn_src.close()
169
+
170
+ config = Configuration()
171
+
172
+ src_resource = FileBasedConnectionSettings(
173
+ conn_server_type=ServerType.SQLITE, file_path=src_db_path
174
+ )
175
+ dst_resource = FileBasedConnectionSettings(
176
+ conn_server_type=ServerType.SQLITE, file_path=dst_db_path
177
+ )
178
+ audit_resource = FileBasedConnectionSettings(
179
+ conn_server_type=ServerType.SQLITE, file_path=audit_db_path
180
+ )
181
+
182
+ config.resources["src_db"] = src_resource
183
+ config.resources["dst_db"] = dst_resource
184
+ config.resources["audit_db"] = audit_resource
185
+
186
+ pipeline_def = BasePipelineDefinition(
187
+ pipeline_name="test_pipeline",
188
+ source="src_db",
189
+ source_sql="SELECT * FROM source_table",
190
+ destination="dst_db",
191
+ destination_table="target_table",
192
+ audit="audit_db",
193
+ validator=ValidationTestModel,
194
+ validation_fail=False,
195
+ write_parameters={"if_exists": "replace", "index": False},
196
+ )
197
+
198
+ config.pipelines["test_pipeline"] = pipeline_def
199
+
200
+ # Run pipeline
201
+ pipeline = LoadPipeline("test_pipeline")
202
+ success = pipeline.run()
203
+
204
+ # Should stop gracefully and return True because validation_fail=False
205
+ assert success is True
206
+ assert pipeline.output_rows == 0
207
+
208
+ # Destination table should NOT have been loaded or have any data
209
+ conn_dst = sqlite3.connect(dst_db_path)
210
+ cursor_dst = conn_dst.cursor()
211
+ # The table should not exist or be empty
212
+ try:
213
+ cursor_dst.execute("SELECT COUNT(*) FROM target_table")
214
+ count = cursor_dst.fetchone()[0]
215
+ assert count == 0
216
+ except sqlite3.OperationalError:
217
+ # Table not even created, which is also correct/graceful
218
+ pass
219
+ conn_dst.close()