easy-data-loader 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {easy_data_loader-0.1.2/src/easy_data_loader.egg-info → easy_data_loader-0.1.4}/PKG-INFO +5 -2
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/pyproject.toml +5 -2
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/custom_exceptions.py +6 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_connector.py +102 -1
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_operations.py +34 -7
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/models.py +11 -3
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/orchestrator.py +9 -5
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/pipeline.py +52 -3
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/pipeline_base.py +10 -4
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/procedure_pipeline.py +6 -4
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4/src/easy_data_loader.egg-info}/PKG-INFO +5 -2
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/SOURCES.txt +3 -1
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/requires.txt +1 -0
- easy_data_loader-0.1.4/tests/test_orchestrator.py +134 -0
- easy_data_loader-0.1.4/tests/test_validation.py +219 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/LICENSE +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/README.md +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/setup.cfg +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/__init__.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/cli.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/config_loader.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/data_inferrence.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/driver_detector.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/file_operations.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/log.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/dependency_links.txt +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/entry_points.txt +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/top_level.txt +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/tests/test_data_inference.py +0 -0
- {easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/tests/test_imports.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easy_data_loader
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Data transfer utilities between files and databases
|
|
5
5
|
Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -8,14 +8,17 @@ Classifier: Intended Audience :: Developers
|
|
|
8
8
|
Classifier: Topic :: Database
|
|
9
9
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
12
14
|
Classifier: Operating System :: OS Independent
|
|
13
|
-
Requires-Python: >=3.
|
|
15
|
+
Requires-Python: >=3.11
|
|
14
16
|
Description-Content-Type: text/markdown
|
|
15
17
|
License-File: LICENSE
|
|
16
18
|
Requires-Dist: click>=8.3.0
|
|
17
19
|
Requires-Dist: openpyxl>=3.1.5
|
|
18
20
|
Requires-Dist: pandas>=2.3.3
|
|
21
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
19
22
|
Requires-Dist: pyarrow>=22.0.0
|
|
20
23
|
Requires-Dist: pydantic>=2.12.5
|
|
21
24
|
Requires-Dist: pydantic-settings>=2.12.0
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "easy_data_loader"
|
|
3
|
-
version = "0.1.2"
|
|
3
|
+
version = "0.1.4"
|
|
4
4
|
description = "Data transfer utilities between files and databases"
|
|
5
5
|
authors = [{ name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com" }]
|
|
6
6
|
readme = "README.md"
|
|
7
|
-
requires-python = ">=3.
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
8
|
dependencies = [
|
|
9
9
|
"click>=8.3.0",
|
|
10
10
|
"openpyxl>=3.1.5",
|
|
11
11
|
"pandas>=2.3.3",
|
|
12
|
+
"psycopg2-binary>=2.9.11",
|
|
12
13
|
"pyarrow>=22.0.0",
|
|
13
14
|
"pydantic>=2.12.5",
|
|
14
15
|
"pydantic-settings>=2.12.0",
|
|
@@ -22,6 +23,8 @@ classifiers = [
|
|
|
22
23
|
"Topic :: Database",
|
|
23
24
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
24
25
|
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
25
28
|
"Programming Language :: Python :: 3.13",
|
|
26
29
|
"Operating System :: OS Independent",
|
|
27
30
|
]
|
|
@@ -20,3 +20,9 @@ class InvalidFileException(Exception):
|
|
|
20
20
|
def __init__(self, message: str = "The provided file is invalid or corrupted"):
|
|
21
21
|
self.message = message
|
|
22
22
|
super().__init__(self.message)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PipelineValidationError(Exception):
|
|
26
|
+
def __init__(self, message: str):
|
|
27
|
+
self.message = message
|
|
28
|
+
super().__init__(self.message)
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_connector.py
RENAMED
|
@@ -220,8 +220,109 @@ class SQLiteDatabaseConnector(LoggedComponent, DatabaseConnector):
|
|
|
220
220
|
self.engine.dispose()
|
|
221
221
|
|
|
222
222
|
|
|
223
|
+
class PostgresDatabaseConnector(LoggedComponent, DatabaseConnector):
|
|
224
|
+
"""Database connector to a PostgreSQL database"""
|
|
225
|
+
|
|
226
|
+
def __init__(self, config: ConnectionSettings):
|
|
227
|
+
super().__init__()
|
|
228
|
+
self.config = config
|
|
229
|
+
self.engine = self._create_engine(self.config)
|
|
230
|
+
self._test_engine()
|
|
231
|
+
|
|
232
|
+
def _build_connection_string(self) -> str:
|
|
233
|
+
"""Define the connection string from the given configuration"""
|
|
234
|
+
if not isinstance(self.config, ServerBasedConnectionSettings):
|
|
235
|
+
self.log_and_raise(
|
|
236
|
+
ValueError,
|
|
237
|
+
"PostgresDatabaseConnector requires ServerBasedConnectionSettings",
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
# Validate minimum parameters
|
|
241
|
+
if not all(
|
|
242
|
+
[self.config.conn_server, self.config.conn_database, self.config.conn_port]
|
|
243
|
+
):
|
|
244
|
+
self.log_and_raise(
|
|
245
|
+
ValueError,
|
|
246
|
+
"Postgres connection configuration is missing host, port or database name",
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
# encode special characters in password and username
|
|
250
|
+
user = (
|
|
251
|
+
urllib.parse.quote_plus(self.config.conn_username)
|
|
252
|
+
if self.config.conn_username
|
|
253
|
+
else ""
|
|
254
|
+
)
|
|
255
|
+
password = (
|
|
256
|
+
urllib.parse.quote_plus(self.config.conn_password)
|
|
257
|
+
if self.config.conn_password
|
|
258
|
+
else ""
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
host = self.config.conn_server
|
|
262
|
+
port = self.config.conn_port
|
|
263
|
+
database = self.config.conn_database
|
|
264
|
+
|
|
265
|
+
# Standard SQLAlchemy format: postgresql+psycopg2://user:password@host:port/dbname
|
|
266
|
+
if user and password:
|
|
267
|
+
return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
|
|
268
|
+
|
|
269
|
+
# trusted connection
|
|
270
|
+
return f"postgresql+psycopg2://{user}@{host}:{port}/{database}"
|
|
271
|
+
|
|
272
|
+
def _create_engine(self, settings: ConnectionSettings) -> Engine:
|
|
273
|
+
"""Create a sqlalchemy engine using the provided configuration"""
|
|
274
|
+
connection_string = self._build_connection_string()
|
|
275
|
+
|
|
276
|
+
try:
|
|
277
|
+
engine = create_engine(
|
|
278
|
+
connection_string,
|
|
279
|
+
pool_size=5,
|
|
280
|
+
max_overflow=10,
|
|
281
|
+
pool_timeout=30,
|
|
282
|
+
pool_recycle=3600,
|
|
283
|
+
echo=self.is_debug_enabled,
|
|
284
|
+
)
|
|
285
|
+
return engine
|
|
286
|
+
except Exception as e:
|
|
287
|
+
self.log_exception(
|
|
288
|
+
e, "Could not create Postgres engine from connection string"
|
|
289
|
+
)
|
|
290
|
+
raise
|
|
291
|
+
|
|
292
|
+
def get_engine(self) -> Engine:
|
|
293
|
+
"""Return the engine to be used"""
|
|
294
|
+
return self.engine
|
|
295
|
+
|
|
296
|
+
def _test_engine(self) -> bool:
|
|
297
|
+
"""Test if the connection works"""
|
|
298
|
+
if not isinstance(self.config, ServerBasedConnectionSettings):
|
|
299
|
+
self.log_and_raise(
|
|
300
|
+
ValueError,
|
|
301
|
+
"PostgresDatabaseConnector requires ServerBasedConnectionSettings",
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
try:
|
|
305
|
+
with self.engine.connect() as conn:
|
|
306
|
+
conn.execute(text("SELECT 1"))
|
|
307
|
+
self.logger.debug(
|
|
308
|
+
f"Postgres connection test successful: {self.config.conn_server} - {self.config.conn_database}"
|
|
309
|
+
)
|
|
310
|
+
return True
|
|
311
|
+
except Exception as e:
|
|
312
|
+
self.log_and_raise(
|
|
313
|
+
EngineTestException,
|
|
314
|
+
"Postgres connection test failed",
|
|
315
|
+
exception=str(e),
|
|
316
|
+
)
|
|
317
|
+
return False
|
|
318
|
+
|
|
319
|
+
def _dispose_engine(self) -> None:
|
|
320
|
+
self.logger.debug("Disposing of Postgres engine")
|
|
321
|
+
self.engine.dispose()
|
|
322
|
+
|
|
323
|
+
|
|
223
324
|
CONNECTOR_FACTORY: dict[ServerType, type] = {
|
|
224
325
|
ServerType.MSSQL: SqlServerDatabaseConnector,
|
|
225
326
|
ServerType.SQLITE: SQLiteDatabaseConnector,
|
|
226
|
-
|
|
327
|
+
ServerType.POSTGRESQL: PostgresDatabaseConnector,
|
|
227
328
|
}
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/database_operations.py
RENAMED
|
@@ -2,7 +2,7 @@ from typing import Dict
|
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pandas import DataFrame
|
|
5
|
-
from sqlalchemy import Column, MetaData, Table, inspect, text
|
|
5
|
+
from sqlalchemy import Column, MetaData, Table, TextClause, inspect, text
|
|
6
6
|
from sqlalchemy.engine import Engine
|
|
7
7
|
from sqlalchemy.types import TypeEngine
|
|
8
8
|
from sqlalchemy.schema import SchemaItem
|
|
@@ -74,12 +74,36 @@ class DatabaseOperations(LoggedComponent):
|
|
|
74
74
|
self.log_exception(e, f"Failed to create table {table_name}")
|
|
75
75
|
return False
|
|
76
76
|
|
|
77
|
+
def _build_procedure_sql(
|
|
78
|
+
self, dialect: str, procedure_name: str, params: dict
|
|
79
|
+
) -> TextClause:
|
|
80
|
+
"""Build dialect-specific SQL for calling a stored procedure."""
|
|
81
|
+
params_keys = list(params.keys())
|
|
82
|
+
|
|
83
|
+
if dialect == "mssql":
|
|
84
|
+
if params_keys:
|
|
85
|
+
params_str = ", ".join([f":{k}" for k in params_keys])
|
|
86
|
+
return text(f"EXEC {procedure_name} {params_str};")
|
|
87
|
+
return text(f"EXEC {procedure_name};")
|
|
88
|
+
elif dialect == "postgresql":
|
|
89
|
+
params_str = ", ".join([f":{k}" for k in params_keys])
|
|
90
|
+
return text(f"CALL {procedure_name}({params_str});")
|
|
91
|
+
elif dialect == "sqlite":
|
|
92
|
+
raise NotImplementedError("SQLite does not support stored procedures.")
|
|
93
|
+
else:
|
|
94
|
+
raise NotImplementedError(
|
|
95
|
+
f"Stored procedure execution is not supported for dialect: {dialect}"
|
|
96
|
+
)
|
|
97
|
+
|
|
77
98
|
def execute_stored_procedure(self, procedure_name: str, **kwargs) -> bool:
|
|
78
|
-
"""Execute a stored procedure using the engine connection
|
|
99
|
+
"""Execute a stored procedure using the engine connection.
|
|
100
|
+
|
|
101
|
+
Generates dialect-appropriate SQL based on the engine's dialect.
|
|
102
|
+
"""
|
|
79
103
|
self.logger.info(f"Executing stored procedure: {procedure_name}")
|
|
80
104
|
try:
|
|
81
|
-
|
|
82
|
-
sql =
|
|
105
|
+
dialect = self.engine.dialect.name
|
|
106
|
+
sql = self._build_procedure_sql(dialect, procedure_name, kwargs)
|
|
83
107
|
|
|
84
108
|
with self.engine.begin() as conn:
|
|
85
109
|
# self.engine.begin() is a context manager for the transaction
|
|
@@ -87,12 +111,14 @@ class DatabaseOperations(LoggedComponent):
|
|
|
87
111
|
conn.execute(sql, kwargs)
|
|
88
112
|
return True
|
|
89
113
|
except Exception as e:
|
|
90
|
-
self.
|
|
114
|
+
self.logger.error(
|
|
115
|
+
f"Failed to execute procedure {procedure_name} | Error: {str(e)}"
|
|
116
|
+
)
|
|
91
117
|
raise
|
|
92
118
|
|
|
93
119
|
def write_audit(self, table_name: str, entry: AuditEntry):
|
|
94
120
|
"""Write an audit entry to the database, ensuring the table exists first."""
|
|
95
|
-
self.logger.debug(f"Writing audit entry for execution: {entry.
|
|
121
|
+
self.logger.debug(f"Writing audit entry for execution: {entry.pipeline_id}")
|
|
96
122
|
|
|
97
123
|
try:
|
|
98
124
|
from sqlalchemy import (
|
|
@@ -108,7 +134,8 @@ class DatabaseOperations(LoggedComponent):
|
|
|
108
134
|
metadata = MetaData()
|
|
109
135
|
|
|
110
136
|
columns: list[SchemaItem] = [
|
|
111
|
-
Column("
|
|
137
|
+
Column("pipeline_id", String(36), primary_key=True),
|
|
138
|
+
Column("orchestrator_id", String(36)),
|
|
112
139
|
Column("pipeline_name", String(100)),
|
|
113
140
|
Column("status", String(50)),
|
|
114
141
|
Column("input_rows", Integer),
|
|
@@ -42,8 +42,8 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
|
|
|
42
42
|
|
|
43
43
|
conn_server: str
|
|
44
44
|
conn_database: str
|
|
45
|
-
conn_username: str
|
|
46
|
-
conn_password: str
|
|
45
|
+
conn_username: str | None = None
|
|
46
|
+
conn_password: str | None = None
|
|
47
47
|
conn_port: Annotated[int, Field(default=1433, ge=1, le=65535)]
|
|
48
48
|
|
|
49
49
|
model_config = SettingsConfigDict(extra="ignore")
|
|
@@ -62,6 +62,12 @@ class ServerBasedConnectionSettings(BaseConnectionSettings):
|
|
|
62
62
|
raise ValueError(
|
|
63
63
|
"Both username and password must be provided for authentication"
|
|
64
64
|
)
|
|
65
|
+
|
|
66
|
+
# If the username and password are captured as empty strings nullify them
|
|
67
|
+
# which will lead the connection to be created as trusted connection for Sql Server
|
|
68
|
+
if self.conn_username == "" and self.conn_password == "":
|
|
69
|
+
self.conn_username = None
|
|
70
|
+
self.conn_password = None
|
|
65
71
|
return self
|
|
66
72
|
|
|
67
73
|
|
|
@@ -128,6 +134,7 @@ class BasePipelineDefinition(BaseModel):
|
|
|
128
134
|
destination_table: Optional[str] = None
|
|
129
135
|
audit: Optional[str] = None # resource name for the audit database
|
|
130
136
|
validator: Optional[Any] = None # pydantic model class for validation
|
|
137
|
+
validation_fail: bool = False
|
|
131
138
|
|
|
132
139
|
# mapping of source columns to destination columns together with datatypes
|
|
133
140
|
columns: Dict[str, ColumnDefinition] = Field(default_factory=dict)
|
|
@@ -195,7 +202,8 @@ class ProcedureDefinition(BaseModel):
|
|
|
195
202
|
class AuditEntry(BaseModel):
|
|
196
203
|
"""Model representing an entry in the audit table"""
|
|
197
204
|
|
|
198
|
-
|
|
205
|
+
orchestrator_id: Optional[str] = None
|
|
206
|
+
pipeline_id: str
|
|
199
207
|
pipeline_name: str
|
|
200
208
|
status: str
|
|
201
209
|
input_rows: int = 0
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import uuid
|
|
1
|
+
from uuid import uuid4
|
|
2
2
|
|
|
3
3
|
from .config_loader import Configuration
|
|
4
4
|
from .log import LoggedComponent
|
|
@@ -17,6 +17,7 @@ class OrchestratorPipeline(LoggedComponent):
|
|
|
17
17
|
|
|
18
18
|
def __init__(self, orchestrator_name: str):
|
|
19
19
|
super().__init__()
|
|
20
|
+
self.orchestrator_id = str(uuid4())
|
|
20
21
|
self.config = Configuration()
|
|
21
22
|
definition = self.config.get_pipeline(orchestrator_name)
|
|
22
23
|
|
|
@@ -26,11 +27,10 @@ class OrchestratorPipeline(LoggedComponent):
|
|
|
26
27
|
)
|
|
27
28
|
|
|
28
29
|
self.definition: OrchestratorDefinition = definition
|
|
29
|
-
self.batch_id = str(uuid.uuid4())
|
|
30
30
|
|
|
31
31
|
def run(self) -> bool:
|
|
32
32
|
self.logger.info(
|
|
33
|
-
f"=== Starting Orchestrator: {self.definition.orchestrator_name} (
|
|
33
|
+
f"=== Starting Orchestrator: {self.definition.orchestrator_name} (Orchestrator: {self.orchestrator_id}) ==="
|
|
34
34
|
)
|
|
35
35
|
|
|
36
36
|
success = True
|
|
@@ -44,9 +44,13 @@ class OrchestratorPipeline(LoggedComponent):
|
|
|
44
44
|
|
|
45
45
|
# Instantiate and run
|
|
46
46
|
if isinstance(p_def, BasePipelineDefinition):
|
|
47
|
-
p_success = LoadPipeline(pipeline_name).run()
|
|
47
|
+
p_success = LoadPipeline(
|
|
48
|
+
pipeline_name, orchestrator_id=self.orchestrator_id
|
|
49
|
+
).run()
|
|
48
50
|
elif isinstance(p_def, ProcedureDefinition):
|
|
49
|
-
p_success = ProcedurePipeline(pipeline_name).run()
|
|
51
|
+
p_success = ProcedurePipeline(
|
|
52
|
+
pipeline_name, orchestrator_id=self.orchestrator_id
|
|
53
|
+
).run()
|
|
50
54
|
elif isinstance(p_def, OrchestratorDefinition):
|
|
51
55
|
self.logger.error(
|
|
52
56
|
f"[{self.definition.orchestrator_name}] -> Nested orchestrators are not supported."
|
|
@@ -14,6 +14,7 @@ from .models import (
|
|
|
14
14
|
FileBasedConnectionSettings,
|
|
15
15
|
FileType,
|
|
16
16
|
)
|
|
17
|
+
from .custom_exceptions import PipelineValidationError
|
|
17
18
|
from .pipeline_base import BasePipeline
|
|
18
19
|
|
|
19
20
|
|
|
@@ -24,7 +25,8 @@ class LoadPipeline(BasePipeline):
|
|
|
24
25
|
Inherits shared logic from BasePipeline.
|
|
25
26
|
"""
|
|
26
27
|
|
|
27
|
-
def __init__(self, pipeline_name: str):
|
|
28
|
+
def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
|
|
29
|
+
|
|
28
30
|
# Load definition from config
|
|
29
31
|
definition = Configuration().get_pipeline(pipeline_name)
|
|
30
32
|
if not isinstance(definition, BasePipelineDefinition):
|
|
@@ -32,7 +34,7 @@ class LoadPipeline(BasePipeline):
|
|
|
32
34
|
f"Pipeline '{pipeline_name}' is not a LoadPipeline definition."
|
|
33
35
|
)
|
|
34
36
|
|
|
35
|
-
super().__init__(definition)
|
|
37
|
+
super().__init__(definition, orchestrator_id=orchestrator_id)
|
|
36
38
|
self.definition: BasePipelineDefinition = definition
|
|
37
39
|
|
|
38
40
|
# initialize components
|
|
@@ -95,6 +97,12 @@ class LoadPipeline(BasePipeline):
|
|
|
95
97
|
# 2. TRANSFORM & VALIDATE
|
|
96
98
|
df = self._transform_step(df)
|
|
97
99
|
self.output_rows = len(df)
|
|
100
|
+
if df.empty:
|
|
101
|
+
self.logger.warning(
|
|
102
|
+
"No valid data remaining after validation! Pipeline will stop gracefully."
|
|
103
|
+
)
|
|
104
|
+
self._log_audit("SUCCESS")
|
|
105
|
+
return True
|
|
98
106
|
|
|
99
107
|
# 3. LOAD
|
|
100
108
|
load_success = self._load_step(df, inferred_dtypes)
|
|
@@ -111,6 +119,13 @@ class LoadPipeline(BasePipeline):
|
|
|
111
119
|
)
|
|
112
120
|
return True
|
|
113
121
|
|
|
122
|
+
except PipelineValidationError as e:
|
|
123
|
+
self.error_details = str(e)
|
|
124
|
+
self.logger.error(
|
|
125
|
+
f"Critical pipeline error - {self.definition.pipeline_name}: {str(e)}"
|
|
126
|
+
)
|
|
127
|
+
self._log_audit("FAILED")
|
|
128
|
+
return False
|
|
114
129
|
except Exception as e:
|
|
115
130
|
self.error_details = str(e)
|
|
116
131
|
self.log_exception(
|
|
@@ -192,18 +207,52 @@ class LoadPipeline(BasePipeline):
|
|
|
192
207
|
f"Starting data validation against model: {self.definition.validator.__name__}"
|
|
193
208
|
)
|
|
194
209
|
validated_rows = []
|
|
210
|
+
failed_rows = []
|
|
195
211
|
|
|
196
212
|
# Optimization: iterrows is slow because it creates a Series for each row.
|
|
197
213
|
# Converting to a list of dicts first is much faster for iteration.
|
|
198
214
|
records = df.to_dict(orient="records")
|
|
199
215
|
|
|
216
|
+
from pydantic import ValidationError
|
|
217
|
+
|
|
200
218
|
for i, record in enumerate(records):
|
|
201
219
|
try:
|
|
202
220
|
# Use model_validate for individual row validation
|
|
203
221
|
model_inst = self.definition.validator.model_validate(record)
|
|
204
222
|
validated_rows.append(model_inst.model_dump())
|
|
223
|
+
except ValidationError as ve:
|
|
224
|
+
# Format Pydantic validation error to be human friendly
|
|
225
|
+
errors_list = []
|
|
226
|
+
for err in ve.errors():
|
|
227
|
+
loc_str = " -> ".join(str(item) for item in err.get("loc", []))
|
|
228
|
+
msg = err.get("msg", "Validation error")
|
|
229
|
+
errors_list.append(f"{loc_str}: {msg}")
|
|
230
|
+
err_msg = ", ".join(errors_list)
|
|
231
|
+
failed_rows.append((i, record, err_msg))
|
|
205
232
|
except Exception as e:
|
|
206
|
-
|
|
233
|
+
failed_rows.append((i, record, str(e)))
|
|
234
|
+
|
|
235
|
+
if failed_rows:
|
|
236
|
+
total_failed = len(failed_rows)
|
|
237
|
+
total_records = len(records)
|
|
238
|
+
if self.definition.validation_fail:
|
|
239
|
+
first_errors = "; ".join(
|
|
240
|
+
[f"Row {idx}: {err}" for idx, _, err in failed_rows[:5]]
|
|
241
|
+
)
|
|
242
|
+
if len(failed_rows) > 5:
|
|
243
|
+
first_errors += f" ... and {len(failed_rows) - 5} more errors"
|
|
244
|
+
self.log_and_raise(
|
|
245
|
+
PipelineValidationError,
|
|
246
|
+
f"Validation failed for {total_failed} out of {total_records} rows: {first_errors}",
|
|
247
|
+
)
|
|
248
|
+
else:
|
|
249
|
+
self.logger.warning(
|
|
250
|
+
f"Validation failed for {total_failed} out of {total_records} rows."
|
|
251
|
+
)
|
|
252
|
+
for idx, record, err in failed_rows:
|
|
253
|
+
self.logger.debug(
|
|
254
|
+
f"Row {idx} failed validation: {err}. Row data: {record}"
|
|
255
|
+
)
|
|
207
256
|
|
|
208
257
|
df = pd.DataFrame(validated_rows)
|
|
209
258
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import uuid
|
|
2
1
|
from abc import ABC, abstractmethod
|
|
3
2
|
from typing import List, Optional, Union
|
|
3
|
+
from uuid import uuid4
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -27,10 +27,15 @@ class BasePipeline(LoggedComponent, ABC):
|
|
|
27
27
|
- Shared resource cleanup
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
-
def __init__(
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
definition: Union[BasePipelineDefinition, ProcedureDefinition],
|
|
33
|
+
orchestrator_id: Optional[str] = None,
|
|
34
|
+
):
|
|
31
35
|
super().__init__()
|
|
36
|
+
self.orchestrator_id = orchestrator_id
|
|
37
|
+
self.pipeline_id = str(uuid4())
|
|
32
38
|
self.definition = definition
|
|
33
|
-
self.execution_id = str(uuid.uuid4())
|
|
34
39
|
self.config = Configuration()
|
|
35
40
|
|
|
36
41
|
self._active_connectors: List[DatabaseConnector] = []
|
|
@@ -113,7 +118,8 @@ class BasePipeline(LoggedComponent, ABC):
|
|
|
113
118
|
self.logger.warning(f"Could not read file metadata for audit: {e}")
|
|
114
119
|
|
|
115
120
|
entry = AuditEntry(
|
|
116
|
-
|
|
121
|
+
orchestrator_id=self.orchestrator_id,
|
|
122
|
+
pipeline_id=self.pipeline_id,
|
|
117
123
|
pipeline_name=self.definition.pipeline_name,
|
|
118
124
|
status=status,
|
|
119
125
|
input_rows=self.input_rows,
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader/procedure_pipeline.py
RENAMED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
1
3
|
from .config_loader import Configuration
|
|
2
4
|
from .models import (
|
|
3
5
|
ProcedureDefinition,
|
|
@@ -12,7 +14,7 @@ class ProcedurePipeline(BasePipeline):
|
|
|
12
14
|
Pipeline specialized in executing database stored procedures.
|
|
13
15
|
"""
|
|
14
16
|
|
|
15
|
-
def __init__(self, pipeline_name: str):
|
|
17
|
+
def __init__(self, pipeline_name: str, orchestrator_id: Optional[str] = None):
|
|
16
18
|
# Load definition from config
|
|
17
19
|
definition = Configuration().get_pipeline(pipeline_name)
|
|
18
20
|
if not isinstance(definition, ProcedureDefinition):
|
|
@@ -20,7 +22,7 @@ class ProcedurePipeline(BasePipeline):
|
|
|
20
22
|
f"Pipeline '{pipeline_name}' is not a ProcedureDefinition."
|
|
21
23
|
)
|
|
22
24
|
|
|
23
|
-
super().__init__(definition)
|
|
25
|
+
super().__init__(definition, orchestrator_id=orchestrator_id)
|
|
24
26
|
self.definition: ProcedureDefinition = definition
|
|
25
27
|
|
|
26
28
|
# Set up primary database connection
|
|
@@ -61,8 +63,8 @@ class ProcedurePipeline(BasePipeline):
|
|
|
61
63
|
|
|
62
64
|
except Exception as e:
|
|
63
65
|
self.error_details = str(e)
|
|
64
|
-
self.
|
|
65
|
-
|
|
66
|
+
self.logger.error(
|
|
67
|
+
f"Procedure Pipeline failed: {self.definition.pipeline_name} | Error: {str(e)}"
|
|
66
68
|
)
|
|
67
69
|
|
|
68
70
|
# Set audit details for failure too
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easy_data_loader
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Data transfer utilities between files and databases
|
|
5
5
|
Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -8,14 +8,17 @@ Classifier: Intended Audience :: Developers
|
|
|
8
8
|
Classifier: Topic :: Database
|
|
9
9
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
12
14
|
Classifier: Operating System :: OS Independent
|
|
13
|
-
Requires-Python: >=3.
|
|
15
|
+
Requires-Python: >=3.11
|
|
14
16
|
Description-Content-Type: text/markdown
|
|
15
17
|
License-File: LICENSE
|
|
16
18
|
Requires-Dist: click>=8.3.0
|
|
17
19
|
Requires-Dist: openpyxl>=3.1.5
|
|
18
20
|
Requires-Dist: pandas>=2.3.3
|
|
21
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
19
22
|
Requires-Dist: pyarrow>=22.0.0
|
|
20
23
|
Requires-Dist: pydantic>=2.12.5
|
|
21
24
|
Requires-Dist: pydantic-settings>=2.12.0
|
|
@@ -23,4 +23,6 @@ src/easy_data_loader.egg-info/entry_points.txt
|
|
|
23
23
|
src/easy_data_loader.egg-info/requires.txt
|
|
24
24
|
src/easy_data_loader.egg-info/top_level.txt
|
|
25
25
|
tests/test_data_inference.py
|
|
26
|
-
tests/test_imports.py
|
|
26
|
+
tests/test_imports.py
|
|
27
|
+
tests/test_orchestrator.py
|
|
28
|
+
tests/test_validation.py
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import sqlite3
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from easy_data_loader.config_loader import Configuration
|
|
7
|
+
from easy_data_loader.models import (
|
|
8
|
+
BasePipelineDefinition,
|
|
9
|
+
OrchestratorDefinition,
|
|
10
|
+
FileBasedConnectionSettings,
|
|
11
|
+
ServerType,
|
|
12
|
+
)
|
|
13
|
+
from easy_data_loader.orchestrator import OrchestratorPipeline
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.fixture(autouse=True)
|
|
17
|
+
def reset_configuration_singleton():
|
|
18
|
+
"""Reset the Configuration singleton before and after each test."""
|
|
19
|
+
Configuration._instance = None
|
|
20
|
+
Configuration._initialized = False
|
|
21
|
+
yield
|
|
22
|
+
Configuration._instance = None
|
|
23
|
+
Configuration._initialized = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_orchestrator_passes_orchestrator_id_and_writes_audit():
|
|
27
|
+
# Create temporary file paths for our sqlite databases
|
|
28
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
29
|
+
src_db_path = os.path.join(tmpdir, "src.db")
|
|
30
|
+
dst_db_path = os.path.join(tmpdir, "dst.db")
|
|
31
|
+
audit_db_path = os.path.join(tmpdir, "audit.db")
|
|
32
|
+
|
|
33
|
+
# Create tables and put some data in the source database
|
|
34
|
+
conn_src = sqlite3.connect(src_db_path)
|
|
35
|
+
cursor_src = conn_src.cursor()
|
|
36
|
+
cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
|
|
37
|
+
cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
|
|
38
|
+
cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bob')")
|
|
39
|
+
conn_src.commit()
|
|
40
|
+
conn_src.close()
|
|
41
|
+
|
|
42
|
+
# Set up resources in Configuration
|
|
43
|
+
config = Configuration()
|
|
44
|
+
|
|
45
|
+
src_resource = FileBasedConnectionSettings(
|
|
46
|
+
conn_server_type=ServerType.SQLITE, file_path=src_db_path
|
|
47
|
+
)
|
|
48
|
+
dst_resource = FileBasedConnectionSettings(
|
|
49
|
+
conn_server_type=ServerType.SQLITE, file_path=dst_db_path
|
|
50
|
+
)
|
|
51
|
+
audit_resource = FileBasedConnectionSettings(
|
|
52
|
+
conn_server_type=ServerType.SQLITE, file_path=audit_db_path
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
config.resources["src_db"] = src_resource
|
|
56
|
+
config.resources["dst_db"] = dst_resource
|
|
57
|
+
config.resources["audit_db"] = audit_resource
|
|
58
|
+
|
|
59
|
+
# Define 2 pipelines that copy data
|
|
60
|
+
pipeline1_def = BasePipelineDefinition(
|
|
61
|
+
pipeline_name="pipeline1",
|
|
62
|
+
source="src_db",
|
|
63
|
+
source_sql="SELECT * FROM source_table",
|
|
64
|
+
destination="dst_db",
|
|
65
|
+
destination_table="target_table1",
|
|
66
|
+
audit="audit_db",
|
|
67
|
+
write_parameters={"if_exists": "replace", "index": False},
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
pipeline2_def = BasePipelineDefinition(
|
|
71
|
+
pipeline_name="pipeline2",
|
|
72
|
+
source="src_db",
|
|
73
|
+
source_sql="SELECT * FROM source_table",
|
|
74
|
+
destination="dst_db",
|
|
75
|
+
destination_table="target_table2",
|
|
76
|
+
audit="audit_db",
|
|
77
|
+
write_parameters={"if_exists": "replace", "index": False},
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Define orchestrator
|
|
81
|
+
orchestrator_def = OrchestratorDefinition(
|
|
82
|
+
orchestrator_name="my_orchestrator",
|
|
83
|
+
pipelines=["pipeline1", "pipeline2"],
|
|
84
|
+
fail_fast=True,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
config.pipelines["pipeline1"] = pipeline1_def
|
|
88
|
+
config.pipelines["pipeline2"] = pipeline2_def
|
|
89
|
+
config.pipelines["my_orchestrator"] = orchestrator_def
|
|
90
|
+
|
|
91
|
+
# Run the orchestrator
|
|
92
|
+
orchestrator = OrchestratorPipeline("my_orchestrator")
|
|
93
|
+
assert orchestrator.orchestrator_id is not None
|
|
94
|
+
|
|
95
|
+
success = orchestrator.run()
|
|
96
|
+
assert success is True
|
|
97
|
+
|
|
98
|
+
# Verify target tables were created and populated in destination
|
|
99
|
+
conn_dst = sqlite3.connect(dst_db_path)
|
|
100
|
+
cursor_dst = conn_dst.cursor()
|
|
101
|
+
|
|
102
|
+
cursor_dst.execute("SELECT COUNT(*) FROM target_table1")
|
|
103
|
+
assert cursor_dst.fetchone()[0] == 2
|
|
104
|
+
|
|
105
|
+
cursor_dst.execute("SELECT COUNT(*) FROM target_table2")
|
|
106
|
+
assert cursor_dst.fetchone()[0] == 2
|
|
107
|
+
|
|
108
|
+
conn_dst.close()
|
|
109
|
+
|
|
110
|
+
# Verify audit records in the audit database
|
|
111
|
+
conn_audit = sqlite3.connect(audit_db_path)
|
|
112
|
+
cursor_audit = conn_audit.cursor()
|
|
113
|
+
|
|
114
|
+
# Read the execution_audit table columns and rows
|
|
115
|
+
cursor_audit.execute(
|
|
116
|
+
"SELECT pipeline_id, orchestrator_id, pipeline_name, status FROM execution_audit"
|
|
117
|
+
)
|
|
118
|
+
rows = cursor_audit.fetchall()
|
|
119
|
+
assert len(rows) == 2
|
|
120
|
+
|
|
121
|
+
# Row 1 (pipeline1)
|
|
122
|
+
assert rows[0][1] == orchestrator.orchestrator_id
|
|
123
|
+
assert rows[0][2] == "pipeline1"
|
|
124
|
+
assert rows[0][3] == "SUCCESS"
|
|
125
|
+
|
|
126
|
+
# Row 2 (pipeline2)
|
|
127
|
+
assert rows[1][1] == orchestrator.orchestrator_id
|
|
128
|
+
assert rows[1][2] == "pipeline2"
|
|
129
|
+
assert rows[1][3] == "SUCCESS"
|
|
130
|
+
|
|
131
|
+
# Make sure they have distinct pipeline_ids but identical orchestrator_ids
|
|
132
|
+
assert rows[0][0] != rows[1][0]
|
|
133
|
+
|
|
134
|
+
conn_audit.close()
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import sqlite3
|
|
4
|
+
import pytest
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from easy_data_loader.config_loader import Configuration
|
|
8
|
+
from easy_data_loader.models import (
|
|
9
|
+
BasePipelineDefinition,
|
|
10
|
+
FileBasedConnectionSettings,
|
|
11
|
+
ServerType,
|
|
12
|
+
)
|
|
13
|
+
from easy_data_loader.pipeline import LoadPipeline
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Row schema handed to the pipelines below through ``validator=``.
class ValidationTestModel(BaseModel):
    # Integer identifier of the row.
    id: int
    # Names shorter than three characters are rejected by validation.
    name: str = Field(min_length=3)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture(autouse=True)
def reset_configuration_singleton():
    """Clear the ``Configuration`` singleton state around every test.

    The class-level ``_instance`` and ``_initialized`` attributes are reset
    once before the test body runs and once more after it finishes.
    """

    def _clear_singleton_state():
        Configuration._instance = None
        Configuration._initialized = False

    _clear_singleton_state()
    yield
    _clear_singleton_state()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_validation_fail_false_keeps_valid_rows():
    """With ``validation_fail=False`` the run succeeds and only valid rows load.

    The source table holds three rows, one of which has a name shorter than
    the ``min_length=3`` required by ``ValidationTestModel``. The test expects
    ``run()`` to return ``True`` and the destination table to contain exactly
    the two valid rows.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        src_db_path = os.path.join(tmpdir, "src.db")
        dst_db_path = os.path.join(tmpdir, "dst.db")
        audit_db_path = os.path.join(tmpdir, "audit.db")

        # Create source data:
        # - Row 1: Valid (name length 5 >= 3)
        # - Row 2: Invalid (name length 2 < 3)
        # - Row 3: Valid (name length 7 >= 3)
        conn_src = sqlite3.connect(src_db_path)
        try:
            cursor_src = conn_src.cursor()
            cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
            cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
            cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
            cursor_src.execute("INSERT INTO source_table VALUES (3, 'Charlie')")
            conn_src.commit()
        finally:
            # Always release the file handle so the temporary directory can
            # be removed even when the setup statements fail.
            conn_src.close()

        config = Configuration()

        src_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=src_db_path
        )
        dst_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
        )
        audit_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
        )

        config.resources["src_db"] = src_resource
        config.resources["dst_db"] = dst_resource
        config.resources["audit_db"] = audit_resource

        pipeline_def = BasePipelineDefinition(
            pipeline_name="test_pipeline",
            source="src_db",
            source_sql="SELECT * FROM source_table",
            destination="dst_db",
            destination_table="target_table",
            audit="audit_db",
            validator=ValidationTestModel,
            validation_fail=False,
            write_parameters={"if_exists": "replace", "index": False},
        )

        config.pipelines["test_pipeline"] = pipeline_def

        # Run pipeline
        pipeline = LoadPipeline("test_pipeline")
        success = pipeline.run()

        # Should be successful because validation_fail=False
        assert success is True

        # Verify destination table only has valid rows (1 and 3)
        conn_dst = sqlite3.connect(dst_db_path)
        try:
            cursor_dst = conn_dst.cursor()
            cursor_dst.execute("SELECT id, name FROM target_table ORDER BY id")
            rows = cursor_dst.fetchall()
        finally:
            # Close before asserting so a failed query or assertion never
            # leaves an open handle inside the temporary directory.
            conn_dst.close()

        assert len(rows) == 2
        assert rows[0] == (1, "Alice")
        assert rows[1] == (3, "Charlie")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_validation_fail_true_fails_pipeline():
    """With ``validation_fail=True`` an invalid source row fails the run.

    The source table holds one valid and one invalid row. The test expects
    ``run()`` to return ``False`` and ``pipeline.error_details`` to mention
    both the validation failure and the offending row.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        src_db_path = os.path.join(tmpdir, "src.db")
        dst_db_path = os.path.join(tmpdir, "dst.db")
        audit_db_path = os.path.join(tmpdir, "audit.db")

        # Create source data with an invalid row
        conn_src = sqlite3.connect(src_db_path)
        try:
            cursor_src = conn_src.cursor()
            cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
            cursor_src.execute("INSERT INTO source_table VALUES (1, 'Alice')")
            cursor_src.execute("INSERT INTO source_table VALUES (2, 'Bo')")
            conn_src.commit()
        finally:
            # Always release the file handle so the temporary directory can
            # be removed even when the setup statements fail.
            conn_src.close()

        config = Configuration()

        src_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=src_db_path
        )
        dst_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
        )
        audit_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
        )

        config.resources["src_db"] = src_resource
        config.resources["dst_db"] = dst_resource
        config.resources["audit_db"] = audit_resource

        pipeline_def = BasePipelineDefinition(
            pipeline_name="test_pipeline",
            source="src_db",
            source_sql="SELECT * FROM source_table",
            destination="dst_db",
            destination_table="target_table",
            audit="audit_db",
            validator=ValidationTestModel,
            validation_fail=True,
            write_parameters={"if_exists": "replace", "index": False},
        )

        config.pipelines["test_pipeline"] = pipeline_def

        # Run pipeline
        pipeline = LoadPipeline("test_pipeline")
        success = pipeline.run()

        # Should fail because validation_fail=True
        assert success is False
        assert pipeline.error_details is not None
        assert "Validation failed" in pipeline.error_details
        assert "Row 1:" in pipeline.error_details  # Row 2 (0-indexed row 1) failed
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def test_validation_fail_false_graceful_stop_when_all_rows_invalid():
    """With ``validation_fail=False`` and no valid rows the run still succeeds.

    Every source row violates ``ValidationTestModel``. The test expects
    ``run()`` to return ``True``, ``pipeline.output_rows`` to be ``0``, and
    the destination table to be either absent or empty.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        src_db_path = os.path.join(tmpdir, "src.db")
        dst_db_path = os.path.join(tmpdir, "dst.db")
        audit_db_path = os.path.join(tmpdir, "audit.db")

        # Create source data with ONLY invalid rows
        conn_src = sqlite3.connect(src_db_path)
        try:
            cursor_src = conn_src.cursor()
            cursor_src.execute("CREATE TABLE source_table (id INTEGER, name TEXT)")
            cursor_src.execute("INSERT INTO source_table VALUES (1, 'Bo')")
            cursor_src.execute("INSERT INTO source_table VALUES (2, 'Co')")
            conn_src.commit()
        finally:
            # Always release the file handle so the temporary directory can
            # be removed even when the setup statements fail.
            conn_src.close()

        config = Configuration()

        src_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=src_db_path
        )
        dst_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=dst_db_path
        )
        audit_resource = FileBasedConnectionSettings(
            conn_server_type=ServerType.SQLITE, file_path=audit_db_path
        )

        config.resources["src_db"] = src_resource
        config.resources["dst_db"] = dst_resource
        config.resources["audit_db"] = audit_resource

        pipeline_def = BasePipelineDefinition(
            pipeline_name="test_pipeline",
            source="src_db",
            source_sql="SELECT * FROM source_table",
            destination="dst_db",
            destination_table="target_table",
            audit="audit_db",
            validator=ValidationTestModel,
            validation_fail=False,
            write_parameters={"if_exists": "replace", "index": False},
        )

        config.pipelines["test_pipeline"] = pipeline_def

        # Run pipeline
        pipeline = LoadPipeline("test_pipeline")
        success = pipeline.run()

        # Should stop gracefully and return True because validation_fail=False
        assert success is True
        assert pipeline.output_rows == 0

        # Destination table should NOT have been loaded or have any data
        conn_dst = sqlite3.connect(dst_db_path)
        try:
            cursor_dst = conn_dst.cursor()
            # The table should not exist or be empty
            try:
                cursor_dst.execute("SELECT COUNT(*) FROM target_table")
            except sqlite3.OperationalError:
                # Table not even created, which is also correct/graceful
                count = 0
            else:
                count = cursor_dst.fetchone()[0]
        finally:
            # Close before asserting so a failing assertion cannot leave an
            # open handle inside the temporary directory.
            conn_dst.close()

        assert count == 0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{easy_data_loader-0.1.2 → easy_data_loader-0.1.4}/src/easy_data_loader.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|