easy-data-loader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ __version__ = "0.1.0"
2
+
3
+ from .pipeline import LoadPipeline
4
+ from .procedure_pipeline import ProcedurePipeline
5
+ from .orchestrator import OrchestratorPipeline
6
+ from .models import BasePipelineDefinition, ProcedureDefinition, ColumnDefinition, OrchestratorDefinition
7
+
8
+ __all__ = [
9
+ "LoadPipeline", "ProcedurePipeline", "OrchestratorPipeline",
10
+ "BasePipelineDefinition", "ProcedureDefinition", "ColumnDefinition", "OrchestratorDefinition"
11
+ ]
@@ -0,0 +1,302 @@
1
+ import click
2
+ import os
3
+ from pathlib import Path
4
+
5
+ # Integrated templates
6
# Sample load-pipeline module; `init` writes it to
# config/pipelines/pipeline_example.py.
# The `source` / `destination` values must equal the stem of a .env file in
# config/resources (and those stems must start with "file_" or "database_"),
# so they point at the file_example.env / database_example.env files that
# `init` scaffolds.
PIPELINE_TEMPLATE = """
from easy_data_loader.pipeline import LoadPipeline
from easy_data_loader.models import BasePipelineDefinition, ColumnDefinition
import pandas as pd
from sqlalchemy.types import DATETIME, INT, DECIMAL, NVARCHAR

example_pipeline = BasePipelineDefinition(
    pipeline_name="test_pipeline", # pipeline name to be used when initializing the Pipeline object

    # source name represented by the file having the source settings -> corresponds to a file in the config/resources folder
    source = "file_example",

    # "dbo.FactSales" ## if the source is a database then source sql defines the table name or a custom query to be executed
    # source_sql = "SELECT TOP 100 * FROM dbo.FactSales"

    # destination name represented by the file having the destination settings -> corresponds to a file in the config/resources folder
    destination="database_example",

    # if the destination is a database then here we define the destination table name
    destination_table="dbo.LargeSalesData",

    # columns definition if we are sending data to a database table
    columns={
        "transaction_id": ColumnDefinition(target_name="new_transaction_id", data_type=INT()),
        "date": ColumnDefinition(target_name="sales_date", data_type=DATETIME()),
        "customer_id": ColumnDefinition(target_name="id_customer", data_type=INT()),
        "product_category" : ColumnDefinition(target_name="category_of_product", data_type=NVARCHAR(100)),
        "units_sold" : ColumnDefinition(target_name="units", data_type=INT()),
        "unit_price" : ColumnDefinition(target_name="price", data_type=DECIMAL(6,2)),
        "raw_notes" : ColumnDefinition(target_name="notes", data_type=NVARCHAR(100))
    },

    # different parameters passed to the write functions
    write_parameters={"if_exists" : "replace", "index" : False},

    # different parameters passed to the read function
    read_parameters={"sep" : ";"}
)

def add_timestamp(df):
    # Adding an audit column during load
    df['insert_timestamp'] = pd.Timestamp.now()
    return df

example_pipeline.transform = add_timestamp
"""
52
+
53
# Sample procedure-pipeline module; `init` writes it to
# config/pipelines/procedure_example.py.
# `resource` must equal the stem of a .env file in config/resources, so it
# names the scaffolded database_example.env.
PROCEDURE_TEMPLATE = """
from easy_data_loader.models import ProcedureDefinition

example_procedure = ProcedureDefinition(
    pipeline_name="example_procedure",
    resource="database_example",
    procedures=[
        ("dbo.sp_UpdateSales", {"year": 2024}),
        ("dbo.sp_ArchiveOldData", {})
    ]
)
"""
65
+
66
# Sample orchestrator module; `init` writes it to
# config/pipelines/orchestrator_example.py.
# Pipelines are looked up by the stem of their file in config/pipelines, so
# the list names the scaffolded pipeline_example.py / procedure_example.py.
# NOTE(review): assumes the orchestrator resolves entries through
# Configuration.get_pipeline - confirm against orchestrator.py.
ORCHESTRATOR_TEMPLATE = """
from easy_data_loader.models import OrchestratorDefinition

example_orchestrator = OrchestratorDefinition(
    orchestrator_name="example_orchestrator",
    pipelines=[
        "pipeline_example",
        "procedure_example"
    ],
    fail_fast=True
)
"""
78
+
79
+
80
# Sample database resource file; `init` writes it to
# config/resources/database_example.env. The values are sample settings to
# be replaced with real connection details.
DATABASE_ENV = """
# database resource definition
CONN_SERVER_TYPE=MSSQL
CONN_SERVER=.
CONN_DATABASE=test_database
CONN_USERNAME=my_user
CONN_PASSWORD=my_password
CONN_PORT=1433
"""
89
+
90
# Sample file resource; `init` writes it to
# config/resources/file_example.env.
FILE_ENV = """
# file resource definition
FILE_TYPE=CSV
FOLDER_PATH=./data/imports
FILE_NAME=large_sales_data
"""
96
+
97
# Sample entry script; `init` writes it to main.py in the project root.
# Pipeline names are the stems of the files in config/pipelines, so the
# script refers to the scaffolded pipeline_example.py / procedure_example.py.
MAIN = """
from easy_data_loader.pipeline import LoadPipeline

# Run an ETL pipeline
LoadPipeline(pipeline_name="pipeline_example").run()

# Run a procedure pipeline
# from easy_data_loader.procedure_pipeline import ProcedurePipeline
# ProcedurePipeline(pipeline_name="procedure_example").run()
"""
107
+
108
# Root command group. The docstring doubles as the CLI help text, so it is
# kept verbatim; sub-commands attach themselves via @main.command().
@click.group()
def main():
    """Easy Data Loader CLI - ETL instrument between files and databases"""
    return None
112
+
113
@main.command()
def init():
    """Initialize folder structure and sample files"""
    # The docstring above is shown as the command's help text.
    root = Path.cwd()

    # Both configuration folders must exist before any sample is written.
    for sub_dir in ('config/resources', 'config/pipelines'):
        root.joinpath(sub_dir).mkdir(parents=True, exist_ok=True)

    # Relative target path -> template text to write there.
    scaffold = {
        "config/pipelines/pipeline_example.py": PIPELINE_TEMPLATE,
        "config/pipelines/procedure_example.py": PROCEDURE_TEMPLATE,
        "config/pipelines/orchestrator_example.py": ORCHESTRATOR_TEMPLATE,
        "config/resources/database_example.env": DATABASE_ENV,
        "config/resources/file_example.env": FILE_ENV,
        "main.py": MAIN,
    }

    for rel_path, template in scaffold.items():
        target = root / rel_path
        if target.exists():
            # Existing files are left untouched.
            click.echo(f"Skipped: {rel_path} (already exists)")
            continue

        with open(target, "w", encoding="utf-8") as handle:
            handle.write(template)
        click.echo(f"Created: {rel_path}")

    click.echo("\nProject initialized successfully!")
143
+
144
@main.command()
def list():
    """List all discovered resources and pipelines"""
    # Imported lazily so the other commands do not pay for config loading.
    from .config_loader import Configuration

    config = Configuration()

    # (heading, discovery callable) pairs, printed in this order. Discovery
    # runs only after its heading has been echoed.
    sections = (
        ("--- Discovered Resources ---", config.get_all_resources),
        ("\n--- Discovered Pipelines ---", config.get_all_pipelines),
    )
    for heading, discover in sections:
        click.echo(heading)
        for name in discover():
            click.echo(f" - {name}")
157
+
158
+
159
@main.command()
@click.argument('resource_name')
@click.argument('table_name')
def inspect_db(resource_name, table_name):
    """Inspect a database table and generate ColumnDefinition code"""
    from .config_loader import Configuration
    from .database_connector import CONNECTOR_FACTORY
    from .database_operations import DatabaseOperations
    from .models import ConnectionSettings

    resource = Configuration().get_resource(resource_name)

    # Guard: only database resources can be inspected.
    if not isinstance(resource, ConnectionSettings):
        click.echo(f"Error: Resource '{resource_name}' is not a database connection.")
        return

    # Look up the connector factory for this server type, then hand its
    # engine to the operations helper.
    make_connector = CONNECTOR_FACTORY[resource.conn_server_type]
    engine = make_connector(resource).get_engine()
    schema = DatabaseOperations(engine).inspect_table(table_name)

    if not schema:
        click.echo(f"No columns found for table '{table_name}'.")
        return

    # Emit a ready-to-paste `columns={...}` snippet, one entry per column.
    click.echo(f"\n# Suggested Column definitions for {table_name}:")
    click.echo("columns={")
    for column, dtype in schema.items():
        click.echo(f' "{column}": ColumnDefinition(target_name="{column}", data_type={dtype}),')
    click.echo("}")
191
+
192
+
193
@main.command()
def run_all():
    """Run all discovered pipelines and show status summary"""
    from .config_loader import Configuration
    from .pipeline import LoadPipeline
    from .procedure_pipeline import ProcedurePipeline
    from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition

    config = Configuration()
    pipelines = config.get_all_pipelines()

    if not pipelines:
        click.echo("No pipelines discovered.")
        return

    click.echo(f"🚀 Running {len(pipelines)} discovered pipelines...\n")

    # Pipeline name -> final status text, in execution order.
    results = {}
    for name in pipelines:
        click.echo(f"Pipeline: {name} ... ", nl=False)
        try:
            definition = config.get_pipeline(name)

            # Dispatch on the definition type; an unrecognised type counts
            # as a failure.
            succeeded = False
            if isinstance(definition, BasePipelineDefinition):
                succeeded = LoadPipeline(name).run()
            elif isinstance(definition, ProcedureDefinition):
                succeeded = ProcedurePipeline(name).run()
            elif isinstance(definition, OrchestratorDefinition):
                from .orchestrator import OrchestratorPipeline
                succeeded = OrchestratorPipeline(name).run()

            outcome = "SUCCESS" if succeeded else "FAILED"
        except Exception as exc:
            # One broken pipeline must not stop the remaining ones.
            outcome = f"ERROR: {exc!s}"

        results[name] = outcome
        click.echo(outcome)

    # Summary table.
    click.echo("\n" + "=" * 40)
    click.echo(f"{'PIPELINE':<25} | {'STATUS'}")
    click.echo("-" * 40)
    for name, status in results.items():
        click.echo(f"{name:<25} | {status}")
237
+
238
@main.command()
@click.argument('orchestrator_name')
def run_orchestrator(orchestrator_name):
    """Run a specific orchestrator by name"""
    from .orchestrator import OrchestratorPipeline

    try:
        # Pick the outcome message first, then print it once.
        if OrchestratorPipeline(orchestrator_name).run():
            message = f"✅ Orchestrator '{orchestrator_name}' completed successfully."
        else:
            message = f"❌ Orchestrator '{orchestrator_name}' failed."
        click.echo(message)
    except Exception as exc:
        # Report the error text instead of letting the exception propagate.
        click.echo(f"💥 Error: {exc!s}")
252
+
253
+
254
@main.command()
def validate_resources():
    """Validate all configured resources"""
    from .config_loader import Configuration
    from .database_connector import CONNECTOR_FACTORY
    from .models import ConnectionSettings, FileSettings

    resources = Configuration().get_all_resources()

    if not resources:
        click.echo("No resources found.")
        return

    click.echo(f"🔍 Validating {len(resources)} resources...\n")

    # Resource name -> status text, in validation order.
    results = {}
    for name, resource in resources.items():
        click.echo(f"Resource: {name} ... ", nl=False)
        try:
            if isinstance(resource, ConnectionSettings):
                # Building the connector is the check: per the original
                # author's note it tests the connection in __init__, so
                # reaching the next line means the test passed.
                probe = CONNECTOR_FACTORY[resource.conn_server_type](resource)
                status = "OK (Connected)"
            elif isinstance(resource, FileSettings):
                # File resources only need their folder to exist.
                if not resource.folder_path.exists():
                    raise ValueError(f"Path does not exist: {resource.folder_path}")
                status = "OK (Path Exists)"
            else:
                status = "UNKNOWN TYPE"
        except Exception as exc:
            status = f"FAILED: {exc!s}"

        results[name] = status
        click.echo(status)

    # Summary table.
    click.echo("\n" + "=" * 60)
    click.echo(f"{'RESOURCE':<30} | {'STATUS'}")
    click.echo("-" * 60)
    for name, status in results.items():
        click.echo(f"{name:<30} | {status}")
299
+
300
+
301
+ if __name__ == "__main__":
302
+ main()
@@ -0,0 +1,184 @@
1
+ import importlib.util
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Union
4
+ from types import ModuleType
5
+ from dotenv import dotenv_values
6
+
7
+ from .log import LoggedComponent
8
+ from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition, ConnectionSettings, FileSettings, ResourceConfig
9
+
10
+
11
class Configuration(LoggedComponent):
    """
    Configuration manager with lazy loading capabilities.

    Resources (``<config_dir>/resources/*.env``) and pipeline definitions
    (``<config_dir>/pipelines/*.py``) are loaded only when requested and are
    cached in memory afterwards.

    This class implements the Singleton pattern: every ``Configuration(...)``
    call returns the same instance, and only the first call's ``config_dir``
    takes effect.
    """
    _instance = None
    _initialized = False

    def __new__(cls, *args, **kwargs):
        # Create the shared instance once; later calls reuse it.
        if cls._instance is None:
            cls._instance = super(Configuration, cls).__new__(cls)
        return cls._instance

    def __init__(self, config_dir: str = "./config"):
        """
        Set up the empty caches on first construction.

        Args:
            config_dir: Root folder holding the ``resources`` and
                ``pipelines`` sub-folders. Ignored once the singleton has
                been initialized.
        """
        # __init__ runs on every Configuration() call; the guard keeps the
        # already-populated caches from being reset.
        if not self._initialized:
            super().__init__()
            self.config_dir = Path(config_dir)
            self.resources: Dict[str, ResourceConfig] = {}
            self.pipelines: Dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]] = {}

            self.logger.debug(f"Initializing configuration from directory: {config_dir}")
            self._initialized = True

    def _load_env_file(self, env_file: Path) -> ResourceConfig:
        """
        Load resource settings from a specific ``.env`` file.

        The settings class is chosen from the file name prefix:
        ``database_*`` -> ``ConnectionSettings``, ``file_*`` -> ``FileSettings``.
        Any other name is reported through ``log_and_raise`` as a ``ValueError``.
        """
        self.logger.debug(f"Loading environment file: {env_file}")

        env_file_name = env_file.stem

        if env_file_name.startswith('database_'):
            return ConnectionSettings(_env_file=[env_file])  # type: ignore
        if env_file_name.startswith('file_'):
            return FileSettings(_env_file=[env_file])  # type: ignore

        self.log_and_raise(ValueError,
            f"Failed to load env file: {env_file.name}. "
            f"Resource files must start with 'database_' or 'file_' prefix."
        )

    def _import_module(self, module_file: Path) -> ModuleType:
        """
        Dynamically import a Python module from a file path.

        The module is named after the file stem and executed immediately.
        A missing spec or loader is reported through ``log_and_raise`` as an
        ``ImportError``.
        """
        self.logger.debug(f"Importing configuration from {module_file}")

        spec = importlib.util.spec_from_file_location(module_file.stem, module_file)

        if spec is None:
            self.log_and_raise(
                ImportError,
                f"Could not create module spec for {module_file}",
                file_path=str(module_file),
            )

        if spec.loader is not None:
            module = importlib.util.module_from_spec(spec)

            # Executing the module runs the user's pipeline definition code.
            spec.loader.exec_module(module)
            self.logger.debug(
                f"Succesfully imported configuration from {str(module_file)}"
            )
            return module
        else:
            self.log_and_raise(
                ImportError,
                f"Module spec has no loader {module_file}",
                file_path=str(module_file),
            )

    def get_resource(self, resource_name: str) -> ResourceConfig:
        """
        Retrieve a resource (database connection or file settings) by name.

        Uses lazy loading: checks memory first, then attempts to load
        ``<config_dir>/resources/<resource_name>.env``. Load failures are
        logged and re-raised; a missing file is reported through
        ``log_and_raise`` as a ``ValueError``.
        """
        self.logger.debug(f"Retrieving connection by name: {resource_name}")

        # 1. Check if already loaded
        if resource_name in self.resources:
            return self.resources[resource_name]

        # 2. Try to load from file
        resource_file = self.config_dir / "resources" / f"{resource_name}.env"
        if resource_file.exists():
            try:
                resource = self._load_env_file(resource_file)
                self.resources[resource_name] = resource
                self.logger.info(f"Lazily loaded resource: {resource_name}")
                return resource
            except Exception as e:
                self.log_exception(e, f"Failed to load resource: {resource_name}")
                raise

        # 3. Not found
        self.log_and_raise(ValueError,
            f"Resource not found: {resource_name}. "
            f"Checked path: {resource_file}"
        )

    def get_pipeline(self, pipeline_name: str) -> Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]:
        """
        Retrieve a pipeline definition by name.

        Uses lazy loading: checks memory first, then imports
        ``<config_dir>/pipelines/<pipeline_name>.py`` and returns the first
        module attribute (in ``dir()`` order) that is a pipeline, procedure
        or orchestrator definition. Import failures are logged and re-raised;
        a missing file or a module without any definition object is reported
        through ``log_and_raise`` as a ``ValueError``.
        """
        self.logger.debug(f"Retriving pipeline definition by name: {pipeline_name}")

        # 1. Check if already loaded
        if pipeline_name in self.pipelines:
            return self.pipelines[pipeline_name]

        # 2. Try to load from file
        pipeline_file = self.config_dir / "pipelines" / f"{pipeline_name}.py"
        if pipeline_file.exists():
            try:
                config_module = self._import_module(pipeline_file)
                # Find the definition in the module
                for attr_name in dir(config_module):
                    attr = getattr(config_module, attr_name)
                    if isinstance(attr, (BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition)):
                        self.pipelines[pipeline_name] = attr
                        self.logger.info(f"Lazily loaded pipeline: {pipeline_name}")
                        return attr
                # The file imported fine but holds no definition object; say
                # so explicitly, because the error raised below only talks
                # about the path that was checked.
                self.logger.warning(
                    f"No pipeline definition object found in {pipeline_file}"
                )
            except Exception as e:
                self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
                raise

        # 3. Not found
        self.log_and_raise(ValueError,
            f"Pipeline not found: {pipeline_name}. "
            f"Checked path: {pipeline_file}")

    def get_all_resources(self) -> dict[str, ResourceConfig]:
        """
        Retrieve all resources.

        Scans the resources directory for ``*.env`` files and loads those not
        cached yet. Files that fail to load are logged as warnings and
        skipped. Returns the internal cache dict keyed by file stem.
        """
        resources_dir = self.config_dir / "resources"
        if not resources_dir.exists():
            return self.resources

        # Discover all .env files
        for env_file in resources_dir.glob("*.env"):
            # Simple logging to debug discovery
            self.logger.debug(f"Found potential resource file: {env_file.name}")

            resource_name = env_file.stem
            if resource_name not in self.resources:
                try:
                    self.resources[resource_name] = self._load_env_file(env_file)
                except Exception as e:
                    self.logger.warning(f"Failed to load resource {resource_name}: {e}")

        return self.resources

    def get_all_pipelines(self) -> dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]]:
        """
        Retrieve all pipelines.

        Scans the pipelines directory for ``*.py`` files and lazily loads
        those not cached yet. Files that fail to load are logged as warnings
        and skipped. Returns the internal cache dict keyed by file stem; its
        values may be pipeline, procedure or orchestrator definitions.
        """
        pipelines_dir = self.config_dir / "pipelines"
        if not pipelines_dir.exists():
            return self.pipelines

        for pipeline_file in pipelines_dir.glob("*.py"):
            self.logger.debug(f"Found potential pipeline file: {pipeline_file.name}")

            pipeline_name = pipeline_file.stem
            if pipeline_name not in self.pipelines:
                # get_pipeline caches the definition as a side effect.
                try:
                    self.get_pipeline(pipeline_name)
                except Exception as e:
                    self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")

        return self.pipelines
@@ -0,0 +1,21 @@
1
class DriverNotFoundException(Exception):
    """Signals that an ODBC driver is not available.

    The text is exposed both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, message: str = "ODBC driver not available"):
        super().__init__(message)
        self.message = message
5
+
6
+
7
class EngineTestException(Exception):
    """Signals that an engine did not pass its connection test.

    The text is exposed both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, message: str = "Engine has not passed connection test"):
        super().__init__(message)
        self.message = message
11
+
12
+
13
class DatabaseOperationException(Exception):
    """Error describing a failed database operation.

    Attributes:
        operation: Name of the operation, exactly as passed by the caller.
        message: Human-readable description; this is also ``str(exc)``.
    """

    def __init__(self, operation: str, message: str):
        # Keep the operation name so handlers can inspect which step failed.
        self.operation = operation
        self.message = message
        # Only the message goes to Exception so str(exc) stays the plain text.
        super().__init__(self.message)

    def __reduce__(self):
        # Exception's default reduce replays only ``args`` (the message),
        # which cannot satisfy the two-argument constructor. Supplying both
        # values makes copy and pickle round-trips work.
        return (type(self), (self.operation, self.message))
17
+
18
class InvalidFileException(Exception):
    """Signals that a provided file is invalid or corrupted.

    The text is exposed both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, message: str = "The provided file is invalid or corrupted"):
        super().__init__(message)
        self.message = message