easy-data-loader 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easy_data_loader-0.1.2/PKG-INFO +110 -0
- easy_data_loader-0.1.2/README.md +84 -0
- easy_data_loader-0.1.2/pyproject.toml +43 -0
- easy_data_loader-0.1.2/src/easy_data_loader/__init__.py +21 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/cli.py +52 -29
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/config_loader.py +69 -36
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/custom_exceptions.py +1 -0
- easy_data_loader-0.1.2/src/easy_data_loader/data_inferrence.py +897 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/database_connector.py +57 -20
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/database_operations.py +36 -17
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/driver_detector.py +2 -1
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/file_operations.py +90 -34
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/log.py +2 -1
- easy_data_loader-0.1.2/src/easy_data_loader/models.py +238 -0
- easy_data_loader-0.1.2/src/easy_data_loader/orchestrator.py +80 -0
- easy_data_loader-0.1.2/src/easy_data_loader/pipeline.py +231 -0
- easy_data_loader-0.1.2/src/easy_data_loader/pipeline_base.py +138 -0
- easy_data_loader-0.1.2/src/easy_data_loader/procedure_pipeline.py +75 -0
- easy_data_loader-0.1.2/src/easy_data_loader.egg-info/PKG-INFO +110 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/SOURCES.txt +2 -0
- easy_data_loader-0.1.2/src/easy_data_loader.egg-info/entry_points.txt +2 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/requires.txt +0 -1
- easy_data_loader-0.1.2/tests/test_data_inference.py +843 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/tests/test_imports.py +3 -0
- easy_data_loader-0.1.0/PKG-INFO +0 -52
- easy_data_loader-0.1.0/README.md +0 -32
- easy_data_loader-0.1.0/pyproject.toml +0 -37
- easy_data_loader-0.1.0/src/easy_data_loader/__init__.py +0 -11
- easy_data_loader-0.1.0/src/easy_data_loader/models.py +0 -168
- easy_data_loader-0.1.0/src/easy_data_loader/orchestrator.py +0 -59
- easy_data_loader-0.1.0/src/easy_data_loader/pipeline.py +0 -169
- easy_data_loader-0.1.0/src/easy_data_loader/pipeline_base.py +0 -121
- easy_data_loader-0.1.0/src/easy_data_loader/procedure_pipeline.py +0 -56
- easy_data_loader-0.1.0/src/easy_data_loader.egg-info/PKG-INFO +0 -52
- easy_data_loader-0.1.0/src/easy_data_loader.egg-info/entry_points.txt +0 -2
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/LICENSE +0 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/setup.cfg +0 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/dependency_links.txt +0 -0
- {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easy_data_loader
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Data transfer utilities between files and databases
|
|
5
|
+
Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Topic :: Database
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.13
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: click>=8.3.0
|
|
17
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
18
|
+
Requires-Dist: pandas>=2.3.3
|
|
19
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
20
|
+
Requires-Dist: pydantic>=2.12.5
|
|
21
|
+
Requires-Dist: pydantic-settings>=2.12.0
|
|
22
|
+
Requires-Dist: pyodbc>=5.2.0
|
|
23
|
+
Requires-Dist: python-dotenv>=1.1.1
|
|
24
|
+
Requires-Dist: sqlalchemy>=2.0.43
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# Easy Data Loader 🚀
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
[](https://badge.fury.io/py/easy-data-loader)
|
|
31
|
+
[](https://opensource.org/licenses/MIT)
|
|
32
|
+

|
|
33
|
+
|
|
34
|
+
**Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various file data sources (csv, xlsx, parquet, orc) and databases (MSSQL, PostgreSQL and others).
|
|
35
|
+
|
|
36
|
+
## ✨ Key Features
|
|
37
|
+
- **Declarative Configuration**: Manage connections and pipelines through simple python files and `.env` resources.
|
|
38
|
+
- **Integrated CLI**: Initialize a standardized project structure with a single command.
|
|
39
|
+
- **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
|
|
40
|
+
- **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
|
|
41
|
+
- **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 📦 Installation
|
|
46
|
+
|
|
47
|
+
Install directly via `pip` or `uv`:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install easy_data_loader
|
|
51
|
+
uv add easy_data_loader
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 🚀 Getting Started
|
|
55
|
+
|
|
56
|
+
1. Initialize a new project structure to generate template configurations:
|
|
57
|
+
```bash
|
|
58
|
+
easy-data-loader init
|
|
59
|
+
```
|
|
60
|
+
2. Review the generated `config/` folders for sample resources and pipelines.
|
|
61
|
+
3. Run all discovered pipelines across the active configurations:
|
|
62
|
+
```bash
|
|
63
|
+
easy-data-loader run_all
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## ✔️ Generic concepts
|
|
67
|
+
|
|
68
|
+
`easy_data_loader` uses `resources` as a way to define a file or a database. The resources can represent either a source or a destination, making possible the following ETL scenarios: file -> file, file -> database, database -> file, database -> database.
|
|
69
|
+
|
|
70
|
+
The `easy_data_loader` project initializer will create the predefined folder structure `/config/resources`, where the resources are expected to be defined following the current convention: the file type is `.env` and the file name must be prefixed with the resource type `file_` or `database_`. The predefined folder structure together with the naming convention enables `easy_data_loader` to find and load all resources.
|
|
71
|
+
|
|
72
|
+
A secondary predefined folder `/config/pipelines` will contain the pipeline definition files, which are regular Python files. There are 3 types of pipelines that can be defined:
|
|
73
|
+
- `LoadPipeline` the main pipeline type which transports data from source to destination
|
|
74
|
+
- `ProcedurePipeline` a pipeline dedicated for executing stored procedures inside a database
|
|
75
|
+
- `OrchestratorPipeline` a pipeline that can execute a group of pipelines sequentially
|
|
76
|
+
|
|
77
|
+
## LoadPipeline
|
|
78
|
+
|
|
79
|
+
In order to define a `LoadPipeline` we must use the `BasePipelineDefinition` from `easy_data_loader` as depicted in the example pipelines created by the initializer.
|
|
80
|
+
In the simplest form there are only a few mandatory parameters:
|
|
81
|
+
- `pipeline_name : str` - this name will be used to execute the pipeline
|
|
82
|
+
- `source : str` - the file name (without extension) corresponding to the resource to be used as the data source
|
|
83
|
+
- `destination : str` - the file name (without extension) corresponding to the resource to be used as the data destination
|
|
84
|
+
|
|
85
|
+
If either the source or the destination is a database, then additional parameters become mandatory:
|
|
86
|
+
- `source_sql : str` - can be a table name or a specific query in the SQL dialect of the source database flavor
|
|
87
|
+
- `destination_table : str` - table name where the data will be inserted
|
|
88
|
+
|
|
89
|
+
There are many other aspects of the pipeline that can be defined:
|
|
90
|
+
- `audit : str` - the pipeline has built-in audit functionality: after the pipeline completes, it records certain information in a SQLite database. If the user desires, the same information can be recorded in a database `resource`
|
|
91
|
+
- `validator: Pydantic BaseModel` - the data read from the source `resource` can be validated using an arbitrarily defined Pydantic model before it is written to the destination
|
|
92
|
+
- `columns : Dict[str, ColumnDefinition]` - this parameter is used for strict control over how the data is written to the destination; it has the dual purpose of renaming the columns and explicitly defining the data types (mainly for inserting into a database table); the `ColumnDefinition` is constructed with an optional `target_name: str` for renaming columns and / or a `data_type : SqlAlchemy Type`, thus controlling column data types, lengths, precision, etc.
|
|
93
|
+
- `read_parameters : Dict[str, Any]` and `write_parameters : Dict[str, Any]` - these parameters control how the data is read from the source and written to the destination, and provide an easy way to use special delimiters for files, drop and recreate the database table, etc. `easy_data_loader` uses pandas as the transport layer, therefore the read and write parameters will be passed to the corresponding read and write functions supported by pandas.
|
|
94
|
+
- the pipeline has a set of predefined hooks allowing the execution of functions at specific moments during the execution: `file_pre_process : Callable` - executed before the file is read into the pandas DataFrame (e.g. unzip the file); `transform : Callable` - perform data transformation over the data already in the pandas DataFrame (requires pandas methods); `file_post_process : Callable` - after the pipeline completes and the data is written to the destination perform post processing on the source file (e.g. move the file to another folder)
|
|
95
|
+
|
|
96
|
+
## ProcedurePipeline
|
|
97
|
+
|
|
98
|
+
This secondary pipeline type is responsible for executing one or more stored procedures inside a database.
|
|
99
|
+
To define one we need to use the `ProcedureDefinition` with the following parameters:
|
|
100
|
+
- `pipeline_name : str` - this name will be used to execute the pipeline
|
|
101
|
+
- `audit : str, optional` - database resource name where the audit info will be recorded
|
|
102
|
+
- `resource : str` - database resource name where the stored procedure(s) will be executed
|
|
103
|
+
- `procedures : List[tuple(str, Optional[Dict[str, Any]])]` - list of one or more stored procedures along with optional procedure parameters as dictionaries
|
|
104
|
+
|
|
105
|
+
## OrchestratorPipeline
|
|
106
|
+
|
|
107
|
+
This pipeline type is responsible for sequentially executing a set of pipelines, `LoadPipeline`s and / or `ProcedurePipeline`s. It is very simple to define using the `OrchestratorDefinition` with:
|
|
108
|
+
- `orchestrator_name : str` - name by which the orchestrator is executed
|
|
109
|
+
- `pipelines : List[str]` - list of pipelines to execute sequentially
|
|
110
|
+
- `fail_fast : bool, Default True` - if any of the pipelines fail the rest of the pipelines in the list do not get executed
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Easy Data Loader 🚀
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
[](https://badge.fury.io/py/easy-data-loader)
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+

|
|
7
|
+
|
|
8
|
+
**Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various file data sources (csv, xlsx, parquet, orc) and databases (MSSQL, PostgreSQL and others).
|
|
9
|
+
|
|
10
|
+
## ✨ Key Features
|
|
11
|
+
- **Declarative Configuration**: Manage connections and pipelines through simple python files and `.env` resources.
|
|
12
|
+
- **Integrated CLI**: Initialize a standardized project structure with a single command.
|
|
13
|
+
- **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
|
|
14
|
+
- **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
|
|
15
|
+
- **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 📦 Installation
|
|
20
|
+
|
|
21
|
+
Install directly via `pip` or `uv`:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install easy_data_loader
|
|
25
|
+
uv add easy_data_loader
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## 🚀 Getting Started
|
|
29
|
+
|
|
30
|
+
1. Initialize a new project structure to generate template configurations:
|
|
31
|
+
```bash
|
|
32
|
+
easy-data-loader init
|
|
33
|
+
```
|
|
34
|
+
2. Review the generated `config/` folders for sample resources and pipelines.
|
|
35
|
+
3. Run all discovered pipelines across the active configurations:
|
|
36
|
+
```bash
|
|
37
|
+
easy-data-loader run_all
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## ✔️ Generic concepts
|
|
41
|
+
|
|
42
|
+
`easy_data_loader` uses `resources` as a way to define a file or a database. The resources can represent either a source or a destination, making possible the following ETL scenarios: file -> file, file -> database, database -> file, database -> database.
|
|
43
|
+
|
|
44
|
+
The `easy_data_loader` project initializer will create the predefined folder structure `/config/resources`, where the resources are expected to be defined following the current convention: the file type is `.env` and the file name must be prefixed with the resource type `file_` or `database_`. The predefined folder structure together with the naming convention enables `easy_data_loader` to find and load all resources.
|
|
45
|
+
|
|
46
|
+
A secondary predefined folder `/config/pipelines` will contain the pipeline definition files, which are regular Python files. There are 3 types of pipelines that can be defined:
|
|
47
|
+
- `LoadPipeline` the main pipeline type which transports data from source to destination
|
|
48
|
+
- `ProcedurePipeline` a pipeline dedicated for executing stored procedures inside a database
|
|
49
|
+
- `OrchestratorPipeline` a pipeline that can execute a group of pipelines sequentially
|
|
50
|
+
|
|
51
|
+
## LoadPipeline
|
|
52
|
+
|
|
53
|
+
In order to define a `LoadPipeline` we must use the `BasePipelineDefinition` from `easy_data_loader` as depicted in the example pipelines created by the initializer.
|
|
54
|
+
In the simplest form there are only a few mandatory parameters:
|
|
55
|
+
- `pipeline_name : str` - this name will be used to execute the pipeline
|
|
56
|
+
- `source : str` - the file name (without extension) corresponding to the resource to be used as the data source
|
|
57
|
+
- `destination : str` - the file name (without extension) corresponding to the resource to be used as the data destination
|
|
58
|
+
|
|
59
|
+
If either the source or the destination is a database, then additional parameters become mandatory:
|
|
60
|
+
- `source_sql : str` - can be a table name or a specific query in the SQL dialect of the source database flavor
|
|
61
|
+
- `destination_table : str` - table name where the data will be inserted
|
|
62
|
+
|
|
63
|
+
There are many other aspects of the pipeline that can be defined:
|
|
64
|
+
- `audit : str` - the pipeline has built-in audit functionality: after the pipeline completes, it records certain information in a SQLite database. If the user desires, the same information can be recorded in a database `resource`
|
|
65
|
+
- `validator: Pydantic BaseModel` - the data read from the source `resource` can be validated using an arbitrarily defined Pydantic model before it is written to the destination
|
|
66
|
+
- `columns : Dict[str, ColumnDefinition]` - this parameter is used for strict control over how the data is written to the destination; it has the dual purpose of renaming the columns and explicitly defining the data types (mainly for inserting into a database table); the `ColumnDefinition` is constructed with an optional `target_name: str` for renaming columns and / or a `data_type : SqlAlchemy Type`, thus controlling column data types, lengths, precision, etc.
|
|
67
|
+
- `read_parameters : Dict[str, Any]` and `write_parameters : Dict[str, Any]` - these parameters control how the data is read from the source and written to the destination, and provide an easy way to use special delimiters for files, drop and recreate the database table, etc. `easy_data_loader` uses pandas as the transport layer, therefore the read and write parameters will be passed to the corresponding read and write functions supported by pandas.
|
|
68
|
+
- the pipeline has a set of predefined hooks allowing the execution of functions at specific moments during the execution: `file_pre_process : Callable` - executed before the file is read into the pandas DataFrame (e.g. unzip the file); `transform : Callable` - perform data transformation over the data already in the pandas DataFrame (requires pandas methods); `file_post_process : Callable` - after the pipeline completes and the data is written to the destination perform post processing on the source file (e.g. move the file to another folder)
|
|
69
|
+
|
|
70
|
+
## ProcedurePipeline
|
|
71
|
+
|
|
72
|
+
This secondary pipeline type is responsible for executing one or more stored procedures inside a database.
|
|
73
|
+
To define one we need to use the `ProcedureDefinition` with the following parameters:
|
|
74
|
+
- `pipeline_name : str` - this name will be used to execute the pipeline
|
|
75
|
+
- `audit : str, optional` - database resource name where the audit info will be recorded
|
|
76
|
+
- `resource : str` - database resource name where the stored procedure(s) will be executed
|
|
77
|
+
- `procedures : List[tuple(str, Optional[Dict[str, Any]])]` - list of one or more stored procedures along with optional procedure parameters as dictionaries
|
|
78
|
+
|
|
79
|
+
## OrchestratorPipeline
|
|
80
|
+
|
|
81
|
+
This pipeline type is responsible for sequentially executing a set of pipelines, `LoadPipeline`s and / or `ProcedurePipeline`s. It is very simple to define using the `OrchestratorDefinition` with:
|
|
82
|
+
- `orchestrator_name : str` - name by which the orchestrator is executed
|
|
83
|
+
- `pipelines : List[str]` - list of pipelines to execute sequentially
|
|
84
|
+
- `fail_fast : bool, Default True` - if any of the pipelines fail the rest of the pipelines in the list do not get executed
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "easy_data_loader"
|
|
3
|
+
version = "0.1.2"
|
|
4
|
+
description = "Data transfer utilities between files and databases"
|
|
5
|
+
authors = [{ name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com" }]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
requires-python = ">=3.13"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"click>=8.3.0",
|
|
10
|
+
"openpyxl>=3.1.5",
|
|
11
|
+
"pandas>=2.3.3",
|
|
12
|
+
"pyarrow>=22.0.0",
|
|
13
|
+
"pydantic>=2.12.5",
|
|
14
|
+
"pydantic-settings>=2.12.0",
|
|
15
|
+
"pyodbc>=5.2.0",
|
|
16
|
+
"python-dotenv>=1.1.1",
|
|
17
|
+
"sqlalchemy>=2.0.43",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 3 - Alpha",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Topic :: Database",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
24
|
+
"License :: OSI Approved :: MIT License",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[dependency-groups]
|
|
30
|
+
dev = ["ipykernel>=7.1.0", "pytest>=8.4.2", "ruff", "mypy", "pre-commit"]
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
easy-data-loader = "easy_data_loader.cli:main"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
pythonpath = "src"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["setuptools>=61.0"]
|
|
43
|
+
build-backend = "setuptools.build_meta"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
from .models import (
|
|
4
|
+
BasePipelineDefinition,
|
|
5
|
+
ColumnDefinition,
|
|
6
|
+
OrchestratorDefinition,
|
|
7
|
+
ProcedureDefinition,
|
|
8
|
+
)
|
|
9
|
+
from .orchestrator import OrchestratorPipeline
|
|
10
|
+
from .pipeline import LoadPipeline
|
|
11
|
+
from .procedure_pipeline import ProcedurePipeline
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"LoadPipeline",
|
|
15
|
+
"ProcedurePipeline",
|
|
16
|
+
"OrchestratorPipeline",
|
|
17
|
+
"BasePipelineDefinition",
|
|
18
|
+
"ProcedureDefinition",
|
|
19
|
+
"ColumnDefinition",
|
|
20
|
+
"OrchestratorDefinition",
|
|
21
|
+
]
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import click
|
|
2
|
-
import os
|
|
3
1
|
from pathlib import Path
|
|
4
2
|
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
5
|
# Integrated templates
|
|
6
6
|
PIPELINE_TEMPLATE = """
|
|
7
7
|
from easy_data_loader.pipeline import LoadPipeline
|
|
@@ -89,34 +89,40 @@ CONN_PORT=1433
|
|
|
89
89
|
|
|
90
90
|
FILE_ENV = """
|
|
91
91
|
# file resource definition
|
|
92
|
-
FILE_TYPE=CSV
|
|
93
|
-
FOLDER_PATH=./data/imports
|
|
94
|
-
FILE_NAME=large_sales_data
|
|
92
|
+
FILE_TYPE=CSV # can also be XLSX, PARQUET, ORC
|
|
93
|
+
FOLDER_PATH=./data/imports # source folder where the file is located
|
|
94
|
+
FILE_NAME=large_sales_data # exact file name without extension
|
|
95
|
+
#FILE_PATTERN=large_sales # file pattern to search in the source folder
|
|
95
96
|
"""
|
|
96
97
|
|
|
97
98
|
MAIN = """
|
|
98
|
-
from easy_data_loader
|
|
99
|
+
from easy_data_loader import LoadPipeline, ProcedurePipeline
|
|
99
100
|
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
def main():
|
|
102
|
+
# Run an ETL pipeline
|
|
103
|
+
LoadPipeline(pipeline_name="example_pipeline").run()
|
|
104
|
+
|
|
105
|
+
# Run a procedure pipeline
|
|
106
|
+
ProcedurePipeline(pipeline_name="example_procedure").run()
|
|
102
107
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# ProcedurePipeline(pipeline_name="example_procedure").run()
|
|
108
|
+
if __name__ == "__main__":
|
|
109
|
+
main()
|
|
106
110
|
"""
|
|
107
111
|
|
|
112
|
+
|
|
108
113
|
@click.group()
|
|
109
114
|
def main():
|
|
110
|
-
"""Easy Data Loader CLI - ETL instrument
|
|
115
|
+
"""Easy Data Loader CLI - ETL instrument for files and databases"""
|
|
111
116
|
pass
|
|
112
117
|
|
|
118
|
+
|
|
113
119
|
@main.command()
|
|
114
120
|
def init():
|
|
115
121
|
"""Initialize folder structure and sample files"""
|
|
116
122
|
base_path = Path.cwd()
|
|
117
123
|
|
|
118
124
|
# folders
|
|
119
|
-
folders = [
|
|
125
|
+
folders = ["config/resources", "config/pipelines"]
|
|
120
126
|
for folder in folders:
|
|
121
127
|
(base_path / folder).mkdir(parents=True, exist_ok=True)
|
|
122
128
|
|
|
@@ -127,7 +133,7 @@ def init():
|
|
|
127
133
|
"config/pipelines/orchestrator_example.py": ORCHESTRATOR_TEMPLATE,
|
|
128
134
|
"config/resources/database_example.env": DATABASE_ENV,
|
|
129
135
|
"config/resources/file_example.env": FILE_ENV,
|
|
130
|
-
"main.py"
|
|
136
|
+
"main.py": MAIN,
|
|
131
137
|
}
|
|
132
138
|
|
|
133
139
|
for name, content in files.items():
|
|
@@ -141,10 +147,12 @@ def init():
|
|
|
141
147
|
|
|
142
148
|
click.echo("\nProject initialized successfully!")
|
|
143
149
|
|
|
150
|
+
|
|
144
151
|
@main.command()
|
|
145
152
|
def list():
|
|
146
153
|
"""List all discovered resources and pipelines"""
|
|
147
154
|
from .config_loader import Configuration
|
|
155
|
+
|
|
148
156
|
config = Configuration()
|
|
149
157
|
|
|
150
158
|
click.echo("--- Discovered Resources ---")
|
|
@@ -157,19 +165,22 @@ def list():
|
|
|
157
165
|
|
|
158
166
|
|
|
159
167
|
@main.command()
|
|
160
|
-
@click.argument(
|
|
161
|
-
@click.argument(
|
|
168
|
+
@click.argument("resource_name")
|
|
169
|
+
@click.argument("table_name")
|
|
162
170
|
def inspect_db(resource_name, table_name):
|
|
163
171
|
"""Inspect a database table and generate ColumnDefinition code"""
|
|
164
172
|
from .config_loader import Configuration
|
|
165
173
|
from .database_connector import CONNECTOR_FACTORY
|
|
166
174
|
from .database_operations import DatabaseOperations
|
|
167
|
-
|
|
175
|
+
|
|
176
|
+
from .models import ServerBasedConnectionSettings, FileBasedConnectionSettings
|
|
168
177
|
|
|
169
178
|
config = Configuration()
|
|
170
179
|
resource = config.get_resource(resource_name)
|
|
171
180
|
|
|
172
|
-
if not isinstance(
|
|
181
|
+
if not isinstance(
|
|
182
|
+
resource, (ServerBasedConnectionSettings, FileBasedConnectionSettings)
|
|
183
|
+
):
|
|
173
184
|
click.echo(f"Error: Resource '{resource_name}' is not a database connection.")
|
|
174
185
|
return
|
|
175
186
|
|
|
@@ -186,7 +197,9 @@ def inspect_db(resource_name, table_name):
|
|
|
186
197
|
click.echo(f"\n# Suggested Column definitions for {table_name}:")
|
|
187
198
|
click.echo("columns={")
|
|
188
199
|
for col, dtype in schema.items():
|
|
189
|
-
click.echo(
|
|
200
|
+
click.echo(
|
|
201
|
+
f' "{col}": ColumnDefinition(target_name="{col}", data_type={dtype}),'
|
|
202
|
+
)
|
|
190
203
|
click.echo("}")
|
|
191
204
|
|
|
192
205
|
|
|
@@ -194,9 +207,13 @@ def inspect_db(resource_name, table_name):
|
|
|
194
207
|
def run_all():
|
|
195
208
|
"""Run all discovered pipelines and show status summary"""
|
|
196
209
|
from .config_loader import Configuration
|
|
210
|
+
from .models import (
|
|
211
|
+
BasePipelineDefinition,
|
|
212
|
+
OrchestratorDefinition,
|
|
213
|
+
ProcedureDefinition,
|
|
214
|
+
)
|
|
197
215
|
from .pipeline import LoadPipeline
|
|
198
216
|
from .procedure_pipeline import ProcedurePipeline
|
|
199
|
-
from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
|
|
200
217
|
|
|
201
218
|
config = Configuration()
|
|
202
219
|
pipelines = config.get_all_pipelines()
|
|
@@ -219,6 +236,7 @@ def run_all():
|
|
|
219
236
|
success = ProcedurePipeline(name).run()
|
|
220
237
|
elif isinstance(definition, OrchestratorDefinition):
|
|
221
238
|
from .orchestrator import OrchestratorPipeline
|
|
239
|
+
|
|
222
240
|
success = OrchestratorPipeline(name).run()
|
|
223
241
|
else:
|
|
224
242
|
success = False
|
|
@@ -235,12 +253,13 @@ def run_all():
|
|
|
235
253
|
for name, status in results.items():
|
|
236
254
|
click.echo(f"{name:<25} | {status}")
|
|
237
255
|
|
|
256
|
+
|
|
238
257
|
@main.command()
|
|
239
|
-
@click.argument(
|
|
258
|
+
@click.argument("orchestrator_name")
|
|
240
259
|
def run_orchestrator(orchestrator_name):
|
|
241
260
|
"""Run a specific orchestrator by name"""
|
|
242
261
|
from .orchestrator import OrchestratorPipeline
|
|
243
|
-
|
|
262
|
+
|
|
244
263
|
try:
|
|
245
264
|
success = OrchestratorPipeline(orchestrator_name).run()
|
|
246
265
|
if success:
|
|
@@ -256,7 +275,7 @@ def validate_resources():
|
|
|
256
275
|
"""Validate all configured resources"""
|
|
257
276
|
from .config_loader import Configuration
|
|
258
277
|
from .database_connector import CONNECTOR_FACTORY
|
|
259
|
-
from .models import
|
|
278
|
+
from .models import FileSettings
|
|
260
279
|
|
|
261
280
|
config = Configuration()
|
|
262
281
|
resources = config.get_all_resources()
|
|
@@ -267,24 +286,28 @@ def validate_resources():
|
|
|
267
286
|
|
|
268
287
|
click.echo(f"🔍 Validating {len(resources)} resources...\n")
|
|
269
288
|
|
|
289
|
+
from .models import ServerBasedConnectionSettings, FileBasedConnectionSettings
|
|
290
|
+
|
|
270
291
|
results = {}
|
|
271
292
|
|
|
272
293
|
for name, resource in resources.items():
|
|
273
294
|
click.echo(f"Resource: {name} ... ", nl=False)
|
|
274
295
|
try:
|
|
275
|
-
if isinstance(
|
|
296
|
+
if isinstance(
|
|
297
|
+
resource, (ServerBasedConnectionSettings, FileBasedConnectionSettings)
|
|
298
|
+
):
|
|
276
299
|
# Validate Database Connection
|
|
277
|
-
|
|
300
|
+
CONNECTOR_FACTORY[resource.conn_server_type](resource)
|
|
278
301
|
# The connector tests connection in __init__, so if we are here it passed
|
|
279
302
|
results[name] = "OK (Connected)"
|
|
280
303
|
elif isinstance(resource, FileSettings):
|
|
281
304
|
# Validate File Path
|
|
282
305
|
if resource.folder_path.exists():
|
|
283
|
-
|
|
306
|
+
results[name] = "OK (Path Exists)"
|
|
284
307
|
else:
|
|
285
|
-
|
|
308
|
+
raise ValueError(f"Path does not exist: {resource.folder_path}")
|
|
286
309
|
else:
|
|
287
|
-
|
|
310
|
+
results[name] = "UNKNOWN TYPE"
|
|
288
311
|
|
|
289
312
|
except Exception as e:
|
|
290
313
|
results[name] = f"FAILED: {str(e)}"
|
|
@@ -299,4 +322,4 @@ def validate_resources():
|
|
|
299
322
|
|
|
300
323
|
|
|
301
324
|
if __name__ == "__main__":
|
|
302
|
-
main()
|
|
325
|
+
main()
|
|
@@ -1,11 +1,18 @@
|
|
|
1
1
|
import importlib.util
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Any, Dict, Union
|
|
4
3
|
from types import ModuleType
|
|
5
|
-
from
|
|
4
|
+
from typing import Dict, Union
|
|
6
5
|
|
|
7
6
|
from .log import LoggedComponent
|
|
8
|
-
from .models import
|
|
7
|
+
from .models import (
|
|
8
|
+
BasePipelineDefinition,
|
|
9
|
+
FileSettings,
|
|
10
|
+
OrchestratorDefinition,
|
|
11
|
+
ProcedureDefinition,
|
|
12
|
+
ResourceConfig,
|
|
13
|
+
PipelineType,
|
|
14
|
+
ServerBasedConnectionSettings,
|
|
15
|
+
)
|
|
9
16
|
|
|
10
17
|
|
|
11
18
|
class Configuration(LoggedComponent):
|
|
@@ -14,6 +21,7 @@ class Configuration(LoggedComponent):
|
|
|
14
21
|
Resources and pipelines are loaded only when requested.
|
|
15
22
|
This class implements the Singleton pattern.
|
|
16
23
|
"""
|
|
24
|
+
|
|
17
25
|
_instance = None
|
|
18
26
|
_initialized = False
|
|
19
27
|
|
|
@@ -26,10 +34,17 @@ class Configuration(LoggedComponent):
|
|
|
26
34
|
if not self._initialized:
|
|
27
35
|
super().__init__()
|
|
28
36
|
self.config_dir = Path(config_dir)
|
|
29
|
-
self.resources
|
|
30
|
-
self.pipelines
|
|
37
|
+
self.resources: Dict[str, ResourceConfig] = {}
|
|
38
|
+
self.pipelines: Dict[
|
|
39
|
+
str,
|
|
40
|
+
Union[
|
|
41
|
+
BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
|
|
42
|
+
],
|
|
43
|
+
] = {}
|
|
31
44
|
|
|
32
|
-
self.logger.debug(
|
|
45
|
+
self.logger.debug(
|
|
46
|
+
f"Initializing configuration from directory: {config_dir}"
|
|
47
|
+
)
|
|
33
48
|
self._initialized = True
|
|
34
49
|
|
|
35
50
|
def _load_env_file(self, env_file: Path) -> ResourceConfig:
|
|
@@ -38,15 +53,25 @@ class Configuration(LoggedComponent):
|
|
|
38
53
|
|
|
39
54
|
env_file_name = env_file.stem
|
|
40
55
|
|
|
41
|
-
if env_file_name.startswith(
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
56
|
+
if env_file_name.startswith("database_"):
|
|
57
|
+
# Peek at the env file to determine which connection class to use
|
|
58
|
+
from dotenv import dotenv_values
|
|
59
|
+
|
|
60
|
+
raw_values = dotenv_values(env_file)
|
|
61
|
+
server_type_str = raw_values.get("CONN_SERVER_TYPE", "").upper()
|
|
62
|
+
if server_type_str == "SQLITE":
|
|
63
|
+
from .models import FileBasedConnectionSettings
|
|
64
|
+
|
|
65
|
+
return FileBasedConnectionSettings(_env_file=[env_file])
|
|
66
|
+
return ServerBasedConnectionSettings(_env_file=[env_file])
|
|
67
|
+
if env_file_name.startswith("file_"):
|
|
68
|
+
return FileSettings(_env_file=[env_file])
|
|
45
69
|
|
|
46
|
-
self.log_and_raise(
|
|
70
|
+
self.log_and_raise(
|
|
71
|
+
ValueError,
|
|
47
72
|
f"Failed to load env file: {env_file.name}. "
|
|
48
|
-
f"Resource files must start with 'database_' or 'file_' prefix."
|
|
49
|
-
|
|
73
|
+
f"Resource files must start with 'database_' or 'file_' prefix.",
|
|
74
|
+
)
|
|
50
75
|
|
|
51
76
|
def _import_module(self, module_file: Path) -> ModuleType:
|
|
52
77
|
"""Dynamically import a Python module from a file path"""
|
|
@@ -101,12 +126,12 @@ class Configuration(LoggedComponent):
|
|
|
101
126
|
raise
|
|
102
127
|
|
|
103
128
|
# 3. Not found
|
|
104
|
-
self.log_and_raise(
|
|
105
|
-
|
|
106
|
-
|
|
129
|
+
self.log_and_raise(
|
|
130
|
+
ValueError,
|
|
131
|
+
f"Resource not found: {resource_name}. Checked path: {resource_file}",
|
|
107
132
|
)
|
|
108
133
|
|
|
109
|
-
def get_pipeline(self, pipeline_name: str) ->
|
|
134
|
+
def get_pipeline(self, pipeline_name: str) -> PipelineType:
|
|
110
135
|
"""
|
|
111
136
|
Retrieve a pipeline definition by name.
|
|
112
137
|
Uses lazy loading: checks memory first, then attempts to load from file.
|
|
@@ -125,18 +150,26 @@ class Configuration(LoggedComponent):
|
|
|
125
150
|
# Find the definition in the module
|
|
126
151
|
for attr_name in dir(config_module):
|
|
127
152
|
attr = getattr(config_module, attr_name)
|
|
128
|
-
if isinstance(
|
|
153
|
+
if isinstance(
|
|
154
|
+
attr,
|
|
155
|
+
(
|
|
156
|
+
BasePipelineDefinition,
|
|
157
|
+
ProcedureDefinition,
|
|
158
|
+
OrchestratorDefinition,
|
|
159
|
+
),
|
|
160
|
+
):
|
|
129
161
|
self.pipelines[pipeline_name] = attr
|
|
130
162
|
self.logger.info(f"Lazily loaded pipeline: {pipeline_name}")
|
|
131
163
|
return attr
|
|
132
164
|
except Exception as e:
|
|
133
|
-
|
|
134
|
-
|
|
165
|
+
self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
|
|
166
|
+
raise
|
|
135
167
|
|
|
136
168
|
# 3. Not found
|
|
137
|
-
self.log_and_raise(
|
|
138
|
-
|
|
139
|
-
|
|
169
|
+
self.log_and_raise(
|
|
170
|
+
ValueError,
|
|
171
|
+
f"Pipeline not found: {pipeline_name}. Checked path: {pipeline_file}",
|
|
172
|
+
)
|
|
140
173
|
|
|
141
174
|
def get_all_resources(self) -> dict[str, ResourceConfig]:
|
|
142
175
|
"""
|
|
@@ -149,19 +182,19 @@ class Configuration(LoggedComponent):
|
|
|
149
182
|
|
|
150
183
|
# Discover all .env files
|
|
151
184
|
for env_file in resources_dir.glob("*.env"):
|
|
152
|
-
|
|
153
|
-
|
|
185
|
+
# Simple logging to debug discovery
|
|
186
|
+
self.logger.debug(f"Found potential resource file: {env_file.name}")
|
|
154
187
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
188
|
+
resource_name = env_file.stem
|
|
189
|
+
if resource_name not in self.resources:
|
|
190
|
+
try:
|
|
191
|
+
self.resources[resource_name] = self._load_env_file(env_file)
|
|
192
|
+
except Exception as e:
|
|
193
|
+
self.logger.warning(f"Failed to load resource {resource_name}: {e}")
|
|
161
194
|
|
|
162
195
|
return self.resources
|
|
163
196
|
|
|
164
|
-
def get_all_pipelines(self) -> dict[str,
|
|
197
|
+
def get_all_pipelines(self) -> dict[str, PipelineType]:
|
|
165
198
|
"""
|
|
166
199
|
Retrieve all pipelines.
|
|
167
200
|
Scans the pipelines directory if not all loaded.
|
|
@@ -175,10 +208,10 @@ class Configuration(LoggedComponent):
|
|
|
175
208
|
|
|
176
209
|
pipeline_name = pipeline_file.stem
|
|
177
210
|
if pipeline_name not in self.pipelines:
|
|
178
|
-
|
|
179
|
-
|
|
211
|
+
# Helper to trigger lazy load
|
|
212
|
+
try:
|
|
180
213
|
self.get_pipeline(pipeline_name)
|
|
181
|
-
|
|
182
|
-
|
|
214
|
+
except Exception as e:
|
|
215
|
+
self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")
|
|
183
216
|
|
|
184
217
|
return self.pipelines
|