easy-data-loader 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. easy_data_loader-0.1.2/PKG-INFO +110 -0
  2. easy_data_loader-0.1.2/README.md +84 -0
  3. easy_data_loader-0.1.2/pyproject.toml +43 -0
  4. easy_data_loader-0.1.2/src/easy_data_loader/__init__.py +21 -0
  5. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/cli.py +52 -29
  6. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/config_loader.py +69 -36
  7. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/custom_exceptions.py +1 -0
  8. easy_data_loader-0.1.2/src/easy_data_loader/data_inferrence.py +897 -0
  9. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/database_connector.py +57 -20
  10. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/database_operations.py +36 -17
  11. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/driver_detector.py +2 -1
  12. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/file_operations.py +90 -34
  13. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader/log.py +2 -1
  14. easy_data_loader-0.1.2/src/easy_data_loader/models.py +238 -0
  15. easy_data_loader-0.1.2/src/easy_data_loader/orchestrator.py +80 -0
  16. easy_data_loader-0.1.2/src/easy_data_loader/pipeline.py +231 -0
  17. easy_data_loader-0.1.2/src/easy_data_loader/pipeline_base.py +138 -0
  18. easy_data_loader-0.1.2/src/easy_data_loader/procedure_pipeline.py +75 -0
  19. easy_data_loader-0.1.2/src/easy_data_loader.egg-info/PKG-INFO +110 -0
  20. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/SOURCES.txt +2 -0
  21. easy_data_loader-0.1.2/src/easy_data_loader.egg-info/entry_points.txt +2 -0
  22. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/requires.txt +0 -1
  23. easy_data_loader-0.1.2/tests/test_data_inference.py +843 -0
  24. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/tests/test_imports.py +3 -0
  25. easy_data_loader-0.1.0/PKG-INFO +0 -52
  26. easy_data_loader-0.1.0/README.md +0 -32
  27. easy_data_loader-0.1.0/pyproject.toml +0 -37
  28. easy_data_loader-0.1.0/src/easy_data_loader/__init__.py +0 -11
  29. easy_data_loader-0.1.0/src/easy_data_loader/models.py +0 -168
  30. easy_data_loader-0.1.0/src/easy_data_loader/orchestrator.py +0 -59
  31. easy_data_loader-0.1.0/src/easy_data_loader/pipeline.py +0 -169
  32. easy_data_loader-0.1.0/src/easy_data_loader/pipeline_base.py +0 -121
  33. easy_data_loader-0.1.0/src/easy_data_loader/procedure_pipeline.py +0 -56
  34. easy_data_loader-0.1.0/src/easy_data_loader.egg-info/PKG-INFO +0 -52
  35. easy_data_loader-0.1.0/src/easy_data_loader.egg-info/entry_points.txt +0 -2
  36. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/LICENSE +0 -0
  37. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/setup.cfg +0 -0
  38. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/dependency_links.txt +0 -0
  39. {easy_data_loader-0.1.0 → easy_data_loader-0.1.2}/src/easy_data_loader.egg-info/top_level.txt +0 -0
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: easy_data_loader
3
+ Version: 0.1.2
4
+ Summary: Data transfer utilities between files and databases
5
+ Author-email: Bojoi Gabriel <bojoigabriel@gmail.com>
6
+ Classifier: Development Status :: 3 - Alpha
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Topic :: Database
9
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.13
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: click>=8.3.0
17
+ Requires-Dist: openpyxl>=3.1.5
18
+ Requires-Dist: pandas>=2.3.3
19
+ Requires-Dist: pyarrow>=22.0.0
20
+ Requires-Dist: pydantic>=2.12.5
21
+ Requires-Dist: pydantic-settings>=2.12.0
22
+ Requires-Dist: pyodbc>=5.2.0
23
+ Requires-Dist: python-dotenv>=1.1.1
24
+ Requires-Dist: sqlalchemy>=2.0.43
25
+ Dynamic: license-file
26
+
27
+ # Easy Data Loader 🚀
28
+
29
+
30
+ [![PyPI version](https://badge.fury.io/py/easy-data-loader.svg)](https://badge.fury.io/py/easy-data-loader)
31
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
32
+ ![Downloads](https://static.pepy.tech/badge/easy-data-loader)
33
+
34
+ **Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various file data sources (csv, xlsx, parquet, orc) and databases (MSSQL, PostgreSQL and others).
35
+
36
+ ## ✨ Key Features
37
+ - **Declarative Configuration**: Manage connections and pipelines through simple python files and `.env` resources.
38
+ - **Integrated CLI**: Initialize a standardized project structure with a single command.
39
+ - **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
40
+ - **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
41
+ - **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
42
+
43
+ ---
44
+
45
+ ## 📦 Installation
46
+
47
+ Install directly via `pip` or `uv`:
48
+
49
+ ```bash
50
+ pip install easy_data_loader
51
+ uv add easy_data_loader
52
+ ```
53
+
54
+ ## 🚀 Getting Started
55
+
56
+ 1. Initialize a new project structure to generate template configurations:
57
+ ```bash
58
+ easy-data-loader init
59
+ ```
60
+ 2. Review the generated `config/` folders for sample resources and pipelines.
61
+ 3. Run all discovered pipelines across the active configurations:
62
+ ```bash
63
+ easy-data-loader run-all
64
+ ```
65
+
66
+ ## ✔️ Generic concepts
67
+
68
+ `easy_data_loader` uses `resources` as a way to define a file or a database. The resources can represent either a source or a destination, making possible the following ETL scenarios: file -> file, file -> database, database -> file, database -> database.
69
+
70
+ The `easy_data_loader` project initializer will create the predefined folder structure `/config/resources` where the resources are expected to be defined following the current convention: the file type is .env and the file name must be prefixed with the resource type `file_` or `database_`. The predefined folder structure together with the naming convention enables `easy_data_loader` to find and load all resources.
71
+
72
+ A secondary predefined folder `/config/pipelines` will contain the pipeline definition files, which are regular Python files. There are 3 types of pipelines that can be defined:
73
+ - `LoadPipeline` the main pipeline type which transports data from source to destination
74
+ - `ProcedurePipeline` a pipeline dedicated for executing stored procedures inside a database
75
+ - `OrchestratorPipeline` a pipeline that can execute a group of pipelines sequentially
76
+
77
+ ## LoadPipeline
78
+
79
+ In order to define a `LoadPipeline` we must use the `BasePipelineDefinition` from `easy_data_loader` as depicted in the example pipelines created by the initializer.
80
+ In the simplest form there are only a few mandatory parameters:
81
+ - `pipeline_name : str` - this name will be used to execute the pipeline
82
+ - `source : str` - the file name (without extension) corresponding to the desired resource to be the data source
83
+ - `destination : str` - the file name (without extension) corresponding to the desired resource to be the data destination
84
+
85
+ If either the source or destination is a database then additional parameters become mandatory:
86
+ - `source_sql : str` - can be a table name or a specific query in the SQL dialect of the source database flavor
87
+ - `destination_table : str` - table name where the data will be inserted
88
+
89
+ There are many other aspects of the pipeline that can be defined:
90
+ - `audit : str` - the pipeline has a built-in audit functionality, it records certain information after the pipeline completes in a SQLite database. If the user desires, the same information can be recorded in a database `resource`
91
+ - `validator: Pydantic BaseModel` - the data read from the source `resource` can be validated using an arbitrarily defined Pydantic model before it is written to destination
92
+ - `columns : Dict[str, ColumnDefinition]` - this parameter is used for strict control on how the data is written to destination; it has the dual purpose of renaming the columns and also explicitly defining the data types (mainly for inserting into a database table); the `ColumnDefinition` is constructed with an optional `target_name: str` for renaming columns and / or a `data_type : SqlAlchemy Type` thus controlling column data types, lengths, precision etc.
93
+ - `read_parameters : Dict[str, Any]` and `write_parameters : Dict[str, Any]` - these parameters control how the data is being read or written from source to destination and provide an easy way to use special delimiters for files, drop and recreate the database table, etc. `easy_data_loader` is using pandas as the transport layer therefore the read and write parameters will be passed to the corresponding read and write functions supported by pandas.
94
+ - the pipeline has a set of predefined hooks allowing the execution of functions at specific moments during the execution: `file_pre_process : Callable` - executed before the file is read into the pandas DataFrame (e.g. unzip the file); `transform : Callable` - perform data transformation over the data already in the pandas DataFrame (requires pandas methods); `file_post_process : Callable` - after the pipeline completes and the data is written to the destination perform post processing on the source file (e.g. move the file to another folder)
95
+
96
+ ## ProcedurePipeline
97
+
98
+ This secondary pipeline type is responsible for executing one or more stored procedures inside a database.
99
+ To define one we need to use the `ProcedureDefinition` with the following parameters:
100
+ - `pipeline_name : str` - this name will be used to execute the pipeline
101
+ - `audit : str, optional` - database resource name where the audit info will be recorded
102
+ - `resource : str` - database resource name where the stored procedure(s) will be executed
103
+ - `procedures : List[tuple(str, Optional[Dict[str, Any]])]` - list of one or more stored procedures along with optional procedure parameters as dictionaries
104
+
105
+ ## OrchestratorPipeline
106
+
107
+ This pipeline type is responsible for sequentially executing a set of pipelines, `LoadPipeline`s and / or `ProcedurePipeline`s. Very simple to define using the `OrchestratorDefinition` with:
108
+ - `orchestrator_name : str` - name by which the orchestrator is executed
109
+ - `pipelines : List[str]` - list of pipelines to execute sequentially
110
+ - `fail_fast : bool, Default True` - if any of the pipelines fail the rest of the pipelines in the list do not get executed
@@ -0,0 +1,84 @@
1
+ # Easy Data Loader 🚀
2
+
3
+
4
+ [![PyPI version](https://badge.fury.io/py/easy-data-loader.svg)](https://badge.fury.io/py/easy-data-loader)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ ![Downloads](https://static.pepy.tech/badge/easy-data-loader)
7
+
8
+ **Easy Data Loader** is a flexible, modular Python library designed to streamline ETL (Extract, Transform, Load) processes between various file data sources (csv, xlsx, parquet, orc) and databases (MSSQL, PostgreSQL and others).
9
+
10
+ ## ✨ Key Features
11
+ - **Declarative Configuration**: Manage connections and pipelines through simple python files and `.env` resources.
12
+ - **Integrated CLI**: Initialize a standardized project structure with a single command.
13
+ - **Custom Transformation Hooks**: Inject your own Pandas transformation logic directly into the pipeline execution.
14
+ - **Performance Optimized**: Built-in support for chunked loading and writing to handle large datasets efficiently.
15
+ - **Extensible Architecture**: Uses a Factory Pattern for database connectors, making it easy to support new drivers.
16
+
17
+ ---
18
+
19
+ ## 📦 Installation
20
+
21
+ Install directly via `pip` or `uv`:
22
+
23
+ ```bash
24
+ pip install easy_data_loader
25
+ uv add easy_data_loader
26
+ ```
27
+
28
+ ## 🚀 Getting Started
29
+
30
+ 1. Initialize a new project structure to generate template configurations:
31
+ ```bash
32
+ easy-data-loader init
33
+ ```
34
+ 2. Review the generated `config/` folders for sample resources and pipelines.
35
+ 3. Run all discovered pipelines across the active configurations:
36
+ ```bash
37
+ easy-data-loader run-all
38
+ ```
39
+
40
+ ## ✔️ Generic concepts
41
+
42
+ `easy_data_loader` uses `resources` as a way to define a file or a database. The resources can represent either a source or a destination, making possible the following ETL scenarios: file -> file, file -> database, database -> file, database -> database.
43
+
44
+ The `easy_data_loader` project initializer will create the predefined folder structure `/config/resources` where the resources are expected to be defined following the current convention: the file type is .env and the file name must be prefixed with the resource type `file_` or `database_`. The predefined folder structure together with the naming convention enables `easy_data_loader` to find and load all resources.
45
+
46
+ A secondary predefined folder `/config/pipelines` will contain the pipeline definition files, which are regular Python files. There are 3 types of pipelines that can be defined:
47
+ - `LoadPipeline` the main pipeline type which transports data from source to destination
48
+ - `ProcedurePipeline` a pipeline dedicated for executing stored procedures inside a database
49
+ - `OrchestratorPipeline` a pipeline that can execute a group of pipelines sequentially
50
+
51
+ ## LoadPipeline
52
+
53
+ In order to define a `LoadPipeline` we must use the `BasePipelineDefinition` from `easy_data_loader` as depicted in the example pipelines created by the initializer.
54
+ In the simplest form there are only a few mandatory parameters:
55
+ - `pipeline_name : str` - this name will be used to execute the pipeline
56
+ - `source : str` - the file name (without extension) corresponding to the desired resource to be the data source
57
+ - `destination : str` - the file name (without extension) corresponding to the desired resource to be the data destination
58
+
59
+ If either the source or destination is a database then additional parameters become mandatory:
60
+ - `source_sql : str` - can be a table name or a specific query in the SQL dialect of the source database flavor
61
+ - `destination_table : str` - table name where the data will be inserted
62
+
63
+ There are many other aspects of the pipeline that can be defined:
64
+ - `audit : str` - the pipeline has a built-in audit functionality, it records certain information after the pipeline completes in a SQLite database. If the user desires, the same information can be recorded in a database `resource`
65
+ - `validator: Pydantic BaseModel` - the data read from the source `resource` can be validated using an arbitrarily defined Pydantic model before it is written to destination
66
+ - `columns : Dict[str, ColumnDefinition]` - this parameter is used for strict control on how the data is written to destination; it has the dual purpose of renaming the columns and also explicitly defining the data types (mainly for inserting into a database table); the `ColumnDefinition` is constructed with an optional `target_name: str` for renaming columns and / or a `data_type : SqlAlchemy Type` thus controlling column data types, lengths, precision etc.
67
+ - `read_parameters : Dict[str, Any]` and `write_parameters : Dict[str, Any]` - these parameters control how the data is being read or written from source to destination and provide an easy way to use special delimiters for files, drop and recreate the database table, etc. `easy_data_loader` is using pandas as the transport layer therefore the read and write parameters will be passed to the corresponding read and write functions supported by pandas.
68
+ - the pipeline has a set of predefined hooks allowing the execution of functions at specific moments during the execution: `file_pre_process : Callable` - executed before the file is read into the pandas DataFrame (e.g. unzip the file); `transform : Callable` - perform data transformation over the data already in the pandas DataFrame (requires pandas methods); `file_post_process : Callable` - after the pipeline completes and the data is written to the destination perform post processing on the source file (e.g. move the file to another folder)
69
+
70
+ ## ProcedurePipeline
71
+
72
+ This secondary pipeline type is responsible for executing one or more stored procedures inside a database.
73
+ To define one we need to use the `ProcedureDefinition` with the following parameters:
74
+ - `pipeline_name : str` - this name will be used to execute the pipeline
75
+ - `audit : str, optional` - database resource name where the audit info will be recorded
76
+ - `resource : str` - database resource name where the stored procedure(s) will be executed
77
+ - `procedures : List[tuple(str, Optional[Dict[str, Any]])]` - list of one or more stored procedures along with optional procedure parameters as dictionaries
78
+
79
+ ## OrchestratorPipeline
80
+
81
+ This pipeline type is responsible for sequentially executing a set of pipelines, `LoadPipeline`s and / or `ProcedurePipeline`s. Very simple to define using the `OrchestratorDefinition` with:
82
+ - `orchestrator_name : str` - name by which the orchestrator is executed
83
+ - `pipelines : List[str]` - list of pipelines to execute sequentially
84
+ - `fail_fast : bool, Default True` - if any of the pipelines fail the rest of the pipelines in the list do not get executed
@@ -0,0 +1,43 @@
1
+ [project]
2
+ name = "easy_data_loader"
3
+ version = "0.1.2"
4
+ description = "Data transfer utilities between files and databases"
5
+ authors = [{ name = "Bojoi Gabriel", email = "bojoigabriel@gmail.com" }]
6
+ readme = "README.md"
7
+ requires-python = ">=3.13"
8
+ dependencies = [
9
+ "click>=8.3.0",
10
+ "openpyxl>=3.1.5",
11
+ "pandas>=2.3.3",
12
+ "pyarrow>=22.0.0",
13
+ "pydantic>=2.12.5",
14
+ "pydantic-settings>=2.12.0",
15
+ "pyodbc>=5.2.0",
16
+ "python-dotenv>=1.1.1",
17
+ "sqlalchemy>=2.0.43",
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 3 - Alpha",
21
+ "Intended Audience :: Developers",
22
+ "Topic :: Database",
23
+ "Topic :: Scientific/Engineering :: Information Analysis",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Operating System :: OS Independent",
27
+ ]
28
+
29
+ [dependency-groups]
30
+ dev = ["ipykernel>=7.1.0", "pytest>=8.4.2", "ruff", "mypy", "pre-commit"]
31
+
32
+ [project.scripts]
33
+ easy-data-loader = "easy_data_loader.cli:main"
34
+
35
+ [tool.setuptools.packages.find]
36
+ where = ["src"]
37
+
38
+ [tool.pytest.ini_options]
39
+ pythonpath = "src"
40
+
41
+ [build-system]
42
+ requires = ["setuptools>=61.0"]
43
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,21 @@
1
+ __version__ = "0.1.0"
2
+
3
+ from .models import (
4
+ BasePipelineDefinition,
5
+ ColumnDefinition,
6
+ OrchestratorDefinition,
7
+ ProcedureDefinition,
8
+ )
9
+ from .orchestrator import OrchestratorPipeline
10
+ from .pipeline import LoadPipeline
11
+ from .procedure_pipeline import ProcedurePipeline
12
+
13
+ __all__ = [
14
+ "LoadPipeline",
15
+ "ProcedurePipeline",
16
+ "OrchestratorPipeline",
17
+ "BasePipelineDefinition",
18
+ "ProcedureDefinition",
19
+ "ColumnDefinition",
20
+ "OrchestratorDefinition",
21
+ ]
@@ -1,7 +1,7 @@
1
- import click
2
- import os
3
1
  from pathlib import Path
4
2
 
3
+ import click
4
+
5
5
  # Integrated templates
6
6
  PIPELINE_TEMPLATE = """
7
7
  from easy_data_loader.pipeline import LoadPipeline
@@ -89,34 +89,40 @@ CONN_PORT=1433
89
89
 
90
90
  FILE_ENV = """
91
91
  # file resource definition
92
- FILE_TYPE=CSV
93
- FOLDER_PATH=./data/imports
94
- FILE_NAME=large_sales_data
92
+ FILE_TYPE=CSV # can also be XLSX, PARQUET, ORC
93
+ FOLDER_PATH=./data/imports # source folder where the file is located
94
+ FILE_NAME=large_sales_data # exact file name without extension
95
+ #FILE_PATTERN=large_sales # file pattern to search in the source folder
95
96
  """
96
97
 
97
98
  MAIN = """
98
- from easy_data_loader.pipeline import LoadPipeline
99
+ from easy_data_loader import LoadPipeline, ProcedurePipeline
99
100
 
100
- # Run an ETL pipeline
101
- LoadPipeline(pipeline_name="example_pipeline").run()
101
+ def main():
102
+ # Run an ETL pipeline
103
+ LoadPipeline(pipeline_name="example_pipeline").run()
104
+
105
+ # Run a procedure pipeline
106
+ ProcedurePipeline(pipeline_name="example_procedure").run()
102
107
 
103
- # Run a procedure pipeline
104
- # from easy_data_loader.procedure_pipeline import ProcedurePipeline
105
- # ProcedurePipeline(pipeline_name="example_procedure").run()
108
+ if __name__ == "__main__":
109
+ main()
106
110
  """
107
111
 
112
+
108
113
  @click.group()
109
114
  def main():
110
- """Easy Data Loader CLI - ETL instrument between files and databases"""
115
+ """Easy Data Loader CLI - ETL instrument for files and databases"""
111
116
  pass
112
117
 
118
+
113
119
  @main.command()
114
120
  def init():
115
121
  """Initialize folder structure and sample files"""
116
122
  base_path = Path.cwd()
117
123
 
118
124
  # folders
119
- folders = ['config/resources', 'config/pipelines']
125
+ folders = ["config/resources", "config/pipelines"]
120
126
  for folder in folders:
121
127
  (base_path / folder).mkdir(parents=True, exist_ok=True)
122
128
 
@@ -127,7 +133,7 @@ def init():
127
133
  "config/pipelines/orchestrator_example.py": ORCHESTRATOR_TEMPLATE,
128
134
  "config/resources/database_example.env": DATABASE_ENV,
129
135
  "config/resources/file_example.env": FILE_ENV,
130
- "main.py" : MAIN,
136
+ "main.py": MAIN,
131
137
  }
132
138
 
133
139
  for name, content in files.items():
@@ -141,10 +147,12 @@ def init():
141
147
 
142
148
  click.echo("\nProject initialized successfully!")
143
149
 
150
+
144
151
  @main.command()
145
152
  def list():
146
153
  """List all discovered resources and pipelines"""
147
154
  from .config_loader import Configuration
155
+
148
156
  config = Configuration()
149
157
 
150
158
  click.echo("--- Discovered Resources ---")
@@ -157,19 +165,22 @@ def list():
157
165
 
158
166
 
159
167
  @main.command()
160
- @click.argument('resource_name')
161
- @click.argument('table_name')
168
+ @click.argument("resource_name")
169
+ @click.argument("table_name")
162
170
  def inspect_db(resource_name, table_name):
163
171
  """Inspect a database table and generate ColumnDefinition code"""
164
172
  from .config_loader import Configuration
165
173
  from .database_connector import CONNECTOR_FACTORY
166
174
  from .database_operations import DatabaseOperations
167
- from .models import ConnectionSettings
175
+
176
+ from .models import ServerBasedConnectionSettings, FileBasedConnectionSettings
168
177
 
169
178
  config = Configuration()
170
179
  resource = config.get_resource(resource_name)
171
180
 
172
- if not isinstance(resource, ConnectionSettings):
181
+ if not isinstance(
182
+ resource, (ServerBasedConnectionSettings, FileBasedConnectionSettings)
183
+ ):
173
184
  click.echo(f"Error: Resource '{resource_name}' is not a database connection.")
174
185
  return
175
186
 
@@ -186,7 +197,9 @@ def inspect_db(resource_name, table_name):
186
197
  click.echo(f"\n# Suggested Column definitions for {table_name}:")
187
198
  click.echo("columns={")
188
199
  for col, dtype in schema.items():
189
- click.echo(f' "{col}": ColumnDefinition(target_name="{col}", data_type={dtype}),')
200
+ click.echo(
201
+ f' "{col}": ColumnDefinition(target_name="{col}", data_type={dtype}),'
202
+ )
190
203
  click.echo("}")
191
204
 
192
205
 
@@ -194,9 +207,13 @@ def inspect_db(resource_name, table_name):
194
207
  def run_all():
195
208
  """Run all discovered pipelines and show status summary"""
196
209
  from .config_loader import Configuration
210
+ from .models import (
211
+ BasePipelineDefinition,
212
+ OrchestratorDefinition,
213
+ ProcedureDefinition,
214
+ )
197
215
  from .pipeline import LoadPipeline
198
216
  from .procedure_pipeline import ProcedurePipeline
199
- from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
200
217
 
201
218
  config = Configuration()
202
219
  pipelines = config.get_all_pipelines()
@@ -219,6 +236,7 @@ def run_all():
219
236
  success = ProcedurePipeline(name).run()
220
237
  elif isinstance(definition, OrchestratorDefinition):
221
238
  from .orchestrator import OrchestratorPipeline
239
+
222
240
  success = OrchestratorPipeline(name).run()
223
241
  else:
224
242
  success = False
@@ -235,12 +253,13 @@ def run_all():
235
253
  for name, status in results.items():
236
254
  click.echo(f"{name:<25} | {status}")
237
255
 
256
+
238
257
  @main.command()
239
- @click.argument('orchestrator_name')
258
+ @click.argument("orchestrator_name")
240
259
  def run_orchestrator(orchestrator_name):
241
260
  """Run a specific orchestrator by name"""
242
261
  from .orchestrator import OrchestratorPipeline
243
-
262
+
244
263
  try:
245
264
  success = OrchestratorPipeline(orchestrator_name).run()
246
265
  if success:
@@ -256,7 +275,7 @@ def validate_resources():
256
275
  """Validate all configured resources"""
257
276
  from .config_loader import Configuration
258
277
  from .database_connector import CONNECTOR_FACTORY
259
- from .models import ConnectionSettings, FileSettings
278
+ from .models import FileSettings
260
279
 
261
280
  config = Configuration()
262
281
  resources = config.get_all_resources()
@@ -267,24 +286,28 @@ def validate_resources():
267
286
 
268
287
  click.echo(f"🔍 Validating {len(resources)} resources...\n")
269
288
 
289
+ from .models import ServerBasedConnectionSettings, FileBasedConnectionSettings
290
+
270
291
  results = {}
271
292
 
272
293
  for name, resource in resources.items():
273
294
  click.echo(f"Resource: {name} ... ", nl=False)
274
295
  try:
275
- if isinstance(resource, ConnectionSettings):
296
+ if isinstance(
297
+ resource, (ServerBasedConnectionSettings, FileBasedConnectionSettings)
298
+ ):
276
299
  # Validate Database Connection
277
- connector = CONNECTOR_FACTORY[resource.conn_server_type](resource)
300
+ CONNECTOR_FACTORY[resource.conn_server_type](resource)
278
301
  # The connector tests connection in __init__, so if we are here it passed
279
302
  results[name] = "OK (Connected)"
280
303
  elif isinstance(resource, FileSettings):
281
304
  # Validate File Path
282
305
  if resource.folder_path.exists():
283
- results[name] = "OK (Path Exists)"
306
+ results[name] = "OK (Path Exists)"
284
307
  else:
285
- raise ValueError(f"Path does not exist: {resource.folder_path}")
308
+ raise ValueError(f"Path does not exist: {resource.folder_path}")
286
309
  else:
287
- results[name] = "UNKNOWN TYPE"
310
+ results[name] = "UNKNOWN TYPE"
288
311
 
289
312
  except Exception as e:
290
313
  results[name] = f"FAILED: {str(e)}"
@@ -299,4 +322,4 @@ def validate_resources():
299
322
 
300
323
 
301
324
  if __name__ == "__main__":
302
- main()
325
+ main()
@@ -1,11 +1,18 @@
1
1
  import importlib.util
2
2
  from pathlib import Path
3
- from typing import Any, Dict, Union
4
3
  from types import ModuleType
5
- from dotenv import dotenv_values
4
+ from typing import Dict, Union
6
5
 
7
6
  from .log import LoggedComponent
8
- from .models import BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition, ConnectionSettings, FileSettings, ResourceConfig
7
+ from .models import (
8
+ BasePipelineDefinition,
9
+ FileSettings,
10
+ OrchestratorDefinition,
11
+ ProcedureDefinition,
12
+ ResourceConfig,
13
+ PipelineType,
14
+ ServerBasedConnectionSettings,
15
+ )
9
16
 
10
17
 
11
18
  class Configuration(LoggedComponent):
@@ -14,6 +21,7 @@ class Configuration(LoggedComponent):
14
21
  Resources and pipelines are loaded only when requested.
15
22
  This class implements the Singleton pattern.
16
23
  """
24
+
17
25
  _instance = None
18
26
  _initialized = False
19
27
 
@@ -26,10 +34,17 @@ class Configuration(LoggedComponent):
26
34
  if not self._initialized:
27
35
  super().__init__()
28
36
  self.config_dir = Path(config_dir)
29
- self.resources : Dict[str, ResourceConfig] = {}
30
- self.pipelines : Dict[str, Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]] = {}
37
+ self.resources: Dict[str, ResourceConfig] = {}
38
+ self.pipelines: Dict[
39
+ str,
40
+ Union[
41
+ BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition
42
+ ],
43
+ ] = {}
31
44
 
32
- self.logger.debug(f"Initializing configuration from directory: {config_dir}")
45
+ self.logger.debug(
46
+ f"Initializing configuration from directory: {config_dir}"
47
+ )
33
48
  self._initialized = True
34
49
 
35
50
  def _load_env_file(self, env_file: Path) -> ResourceConfig:
@@ -38,15 +53,25 @@ class Configuration(LoggedComponent):
38
53
 
39
54
  env_file_name = env_file.stem
40
55
 
41
- if env_file_name.startswith('database_'):
42
- return ConnectionSettings(_env_file=[env_file]) # type: ignore
43
- if env_file_name.startswith('file_'):
44
- return FileSettings(_env_file=[env_file]) # type: ignore
56
+ if env_file_name.startswith("database_"):
57
+ # Peek at the env file to determine which connection class to use
58
+ from dotenv import dotenv_values
59
+
60
+ raw_values = dotenv_values(env_file)
61
+ server_type_str = raw_values.get("CONN_SERVER_TYPE", "").upper()
62
+ if server_type_str == "SQLITE":
63
+ from .models import FileBasedConnectionSettings
64
+
65
+ return FileBasedConnectionSettings(_env_file=[env_file])
66
+ return ServerBasedConnectionSettings(_env_file=[env_file])
67
+ if env_file_name.startswith("file_"):
68
+ return FileSettings(_env_file=[env_file])
45
69
 
46
- self.log_and_raise(ValueError,
70
+ self.log_and_raise(
71
+ ValueError,
47
72
  f"Failed to load env file: {env_file.name}. "
48
- f"Resource files must start with 'database_' or 'file_' prefix."
49
- )
73
+ f"Resource files must start with 'database_' or 'file_' prefix.",
74
+ )
50
75
 
51
76
  def _import_module(self, module_file: Path) -> ModuleType:
52
77
  """Dynamically import a Python module from a file path"""
@@ -101,12 +126,12 @@ class Configuration(LoggedComponent):
101
126
  raise
102
127
 
103
128
  # 3. Not found
104
- self.log_and_raise(ValueError,
105
- f"Resource not found: {resource_name}. "
106
- f"Checked path: {resource_file}"
129
+ self.log_and_raise(
130
+ ValueError,
131
+ f"Resource not found: {resource_name}. Checked path: {resource_file}",
107
132
  )
108
133
 
109
- def get_pipeline(self, pipeline_name: str) -> Union[BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition]:
134
+ def get_pipeline(self, pipeline_name: str) -> PipelineType:
110
135
  """
111
136
  Retrieve a pipeline definition by name.
112
137
  Uses lazy loading: checks memory first, then attempts to load from file.
@@ -125,18 +150,26 @@ class Configuration(LoggedComponent):
125
150
  # Find the definition in the module
126
151
  for attr_name in dir(config_module):
127
152
  attr = getattr(config_module, attr_name)
128
- if isinstance(attr, (BasePipelineDefinition, ProcedureDefinition, OrchestratorDefinition)):
153
+ if isinstance(
154
+ attr,
155
+ (
156
+ BasePipelineDefinition,
157
+ ProcedureDefinition,
158
+ OrchestratorDefinition,
159
+ ),
160
+ ):
129
161
  self.pipelines[pipeline_name] = attr
130
162
  self.logger.info(f"Lazily loaded pipeline: {pipeline_name}")
131
163
  return attr
132
164
  except Exception as e:
133
- self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
134
- raise
165
+ self.log_exception(e, f"Failed to load pipeline: {pipeline_name}")
166
+ raise
135
167
 
136
168
  # 3. Not found
137
- self.log_and_raise(ValueError,
138
- f"Pipeline not found: {pipeline_name}. "
139
- f"Checked path: {pipeline_file}")
169
+ self.log_and_raise(
170
+ ValueError,
171
+ f"Pipeline not found: {pipeline_name}. Checked path: {pipeline_file}",
172
+ )
140
173
 
141
174
  def get_all_resources(self) -> dict[str, ResourceConfig]:
142
175
  """
@@ -149,19 +182,19 @@ class Configuration(LoggedComponent):
149
182
 
150
183
  # Discover all .env files
151
184
  for env_file in resources_dir.glob("*.env"):
152
- # Simple logging to debug discovery
153
- self.logger.debug(f"Found potential resource file: {env_file.name}")
185
+ # Simple logging to debug discovery
186
+ self.logger.debug(f"Found potential resource file: {env_file.name}")
154
187
 
155
- resource_name = env_file.stem
156
- if resource_name not in self.resources:
157
- try:
158
- self.resources[resource_name] = self._load_env_file(env_file)
159
- except Exception as e:
160
- self.logger.warning(f"Failed to load resource {resource_name}: {e}")
188
+ resource_name = env_file.stem
189
+ if resource_name not in self.resources:
190
+ try:
191
+ self.resources[resource_name] = self._load_env_file(env_file)
192
+ except Exception as e:
193
+ self.logger.warning(f"Failed to load resource {resource_name}: {e}")
161
194
 
162
195
  return self.resources
163
196
 
164
- def get_all_pipelines(self) -> dict[str, BasePipelineDefinition]:
197
+ def get_all_pipelines(self) -> dict[str, PipelineType]:
165
198
  """
166
199
  Retrieve all pipelines.
167
200
  Scans the pipelines directory if not all loaded.
@@ -175,10 +208,10 @@ class Configuration(LoggedComponent):
175
208
 
176
209
  pipeline_name = pipeline_file.stem
177
210
  if pipeline_name not in self.pipelines:
178
- # Helper to trigger lazy load
179
- try:
211
+ # Helper to trigger lazy load
212
+ try:
180
213
  self.get_pipeline(pipeline_name)
181
- except Exception as e:
182
- self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")
214
+ except Exception as e:
215
+ self.logger.warning(f"Failed to load pipeline {pipeline_name}: {e}")
183
216
 
184
217
  return self.pipelines
@@ -15,6 +15,7 @@ class DatabaseOperationException(Exception):
15
15
  self.message = message
16
16
  super().__init__(self.message)
17
17
 
18
+
18
19
  class InvalidFileException(Exception):
19
20
  def __init__(self, message: str = "The provided file is invalid or corrupted"):
20
21
  self.message = message