dora-eda 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dora_eda-3.0.1/LICENSE +21 -0
- dora_eda-3.0.1/PKG-INFO +166 -0
- dora_eda-3.0.1/README.md +139 -0
- dora_eda-3.0.1/pyproject.toml +59 -0
- dora_eda-3.0.1/src/dora/__init__.py +1 -0
- dora_eda-3.0.1/src/dora/analyzer.py +99 -0
- dora_eda-3.0.1/src/dora/config_loader.py +27 -0
- dora_eda-3.0.1/src/dora/main.py +296 -0
- dora_eda-3.0.1/src/dora/plots/__init__.py +0 -0
- dora_eda-3.0.1/src/dora/plots/bivariate.py +90 -0
- dora_eda-3.0.1/src/dora/plots/multivariate.py +65 -0
- dora_eda-3.0.1/src/dora/plots/styling.py +62 -0
- dora_eda-3.0.1/src/dora/plots/univariate.py +103 -0
- dora_eda-3.0.1/src/dora/profiling.py +100 -0
- dora_eda-3.0.1/src/dora/reporting/__init__.py +0 -0
- dora_eda-3.0.1/src/dora/reporting/generator.py +70 -0
- dora_eda-3.0.1/src/dora/reporting/templates/report_template.html +208 -0
dora_eda-3.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Asif Sayyed
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dora_eda-3.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dora-eda
|
|
3
|
+
Version: 3.0.1
|
|
4
|
+
Summary: Exploratory data analysis and presentation tool
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Asif Sayyed
|
|
8
|
+
Author-email: asifdotexe@gmail.com
|
|
9
|
+
Requires-Python: >=3.13
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
|
|
15
|
+
Requires-Dist: matplotlib (>=3.10.6,<4.0.0)
|
|
16
|
+
Requires-Dist: numpy (>=2.3.3,<3.0.0)
|
|
17
|
+
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
18
|
+
Requires-Dist: pandas (>=2.3.3,<3.0.0)
|
|
19
|
+
Requires-Dist: pyarrow (>=21.0.0,<22.0.0)
|
|
20
|
+
Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
|
|
21
|
+
Requires-Dist: seaborn (>=0.13.2,<0.14.0)
|
|
22
|
+
Requires-Dist: tqdm (>=4.67.1,<5.0.0)
|
|
23
|
+
Requires-Dist: typer[rich] (>=0.19.2,<0.20.0)
|
|
24
|
+
Project-URL: Repository, https://github.com/Asifdotexe/DORA
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Data Oriented Report Automator (DORA)
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="data/assets/dora-updated-concept.png" alt="DORA Logo" width="200"/>
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
<em align="center">
|
|
34
|
+
An interactive command-line tool to automate Exploratory Data Analysis (EDA) and generate beautiful, insightful reports in seconds.
|
|
35
|
+
</em>
|
|
36
|
+
|
|
37
|
+
## Overview
|
|
38
|
+
|
|
39
|
+
Welcome to DORA! This isn't just a script; it's an intelligent EDA assistant. DORA empowers you to move from a raw dataset to a comprehensive HTML report with minimal effort. It is designed to be powerful and configurable for experts, yet simple enough for anyone to use thanks to its interactive wizard.
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
## 1. Installation
|
|
43
|
+
Install DORA directly from PyPI using pip:
|
|
44
|
+
```bash
|
|
45
|
+
pip install dora-eda
|
|
46
|
+
# check version to validate installation
|
|
47
|
+
dora -v
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 2. Usage
|
|
51
|
+
DORA has two modes of operation: Interactive (for first-time runs) and Config-Driven (for reproducible automation).
|
|
52
|
+
Run DORA without existing configuration (Fresh run)
|
|
53
|
+
|
|
54
|
+
## A. Interactive Mode (Quick Start)
|
|
55
|
+
Simply run the command without arguments. DORA will launch a wizard to guide you through the setup.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
dora
|
|
59
|
+
```
|
|
60
|
+
You will be prompted to:
|
|
61
|
+
- Select your data file (CSV, Excel, JSON, Parquet).
|
|
62
|
+
- Choose an output directory.
|
|
63
|
+
- (Optional) Select a target variable for focused analysis.
|
|
64
|
+
- Pick which analysis steps to perform.
|
|
65
|
+
- (Optional) Save your settings to a `config.yaml` file for future use.
|
|
66
|
+
|
|
67
|
+
## B. Config-Driven Mode (Advanced)
|
|
68
|
+
If you already have a configuration file (e.g., from a previous run), you can skip the wizard and run the analysis immediately.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
dora --config <path/to/config.yaml>
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Example `config.yaml`:**
|
|
75
|
+
```yaml
|
|
76
|
+
# --- Input/Output Settings ---
|
|
77
|
+
input_file: 'data/insurance.csv'
|
|
78
|
+
output_dir: 'output/insurance_report'
|
|
79
|
+
report_title: 'Exploratory Data Analysis of Insurance Premiums'
|
|
80
|
+
|
|
81
|
+
# --- Dataset Settings ---
|
|
82
|
+
target_variable: 'charges'
|
|
83
|
+
|
|
84
|
+
# --- Analysis Pipeline ---
|
|
85
|
+
analysis_pipeline:
|
|
86
|
+
- profile:
|
|
87
|
+
enabled: true
|
|
88
|
+
- univariate:
|
|
89
|
+
enabled: true
|
|
90
|
+
plot_types:
|
|
91
|
+
numerical: ['histogram', 'boxplot']
|
|
92
|
+
categorical: ['barplot']
|
|
93
|
+
- bivariate:
|
|
94
|
+
enabled: true
|
|
95
|
+
target_centric: true
|
|
96
|
+
- multivariate:
|
|
97
|
+
enabled: true
|
|
98
|
+
correlation_cols: ['age', 'bmi', 'children', 'charges']
|
|
99
|
+
```
|
|
100
|
+
## 3. Supported Data Formats
|
|
101
|
+
DORA automatically detects and reads the following file types:
|
|
102
|
+
- CSV (`*.csv`)
|
|
103
|
+
- Excel (`*.xlsx`) - Note: Analyzes the first sheet only.
|
|
104
|
+
- JSON (`*.json`)
|
|
105
|
+
- Parquet (`*.parquet`)
|
|
106
|
+
|
|
107
|
+
## 4. Viewing the Output
|
|
108
|
+
After the analysis is complete, check your output directory for:
|
|
109
|
+
- 📄 `eda_report.html`: The full, interactive report. Open this in any web browser.
|
|
110
|
+
- 📈 `charts/`: A folder containing all generated plots as high-quality images.
|
|
111
|
+
|
|
112
|
+
# Developer Guide
|
|
113
|
+
Interested in contributing to DORA? Awesome! Follow these steps to set up your local development environment.
|
|
114
|
+
|
|
115
|
+
## 1. Prerequisites
|
|
116
|
+
You need Poetry for dependency management.
|
|
117
|
+
```bash
|
|
118
|
+
# Windows (Powershell)
|
|
119
|
+
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | py -
|
|
120
|
+
|
|
121
|
+
# Linux/macOS
|
|
122
|
+
curl -sSL https://install.python-poetry.org | python3 -
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## 2. Setup
|
|
126
|
+
Clone the repository and install dependencies (including dev tools).
|
|
127
|
+
```bash
|
|
128
|
+
git clone https://github.com/Asifdotexe/DORA.git
|
|
129
|
+
cd DORA
|
|
130
|
+
poetry install --with dev
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## 3. Code Quality
|
|
134
|
+
We use standard tools to keep the codebase clean. Please run these before submitting a PR.
|
|
135
|
+
|
|
136
|
+
**Automated Checks (Recommended):**
|
|
137
|
+
Install the pre-commit hooks once, and they will run automatically on every commit.
|
|
138
|
+
```bash
|
|
139
|
+
poetry run pre-commit install
|
|
140
|
+
```
|
|
141
|
+
**Manual Checks:**
|
|
142
|
+
```bash
|
|
143
|
+
# Format code
|
|
144
|
+
poetry run black .
|
|
145
|
+
poetry run isort .
|
|
146
|
+
|
|
147
|
+
# Lint code
|
|
148
|
+
poetry run pylint src/dora
|
|
149
|
+
```
|
|
150
|
+
**Running Tests:**
|
|
151
|
+
```bash
|
|
152
|
+
poetry run pytest
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## 4. How to Contribute
|
|
156
|
+
1. Fork the repository.
|
|
157
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`).
|
|
158
|
+
3. Commit your changes.
|
|
159
|
+
4. Push to the branch.
|
|
160
|
+
5. Open a Pull Request.
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
164
|
+
|
|
165
|
+
Happy analyzing with DORA! 🎉
|
|
166
|
+
|
dora_eda-3.0.1/README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Data Oriented Report Automator (DORA)
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="data/assets/dora-updated-concept.png" alt="DORA Logo" width="200"/>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<em align="center">
|
|
8
|
+
An interactive command-line tool to automate Exploratory Data Analysis (EDA) and generate beautiful, insightful reports in seconds.
|
|
9
|
+
</em>
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
Welcome to DORA! This isn't just a script; it's an intelligent EDA assistant. DORA empowers you to move from a raw dataset to a comprehensive HTML report with minimal effort. It is designed to be powerful and configurable for experts, yet simple enough for anyone to use thanks to its interactive wizard.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## 1. Installation
|
|
17
|
+
Install DORA directly from PyPI using pip:
|
|
18
|
+
```bash
|
|
19
|
+
pip install dora-eda
|
|
20
|
+
# check version to validate installation
|
|
21
|
+
dora -v
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## 2. Usage
|
|
25
|
+
DORA has two modes of operation: Interactive (for first-time runs) and Config-Driven (for reproducible automation).
|
|
26
|
+
Run DORA without existing configuration (Fresh run)
|
|
27
|
+
|
|
28
|
+
## A. Interactive Mode (Quick Start)
|
|
29
|
+
Simply run the command without arguments. DORA will launch a wizard to guide you through the setup.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
dora
|
|
33
|
+
```
|
|
34
|
+
You will be prompted to:
|
|
35
|
+
- Select your data file (CSV, Excel, JSON, Parquet).
|
|
36
|
+
- Choose an output directory.
|
|
37
|
+
- (Optional) Select a target variable for focused analysis.
|
|
38
|
+
- Pick which analysis steps to perform.
|
|
39
|
+
- (Optional) Save your settings to a `config.yaml` file for future use.
|
|
40
|
+
|
|
41
|
+
## B. Config-Driven Mode (Advanced)
|
|
42
|
+
If you already have a configuration file (e.g., from a previous run), you can skip the wizard and run the analysis immediately.
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
dora --config <path/to/config.yaml>
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Example `config.yaml`:**
|
|
49
|
+
```yaml
|
|
50
|
+
# --- Input/Output Settings ---
|
|
51
|
+
input_file: 'data/insurance.csv'
|
|
52
|
+
output_dir: 'output/insurance_report'
|
|
53
|
+
report_title: 'Exploratory Data Analysis of Insurance Premiums'
|
|
54
|
+
|
|
55
|
+
# --- Dataset Settings ---
|
|
56
|
+
target_variable: 'charges'
|
|
57
|
+
|
|
58
|
+
# --- Analysis Pipeline ---
|
|
59
|
+
analysis_pipeline:
|
|
60
|
+
- profile:
|
|
61
|
+
enabled: true
|
|
62
|
+
- univariate:
|
|
63
|
+
enabled: true
|
|
64
|
+
plot_types:
|
|
65
|
+
numerical: ['histogram', 'boxplot']
|
|
66
|
+
categorical: ['barplot']
|
|
67
|
+
- bivariate:
|
|
68
|
+
enabled: true
|
|
69
|
+
target_centric: true
|
|
70
|
+
- multivariate:
|
|
71
|
+
enabled: true
|
|
72
|
+
correlation_cols: ['age', 'bmi', 'children', 'charges']
|
|
73
|
+
```
|
|
74
|
+
## 3. Supported Data Formats
|
|
75
|
+
DORA automatically detects and reads the following file types:
|
|
76
|
+
- CSV (`*.csv`)
|
|
77
|
+
- Excel (`*.xlsx`) - Note: Analyzes the first sheet only.
|
|
78
|
+
- JSON (`*.json`)
|
|
79
|
+
- Parquet (`*.parquet`)
|
|
80
|
+
|
|
81
|
+
## 4. Viewing the Output
|
|
82
|
+
After the analysis is complete, check your output directory for:
|
|
83
|
+
- 📄 `eda_report.html`: The full, interactive report. Open this in any web browser.
|
|
84
|
+
- 📈 `charts/`: A folder containing all generated plots as high-quality images.
|
|
85
|
+
|
|
86
|
+
# Developer Guide
|
|
87
|
+
Interested in contributing to DORA? Awesome! Follow these steps to set up your local development environment.
|
|
88
|
+
|
|
89
|
+
## 1. Prerequisites
|
|
90
|
+
You need Poetry for dependency management.
|
|
91
|
+
```bash
|
|
92
|
+
# Windows (Powershell)
|
|
93
|
+
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | py -
|
|
94
|
+
|
|
95
|
+
# Linux/macOS
|
|
96
|
+
curl -sSL https://install.python-poetry.org | python3 -
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 2. Setup
|
|
100
|
+
Clone the repository and install dependencies (including dev tools).
|
|
101
|
+
```bash
|
|
102
|
+
git clone https://github.com/Asifdotexe/DORA.git
|
|
103
|
+
cd DORA
|
|
104
|
+
poetry install --with dev
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## 3. Code Quality
|
|
108
|
+
We use standard tools to keep the codebase clean. Please run these before submitting a PR.
|
|
109
|
+
|
|
110
|
+
**Automated Checks (Recommended):**
|
|
111
|
+
Install the pre-commit hooks once, and they will run automatically on every commit.
|
|
112
|
+
```bash
|
|
113
|
+
poetry run pre-commit install
|
|
114
|
+
```
|
|
115
|
+
**Manual Checks:**
|
|
116
|
+
```bash
|
|
117
|
+
# Format code
|
|
118
|
+
poetry run black .
|
|
119
|
+
poetry run isort .
|
|
120
|
+
|
|
121
|
+
# Lint code
|
|
122
|
+
poetry run pylint src/dora
|
|
123
|
+
```
|
|
124
|
+
**Running Tests:**
|
|
125
|
+
```bash
|
|
126
|
+
poetry run pytest
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 4. How to Contribute
|
|
130
|
+
1. Fork the repository.
|
|
131
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`).
|
|
132
|
+
3. Commit your changes.
|
|
133
|
+
4. Push to the branch.
|
|
134
|
+
5. Open a Pull Request.
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
138
|
+
|
|
139
|
+
Happy analyzing with DORA! 🎉
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dora-eda"
|
|
3
|
+
version = "3.0.1"
|
|
4
|
+
description = "Exploratory data analysis and presentation tool"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Asif Sayyed",email = "asifdotexe@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
license = {text = "MIT"}
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.13"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas (>=2.3.3,<3.0.0)",
|
|
13
|
+
"seaborn (>=0.13.2,<0.14.0)",
|
|
14
|
+
"numpy (>=2.3.3,<3.0.0)",
|
|
15
|
+
"matplotlib (>=3.10.6,<4.0.0)",
|
|
16
|
+
"tqdm (>=4.67.1,<5.0.0)",
|
|
17
|
+
"typer[rich] (>=0.19.2,<0.20.0)",
|
|
18
|
+
"pyyaml (>=6.0.3,<7.0.0)",
|
|
19
|
+
"jinja2 (>=3.1.6,<4.0.0)",
|
|
20
|
+
"openpyxl (>=3.1.5,<4.0.0)",
|
|
21
|
+
"pyarrow (>=21.0.0,<22.0.0)"
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"License :: OSI Approved :: MIT License",
|
|
27
|
+
"Operating System :: OS Independent",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
urls = {Repository = "https://github.com/Asifdotexe/DORA"}
|
|
32
|
+
|
|
33
|
+
[tool.poetry]
|
|
34
|
+
package-mode = true
|
|
35
|
+
packages = [{ include = "dora", from = "src" }]
|
|
36
|
+
|
|
37
|
+
[tool.poetry.scripts]
|
|
38
|
+
dora = "dora.main:main"
|
|
39
|
+
|
|
40
|
+
[tool.poetry.group.dev.dependencies]
|
|
41
|
+
pylint = "^3.3.9"
|
|
42
|
+
black = "^25.9.0"
|
|
43
|
+
isort = "^6.1.0"
|
|
44
|
+
pre-commit = "^4.3.0"
|
|
45
|
+
ipykernel = "^6.30.1"
|
|
46
|
+
pytest = "^8.4.2"
|
|
47
|
+
pytest-benchmark = "^5.1.0"
|
|
48
|
+
snakeviz = "^2.2.2"
|
|
49
|
+
|
|
50
|
+
[tool.pylint.format]
|
|
51
|
+
max-line-length = 120
|
|
52
|
+
|
|
53
|
+
[tool.pylint.messages_control]
|
|
54
|
+
disable = ["redefined-outer-name", "invalid-name", "assignment-from-no-return", "too-many-branches",
|
|
55
|
+
"unused-argument", "unnecessary-pass", "fixme", "too-few-public-methods"]
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
59
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module will orchestrate the analysis
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from .plots import bivariate, multivariate, univariate
|
|
11
|
+
from .profiling import generate_profile
|
|
12
|
+
from .reporting.generator import create_report
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Analyzer:
    """
    Orchestrates the entire exploratory data analysis process based on configuration.

    The analyzer walks the ``analysis_pipeline`` list from the config, runs every
    enabled step, stores each step's results in ``report_data`` and finally hands
    that dictionary to the report generator.
    """

    def __init__(self, df: pd.DataFrame, config: dict):
        """
        Store the inputs and prepare the output folders.

        :param df: The dataset to analyze
        :param config: The configuration dictionary; must contain ``output_dir``
        :raises KeyError: If ``output_dir`` is missing from the config
        """
        self.df = df
        self.config = config
        self.output_dir = self.config["output_dir"]
        self.charts_dir = os.path.join(self.output_dir, "charts")
        self.report_data = {"title": self.config.get("report_title", "EDA Report")}

        # The charts folder lives inside the output folder, so creating it
        # up front also creates the output folder itself.
        os.makedirs(self.charts_dir, exist_ok=True)

    def run(self):
        """
        Executes the analysis pipeline defined in the config.

        Each pipeline entry is expected to be a single-key mapping of the form
        ``{step_name: {"enabled": bool, ...}}``. Entries that are malformed,
        disabled or unknown are skipped (malformed and unknown ones with a
        warning) so that one bad entry does not abort the whole run. The HTML
        report is generated after all steps have been processed.
        """
        # ``or []`` covers a config that lists the key but leaves it empty (YAML null).
        pipeline = self.config.get("analysis_pipeline") or []

        # Every handler takes the step's params; profiling simply ignores them.
        handlers = {
            "profile": lambda params: self._run_profiling(),
            "univariate": self._run_univariate,
            "bivariate": self._run_bivariate,
            "multivariate": self._run_multivariate,
        }

        for step in pipeline:
            if not isinstance(step, dict) or not step:
                logging.warning("Skipping malformed pipeline step: %r", step)
                continue

            # Only the first key of a step is honoured, matching the documented
            # one-step-per-list-item layout.
            step_name = next(iter(step))
            params = step[step_name]

            # We only run the steps that the user has explicitly enabled in the config.
            if not isinstance(params, dict) or not params.get("enabled", False):
                continue

            handler = handlers.get(step_name)
            if handler is None:
                logging.warning("Unknown pipeline step '%s'. Skipping.", step_name)
                continue

            logging.info("--- Running Step: %s ---", step_name.capitalize())
            handler(params)

        # After all the analysis is done, everything is compiled into the report.
        self._generate_report()

    def _run_profiling(self):
        """Profile the dataset and store the result under ``report_data["profile"]``."""
        profile_results = generate_profile(self.df)
        self.report_data["profile"] = profile_results

    def _run_univariate(self, params: dict):
        """
        Generate the per-column plots and store them under ``report_data["univariate_plots"]``.

        :param params: The ``univariate`` section of the pipeline config
        """
        univariate_plots = univariate.generate_plots(self.df, self.charts_dir, params)
        self.report_data["univariate_plots"] = univariate_plots

    def _run_bivariate(self, params: dict):
        """
        Generate the bivariate plots and store them under ``report_data["bivariate_plots"]``.

        :param params: The ``bivariate`` section of the pipeline config
        """
        target = self.config.get("target_variable")

        # A target-centric analysis without a target wouldn't make sense,
        # so the step is skipped instead of failing.
        if params.get("target_centric") and not target:
            logging.warning(
                "Bivariate 'target_centric' is true, but no 'target_variable' is defined. Skipping."
            )
            return

        bivariate_plots = bivariate.generate_plots(
            self.df, target, self.charts_dir, params
        )
        self.report_data["bivariate_plots"] = bivariate_plots

    def _run_multivariate(self, params: dict):
        """
        Generate the multivariate plots and store them under ``report_data["multivariate_plots"]``.

        :param params: The ``multivariate`` section of the pipeline config
        """
        multivariate_plots = multivariate.generate_plots(
            self.df, self.charts_dir, params
        )
        self.report_data["multivariate_plots"] = multivariate_plots

    def _generate_report(self):
        """Collate everything gathered in ``report_data`` into a single HTML report."""
        logging.info("--- Generating HTML Report ---")
        create_report(self.report_data, self.output_dir)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module helps to load and validate the configuration file
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_config(config_path: Path) -> dict:
    """
    Load and validate the YAML configuration file

    :param config_path: Path to the YAML configuration file
    :returns: A dictionary containing the configurations
    :raises ValueError: If the top level of the file is not a mapping, or if
        'input_file' or 'output_dir' is missing
    """
    logging.info("Reading the configuration file from %s", config_path)
    with open(config_path, "r", encoding="utf-8") as file:
        # An empty file parses to None; fall back to an empty config so the
        # validation below reports the missing keys instead of crashing.
        config = yaml.safe_load(file) or {}

    # validation
    # A top-level string or list would otherwise slip through the ``in``
    # checks below (substring / element match) and be returned as a non-dict.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file must contain a mapping at the top level, got {type(config).__name__}."
        )

    if "input_file" not in config or "output_dir" not in config:
        raise ValueError("Config file must contain 'input_file' and 'output_dir'.")

    logging.info("Configuration loaded successfully")
    return config
|