khadee-eda 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khadee_eda-1.0.0/PKG-INFO +193 -0
- khadee_eda-1.0.0/README.md +180 -0
- khadee_eda-1.0.0/khadee_eda/__init__.py +194 -0
- khadee_eda-1.0.0/khadee_eda/assets/script.js +137 -0
- khadee_eda-1.0.0/khadee_eda/assets/style.css +1336 -0
- khadee_eda-1.0.0/khadee_eda/clean.py +287 -0
- khadee_eda-1.0.0/khadee_eda/config.py +121 -0
- khadee_eda-1.0.0/khadee_eda/engines/__init__.py +1 -0
- khadee_eda-1.0.0/khadee_eda/engines/correlation_engine.py +115 -0
- khadee_eda-1.0.0/khadee_eda/engines/dim_reduction.py +152 -0
- khadee_eda-1.0.0/khadee_eda/engines/missing_engine.py +170 -0
- khadee_eda-1.0.0/khadee_eda/engines/outlier_engine.py +190 -0
- khadee_eda-1.0.0/khadee_eda/engines/stats_engine.py +200 -0
- khadee_eda-1.0.0/khadee_eda/loader.py +221 -0
- khadee_eda-1.0.0/khadee_eda/renderers/__init__.py +1 -0
- khadee_eda-1.0.0/khadee_eda/renderers/chart_renderer.py +547 -0
- khadee_eda-1.0.0/khadee_eda/renderers/html_renderer.py +128 -0
- khadee_eda-1.0.0/khadee_eda/renderers/table_renderer.py +38 -0
- khadee_eda-1.0.0/khadee_eda/sections/__init__.py +1 -0
- khadee_eda-1.0.0/khadee_eda/sections/advanced_stats.py +47 -0
- khadee_eda-1.0.0/khadee_eda/sections/correlations.py +125 -0
- khadee_eda-1.0.0/khadee_eda/sections/distributions.py +102 -0
- khadee_eda-1.0.0/khadee_eda/sections/interactions.py +84 -0
- khadee_eda-1.0.0/khadee_eda/sections/missing.py +102 -0
- khadee_eda-1.0.0/khadee_eda/sections/model_readiness.py +334 -0
- khadee_eda-1.0.0/khadee_eda/sections/outliers.py +94 -0
- khadee_eda-1.0.0/khadee_eda/sections/overview.py +174 -0
- khadee_eda-1.0.0/khadee_eda/sections/sample.py +109 -0
- khadee_eda-1.0.0/khadee_eda/sections/variables.py +348 -0
- khadee_eda-1.0.0/khadee_eda/techniques/__init__.py +2 -0
- khadee_eda-1.0.0/khadee_eda/techniques/china.py +125 -0
- khadee_eda-1.0.0/khadee_eda/techniques/india.py +167 -0
- khadee_eda-1.0.0/khadee_eda/techniques/japan.py +177 -0
- khadee_eda-1.0.0/khadee_eda/techniques/us.py +108 -0
- khadee_eda-1.0.0/khadee_eda/type_detector.py +169 -0
- khadee_eda-1.0.0/khadee_eda/utils.py +130 -0
- khadee_eda-1.0.0/khadee_eda.egg-info/PKG-INFO +193 -0
- khadee_eda-1.0.0/khadee_eda.egg-info/SOURCES.txt +41 -0
- khadee_eda-1.0.0/khadee_eda.egg-info/dependency_links.txt +1 -0
- khadee_eda-1.0.0/khadee_eda.egg-info/top_level.txt +1 -0
- khadee_eda-1.0.0/pyproject.toml +14 -0
- khadee_eda-1.0.0/setup.cfg +4 -0
- khadee_eda-1.0.0/setup.py +42 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: khadee-eda
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Deep Insights EDA — Comprehensive data profiling with global AI techniques
|
|
5
|
+
Home-page: https://github.com/khadee/khadee-eda
|
|
6
|
+
Author: Khadee
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Dynamic: author
|
|
11
|
+
Dynamic: home-page
|
|
12
|
+
Dynamic: requires-python
|
|
13
|
+
|
|
14
|
+
# 🔬 Khadee EDA — Deep Insights Data Profiling & Cleaning
|
|
15
|
+
|
|
16
|
+
[Python 3.8+](https://www.python.org/)
|
|
17
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
18
|
+
[]()
|
|
19
|
+
[]()
|
|
20
|
+
|
|
21
|
+
`khadee-eda` is a next-generation, high-performance exploratory data analysis (EDA) and data cleaning library. It generates **stunning, glassmorphism-themed interactive HTML profiling reports** from any dataset and provides a robust, lightweight suite of cleaning tools equivalent to `dataprep.clean`.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## ⚡ Quick Start
|
|
26
|
+
|
|
27
|
+
### 1. Generating a Profiling Report
|
|
28
|
+
Auto-detects and loads data from CSV, Excel, JSON, Parquet, SQLite, and 10+ other formats.
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from khadee_eda import ProfileReport
|
|
32
|
+
|
|
33
|
+
# Method A: Direct one-liner from file path
|
|
34
|
+
report = ProfileReport("train.csv", title="E-Commerce Analysis")
|
|
35
|
+
report.to_html("report.html")
|
|
36
|
+
|
|
37
|
+
# Method B: From an existing Pandas DataFrame
|
|
38
|
+
import pandas as pd
|
|
39
|
+
df = pd.read_csv("train.csv")
|
|
40
|
+
report = ProfileReport(df, title="Customer Profiles")
|
|
41
|
+
report.to_html("report.html")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 2. High-Performance Data Cleaning (`khadee_eda.clean`)
|
|
45
|
+
Direct, unified API for cleaning, standardizing, and preparing data (a lightweight alternative to `dataprep.clean`).
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from khadee_eda import clean
|
|
49
|
+
|
|
50
|
+
# Clean column headers (standardize to snake_case, PascalCase, camelCase, etc.)
|
|
51
|
+
df_clean = clean.clean_headers(df, case="snake")
|
|
52
|
+
|
|
53
|
+
# Impute missing values with mean, median, mode, or constant value
|
|
54
|
+
df_clean = clean.clean_missing(df_clean, columns=["age", "income"], strategy="median")
|
|
55
|
+
|
|
56
|
+
# Handle outliers by clipping (winsorization) or dropping rows
|
|
57
|
+
df_clean = clean.clean_outliers(df_clean, columns=["fare"], method="iqr", strategy="clip")
|
|
58
|
+
|
|
59
|
+
# Normalize and clean text columns (strip spaces, lowercase, remove special characters)
|
|
60
|
+
df_clean = clean.clean_text(df_clean, columns=["product_desc"], lowercase=True, remove_special=True)
|
|
61
|
+
|
|
62
|
+
# Remove duplicate rows
|
|
63
|
+
df_clean = clean.clean_duplicates(df_clean, columns=["user_id"])
|
|
64
|
+
|
|
65
|
+
# Run a complete, standard cleanup pass
|
|
66
|
+
df_clean = clean.clean_df(df)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## 📊 10 Structured Analysis Sections
|
|
72
|
+
|
|
73
|
+
Each HTML report is divided into 10 structured, deeply interactive sections:
|
|
74
|
+
|
|
75
|
+
1. **🏠 Overview**: High-level dataset shapes, reproduction metadata, alerts (missing cells, zero values, extreme correlations, duplicates), and detected data types.
|
|
76
|
+
2. **📊 Variables (Interactive Dropdown Explorer)**: Detailed statistics per variable (quantiles, descriptives, frequencies, categories). Includes a custom select dropdown menu to show/hide column details and dynamically resize Plotly visualizations.
|
|
77
|
+
3. **📈 Distributions**: Visual analysis of distributions via histogram grids, kernel density estimations (KDE), skewness, kurtosis, and normality tests.
|
|
78
|
+
4. **🔗 Correlations**: Pairwise comparison matrices using **Pearson**, **Spearman**, **Kendall**, and **Cramér's V** metrics represented as interactive heatmaps.
|
|
79
|
+
5. **❓ Missing Values**: Visual representation of missing data via matrices, counts, and imputation recommendations.
|
|
80
|
+
6. **🎯 Outliers**: Deep outlier diagnostic detailing detection using IQR, Z-score, Median Absolute Deviation (MAD), and Isolation Forest.
|
|
81
|
+
7. **🔄 Interactions**: Interactive bivariate scatter plots and grouped box plots.
|
|
82
|
+
8. **📐 Advanced Stats (Global AI Hub Methodologies)**: Unique statistical and machine learning frameworks modeled after analytical cultures across the globe (see below).
|
|
83
|
+
9. **🤖 Model Readiness**: Preprocessing checklists, ML model suitability rankings, and code recommendation generators.
|
|
84
|
+
10. **📋 Sample**: Interactive data table viewer showing the head, tail, duplicates, and data dictionary.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## 🌍 Global EDA Techniques
|
|
89
|
+
|
|
90
|
+
The **Advanced Stats** section includes 4 distinct regional analytical philosophies:
|
|
91
|
+
|
|
92
|
+
* **🇺🇸 US (ML-Readiness & Feature Engineering)**: Identifies feature importance, flags target leakage, and proposes engineered features.
|
|
93
|
+
* **🇮🇳 India (Statistical Foundations & Hypothesis Testing)**: Evaluates confidence intervals, conducts hypothesis testing, and fits target distributions.
|
|
94
|
+
* **🇯🇵 Japan (Quality Control & Process Analytics — Kaizen)**: Implements Shewhart control charts, calculates Process Capability Indexes ($C_p$/$C_{pk}$), checks stability indicators, and generates Pareto charts.
|
|
95
|
+
* **🇨🇳 China (Large-Scale Pattern Recognition)**: Generates PCA projections, evaluates Hopkins clustering statistics, provides K-Means elbow curves, and profiles data density.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## 📂 Supported Formats
|
|
100
|
+
|
|
101
|
+
No need to write separate loading code. `khadee-eda` automatically detects your dataset extension and uses optimized engines to parse it:
|
|
102
|
+
|
|
103
|
+
| Format | Extensions | Parser |
|
|
104
|
+
| :--- | :--- | :--- |
|
|
105
|
+
| **CSV / TSV** | `.csv`, `.tsv`, `.txt` | Pandas optimized parser with latin-1 fallback |
|
|
106
|
+
| **Excel** | `.xlsx`, `.xls`, `.xlsm`, `.xlsb` | openpyxl / xlrd engine |
|
|
107
|
+
| **JSON** | `.json` | Standard and JSON-lines parsed dynamically |
|
|
108
|
+
| **Parquet / Feather** | `.parquet`, `.feather` | PyArrow engine |
|
|
109
|
+
| **SQLite** | `.db`, `.sqlite`, `.sqlite3` | Built-in SQLite connection reader |
|
|
110
|
+
| **Pickle** | `.pkl`, `.pickle` | Standard Python pickle serializer |
|
|
111
|
+
| **Others** | `.h5`, `.hdf5`, `.xml`, `.dta`, `.sas7bdat`, `.sav` | Supporting PyTables, XML, Stata, SAS, SPSS |
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## 💾 Package Footprint & Download Size
|
|
116
|
+
|
|
117
|
+
Unlike heavier packages that bundle thick C++ binaries, `khadee-eda` is designed to be **incredibly lightweight** and fast to download.
|
|
118
|
+
|
|
119
|
+
### 1. Download Size (Pip / UV)
|
|
120
|
+
* **Wheel Size (`.whl`)**: **~85 KB**
|
|
121
|
+
* **Source Distribution (`.tar.gz`)**: **~90 KB**
|
|
122
|
+
* **Package Source Size**: **~170 KB** (Clean, pure Python logic + minimal glassmorphism style assets)
|
|
123
|
+
|
|
124
|
+
### 2. Dependency Size
|
|
125
|
+
If your machine already has standard data science packages (like `pandas`, `numpy`, `scipy`) cached, the installation completes instantly (~85 KB download). If installing into a blank virtual environment, pip/uv will download the scientific stack:
|
|
126
|
+
|
|
127
|
+
| Dependency | Purpose | Download Size (Approx.) |
|
|
128
|
+
| :--- | :--- | :--- |
|
|
129
|
+
| **pandas** | Data manipulation & structure | ~12 - 15 MB |
|
|
130
|
+
| **numpy** | Array computations | ~14 - 18 MB |
|
|
131
|
+
| **scipy** | Advanced statistics & tests | ~35 - 40 MB |
|
|
132
|
+
| **scikit-learn** | Machine learning engines & PCA | ~7 - 9 MB |
|
|
133
|
+
| **plotly** | Dynamic SVG visualizations | ~7 - 8 MB |
|
|
134
|
+
| **pyarrow** | High-performance Parquet storage | ~30 - 35 MB |
|
|
135
|
+
| **openpyxl** | Excel read/write compatibility | ~2 - 3 MB |
|
|
136
|
+
| **jinja2** | HTML templating engine | ~0.2 MB |
|
|
137
|
+
| **Total Dependencies** | Full Scientific Stack | **~110 - 130 MB** |
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## ⚙️ Selective Reports
|
|
142
|
+
|
|
143
|
+
Save compute time and reduce HTML sizes for large datasets by only rendering the sections or techniques you need:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from khadee_eda import ProfileReport
|
|
147
|
+
|
|
148
|
+
# Profile only Specific Sections
|
|
149
|
+
report = ProfileReport(
|
|
150
|
+
"dataset.csv",
|
|
151
|
+
sections=["overview", "variables", "model_readiness"]
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# Render only Specific Global Techniques
|
|
155
|
+
report = ProfileReport(
|
|
156
|
+
"dataset.csv",
|
|
157
|
+
techniques=["japan", "us"]
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## 💎 Design & Visual Performance Excellence
|
|
164
|
+
|
|
165
|
+
* **Glassmorphism Dark Theme**: Standard EDA reports often look like boring 2010 tables. `khadee-eda` features a high-end, dark glassmorphism dashboard with neon accents, dynamic hover states, and smooth CSS micro-animations.
|
|
166
|
+
* **Instant PDF Export**: Features a beautiful floating "Download PDF" button that triggers browser printing. The custom media print styles automatically expand all hidden column cards, expand all tabs, hide navigational elements/dropdowns, and switch to a crisp ink-saving light template for a clean, professional corporate report.
|
|
167
|
+
* **WebGL Crash Mitigation**: Rendering dozens of ScatterGL plots on a single page causes modern browsers to exceed their WebGL context limit, crash, and display blank charts. `khadee-eda` compiles Scatter plots to optimized vector SVG path strings, ensuring **100% chart rendering reliability** without sacrificing interactive zoom or hover features.
|
|
168
|
+
* **Smart Dropdown Selectors**: Instead of scrolling endlessly through dozens of columns, the report includes a dynamic select element to view one variable card at a time, instantly resizing the embedded Plotly chart to prevent layout distortions.
|
|
169
|
+
* **Copyable Preprocessing Recommender**: When the library suggests cleaning operations (e.g., standardizing headers or imputing values), it displays a syntax-highlighted code block with a one-click copy button, generating context-aware code ready for your pipeline.
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## 📦 Installation
|
|
174
|
+
|
|
175
|
+
To install `khadee-eda` in development mode locally:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
git clone https://github.com/khadee/khadee-eda.git
|
|
179
|
+
cd khadee-eda
|
|
180
|
+
pip install -e .
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
To install directly using `uv` (recommended for extreme speed):
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
uv pip install -e .
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## 📄 License
|
|
192
|
+
|
|
193
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# 🔬 Khadee EDA — Deep Insights Data Profiling & Cleaning
|
|
2
|
+
|
|
3
|
+
[Python 3.8+](https://www.python.org/)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
[]()
|
|
6
|
+
[]()
|
|
7
|
+
|
|
8
|
+
`khadee-eda` is a next-generation, high-performance exploratory data analysis (EDA) and data cleaning library. It generates **stunning, glassmorphism-themed interactive HTML profiling reports** from any dataset and provides a robust, lightweight suite of cleaning tools equivalent to `dataprep.clean`.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## ⚡ Quick Start
|
|
13
|
+
|
|
14
|
+
### 1. Generating a Profiling Report
|
|
15
|
+
Auto-detects and loads data from CSV, Excel, JSON, Parquet, SQLite, and 10+ other formats.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from khadee_eda import ProfileReport
|
|
19
|
+
|
|
20
|
+
# Method A: Direct one-liner from file path
|
|
21
|
+
report = ProfileReport("train.csv", title="E-Commerce Analysis")
|
|
22
|
+
report.to_html("report.html")
|
|
23
|
+
|
|
24
|
+
# Method B: From an existing Pandas DataFrame
|
|
25
|
+
import pandas as pd
|
|
26
|
+
df = pd.read_csv("train.csv")
|
|
27
|
+
report = ProfileReport(df, title="Customer Profiles")
|
|
28
|
+
report.to_html("report.html")
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### 2. High-Performance Data Cleaning (`khadee_eda.clean`)
|
|
32
|
+
Direct, unified API for cleaning, standardizing, and preparing data (a lightweight alternative to `dataprep.clean`).
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from khadee_eda import clean
|
|
36
|
+
|
|
37
|
+
# Clean column headers (standardize to snake_case, PascalCase, camelCase, etc.)
|
|
38
|
+
df_clean = clean.clean_headers(df, case="snake")
|
|
39
|
+
|
|
40
|
+
# Impute missing values with mean, median, mode, or constant value
|
|
41
|
+
df_clean = clean.clean_missing(df_clean, columns=["age", "income"], strategy="median")
|
|
42
|
+
|
|
43
|
+
# Handle outliers by clipping (winsorization) or dropping rows
|
|
44
|
+
df_clean = clean.clean_outliers(df_clean, columns=["fare"], method="iqr", strategy="clip")
|
|
45
|
+
|
|
46
|
+
# Normalize and clean text columns (strip spaces, lowercase, remove special characters)
|
|
47
|
+
df_clean = clean.clean_text(df_clean, columns=["product_desc"], lowercase=True, remove_special=True)
|
|
48
|
+
|
|
49
|
+
# Remove duplicate rows
|
|
50
|
+
df_clean = clean.clean_duplicates(df_clean, columns=["user_id"])
|
|
51
|
+
|
|
52
|
+
# Run a complete, standard cleanup pass
|
|
53
|
+
df_clean = clean.clean_df(df)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 📊 10 Structured Analysis Sections
|
|
59
|
+
|
|
60
|
+
Each HTML report is divided into 10 structured, deeply interactive sections:
|
|
61
|
+
|
|
62
|
+
1. **🏠 Overview**: High-level dataset shapes, reproduction metadata, alerts (missing cells, zero values, extreme correlations, duplicates), and detected data types.
|
|
63
|
+
2. **📊 Variables (Interactive Dropdown Explorer)**: Detailed statistics per variable (quantiles, descriptives, frequencies, categories). Includes a custom select dropdown menu to show/hide column details and dynamically resize Plotly visualizations.
|
|
64
|
+
3. **📈 Distributions**: Visual analysis of distributions via histogram grids, kernel density estimations (KDE), skewness, kurtosis, and normality tests.
|
|
65
|
+
4. **🔗 Correlations**: Pairwise comparison matrices using **Pearson**, **Spearman**, **Kendall**, and **Cramér's V** metrics represented as interactive heatmaps.
|
|
66
|
+
5. **❓ Missing Values**: Visual representation of missing data via matrices, counts, and imputation recommendations.
|
|
67
|
+
6. **🎯 Outliers**: Deep outlier diagnostic detailing detection using IQR, Z-score, Median Absolute Deviation (MAD), and Isolation Forest.
|
|
68
|
+
7. **🔄 Interactions**: Interactive bivariate scatter plots and grouped box plots.
|
|
69
|
+
8. **📐 Advanced Stats (Global AI Hub Methodologies)**: Unique statistical and machine learning frameworks modeled after analytical cultures across the globe (see below).
|
|
70
|
+
9. **🤖 Model Readiness**: Preprocessing checklists, ML model suitability rankings, and code recommendation generators.
|
|
71
|
+
10. **📋 Sample**: Interactive data table viewer showing the head, tail, duplicates, and data dictionary.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## 🌍 Global EDA Techniques
|
|
76
|
+
|
|
77
|
+
The **Advanced Stats** section includes 4 distinct regional analytical philosophies:
|
|
78
|
+
|
|
79
|
+
* **🇺🇸 US (ML-Readiness & Feature Engineering)**: Identifies feature importance, flags target leakage, and proposes engineered features.
|
|
80
|
+
* **🇮🇳 India (Statistical Foundations & Hypothesis Testing)**: Evaluates confidence intervals, conducts hypothesis testing, and fits target distributions.
|
|
81
|
+
* **🇯🇵 Japan (Quality Control & Process Analytics — Kaizen)**: Implements Shewhart control charts, calculates Process Capability Indexes ($C_p$/$C_{pk}$), checks stability indicators, and generates Pareto charts.
|
|
82
|
+
* **🇨🇳 China (Large-Scale Pattern Recognition)**: Generates PCA projections, evaluates Hopkins clustering statistics, provides K-Means elbow curves, and profiles data density.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## 📂 Supported Formats
|
|
87
|
+
|
|
88
|
+
No need to write separate loading code. `khadee-eda` automatically detects your dataset extension and uses optimized engines to parse it:
|
|
89
|
+
|
|
90
|
+
| Format | Extensions | Parser |
|
|
91
|
+
| :--- | :--- | :--- |
|
|
92
|
+
| **CSV / TSV** | `.csv`, `.tsv`, `.txt` | Pandas optimized parser with latin-1 fallback |
|
|
93
|
+
| **Excel** | `.xlsx`, `.xls`, `.xlsm`, `.xlsb` | openpyxl / xlrd engine |
|
|
94
|
+
| **JSON** | `.json` | Standard and JSON-lines parsed dynamically |
|
|
95
|
+
| **Parquet / Feather** | `.parquet`, `.feather` | PyArrow engine |
|
|
96
|
+
| **SQLite** | `.db`, `.sqlite`, `.sqlite3` | Built-in SQLite connection reader |
|
|
97
|
+
| **Pickle** | `.pkl`, `.pickle` | Standard Python pickle serializer |
|
|
98
|
+
| **Others** | `.h5`, `.hdf5`, `.xml`, `.dta`, `.sas7bdat`, `.sav` | Supporting PyTables, XML, Stata, SAS, SPSS |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 💾 Package Footprint & Download Size
|
|
103
|
+
|
|
104
|
+
Unlike heavier packages that bundle thick C++ binaries, `khadee-eda` is designed to be **incredibly lightweight** and fast to download.
|
|
105
|
+
|
|
106
|
+
### 1. Download Size (Pip / UV)
|
|
107
|
+
* **Wheel Size (`.whl`)**: **~85 KB**
|
|
108
|
+
* **Source Distribution (`.tar.gz`)**: **~90 KB**
|
|
109
|
+
* **Package Source Size**: **~170 KB** (Clean, pure Python logic + minimal glassmorphism style assets)
|
|
110
|
+
|
|
111
|
+
### 2. Dependency Size
|
|
112
|
+
If your machine already has standard data science packages (like `pandas`, `numpy`, `scipy`) cached, the installation completes instantly (~85 KB download). If installing into a blank virtual environment, pip/uv will download the scientific stack:
|
|
113
|
+
|
|
114
|
+
| Dependency | Purpose | Download Size (Approx.) |
|
|
115
|
+
| :--- | :--- | :--- |
|
|
116
|
+
| **pandas** | Data manipulation & structure | ~12 - 15 MB |
|
|
117
|
+
| **numpy** | Array computations | ~14 - 18 MB |
|
|
118
|
+
| **scipy** | Advanced statistics & tests | ~35 - 40 MB |
|
|
119
|
+
| **scikit-learn** | Machine learning engines & PCA | ~7 - 9 MB |
|
|
120
|
+
| **plotly** | Dynamic SVG visualizations | ~7 - 8 MB |
|
|
121
|
+
| **pyarrow** | High-performance Parquet storage | ~30 - 35 MB |
|
|
122
|
+
| **openpyxl** | Excel read/write compatibility | ~2 - 3 MB |
|
|
123
|
+
| **jinja2** | HTML templating engine | ~0.2 MB |
|
|
124
|
+
| **Total Dependencies** | Full Scientific Stack | **~110 - 130 MB** |
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## ⚙️ Selective Reports
|
|
129
|
+
|
|
130
|
+
Save compute time and reduce HTML sizes for large datasets by only rendering the sections or techniques you need:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from khadee_eda import ProfileReport
|
|
134
|
+
|
|
135
|
+
# Profile only Specific Sections
|
|
136
|
+
report = ProfileReport(
|
|
137
|
+
"dataset.csv",
|
|
138
|
+
sections=["overview", "variables", "model_readiness"]
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Render only Specific Global Techniques
|
|
142
|
+
report = ProfileReport(
|
|
143
|
+
"dataset.csv",
|
|
144
|
+
techniques=["japan", "us"]
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 💎 Design & Visual Performance Excellence
|
|
151
|
+
|
|
152
|
+
* **Glassmorphism Dark Theme**: Standard EDA reports often look like boring 2010 tables. `khadee-eda` features a high-end, dark glassmorphism dashboard with neon accents, dynamic hover states, and smooth CSS micro-animations.
|
|
153
|
+
* **Instant PDF Export**: Features a beautiful floating "Download PDF" button that triggers browser printing. The custom media print styles automatically expand all hidden column cards, expand all tabs, hide navigational elements/dropdowns, and switch to a crisp ink-saving light template for a clean, professional corporate report.
|
|
154
|
+
* **WebGL Crash Mitigation**: Rendering dozens of ScatterGL plots on a single page causes modern browsers to exceed their WebGL context limit, crash, and display blank charts. `khadee-eda` compiles Scatter plots to optimized vector SVG path strings, ensuring **100% chart rendering reliability** without sacrificing interactive zoom or hover features.
|
|
155
|
+
* **Smart Dropdown Selectors**: Instead of scrolling endlessly through dozens of columns, the report includes a dynamic select element to view one variable card at a time, instantly resizing the embedded Plotly chart to prevent layout distortions.
|
|
156
|
+
* **Copyable Preprocessing Recommender**: When the library suggests cleaning operations (e.g., standardizing headers or imputing values), it displays a syntax-highlighted code block with a one-click copy button, generating context-aware code ready for your pipeline.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 📦 Installation
|
|
161
|
+
|
|
162
|
+
To install `khadee-eda` in development mode locally:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
git clone https://github.com/khadee/khadee-eda.git
|
|
166
|
+
cd khadee-eda
|
|
167
|
+
pip install -e .
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
To install directly using `uv` (recommended for extreme speed):
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
uv pip install -e .
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## 📄 License
|
|
179
|
+
|
|
180
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Khadee EDA — Deep Insights Data Profiling
|
|
3
|
+
==========================================
|
|
4
|
+
|
|
5
|
+
A comprehensive EDA module that generates stunning HTML profiling reports
|
|
6
|
+
from any dataset format. Supports CSV, Excel, JSON, Parquet, and 10+ more formats.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
from khadee_eda import ProfileReport
|
|
11
|
+
|
|
12
|
+
# From file (any format — auto-detected)
|
|
13
|
+
report = ProfileReport("train.csv", title="My EDA Report")
|
|
14
|
+
report = ProfileReport("data.xlsx", title="Excel Analysis")
|
|
15
|
+
|
|
16
|
+
# From DataFrame
|
|
17
|
+
import pandas as pd
|
|
18
|
+
df = pd.read_csv("train.csv")
|
|
19
|
+
report = ProfileReport(df, title="My EDA Report")
|
|
20
|
+
|
|
21
|
+
# Generate HTML report
|
|
22
|
+
report.to_html("report.html")
|
|
23
|
+
|
|
24
|
+
# Selective sections
|
|
25
|
+
report = ProfileReport(df, sections=["overview", "variables", "correlations"])
|
|
26
|
+
|
|
27
|
+
# Selective techniques
|
|
28
|
+
report = ProfileReport(df, techniques=["us", "japan"])
|
|
29
|
+
|
|
30
|
+
Sub-modules
|
|
31
|
+
-----------
|
|
32
|
+
from khadee_eda.techniques import us, india, japan, china
|
|
33
|
+
from khadee_eda.engines import stats_engine, correlation_engine, missing_engine, outlier_engine
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
__version__ = "1.0.0"
|
|
37
|
+
__author__ = "Khadee"
|
|
38
|
+
|
|
39
|
+
import sys
|
|
40
|
+
import time
|
|
41
|
+
import warnings
|
|
42
|
+
|
|
43
|
+
import pandas as pd
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _print(msg):
|
|
47
|
+
"""Print with UTF-8 encoding fallback for Windows consoles."""
|
|
48
|
+
try:
|
|
49
|
+
print(msg)
|
|
50
|
+
except UnicodeEncodeError:
|
|
51
|
+
print(msg.encode("ascii", errors="replace").decode("ascii"))
|
|
52
|
+
|
|
53
|
+
from .config import ALL_SECTIONS, ALL_TECHNIQUES
|
|
54
|
+
from .loader import load_dataset
|
|
55
|
+
from .type_detector import detect_types
|
|
56
|
+
from .renderers.html_renderer import render_html
|
|
57
|
+
from . import clean
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ProfileReport:
    """
    Generate a comprehensive EDA profiling report.

    The dataset is loaded, column types are detected and every requested
    section is rendered to HTML inside the constructor; ``to_html`` and
    ``to_html_string`` only assemble the already-generated sections.

    Parameters
    ----------
    source : str or pd.DataFrame
        File path (auto-detects format from extension) or pandas DataFrame.
    title : str, optional
        Report title. Default: "Khadee EDA Report".
    sections : list, optional
        List of section IDs to include. Default: all 10 sections.
        Options: overview, variables, distributions, correlations, missing,
        outliers, interactions, advanced_stats, model_readiness, sample
    techniques : list, optional
        List of technique IDs for the Advanced Statistics section.
        Default: all 4 techniques.
        Options: us, india, japan, china
    **kwargs : dict
        Extra arguments passed to the file reader (e.g., sheet_name for Excel).

    Raises
    ------
    ValueError
        If ``sections`` or ``techniques`` contains an unknown ID.

    Examples
    --------
    >>> from khadee_eda import ProfileReport
    >>> report = ProfileReport("train.csv", title="Profiling Report")
    >>> report.to_html("report.html")
    """

    def __init__(self, source, title="Khadee EDA Report", sections=None,
                 techniques=None, **kwargs):
        self.title = title
        # Copy into fresh lists so the module-level defaults (and the caller's
        # own list) are never aliased, and so a one-shot iterable survives the
        # validation pass below.
        self.sections = list(sections) if sections else list(ALL_SECTIONS)
        self.techniques = list(techniques) if techniques else list(ALL_TECHNIQUES)
        self._start_time = time.time()

        # Validate sections
        for s in self.sections:
            if s not in ALL_SECTIONS:
                raise ValueError(
                    f"Unknown section: '{s}'. Available: {ALL_SECTIONS}"
                )

        # Validate techniques
        for t in self.techniques:
            if t not in ALL_TECHNIQUES:
                raise ValueError(
                    f"Unknown technique: '{t}'. Available: {ALL_TECHNIQUES}"
                )

        # Load data
        _print("[*] Khadee EDA -- Loading dataset...")
        self.df, self.metadata = load_dataset(source, **kwargs)
        _print(f"  [+] Loaded: {self.df.shape[0]:,} rows x {self.df.shape[1]:,} columns")

        # Detect types
        _print("  [*] Detecting column types...")
        self.type_map = detect_types(self.df)

        # Pre-compute report
        _print("  [*] Analyzing data...")
        self._sections_html = self._generate_sections()

        elapsed = time.time() - self._start_time
        _print(f"  [+] Analysis complete in {elapsed:.2f}s")

    @staticmethod
    def _error_card(section_id, error):
        """Return the HTML placeholder card shown when a section fails."""
        from html import escape

        # Exception text may contain markup characters (e.g. column names
        # taken from the data), so it must be escaped before embedding.
        return (
            f'<div class="card"><h3 class="card-title">⚠️ Error in {section_id}</h3>'
            f'<p class="error-message">{escape(str(error))}</p></div>'
        )

    def _generate_sections(self):
        """Generate HTML for all requested sections.

        Returns
        -------
        dict
            Maps each requested section ID to its HTML. A section whose
            generator raises is replaced by an error card and reported
            through ``warnings.warn`` instead of aborting the report.
        """
        # Imported here rather than at module import time.
        from .sections import (
            overview, variables, distributions, correlations,
            missing, outliers, interactions, advanced_stats,
            model_readiness, sample,
        )

        section_generators = {
            "overview": lambda: overview.generate(
                self.df, self.type_map, self.metadata, self._start_time
            ),
            "variables": lambda: variables.generate(self.df, self.type_map),
            "distributions": lambda: distributions.generate(self.df, self.type_map),
            "correlations": lambda: correlations.generate(self.df, self.type_map),
            "missing": lambda: missing.generate(self.df, self.type_map),
            "outliers": lambda: outliers.generate(self.df, self.type_map),
            "interactions": lambda: interactions.generate(self.df, self.type_map),
            "advanced_stats": lambda: advanced_stats.generate(
                self.df, self.type_map, self.techniques
            ),
            "model_readiness": lambda: model_readiness.generate(self.df, self.type_map),
            "sample": lambda: sample.generate(self.df, self.type_map),
        }

        results = {}
        for section_id in self.sections:
            gen = section_generators.get(section_id)
            if gen:
                try:
                    results[section_id] = gen()
                    _print(f"    [+] {section_id}")
                except Exception as e:
                    # Best effort: one failing section must not lose the rest.
                    warnings.warn(f"Error generating section '{section_id}': {e}")
                    results[section_id] = self._error_card(section_id, e)

        return results

    def to_html(self, output_path="report.html"):
        """
        Generate and save the HTML report.

        Parameters
        ----------
        output_path : str
            Path to save the HTML report.

        Returns
        -------
        str
            The ``output_path`` that was written.
        """
        _print("  [*] Generating HTML report...")

        html = self.to_html_string()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

        _print(f"  [+] Report saved to: {output_path}")
        return output_path

    def to_html_string(self):
        """Return the HTML report as a string."""
        return render_html(self.title, self._sections_html, self.sections)

    def __repr__(self):
        return (
            f"ProfileReport("
            f"rows={self.df.shape[0]:,}, cols={self.df.shape[1]:,}, "
            f"sections={len(self.sections)}, techniques={len(self.techniques)})"
        )
|