khadee-eda 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. khadee_eda-1.0.0/PKG-INFO +193 -0
  2. khadee_eda-1.0.0/README.md +180 -0
  3. khadee_eda-1.0.0/khadee_eda/__init__.py +194 -0
  4. khadee_eda-1.0.0/khadee_eda/assets/script.js +137 -0
  5. khadee_eda-1.0.0/khadee_eda/assets/style.css +1336 -0
  6. khadee_eda-1.0.0/khadee_eda/clean.py +287 -0
  7. khadee_eda-1.0.0/khadee_eda/config.py +121 -0
  8. khadee_eda-1.0.0/khadee_eda/engines/__init__.py +1 -0
  9. khadee_eda-1.0.0/khadee_eda/engines/correlation_engine.py +115 -0
  10. khadee_eda-1.0.0/khadee_eda/engines/dim_reduction.py +152 -0
  11. khadee_eda-1.0.0/khadee_eda/engines/missing_engine.py +170 -0
  12. khadee_eda-1.0.0/khadee_eda/engines/outlier_engine.py +190 -0
  13. khadee_eda-1.0.0/khadee_eda/engines/stats_engine.py +200 -0
  14. khadee_eda-1.0.0/khadee_eda/loader.py +221 -0
  15. khadee_eda-1.0.0/khadee_eda/renderers/__init__.py +1 -0
  16. khadee_eda-1.0.0/khadee_eda/renderers/chart_renderer.py +547 -0
  17. khadee_eda-1.0.0/khadee_eda/renderers/html_renderer.py +128 -0
  18. khadee_eda-1.0.0/khadee_eda/renderers/table_renderer.py +38 -0
  19. khadee_eda-1.0.0/khadee_eda/sections/__init__.py +1 -0
  20. khadee_eda-1.0.0/khadee_eda/sections/advanced_stats.py +47 -0
  21. khadee_eda-1.0.0/khadee_eda/sections/correlations.py +125 -0
  22. khadee_eda-1.0.0/khadee_eda/sections/distributions.py +102 -0
  23. khadee_eda-1.0.0/khadee_eda/sections/interactions.py +84 -0
  24. khadee_eda-1.0.0/khadee_eda/sections/missing.py +102 -0
  25. khadee_eda-1.0.0/khadee_eda/sections/model_readiness.py +334 -0
  26. khadee_eda-1.0.0/khadee_eda/sections/outliers.py +94 -0
  27. khadee_eda-1.0.0/khadee_eda/sections/overview.py +174 -0
  28. khadee_eda-1.0.0/khadee_eda/sections/sample.py +109 -0
  29. khadee_eda-1.0.0/khadee_eda/sections/variables.py +348 -0
  30. khadee_eda-1.0.0/khadee_eda/techniques/__init__.py +2 -0
  31. khadee_eda-1.0.0/khadee_eda/techniques/china.py +125 -0
  32. khadee_eda-1.0.0/khadee_eda/techniques/india.py +167 -0
  33. khadee_eda-1.0.0/khadee_eda/techniques/japan.py +177 -0
  34. khadee_eda-1.0.0/khadee_eda/techniques/us.py +108 -0
  35. khadee_eda-1.0.0/khadee_eda/type_detector.py +169 -0
  36. khadee_eda-1.0.0/khadee_eda/utils.py +130 -0
  37. khadee_eda-1.0.0/khadee_eda.egg-info/PKG-INFO +193 -0
  38. khadee_eda-1.0.0/khadee_eda.egg-info/SOURCES.txt +41 -0
  39. khadee_eda-1.0.0/khadee_eda.egg-info/dependency_links.txt +1 -0
  40. khadee_eda-1.0.0/khadee_eda.egg-info/top_level.txt +1 -0
  41. khadee_eda-1.0.0/pyproject.toml +14 -0
  42. khadee_eda-1.0.0/setup.cfg +4 -0
  43. khadee_eda-1.0.0/setup.py +42 -0
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: khadee-eda
3
+ Version: 1.0.0
4
+ Summary: Deep Insights EDA — Comprehensive data profiling with global AI techniques
5
+ Home-page: https://github.com/khadee/khadee-eda
6
+ Author: Khadee
7
+ License: MIT
8
+ Requires-Python: >=3.8
9
+ Description-Content-Type: text/markdown
10
+ Dynamic: author
11
+ Dynamic: home-page
12
+ Dynamic: requires-python
13
+
14
+ # 🔬 Khadee EDA — Deep Insights Data Profiling & Cleaning
15
+
16
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
17
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
18
+ [![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg)]()
19
+ [![Package Size](https://img.shields.io/badge/wheel--size-~85_KB-blueviolet.svg)]()
20
+
21
+ `khadee-eda` is a next-generation, high-performance exploratory data analysis (EDA) and data cleaning library. It generates **stunning, glassmorphism-themed interactive HTML profiling reports** from any dataset and provides a robust, lightweight suite of cleaning tools equivalent to `dataprep.clean`.
22
+
23
+ ---
24
+
25
+ ## ⚡ Quick Start
26
+
27
+ ### 1. Generating a Profiling Report
28
+ Auto-detects and loads data from CSV, Excel, JSON, Parquet, SQLite, and 10+ other formats.
29
+
30
+ ```python
31
+ from khadee_eda import ProfileReport
32
+
33
+ # Method A: Direct one-liner from file path
34
+ report = ProfileReport("train.csv", title="E-Commerce Analysis")
35
+ report.to_html("report.html")
36
+
37
+ # Method B: From an existing Pandas DataFrame
38
+ import pandas as pd
39
+ df = pd.read_csv("train.csv")
40
+ report = ProfileReport(df, title="Customer Profiles")
41
+ report.to_html("report.html")
42
+ ```
43
+
44
+ ### 2. High-Performance Data Cleaning (`khadee_eda.clean`)
45
+ Direct, unified API for cleaning, standardizing, and preparing data (a lightweight alternative to `dataprep.clean`).
46
+
47
+ ```python
48
+ from khadee_eda import clean
49
+
50
+ # Clean column headers (standardize to snake_case, PascalCase, camelCase, etc.)
51
+ df_clean = clean.clean_headers(df, case="snake")
52
+
53
+ # Impute missing values with mean, median, mode, or constant value
54
+ df_clean = clean.clean_missing(df_clean, columns=["age", "income"], strategy="median")
55
+
56
+ # Handle outliers by clipping (winsorization) or dropping rows
57
+ df_clean = clean.clean_outliers(df_clean, columns=["fare"], method="iqr", strategy="clip")
58
+
59
+ # Normalize and clean text columns (strip spaces, lowercase, remove special characters)
60
+ df_clean = clean.clean_text(df_clean, columns=["product_desc"], lowercase=True, remove_special=True)
61
+
62
+ # Remove duplicate rows
63
+ df_clean = clean.clean_duplicates(df_clean, columns=["user_id"])
64
+
65
+ # Run a complete, standard cleanup pass
66
+ df_clean = clean.clean_df(df)
67
+ ```
68
+
69
+ ---
70
+
71
+ ## 📊 10 Structured Analysis Sections
72
+
73
+ Each HTML report is divided into 10 structured, deeply interactive sections:
74
+
75
+ 1. **🏠 Overview**: High-level dataset shapes, reproduction metadata, alerts (missing cells, zero values, extreme correlations, duplicates), and detected data types.
76
+ 2. **📊 Variables (Interactive Dropdown Explorer)**: Detailed statistics per variable (quantiles, descriptives, frequencies, categories). Includes a custom select dropdown menu to show/hide column details and dynamically resize Plotly visualizations.
77
+ 3. **📈 Distributions**: Visual analysis of distributions via histogram grids, kernel density estimations (KDE), skewness, kurtosis, and normality tests.
78
+ 4. **🔗 Correlations**: Pairwise comparison matrices using **Pearson**, **Spearman**, **Kendall**, and **Cramér's V** metrics represented as interactive heatmaps.
79
+ 5. **❓ Missing Values**: Visual representation of missing data via matrices, counts, and imputation recommendations.
80
+ 6. **🎯 Outliers**: In-depth outlier diagnostics, with detection via IQR, Z-score, Median Absolute Deviation (MAD), and Isolation Forest.
81
+ 7. **🔄 Interactions**: Interactive bivariate scatter plots and grouped box plots.
82
+ 8. **📐 Advanced Stats (Global AI Hub Methodologies)**: Unique statistical and machine learning frameworks modeled on analytical cultures across the globe (see below).
83
+ 9. **🤖 Model Readiness**: Preprocessing checklists, ML model suitability rankings, and code recommendation generators.
84
+ 10. **📋 Sample**: Interactive data table viewer showing the head, tail, duplicates, and data dictionary.
85
+
86
+ ---
87
+
88
+ ## 🌍 Global EDA Techniques
89
+
90
+ The **Advanced Stats** section includes 4 distinct regional analytical philosophies:
91
+
92
+ * **🇺🇸 US (ML-Readiness & Feature Engineering)**: Identifies feature importance, flags target leakage, and proposes engineered features.
93
+ * **🇮🇳 India (Statistical Foundations & Hypothesis Testing)**: Evaluates confidence intervals, conducts hypothesis testing, and fits target distributions.
94
+ * **🇯🇵 Japan (Quality Control & Process Analytics — Kaizen)**: Implements Shewhart control charts, calculates Process Capability Indexes ($C_p$/$C_{pk}$), checks stability indicators, and generates Pareto charts.
95
+ * **🇨🇳 China (Large-Scale Pattern Recognition)**: Generates PCA projections, evaluates Hopkins clustering statistics, provides K-Means elbow curves, and profiles data density.
96
+
97
+ ---
98
+
99
+ ## 📂 Supported Formats
100
+
101
+ No need to write separate loading code. `khadee-eda` automatically detects your dataset extension and uses optimized engines to parse it:
102
+
103
+ | Format | Extensions | Parser |
104
+ | :--- | :--- | :--- |
105
+ | **CSV / TSV** | `.csv`, `.tsv`, `.txt` | Pandas optimized parser with latin-1 fallback |
106
+ | **Excel** | `.xlsx`, `.xls`, `.xlsm`, `.xlsb` | openpyxl / xlrd engine |
107
+ | **JSON** | `.json` | Standard and JSON-lines parsed dynamically |
108
+ | **Parquet / Feather** | `.parquet`, `.feather` | PyArrow engine |
109
+ | **SQLite** | `.db`, `.sqlite`, `.sqlite3` | Built-in SQLite connection reader |
110
+ | **Pickle** | `.pkl`, `.pickle` | Standard Python pickle serializer |
111
+ | **Others** | `.h5`, `.hdf5`, `.xml`, `.dta`, `.sas7bdat`, `.sav` | Supporting PyTables, XML, Stata, SAS, SPSS |
112
+
113
+ ---
114
+
115
+ ## 💾 Package Footprint & Download Size
116
+
117
+ Unlike heavier packages that bundle thick C++ binaries, `khadee-eda` is designed to be **incredibly lightweight** and fast to download.
118
+
119
+ ### 1. Download Size (Pip / UV)
120
+ * **Wheel Size (`.whl`)**: **~85 KB**
121
+ * **Source Distribution (`.tar.gz`)**: **~90 KB**
122
+ * **Package Source Size**: **~170 KB** (Clean, pure Python logic + minimal glassmorphism style assets)
123
+
124
+ ### 2. Dependency Size
125
+ If your machine already has standard data science packages (like `pandas`, `numpy`, `scipy`) cached, the installation completes instantly (~85 KB download). If installing into a blank virtual environment, pip/uv will download the scientific stack:
126
+
127
+ | Dependency | Purpose | Download Size (Approx.) |
128
+ | :--- | :--- | :--- |
129
+ | **pandas** | Data manipulation & structure | ~12 - 15 MB |
130
+ | **numpy** | Array computations | ~14 - 18 MB |
131
+ | **scipy** | Advanced statistics & tests | ~35 - 40 MB |
132
+ | **scikit-learn** | Machine learning engines & PCA | ~7 - 9 MB |
133
+ | **plotly** | Dynamic SVG visualizations | ~7 - 8 MB |
134
+ | **pyarrow** | High-performance Parquet storage | ~30 - 35 MB |
135
+ | **openpyxl** | Excel read/write compatibility | ~2 - 3 MB |
136
+ | **jinja2** | HTML templating engine | ~0.2 MB |
137
+ | **Total Dependencies** | Full Scientific Stack | **~110 - 130 MB** |
138
+
139
+ ---
140
+
141
+ ## ⚙️ Selective Reports
142
+
143
+ Save compute time and reduce HTML sizes for large datasets by only rendering the sections or techniques you need:
144
+
145
+ ```python
146
+ from khadee_eda import ProfileReport
147
+
148
+ # Profile only Specific Sections
149
+ report = ProfileReport(
150
+ "dataset.csv",
151
+ sections=["overview", "variables", "model_readiness"]
152
+ )
153
+
154
+ # Render only Specific Global Techniques
155
+ report = ProfileReport(
156
+ "dataset.csv",
157
+ techniques=["japan", "us"]
158
+ )
159
+ ```
160
+
161
+ ---
162
+
163
+ ## 💎 Design & Visual Performance Excellence
164
+
165
+ * **Glassmorphism Dark Theme**: Standard EDA reports often look like boring 2010 tables. `khadee-eda` features a high-end, dark glassmorphism dashboard with neon accents, dynamic hover states, and smooth CSS micro-animations.
166
+ * **Instant PDF Export**: Features a beautiful floating "Download PDF" button that triggers browser printing. The custom media print styles automatically expand all hidden column cards, expand all tabs, hide navigational elements/dropdowns, and switch to a crisp ink-saving light template for a clean, professional corporate report.
167
+ * **WebGL Crash Mitigation**: Rendering dozens of ScatterGL plots on a single page causes modern browsers to exceed their WebGL context limit, crash, and display blank charts. `khadee-eda` compiles Scatter plots to optimized vector SVG path strings, ensuring **100% chart rendering reliability** without sacrificing interactive zoom or hover features.
168
+ * **Smart Dropdown Selectors**: Instead of scrolling endlessly through dozens of columns, the report includes a dynamic select element to view one variable card at a time, instantly resizing the embedded Plotly chart to prevent layout distortions.
169
+ * **Copyable Preprocessing Recommender**: When the library suggests cleaning operations (e.g., standardizing headers or imputing values), it displays a syntax-highlighted code block with a one-click copy button, generating context-aware code ready for your pipeline.
170
+
171
+ ---
172
+
173
+ ## 📦 Installation
174
+
175
+ To install `khadee-eda` in development mode locally:
176
+
177
+ ```bash
178
+ git clone https://github.com/khadee/khadee-eda.git
179
+ cd khadee-eda
180
+ pip install -e .
181
+ ```
182
+
183
+ To install directly using `uv` (recommended for extreme speed):
184
+
185
+ ```bash
186
+ uv pip install -e .
187
+ ```
188
+
189
+ ---
190
+
191
+ ## 📄 License
192
+
193
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,180 @@
1
+ # 🔬 Khadee EDA — Deep Insights Data Profiling & Cleaning
2
+
3
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg)]()
6
+ [![Package Size](https://img.shields.io/badge/wheel--size-~85_KB-blueviolet.svg)]()
7
+
8
+ `khadee-eda` is a next-generation, high-performance exploratory data analysis (EDA) and data cleaning library. It generates **stunning, glassmorphism-themed interactive HTML profiling reports** from any dataset and provides a robust, lightweight suite of cleaning tools equivalent to `dataprep.clean`.
9
+
10
+ ---
11
+
12
+ ## ⚡ Quick Start
13
+
14
+ ### 1. Generating a Profiling Report
15
+ Auto-detects and loads data from CSV, Excel, JSON, Parquet, SQLite, and 10+ other formats.
16
+
17
+ ```python
18
+ from khadee_eda import ProfileReport
19
+
20
+ # Method A: Direct one-liner from file path
21
+ report = ProfileReport("train.csv", title="E-Commerce Analysis")
22
+ report.to_html("report.html")
23
+
24
+ # Method B: From an existing Pandas DataFrame
25
+ import pandas as pd
26
+ df = pd.read_csv("train.csv")
27
+ report = ProfileReport(df, title="Customer Profiles")
28
+ report.to_html("report.html")
29
+ ```
30
+
31
+ ### 2. High-Performance Data Cleaning (`khadee_eda.clean`)
32
+ Direct, unified API for cleaning, standardizing, and preparing data (a lightweight alternative to `dataprep.clean`).
33
+
34
+ ```python
35
+ from khadee_eda import clean
36
+
37
+ # Clean column headers (standardize to snake_case, PascalCase, camelCase, etc.)
38
+ df_clean = clean.clean_headers(df, case="snake")
39
+
40
+ # Impute missing values with mean, median, mode, or constant value
41
+ df_clean = clean.clean_missing(df_clean, columns=["age", "income"], strategy="median")
42
+
43
+ # Handle outliers by clipping (winsorization) or dropping rows
44
+ df_clean = clean.clean_outliers(df_clean, columns=["fare"], method="iqr", strategy="clip")
45
+
46
+ # Normalize and clean text columns (strip spaces, lowercase, remove special characters)
47
+ df_clean = clean.clean_text(df_clean, columns=["product_desc"], lowercase=True, remove_special=True)
48
+
49
+ # Remove duplicate rows
50
+ df_clean = clean.clean_duplicates(df_clean, columns=["user_id"])
51
+
52
+ # Run a complete, standard cleanup pass
53
+ df_clean = clean.clean_df(df)
54
+ ```
55
+
56
+ ---
57
+
58
+ ## 📊 10 Structured Analysis Sections
59
+
60
+ Each HTML report is divided into 10 structured, deeply interactive sections:
61
+
62
+ 1. **🏠 Overview**: High-level dataset shapes, reproduction metadata, alerts (missing cells, zero values, extreme correlations, duplicates), and detected data types.
63
+ 2. **📊 Variables (Interactive Dropdown Explorer)**: Detailed statistics per variable (quantiles, descriptives, frequencies, categories). Includes a custom select dropdown menu to show/hide column details and dynamically resize Plotly visualizations.
64
+ 3. **📈 Distributions**: Visual analysis of distributions via histogram grids, kernel density estimations (KDE), skewness, kurtosis, and normality tests.
65
+ 4. **🔗 Correlations**: Pairwise comparison matrices using **Pearson**, **Spearman**, **Kendall**, and **Cramér's V** metrics represented as interactive heatmaps.
66
+ 5. **❓ Missing Values**: Visual representation of missing data via matrices, counts, and imputation recommendations.
67
+ 6. **🎯 Outliers**: In-depth outlier diagnostics, with detection via IQR, Z-score, Median Absolute Deviation (MAD), and Isolation Forest.
68
+ 7. **🔄 Interactions**: Interactive bivariate scatter plots and grouped box plots.
69
+ 8. **📐 Advanced Stats (Global AI Hub Methodologies)**: Unique statistical and machine learning frameworks modeled on analytical cultures across the globe (see below).
70
+ 9. **🤖 Model Readiness**: Preprocessing checklists, ML model suitability rankings, and code recommendation generators.
71
+ 10. **📋 Sample**: Interactive data table viewer showing the head, tail, duplicates, and data dictionary.
72
+
73
+ ---
74
+
75
+ ## 🌍 Global EDA Techniques
76
+
77
+ The **Advanced Stats** section includes 4 distinct regional analytical philosophies:
78
+
79
+ * **🇺🇸 US (ML-Readiness & Feature Engineering)**: Identifies feature importance, flags target leakage, and proposes engineered features.
80
+ * **🇮🇳 India (Statistical Foundations & Hypothesis Testing)**: Evaluates confidence intervals, conducts hypothesis testing, and fits target distributions.
81
+ * **🇯🇵 Japan (Quality Control & Process Analytics — Kaizen)**: Implements Shewhart control charts, calculates Process Capability Indexes ($C_p$/$C_{pk}$), checks stability indicators, and generates Pareto charts.
82
+ * **🇨🇳 China (Large-Scale Pattern Recognition)**: Generates PCA projections, evaluates Hopkins clustering statistics, provides K-Means elbow curves, and profiles data density.
83
+
84
+ ---
85
+
86
+ ## 📂 Supported Formats
87
+
88
+ No need to write separate loading code. `khadee-eda` automatically detects your dataset extension and uses optimized engines to parse it:
89
+
90
+ | Format | Extensions | Parser |
91
+ | :--- | :--- | :--- |
92
+ | **CSV / TSV** | `.csv`, `.tsv`, `.txt` | Pandas optimized parser with latin-1 fallback |
93
+ | **Excel** | `.xlsx`, `.xls`, `.xlsm`, `.xlsb` | openpyxl / xlrd engine |
94
+ | **JSON** | `.json` | Standard and JSON-lines parsed dynamically |
95
+ | **Parquet / Feather** | `.parquet`, `.feather` | PyArrow engine |
96
+ | **SQLite** | `.db`, `.sqlite`, `.sqlite3` | Built-in SQLite connection reader |
97
+ | **Pickle** | `.pkl`, `.pickle` | Standard Python pickle serializer |
98
+ | **Others** | `.h5`, `.hdf5`, `.xml`, `.dta`, `.sas7bdat`, `.sav` | Supporting PyTables, XML, Stata, SAS, SPSS |
99
+
100
+ ---
101
+
102
+ ## 💾 Package Footprint & Download Size
103
+
104
+ Unlike heavier packages that bundle thick C++ binaries, `khadee-eda` is designed to be **incredibly lightweight** and fast to download.
105
+
106
+ ### 1. Download Size (Pip / UV)
107
+ * **Wheel Size (`.whl`)**: **~85 KB**
108
+ * **Source Distribution (`.tar.gz`)**: **~90 KB**
109
+ * **Package Source Size**: **~170 KB** (Clean, pure Python logic + minimal glassmorphism style assets)
110
+
111
+ ### 2. Dependency Size
112
+ If your machine already has standard data science packages (like `pandas`, `numpy`, `scipy`) cached, the installation completes instantly (~85 KB download). If installing into a blank virtual environment, pip/uv will download the scientific stack:
113
+
114
+ | Dependency | Purpose | Download Size (Approx.) |
115
+ | :--- | :--- | :--- |
116
+ | **pandas** | Data manipulation & structure | ~12 - 15 MB |
117
+ | **numpy** | Array computations | ~14 - 18 MB |
118
+ | **scipy** | Advanced statistics & tests | ~35 - 40 MB |
119
+ | **scikit-learn** | Machine learning engines & PCA | ~7 - 9 MB |
120
+ | **plotly** | Dynamic SVG visualizations | ~7 - 8 MB |
121
+ | **pyarrow** | High-performance Parquet storage | ~30 - 35 MB |
122
+ | **openpyxl** | Excel read/write compatibility | ~2 - 3 MB |
123
+ | **jinja2** | HTML templating engine | ~0.2 MB |
124
+ | **Total Dependencies** | Full Scientific Stack | **~110 - 130 MB** |
125
+
126
+ ---
127
+
128
+ ## ⚙️ Selective Reports
129
+
130
+ Save compute time and reduce HTML sizes for large datasets by only rendering the sections or techniques you need:
131
+
132
+ ```python
133
+ from khadee_eda import ProfileReport
134
+
135
+ # Profile only Specific Sections
136
+ report = ProfileReport(
137
+ "dataset.csv",
138
+ sections=["overview", "variables", "model_readiness"]
139
+ )
140
+
141
+ # Render only Specific Global Techniques
142
+ report = ProfileReport(
143
+ "dataset.csv",
144
+ techniques=["japan", "us"]
145
+ )
146
+ ```
147
+
148
+ ---
149
+
150
+ ## 💎 Design & Visual Performance Excellence
151
+
152
+ * **Glassmorphism Dark Theme**: Standard EDA reports often look like boring 2010 tables. `khadee-eda` features a high-end, dark glassmorphism dashboard with neon accents, dynamic hover states, and smooth CSS micro-animations.
153
+ * **Instant PDF Export**: Features a beautiful floating "Download PDF" button that triggers browser printing. The custom media print styles automatically expand all hidden column cards, expand all tabs, hide navigational elements/dropdowns, and switch to a crisp ink-saving light template for a clean, professional corporate report.
154
+ * **WebGL Crash Mitigation**: Rendering dozens of ScatterGL plots on a single page causes modern browsers to exceed their WebGL context limit, crash, and display blank charts. `khadee-eda` compiles Scatter plots to optimized vector SVG path strings, ensuring **100% chart rendering reliability** without sacrificing interactive zoom or hover features.
155
+ * **Smart Dropdown Selectors**: Instead of scrolling endlessly through dozens of columns, the report includes a dynamic select element to view one variable card at a time, instantly resizing the embedded Plotly chart to prevent layout distortions.
156
+ * **Copyable Preprocessing Recommender**: When the library suggests cleaning operations (e.g., standardizing headers or imputing values), it displays a syntax-highlighted code block with a one-click copy button, generating context-aware code ready for your pipeline.
157
+
158
+ ---
159
+
160
+ ## 📦 Installation
161
+
162
+ To install `khadee-eda` in development mode locally:
163
+
164
+ ```bash
165
+ git clone https://github.com/khadee/khadee-eda.git
166
+ cd khadee-eda
167
+ pip install -e .
168
+ ```
169
+
170
+ To install directly using `uv` (recommended for extreme speed):
171
+
172
+ ```bash
173
+ uv pip install -e .
174
+ ```
175
+
176
+ ---
177
+
178
+ ## 📄 License
179
+
180
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,194 @@
1
+ """
2
+ Khadee EDA — Deep Insights Data Profiling
3
+ ==========================================
4
+
5
+ A comprehensive EDA module that generates stunning HTML profiling reports
6
+ from any dataset format. Supports CSV, Excel, JSON, Parquet, and 10+ more formats.
7
+
8
+ Usage
9
+ -----
10
+ from khadee_eda import ProfileReport
11
+
12
+ # From file (any format — auto-detected)
13
+ report = ProfileReport("train.csv", title="My EDA Report")
14
+ report = ProfileReport("data.xlsx", title="Excel Analysis")
15
+
16
+ # From DataFrame
17
+ import pandas as pd
18
+ df = pd.read_csv("train.csv")
19
+ report = ProfileReport(df, title="My EDA Report")
20
+
21
+ # Generate HTML report
22
+ report.to_html("report.html")
23
+
24
+ # Selective sections
25
+ report = ProfileReport(df, sections=["overview", "variables", "correlations"])
26
+
27
+ # Selective techniques
28
+ report = ProfileReport(df, techniques=["us", "japan"])
29
+
30
+ Sub-modules
31
+ -----------
32
+ from khadee_eda.techniques import us, india, japan, china
33
+ from khadee_eda.engines import stats_engine, correlation_engine, missing_engine, outlier_engine
34
+ """
35
+
36
+ __version__ = "1.0.0"
37
+ __author__ = "Khadee"
38
+
39
+ import sys
40
+ import time
41
+ import warnings
42
+
43
+ import pandas as pd
44
+
45
+
46
+ def _print(msg):
47
+ """Print with UTF-8 encoding fallback for Windows consoles."""
48
+ try:
49
+ print(msg)
50
+ except UnicodeEncodeError:
51
+ print(msg.encode("ascii", errors="replace").decode("ascii"))
52
+
53
+ from .config import ALL_SECTIONS, ALL_TECHNIQUES
54
+ from .loader import load_dataset
55
+ from .type_detector import detect_types
56
+ from .renderers.html_renderer import render_html
57
+ from . import clean
58
+
59
+
60
class ProfileReport:
    """
    Generate a comprehensive EDA profiling report.

    All analysis runs in the constructor: the dataset is loaded, column
    types are detected, and the HTML fragment for every requested section
    is generated and cached. ``to_html`` / ``to_html_string`` only assemble
    the cached fragments into the final document.

    Parameters
    ----------
    source : str or pd.DataFrame
        File path (auto-detects format from extension) or pandas DataFrame.
    title : str, optional
        Report title. Default: "Khadee EDA Report".
    sections : list, optional
        List of section IDs to include. Default: all 10 sections.
        Options: overview, variables, distributions, correlations, missing,
        outliers, interactions, advanced_stats, model_readiness, sample
    techniques : list, optional
        List of technique IDs for the Advanced Statistics section.
        Default: all 4 techniques.
        Options: us, india, japan, china
    **kwargs : dict
        Extra arguments passed to the file reader (e.g., sheet_name for Excel).

    Raises
    ------
    ValueError
        If ``sections`` contains an ID not in ``ALL_SECTIONS`` or
        ``techniques`` contains an ID not in ``ALL_TECHNIQUES``.

    Examples
    --------
    >>> from khadee_eda import ProfileReport
    >>> report = ProfileReport("train.csv", title="Profiling Report")
    >>> report.to_html("report.html")
    """

    def __init__(self, source, title="Khadee EDA Report", sections=None,
                 techniques=None, **kwargs):
        self.title = title
        self.sections = sections or ALL_SECTIONS
        self.techniques = techniques or ALL_TECHNIQUES
        self._start_time = time.time()

        # Validate before loading so a typo fails fast, ahead of any I/O.
        for s in self.sections:
            if s not in ALL_SECTIONS:
                raise ValueError(
                    f"Unknown section: '{s}'. Available: {ALL_SECTIONS}"
                )

        for t in self.techniques:
            if t not in ALL_TECHNIQUES:
                raise ValueError(
                    f"Unknown technique: '{t}'. Available: {ALL_TECHNIQUES}"
                )

        # Load data
        _print("[*] Khadee EDA -- Loading dataset...")
        self.df, self.metadata = load_dataset(source, **kwargs)
        _print(f"  [+] Loaded: {self.df.shape[0]:,} rows x {self.df.shape[1]:,} columns")

        # Detect types
        _print("  [*] Detecting column types...")
        self.type_map = detect_types(self.df)

        # Pre-compute report
        _print("  [*] Analyzing data...")
        self._sections_html = self._generate_sections()

        elapsed = time.time() - self._start_time
        _print(f"  [+] Analysis complete in {elapsed:.2f}s")

    @staticmethod
    def _error_card(section_id, error):
        """Return the HTML card shown in place of a section that failed.

        The exception text is HTML-escaped: messages frequently embed column
        names or data values, which must not be interpreted as markup inside
        the generated report.
        """
        # Local import keeps the name ``html`` free at module level.
        from html import escape

        return (
            f'<div class="card"><h3 class="card-title">⚠️ Error in {section_id}</h3>'
            f'<p class="error-message">{escape(str(error))}</p></div>'
        )

    def _generate_sections(self):
        """Generate HTML for all requested sections.

        Returns
        -------
        dict
            Maps each requested section ID to its HTML fragment. A section
            whose generator raises is replaced by an error card (and a
            warning is emitted) so one failure does not abort the report.
        """
        from .sections import (
            overview, variables, distributions, correlations,
            missing, outliers, interactions, advanced_stats,
            model_readiness, sample,
        )

        # Lambdas defer the work until the section is actually requested.
        section_generators = {
            "overview": lambda: overview.generate(
                self.df, self.type_map, self.metadata, self._start_time
            ),
            "variables": lambda: variables.generate(self.df, self.type_map),
            "distributions": lambda: distributions.generate(self.df, self.type_map),
            "correlations": lambda: correlations.generate(self.df, self.type_map),
            "missing": lambda: missing.generate(self.df, self.type_map),
            "outliers": lambda: outliers.generate(self.df, self.type_map),
            "interactions": lambda: interactions.generate(self.df, self.type_map),
            "advanced_stats": lambda: advanced_stats.generate(
                self.df, self.type_map, self.techniques
            ),
            "model_readiness": lambda: model_readiness.generate(self.df, self.type_map),
            "sample": lambda: sample.generate(self.df, self.type_map),
        }

        results = {}
        for section_id in self.sections:
            gen = section_generators.get(section_id)
            if gen:
                try:
                    results[section_id] = gen()
                    _print(f"    [+] {section_id}")
                except Exception as e:
                    # Deliberate best-effort boundary: report the failure
                    # inside the document instead of losing every section.
                    warnings.warn(f"Error generating section '{section_id}': {e}")
                    results[section_id] = self._error_card(section_id, e)

        return results

    def to_html(self, output_path="report.html"):
        """
        Generate and save the HTML report.

        Parameters
        ----------
        output_path : str
            Path to save the HTML report.

        Returns
        -------
        str
            ``output_path``, unchanged.
        """
        _print("  [*] Generating HTML report...")

        document = self.to_html_string()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(document)

        _print(f"  [+] Report saved to: {output_path}")
        return output_path

    def to_html_string(self):
        """Return the HTML report as a string."""
        return render_html(self.title, self._sections_html, self.sections)

    def __repr__(self):
        return (
            f"ProfileReport("
            f"rows={self.df.shape[0]:,}, cols={self.df.shape[1]:,}, "
            f"sections={len(self.sections)}, techniques={len(self.techniques)})"
        )