khadee-eda 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khadee_eda/__init__.py +194 -0
- khadee_eda/assets/script.js +137 -0
- khadee_eda/assets/style.css +1336 -0
- khadee_eda/clean.py +287 -0
- khadee_eda/config.py +121 -0
- khadee_eda/engines/__init__.py +1 -0
- khadee_eda/engines/correlation_engine.py +115 -0
- khadee_eda/engines/dim_reduction.py +152 -0
- khadee_eda/engines/missing_engine.py +170 -0
- khadee_eda/engines/outlier_engine.py +190 -0
- khadee_eda/engines/stats_engine.py +200 -0
- khadee_eda/loader.py +221 -0
- khadee_eda/renderers/__init__.py +1 -0
- khadee_eda/renderers/chart_renderer.py +547 -0
- khadee_eda/renderers/html_renderer.py +128 -0
- khadee_eda/renderers/table_renderer.py +38 -0
- khadee_eda/sections/__init__.py +1 -0
- khadee_eda/sections/advanced_stats.py +47 -0
- khadee_eda/sections/correlations.py +125 -0
- khadee_eda/sections/distributions.py +102 -0
- khadee_eda/sections/interactions.py +84 -0
- khadee_eda/sections/missing.py +102 -0
- khadee_eda/sections/model_readiness.py +334 -0
- khadee_eda/sections/outliers.py +94 -0
- khadee_eda/sections/overview.py +174 -0
- khadee_eda/sections/sample.py +109 -0
- khadee_eda/sections/variables.py +348 -0
- khadee_eda/techniques/__init__.py +2 -0
- khadee_eda/techniques/china.py +125 -0
- khadee_eda/techniques/india.py +167 -0
- khadee_eda/techniques/japan.py +177 -0
- khadee_eda/techniques/us.py +108 -0
- khadee_eda/type_detector.py +169 -0
- khadee_eda/utils.py +130 -0
- khadee_eda-1.0.0.dist-info/METADATA +193 -0
- khadee_eda-1.0.0.dist-info/RECORD +38 -0
- khadee_eda-1.0.0.dist-info/WHEEL +5 -0
- khadee_eda-1.0.0.dist-info/top_level.txt +1 -0
khadee_eda/__init__.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Khadee EDA — Deep Insights Data Profiling
|
|
3
|
+
==========================================
|
|
4
|
+
|
|
5
|
+
A comprehensive EDA module that generates stunning HTML profiling reports
|
|
6
|
+
from any dataset format. Supports CSV, Excel, JSON, Parquet, and 10+ more formats.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
from khadee_eda import ProfileReport
|
|
11
|
+
|
|
12
|
+
# From file (any format — auto-detected)
|
|
13
|
+
report = ProfileReport("train.csv", title="My EDA Report")
|
|
14
|
+
report = ProfileReport("data.xlsx", title="Excel Analysis")
|
|
15
|
+
|
|
16
|
+
# From DataFrame
|
|
17
|
+
import pandas as pd
|
|
18
|
+
df = pd.read_csv("train.csv")
|
|
19
|
+
report = ProfileReport(df, title="My EDA Report")
|
|
20
|
+
|
|
21
|
+
# Generate HTML report
|
|
22
|
+
report.to_html("report.html")
|
|
23
|
+
|
|
24
|
+
# Selective sections
|
|
25
|
+
report = ProfileReport(df, sections=["overview", "variables", "correlations"])
|
|
26
|
+
|
|
27
|
+
# Selective techniques
|
|
28
|
+
report = ProfileReport(df, techniques=["us", "japan"])
|
|
29
|
+
|
|
30
|
+
Sub-modules
|
|
31
|
+
-----------
|
|
32
|
+
from khadee_eda.techniques import us, india, japan, china
|
|
33
|
+
from khadee_eda.engines import stats_engine, correlation_engine, missing_engine, outlier_engine
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
__version__ = "1.0.0"
|
|
37
|
+
__author__ = "Khadee"
|
|
38
|
+
|
|
39
|
+
import sys
|
|
40
|
+
import time
|
|
41
|
+
import warnings
|
|
42
|
+
|
|
43
|
+
import pandas as pd
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _print(msg):
    """Print ``msg``, degrading gracefully on consoles with a narrow encoding.

    Some consoles (notably legacy Windows code pages) cannot encode every
    Unicode character and make ``print`` raise ``UnicodeEncodeError``.  In
    that case the message is re-encoded with the console's own encoding using
    ``errors="replace"``, so only the characters the console truly cannot
    display become ``?``.  Plain ASCII is used when the stream does not
    report a usable encoding.

    Parameters
    ----------
    msg : object
        The message to print; non-string objects are converted with ``str``.
    """
    try:
        print(msg)
    except UnicodeEncodeError:
        text = str(msg)
        # Prefer the stream's own encoding so representable non-ASCII
        # characters survive; ASCII is the always-safe last resort.
        encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        try:
            safe = text.encode(encoding, errors="replace").decode(encoding)
        except LookupError:
            # The stream advertised a codec name Python does not know.
            safe = text.encode("ascii", errors="replace").decode("ascii")
        print(safe)
|
|
52
|
+
|
|
53
|
+
from .config import ALL_SECTIONS, ALL_TECHNIQUES
|
|
54
|
+
from .loader import load_dataset
|
|
55
|
+
from .type_detector import detect_types
|
|
56
|
+
from .renderers.html_renderer import render_html
|
|
57
|
+
from . import clean
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ProfileReport:
    """
    Generate a comprehensive EDA profiling report.

    The dataset is loaded, column types are detected and every requested
    section is rendered in the constructor; ``to_html`` and
    ``to_html_string`` only assemble the pre-rendered sections into a page.

    Parameters
    ----------
    source : str or pd.DataFrame
        File path (auto-detects format from extension) or pandas DataFrame.
    title : str, optional
        Report title. Default: "Khadee EDA Report".
    sections : list, optional
        List of section IDs to include. Default: all 10 sections.
        Options: overview, variables, distributions, correlations, missing,
        outliers, interactions, advanced_stats, model_readiness, sample
    techniques : list, optional
        List of technique IDs for the Advanced Statistics section.
        Default: all 4 techniques.
        Options: us, india, japan, china
    **kwargs : dict
        Extra arguments passed to the file reader (e.g., sheet_name for Excel).

    Raises
    ------
    ValueError
        If ``sections`` or ``techniques`` contains an unknown ID.

    Examples
    --------
    >>> from khadee_eda import ProfileReport
    >>> report = ProfileReport("train.csv", title="Profiling Report")
    >>> report.to_html("report.html")
    """

    def __init__(self, source, title="Khadee EDA Report", sections=None,
                 techniques=None, **kwargs):
        self.title = title
        # Copy into fresh lists: the instance must not alias the module-level
        # defaults (or the caller's list), and a one-shot iterable must
        # survive both the validation pass and the later rendering pass.
        self.sections = list(sections or ALL_SECTIONS)
        self.techniques = list(techniques or ALL_TECHNIQUES)
        self._start_time = time.time()

        # Validate sections
        for s in self.sections:
            if s not in ALL_SECTIONS:
                raise ValueError(
                    f"Unknown section: '{s}'. Available: {ALL_SECTIONS}"
                )

        # Validate techniques
        for t in self.techniques:
            if t not in ALL_TECHNIQUES:
                raise ValueError(
                    f"Unknown technique: '{t}'. Available: {ALL_TECHNIQUES}"
                )

        # Load data
        _print("[*] Khadee EDA -- Loading dataset...")
        self.df, self.metadata = load_dataset(source, **kwargs)
        _print(f" [+] Loaded: {self.df.shape[0]:,} rows x {self.df.shape[1]:,} columns")

        # Detect types
        _print(" [*] Detecting column types...")
        self.type_map = detect_types(self.df)

        # Pre-compute report
        _print(" [*] Analyzing data...")
        self._sections_html = self._generate_sections()

        elapsed = time.time() - self._start_time
        _print(f" [+] Analysis complete in {elapsed:.2f}s")

    @staticmethod
    def _error_card(section_id, exc):
        """Return the HTML card shown in place of a section that failed.

        The section ID and exception text are HTML-escaped: exception
        messages frequently echo column names or cell values from the
        dataset, which must not be interpreted as markup in the report.
        """
        from html import escape

        return (
            f'<div class="card"><h3 class="card-title">⚠️ Error in {escape(str(section_id))}</h3>'
            f'<p class="error-message">{escape(str(exc))}</p></div>'
        )

    def _generate_sections(self):
        """Generate HTML for all requested sections.

        Returns
        -------
        dict
            Maps each requested section ID to the value returned by that
            section's ``generate`` function, or to an error card if it raised.
        """
        # Imported lazily, inside the method, as in the original code.
        from .sections import (
            overview, variables, distributions, correlations,
            missing, outliers, interactions, advanced_stats,
            model_readiness, sample,
        )

        # Zero-argument callables so only the requested sections are computed.
        section_generators = {
            "overview": lambda: overview.generate(
                self.df, self.type_map, self.metadata, self._start_time
            ),
            "variables": lambda: variables.generate(self.df, self.type_map),
            "distributions": lambda: distributions.generate(self.df, self.type_map),
            "correlations": lambda: correlations.generate(self.df, self.type_map),
            "missing": lambda: missing.generate(self.df, self.type_map),
            "outliers": lambda: outliers.generate(self.df, self.type_map),
            "interactions": lambda: interactions.generate(self.df, self.type_map),
            "advanced_stats": lambda: advanced_stats.generate(
                self.df, self.type_map, self.techniques
            ),
            "model_readiness": lambda: model_readiness.generate(self.df, self.type_map),
            "sample": lambda: sample.generate(self.df, self.type_map),
        }

        results = {}
        for section_id in self.sections:
            gen = section_generators.get(section_id)
            if gen:
                try:
                    results[section_id] = gen()
                    _print(f" [+] {section_id}")
                except Exception as e:
                    # Deliberate best-effort: one failing section must not
                    # abort the whole report, so warn and show an error card.
                    warnings.warn(f"Error generating section '{section_id}': {e}")
                    results[section_id] = self._error_card(section_id, e)

        return results

    def to_html(self, output_path="report.html"):
        """
        Generate and save the HTML report.

        Parameters
        ----------
        output_path : str
            Path to save the HTML report.

        Returns
        -------
        str
            ``output_path``, unchanged.
        """
        _print(" [*] Generating HTML report...")

        document = self.to_html_string()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(document)

        _print(f" [+] Report saved to: {output_path}")
        return output_path

    def to_html_string(self):
        """Return the HTML report as a string."""
        return render_html(self.title, self._sections_html, self.sections)

    def __repr__(self):
        return (
            f"ProfileReport("
            f"rows={self.df.shape[0]:,}, cols={self.df.shape[1]:,}, "
            f"sections={len(self.sections)}, techniques={len(self.techniques)})"
        )
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/* ============================================================
|
|
2
|
+
Khadee EDA — Report Interactivity
|
|
3
|
+
============================================================ */
|
|
4
|
+
|
|
5
|
+
// ── Sidebar Navigation Active Tracking ──
|
|
6
|
+
(function() {
    // Sidebar links and the report sections they point at.
    const navLinks = document.querySelectorAll('.nav-link');
    const sections = document.querySelectorAll('.report-section');

    // Smooth scroll on nav click
    navLinks.forEach(link => {
        link.addEventListener('click', function(e) {
            e.preventDefault();
            // A link without an href yields '' here, which matches no element,
            // instead of throwing on null.substring().
            const href = this.getAttribute('href') || '';
            const target = document.getElementById(href.substring(1));
            if (target) {
                target.scrollIntoView({ behavior: 'smooth', block: 'start' });
            }
            // Close mobile sidebar (the element may be absent from the page).
            const sidebar = document.getElementById('sidebar');
            if (sidebar) {
                sidebar.classList.remove('open');
            }
        });
    });

    // Intersection Observer for active nav highlighting
    if (sections.length > 0 && 'IntersectionObserver' in window) {
        const observer = new IntersectionObserver(function(entries) {
            entries.forEach(function(entry) {
                if (entry.isIntersecting) {
                    const sectionId = entry.target.getAttribute('data-section');
                    // Exactly one link (the one whose data-section matches) stays active.
                    navLinks.forEach(function(link) {
                        link.classList.remove('active');
                        if (link.getAttribute('data-section') === sectionId) {
                            link.classList.add('active');
                        }
                    });
                }
            });
        }, {
            // Shrinks the observed area to a band between 20% and 30% of the
            // viewport height from the top.
            rootMargin: '-20% 0px -70% 0px',
            threshold: 0
        });

        sections.forEach(function(section) {
            observer.observe(section);
        });
    }

    // Set first nav link active by default
    if (navLinks.length > 0) {
        navLinks[0].classList.add('active');
    }
})();
|
|
53
|
+
|
|
54
|
+
// ── Fade-in Animation on Scroll ──
|
|
55
|
+
(function() {
    // Stagger each report section's animation by 0.05s per position.
    var reportSections = document.querySelectorAll('.report-section');
    for (var i = 0; i < reportSections.length; i++) {
        reportSections[i].style.animationDelay = (i * 0.05) + 's';
    }
})();
|
|
61
|
+
|
|
62
|
+
// ── Tab Switching ──
|
|
63
|
+
/**
 * Activate one tab inside the `.tab-container` that encloses `btn`.
 *
 * @param {Element} btn   The tab button that was clicked.
 * @param {string}  tabId The id of the tab panel to show.
 */
function switchTab(btn, tabId) {
    var tabContainer = btn.closest('.tab-container');
    if (!tabContainer) {
        return;
    }

    // Reset every button and panel in this container.
    var buttons = tabContainer.querySelectorAll('.tab-btn');
    for (var i = 0; i < buttons.length; i++) {
        buttons[i].classList.remove('active');
    }
    var panels = tabContainer.querySelectorAll('.tab-content');
    for (var j = 0; j < panels.length; j++) {
        panels[j].style.display = 'none';
    }

    btn.classList.add('active');

    var panel = document.getElementById(tabId);
    if (!panel) {
        return;
    }
    panel.style.display = 'block';

    // Trigger a Plotly resize for charts that were hidden until now.
    var panelCharts = panel.querySelectorAll('.plotly-chart');
    for (var k = 0; k < panelCharts.length; k++) {
        var chart = panelCharts[k];
        if (window.Plotly && chart.data) {
            Plotly.Plots.resize(chart);
        }
    }
}
|
|
88
|
+
|
|
89
|
+
// ── Mobile Sidebar Toggle ──
|
|
90
|
+
/**
 * Toggle the `open` class on the `#sidebar` element.
 * Does nothing when the page has no `#sidebar` element.
 */
function toggleSidebar() {
    var sidebar = document.getElementById('sidebar');
    if (!sidebar) {
        // getElementById returns null for a missing element; avoid a TypeError.
        return;
    }
    sidebar.classList.toggle('open');
}
|
|
94
|
+
|
|
95
|
+
// Close sidebar on outside click (mobile)
|
|
96
|
+
document.addEventListener('click', function(e) {
    var sidebar = document.getElementById('sidebar');
    if (!sidebar || !sidebar.classList.contains('open')) {
        return;
    }

    // The menu button may be absent; a missing button can never have been
    // the click target, so treat it as "not clicked" instead of throwing.
    var menuBtn = document.getElementById('mobile-menu-btn');
    var clickedMenuBtn = menuBtn !== null && menuBtn.contains(e.target);

    // Clicks inside the sidebar or on the menu button leave it open.
    if (!sidebar.contains(e.target) && !clickedMenuBtn) {
        sidebar.classList.remove('open');
    }
});
|
|
105
|
+
|
|
106
|
+
// ── Window resize: trigger Plotly resize ──
|
|
107
|
+
// Handle of the pending debounced resize; cleared on every new resize event.
var resizeTimeout;
window.addEventListener('resize', function() {
    clearTimeout(resizeTimeout);
    // Wait 200 ms after the last resize event before resizing the charts.
    resizeTimeout = setTimeout(function() {
        var allCharts = document.querySelectorAll('.plotly-chart');
        for (var i = 0; i < allCharts.length; i++) {
            var chart = allCharts[i];
            if (!window.Plotly || !chart.data) {
                continue;
            }
            Plotly.Plots.resize(chart);
        }
    }, 200);
});
|
|
118
|
+
|
|
119
|
+
// ── Variable Card Dropdown Switcher ──
|
|
120
|
+
/**
 * Show the variable card with id `'var-' + colId` and hide all others.
 *
 * @param {string} colId Suffix of the card's element id.
 */
function showVariableCard(colId) {
    // Hide every card first; the requested one is re-shown below.
    var allCards = document.querySelectorAll('.variable-card');
    for (var i = 0; i < allCards.length; i++) {
        allCards[i].style.display = 'none';
    }

    var selected = document.getElementById('var-' + colId);
    if (!selected) {
        return;
    }
    selected.style.display = 'block';

    // Resize charts inside the newly shown card.
    var cardCharts = selected.querySelectorAll('.plotly-chart');
    for (var j = 0; j < cardCharts.length; j++) {
        var chart = cardCharts[j];
        if (window.Plotly && chart.data) {
            Plotly.Plots.resize(chart);
        }
    }
}
|
|
137
|
+
|