eda-k 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eda_k-0.1.0/MANIFEST.in +6 -0
- eda_k-0.1.0/PKG-INFO +137 -0
- eda_k-0.1.0/README.md +113 -0
- eda_k-0.1.0/chanfelog.txt +6 -0
- eda_k-0.1.0/licence.txt +7 -0
- eda_k-0.1.0/pyproject.toml +32 -0
- eda_k-0.1.0/requirements.txt +1 -0
- eda_k-0.1.0/setup.cfg +4 -0
- eda_k-0.1.0/src/eda_k/__init__.py +218 -0
- eda_k-0.1.0/src/eda_k/charts.py +428 -0
- eda_k-0.1.0/src/eda_k/chat_assistant.py +252 -0
- eda_k-0.1.0/src/eda_k/eda_engine.py +364 -0
- eda_k-0.1.0/src/eda_k/report_builder.py +461 -0
- eda_k-0.1.0/src/eda_k.egg-info/PKG-INFO +137 -0
- eda_k-0.1.0/src/eda_k.egg-info/SOURCES.txt +16 -0
- eda_k-0.1.0/src/eda_k.egg-info/dependency_links.txt +1 -0
- eda_k-0.1.0/src/eda_k.egg-info/requires.txt +16 -0
- eda_k-0.1.0/src/eda_k.egg-info/top_level.txt +1 -0
eda_k-0.1.0/MANIFEST.in
ADDED
eda_k-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: eda-k
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated, local exploratory data analysis: stats, charts, correlations, outliers, a chat assistant, and self-contained HTML reports.
|
|
5
|
+
Author: Kishan Prajapati
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: licence.txt
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: numpy>=1.24
|
|
12
|
+
Requires-Dist: scipy>=1.10
|
|
13
|
+
Requires-Dist: plotly>=5.20
|
|
14
|
+
Requires-Dist: openpyxl>=3.1
|
|
15
|
+
Requires-Dist: xlrd>=2.0
|
|
16
|
+
Requires-Dist: pyarrow>=14.0
|
|
17
|
+
Provides-Extra: app
|
|
18
|
+
Requires-Dist: streamlit>=1.36; extra == "app"
|
|
19
|
+
Provides-Extra: trend
|
|
20
|
+
Requires-Dist: statsmodels; extra == "trend"
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# eda-k
|
|
26
|
+
|
|
27
|
+
Automated, local exploratory data analysis — as a **Python library** you can
|
|
28
|
+
`import`, with an optional Streamlit UI on top.
|
|
29
|
+
|
|
30
|
+
Runs 100% locally. Your data never leaves your machine, no API key needed.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Want everything in one shot (library + Streamlit app + OLS trendlines)?
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e ".[app,trend]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or pick extras individually:
|
|
47
|
+
|
|
48
|
+
Need the bundled Streamlit app too?
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install -e ".[app]"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Need OLS trendlines on scatter plots (`charts.pairwise_scatter_with_trendline`)?
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install -e ".[trend]"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Use it as a library
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import eda_k
|
|
66
|
+
|
|
67
|
+
result = eda_k.analyze("data.csv") # path, file-like, or DataFrame all work
|
|
68
|
+
|
|
69
|
+
print(result) # <EDAResult 'data.csv' rows=150 cols=5 ...>
|
|
70
|
+
print(result.summary()) # quick text overview
|
|
71
|
+
print(result.ask("which columns have missing values?"))
|
|
72
|
+
|
|
73
|
+
result.to_html("report.html") # self-contained HTML report (charts inline)
|
|
74
|
+
result.to_csv_zip("tables.zip") # every summary table as CSVs in one ZIP
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
`result.df` is the loaded `pandas.DataFrame`, and `result.results` is the raw
|
|
78
|
+
dict of every table (`overview`, `missing_summary`, `numeric_summary`,
|
|
79
|
+
`outliers`, `categorical_summary`, `correlation`, `top_correlations`,
|
|
80
|
+
`dtype_table`) if you want to work with the data directly.
|
|
81
|
+
|
|
82
|
+
### Lower-level access
|
|
83
|
+
|
|
84
|
+
The underlying modules are also importable as submodules for full control:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from eda_k import eda_engine, charts, chat_assistant, report_builder
|
|
88
|
+
|
|
89
|
+
df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
|
|
90
|
+
results = eda_engine.run_full_eda(df)
|
|
91
|
+
fig = charts.histogram(df, "some_column")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Use the Streamlit UI
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install -e ".[app]"
|
|
100
|
+
# Run from a checkout of the source repository: apps/ is not shipped in the published package.
streamlit run apps/streamlit_app.py
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Opens a browser tab with upload, tabs (Overview / Missing / Numeric / Outliers
|
|
104
|
+
/ Categorical / Correlation / Chat / Download), and one-click export of the
|
|
105
|
+
HTML report or a ZIP of CSVs — same as before, just now built on top of the
|
|
106
|
+
installed `eda_k` package instead of loose scripts.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Project layout
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
eda-k/
|
|
114
|
+
├── pyproject.toml
|
|
115
|
+
├── README.md
|
|
116
|
+
├── requirements.txt # convenience: pip install -r requirements.txt == pip install -e ".[app]"
|
|
117
|
+
├── src/
|
|
118
|
+
│ └── eda_k/
|
|
119
|
+
│ ├── __init__.py # public API: analyze(), EDAResult
|
|
120
|
+
│ ├── eda_engine.py # core analysis (pandas/numpy/scipy, no UI)
|
|
121
|
+
│ ├── charts.py # Plotly chart builders
|
|
122
|
+
│ ├── chat_assistant.py # local rule-based Q&A
|
|
123
|
+
│ └── report_builder.py # self-contained HTML report builder
|
|
124
|
+
└── apps/
|
|
125
|
+
└── streamlit_app.py # optional UI, imports from the installed package
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Supported file types
|
|
129
|
+
CSV, TSV, TXT (auto-delimiter-detect), XLSX, XLS, JSON, Parquet.
|
|
130
|
+
|
|
131
|
+
## Notes / known limits
|
|
132
|
+
- Very large files (millions of rows) will be slower to chart; consider
|
|
133
|
+
sampling first if you hit performance issues.
|
|
134
|
+
- The "likely datetime column" detector is a heuristic on a small sample —
|
|
135
|
+
always double-check it against the Overview before relying on it.
|
|
136
|
+
- The normality test (Shapiro-Wilk) samples large columns down to 5,000 rows by default
|
|
137
|
+
for speed.
|
eda_k-0.1.0/README.md
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# eda-k
|
|
2
|
+
|
|
3
|
+
Automated, local exploratory data analysis — as a **Python library** you can
|
|
4
|
+
`import`, with an optional Streamlit UI on top.
|
|
5
|
+
|
|
6
|
+
Runs 100% locally. Your data never leaves your machine, no API key needed.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install -e .
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Want everything in one shot (library + Streamlit app + OLS trendlines)?
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -e ".[app,trend]"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Or pick extras individually:
|
|
23
|
+
|
|
24
|
+
Need the bundled Streamlit app too?
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install -e ".[app]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Need OLS trendlines on scatter plots (`charts.pairwise_scatter_with_trendline`)?
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install -e ".[trend]"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Use it as a library
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import eda_k
|
|
42
|
+
|
|
43
|
+
result = eda_k.analyze("data.csv") # path, file-like, or DataFrame all work
|
|
44
|
+
|
|
45
|
+
print(result) # <EDAResult 'data.csv' rows=150 cols=5 ...>
|
|
46
|
+
print(result.summary()) # quick text overview
|
|
47
|
+
print(result.ask("which columns have missing values?"))
|
|
48
|
+
|
|
49
|
+
result.to_html("report.html") # self-contained HTML report (charts inline)
|
|
50
|
+
result.to_csv_zip("tables.zip") # every summary table as CSVs in one ZIP
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`result.df` is the loaded `pandas.DataFrame`, and `result.results` is the raw
|
|
54
|
+
dict of every table (`overview`, `missing_summary`, `numeric_summary`,
|
|
55
|
+
`outliers`, `categorical_summary`, `correlation`, `top_correlations`,
|
|
56
|
+
`dtype_table`) if you want to work with the data directly.
|
|
57
|
+
|
|
58
|
+
### Lower-level access
|
|
59
|
+
|
|
60
|
+
The underlying modules are also importable as submodules for full control:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from eda_k import eda_engine, charts, chat_assistant, report_builder
|
|
64
|
+
|
|
65
|
+
df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
|
|
66
|
+
results = eda_engine.run_full_eda(df)
|
|
67
|
+
fig = charts.histogram(df, "some_column")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Use the Streamlit UI
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install -e ".[app]"
|
|
76
|
+
# Run from a checkout of the source repository: apps/ is not shipped in the published package.
streamlit run apps/streamlit_app.py
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Opens a browser tab with upload, tabs (Overview / Missing / Numeric / Outliers
|
|
80
|
+
/ Categorical / Correlation / Chat / Download), and one-click export of the
|
|
81
|
+
HTML report or a ZIP of CSVs — same as before, just now built on top of the
|
|
82
|
+
installed `eda_k` package instead of loose scripts.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Project layout
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
eda-k/
|
|
90
|
+
├── pyproject.toml
|
|
91
|
+
├── README.md
|
|
92
|
+
├── requirements.txt # convenience: pip install -r requirements.txt == pip install -e ".[app]"
|
|
93
|
+
├── src/
|
|
94
|
+
│ └── eda_k/
|
|
95
|
+
│ ├── __init__.py # public API: analyze(), EDAResult
|
|
96
|
+
│ ├── eda_engine.py # core analysis (pandas/numpy/scipy, no UI)
|
|
97
|
+
│ ├── charts.py # Plotly chart builders
|
|
98
|
+
│ ├── chat_assistant.py # local rule-based Q&A
|
|
99
|
+
│ └── report_builder.py # self-contained HTML report builder
|
|
100
|
+
└── apps/
|
|
101
|
+
└── streamlit_app.py # optional UI, imports from the installed package
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Supported file types
|
|
105
|
+
CSV, TSV, TXT (auto-delimiter-detect), XLSX, XLS, JSON, Parquet.
|
|
106
|
+
|
|
107
|
+
## Notes / known limits
|
|
108
|
+
- Very large files (millions of rows) will be slower to chart; consider
|
|
109
|
+
sampling first if you hit performance issues.
|
|
110
|
+
- The "likely datetime column" detector is a heuristic on a small sample —
|
|
111
|
+
always double-check it against the Overview before relying on it.
|
|
112
|
+
- The normality test (Shapiro-Wilk) samples large columns down to 5,000 rows by default
|
|
113
|
+
for speed.
|
eda_k-0.1.0/licence.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2026 kishan prajapati
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "eda-k"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Automated, local exploratory data analysis: stats, charts, correlations, outliers, a chat assistant, and self-contained HTML reports."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Kishan Prajapati" }]
|
|
13
|
+
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pandas>=2.0",
|
|
16
|
+
"numpy>=1.24",
|
|
17
|
+
"scipy>=1.10",
|
|
18
|
+
"plotly>=5.20",
|
|
19
|
+
"openpyxl>=3.1",
|
|
20
|
+
"xlrd>=2.0",
|
|
21
|
+
"pyarrow>=14.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
# Needed only for the bundled Streamlit UI in apps/streamlit_app.py
|
|
26
|
+
app = ["streamlit>=1.36"]
|
|
27
|
+
# Needed only for charts.pairwise_scatter_with_trendline()
|
|
28
|
+
trend = ["statsmodels"]
|
|
29
|
+
dev = ["pytest"]
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
where = ["src"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
-e .[app]
|
eda_k-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
eda_k
|
|
3
|
+
=====
|
|
4
|
+
A local, no-API-key automated EDA toolkit.
|
|
5
|
+
|
|
6
|
+
Quick start
|
|
7
|
+
-----------
|
|
8
|
+
import eda_k
|
|
9
|
+
|
|
10
|
+
result = eda_k.analyze("data.csv")
|
|
11
|
+
result.summary() # quick text summary
|
|
12
|
+
result.to_html("report.html") # full self-contained HTML report
|
|
13
|
+
result.to_csv_zip("tables.zip") # all summary tables as a ZIP of CSVs
|
|
14
|
+
result.ask("which columns have missing values?")
|
|
15
|
+
|
|
16
|
+
Lower-level building blocks (`eda_engine`, `charts`, `chat_assistant`,
|
|
17
|
+
`report_builder`) remain available as submodules for anyone who wants
|
|
18
|
+
finer-grained control, e.g.:
|
|
19
|
+
|
|
20
|
+
from eda_k import eda_engine, charts
|
|
21
|
+
df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
|
|
22
|
+
results = eda_engine.run_full_eda(df)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import io
|
|
28
|
+
import zipfile
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Optional, Union
|
|
31
|
+
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
from . import charts, chat_assistant, eda_engine, report_builder
|
|
35
|
+
from .chat_assistant import SUGGESTED_QUESTIONS, answer_question
|
|
36
|
+
from .eda_engine import load_file, run_full_eda
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"analyze",
|
|
40
|
+
"EDAResult",
|
|
41
|
+
"eda_engine",
|
|
42
|
+
"charts",
|
|
43
|
+
"chat_assistant",
|
|
44
|
+
"report_builder",
|
|
45
|
+
"load_file",
|
|
46
|
+
"run_full_eda",
|
|
47
|
+
"answer_question",
|
|
48
|
+
"SUGGESTED_QUESTIONS",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
__version__ = "0.1.0"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class EDAResult:
    """
    Convenience wrapper bundling a DataFrame with its EDA results.

    Returned by `eda_k.analyze(...)`. Wraps the same `results` dict
    produced by `eda_engine.run_full_eda`, plus helpers for reporting,
    exporting, and Q&A — so you don't need Streamlit to get value out
    of this library.

    Attributes
    ----------
    df : pd.DataFrame
        The analysed data.
    results : dict
        Raw EDA output. The methods below read the keys `overview`,
        `missing_summary`, `numeric_summary`, `outliers`,
        `categorical_summary`, `correlation`, `top_correlations` and
        `dtype_table`.
    name : str
        Display name used in `repr()` and as the report filename.
    """

    def __init__(self, df: pd.DataFrame, results: dict, name: str = "dataset"):
        self.df = df
        self.results = results
        self.name = name

    # -- introspection -----------------------------------------------------

    def __repr__(self) -> str:
        """Return a one-line overview: name, shape, missing %, duplicates."""
        ov = self.results["overview"]
        return (
            f"<EDAResult '{self.name}' rows={ov['n_rows']:,} "
            f"cols={ov['n_cols']:,} missing={ov['missing_pct']}% "
            f"duplicates={ov['duplicate_rows']:,}>"
        )

    def summary(self) -> str:
        """Return a short human-readable text summary."""
        # Delegates to the chat assistant with a fixed "summary" question.
        return answer_question("give me a summary of this dataset", self.df, self.results)

    def ask(self, question: str) -> str:
        """Ask a natural-language question about the dataset."""
        return answer_question(question, self.df, self.results)

    # -- figures -------------------------------------------------------------

    def build_figures(self, top_n_cats: int = 10) -> dict:
        """
        Build the full set of Plotly figures used in the HTML report.

        Parameters
        ----------
        top_n_cats : int
            Forwarded as `top_n` to `charts.bar_categorical`.

        Returns
        -------
        dict
            Keys `missing_bar`, `histograms`, `boxplots`,
            `categorical_bars` and `corr_heatmap`. The three middle
            entries are dicts keyed by column; `corr_heatmap` is `None`
            when the correlation matrix is empty.
        """
        ov = self.results["overview"]
        correlation = self.results["correlation"]
        return {
            "missing_bar": charts.missing_values_bar(self.results["missing_summary"]),
            "histograms": {c: charts.histogram(self.df, c) for c in ov["numeric_cols"]},
            "boxplots": {c: charts.boxplot(self.df, c) for c in ov["numeric_cols"]},
            "categorical_bars": {
                c: charts.bar_categorical(self.df, c, top_n=top_n_cats)
                for c in ov["categorical_cols"]
            },
            "corr_heatmap": (
                charts.correlation_heatmap(correlation)
                if not correlation.empty
                else None
            ),
        }

    # -- export --------------------------------------------------------------

    def to_html(
        self,
        path: Optional[Union[str, Path]] = None,
        include_advanced_stats: bool = True,
        top_n_cats: int = 10,
    ) -> str:
        """
        Build the self-contained HTML report.

        If `path` is given, also writes the report to disk (UTF-8).
        Always returns the HTML string.

        Parameters
        ----------
        path : str | Path, optional
            Destination file for the report.
        include_advanced_stats : bool
            Forwarded unchanged to `report_builder.build_html_report`.
        top_n_cats : int
            Forwarded to `build_figures`.
        """
        figures = self.build_figures(top_n_cats=top_n_cats)
        html_str = report_builder.build_html_report(
            self.df,
            self.results,
            figures,
            filename=self.name,
            include_advanced_stats=include_advanced_stats,
        )
        if path is not None:
            Path(path).write_text(html_str, encoding="utf-8")
        return html_str

    def to_csv_zip(self, path: Optional[Union[str, Path]] = None) -> bytes:
        """
        Build a ZIP archive of every summary table as CSV.

        If `path` is given, also writes the ZIP to disk.
        Always returns the ZIP bytes.

        The correlation matrix is skipped when it is empty. Each
        categorical column gets its own `categorical_<column>.csv`, where
        non-alphanumeric characters in the column label are replaced by
        `_`. Labels that are not strings are converted with `str()`, and
        labels that sanitise to the same text get a numeric suffix
        (`_2`, `_3`, ...) so no archive member is written twice.
        """
        results = self.results
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("dtype_summary.csv", results["dtype_table"].to_csv(index=False))
            zf.writestr("missing_values.csv", results["missing_summary"].to_csv(index=False))
            zf.writestr("numeric_summary.csv", results["numeric_summary"].to_csv(index=False))
            zf.writestr("outliers.csv", results["outliers"].to_csv(index=False))
            if not results["correlation"].empty:
                # The matrix keeps its index: the row labels are the column names.
                zf.writestr("correlation_matrix.csv", results["correlation"].to_csv())
            zf.writestr("top_correlations.csv", results["top_correlations"].to_csv(index=False))

            used_names = set()
            for col, info in results["categorical_summary"].items():
                # str(col): column labels may be ints or other non-string objects.
                base = "".join(ch if ch.isalnum() else "_" for ch in str(col))
                safe = base
                suffix = 2
                # Distinct labels can sanitise to the same text ("a b" vs "a_b").
                while safe in used_names:
                    safe = f"{base}_{suffix}"
                    suffix += 1
                used_names.add(safe)
                zf.writestr(f"categorical_{safe}.csv", info["top_values"].to_csv(index=False))
        data = buf.getvalue()
        if path is not None:
            Path(path).write_bytes(data)
        return data
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def analyze(
    source: Union[str, Path, "io.IOBase", pd.DataFrame],
    filename: Optional[str] = None,
    outlier_method: str = "Both",
    correlation_method: str = "pearson",
    max_sample_size: int = 5000,
) -> EDAResult:
    """
    Run a full automated EDA pipeline on a file path, file-like object,
    or an existing DataFrame, and return an `EDAResult`.

    Parameters
    ----------
    source : str | os.PathLike | file-like | pd.DataFrame
        A path to a CSV/TSV/TXT/XLSX/XLS/JSON/Parquet file, an open
        file-like object, or an already-loaded DataFrame. A DataFrame is
        copied, so the caller's object is not the one stored on the result.
    filename : str, optional
        Used for display/report naming and passed to `load_file` as the
        name of the data. Defaults to the path's final component, the
        file-like object's `.name`, or "dataset" for a DataFrame.
        Required when `source` is a file-like object without a `.name`
        attribute and not a DataFrame.
    outlier_method : {"IQR", "Z-score", "Both"}
    correlation_method : {"pearson", "spearman", "kendall"}
    max_sample_size : int
        Max rows sampled for the Shapiro-Wilk normality test.

    Returns
    -------
    EDAResult
        The loaded DataFrame bundled with the `run_full_eda` results.

    Raises
    ------
    ValueError
        If a file-like `source` has no usable name and `filename` is not
        given, or if the resulting DataFrame is empty (`DataFrame.empty`).

    Examples
    --------
    >>> result = eda_k.analyze("data.csv")
    >>> result.summary()
    >>> result.to_html("report.html")
    """
    # Local import keeps this block self-contained; os is stdlib.
    import os

    if isinstance(source, pd.DataFrame):
        df = source.copy()
        name = filename or "dataset"
    elif isinstance(source, (str, os.PathLike)):
        # os.PathLike covers pathlib.Path plus other path objects such as
        # os.DirEntry, which must be opened rather than read as a stream.
        path = Path(source)
        name = filename or path.name
        with open(path, "rb") as f:
            df = load_file(f, name)
    else:
        # file-like object
        name = filename or getattr(source, "name", None)
        if not name:
            raise ValueError(
                "filename is required when passing a file-like object "
                "without a '.name' attribute"
            )
        df = load_file(source, name)

    if df.empty:
        raise ValueError(f"'{name}' loaded but contains no rows.")

    results = run_full_eda(
        df,
        outlier_method=outlier_method,
        correlation_method=correlation_method,
        max_sample_size=max_sample_size,
    )
    return EDAResult(df, results, name=name)
|