eda-k 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ global-
2
+ include *.txt *.py
3
+ include README.md
4
+ include pyproject.toml
5
+ include requirements.txt
6
+ recursive-include src *.py
eda_k-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: eda-k
3
+ Version: 0.1.0
4
+ Summary: Automated, local exploratory data analysis: stats, charts, correlations, outliers, a chat assistant, and self-contained HTML reports.
5
+ Author: Kishan Prajapati
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: licence.txt
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: numpy>=1.24
12
+ Requires-Dist: scipy>=1.10
13
+ Requires-Dist: plotly>=5.20
14
+ Requires-Dist: openpyxl>=3.1
15
+ Requires-Dist: xlrd>=2.0
16
+ Requires-Dist: pyarrow>=14.0
17
+ Provides-Extra: app
18
+ Requires-Dist: streamlit>=1.36; extra == "app"
19
+ Provides-Extra: trend
20
+ Requires-Dist: statsmodels; extra == "trend"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # eda-k
26
+
27
+ Automated, local exploratory data analysis — as a **Python library** you can
28
+ `import`, with an optional Streamlit UI on top.
29
+
30
+ Runs 100% locally. Your data never leaves your machine, no API key needed.
31
+
32
+ ---
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install -e .
38
+ ```
39
+
40
+ Want everything in one shot (library + Streamlit app + OLS trendlines)?
41
+
42
+ ```bash
43
+ pip install -e ".[app,trend]"
44
+ ```
45
+
46
+ Or pick extras individually:
47
+
48
+ Need the bundled Streamlit app too?
49
+
50
+ ```bash
51
+ pip install -e ".[app]"
52
+ ```
53
+
54
+ Need OLS trendlines on scatter plots (`charts.pairwise_scatter_with_trendline`)?
55
+
56
+ ```bash
57
+ pip install -e ".[trend]"
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Use it as a library
63
+
64
+ ```python
65
+ import eda_k
66
+
67
+ result = eda_k.analyze("data.csv") # path, file-like, or DataFrame all work
68
+
69
+ print(result) # <EDAResult 'data.csv' rows=150 cols=5 ...>
70
+ print(result.summary()) # quick text overview
71
+ print(result.ask("which columns have missing values?"))
72
+
73
+ result.to_html("report.html") # self-contained HTML report (charts inline)
74
+ result.to_csv_zip("tables.zip") # every summary table as CSVs in one ZIP
75
+ ```
76
+
77
+ `result.df` is the loaded `pandas.DataFrame`, and `result.results` is the raw
78
+ dict of every table (`overview`, `missing_summary`, `numeric_summary`,
79
+ `outliers`, `categorical_summary`, `correlation`, `top_correlations`,
80
+ `dtype_table`) if you want to work with the data directly.
81
+
82
+ ### Lower-level access
83
+
84
+ The original modules are available as submodules, unchanged, for full control:
85
+
86
+ ```python
87
+ from eda_k import eda_engine, charts, chat_assistant, report_builder
88
+
89
+ df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
90
+ results = eda_engine.run_full_eda(df)
91
+ fig = charts.histogram(df, "some_column")
92
+ ```
93
+
94
+ ---
95
+
96
+ ## Use the Streamlit UI
97
+
98
+ ```bash
99
+ pip install -e ".[app]"
100
+ streamlit run apps/streamlit_app.py
101
+ ```
102
+
103
+ Opens a browser tab with upload, tabs (Overview / Missing / Numeric / Outliers
104
+ / Categorical / Correlation / Chat / Download), and one-click export of the
105
+ HTML report or a ZIP of CSVs — same as before, just now built on top of the
106
+ installed `eda_k` package instead of loose scripts.
107
+
108
+ ---
109
+
110
+ ## Project layout
111
+
112
+ ```
113
+ eda-k/
114
+ ├── pyproject.toml
115
+ ├── README.md
116
+ ├── requirements.txt # convenience: pip install -r requirements.txt == pip install -e ".[app]"
117
+ ├── src/
118
+ │ └── eda_k/
119
+ │ ├── __init__.py # public API: analyze(), EDAResult
120
+ │ ├── eda_engine.py # core analysis (pandas/numpy/scipy, no UI)
121
+ │ ├── charts.py # Plotly chart builders
122
+ │ ├── chat_assistant.py # local rule-based Q&A
123
+ │ └── report_builder.py # self-contained HTML report builder
124
+ └── apps/
125
+ └── streamlit_app.py # optional UI, imports from the installed package
126
+ ```
127
+
128
+ ## Supported file types
129
+ CSV, TSV, TXT (auto-delimiter-detect), XLSX, XLS, JSON, Parquet.
130
+
131
+ ## Notes / known limits
132
+ - Very large files (millions of rows) will be slower to chart; consider
133
+ sampling first if you hit performance issues.
134
+ - The "likely datetime column" detector is a heuristic on a small sample —
135
+ always double check it against the Overview before trusting it blindly.
136
+ - Normality test (Shapiro-Wilk) auto-samples to 5,000 rows for large columns
137
+ for speed.
eda_k-0.1.0/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # eda-k
2
+
3
+ Automated, local exploratory data analysis — as a **Python library** you can
4
+ `import`, with an optional Streamlit UI on top.
5
+
6
+ Runs 100% locally. Your data never leaves your machine, no API key needed.
7
+
8
+ ---
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install -e .
14
+ ```
15
+
16
+ Want everything in one shot (library + Streamlit app + OLS trendlines)?
17
+
18
+ ```bash
19
+ pip install -e ".[app,trend]"
20
+ ```
21
+
22
+ Or pick extras individually:
23
+
24
+ Need the bundled Streamlit app too?
25
+
26
+ ```bash
27
+ pip install -e ".[app]"
28
+ ```
29
+
30
+ Need OLS trendlines on scatter plots (`charts.pairwise_scatter_with_trendline`)?
31
+
32
+ ```bash
33
+ pip install -e ".[trend]"
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Use it as a library
39
+
40
+ ```python
41
+ import eda_k
42
+
43
+ result = eda_k.analyze("data.csv") # path, file-like, or DataFrame all work
44
+
45
+ print(result) # <EDAResult 'data.csv' rows=150 cols=5 ...>
46
+ print(result.summary()) # quick text overview
47
+ print(result.ask("which columns have missing values?"))
48
+
49
+ result.to_html("report.html") # self-contained HTML report (charts inline)
50
+ result.to_csv_zip("tables.zip") # every summary table as CSVs in one ZIP
51
+ ```
52
+
53
+ `result.df` is the loaded `pandas.DataFrame`, and `result.results` is the raw
54
+ dict of every table (`overview`, `missing_summary`, `numeric_summary`,
55
+ `outliers`, `categorical_summary`, `correlation`, `top_correlations`,
56
+ `dtype_table`) if you want to work with the data directly.
57
+
58
+ ### Lower-level access
59
+
60
+ The original modules are available as submodules, unchanged, for full control:
61
+
62
+ ```python
63
+ from eda_k import eda_engine, charts, chat_assistant, report_builder
64
+
65
+ df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
66
+ results = eda_engine.run_full_eda(df)
67
+ fig = charts.histogram(df, "some_column")
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Use the Streamlit UI
73
+
74
+ ```bash
75
+ pip install -e ".[app]"
76
+ streamlit run apps/streamlit_app.py
77
+ ```
78
+
79
+ Opens a browser tab with upload, tabs (Overview / Missing / Numeric / Outliers
80
+ / Categorical / Correlation / Chat / Download), and one-click export of the
81
+ HTML report or a ZIP of CSVs — same as before, just now built on top of the
82
+ installed `eda_k` package instead of loose scripts.
83
+
84
+ ---
85
+
86
+ ## Project layout
87
+
88
+ ```
89
+ eda-k/
90
+ ├── pyproject.toml
91
+ ├── README.md
92
+ ├── requirements.txt # convenience: pip install -r requirements.txt == pip install -e ".[app]"
93
+ ├── src/
94
+ │ └── eda_k/
95
+ │ ├── __init__.py # public API: analyze(), EDAResult
96
+ │ ├── eda_engine.py # core analysis (pandas/numpy/scipy, no UI)
97
+ │ ├── charts.py # Plotly chart builders
98
+ │ ├── chat_assistant.py # local rule-based Q&A
99
+ │ └── report_builder.py # self-contained HTML report builder
100
+ └── apps/
101
+ └── streamlit_app.py # optional UI, imports from the installed package
102
+ ```
103
+
104
+ ## Supported file types
105
+ CSV, TSV, TXT (auto-delimiter-detect), XLSX, XLS, JSON, Parquet.
106
+
107
+ ## Notes / known limits
108
+ - Very large files (millions of rows) will be slower to chart; consider
109
+ sampling first if you hit performance issues.
110
+ - The "likely datetime column" detector is a heuristic on a small sample —
111
+ always double check it against the Overview before trusting it blindly.
112
+ - Normality test (Shapiro-Wilk) auto-samples to 5,000 rows for large columns
113
+ for speed.
@@ -0,0 +1,6 @@
1
+ change log
2
+ ============
3
+
4
+ 0.1.0 (24/06/2026)
5
+ --------------------
6
+ - First Release
@@ -0,0 +1,7 @@
1
+ Copyright 2026 kishan prajapati
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "eda-k"
7
+ version = "0.1.0"
8
+ description = "Automated, local exploratory data analysis: stats, charts, correlations, outliers, a chat assistant, and self-contained HTML reports."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Kishan Prajapati" }]
13
+
14
+ dependencies = [
15
+ "pandas>=2.0",
16
+ "numpy>=1.24",
17
+ "scipy>=1.10",
18
+ "plotly>=5.20",
19
+ "openpyxl>=3.1",
20
+ "xlrd>=2.0",
21
+ "pyarrow>=14.0",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ # Needed only for the bundled Streamlit UI in apps/streamlit_app.py
26
+ app = ["streamlit>=1.36"]
27
+ # Needed only for charts.pairwise_scatter_with_trendline()
28
+ trend = ["statsmodels"]
29
+ dev = ["pytest"]
30
+
31
+ [tool.setuptools.packages.find]
32
+ where = ["src"]
@@ -0,0 +1 @@
1
+ -e .[app]
eda_k-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,218 @@
1
+ """
2
+ eda_k
3
+ =====
4
+ A local, no-API-key automated EDA toolkit.
5
+
6
+ Quick start
7
+ -----------
8
+ import eda_k
9
+
10
+ result = eda_k.analyze("data.csv")
11
+ result.summary() # quick text summary
12
+ result.to_html("report.html") # full self-contained HTML report
13
+ result.to_csv_zip("tables.zip") # all summary tables as a ZIP of CSVs
14
+ result.ask("which columns have missing values?")
15
+
16
+ Lower-level building blocks (`eda_engine`, `charts`, `chat_assistant`,
17
+ `report_builder`) remain available as submodules for anyone who wants
18
+ finer-grained control, e.g.:
19
+
20
+ from eda_k import eda_engine, charts
21
+ df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
22
+ results = eda_engine.run_full_eda(df)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import io
28
+ import zipfile
29
+ from pathlib import Path
30
+ from typing import Optional, Union
31
+
32
+ import pandas as pd
33
+
34
+ from . import charts, chat_assistant, eda_engine, report_builder
35
+ from .chat_assistant import SUGGESTED_QUESTIONS, answer_question
36
+ from .eda_engine import load_file, run_full_eda
37
+
38
+ __all__ = [
39
+ "analyze",
40
+ "EDAResult",
41
+ "eda_engine",
42
+ "charts",
43
+ "chat_assistant",
44
+ "report_builder",
45
+ "load_file",
46
+ "run_full_eda",
47
+ "answer_question",
48
+ "SUGGESTED_QUESTIONS",
49
+ ]
50
+
51
+ __version__ = "0.1.0"
52
+
53
+
54
class EDAResult:
    """
    Convenience wrapper bundling a DataFrame with its EDA results.

    Returned by `eda_k.analyze(...)`. Wraps the same `results` dict
    produced by `eda_engine.run_full_eda`, plus helpers for reporting,
    exporting, and Q&A — so you don't need Streamlit to get value out
    of this library.

    Attributes
    ----------
    df : pd.DataFrame
        The analysed data.
    results : dict
        Result tables keyed by name. The methods below read the keys
        ``overview``, ``missing_summary``, ``numeric_summary``,
        ``outliers``, ``categorical_summary``, ``correlation``,
        ``top_correlations`` and ``dtype_table``.
    name : str
        Display name shown in ``repr()`` and passed to the HTML report
        builder as the filename.
    """

    def __init__(self, df: pd.DataFrame, results: dict, name: str = "dataset"):
        self.df = df
        self.results = results
        self.name = name

    # -- introspection -----------------------------------------------------

    def __repr__(self) -> str:
        """Return a one-line digest built from ``results["overview"]``."""
        ov = self.results["overview"]
        return (
            f"<EDAResult '{self.name}' rows={ov['n_rows']:,} "
            f"cols={ov['n_cols']:,} missing={ov['missing_pct']}% "
            f"duplicates={ov['duplicate_rows']:,}>"
        )

    def summary(self) -> str:
        """Return a short human-readable text summary.

        Implemented by sending a fixed "summary" question to
        `chat_assistant.answer_question`.
        """
        return answer_question("give me a summary of this dataset", self.df, self.results)

    def ask(self, question: str) -> str:
        """Ask a natural-language question about the dataset.

        The question is forwarded unchanged to
        `chat_assistant.answer_question` together with the DataFrame and
        the results dict.
        """
        return answer_question(question, self.df, self.results)

    # -- figures -------------------------------------------------------------

    def build_figures(self, top_n_cats: int = 10) -> dict:
        """Build the full set of Plotly figures used in the HTML report.

        Parameters
        ----------
        top_n_cats : int
            Forwarded as ``top_n`` to `charts.bar_categorical` for every
            categorical column.

        Returns
        -------
        dict
            Keys ``missing_bar``, ``histograms``, ``boxplots``,
            ``categorical_bars`` (the middle three are dicts keyed by
            column name) and ``corr_heatmap``, which is ``None`` when the
            correlation table is empty.
        """
        ov = self.results["overview"]
        return {
            "missing_bar": charts.missing_values_bar(self.results["missing_summary"]),
            "histograms": {c: charts.histogram(self.df, c) for c in ov["numeric_cols"]},
            "boxplots": {c: charts.boxplot(self.df, c) for c in ov["numeric_cols"]},
            "categorical_bars": {
                c: charts.bar_categorical(self.df, c, top_n=top_n_cats)
                for c in ov["categorical_cols"]
            },
            "corr_heatmap": (
                charts.correlation_heatmap(self.results["correlation"])
                if not self.results["correlation"].empty
                else None
            ),
        }

    # -- export --------------------------------------------------------------

    def to_html(
        self,
        path: Optional[Union[str, Path]] = None,
        include_advanced_stats: bool = True,
        top_n_cats: int = 10,
    ) -> str:
        """
        Build the self-contained HTML report.

        If `path` is given, also writes the report to disk (UTF-8).
        Always returns the HTML string.

        Parameters
        ----------
        path : str | Path, optional
            Destination file for the report.
        include_advanced_stats : bool
            Forwarded unchanged to `report_builder.build_html_report`.
        top_n_cats : int
            Forwarded to `build_figures`.
        """
        figures = self.build_figures(top_n_cats=top_n_cats)
        html_str = report_builder.build_html_report(
            self.df,
            self.results,
            figures,
            filename=self.name,
            include_advanced_stats=include_advanced_stats,
        )
        if path is not None:
            Path(path).write_text(html_str, encoding="utf-8")
        return html_str

    def to_csv_zip(self, path: Optional[Union[str, Path]] = None) -> bytes:
        """
        Build a ZIP archive of every summary table as CSV.

        If `path` is given, also writes the ZIP to disk.
        Always returns the ZIP bytes.

        The correlation tables are skipped when the correlation matrix is
        empty. Each categorical column gets its own
        ``categorical_<label>.csv`` member, where every non-alphanumeric
        character of the label is replaced by ``_``.
        """
        results = self.results
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("dtype_summary.csv", results["dtype_table"].to_csv(index=False))
            zf.writestr("missing_values.csv", results["missing_summary"].to_csv(index=False))
            zf.writestr("numeric_summary.csv", results["numeric_summary"].to_csv(index=False))
            zf.writestr("outliers.csv", results["outliers"].to_csv(index=False))
            if not results["correlation"].empty:
                zf.writestr("correlation_matrix.csv", results["correlation"].to_csv())
                zf.writestr("top_correlations.csv", results["top_correlations"].to_csv(index=False))

            used_members = set()
            for col, info in results["categorical_summary"].items():
                # Column labels are not guaranteed to be strings (e.g. the
                # integer labels of a header-less frame), so stringify the
                # label before sanitising it character by character.
                safe = "".join(ch if ch.isalnum() else "_" for ch in str(col))
                member = f"categorical_{safe}.csv"
                # Different labels can sanitise to the same text ("a b" and
                # "a_b"); add a numeric suffix so no member is duplicated.
                suffix = 2
                while member in used_members:
                    member = f"categorical_{safe}_{suffix}.csv"
                    suffix += 1
                used_members.add(member)
                zf.writestr(member, info["top_values"].to_csv(index=False))
        data = buf.getvalue()
        if path is not None:
            Path(path).write_bytes(data)
        return data
157
+
158
+
159
def analyze(
    source: Union[str, Path, "io.IOBase", pd.DataFrame],
    filename: Optional[str] = None,
    outlier_method: str = "Both",
    correlation_method: str = "pearson",
    max_sample_size: int = 5000,
) -> EDAResult:
    """
    Load `source` if necessary, run the full EDA pipeline on it, and wrap
    the outcome in an `EDAResult`.

    Parameters
    ----------
    source : str | Path | file-like | pd.DataFrame
        A path to a CSV/TSV/TXT/XLSX/XLS/JSON/Parquet file, an open
        file-like object, or an already-loaded DataFrame. A DataFrame is
        copied before analysis; a path is opened in binary mode.
    filename : str, optional
        Display/report name. Defaults to ``"dataset"`` for a DataFrame,
        to the final path component for a path, and to the object's
        ``.name`` attribute for a file-like source (so it is required
        when that attribute is missing or empty). For non-DataFrame
        sources the resolved name is also passed to `load_file`, which
        uses it to infer the file type.
    outlier_method : {"IQR", "Z-score", "Both"}
    correlation_method : {"pearson", "spearman", "kendall"}
    max_sample_size : int
        Max rows sampled for the Shapiro-Wilk normality test.

    Raises
    ------
    ValueError
        If a file-like source has no usable name and `filename` was not
        given, or if the resulting DataFrame is empty.

    Examples
    --------
    >>> result = eda_k.analyze("data.csv")
    >>> result.summary()
    >>> result.to_html("report.html")
    """
    if isinstance(source, pd.DataFrame):
        # Work on a copy so the caller's frame is never touched.
        label = filename or "dataset"
        frame = source.copy()
    elif isinstance(source, (str, Path)):
        file_path = Path(source)
        label = filename or file_path.name
        with file_path.open("rb") as handle:
            frame = load_file(handle, label)
    else:
        # Anything else is treated as an already-open file-like object.
        label = filename or getattr(source, "name", None)
        if not label:
            raise ValueError(
                "filename is required when passing a file-like object "
                "without a '.name' attribute"
            )
        frame = load_file(source, label)

    if frame.empty:
        raise ValueError(f"'{label}' loaded but contains no rows.")

    eda_options = {
        "outlier_method": outlier_method,
        "correlation_method": correlation_method,
        "max_sample_size": max_sample_size,
    }
    eda_tables = run_full_eda(frame, **eda_options)
    return EDAResult(frame, eda_tables, name=label)