firstlook 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firstlook/__init__.py +24 -0
- firstlook/baseline.py +88 -0
- firstlook/detect.py +36 -0
- firstlook/recommend.py +81 -0
- firstlook/report.py +123 -0
- firstlook/theme.py +49 -0
- firstlook/visualize.py +125 -0
- firstlook-0.1.0.dist-info/METADATA +120 -0
- firstlook-0.1.0.dist-info/RECORD +12 -0
- firstlook-0.1.0.dist-info/WHEEL +5 -0
- firstlook-0.1.0.dist-info/licenses/LICENSE +21 -0
- firstlook-0.1.0.dist-info/top_level.txt +1 -0
firstlook/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""firstlook — drop in any dataframe, get models to try and the right charts.
|
|
2
|
+
|
|
3
|
+
import firstlook
|
|
4
|
+
firstlook.at(df, target="species")
|
|
5
|
+
|
|
6
|
+
That one call detects the problem type, recommends models (with reasons and
|
|
7
|
+
data-aware warnings), and renders a dark, interactive Plotly dashboard. It's
|
|
8
|
+
the first look you take at any dataset, done for you.
|
|
9
|
+
"""
|
|
10
|
+
from . import theme # noqa: F401 (registers the "firstlook" Plotly template on import)
|
|
11
|
+
from .detect import detect_task
|
|
12
|
+
from .recommend import recommend, Recommendation
|
|
13
|
+
from .visualize import visualize
|
|
14
|
+
from .report import play, Report
|
|
15
|
+
from .baseline import fit_baseline, Baseline
|
|
16
|
+
from .theme import THEME
|
|
17
|
+
|
|
18
|
+
at = play # headline alias: firstlook.at(df, target=...)
|
|
19
|
+
|
|
20
|
+
__version__ = "0.1.0"
|
|
21
|
+
__all__ = [
|
|
22
|
+
"at", "play", "recommend", "visualize", "detect_task", "fit_baseline",
|
|
23
|
+
"Report", "Recommendation", "Baseline", "THEME", "__version__",
|
|
24
|
+
]
|
firstlook/baseline.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Train the recommended baseline model and report an honest score.
|
|
2
|
+
|
|
3
|
+
Needs scikit-learn (the ``fit`` / ``dev`` extra). Preprocessing is built into
|
|
4
|
+
the pipeline (median-impute + scale numerics, most-frequent-impute + one-hot
|
|
5
|
+
categoricals), so it fits straight on the messy data the recommender only warns
|
|
6
|
+
about. Scoring is cross-validated, so a tiny test split can't flatter or wreck it.
|
|
7
|
+
"""
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Baseline:
    """Result of scoring a baseline model.

    When ``score`` is ``None`` the string form shows ``note`` instead of a
    score. ``std`` and ``cv`` are optional and are left out of the string
    form when they are ``None``.
    """

    model: Optional[str]    # estimator name shown in the summary
    metric: Optional[str]   # human-readable metric name
    score: Optional[float]  # mean score, or None when nothing was scored
    std: Optional[float] = None   # spread of the score, if known
    cv: Optional[int] = None      # number of CV folds, if known
    note: Optional[str] = None    # explanation shown when score is None

    def __str__(self):
        """Return a one-line, plain-text summary."""
        if self.score is None:
            return f"baseline: {self.note}"
        text = f"baseline: {self.model} {self.metric} {self.score:.3f}"
        # std / cv default to None; formatting None with ":.3f" would raise
        # TypeError, so each part is appended only when it is present.
        if self.std is not None:
            text += f" +/- {self.std:.3f}"
        if self.cv is not None:
            text += f" ({self.cv}-fold CV)"
        return text
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def fit_baseline(df, target, task):
    """Cross-validate the recommended start model; return a :class:`Baseline`.

    Rows whose target is missing are dropped. Numeric features are
    median-imputed and scaled; all other features are most-frequent-imputed
    and one-hot encoded. Regression scores ``LinearRegression`` with R2;
    anything else scores ``LogisticRegression`` with accuracy, switching to
    class weighting and balanced accuracy when one class exceeds 70% of rows.

    Returns a ``Baseline`` whose ``score`` is ``None`` (with an explanatory
    ``note``) when there is no target, too few rows, no feature columns, a
    single class, or a class with fewer than two samples.

    Raises:
        ImportError: if scikit-learn is not installed.
    """
    if task == "clustering" or target is None:
        return Baseline(None, None, None, note="no target, nothing to fit")
    try:
        from sklearn.linear_model import LinearRegression, LogisticRegression
        from sklearn.model_selection import cross_val_score
        from sklearn.pipeline import Pipeline
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder, StandardScaler
        from sklearn.impute import SimpleImputer
    except ImportError as e:
        raise ImportError(
            "baseline fitting needs scikit-learn; install it with "
            "`pip install 'firstlook[fit]'`"
        ) from e

    data = df.dropna(subset=[target])
    y = data[target]
    X = data.drop(columns=[target])
    n = len(data)
    if n < 10:
        return Baseline(None, None, None, note="too few rows to score reliably")
    if X.shape[1] == 0:
        # Previously reported as "too few rows", which misdiagnosed the problem.
        return Baseline(None, None, None, note="no feature columns to fit on")

    num = X.select_dtypes(include="number").columns.tolist()
    cat = [c for c in X.columns if c not in num]
    pre = ColumnTransformer(
        transformers=[
            ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                              ("sc", StandardScaler())]), num),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                              ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat),
        ],
        remainder="drop",
    )

    if task == "regression":
        model, name, metric, scoring = LinearRegression(), "LinearRegression", "R2", "r2"
        cv = min(5, max(2, n // 4))
    else:
        vc = y.value_counts()
        # A categorical target lists declared-but-unused categories with a
        # count of 0; those are not real classes and must not block fitting.
        vc = vc[vc > 0]
        if len(vc) < 2:
            # With one class every CV fit fails, so bail out with a note.
            return Baseline("LogisticRegression", "accuracy", None,
                            note="target has a single class; nothing to classify")
        per_class = int(vc.min())
        if per_class < 2:
            return Baseline("LogisticRegression", "accuracy", None,
                            note="a class has <2 samples; can't cross-validate")
        name = "LogisticRegression"
        # Stratified folds need at least one sample of every class per fold.
        cv = min(5, max(2, per_class))
        # plain accuracy flatters imbalanced data (a majority-only guesser scores
        # high), so on imbalance we both weight the classes (the tool's own advice)
        # and score with balanced accuracy. Otherwise plain LogisticRegression + accuracy.
        if (vc.max() / len(y)) > 0.7:
            model = LogisticRegression(max_iter=1000, class_weight="balanced")
            metric, scoring = "balanced accuracy", "balanced_accuracy"
        else:
            model = LogisticRegression(max_iter=1000)
            metric, scoring = "accuracy", "accuracy"

    pipe = Pipeline([("pre", pre), ("model", model)])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring)
    return Baseline(model=name, metric=metric, score=float(scores.mean()),
                    std=float(scores.std()), cv=cv)
|
firstlook/detect.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Figure out what kind of problem a dataframe poses."""
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
TASKS = ("regression", "classification", "clustering")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_task(df, target=None):
    """Classify the modelling problem posed by ``df`` and ``target``.

    Returns one of ``"regression"``, ``"classification"`` or ``"clustering"``:

    * no target, or a target column with no non-missing values -> clustering;
    * boolean or non-numeric target -> classification;
    * numeric target with exactly two distinct values -> classification;
    * other numeric targets are classification only when they have few
      distinct values both in absolute terms and relative to the row count
      (float: at most 10 and under 5% of rows; integer: at most 20 and under
      20% of rows); otherwise regression. The ratio test keeps a small
      regression set (e.g. 7 distinct values in 7 rows) from being read as
      7-class classification.

    Raises:
        KeyError: if ``target`` is given but is not a column of ``df``.
    """
    if target is None:
        return "clustering"
    if target not in df.columns:
        raise KeyError(f"target {target!r} is not a column. Columns: {list(df.columns)}")

    labels = df[target].dropna()
    if labels.empty:
        return "clustering"

    dtypes = pd.api.types
    if dtypes.is_bool_dtype(labels) or not dtypes.is_numeric_dtype(labels):
        return "classification"

    distinct = labels.nunique()
    if distinct == 2:
        # binary is classification regardless of dtype/size
        return "classification"

    # Floats get the stricter thresholds; everything else is treated as integer.
    if dtypes.is_float_dtype(labels):
        max_classes, max_ratio = 10, 0.05
    else:
        max_classes, max_ratio = 20, 0.2

    looks_encoded = distinct <= max_classes and distinct / len(labels) < max_ratio
    if looks_encoded:
        return "classification"
    return "regression"
|
firstlook/recommend.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Recommend models to try, with reasons and data-aware warnings.
|
|
2
|
+
|
|
3
|
+
The model lists follow scikit-learn's algorithm cheat-sheet; the warnings come
|
|
4
|
+
from looking at the actual data (imbalance, categoricals, missing values, tiny n).
|
|
5
|
+
"""
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import List, Tuple
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from .detect import detect_task
|
|
12
|
+
|
|
13
|
+
# (model name, one-line reason) pairs, in the order they are suggested.
_REGRESSION = [
    ("LinearRegression", "interpretable baseline, the right first try"),
    ("Ridge / Lasso", "linear + regularization; Lasso also drops weak features"),
    ("RandomForestRegressor", "non-linear, handles interactions, little tuning"),
    ("GradientBoosting / XGBoost", "usually the top scorer on tabular data"),
]

_CLASSIFICATION = [
    ("LogisticRegression", "interpretable baseline, gives probabilities"),
    ("KNeighborsClassifier", "simple, non-linear, nothing to train"),
    ("RandomForestClassifier", "strong default, handles mixed features"),
    ("GradientBoosting / XGBoost", "usually best on tabular data"),
]

_CLUSTERING = [
    ("KMeans", "the default when you roughly know how many groups"),
    ("DBSCAN", "arbitrary shapes, no k, flags outliers"),
    ("PCA -> cluster", "reduce dimensions first if many features"),
]


@dataclass
class Recommendation:
    """Models to try for a task, plus warnings derived from the data."""

    task: str                       # "regression", "classification" or "clustering"
    start: str                      # name of the model to try first
    models: List[Tuple[str, str]]   # (model name, reason) pairs
    notes: List[str]                # data-aware warnings; may be empty

    def __str__(self):  # plain-text rendering for scripts / print()
        lines = [f"task: {self.task} start with: {self.start}", "", "models to try:"]
        lines += [f" - {m:28} {r}" for m, r in self.models]
        if self.notes:
            lines += ["", "watch out for:"] + [f" -> {n}" for n in self.notes]
        return "\n".join(lines)


def recommend(df, target=None, task=None):
    """Return a :class:`Recommendation` for ``df`` predicting ``target``.

    ``task`` is detected from the data when not supplied. Any task other than
    ``"regression"`` or ``"classification"`` gets the clustering model list.

    Notes are added for: fewer than 50 rows; the class count and (above 70%
    majority share) class imbalance for classification; non-numeric feature
    columns; and any missing values in ``df``.
    """
    task = task or detect_task(df, target)
    n = len(df)
    feats = [c for c in df.columns if c != target]
    n_cat = sum(not pd.api.types.is_numeric_dtype(df[c]) for c in feats)
    notes = []

    if n < 50:
        notes.append(f"only {n} rows - any model will be shaky; more data is the biggest win")

    if task == "regression":
        start, models = "LinearRegression", _REGRESSION
    elif task == "classification":
        start, models = "LogisticRegression", _CLASSIFICATION
        if target is not None:
            y = df[target]
            notes.append(f"{y.nunique()} classes")
            vc = y.value_counts(normalize=True)
            if len(vc) and vc.max() > 0.7:
                notes.append(
                    f"imbalanced - {vc.idxmax()!r} is {vc.max()*100:.0f}% of rows; "
                    "watch recall, try class_weight"
                )
    else:
        start, models = "KMeans", _CLUSTERING

    if n_cat:
        notes.append(f"{n_cat} categorical feature(s) - encode (one-hot/ordinal) before fitting")
    if df.isna().any().any():
        notes.append("missing values present - impute or drop first")

    # Hand out a copy: the tables above are shared module state, and a caller
    # appending to or sorting ``.models`` must not alter later recommendations.
    return Recommendation(task=task, start=start, models=list(models), notes=notes)
|
firstlook/report.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""``play()`` / ``at()`` — the one call that ties it all together."""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from . import theme
|
|
5
|
+
from .detect import detect_task
|
|
6
|
+
from .recommend import recommend, Recommendation
|
|
7
|
+
from .visualize import visualize
|
|
8
|
+
from .baseline import fit_baseline, Baseline
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _in_notebook():
|
|
12
|
+
try:
|
|
13
|
+
from IPython import get_ipython
|
|
14
|
+
ip = get_ipython()
|
|
15
|
+
return ip is not None and ip.__class__.__name__ == "ZMQInteractiveShell"
|
|
16
|
+
except Exception:
|
|
17
|
+
return False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _card_html(title, rec, shape, target, baseline=None):
    """Build the HTML summary card for a report.

    Args:
        title: heading text.
        rec: object with ``task``, ``start``, ``models`` ((name, reason)
            pairs) and ``notes`` attributes.
        shape: ``(rows, cols)`` pair shown under the title.
        target: target column name (rendered via ``str``, so ``None`` shows
            as ``None``).
        baseline: optional object with ``model``, ``metric``, ``score``,
            ``std``, ``cv`` and ``note`` attributes.

    Returns:
        An HTML fragment as a string.

    All text that originates from the caller or the data (title, target,
    model names, reasons, notes, baseline fields) is HTML-escaped, so column
    names or class labels containing ``<`` or ``&`` cannot break or inject
    markup.
    """
    from html import escape  # local import: used only by this function

    def esc(value):
        return escape(str(value))

    accent = theme.ACCENT.get(rec.task, theme.CYAN)
    models = "".join(
        f'<li><span style="color:{theme.CYAN};font-family:monospace;font-weight:600">{esc(m)}</span>'
        f' <span style="color:#b9b9cc">{esc(r)}</span></li>' for m, r in rec.models)
    notes = "".join(f'<li style="color:#cfcfe0"><span style="color:{accent}">→ </span>{esc(n)}</li>'
                    for n in rec.notes) or '<li style="color:#cfcfe0">clean and ready to model.</li>'
    base = ""
    if baseline is not None and baseline.score is not None:
        # std / cv may be None; formatting None with ":.3f" would raise.
        spread_parts = []
        if baseline.std is not None:
            spread_parts.append(f"±{baseline.std:.3f}")
        if baseline.cv is not None:
            spread_parts.append(f"({esc(baseline.cv)}-fold CV)")
        spread = " ".join(spread_parts)
        base = (f'<div style="margin-top:14px;padding:10px 14px;background:#11112a;border-radius:8px;'
                f'font-size:13.5px"><span style="color:#8a8aa0">baseline</span> '
                f'<span style="color:{theme.CYAN};font-family:monospace">{esc(baseline.model)}</span> · '
                f'{esc(baseline.metric)} <span style="color:{accent};font-weight:700">{baseline.score:.3f}</span> '
                f'<span style="color:#8a8aa0">{spread}</span></div>')
    elif baseline is not None and baseline.note:
        base = f'<div style="margin-top:14px;font-size:13px;color:#8a8aa0">baseline: {esc(baseline.note)}</div>'
    return f"""<div style="background:{theme.BG};color:{theme.INK};font-family:Inter,system-ui,sans-serif;
border-radius:12px;padding:18px 20px;max-width:680px">
<div style="font-size:20px;font-weight:800;letter-spacing:-.02em">{esc(title)}</div>
<div style="margin:6px 0 14px;font-size:13px;color:#8a8aa0">
<span style="background:{accent};color:#08080f;font-weight:700;font-size:11px;padding:3px 11px;
border-radius:999px;text-transform:uppercase;letter-spacing:.08em">{esc(rec.task)}</span>
{esc(shape[0])} rows · {esc(shape[1])} cols · target: <code>{esc(target)}</code></div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px">
<div><div style="font-size:12px;letter-spacing:.1em;text-transform:uppercase;color:#8a8aa0;
margin-bottom:8px">models to try</div>
<ul style="list-style:none;margin:0;padding:0;font-size:13.5px;line-height:1.5">{models}</ul>
<div style="margin-top:10px;font-size:13px;color:#8a8aa0">start with
<span style="color:{accent};font-weight:700">{esc(rec.start)}</span></div></div>
<div><div style="font-size:12px;letter-spacing:.1em;text-transform:uppercase;color:#8a8aa0;
margin-bottom:8px">watch out for</div>
<ul style="list-style:none;margin:0;padding:0;font-size:13.5px;line-height:1.5">{notes}</ul></div>
</div>{base}</div>"""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class Report:
    """Everything produced for one dataframe: task, advice, figure, baseline."""

    title: str
    task: str
    recommendation: Recommendation
    figure: object      # must provide .show() and .to_html(...)
    shape: tuple        # (rows, cols), shown on the card
    target: str         # target column name; rendered via str on the card
    baseline: object = None   # optional baseline result, printed/rendered if set

    @property
    def start(self):
        """Model to try first (from the recommendation)."""
        return self.recommendation.start

    @property
    def models(self):
        """(model name, reason) pairs (from the recommendation)."""
        return self.recommendation.models

    @property
    def notes(self):
        """Data-aware warnings (from the recommendation)."""
        return self.recommendation.notes

    def show(self):
        """Render in a notebook (rich card + interactive charts) or print (scripts).

        Returns ``self`` so calls can be chained.
        """
        if _in_notebook():
            from IPython.display import HTML, display
            display(HTML(_card_html(self.title, self.recommendation, self.shape, self.target, self.baseline)))
            self.figure.show()
        else:
            print(self.title)
            print(self.recommendation)
            if self.baseline is not None:
                print("\n" + str(self.baseline))
        return self

    def to_html(self, path):
        """Write a standalone dark dashboard (recommendations + charts) to ``path``.

        The figure is embedded with ``include_plotlyjs="cdn"``. Returns ``path``.
        """
        from html import escape  # local import: used only by this method

        chart = self.figure.to_html(full_html=False, include_plotlyjs="cdn",
                                    config={"displayModeBar": False})
        card = _card_html(self.title, self.recommendation, self.shape, self.target, self.baseline)
        # The title is caller-supplied text: escape it so "<" or "&" in it
        # cannot break the document head.
        safe_title = escape(str(self.title))
        with open(path, "w", encoding="utf-8") as f:
            f.write(f'<!DOCTYPE html><html><head><meta charset="UTF-8"><title>{safe_title}</title></head>'
                    f'<body style="margin:0;background:{theme.BG}">'
                    f'<div style="max-width:1080px;margin:0 auto;padding:24px">{card}{chart}</div>'
                    f"</body></html>")
        return path

    def _repr_html_(self):
        """Return the summary card HTML (the ``_repr_html_`` rich-display hook)."""
        return _card_html(self.title, self.recommendation, self.shape, self.target, self.baseline)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def play(df, target=None, title=None, show=True, fit=False):
    """Inspect ``df``: detect the task, recommend models, and chart the data.

    Args:
        df: the dataframe to inspect.
        target: name of the column to predict, or ``None``.
        title: heading for the report; falls back to ``"your data"``.
        show: when true, call ``report.show()`` before returning.
        fit: when true, also cross-validate the recommended baseline model
            (needs scikit-learn: ``pip install 'firstlook[fit]'``).

    Returns a :class:`Report` (``.task``, ``.start``, ``.models``, ``.notes``,
    ``.figure``, ``.baseline``, ``.to_html(path)``).
    """
    task = detect_task(df, target)
    # Keyword arguments are evaluated in order: recommend, visualize, then
    # (optionally) the baseline fit.
    report = Report(
        title=title or "your data",
        task=task,
        recommendation=recommend(df, target, task),
        figure=visualize(df, target, task),
        shape=df.shape,
        target=target,
        baseline=fit_baseline(df, target, task) if fit else None,
    )
    if show:
        report.show()
    return report
|
firstlook/theme.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Brand theme — a registered Plotly template plus the raw palette.
|
|
2
|
+
|
|
3
|
+
Importing firstlook registers a ``"firstlook"`` Plotly template, so any figure
|
|
4
|
+
(yours or ours) can opt into the dark, high-contrast look with
|
|
5
|
+
``fig.update_layout(template="firstlook")``.
|
|
6
|
+
"""
|
|
7
|
+
import plotly.graph_objects as go
|
|
8
|
+
import plotly.io as pio
|
|
9
|
+
|
|
10
|
+
BG = "#080814"
|
|
11
|
+
PANEL = "#0c0c1f"
|
|
12
|
+
INK = "#e9e9f4"
|
|
13
|
+
MUTED = "#9a9ab0"
|
|
14
|
+
GRID = "rgba(255,255,255,0.07)"
|
|
15
|
+
|
|
16
|
+
CYAN = "#00E5FF"
|
|
17
|
+
GOLD = "#FFD27F"
|
|
18
|
+
PINK = "#FF4081"
|
|
19
|
+
PURPLE = "#7C4DFF"
|
|
20
|
+
GREEN = "#00E676"
|
|
21
|
+
AMBER = "#FFAB00"
|
|
22
|
+
|
|
23
|
+
PALETTE = [CYAN, PINK, GOLD, PURPLE, GREEN, AMBER]
|
|
24
|
+
|
|
25
|
+
ACCENT = {"classification": PINK, "regression": GOLD, "clustering": PURPLE}
|
|
26
|
+
|
|
27
|
+
THEME = {
|
|
28
|
+
"bg": BG, "panel": PANEL, "ink": INK, "muted": MUTED, "grid": GRID,
|
|
29
|
+
"palette": PALETTE, "cyan": CYAN, "gold": GOLD, "pink": PINK,
|
|
30
|
+
"purple": PURPLE, "green": GREEN, "amber": AMBER,
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _register():
    """Build the dark layout template and store it in Plotly as ``"firstlook"``."""
    # Both axes share the same grid / zero-line / label colours.
    axis_style = dict(gridcolor=GRID, zerolinecolor=GRID, color=MUTED)
    layout = go.Layout(
        paper_bgcolor=BG,
        plot_bgcolor=PANEL,
        font=dict(color=INK, family="Inter, system-ui, -apple-system, sans-serif"),
        colorway=PALETTE,
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
        title=dict(font=dict(color=INK)),
        legend=dict(bgcolor="rgba(0,0,0,0)"),
    )
    template = go.layout.Template()
    template.layout = layout
    pio.templates["firstlook"] = template
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
_register()
|
firstlook/visualize.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Auto-pick the right charts for a dataframe and return one Plotly figure."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import plotly.graph_objects as go
|
|
5
|
+
from plotly.subplots import make_subplots
|
|
6
|
+
|
|
7
|
+
from . import theme
|
|
8
|
+
from .detect import detect_task
|
|
9
|
+
|
|
10
|
+
PALETTE = theme.PALETTE
|
|
11
|
+
CYAN, GOLD = theme.CYAN, theme.GOLD
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _numeric_feats(df, target):
|
|
15
|
+
return [c for c in df.columns if c != target and pd.api.types.is_numeric_dtype(df[c])]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _corr_ratio(cats, values):
|
|
19
|
+
"""Correlation ratio (eta-squared): association between a categorical
|
|
20
|
+
target and a numeric feature — between-group variance over total variance."""
|
|
21
|
+
d = pd.DataFrame({"c": cats.values, "v": pd.to_numeric(values, errors="coerce").values}).dropna()
|
|
22
|
+
if d.empty:
|
|
23
|
+
return 0.0
|
|
24
|
+
grand = d["v"].mean()
|
|
25
|
+
ss_total = ((d["v"] - grand) ** 2).sum()
|
|
26
|
+
if ss_total == 0:
|
|
27
|
+
return 0.0
|
|
28
|
+
ss_between = sum(len(g) * (g["v"].mean() - grand) ** 2 for _, g in d.groupby("c"))
|
|
29
|
+
return ss_between / ss_total
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _rank_features(df, target, num, task):
|
|
33
|
+
"""Order numeric features by association with the target.
|
|
34
|
+
|
|
35
|
+
Classification uses the correlation ratio (eta-squared) — the right measure
|
|
36
|
+
for a categorical target. Regression uses |Pearson r|. Ranking features by
|
|
37
|
+
correlation with integer-encoded class labels would impose a fake ordering
|
|
38
|
+
on the classes, so we don't.
|
|
39
|
+
"""
|
|
40
|
+
if target is None or not num:
|
|
41
|
+
return num
|
|
42
|
+
y = df[target]
|
|
43
|
+
if task == "classification":
|
|
44
|
+
score = {c: _corr_ratio(y, df[c]) for c in num}
|
|
45
|
+
else:
|
|
46
|
+
def pear(c):
|
|
47
|
+
try:
|
|
48
|
+
return abs(np.corrcoef(df[c].astype(float), y.astype(float))[0, 1])
|
|
49
|
+
except Exception:
|
|
50
|
+
return 0.0
|
|
51
|
+
score = {c: pear(c) for c in num}
|
|
52
|
+
return sorted(num, key=lambda c: np.nan_to_num(score[c]), reverse=True)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def visualize(df, target=None, task=None):
    """Return a 2x2 Plotly dashboard chosen to fit the data and task.

    Panels:
        1. target distribution (class-count bars for classification, else a
           histogram of the numeric target or of the top-ranked feature);
        2. "the key relationship" scatter (top two features coloured by class,
           top feature vs target, or top two features when there is no target);
        3. correlation heatmap of the numeric features (plus a numeric target);
        4. "a closer look" (per-class box plots of the top feature, or a
           histogram of the second-ranked feature).

    A panel is left empty when the data it needs is not available. ``task`` is
    detected from the data when not supplied.
    """
    task = task or detect_task(df, target)
    # Compare against None rather than relying on truthiness: a column named
    # 0 or "" is a legitimate target and must not be treated as "no target".
    has_target = target is not None
    numeric_target = has_target and pd.api.types.is_numeric_dtype(df[target])
    num = _numeric_feats(df, target)
    ranked = _rank_features(df, target, num, task)

    titles = [f"target: {target}" if has_target else "first feature",
              "the key relationship", "feature correlations", "a closer look"]
    fig = make_subplots(
        rows=2, cols=2, subplot_titles=titles,
        specs=[[{"type": "xy"}, {"type": "xy"}], [{"type": "heatmap"}, {"type": "xy"}]],
        vertical_spacing=0.16, horizontal_spacing=0.12,
    )

    # P1 - target (or first feature) distribution
    if task == "classification" and has_target:
        vc = df[target].value_counts()
        # Cycle the palette so every bar gets a colour even when there are
        # more classes than palette entries (matches the scatter / box panels).
        bar_colors = [PALETTE[i % len(PALETTE)] for i in range(len(vc))]
        fig.add_trace(go.Bar(x=[str(i) for i in vc.index], y=vc.values,
                             marker_color=bar_colors, showlegend=False), 1, 1)
    else:
        col = target if numeric_target else (ranked[0] if ranked else None)
        if col is not None:
            fig.add_trace(go.Histogram(x=df[col], marker_color=CYAN, showlegend=False), 1, 1)

    # P2 - the key relationship
    if task == "classification" and has_target and len(ranked) >= 2:
        f1, f2 = ranked[0], ranked[1]
        for i, (cls, g) in enumerate(df.groupby(target)):
            fig.add_trace(go.Scatter(x=g[f1], y=g[f2], mode="markers", name=str(cls),
                                     marker=dict(size=8, color=PALETTE[i % len(PALETTE)],
                                                 line=dict(color="#04101a", width=0.5))), 1, 2)
        fig.update_xaxes(title_text=f1, row=1, col=2)
        fig.update_yaxes(title_text=f2, row=1, col=2)
    elif has_target and ranked:
        f1 = ranked[0]
        fig.add_trace(go.Scatter(x=df[f1], y=df[target], mode="markers", showlegend=False,
                                 marker=dict(size=9, color=CYAN, line=dict(color="#04101a", width=0.5))), 1, 2)
        fig.update_xaxes(title_text=f1, row=1, col=2)
        fig.update_yaxes(title_text=str(target), row=1, col=2)
    elif len(ranked) >= 2:  # clustering / no target
        fig.add_trace(go.Scatter(x=df[ranked[0]], y=df[ranked[1]], mode="markers", showlegend=False,
                                 marker=dict(size=8, color=theme.PURPLE)), 1, 2)
        fig.update_xaxes(title_text=ranked[0], row=1, col=2)
        fig.update_yaxes(title_text=ranked[1], row=1, col=2)

    # P3 - correlation heatmap
    cols = num + ([target] if numeric_target else [])
    if len(cols) >= 2:
        corr = df[cols].corr()
        fig.add_trace(go.Heatmap(z=corr.values, x=cols, y=cols, zmin=-1, zmax=1,
                                 colorscale=[[0, theme.PINK], [0.5, "#1a1a2e"], [1, CYAN]],
                                 showscale=False), 2, 1)

    # P4 - a closer look
    if task == "classification" and has_target and ranked:
        f1 = ranked[0]
        for i, (cls, g) in enumerate(df.groupby(target)):
            fig.add_trace(go.Box(y=g[f1], name=str(cls), marker_color=PALETTE[i % len(PALETTE)],
                                 showlegend=False), 2, 2)
        fig.update_yaxes(title_text=f1, row=2, col=2)
    elif ranked:
        f = ranked[1] if len(ranked) >= 2 else ranked[0]
        fig.add_trace(go.Histogram(x=df[f], marker_color=GOLD, showlegend=False), 2, 2)
        fig.update_xaxes(title_text=f, row=2, col=2)

    fig.update_layout(template="firstlook", height=620, margin=dict(l=55, r=30, t=50, b=45),
                      legend=dict(font=dict(size=11)))
    # Restyle the annotations (the subplot titles) to the theme's ink colour.
    for a in fig.layout.annotations:
        a.font.color = theme.INK
        a.font.size = 13
    return fig
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: firstlook
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Drop in any dataframe and get models worth trying plus the right charts.
|
|
5
|
+
Author: Siddhant Rajhans
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/siddhant-rajhans/firstlook
|
|
8
|
+
Project-URL: Issues, https://github.com/siddhant-rajhans/firstlook/issues
|
|
9
|
+
Keywords: eda,automl,visualization,plotly,data-science,machine-learning
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pandas>=1.5
|
|
18
|
+
Requires-Dist: numpy>=1.23
|
|
19
|
+
Requires-Dist: plotly>=5.0
|
|
20
|
+
Provides-Extra: fit
|
|
21
|
+
Requires-Dist: scikit-learn>=1.1; extra == "fit"
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
24
|
+
Requires-Dist: scikit-learn>=1.1; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# firstlook
|
|
28
|
+
|
|
29
|
+
Drop in any dataframe and get models worth trying plus the right charts. The first look you take at any dataset, done for you.
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import firstlook
|
|
33
|
+
firstlook.at(df, target="species")
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
One call and you get three things back:
|
|
37
|
+
|
|
38
|
+
1. **The problem type:** regression, classification, or clustering, inferred from the data.
|
|
39
|
+
2. **Models to try:** ranked, with a one-line reason each, plus data-aware warnings (class imbalance, categoricals that need encoding, missing values, too few rows).
|
|
40
|
+
3. **The right charts:** a dark, interactive Plotly dashboard that picks the chart per column: bar/histogram for the target, a scatter colored by class (or feature-vs-target for regression), a correlation heatmap, and a closer look.
|
|
41
|
+
|
|
42
|
+
It works in Jupyter (rich card + interactive charts render inline) and in plain scripts (prints the recommendation; `report.to_html("out.html")` for the visuals).
|
|
43
|
+
|
|
44
|
+
**See it on real data:** [`examples/`](examples/) runs the full understand → visualize → model arc on three datasets — breast cancer, diabetes, and a messy churn set — with the dashboards rendered.
|
|
45
|
+
|
|
46
|
+
## Why
|
|
47
|
+
|
|
48
|
+
Every project starts the same way: load the data, squint at it, remember which chart goes with which column, half-remember the sklearn cheat-sheet. `firstlook` does that opening move for you so you can get to the actual modeling without writing a wall of matplotlib.
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install firstlook
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
From source, for development:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
git clone https://github.com/siddhant-rajhans/firstlook
|
|
60
|
+
cd firstlook
|
|
61
|
+
pip install -e ".[dev]"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Use
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import firstlook
|
|
68
|
+
from sklearn.datasets import load_iris
|
|
69
|
+
|
|
70
|
+
iris = load_iris(as_frame=True).frame
|
|
71
|
+
report = firstlook.at(iris, target="target")
|
|
72
|
+
|
|
73
|
+
report.task # "classification"
|
|
74
|
+
report.start # "LogisticRegression"
|
|
75
|
+
report.models # [("LogisticRegression", "..."), ...]
|
|
76
|
+
report.notes # ["3 classes", ...]
|
|
77
|
+
report.figure # the Plotly figure (restyle or export it)
|
|
78
|
+
report.to_html("iris.html")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Need just one piece?
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
firstlook.detect_task(df, target="price") # "regression"
|
|
85
|
+
firstlook.recommend(df, target="price") # a Recommendation (task, start, models, notes)
|
|
86
|
+
firstlook.visualize(df, target="price") # a Plotly figure
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The dark theme is also a registered Plotly template you can use on your own figures:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
fig.update_layout(template="firstlook")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
(`firstlook.at` and `firstlook.play` are the same call.)
|
|
96
|
+
|
|
97
|
+
## Get a baseline score, too
|
|
98
|
+
|
|
99
|
+
Pass `fit=True` and `firstlook` trains the recommended model and cross-validates it, so the recommendation comes with a real score attached. Preprocessing (impute, scale, one-hot) is built in, so it fits straight on messy data:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
report = firstlook.at(df, target="price", fit=True)
|
|
103
|
+
report.baseline # Baseline(model="LinearRegression", metric="R2", score=0.97, ...)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Needs scikit-learn: `pip install "firstlook[fit]"`.
|
|
107
|
+
|
|
108
|
+
## What it handles today
|
|
109
|
+
|
|
110
|
+
Tabular regression, classification, and clustering. Image / text / time-series problems are out of scope for now.
|
|
111
|
+
|
|
112
|
+
## Roadmap
|
|
113
|
+
|
|
114
|
+
- More chart types (pair plots, missingness maps, target-vs-time).
|
|
115
|
+
- A light/lab theme alongside the dark one.
|
|
116
|
+
- Image / text / time-series support beyond tabular.
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
firstlook/__init__.py,sha256=cg_LbayYdbYsZMZmb0bMWN5_cuQyAhEc-RqFpA81iPU,898
|
|
2
|
+
firstlook/baseline.py,sha256=QnNx1dLlSiyTSOUPP6YM_GFWkOUVJB48P0yC_tui1Wk,3714
|
|
3
|
+
firstlook/detect.py,sha256=iY9svwisV8D9sM6NCdxMFojijGGfzkwwPZUXzh9ffx8,1375
|
|
4
|
+
firstlook/recommend.py,sha256=MmFN7zWqJibbWJUeU7OsBLtOgGJVxicoSQbklo28Z-g,3032
|
|
5
|
+
firstlook/report.py,sha256=iUazl-U_rRkICgWuJqk7gJHgnr42N9sLXC3vDXnVJ1Y,5714
|
|
6
|
+
firstlook/theme.py,sha256=45g3vq0hFYtFNj_ews9zbt95X4rvDvJChsHm6oSF_v4,1386
|
|
7
|
+
firstlook/visualize.py,sha256=aEHT9jiwVhCPXznMQdbO1HMmqsypx10CEQEbPwtmRfo,5613
|
|
8
|
+
firstlook-0.1.0.dist-info/licenses/LICENSE,sha256=4-zYyAl-RO3VAe6hTyFK-8xzWkhCuqLHxHnVeHpR5LY,1073
|
|
9
|
+
firstlook-0.1.0.dist-info/METADATA,sha256=hFPKuXqDhlU-JqnUe-Zl6-nKvKbkplmjG7g32f3IM0Q,4350
|
|
10
|
+
firstlook-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
firstlook-0.1.0.dist-info/top_level.txt,sha256=6N07TqZMme-jbvkgxtGruj6W1OCr2ZQELepRGZL-U8Q,10
|
|
12
|
+
firstlook-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Siddhant Rajhans
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
firstlook
|