dissectml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dissectml/__init__.py +155 -0
- dissectml/_compat.py +111 -0
- dissectml/_config.py +110 -0
- dissectml/_io.py +63 -0
- dissectml/_lazy.py +61 -0
- dissectml/_sampling.py +72 -0
- dissectml/_types.py +114 -0
- dissectml/_version.py +1 -0
- dissectml/battle/__init__.py +97 -0
- dissectml/battle/catalog.py +240 -0
- dissectml/battle/param_grids.py +193 -0
- dissectml/battle/preprocessing.py +268 -0
- dissectml/battle/registry.py +134 -0
- dissectml/battle/result.py +201 -0
- dissectml/battle/runner.py +359 -0
- dissectml/battle/tuner.py +253 -0
- dissectml/compare/__init__.py +32 -0
- dissectml/compare/comparator.py +197 -0
- dissectml/compare/curves.py +304 -0
- dissectml/compare/error_analysis.py +268 -0
- dissectml/compare/metrics_table.py +123 -0
- dissectml/compare/pareto.py +114 -0
- dissectml/compare/shap_compare.py +157 -0
- dissectml/compare/significance.py +159 -0
- dissectml/core/__init__.py +6 -0
- dissectml/core/base.py +144 -0
- dissectml/core/data_container.py +165 -0
- dissectml/core/pipeline.py +59 -0
- dissectml/core/progress.py +78 -0
- dissectml/core/validators.py +188 -0
- dissectml/datasets/__init__.py +128 -0
- dissectml/datasets/data/titanic.csv +892 -0
- dissectml/eda/__init__.py +26 -0
- dissectml/eda/_base.py +177 -0
- dissectml/eda/bivariate.py +280 -0
- dissectml/eda/clusters.py +328 -0
- dissectml/eda/correlations.py +285 -0
- dissectml/eda/interactions.py +297 -0
- dissectml/eda/missing.py +301 -0
- dissectml/eda/outliers.py +246 -0
- dissectml/eda/overview.py +235 -0
- dissectml/eda/result.py +244 -0
- dissectml/eda/statistical_tests.py +316 -0
- dissectml/eda/target_analysis.py +351 -0
- dissectml/eda/univariate.py +240 -0
- dissectml/exceptions.py +67 -0
- dissectml/intelligence/__init__.py +68 -0
- dissectml/intelligence/feature_importance.py +168 -0
- dissectml/intelligence/leakage.py +309 -0
- dissectml/intelligence/multicollinearity.py +237 -0
- dissectml/intelligence/readiness.py +327 -0
- dissectml/intelligence/recommendations.py +213 -0
- dissectml/intelligence/result.py +212 -0
- dissectml/report/__init__.py +19 -0
- dissectml/report/assets/script.js +112 -0
- dissectml/report/assets/style.css +286 -0
- dissectml/report/builder.py +132 -0
- dissectml/report/html_renderer.py +578 -0
- dissectml/report/narrative.py +191 -0
- dissectml/report/pdf_renderer.py +74 -0
- dissectml/report/sections/__init__.py +15 -0
- dissectml/report/sections/battle_section.py +92 -0
- dissectml/report/sections/compare_section.py +90 -0
- dissectml/report/sections/eda_section.py +89 -0
- dissectml/report/sections/intelligence_section.py +111 -0
- dissectml/report/sections/summary_section.py +112 -0
- dissectml/report/templates/base.html.j2 +43 -0
- dissectml/report/templates/components/chart_container.html.j2 +12 -0
- dissectml/report/templates/components/collapsible.html.j2 +9 -0
- dissectml/report/templates/components/table.html.j2 +23 -0
- dissectml/report/templates/components/toc.html.j2 +11 -0
- dissectml/report/templates/section.html.j2 +16 -0
- dissectml/viz/__init__.py +12 -0
- dissectml/viz/charts.py +210 -0
- dissectml/viz/display.py +78 -0
- dissectml/viz/theme.py +83 -0
- dissectml-0.1.0.dist-info/METADATA +357 -0
- dissectml-0.1.0.dist-info/RECORD +80 -0
- dissectml-0.1.0.dist-info/WHEEL +4 -0
- dissectml-0.1.0.dist-info/licenses/LICENSE +21 -0
dissectml/__init__.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""
|
|
2
|
+
InsightML — The missing middle layer between EDA and AutoML.
|
|
3
|
+
|
|
4
|
+
Unified pipeline from deep data understanding to model comparison,
|
|
5
|
+
in as few as 3 function calls.
|
|
6
|
+
|
|
7
|
+
Quick start::
|
|
8
|
+
|
|
9
|
+
import dissectml as iml
|
|
10
|
+
|
|
11
|
+
# Deep EDA (v0.1+)
|
|
12
|
+
eda = iml.explore(df)
|
|
13
|
+
eda.overview.show()
|
|
14
|
+
eda.correlations.heatmap()
|
|
15
|
+
|
|
16
|
+
# Model battle (v0.2+)
|
|
17
|
+
models = iml.battle(df, target="price")
|
|
18
|
+
|
|
19
|
+
# Full pipeline (v0.4+)
|
|
20
|
+
report = iml.analyze(df, target="price")
|
|
21
|
+
report.export("report.html")
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from dissectml._compat import to_pandas
|
|
27
|
+
from dissectml._config import InsightMLConfig, config_context, get_config, set_config
|
|
28
|
+
from dissectml._version import __version__
|
|
29
|
+
from dissectml.battle import battle
|
|
30
|
+
from dissectml.compare import ModelComparator
|
|
31
|
+
from dissectml.datasets import load_housing, load_titanic
|
|
32
|
+
from dissectml.eda import explore
|
|
33
|
+
from dissectml.exceptions import InsightMLError
|
|
34
|
+
from dissectml.intelligence import analyze_intelligence
|
|
35
|
+
from dissectml.report import AnalysisReport
|
|
36
|
+
|
|
37
|
+
# Names exported by ``from dissectml import *``. ``analyze`` is defined later
# in this module; every other name is imported at the top of the file.
__all__ = [
    "__version__",
    # Public API
    "explore",
    "battle",
    "analyze_intelligence",
    "analyze",
    "ModelComparator",
    "AnalysisReport",
    # Datasets
    "load_titanic",
    "load_housing",
    # Compat
    "to_pandas",
    # Config
    "InsightMLConfig",
    "get_config",
    "set_config",
    "config_context",
    # Exceptions
    "InsightMLError",
]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def analyze(
    df,
    target: str,
    *,
    task: str | None = None,
    run_battle: bool = True,
    battle_families: list[str] | None = None,
    battle_models: list[str] | None = None,
    battle_exclude: list[str] | None = None,
    cv: int | None = None,
    n_jobs: int | None = None,
    datetime_col: str | None = None,
) -> AnalysisReport:
    """Run the end-to-end workflow and bundle every stage into one report.

    Stages run in this order: EDA (``explore``), intelligence
    (``analyze_intelligence``), model battle (``battle``) and model
    comparison (``ModelComparator``). The results are wrapped in an
    :class:`~dissectml.report.AnalysisReport`.

    Args:
        df: Input pandas DataFrame holding the features and the target column.
        target: Name of the target column.
        task: ``"classification"`` or ``"regression"``. When ``None`` or
            ``"auto"`` the task is inferred from the target column.
        run_battle: If False, stop after the intelligence stage and return a
            report without models or comparison.
        battle_families: Model families to include in the battle stage.
        battle_models: Explicit model names for the battle stage.
        battle_exclude: Model names to leave out of the battle stage.
        cv: Number of CV folds for the battle stage.
        n_jobs: Parallel workers for the battle stage.
        datetime_col: Optional datetime column passed to the intelligence
            stage for temporal leakage detection.

    Returns:
        :class:`~dissectml.report.AnalysisReport`

    Raises:
        TypeError: If *df* is not a pandas DataFrame.
        KeyError: If *target* is not a column of *df*.

    Example::

        import dissectml as iml
        report = iml.analyze(df, target="survived")
        report.summary()
        report.export("report.html")
    """
    import pandas as pd

    # Guard clauses: reject unusable input before any stage runs.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"df must be a pandas DataFrame, got {type(df).__name__}")
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in DataFrame.")

    # Dataset dimensions reported in the final report (target excluded).
    n_samples = len(df)
    n_features = sum(1 for column in df.columns if column != target)

    # Infer the task from the target column when the caller did not pick one.
    if task is None or task == "auto":
        from dissectml.battle.runner import _infer_task

        task = _infer_task(df[target])

    # Stage 1: EDA
    eda_result = explore(df, target=target)

    # Stage 2: Intelligence (reuses the EDA result)
    intel_result = analyze_intelligence(
        df,
        target=target,
        task=task,
        datetime_col=datetime_col,
        eda_result=eda_result,
    )

    # Fields shared by both the short and the full report.
    report_fields = {
        "task": task,
        "target": target,
        "n_samples": n_samples,
        "n_features": n_features,
        "eda": eda_result,
        "intelligence": intel_result,
    }
    if not run_battle:
        return AnalysisReport(**report_fields)

    # Stage 3: Battle
    battle_result = battle(
        df,
        target=target,
        task=task,
        families=battle_families,
        models=battle_models,
        exclude=battle_exclude,
        cv=cv,
        n_jobs=n_jobs,
        eda_result=eda_result,
    )

    # Stage 4: Compare the battle results on the full feature matrix.
    features = df.drop(columns=[target])
    labels = df[target]
    comparator = ModelComparator(battle_result, X=features, y=labels)

    return AnalysisReport(
        **report_fields,
        models=battle_result,
        compare=comparator,
    )
|
dissectml/_compat.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Compatibility layer — accept Polars DataFrames, file paths, and dicts in public API.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from dissectml._compat import to_pandas
|
|
6
|
+
|
|
7
|
+
df = to_pandas(user_input) # works for pd.DataFrame, pl.DataFrame, Path, str, dict, list
|
|
8
|
+
|
|
9
|
+
All public-facing functions that accept data should call ``to_pandas`` first.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def to_pandas(data: Any, **read_csv_kwargs: Any) -> pd.DataFrame:
    """Convert various input types to a pandas DataFrame.

    Supported inputs:

    * ``pandas.DataFrame`` — returned unchanged (same object, no copy).
    * ``polars.DataFrame`` or ``polars.LazyFrame`` — converted via
      ``.to_pandas()`` (a LazyFrame is collected first).
    * ``str`` / ``pathlib.Path`` — read with the pandas reader matching the
      suffix (``.csv``, ``.parquet``/``.pq``, ``.xlsx``/``.xls``, ``.json``);
      any other suffix is read as CSV.
    * ``dict`` — passed to ``pd.DataFrame(data)``.
    * ``list`` (e.g. ``list[dict]``) — passed to ``pd.DataFrame(data)``.
    * ``numpy.ndarray`` — passed to ``pd.DataFrame(data)``.

    Args:
        data: Input data.
        **read_csv_kwargs: Forwarded to the pandas reader chosen for the file
            suffix when *data* is a path (``pd.read_csv`` by default).

    Returns:
        pandas DataFrame.

    Raises:
        TypeError: If *data* is an unsupported type.
    """
    if isinstance(data, pd.DataFrame):
        return data

    # --- Polars (optional dep) ---
    # Only the import itself is guarded. An ImportError raised while
    # converting (for instance a missing conversion backend) must reach the
    # caller instead of being turned into a misleading "Unsupported data
    # type" TypeError further down.
    try:
        import polars as pl
    except ImportError:
        pl = None  # Polars not installed — skip
    if pl is not None:
        if isinstance(data, pl.DataFrame):
            return data.to_pandas()
        if isinstance(data, pl.LazyFrame):
            return data.collect().to_pandas()

    # --- File path ---
    if isinstance(data, (str, Path)):
        path = Path(data)
        readers = {
            ".csv": pd.read_csv,
            ".parquet": pd.read_parquet,
            ".pq": pd.read_parquet,
            ".xlsx": pd.read_excel,
            ".xls": pd.read_excel,
            ".json": pd.read_json,
        }
        # Unknown suffixes fall back to the CSV reader.
        reader = readers.get(path.suffix.lower(), pd.read_csv)
        return reader(path, **read_csv_kwargs)

    # --- dict / list ---
    if isinstance(data, (dict, list)):
        return pd.DataFrame(data)

    # --- numpy ndarray ---
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(data, np.ndarray):
        return pd.DataFrame(data)

    raise TypeError(
        f"Unsupported data type: {type(data).__name__}. "
        "Expected pandas DataFrame, polars DataFrame, file path (str/Path), "
        "dict, or list of dicts."
    )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def is_polars_available() -> bool:
    """Report whether ``import polars`` succeeds in this environment."""
    try:
        import polars  # noqa: F401
    except ImportError:
        return False
    return True
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_pandas_version() -> tuple[int, ...]:
    """Return the pandas version as a tuple of ints, e.g. ``(2, 1, 0)``.

    Each dot-separated component of ``pd.__version__`` contributes one int
    built from its *leading* digits only, so pre-release tags are dropped
    rather than merged into the number: ``"2.1.0rc1"`` -> ``(2, 1, 0)``.
    A component without leading digits (e.g. ``"dev0"``) contributes ``0``.

    Returns:
        Tuple with one int per dot-separated version component.
    """
    result = []
    for part in pd.__version__.split("."):
        # Stop at the first non-digit: "0rc1" must parse as 0, not 01 -> 1.
        leading = ""
        for ch in part:
            if not ("0" <= ch <= "9"):
                break
            leading += ch
        result.append(int(leading) if leading else 0)
    return tuple(result)
|
dissectml/_config.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Global configuration for InsightML with context manager support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
import copy
|
|
7
|
+
import threading
|
|
8
|
+
from collections.abc import Generator
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class InsightMLConfig:
    """Configuration dataclass for InsightML.

    Override precedence (highest to lowest priority):
        per-call kwargs > config_context() > set_config() > defaults
    """

    # --- EDA ---
    categorical_threshold: int = 50  # nunique <= this -> CATEGORICAL
    high_cardinality_threshold: int = 100  # nunique > this -> HIGH_CARDINALITY
    text_min_avg_length: int = 30  # avg str length > this -> TEXT
    significance_level: float = 0.05  # p-value threshold for all tests
    iqr_multiplier: float = 1.5  # outlier IQR fence multiplier
    zscore_threshold: float = 3.0  # outlier z-score cutoff
    isolation_forest_contamination: float = 0.05
    max_k_clusters: int = 10  # max K for auto K-Means
    max_bivariate_pairs: int = 30  # pair limit for bivariate analysis
    correlation_methods: list[str] = field(
        default_factory=lambda: ["pearson", "spearman", "cramers_v"]
    )

    # --- Battle ---
    cv_folds: int = 5
    timeout_per_model: int = 300  # seconds per model during training
    n_jobs: int = -1  # joblib parallelism (-1 = all cores)
    random_state: int = 42

    # --- Scale ---
    large_dataset_threshold: int = 100_000  # rows; triggers auto-sampling
    sample_size: int = 50_000  # subsample size for expensive ops

    # --- Report ---
    report_theme: str = "default"
    plotly_template: str = "plotly_white"

    # --- General ---
    verbosity: int = 1  # 0=silent, 1=progress, 2=debug

    def copy_with(self, **kwargs: Any) -> InsightMLConfig:
        """Return an independent copy of this config with fields overridden.

        The copy is deep, so mutable fields such as ``correlation_methods``
        are not shared with the original; mutating the copy never changes
        the config it was derived from.

        Args:
            **kwargs: Field names mapped to their replacement values.

        Returns:
            A new ``InsightMLConfig`` instance.

        Raises:
            ValueError: If a key is not a declared config field.
        """
        # Validate against the declared dataclass fields rather than
        # hasattr(), which would also accept methods such as "copy_with".
        for key in kwargs:
            if key not in self.__dataclass_fields__:
                raise ValueError(f"Unknown config key: {key!r}")
        cfg = copy.deepcopy(self)
        for key, value in kwargs.items():
            setattr(cfg, key, value)
        return cfg
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Global config state (thread-local for context manager support)
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
# Process-wide default configuration; set_config() updates it in place.
_global_config = InsightMLConfig()
# Per-thread storage; config_context() keeps its temporary override in the
# ``config`` attribute, which get_config() reads before the global config.
_thread_local = threading.local()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_config() -> InsightMLConfig:
    """Return the configuration that is active for the calling thread.

    Inside a ``config_context()`` block this is the thread-local override;
    otherwise it is the global configuration.
    """
    try:
        return _thread_local.config
    except AttributeError:
        # No override set on this thread: use the global defaults.
        return _global_config
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def set_config(**kwargs: Any) -> None:
    """Update the global default configuration.

    All keys are validated before anything is written, so a call that
    contains an unknown key leaves the global configuration untouched.

    Args:
        **kwargs: Config field names mapped to their new values.

    Raises:
        ValueError: If a key is not a declared config field.

    Example::

        iml.set_config(cv_folds=10, verbosity=0)
    """
    # Validate against the declared dataclass fields, so method names such
    # as "copy_with" are rejected instead of being overwritten.
    known_fields = type(_global_config).__dataclass_fields__
    for key in kwargs:
        if key not in known_fields:
            raise ValueError(f"Unknown config key: {key!r}")
    # The global object is mutated in place (never rebound), so no
    # ``global`` statement is required.
    for key, value in kwargs.items():
        setattr(_global_config, key, value)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@contextlib.contextmanager
def config_context(**kwargs: Any) -> Generator[InsightMLConfig, None, None]:
    """Apply configuration overrides for the duration of a ``with`` block.

    The override is stored per thread and is derived from the enclosing
    context's config when contexts are nested, otherwise from the global
    config. The previous state is restored when the block exits.

    Example::

        with iml.config_context(cv_folds=3):
            result = iml.battle(df, target="y")
    """
    previous = getattr(_thread_local, "config", None)
    if previous is None:
        parent = _global_config
    else:
        parent = previous

    _thread_local.config = parent.copy_with(**kwargs)
    try:
        yield _thread_local.config
    finally:
        if previous is not None:
            # Nested context: hand control back to the outer override.
            _thread_local.config = previous
        else:
            # Outermost context: drop the override entirely.
            del _thread_local.config
|
dissectml/_io.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""File I/O helpers — load DataFrames from CSV, Excel, Parquet, or JSON."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from dissectml.exceptions import DependencyError, UnsupportedFormatError
|
|
11
|
+
|
|
12
|
+
# Supported extensions and their pandas readers.
# read_data() looks up the lower-cased file suffix here and calls the reader
# as ``reader(path, **kwargs)``.
_READERS: dict[str, Any] = {
    ".csv": pd.read_csv,
    # TSV reuses the CSV reader with a tab separator.
    ".tsv": lambda p, **kw: pd.read_csv(p, sep="\t", **kw),
    ".xlsx": pd.read_excel,
    ".xls": pd.read_excel,
    ".parquet": pd.read_parquet,
    ".json": pd.read_json,
}

# Extension list included in the UnsupportedFormatError message.
SUPPORTED_EXTENSIONS = list(_READERS.keys())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def read_data(path: str | Path, **kwargs: Any) -> pd.DataFrame:
    """Load a DataFrame from a file path.

    Supports CSV, TSV, Excel (.xlsx/.xls), Parquet, and JSON. The reader is
    chosen from the lower-cased file suffix; extra kwargs are passed directly
    to that pandas reader.

    Args:
        path: Path to the data file.
        **kwargs: Additional keyword arguments forwarded to the pandas reader.

    Returns:
        Loaded DataFrame.

    Raises:
        UnsupportedFormatError: If the file extension is not supported.
        DependencyError: If a required optional reader package is missing.
        FileNotFoundError: If the file does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {path}")

    ext = path.suffix.lower()
    reader = _READERS.get(ext)
    if reader is None:
        raise UnsupportedFormatError(
            f"Unsupported file format: '{ext}'. "
            f"Supported formats: {SUPPORTED_EXTENSIONS}"
        )

    try:
        return reader(path, **kwargs)
    except ImportError as exc:
        # e.g., openpyxl not installed for .xlsx, pyarrow missing for .parquet
        message = str(exc)
        quoted = message.split("'")
        if exc.name:
            # ModuleNotFoundError records the module; suggest its top-level
            # package rather than a dotted submodule path.
            missing = exc.name.partition(".")[0]
        elif len(quoted) >= 3:
            # Message of the form "... 'package' ...": take the quoted name.
            # A lone apostrophe (fewer than three pieces) is not a quoted
            # name and must not be sliced into the hint.
            missing = quoted[1]
        else:
            missing = message
        raise DependencyError(
            f"Reading '{ext}' files requires '{missing}'. "
            f"Install it with: pip install {missing}"
        ) from exc
|
dissectml/_lazy.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Optional dependency guard for InsightML extras."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from types import ModuleType
|
|
7
|
+
|
|
8
|
+
from dissectml.exceptions import OptionalDependencyError
|
|
9
|
+
|
|
10
|
+
# Maps package import name -> (install name, extra group).
# For packages that are not part of a dissectml extra (openpyxl, pyarrow) the
# second element is a literal "pip install ..." command; require() recognises
# it by that prefix and shows it verbatim.
_EXTRA_MAP: dict[str, tuple[str, str]] = {
    "xgboost": ("xgboost", "boost"),
    "lightgbm": ("lightgbm", "boost"),
    "catboost": ("catboost", "boost"),
    "shap": ("shap", "explain"),
    "weasyprint": ("weasyprint", "report"),
    "kaleido": ("kaleido", "report"),
    "polars": ("polars", "scale"),
    "optuna": ("optuna", "scale"),
    "openpyxl": ("openpyxl", "pip install openpyxl"),
    "pyarrow": ("pyarrow", "pip install pyarrow"),
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def require(package_name: str, extra: str | None = None) -> ModuleType:
    """Import an optional package, raising a helpful error if missing.

    Args:
        package_name: The import name of the package (e.g., "xgboost").
        extra: The dissectml extra group to suggest (e.g., "boost"), or a
            full ``"pip install ..."`` command to show verbatim. If None, it
            is looked up in ``_EXTRA_MAP``; packages that are not listed
            there get a bare ``pip install <package_name>`` hint.

    Returns:
        The imported module.

    Raises:
        OptionalDependencyError: If the package is not installed.
    """
    try:
        return importlib.import_module(package_name)
    except ImportError:
        if extra is None:
            entry = _EXTRA_MAP.get(package_name)
            # An unmapped package belongs to no dissectml extra, so
            # "dissectml[<package>]" would point at an extra that does not
            # exist; suggest installing the package directly.
            extra = entry[1] if entry else f"pip install {package_name}"
        if extra.startswith("pip install"):
            install_hint = extra
        else:
            install_hint = f"pip install dissectml[{extra}]"
        # ``from None`` hides the original ImportError traceback.
        raise OptionalDependencyError(
            f"Optional dependency '{package_name}' is not installed. "
            f"Install it with: {install_hint}"
        ) from None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_available(package_name: str) -> bool:
    """Tell whether *package_name* can be imported, without raising."""
    try:
        importlib.import_module(package_name)
    except ImportError:
        return False
    return True
|
dissectml/_sampling.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Smart sampling strategies for large datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from dissectml._config import InsightMLConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def smart_sample(
    df: pd.DataFrame,
    target: str | None = None,
    config: InsightMLConfig | None = None,
    *,
    force: bool = False,
) -> pd.DataFrame:
    """Return a representative subsample if the DataFrame exceeds the size threshold.

    Sampling strategy (in priority order):
      1. Stratified — when *target* is a column that looks categorical
         (object/category/bool dtype or at most 50 distinct values); each
         target group is sampled with the same fraction.
      2. Temporal — when a ``datetime64`` column exists; rows are sorted by
         the first such column and picked at evenly spaced positions across
         the whole time range.
      3. Random — fallback with fixed ``random_state`` for reproducibility.

    Args:
        df: Input DataFrame.
        target: Target column name (enables stratified sampling if given).
        config: InsightML configuration. Uses the active config if None.
            Only ``large_dataset_threshold``, ``sample_size`` and
            ``random_state`` are read.
        force: If True, always sample even below threshold.

    Returns:
        The original DataFrame if it is empty or small enough, otherwise a
        subsample with a fresh 0..k-1 index.
    """
    if config is None:
        from dissectml._config import get_config

        config = get_config()

    n = len(df)
    # Nothing to sample from an empty frame (also avoids dividing by zero
    # when computing the stratified fraction below).
    if n == 0 or (not force and n <= config.large_dataset_threshold):
        return df

    sample_size = min(config.sample_size, n)

    # --- Stratified sampling (classification target present) ---
    if target is not None and target in df.columns:
        target_col = df[target]
        if target_col.dtype in ("object", "category", "bool") or (
            target_col.nunique() <= 50
        ):
            try:
                # GroupBy.sample returns whole rows (target column included)
                # and avoids groupby.apply operating on the grouping column,
                # which pandas has deprecated.
                return (
                    df.groupby(target, group_keys=False)
                    .sample(
                        frac=sample_size / n,
                        random_state=config.random_state,
                    )
                    .reset_index(drop=True)
                )
            except Exception:
                # Deliberate best effort: if the target cannot be grouped,
                # fall through to temporal or random sampling.
                pass

    # --- Temporal sampling (datetime column present) ---
    datetime_cols = df.select_dtypes(include=["datetime64"]).columns.tolist()
    if datetime_cols:
        dt_col = datetime_cols[0]
        sorted_df = df.sort_values(dt_col)
        # Evenly spaced positions over the *whole* range. An integer step of
        # n // sample_size collapses to 1 whenever n < 2 * sample_size, which
        # would keep only the earliest rows and drop the most recent ones.
        indices = [i * n // sample_size for i in range(sample_size)]
        return sorted_df.iloc[indices].reset_index(drop=True)

    # --- Random sampling (fallback) ---
    return df.sample(n=sample_size, random_state=config.random_state).reset_index(
        drop=True
    )
|
dissectml/_types.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Enums, TypedDicts, and type aliases used across InsightML."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any, TypedDict
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# Enums
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
class TaskType(str, Enum):
    """Supported learning-task names; members are also plain ``str`` values."""

    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    AUTO = "auto"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ColumnType(str, Enum):
    """Column categories used in ``DataSchema``; members are also ``str`` values."""

    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    DATETIME = "datetime"
    TEXT = "text"
    BOOLEAN = "boolean"
    HIGH_CARDINALITY = "high_cardinality"
    CONSTANT = "constant"
    UNIQUE_ID = "unique_id"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MissingnessType(str, Enum):
    """Missing-data mechanism labels; members are also ``str`` values."""

    MCAR = "MCAR"  # Missing Completely At Random
    MAR = "MAR"  # Missing At Random (depends on observed data)
    MNAR = "MNAR"  # Missing Not At Random (depends on missing value itself)
    UNKNOWN = "unknown"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TuningMode(str, Enum):
    """Hyperparameter tuning modes; members are also ``str`` values."""

    QUICK = "quick"  # Default hyperparameters only (no search)
    TUNED = "tuned"  # RandomizedSearchCV on top-N models
    CUSTOM = "custom"  # User-provided param grids
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# TypedDicts
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
class ColumnProfile(TypedDict, total=False):
    """Per-column statistics computed by DataOverview.

    Declared with ``total=False``, so every key is optional; the groups
    below are labelled by the kind of column they describe.
    """

    name: str
    dtype: str
    inferred_type: str  # ColumnType value
    count: int
    unique: int
    missing_count: int
    missing_pct: float
    memory_bytes: int
    # Numeric fields
    mean: float
    median: float
    std: float
    variance: float
    min: float
    max: float
    range: float
    iqr: float
    q1: float
    q3: float
    skewness: float
    kurtosis: float
    # Categorical fields
    top_value: Any
    top_freq: int
    cardinality_ratio: float
    value_counts: dict[str, int]
    # DateTime fields
    dt_min: str
    dt_max: str
    range_days: float
    inferred_frequency: str | None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class DataSchema(TypedDict):
    """Schema inferred from the dataset.

    ``column_types`` maps column name to ``ColumnType``; each ``*_cols`` key
    holds a list of column names. All keys are required (the TypedDict is
    total).
    """

    column_types: dict[str, ColumnType]
    numeric_cols: list[str]
    categorical_cols: list[str]
    datetime_cols: list[str]
    text_cols: list[str]
    boolean_cols: list[str]
    high_cardinality_cols: list[str]
    constant_cols: list[str]
    unique_id_cols: list[str]
    target_col: str | None
    task: TaskType
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class LeakageWarning(TypedDict):
    """A potential data leakage warning for a feature.

    All keys are required (the TypedDict is total).
    """

    column: str
    score: float
    method: str  # "high_correlation" | "mutual_information" | "temporal" | "derived"
    severity: str  # "critical" | "warning" | "info"
    explanation: str
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
# Type aliases
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
# Short aliases for the pandas / NumPy container types.
DataFrame = pd.DataFrame
Series = pd.Series
Array = np.ndarray
|
dissectml/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; imported by dissectml/__init__.py and re-exported there.
__version__ = "0.1.0"
|