dissectml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. dissectml/__init__.py +155 -0
  2. dissectml/_compat.py +111 -0
  3. dissectml/_config.py +110 -0
  4. dissectml/_io.py +63 -0
  5. dissectml/_lazy.py +61 -0
  6. dissectml/_sampling.py +72 -0
  7. dissectml/_types.py +114 -0
  8. dissectml/_version.py +1 -0
  9. dissectml/battle/__init__.py +97 -0
  10. dissectml/battle/catalog.py +240 -0
  11. dissectml/battle/param_grids.py +193 -0
  12. dissectml/battle/preprocessing.py +268 -0
  13. dissectml/battle/registry.py +134 -0
  14. dissectml/battle/result.py +201 -0
  15. dissectml/battle/runner.py +359 -0
  16. dissectml/battle/tuner.py +253 -0
  17. dissectml/compare/__init__.py +32 -0
  18. dissectml/compare/comparator.py +197 -0
  19. dissectml/compare/curves.py +304 -0
  20. dissectml/compare/error_analysis.py +268 -0
  21. dissectml/compare/metrics_table.py +123 -0
  22. dissectml/compare/pareto.py +114 -0
  23. dissectml/compare/shap_compare.py +157 -0
  24. dissectml/compare/significance.py +159 -0
  25. dissectml/core/__init__.py +6 -0
  26. dissectml/core/base.py +144 -0
  27. dissectml/core/data_container.py +165 -0
  28. dissectml/core/pipeline.py +59 -0
  29. dissectml/core/progress.py +78 -0
  30. dissectml/core/validators.py +188 -0
  31. dissectml/datasets/__init__.py +128 -0
  32. dissectml/datasets/data/titanic.csv +892 -0
  33. dissectml/eda/__init__.py +26 -0
  34. dissectml/eda/_base.py +177 -0
  35. dissectml/eda/bivariate.py +280 -0
  36. dissectml/eda/clusters.py +328 -0
  37. dissectml/eda/correlations.py +285 -0
  38. dissectml/eda/interactions.py +297 -0
  39. dissectml/eda/missing.py +301 -0
  40. dissectml/eda/outliers.py +246 -0
  41. dissectml/eda/overview.py +235 -0
  42. dissectml/eda/result.py +244 -0
  43. dissectml/eda/statistical_tests.py +316 -0
  44. dissectml/eda/target_analysis.py +351 -0
  45. dissectml/eda/univariate.py +240 -0
  46. dissectml/exceptions.py +67 -0
  47. dissectml/intelligence/__init__.py +68 -0
  48. dissectml/intelligence/feature_importance.py +168 -0
  49. dissectml/intelligence/leakage.py +309 -0
  50. dissectml/intelligence/multicollinearity.py +237 -0
  51. dissectml/intelligence/readiness.py +327 -0
  52. dissectml/intelligence/recommendations.py +213 -0
  53. dissectml/intelligence/result.py +212 -0
  54. dissectml/report/__init__.py +19 -0
  55. dissectml/report/assets/script.js +112 -0
  56. dissectml/report/assets/style.css +286 -0
  57. dissectml/report/builder.py +132 -0
  58. dissectml/report/html_renderer.py +578 -0
  59. dissectml/report/narrative.py +191 -0
  60. dissectml/report/pdf_renderer.py +74 -0
  61. dissectml/report/sections/__init__.py +15 -0
  62. dissectml/report/sections/battle_section.py +92 -0
  63. dissectml/report/sections/compare_section.py +90 -0
  64. dissectml/report/sections/eda_section.py +89 -0
  65. dissectml/report/sections/intelligence_section.py +111 -0
  66. dissectml/report/sections/summary_section.py +112 -0
  67. dissectml/report/templates/base.html.j2 +43 -0
  68. dissectml/report/templates/components/chart_container.html.j2 +12 -0
  69. dissectml/report/templates/components/collapsible.html.j2 +9 -0
  70. dissectml/report/templates/components/table.html.j2 +23 -0
  71. dissectml/report/templates/components/toc.html.j2 +11 -0
  72. dissectml/report/templates/section.html.j2 +16 -0
  73. dissectml/viz/__init__.py +12 -0
  74. dissectml/viz/charts.py +210 -0
  75. dissectml/viz/display.py +78 -0
  76. dissectml/viz/theme.py +83 -0
  77. dissectml-0.1.0.dist-info/METADATA +357 -0
  78. dissectml-0.1.0.dist-info/RECORD +80 -0
  79. dissectml-0.1.0.dist-info/WHEEL +4 -0
  80. dissectml-0.1.0.dist-info/licenses/LICENSE +21 -0
dissectml/__init__.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ InsightML — The missing middle layer between EDA and AutoML.
3
+
4
+ Unified pipeline from deep data understanding to model comparison,
5
+ in as few as 3 function calls.
6
+
7
+ Quick start::
8
+
9
+ import dissectml as iml
10
+
11
+ # Deep EDA (v0.1+)
12
+ eda = iml.explore(df)
13
+ eda.overview.show()
14
+ eda.correlations.heatmap()
15
+
16
+ # Model battle (v0.2+)
17
+ models = iml.battle(df, target="price")
18
+
19
+ # Full pipeline (v0.4+)
20
+ report = iml.analyze(df, target="price")
21
+ report.export("report.html")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dissectml._compat import to_pandas
27
+ from dissectml._config import InsightMLConfig, config_context, get_config, set_config
28
+ from dissectml._version import __version__
29
+ from dissectml.battle import battle
30
+ from dissectml.compare import ModelComparator
31
+ from dissectml.datasets import load_housing, load_titanic
32
+ from dissectml.eda import explore
33
+ from dissectml.exceptions import InsightMLError
34
+ from dissectml.intelligence import analyze_intelligence
35
+ from dissectml.report import AnalysisReport
36
+
37
# Names exported by ``from dissectml import *``.
__all__ = [
    "__version__",
    # Public API
    "explore", "battle", "analyze_intelligence", "analyze",
    "ModelComparator", "AnalysisReport",
    # Datasets
    "load_titanic", "load_housing",
    # Compat
    "to_pandas",
    # Config
    "InsightMLConfig", "get_config", "set_config", "config_context",
    # Exceptions
    "InsightMLError",
]
59
+
60
+
61
def analyze(
    df,
    target: str,
    *,
    task: str | None = None,
    run_battle: bool = True,
    battle_families: list[str] | None = None,
    battle_models: list[str] | None = None,
    battle_exclude: list[str] | None = None,
    cv: int | None = None,
    n_jobs: int | None = None,
    datetime_col: str | None = None,
) -> AnalysisReport:
    """Run the end-to-end pipeline and bundle the results in a report.

    Stages, in order: EDA (``explore``), intelligence
    (``analyze_intelligence``), model battle (``battle``) and comparison
    (``ModelComparator``). The last two are skipped when *run_battle* is
    False.

    Args:
        df: pandas DataFrame holding the features and the target column.
        target: Name of the target column.
        task: ``"classification"`` or ``"regression"``. Inferred from the
            target column when None or ``"auto"``.
        run_battle: If False, return after the intelligence stage.
        battle_families: Forwarded to ``battle`` as ``families``.
        battle_models: Forwarded to ``battle`` as ``models``.
        battle_exclude: Forwarded to ``battle`` as ``exclude``.
        cv: Forwarded to ``battle`` as ``cv``.
        n_jobs: Forwarded to ``battle`` as ``n_jobs``.
        datetime_col: Forwarded to ``analyze_intelligence``.

    Returns:
        :class:`~dissectml.report.AnalysisReport`

    Raises:
        TypeError: If *df* is not a pandas DataFrame.
        KeyError: If *target* is not a column of *df*.

    Example::

        import dissectml as iml
        report = iml.analyze(df, target="survived")
        report.summary()
        report.export("report.html")
    """
    import pandas as pd

    # Validate inputs before any expensive work starts.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"df must be a pandas DataFrame, got {type(df).__name__}")
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in DataFrame.")

    n_samples = len(df)
    n_features = sum(1 for column in df.columns if column != target)

    # Resolve the task from the target column when the caller left it open.
    if task in (None, "auto"):
        from dissectml.battle.runner import _infer_task

        task = _infer_task(df[target])

    # Stage 1: EDA
    eda_result = explore(df, target=target)

    # Stage 2: Intelligence
    intel_result = analyze_intelligence(
        df,
        target=target,
        task=task,
        datetime_col=datetime_col,
        eda_result=eda_result,
    )

    # Fields shared by both the short and the full report.
    report_fields = {
        "task": task,
        "target": target,
        "n_samples": n_samples,
        "n_features": n_features,
        "eda": eda_result,
        "intelligence": intel_result,
    }

    if not run_battle:
        return AnalysisReport(**report_fields)

    # Stage 3: Battle
    battle_result = battle(
        df,
        target=target,
        task=task,
        families=battle_families,
        models=battle_models,
        exclude=battle_exclude,
        cv=cv,
        n_jobs=n_jobs,
        eda_result=eda_result,
    )

    # Stage 4: Compare
    features = df.drop(columns=[target])
    labels = df[target]
    comparator = ModelComparator(battle_result, X=features, y=labels)

    return AnalysisReport(**report_fields, models=battle_result, compare=comparator)
dissectml/_compat.py ADDED
@@ -0,0 +1,111 @@
1
+ """Compatibility layer — accept Polars DataFrames, file paths, and dicts in public API.
2
+
3
+ Usage::
4
+
5
+ from dissectml._compat import to_pandas
6
+
7
+ df = to_pandas(user_input) # works for pd.DataFrame, pl.DataFrame, Path, str, dict, list
8
+
9
+ All public-facing functions that accept data should call ``to_pandas`` first.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import pandas as pd
18
+
19
+
20
def to_pandas(data: Any, **read_csv_kwargs: Any) -> pd.DataFrame:
    """Convert various input types to a pandas DataFrame.

    Supported inputs:

    * ``pandas.DataFrame`` — returned unchanged (same object, no copy).
    * ``polars.DataFrame`` or ``polars.LazyFrame`` — converted via ``.to_pandas()``
      (a LazyFrame is collected first).
    * ``str`` / ``pathlib.Path`` — read from disk; the reader is chosen by the
      file suffix (CSV, TSV, Parquet, Excel, JSON), any other suffix is read
      as CSV.
    * ``dict`` — passed to ``pd.DataFrame(data)``.
    * ``list`` — passed to ``pd.DataFrame(data)``.
    * ``numpy.ndarray`` — passed to ``pd.DataFrame(data)``.

    Args:
        data: Input data.
        **read_csv_kwargs: Forwarded to the pandas reader selected for a path
            input (``read_csv``, ``read_parquet``, ``read_excel`` or
            ``read_json``). Ignored for non-path inputs.

    Returns:
        pandas DataFrame.

    Raises:
        TypeError: If *data* is an unsupported type.
    """
    if isinstance(data, pd.DataFrame):
        return data

    # --- Polars (optional dep) ---
    # Only the import is guarded: an ImportError raised by the conversion
    # itself (e.g. polars needing pyarrow) must reach the caller instead of
    # being misreported as an unsupported type further down.
    try:
        import polars as pl
    except ImportError:
        pl = None  # Polars not installed — skip
    if pl is not None:
        if isinstance(data, pl.DataFrame):
            return data.to_pandas()
        if isinstance(data, pl.LazyFrame):
            return data.collect().to_pandas()

    # --- File path ---
    if isinstance(data, (str, Path)):
        path = Path(data)
        suffix = path.suffix.lower()
        if suffix == ".tsv":
            # Default to a tab separator unless the caller chose one already;
            # passing both ``sep`` and ``delimiter`` is rejected by pandas.
            if "sep" not in read_csv_kwargs and "delimiter" not in read_csv_kwargs:
                read_csv_kwargs["sep"] = "\t"
            return pd.read_csv(path, **read_csv_kwargs)
        if suffix in (".parquet", ".pq"):
            return pd.read_parquet(path, **read_csv_kwargs)
        if suffix in (".xlsx", ".xls"):
            return pd.read_excel(path, **read_csv_kwargs)
        if suffix == ".json":
            return pd.read_json(path, **read_csv_kwargs)
        # ".csv" and every unrecognised suffix: read as CSV.
        return pd.read_csv(path, **read_csv_kwargs)

    # --- dict / list ---
    if isinstance(data, (dict, list)):
        return pd.DataFrame(data)

    # --- numpy ndarray ---
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(data, np.ndarray):
        return pd.DataFrame(data)

    raise TypeError(
        f"Unsupported data type: {type(data).__name__}. "
        "Expected pandas DataFrame, polars DataFrame, file path (str/Path), "
        "dict, or list of dicts."
    )
92
+
93
+
94
def is_polars_available() -> bool:
    """Report whether ``import polars`` succeeds in this environment."""
    try:
        import polars  # noqa: F401
    except ImportError:
        return False
    else:
        return True
101
+
102
+
103
def get_pandas_version() -> tuple[int, ...]:
    """Return the numeric release part of ``pd.__version__`` as ints.

    Only the leading digits of each dot-separated component are used, and
    parsing stops at the first component that carries a non-numeric suffix
    or has no leading digits. Pre-release and local markers therefore never
    leak into the numbers: ``"2.1.0rc1"`` gives ``(2, 1, 0)`` and
    ``"2.2.0.dev0+123"`` gives ``(2, 2, 0)``.

    Returns:
        Tuple of ints, e.g. ``(2, 1, 0)``.
    """
    ascii_digits = "0123456789"
    release: list[int] = []
    for part in pd.__version__.split("."):
        # Leading run of ASCII digits; explicit set avoids str.isdigit()
        # accepting characters that int() cannot parse.
        leading = part[: len(part) - len(part.lstrip(ascii_digits))]
        if not leading:
            break
        release.append(int(leading))
        if leading != part:
            # A suffix such as "rc1" ends the numeric release segment.
            break
    return tuple(release)
dissectml/_config.py ADDED
@@ -0,0 +1,110 @@
1
+ """Global configuration for InsightML with context manager support."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import copy
7
+ import threading
8
+ from collections.abc import Generator
9
+ from dataclasses import dataclass, field
10
+ from typing import Any
11
+
12
+
13
@dataclass
class InsightMLConfig:
    """Configuration dataclass for InsightML.

    Three override levels (highest to lowest priority):
    per-call kwargs > config_context() > set_config() > defaults
    """

    # --- EDA ---
    categorical_threshold: int = 50  # nunique <= this -> CATEGORICAL
    high_cardinality_threshold: int = 100  # nunique > this -> HIGH_CARDINALITY
    text_min_avg_length: int = 30  # avg str length > this -> TEXT
    significance_level: float = 0.05  # p-value threshold for all tests
    iqr_multiplier: float = 1.5  # outlier IQR fence multiplier
    zscore_threshold: float = 3.0  # outlier z-score cutoff
    isolation_forest_contamination: float = 0.05
    max_k_clusters: int = 10  # max K for auto K-Means
    max_bivariate_pairs: int = 30  # pair limit for bivariate analysis
    correlation_methods: list[str] = field(
        default_factory=lambda: ["pearson", "spearman", "cramers_v"]
    )

    # --- Battle ---
    cv_folds: int = 5
    timeout_per_model: int = 300  # seconds per model during training
    n_jobs: int = -1  # joblib parallelism (-1 = all cores)
    random_state: int = 42

    # --- Scale ---
    large_dataset_threshold: int = 100_000  # rows; triggers auto-sampling
    sample_size: int = 50_000  # subsample size for expensive ops

    # --- Report ---
    report_theme: str = "default"
    plotly_template: str = "plotly_white"

    # --- General ---
    verbosity: int = 1  # 0=silent, 1=progress, 2=debug

    def copy_with(self, **kwargs: Any) -> InsightMLConfig:
        """Return an independent copy of this config with fields overridden.

        Args:
            **kwargs: Field names mapped to their replacement values.

        Returns:
            A new config; ``self`` is left untouched.

        Raises:
            ValueError: If a key is not a declared config field.
        """
        # Validate against the declared dataclass fields, not hasattr():
        # hasattr() would also accept method names such as "copy_with".
        for key in kwargs:
            if key not in self.__dataclass_fields__:
                raise ValueError(f"Unknown config key: {key!r}")
        # Deep copy so mutable fields (correlation_methods) are not shared
        # between the original and the copy.
        cfg = copy.deepcopy(self)
        for key, value in kwargs.items():
            setattr(cfg, key, value)
        return cfg
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Global config state (thread-local for context manager support)
64
+ # ---------------------------------------------------------------------------
65
+
66
# Process-wide defaults; set_config() mutates this instance in place.
_global_config = InsightMLConfig()
# Per-thread slot where config_context() stores its temporary override.
_thread_local = threading.local()
68
+
69
+
70
def get_config() -> InsightMLConfig:
    """Return the configuration that is active for the calling thread.

    The thread-local override installed by ``config_context()`` wins when
    present; otherwise the global config is returned.
    """
    try:
        return _thread_local.config
    except AttributeError:
        # No override set on this thread.
        return _global_config
76
+
77
+
78
def set_config(**kwargs: Any) -> None:
    """Update the global default configuration in place.

    All keys are validated before any value is written, so a call that
    contains an unknown key leaves the configuration untouched.

    Args:
        **kwargs: Config field names mapped to their new values.

    Raises:
        ValueError: If a key is not a declared config field.

    Example::

        iml.set_config(cv_folds=10, verbosity=0)
    """
    # Check against the declared dataclass fields; hasattr() would also
    # accept method names and let them be overwritten.
    known_fields = _global_config.__dataclass_fields__
    for key in kwargs:
        if key not in known_fields:
            raise ValueError(f"Unknown config key: {key!r}")
    for key, value in kwargs.items():
        setattr(_global_config, key, value)
90
+
91
+
92
@contextlib.contextmanager
def config_context(**kwargs: Any) -> Generator[InsightMLConfig, None, None]:
    """Apply configuration overrides for the duration of a with-block.

    The override is derived from the config currently active on this thread
    (an enclosing ``config_context`` if any, else the global config) and is
    stored thread-locally. On exit the previous thread-local state is
    restored, also when the block raises.

    Example::

        with iml.config_context(cv_folds=3):
            result = iml.battle(df, target="y")
    """
    previous = getattr(_thread_local, "config", None)
    parent = _global_config if previous is None else previous
    override = parent.copy_with(**kwargs)
    _thread_local.config = override
    try:
        yield override
    finally:
        if previous is not None:
            # Nested use: hand control back to the enclosing override.
            _thread_local.config = previous
        else:
            # Outermost context: drop the slot so the global config applies.
            del _thread_local.config
dissectml/_io.py ADDED
@@ -0,0 +1,63 @@
1
+ """File I/O helpers — load DataFrames from CSV, Excel, Parquet, or JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+
10
+ from dissectml.exceptions import DependencyError, UnsupportedFormatError
11
+
12
def _read_tsv(p, **kw):
    """Read a tab-separated file through ``pd.read_csv``."""
    return pd.read_csv(p, sep="\t", **kw)


# File suffix (lower-case, with leading dot) -> pandas reader callable.
_READERS: dict[str, Any] = {
    ".csv": pd.read_csv,
    ".tsv": _read_tsv,
    ".xlsx": pd.read_excel,
    ".xls": pd.read_excel,
    ".parquet": pd.read_parquet,
    ".json": pd.read_json,
}

# Suffixes accepted by read_data(), in registration order.
SUPPORTED_EXTENSIONS = list(_READERS)
23
+
24
+
25
def read_data(path: str | Path, **kwargs: Any) -> pd.DataFrame:
    """Load a DataFrame from a file path.

    The reader is looked up in ``_READERS`` by the lower-cased file suffix.
    Extra kwargs are passed directly to that reader.

    Args:
        path: Path to the data file.
        **kwargs: Additional keyword arguments forwarded to the reader.

    Returns:
        Loaded DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnsupportedFormatError: If the file extension is not supported.
        DependencyError: If the reader raises ``ImportError`` (e.g. openpyxl
            missing for .xlsx, pyarrow missing for .parquet).
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {path}")

    ext = path.suffix.lower()
    if ext not in _READERS:
        raise UnsupportedFormatError(
            f"Unsupported file format: '{ext}'. "
            f"Supported formats: {SUPPORTED_EXTENSIONS}"
        )

    reader = _READERS[ext]
    try:
        return reader(path, **kwargs)
    except ImportError as exc:
        # Work out which package is missing. Prefer the structured
        # ImportError.name (top-level package only); otherwise fall back to
        # the first quoted token of the message. If neither is available,
        # do not invent a package name for the pip hint.
        message = str(exc)
        quoted = message.split("'")
        if exc.name:
            missing = exc.name.split(".")[0]
        elif len(quoted) > 2:
            missing = quoted[1]
        else:
            missing = None

        if missing:
            detail = (
                f"Reading '{ext}' files requires '{missing}'. "
                f"Install it with: pip install {missing}"
            )
        else:
            detail = (
                f"Reading '{ext}' files requires an optional dependency "
                f"that could not be imported: {message}"
            )
        raise DependencyError(detail) from exc
dissectml/_lazy.py ADDED
@@ -0,0 +1,61 @@
1
+ """Optional dependency guard for InsightML extras."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ from types import ModuleType
7
+
8
+ from dissectml.exceptions import OptionalDependencyError
9
+
10
# Maps package import name -> (install name, extra group).
# An "extra group" that starts with "pip install" is used verbatim as the
# install hint instead of being wrapped in ``dissectml[...]``.
_EXTRA_MAP: dict[str, tuple[str, str]] = {
    "xgboost": ("xgboost", "boost"),
    "lightgbm": ("lightgbm", "boost"),
    "catboost": ("catboost", "boost"),
    "shap": ("shap", "explain"),
    "weasyprint": ("weasyprint", "report"),
    "kaleido": ("kaleido", "report"),
    "polars": ("polars", "scale"),
    "optuna": ("optuna", "scale"),
    "openpyxl": ("openpyxl", "pip install openpyxl"),
    "pyarrow": ("pyarrow", "pip install pyarrow"),
}


def _install_hint(package_name: str, extra: str | None) -> str:
    """Build the ``pip install ...`` suggestion shown for a missing package."""
    if extra is None:
        entry = _EXTRA_MAP.get(package_name)
        if entry is None:
            # Unknown package: no dissectml extra provides it, so suggest a
            # bare install rather than a non-existent ``dissectml[<name>]``.
            return f"pip install {package_name}"
        extra = entry[1]
    if extra.startswith("pip install"):
        return extra
    return f"pip install dissectml[{extra}]"


def require(package_name: str, extra: str | None = None) -> ModuleType:
    """Import an optional package, raising a helpful error if missing.

    Args:
        package_name: The import name of the package (e.g., "xgboost").
        extra: The dissectml extra group to suggest (e.g., "boost").
            If None, inferred from _EXTRA_MAP; packages not listed there
            get a bare ``pip install <package_name>`` hint.

    Returns:
        The imported module.

    Raises:
        OptionalDependencyError: If importing the package raises
            ``ImportError``.
    """
    try:
        return importlib.import_module(package_name)
    except ImportError:
        install_hint = _install_hint(package_name, extra)
        # ``from None``: the hint replaces the original ImportError traceback.
        raise OptionalDependencyError(
            f"Optional dependency '{package_name}' is not installed. "
            f"Install it with: {install_hint}"
        ) from None
53
+
54
+
55
def is_available(package_name: str) -> bool:
    """Return True when *package_name* imports without ``ImportError``."""
    try:
        importlib.import_module(package_name)
    except ImportError:
        return False
    return True
dissectml/_sampling.py ADDED
@@ -0,0 +1,72 @@
1
+ """Smart sampling strategies for large datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from dissectml._config import InsightMLConfig
8
+
9
+
10
def smart_sample(
    df: pd.DataFrame,
    target: str | None = None,
    config: InsightMLConfig | None = None,
    *,
    force: bool = False,
) -> pd.DataFrame:
    """Return a representative subsample if the DataFrame exceeds the size threshold.

    Sampling strategy (in priority order):
        1. Stratified — when *target* is an object/category/bool column or has
           at most 50 distinct values; every class is sampled at the same
           fraction.
        2. Temporal — when a datetime column (naive or tz-aware) is present;
           rows are sorted by the first such column and picked at evenly
           spaced positions across the whole range.
        3. Random — fallback using ``config.random_state``.

    Args:
        df: Input DataFrame.
        target: Target column name (enables stratified sampling if given).
        config: InsightML configuration. Uses the active config if None.
        force: If True, always sample even below threshold.

    Returns:
        *df* itself when it is empty or small enough (and *force* is False),
        otherwise a subsample with a fresh 0..k-1 index.
    """
    if config is None:
        from dissectml._config import get_config

        config = get_config()

    n = len(df)
    # An empty frame has nothing to sample; returning early also avoids a
    # division by zero below when force=True.
    if n == 0 or (not force and n <= config.large_dataset_threshold):
        return df

    sample_size = min(config.sample_size, n)

    # --- Stratified sampling (classification target present) ---
    if target is not None and target in df.columns:
        target_col = df[target]
        if target_col.dtype in ("object", "category", "bool") or (
            target_col.nunique() <= 50
        ):
            try:
                # GroupBy.sample keeps every column, including the grouping
                # column, which groupby().apply() no longer guarantees on
                # recent pandas releases.
                return (
                    df.groupby(target, group_keys=False)
                    .sample(
                        frac=sample_size / n,
                        random_state=config.random_state,
                    )
                    .reset_index(drop=True)
                )
            except Exception:
                # Deliberate best-effort: any failure here falls through to
                # temporal or random sampling instead of aborting.
                pass

    # --- Temporal sampling (datetime column present) ---
    datetime_cols = df.select_dtypes(
        include=["datetime64", "datetimetz"]
    ).columns.tolist()
    if datetime_cols:
        sorted_df = df.sort_values(datetime_cols[0])
        # Evenly spaced positions over the full range [0, n). A fixed integer
        # step followed by truncation would drop the end of the timeline
        # whenever n is not a multiple of sample_size.
        positions = [i * n // sample_size for i in range(sample_size)]
        return sorted_df.iloc[positions].reset_index(drop=True)

    # --- Random sampling (fallback) ---
    return df.sample(n=sample_size, random_state=config.random_state).reset_index(
        drop=True
    )
dissectml/_types.py ADDED
@@ -0,0 +1,114 @@
1
+ """Enums, TypedDicts, and type aliases used across InsightML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from typing import Any, TypedDict
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Enums
13
+ # ---------------------------------------------------------------------------
14
+
15
class TaskType(str, Enum):
    """Learning task selector; members compare equal to their string value."""

    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    AUTO = "auto"
19
+
20
+
21
class ColumnType(str, Enum):
    """Column kinds; members compare equal to their string value."""

    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    DATETIME = "datetime"
    TEXT = "text"
    BOOLEAN = "boolean"
    HIGH_CARDINALITY = "high_cardinality"
    CONSTANT = "constant"
    UNIQUE_ID = "unique_id"
30
+
31
+
32
class MissingnessType(str, Enum):
    """Missing-data mechanism labels; members compare equal to their string value."""

    # Missing Completely At Random
    MCAR = "MCAR"
    # Missing At Random (depends on observed data)
    MAR = "MAR"
    # Missing Not At Random (depends on missing value itself)
    MNAR = "MNAR"
    UNKNOWN = "unknown"
37
+
38
+
39
class TuningMode(str, Enum):
    """Hyperparameter tuning modes; members compare equal to their string value."""

    # Default hyperparameters only (no search)
    QUICK = "quick"
    # RandomizedSearchCV on top-N models
    TUNED = "tuned"
    # User-provided param grids
    CUSTOM = "custom"
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # TypedDicts
47
+ # ---------------------------------------------------------------------------
48
+
49
class ColumnProfile(TypedDict, total=False):
    """Per-column statistics computed by DataOverview.

    Declared with ``total=False``: every key is optional.
    """

    # --- Common fields ---
    name: str
    dtype: str
    inferred_type: str  # ColumnType value
    count: int
    unique: int
    missing_count: int
    missing_pct: float
    memory_bytes: int

    # --- Numeric fields ---
    mean: float
    median: float
    std: float
    variance: float
    min: float
    max: float
    range: float
    iqr: float
    q1: float
    q3: float
    skewness: float
    kurtosis: float

    # --- Categorical fields ---
    top_value: Any
    top_freq: int
    cardinality_ratio: float
    value_counts: dict[str, int]

    # --- DateTime fields ---
    dt_min: str
    dt_max: str
    range_days: float
    inferred_frequency: str | None
82
+
83
+
84
class DataSchema(TypedDict):
    """Schema inferred from the dataset; all keys are required."""

    # Column name -> column type
    column_types: dict[str, ColumnType]

    # Column names grouped by type
    numeric_cols: list[str]
    categorical_cols: list[str]
    datetime_cols: list[str]
    text_cols: list[str]
    boolean_cols: list[str]
    high_cardinality_cols: list[str]
    constant_cols: list[str]
    unique_id_cols: list[str]

    # Target column and task
    target_col: str | None
    task: TaskType
97
+
98
+
99
class LeakageWarning(TypedDict):
    """A potential data leakage warning for a feature; all keys are required."""

    column: str
    score: float
    # "high_correlation" | "mutual_information" | "temporal" | "derived"
    method: str
    # "critical" | "warning" | "info"
    severity: str
    explanation: str
106
+
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Type aliases
110
+ # ---------------------------------------------------------------------------
111
+
112
# Short aliases for the pandas / numpy container types.
DataFrame = pd.DataFrame
Series = pd.Series
Array = np.ndarray
dissectml/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"