dataforge-ml 0.11.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-1.0.1/PKG-INFO +95 -0
- dataforge_ml-1.0.1/README.md +71 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/pyproject.toml +3 -2
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/__init__.py +2 -2
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/config.py +13 -3
- dataforge_ml-1.0.1/src/dataforge_ml/imputation/__init__.py +21 -0
- dataforge_ml-1.0.1/src/dataforge_ml/imputation/_config.py +165 -0
- dataforge_ml-1.0.1/src/dataforge_ml/imputation/_fitted_imputer.py +290 -0
- dataforge_ml-1.0.1/src/dataforge_ml/imputation/_numeric_imputer.py +372 -0
- dataforge_ml-1.0.1/src/dataforge_ml/imputation/orchestrator.py +163 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/_data_types.py +2 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/__init__.py +2 -2
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_base.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_boolean_profiler.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_categorical.py +10 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_categorical_config.py +4 -1
- dataforge_ml-0.11.0/src/dataforge_ml/profiling/config.py → dataforge_ml-1.0.1/src/dataforge_ml/profiling/_config.py +2 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_correlation_config.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_missingness_profiler.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_tabular.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_target_profiler.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_text_profiler.py +1 -1
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_type_detector.py +1 -1
- dataforge_ml-0.11.0/src/dataforge_ml/profiling/structural.py → dataforge_ml-1.0.1/src/dataforge_ml/profiling/orchestrator.py +3 -2
- dataforge_ml-1.0.1/src/dataforge_ml/splitting/_profile_signals.py +174 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/_splitter.py +114 -2
- {dataforge_ml-0.11.0/src/dataforge_ml/profiling → dataforge_ml-1.0.1/src/dataforge_ml/utils}/_null_detection.py +2 -2
- dataforge_ml-1.0.1/src/dataforge_ml/utils/_null_normalization.py +64 -0
- dataforge_ml-1.0.1/src/dataforge_ml.egg-info/PKG-INFO +95 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/SOURCES.txt +10 -3
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/requires.txt +1 -0
- dataforge_ml-0.11.0/PKG-INFO +0 -36
- dataforge_ml-0.11.0/README.md +0 -13
- dataforge_ml-0.11.0/src/dataforge_ml.egg-info/PKG-INFO +0 -36
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/LICENSE +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/setup.cfg +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.11.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge-ml
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: A automated feature engineering and designing pipeline library
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Python: >3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: polars>=1.0.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
16
|
+
Requires-Dist: scipy>=1.10.0
|
|
17
|
+
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: chardet>=5.0.0
|
|
20
|
+
Requires-Dist: iterative-stratification>=0.1.9
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# DataForgeML
|
|
26
|
+
|
|
27
|
+
[DeepWiki](https://deepwiki.com/DEVunderdog/DataForgeML)
|
|
28
|
+
|
|
29
|
+
Automated data profiling and splitting pipeline for ML datasets.
|
|
30
|
+
|
|
31
|
+
DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install dataforge-ml
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
|
|
43
|
+
|
|
44
|
+
df = DataLoader().load("titanic.csv")
|
|
45
|
+
|
|
46
|
+
config = PipelineConfig()
|
|
47
|
+
result = StructuralProfiler(config).profile(df)
|
|
48
|
+
|
|
49
|
+
print(result.columns["Age"].semantic_type) # SemanticType.Numeric
|
|
50
|
+
print(result.dataset.row_count) # total rows
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
|
|
54
|
+
|
|
55
|
+
## Column Type Overrides
|
|
56
|
+
|
|
57
|
+
Override the auto-detected type for any column before profiling:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
config = PipelineConfig()
|
|
61
|
+
config.set_column_type("PassengerId", "identifier") # skip stats entirely
|
|
62
|
+
config.set_columns_type(["Survived", "Pclass"], "categorical")
|
|
63
|
+
|
|
64
|
+
result = StructuralProfiler(config).profile(df)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
To drop a column from all processing entirely, use `exclude_columns`:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Splitting
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from dataforge_ml import DataLoader, DataSplitter
|
|
77
|
+
|
|
78
|
+
df = DataLoader().load("titanic.csv")
|
|
79
|
+
splitter = DataSplitter(df, target="Survived", random_seed=42)
|
|
80
|
+
|
|
81
|
+
# Random train/test split (stratified by default when target is set)
|
|
82
|
+
split = splitter.random_split(test_size=0.2)
|
|
83
|
+
print(split.train.shape, split.test.shape)
|
|
84
|
+
|
|
85
|
+
# Chronological split (no temporal leakage)
|
|
86
|
+
split = splitter.time_split(time_column="date", test_size=0.2)
|
|
87
|
+
|
|
88
|
+
# K-fold cross-validation
|
|
89
|
+
for fold in splitter.kfold(k=5):
|
|
90
|
+
print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# DataForgeML
|
|
2
|
+
|
|
3
|
+
[DeepWiki](https://deepwiki.com/DEVunderdog/DataForgeML)
|
|
4
|
+
|
|
5
|
+
Automated data profiling and splitting pipeline for ML datasets.
|
|
6
|
+
|
|
7
|
+
DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install dataforge-ml
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
|
|
19
|
+
|
|
20
|
+
df = DataLoader().load("titanic.csv")
|
|
21
|
+
|
|
22
|
+
config = PipelineConfig()
|
|
23
|
+
result = StructuralProfiler(config).profile(df)
|
|
24
|
+
|
|
25
|
+
print(result.columns["Age"].semantic_type) # SemanticType.Numeric
|
|
26
|
+
print(result.dataset.row_count) # total rows
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
|
|
30
|
+
|
|
31
|
+
## Column Type Overrides
|
|
32
|
+
|
|
33
|
+
Override the auto-detected type for any column before profiling:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
config = PipelineConfig()
|
|
37
|
+
config.set_column_type("PassengerId", "identifier") # skip stats entirely
|
|
38
|
+
config.set_columns_type(["Survived", "Pclass"], "categorical")
|
|
39
|
+
|
|
40
|
+
result = StructuralProfiler(config).profile(df)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
To drop a column from all processing entirely, use `exclude_columns`:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Splitting
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from dataforge_ml import DataLoader, DataSplitter
|
|
53
|
+
|
|
54
|
+
df = DataLoader().load("titanic.csv")
|
|
55
|
+
splitter = DataSplitter(df, target="Survived", random_seed=42)
|
|
56
|
+
|
|
57
|
+
# Random train/test split (stratified by default when target is set)
|
|
58
|
+
split = splitter.random_split(test_size=0.2)
|
|
59
|
+
print(split.train.shape, split.test.shape)
|
|
60
|
+
|
|
61
|
+
# Chronological split (no temporal leakage)
|
|
62
|
+
split = splitter.time_split(time_column="date", test_size=0.2)
|
|
63
|
+
|
|
64
|
+
# K-fold cross-validation
|
|
65
|
+
for fold in splitter.kfold(k=5):
|
|
66
|
+
print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
MIT
|
|
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "dataforge-ml"
|
|
7
|
-
version = "0.11.0"
|
|
7
|
+
version = "1.0.1"
|
|
8
8
|
description = "A automated feature engineering and designing pipeline library"
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
requires-python = "
|
|
10
|
+
requires-python = ">3.10"
|
|
11
11
|
license = {text = "MIT"}
|
|
12
12
|
classifiers = [
|
|
13
13
|
"License :: OSI Approved :: MIT License",
|
|
@@ -23,6 +23,7 @@ dependencies = [
|
|
|
23
23
|
"numpy>=2.0.0",
|
|
24
24
|
"pandas>=2.0.0",
|
|
25
25
|
"chardet>=5.0.0",
|
|
26
|
+
"iterative-stratification>=0.1.9",
|
|
26
27
|
]
|
|
27
28
|
|
|
28
29
|
[project.optional-dependencies]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
|
|
2
|
-
from .profiling.structural import StructuralProfiler
|
|
3
|
-
from .profiling.config import (
|
|
2
|
+
from .profiling.orchestrator import StructuralProfiler
|
|
3
|
+
from .profiling._config import (
|
|
4
4
|
ProfileConfig,
|
|
5
5
|
StructuralProfileResult,
|
|
6
6
|
ColumnProfile,
|
|
@@ -6,7 +6,8 @@ from enum import StrEnum
|
|
|
6
6
|
from typing import TYPE_CHECKING, Union
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
|
-
from dataforge_ml.profiling.config import ProfileConfig
|
|
9
|
+
from dataforge_ml.profiling._config import ProfileConfig
|
|
10
|
+
from dataforge_ml.imputation._config import ImputationConfig
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class SemanticType(StrEnum):
|
|
@@ -32,10 +33,15 @@ class PipelinePhase(StrEnum):
|
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
def _default_profile_config() -> ProfileConfig:
|
|
35
|
-
from dataforge_ml.profiling.config import ProfileConfig
|
|
36
|
+
from dataforge_ml.profiling._config import ProfileConfig
|
|
36
37
|
return ProfileConfig()
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
def _default_imputation_config() -> ImputationConfig:
|
|
41
|
+
from dataforge_ml.imputation._config import ImputationConfig
|
|
42
|
+
return ImputationConfig()
|
|
43
|
+
|
|
44
|
+
|
|
39
45
|
@dataclass
|
|
40
46
|
class PipelineConfig:
|
|
41
47
|
"""
|
|
@@ -58,6 +64,7 @@ class PipelineConfig:
|
|
|
58
64
|
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
59
65
|
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
60
66
|
profiling: ProfileConfig = field(default_factory=_default_profile_config)
|
|
67
|
+
imputation: ImputationConfig = field(default_factory=_default_imputation_config)
|
|
61
68
|
|
|
62
69
|
def resolve_active_columns(
|
|
63
70
|
self, phase: PipelinePhase, available_columns: list[str]
|
|
@@ -107,11 +114,13 @@ class PipelineConfig:
|
|
|
107
114
|
for col, sem_type in self.column_overrides.items()
|
|
108
115
|
},
|
|
109
116
|
"profiling": self.profiling.to_dict(),
|
|
117
|
+
"imputation": self.imputation.to_dict(),
|
|
110
118
|
}
|
|
111
119
|
|
|
112
120
|
@classmethod
|
|
113
121
|
def from_dict(cls, data: dict) -> PipelineConfig:
|
|
114
|
-
from dataforge_ml.profiling.config import ProfileConfig
|
|
122
|
+
from dataforge_ml.profiling._config import ProfileConfig
|
|
123
|
+
from dataforge_ml.imputation._config import ImputationConfig
|
|
115
124
|
return cls(
|
|
116
125
|
exclude_columns=list(data.get("exclude_columns", [])),
|
|
117
126
|
phase_exclusions={
|
|
@@ -123,6 +132,7 @@ class PipelineConfig:
|
|
|
123
132
|
for col, sem_str in data.get("column_overrides", {}).items()
|
|
124
133
|
},
|
|
125
134
|
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
135
|
+
imputation=ImputationConfig.from_dict(data.get("imputation", {})),
|
|
126
136
|
)
|
|
127
137
|
|
|
128
138
|
def to_json(self, indent: int = 2) -> str:
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from ._config import (
|
|
2
|
+
ColumnImputationRecord,
|
|
3
|
+
ImputationConfig,
|
|
4
|
+
ImputationResult,
|
|
5
|
+
ImputationStrategy,
|
|
6
|
+
NumericImputationConfig,
|
|
7
|
+
)
|
|
8
|
+
from ._fitted_imputer import FittedImputer, UnfittedColumnError
|
|
9
|
+
from .orchestrator import ImputationOrchestrator, SplitImbalanceWarning
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"ImputationStrategy",
|
|
13
|
+
"NumericImputationConfig",
|
|
14
|
+
"ImputationConfig",
|
|
15
|
+
"ColumnImputationRecord",
|
|
16
|
+
"ImputationResult",
|
|
17
|
+
"FittedImputer",
|
|
18
|
+
"UnfittedColumnError",
|
|
19
|
+
"ImputationOrchestrator",
|
|
20
|
+
"SplitImbalanceWarning",
|
|
21
|
+
]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration and result dataclasses for the imputation phase — Phase 2.
|
|
3
|
+
|
|
4
|
+
ImputationConfig controls strategy thresholds and MNAR declarations.
|
|
5
|
+
Result dataclasses carry per-column audit records and the imputed DataFrame.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import StrEnum
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
import polars as pl
|
|
15
|
+
|
|
16
|
+
from ..config import SemanticType
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ImputationStrategy(StrEnum):
|
|
20
|
+
Mean = "mean"
|
|
21
|
+
Median = "median"
|
|
22
|
+
Mode = "mode"
|
|
23
|
+
KNN = "knn"
|
|
24
|
+
Regression = "regression"
|
|
25
|
+
MICE = "mice"
|
|
26
|
+
Constant = "constant"
|
|
27
|
+
Dropped = "dropped"
|
|
28
|
+
Passthrough = "passthrough"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class NumericImputationConfig:
|
|
33
|
+
"""
|
|
34
|
+
Operational thresholds for the numeric imputation sub-processor.
|
|
35
|
+
|
|
36
|
+
Parameters
|
|
37
|
+
----------
|
|
38
|
+
knn_max_rows : int
|
|
39
|
+
Maximum number of rows before KNN is skipped in favour of Regression.
|
|
40
|
+
knn_max_features : int
|
|
41
|
+
Maximum number of features before KNN is skipped in favour of Regression.
|
|
42
|
+
regression_min_rows : int
|
|
43
|
+
Minimum number of rows required to fit a stable Regression model.
|
|
44
|
+
mnar_constant_fill : float
|
|
45
|
+
Constant value used to fill MNAR-declared numeric columns.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
knn_max_rows: int = 50_000
|
|
49
|
+
knn_max_features: int = 50
|
|
50
|
+
regression_min_rows: int = 500
|
|
51
|
+
mnar_constant_fill: float = -1
|
|
52
|
+
|
|
53
|
+
def to_dict(self) -> dict:
|
|
54
|
+
return {
|
|
55
|
+
"knn_max_rows": self.knn_max_rows,
|
|
56
|
+
"knn_max_features": self.knn_max_features,
|
|
57
|
+
"regression_min_rows": self.regression_min_rows,
|
|
58
|
+
"mnar_constant_fill": self.mnar_constant_fill,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
@classmethod
|
|
62
|
+
def from_dict(cls, data: dict) -> NumericImputationConfig:
|
|
63
|
+
return cls(
|
|
64
|
+
knn_max_rows=int(data.get("knn_max_rows", 50_000)),
|
|
65
|
+
knn_max_features=int(data.get("knn_max_features", 50)),
|
|
66
|
+
regression_min_rows=int(data.get("regression_min_rows", 500)),
|
|
67
|
+
mnar_constant_fill=float(data.get("mnar_constant_fill", -1)),
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
|
|
72
|
+
class ImputationConfig:
|
|
73
|
+
"""
|
|
74
|
+
Cross-type Phase 2 configuration.
|
|
75
|
+
|
|
76
|
+
Parameters
|
|
77
|
+
----------
|
|
78
|
+
numeric : NumericImputationConfig
|
|
79
|
+
Thresholds and fill values for numeric imputation.
|
|
80
|
+
mnar_columns : list[str]
|
|
81
|
+
Columns declared by the user as Missing Not At Random.
|
|
82
|
+
These receive Constant fill + a missingness indicator regardless
|
|
83
|
+
of the signals detected in Phase 1.
|
|
84
|
+
add_indicator_columns : list[str]
|
|
85
|
+
Columns for which a binary missingness indicator should be added
|
|
86
|
+
even when they are not MNAR.
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
numeric: NumericImputationConfig = field(default_factory=NumericImputationConfig)
|
|
90
|
+
mnar_columns: list[str] = field(default_factory=list)
|
|
91
|
+
add_indicator_columns: list[str] = field(default_factory=list)
|
|
92
|
+
|
|
93
|
+
def to_dict(self) -> dict:
|
|
94
|
+
return {
|
|
95
|
+
"numeric": self.numeric.to_dict(),
|
|
96
|
+
"mnar_columns": list(self.mnar_columns),
|
|
97
|
+
"add_indicator_columns": list(self.add_indicator_columns),
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
@classmethod
|
|
101
|
+
def from_dict(cls, data: dict) -> ImputationConfig:
|
|
102
|
+
return cls(
|
|
103
|
+
numeric=NumericImputationConfig.from_dict(data.get("numeric", {})),
|
|
104
|
+
mnar_columns=list(data.get("mnar_columns", [])),
|
|
105
|
+
add_indicator_columns=list(data.get("add_indicator_columns", [])),
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
|
|
110
|
+
class ColumnImputationRecord:
|
|
111
|
+
"""
|
|
112
|
+
Per-column audit entry produced after fit().
|
|
113
|
+
|
|
114
|
+
Parameters
|
|
115
|
+
----------
|
|
116
|
+
column : str
|
|
117
|
+
Column name.
|
|
118
|
+
semantic_type : SemanticType
|
|
119
|
+
Detected semantic type of the column.
|
|
120
|
+
strategy : ImputationStrategy
|
|
121
|
+
Strategy applied to this column.
|
|
122
|
+
fill_value : Any, optional
|
|
123
|
+
Scalar fill value learned from training data (None for model-based strategies).
|
|
124
|
+
indicator_added : bool
|
|
125
|
+
Whether a binary missingness indicator column was appended.
|
|
126
|
+
signals : list[str]
|
|
127
|
+
Human-readable reasons that drove the strategy decision.
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
column: str
|
|
131
|
+
semantic_type: SemanticType
|
|
132
|
+
strategy: ImputationStrategy
|
|
133
|
+
fill_value: Optional[Any] = None
|
|
134
|
+
indicator_added: bool = False
|
|
135
|
+
signals: list[str] = field(default_factory=list)
|
|
136
|
+
|
|
137
|
+
def to_dict(self) -> dict:
|
|
138
|
+
return {
|
|
139
|
+
"column": self.column,
|
|
140
|
+
"semantic_type": str(self.semantic_type),
|
|
141
|
+
"strategy": str(self.strategy),
|
|
142
|
+
"fill_value": self.fill_value,
|
|
143
|
+
"indicator_added": self.indicator_added,
|
|
144
|
+
"signals": list(self.signals),
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
|
|
149
|
+
class ImputationResult:
|
|
150
|
+
"""
|
|
151
|
+
Output of FittedImputer.transform().
|
|
152
|
+
|
|
153
|
+
Parameters
|
|
154
|
+
----------
|
|
155
|
+
dataframe : pl.DataFrame
|
|
156
|
+
DataFrame with imputed values (and any indicator columns appended).
|
|
157
|
+
records : dict[str, ColumnImputationRecord]
|
|
158
|
+
Per-column audit log keyed by column name.
|
|
159
|
+
dropped_columns : list[str]
|
|
160
|
+
Columns removed because they exceeded the drop threshold (>50% missing).
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
dataframe: pl.DataFrame
|
|
164
|
+
records: dict[str, ColumnImputationRecord] = field(default_factory=dict)
|
|
165
|
+
dropped_columns: list[str] = field(default_factory=list)
|