pypreflight 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- preflight/__init__.py +210 -0
- preflight/assembler.py +384 -0
- preflight/cleaner.py +333 -0
- preflight/cli.py +108 -0
- preflight/engineer.py +270 -0
- preflight/profiler.py +347 -0
- preflight/report.py +388 -0
- preflight/types.py +70 -0
- pypreflight-0.1.0.dist-info/METADATA +128 -0
- pypreflight-0.1.0.dist-info/RECORD +13 -0
- pypreflight-0.1.0.dist-info/WHEEL +4 -0
- pypreflight-0.1.0.dist-info/entry_points.txt +2 -0
- pypreflight-0.1.0.dist-info/licenses/LICENSE +21 -0
preflight/__init__.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
exposes prepare(), profile(), clean(), engineer(), compare() as public API
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from preflight import assembler
|
|
7
|
+
from preflight.types import PrepResult
|
|
8
|
+
from preflight.report import Report
|
|
9
|
+
from preflight.profiler import run_profiler
|
|
10
|
+
from preflight.cleaner import run_cleaner
|
|
11
|
+
from preflight.engineer import run_engineer
|
|
12
|
+
def prepare(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    model_hint: str = "tree",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Validate the inputs, then hand everything to ``assembler.run_assembler``.

    Args:
        df: Input frame; must have at least one row and two columns.
        target: Label of the target column; must be present in ``df``.
        task: Either ``"classification"`` or ``"regression"``.
        model_hint: Either ``"tree"`` or ``"linear"``.
        drop_threshold: Forwarded unchanged to the assembler.
        outlier_method: Forwarded unchanged to the assembler.
        cardinality_threshold: Forwarded unchanged to the assembler.

    Returns:
        Whatever ``assembler.run_assembler`` returns for these arguments.

    Raises:
        ValueError: If any of the input checks below fails. The checks run in
            a fixed order, so the first failing one decides the message.
    """
    if not len(df):
        raise ValueError("DataFrame cannot be empty")

    labels = df.columns
    if target not in labels:
        raise ValueError(f"target '{target}' not found in DataFrame columns: {list(df.columns)}")
    if len(labels) < 2:
        raise ValueError("DataFrame must contain at least one feature column besides the target")
    if labels.duplicated().any():
        raise ValueError("DataFrame contains duplicate column names")

    allowed_tasks = ("classification", "regression")
    if task not in allowed_tasks:
        raise ValueError(f"task must be 'classification' or 'regression', got '{task}'")

    allowed_hints = ("tree", "linear")
    if model_hint not in allowed_hints:
        raise ValueError(f"model_hint must be 'tree' or 'linear', got '{model_hint}'")

    return assembler.run_assembler(
        df=df,
        target=target,
        task=task,
        model_hint=model_hint,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
        cardinality_threshold=cardinality_threshold,
    )
|
|
48
|
+
|
|
49
|
+
def profile(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Validate the inputs and run only the profiling stage.

    Args:
        df: Input frame; must have at least one row and two columns.
        target: Label of the target column; must be present in ``df``.
        task: Either ``"classification"`` or ``"regression"``.
        cardinality_threshold: Forwarded unchanged to ``run_profiler``.

    Returns:
        A ``PrepResult`` holding the input ``df`` itself (not a copy), no
        pipeline, and a ``Report`` built from the profiler entries.

    Raises:
        ValueError: If any of the input checks fails (first failure wins).
    """
    if not len(df):
        raise ValueError("DataFrame cannot be empty")

    labels = df.columns
    if target not in labels:
        raise ValueError(f"target '{target}' not found in DataFrame columns: {list(df.columns)}")
    if len(labels) < 2:
        raise ValueError("DataFrame must contain at least one feature column besides the target")
    if labels.duplicated().any():
        raise ValueError("DataFrame contains duplicate column names")
    if task not in ("classification", "regression"):
        raise ValueError(f"task must be 'classification' or 'regression', got '{task}'")

    # Only the report entries are needed here; the column profiles are discarded.
    _, profiler_entries = run_profiler(
        df=df,
        target=target,
        task=task,
        cardinality_threshold=cardinality_threshold,
    )
    report = Report(profiler_entries)
    return PrepResult(df=df, pipeline=None, report=report)
|
|
74
|
+
|
|
75
|
+
def clean(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Validate the inputs and run the profiling and cleaning stages.

    Args:
        df: Input frame; must have at least one row and two columns.
        target: Label of the target column; must be present in ``df``.
        task: Either ``"classification"`` or ``"regression"``.
        drop_threshold: Forwarded unchanged to ``run_cleaner``.
        outlier_method: Forwarded unchanged to ``run_cleaner``.
        cardinality_threshold: Forwarded unchanged to ``run_profiler``.

    Returns:
        A ``PrepResult`` holding the cleaned frame returned by ``run_cleaner``,
        no pipeline, and a ``Report`` built from the profiler entries followed
        by the cleaner entries.

    Raises:
        ValueError: If any of the input checks fails (first failure wins).
    """
    if not len(df):
        raise ValueError("DataFrame cannot be empty")

    labels = df.columns
    if target not in labels:
        raise ValueError(f"target '{target}' not found in DataFrame columns: {list(df.columns)}")
    if len(labels) < 2:
        raise ValueError("DataFrame must contain at least one feature column besides the target")
    if labels.duplicated().any():
        raise ValueError("DataFrame contains duplicate column names")
    if task not in ("classification", "regression"):
        raise ValueError(f"task must be 'classification' or 'regression', got '{task}'")

    profiles, profiler_entries = run_profiler(
        df=df,
        target=target,
        task=task,
        cardinality_threshold=cardinality_threshold,
    )
    # The surviving profiles and the cleaner specs are not needed by this entry point.
    df_clean, _, cleaner_entries, _ = run_cleaner(
        df=df,
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    report = Report(profiler_entries + cleaner_entries)
    return PrepResult(df=df_clean, pipeline=None, report=report)
|
|
105
|
+
|
|
106
|
+
def engineer(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    model_hint: str = "tree",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Validate the inputs and run profiling, cleaning and feature engineering.

    Unlike ``prepare``, the stage functions are called directly here and the
    returned ``PrepResult`` carries no pipeline.

    Args:
        df: Input frame; must have at least one row and two columns.
        target: Label of the target column; must be present in ``df``.
        task: Either ``"classification"`` or ``"regression"``.
        model_hint: Either ``"tree"`` or ``"linear"``.
        drop_threshold: Forwarded unchanged to ``run_cleaner``.
        outlier_method: Forwarded unchanged to ``run_cleaner``.
        cardinality_threshold: Forwarded to ``run_profiler`` and ``run_engineer``.

    Returns:
        A ``PrepResult`` holding the engineered frame, no pipeline, and a
        ``Report`` built from the profiler, cleaner and engineer entries in
        that order.

    Raises:
        ValueError: If any of the input checks fails (first failure wins).
    """
    if not len(df):
        raise ValueError("DataFrame cannot be empty")

    labels = df.columns
    if target not in labels:
        raise ValueError(f"target '{target}' not found in DataFrame columns: {list(df.columns)}")
    if len(labels) < 2:
        raise ValueError("DataFrame must contain at least one feature column besides the target")
    if labels.duplicated().any():
        raise ValueError("DataFrame contains duplicate column names")
    if task not in ("classification", "regression"):
        raise ValueError(f"task must be 'classification' or 'regression', got '{task}'")
    if model_hint not in ("tree", "linear"):
        raise ValueError(f"model_hint must be 'tree' or 'linear', got '{model_hint}'")

    profiles, profiler_entries = run_profiler(
        df=df,
        target=target,
        task=task,
        cardinality_threshold=cardinality_threshold,
    )
    # The cleaner's specs are not used by this entry point.
    df_clean, surviving_profiles, cleaner_entries, _ = run_cleaner(
        df=df,
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    # The engineer stage works on the cleaned frame and the post-cleaning profiles.
    df_eng, engineer_entries, _ = run_engineer(
        df=df_clean,
        profiles=surviving_profiles,
        target=target,
        model_hint=model_hint,
        cardinality_threshold=cardinality_threshold,
    )
    report = Report(profiler_entries + cleaner_entries + engineer_entries)
    return PrepResult(df=df_eng, pipeline=None, report=report)
|
|
142
|
+
|
|
143
|
+
def _extract_actions_per_column(report: Report | None) -> dict[str, set[str]]:
|
|
144
|
+
if report is None:
|
|
145
|
+
return {}
|
|
146
|
+
|
|
147
|
+
actions = {}
|
|
148
|
+
for entry in report.entries:
|
|
149
|
+
if entry.column not in actions:
|
|
150
|
+
actions[entry.column] = set()
|
|
151
|
+
actions[entry.column].add(entry.action)
|
|
152
|
+
return actions
|
|
153
|
+
|
|
154
|
+
def _compute_decision_diff(actions_a: dict[str, set[str]], actions_b: dict[str, set[str]]) -> list[str]:
|
|
155
|
+
diff = []
|
|
156
|
+
shared_cols = set(actions_a.keys()).intersection(actions_b.keys())
|
|
157
|
+
for col in sorted(shared_cols):
|
|
158
|
+
if actions_a[col] != actions_b[col]:
|
|
159
|
+
diff.append(col)
|
|
160
|
+
return diff
|
|
161
|
+
|
|
162
|
+
def _sorted_labels(labels):
    """Return ``labels`` sorted, falling back to string order for mixed types."""
    try:
        return sorted(labels)
    except TypeError:
        # Labels that cannot be ordered against each other (e.g. int and str
        # column names in the same frame) are ordered by their string form.
        return sorted(labels, key=str)


def _preview_labels(labels, limit=5):
    """Render up to ``limit`` labels as a comma-separated preview string."""
    # Labels are stringified because str.join rejects non-string items such as
    # integer column names.
    shown = ", ".join(str(label) for label in labels[:limit])
    return f"{shown}{'...' if len(labels) > limit else ''}"


def compare(a: PrepResult, b: PrepResult) -> dict:
    """Compare two results, print a short summary and return the differences.

    Args:
        a: First result; its ``df`` and ``report`` attributes are read.
        b: Second result; its ``df`` and ``report`` attributes are read.

    Returns:
        A dict with the keys ``shape_a``, ``shape_b``, ``columns_only_in_a``,
        ``columns_only_in_b``, ``columns_in_both``, ``report_entry_counts_a``,
        ``report_entry_counts_b`` and ``decision_diff``. The entry counts are
        ``None`` for a result without a report, and ``decision_diff`` is empty
        unless both results carry a report.
    """
    cols_a = set(a.df.columns)
    cols_b = set(b.df.columns)

    columns_only_in_a = _sorted_labels(cols_a - cols_b)
    columns_only_in_b = _sorted_labels(cols_b - cols_a)
    columns_in_both = _sorted_labels(cols_a & cols_b)

    report_entry_counts_a = a.report.summary_counts() if a.report is not None else None
    report_entry_counts_b = b.report.summary_counts() if b.report is not None else None

    if a.report is None or b.report is None:
        # Without both reports there is nothing to diff, so the per-column
        # action maps are never consulted and need not be built.
        actions_a, actions_b, decision_diff = {}, {}, []
    else:
        actions_a = _extract_actions_per_column(a.report)
        actions_b = _extract_actions_per_column(b.report)
        decision_diff = _compute_decision_diff(actions_a, actions_b)

    diff = {
        "shape_a": a.df.shape,
        "shape_b": b.df.shape,
        "columns_only_in_a": columns_only_in_a,
        "columns_only_in_b": columns_only_in_b,
        "columns_in_both": columns_in_both,
        "report_entry_counts_a": report_entry_counts_a,
        "report_entry_counts_b": report_entry_counts_b,
        "decision_diff": decision_diff,
    }

    # Print summary
    print("=== PreFlight Compare ===")
    print(f"Shape A: {diff['shape_a']}")
    print(f"Shape B: {diff['shape_b']}")

    if columns_only_in_a:
        print(f"Columns only in A ({len(columns_only_in_a)}): {_preview_labels(columns_only_in_a)}")

    if columns_only_in_b:
        print(f"Columns only in B ({len(columns_only_in_b)}): {_preview_labels(columns_only_in_b)}")

    if decision_diff:
        print("Differing decisions in shared columns:")
        for col in decision_diff:
            print(f" {col}: A={actions_a.get(col, set())} | B={actions_b.get(col, set())}")

    print("=========================")

    return diff
|
preflight/assembler.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
4
|
+
from sklearn.pipeline import Pipeline
|
|
5
|
+
from sklearn.exceptions import NotFittedError
|
|
6
|
+
|
|
7
|
+
from preflight.types import ColumnProfile, SemanticType, PrepResult
|
|
8
|
+
from preflight.profiler import run_profiler
|
|
9
|
+
from preflight.report import Report
|
|
10
|
+
from preflight.cleaner import (
|
|
11
|
+
run_cleaner,
|
|
12
|
+
add_missing_indicator,
|
|
13
|
+
normalize_category_values,
|
|
14
|
+
coerce_string_dates_to_datetime,
|
|
15
|
+
group_rare_categories,
|
|
16
|
+
)
|
|
17
|
+
from preflight.engineer import run_engineer, expand_datetime
|
|
18
|
+
|
|
19
|
+
# =====================================================================
|
|
20
|
+
# Architectural Decision: Two-Phase Pipeline Assembly
|
|
21
|
+
# ---------------------------------------------------------------------
|
|
22
|
+
# EngineerTransformer requires the POST-CLEANING column profiles.
|
|
23
|
+
# However, the surviving profiles are only known AFTER CleanerTransformer
|
|
24
|
+
# computes its dynamic drops during `fit()`.
|
|
25
|
+
#
|
|
26
|
+
# To resolve this coupling without breaking standard sklearn Pipeline
|
|
27
|
+
# semantics, the `build_pipeline` function returns an UNFIT pipeline
|
|
28
|
+
# (which would fail or process dropped columns incorrectly if fitted
|
|
29
|
+
# blindly). Instead, `build_pipeline_two_phase` acts as the actual
|
|
30
|
+
# usable constructor: it manually fits the Cleaner stage, reads the
|
|
31
|
+
# resulting `fitted_profiles_`, uses them to construct and fit the
|
|
32
|
+
# Engineer stage, and then returns a fully assembled, pre-fitted
|
|
33
|
+
# Pipeline ready for `transform()`.
|
|
34
|
+
# =====================================================================
|
|
35
|
+
|
|
36
|
+
class CleanerTransformer(BaseEstimator, TransformerMixin):
    """sklearn-style wrapper around the cleaning stage.

    ``fit`` delegates to ``run_cleaner`` and freezes every decision needed at
    transform time (columns to drop, missing-indicator flags, winsorization
    bounds) so that ``transform`` never recomputes statistics on new data.
    """

    def __init__(
        self,
        profiles: list[ColumnProfile],
        target: str,
        drop_threshold: float = 0.6,
        outlier_method: str = "iqr",
    ) -> None:
        self.profiles = profiles
        self.target = target
        self.drop_threshold = drop_threshold
        self.outlier_method = outlier_method

    @staticmethod
    def _winsor_bounds(series: pd.Series, method: str):
        """Return ``(lower, upper)`` clip bounds for ``method``, or ``None`` if unknown."""
        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            spread = q3 - q1
            return q1 - 1.5 * spread, q3 + 1.5 * spread
        if method == "zscore":
            center = series.mean()
            sigma = series.std()
            return center - 3 * sigma, center + 3 * sigma
        return None

    def fit(self, X: pd.DataFrame, y=None) -> "CleanerTransformer":
        """Run the cleaner on ``X`` and record the fit-time decisions.

        Sets ``fitted_profiles_``, ``report_entries_``, ``specs_`` and
        ``columns_to_drop_``. ``y`` is accepted for sklearn compatibility and
        is not used.
        """
        # run_cleaner computes the required stats; the cleaned frame itself is discarded.
        _, self.fitted_profiles_, self.report_entries_, self.specs_ = run_cleaner(
            X, self.profiles, self.target, self.drop_threshold, self.outlier_method
        )

        kept = {profile.name for profile in self.fitted_profiles_}
        self.columns_to_drop_ = [profile.name for profile in self.profiles if profile.name not in kept]

        # Augment specs_ with exact bounds and indicators to avoid recomputing on test data
        indicator_types = (SemanticType.CATEGORICAL_LOW, SemanticType.NUMERIC_FEATURE)
        for profile in self.fitted_profiles_:
            name = profile.name
            col_spec = self.specs_.setdefault(name, {})

            # Missing indicator logic from cleaner.py
            if profile.missing_rate > 0 and profile.semantic_type in indicator_types:
                col_spec["add_missing_indicator"] = True

            # Winsorization bounds logic
            if profile.semantic_type == SemanticType.NUMERIC_FEATURE and "outlier_method" in col_spec:
                bounds = self._winsor_bounds(X[name], col_spec["outlier_method"])
                if bounds is not None:
                    col_spec["lower_bound"], col_spec["upper_bound"] = bounds

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the frozen cleaning decisions to a copy of ``X``.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not hasattr(self, "specs_"):
            raise NotFittedError("CleanerTransformer is not fitted yet. Call 'fit' before 'transform'.")

        X_out = X.copy()

        # 1. Drop columns (by name, not by recomputing missingness)
        present_drops = [name for name in self.columns_to_drop_ if name in X_out.columns]
        X_out = X_out.drop(columns=present_drops)

        # 2. Duplicate row removal is explicitly skipped here.
        #    Removing rows during transform (inference) would silently drop rows the caller expects back.

        categorical_types = (SemanticType.CATEGORICAL_LOW, SemanticType.CATEGORICAL_HIGH)
        for profile in self.fitted_profiles_:
            name = profile.name
            if name == self.target or name not in X_out.columns:
                continue

            col_spec = self.specs_.get(name, {})
            kind = profile.semantic_type

            # Category normalization applies to both CATEGORICAL_LOW and CATEGORICAL_HIGH
            if kind in categorical_types:
                X_out[name] = normalize_category_values(X_out[name])

            # Date coercion
            if kind == SemanticType.DATETIME_STRING:
                X_out[name] = coerce_string_dates_to_datetime(X_out[name])

            # Group rare categories
            if kind == SemanticType.CATEGORICAL_HIGH and "rare_categories" in col_spec:
                X_out[name] = group_rare_categories(X_out[name], col_spec["rare_categories"])

            # Add missing indicator based on frozen fit-time decision
            # (must happen before imputation, which fills the missing values)
            if col_spec.get("add_missing_indicator"):
                indicator = add_missing_indicator(X_out[name])
                X_out[indicator.name] = indicator

            # Impute values
            if "impute_strategy" in col_spec:
                X_out[name] = X_out[name].fillna(col_spec["impute_value"])

            # Winsorize using bounds computed at fit-time
            has_bounds = "lower_bound" in col_spec and "upper_bound" in col_spec
            if kind == SemanticType.NUMERIC_FEATURE and has_bounds:
                X_out[name] = X_out[name].clip(
                    lower=col_spec["lower_bound"],
                    upper=col_spec["upper_bound"],
                )

        return X_out

    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Return the surviving column names followed by the indicator names.

        NOTE(review): the indicator names are built as ``"<column>_missing"``
        here, whereas ``transform`` uses the name of the series returned by
        ``add_missing_indicator`` - confirm the two always agree.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not hasattr(self, "fitted_profiles_"):
            raise NotFittedError("CleanerTransformer is not fitted yet.")

        base_names = [profile.name for profile in self.fitted_profiles_]
        indicator_names = [
            f"{profile.name}_missing"
            for profile in self.fitted_profiles_
            if self.specs_.get(profile.name, {}).get("add_missing_indicator")
        ]
        return np.array(base_names + indicator_names)
|
|
147
|
+
|
|
148
|
+
class EngineerTransformer(BaseEstimator, TransformerMixin):
    """sklearn-style wrapper around the feature-engineering stage.

    ``fit`` delegates to ``run_engineer`` and stores the per-column specs and
    the resulting column layout; ``transform`` replays those specs on new data
    and reindexes the result to the fit-time layout.
    """

    def __init__(
        self,
        profiles: list[ColumnProfile],
        target: str,
        model_hint: str,
        cardinality_threshold: int = 20,
    ) -> None:
        self.profiles = profiles
        self.target = target
        self.model_hint = model_hint
        self.cardinality_threshold = cardinality_threshold

    def fit(self, X: pd.DataFrame, y=None) -> "EngineerTransformer":
        """Run the engineer stage on ``X`` (plus ``y``) and record its specs.

        Sets ``report_entries_``, ``specs_`` and ``output_columns_``.

        Raises:
            ValueError: If ``y`` is ``None`` while a CATEGORICAL_HIGH profile
                is present (those columns are target-encoded).
        """
        high_card_names = [
            p.name for p in self.profiles if p.semantic_type == SemanticType.CATEGORICAL_HIGH
        ]
        if y is None and high_card_names:
            raise ValueError("Target y must be provided for target encoding high-cardinality columns.")

        df_for_engineer = X.copy()
        if y is not None:
            df_for_engineer[self.target] = y

        df_eng, self.report_entries_, self.specs_ = run_engineer(
            df_for_engineer, self.profiles, self.target, self.model_hint, self.cardinality_threshold
        )

        cols = list(df_eng.columns)
        # The target was attached only for fitting; keep it out of the output
        # layout unless the caller's X already contained it.
        if self.target in cols and (y is not None and self.target not in X.columns):
            cols.remove(self.target)
        self.output_columns_ = cols

        if y is not None:
            needs_global_mean = [
                name
                for name in high_card_names
                if name in self.specs_ and isinstance(self.specs_[name], dict)
            ]
            # Compute the target mean only when a column actually uses it:
            # taking the mean of a non-numeric target (e.g. string class
            # labels) raises, and must not break fits that never need it.
            if needs_global_mean:
                target_values = y if isinstance(y, pd.Series) else pd.Series(y)
                global_mean = target_values.mean()
                for name in needs_global_mean:
                    self.specs_[name]["global_mean"] = global_mean

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replay the fitted per-column transforms on a copy of ``X``.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not hasattr(self, "specs_"):
            raise NotFittedError("EngineerTransformer is not fitted yet.")

        X_out = X.copy()

        for p in self.profiles:
            col = p.name

            if col not in X_out.columns or col == self.target:
                continue

            spec = self.specs_.get(col, {})
            trans_type = spec.get("transform")

            if trans_type == "passthrough":
                continue

            elif trans_type == "expand_datetime":
                # The source column is replaced by its expanded parts.
                expanded = expand_datetime(X_out[col])
                X_out = X_out.drop(columns=[col]).join(expanded)

            elif trans_type == "ordinal_encode":
                # Values absent from the fit-time mapping are encoded as -1.
                mapping = spec["mapping"]
                X_out[col] = X_out[col].map(mapping).fillna(-1).astype(int)

            elif trans_type == "one_hot_encode":
                dummies = pd.get_dummies(X_out[col], prefix=col, prefix_sep='_')
                X_out = X_out.drop(columns=[col]).join(dummies)

            elif trans_type == "target_encode_cross_fit":
                # Categories absent from the mapping fall back to the global mean.
                mapping = spec["mapping"]
                global_mean = spec.get("global_mean", 0.0)
                X_out[col] = X_out[col].map(mapping).fillna(global_mean)

            elif trans_type == "linear_numeric":
                if spec.get("log1p"):
                    X_out[col] = np.log1p(X_out[col])

                params = spec["scale_params"]
                mean_val = params["mean"]
                std_val = params["std"]
                X_out[col] = (X_out[col] - mean_val) / std_val

        # Columns expected from fit time but absent here (e.g. a one-hot
        # category not present in this data) are filled with False.
        for expected_col in self.output_columns_:
            if expected_col not in X_out.columns:
                X_out[expected_col] = False

        # Enforce the fit-time column set and order; extra columns are dropped.
        X_out = X_out.reindex(columns=self.output_columns_)

        return X_out

    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Return the fit-time output column names.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not hasattr(self, "output_columns_"):
            raise NotFittedError("EngineerTransformer is not fitted yet.")
        return np.array(self.output_columns_)
|
|
249
|
+
|
|
250
|
+
def build_pipeline(
    profiles: list[ColumnProfile],
    target: str,
    model_hint: str,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> Pipeline:
    """Assemble an UNFIT cleaner -> engineer pipeline with pandas output.

    Both stages receive the same ``profiles``; nothing is fitted here.

    Args:
        profiles: Column profiles handed to both transformers.
        target: Target column label handed to both transformers.
        model_hint: Forwarded to the engineer stage.
        drop_threshold: Forwarded to the cleaner stage.
        outlier_method: Forwarded to the cleaner stage.
        cardinality_threshold: Forwarded to the engineer stage.

    Returns:
        A ``Pipeline`` with the steps ``"cleaner"`` and ``"engineer"``.
    """
    cleaner_stage = CleanerTransformer(
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    engineer_stage = EngineerTransformer(
        profiles=profiles,  # Will be overridden in two-phase fit
        target=target,
        model_hint=model_hint,
        cardinality_threshold=cardinality_threshold,
    )

    steps = [("cleaner", cleaner_stage), ("engineer", engineer_stage)]
    assembled = Pipeline(steps)
    assembled.set_output(transform="pandas")
    return assembled
|
|
277
|
+
|
|
278
|
+
def build_pipeline_two_phase(
    profiles: list[ColumnProfile],
    target: str,
    model_hint: str,
    X: pd.DataFrame,
    y: pd.Series,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> Pipeline:
    """Fit the cleaner, then the engineer on its output, and wrap both in a Pipeline.

    The engineer stage is constructed from the cleaner's ``fitted_profiles_``,
    which only exist once the cleaner has been fitted; hence the two phases.

    Args:
        profiles: Column profiles for the cleaner stage.
        target: Target column label handed to both transformers.
        model_hint: Forwarded to the engineer stage.
        X: Feature frame used to fit both stages.
        y: Target values passed to both ``fit`` calls.
        drop_threshold: Forwarded to the cleaner stage.
        outlier_method: Forwarded to the cleaner stage.
        cardinality_threshold: Forwarded to the engineer stage.

    Returns:
        A ``Pipeline`` whose ``"cleaner"`` and ``"engineer"`` steps are already
        fitted, configured for pandas output.
    """
    # Phase 1: fit the cleaner and clean the training features.
    cleaner_stage = CleanerTransformer(
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    cleaned_features = cleaner_stage.fit(X, y).transform(X)

    # Phase 2: build the engineer from the post-cleaning profiles and fit it.
    engineer_stage = EngineerTransformer(
        profiles=cleaner_stage.fitted_profiles_,
        target=target,
        model_hint=model_hint,
        cardinality_threshold=cardinality_threshold,
    )
    engineer_stage.fit(cleaned_features, y)

    assembled = Pipeline([("cleaner", cleaner_stage), ("engineer", engineer_stage)])
    assembled.set_output(transform="pandas")
    return assembled
|
|
312
|
+
|
|
313
|
+
def run_assembler(
    df: pd.DataFrame,
    target: str,
    task: str,
    model_hint: str,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Profile ``df``, build the fitted two-phase pipeline and package the result.

    Args:
        df: Full input frame, including the target column.
        target: Label of the target column.
        task: Forwarded to ``run_profiler``.
        model_hint: Forwarded to ``build_pipeline_two_phase``.
        drop_threshold: Forwarded to ``build_pipeline_two_phase``.
        outlier_method: Forwarded to ``build_pipeline_two_phase``.
        cardinality_threshold: Forwarded to the profiler and the pipeline builder.

    Returns:
        A ``PrepResult`` with the transformed features plus the re-attached
        target column, the fitted pipeline, and a ``Report`` covering the
        profiler, cleaner and engineer entries in that order.
    """
    # 1. Profile the raw frame.
    profiles, profiler_entries = run_profiler(df, target, task, cardinality_threshold)

    # 2. Separate features from the target.
    features = df.drop(columns=[target])
    labels = df[target]

    # 3. Build the pre-fitted pipeline.
    pipeline = build_pipeline_two_phase(
        profiles=profiles,
        target=target,
        model_hint=model_hint,
        X=features,
        y=labels,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
        cardinality_threshold=cardinality_threshold,
    )

    # 4. Transform the features and put the target back as the last column.
    final_df = pipeline.transform(features).copy()
    final_df[target] = labels

    # 5. Gather the report entries of every stage.
    stages = pipeline.named_steps
    all_entries = (
        profiler_entries
        + stages["cleaner"].report_entries_
        + stages["engineer"].report_entries_
    )

    # 6. Build the report against the ORIGINAL frame and profiles.
    report = Report(entries=all_entries, df=df, profiles=profiles, target=target)

    # 7. Package everything.
    return PrepResult(df=final_df, pipeline=pipeline, report=report)
|
|
367
|
+
|
|
368
|
+
def transform_new_data(
    pipeline: Pipeline,
    new_df: pd.DataFrame,
    target: str | None = None
) -> pd.DataFrame:
    """Run ``pipeline.transform`` on a copy of ``new_df``, keeping the target aside.

    Args:
        pipeline: Object whose ``transform`` method is applied to the features.
        new_df: Frame to transform; it is copied and never modified.
        target: Optional target label. When given and present in ``new_df``,
            the column is removed before transforming and re-attached to the
            transformed frame afterwards.

    Returns:
        The transformed frame, with the target column appended when it was
        set aside.
    """
    working = new_df.copy()

    has_target = target is not None and target in working.columns
    held_out = working.pop(target) if has_target else None

    transformed = pipeline.transform(working)

    if held_out is not None:
        transformed[target] = held_out

    return transformed
|