pypreflight 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
preflight/__init__.py ADDED
@@ -0,0 +1,210 @@
1
+ """
2
+ exposes prepare(), profile(), clean(), engineer(), compare() as public API
3
+ """
4
+
5
+ import pandas as pd
6
+ from preflight import assembler
7
+ from preflight.types import PrepResult
8
+ from preflight.report import Report
9
+ from preflight.profiler import run_profiler
10
+ from preflight.cleaner import run_cleaner
11
+ from preflight.engineer import run_engineer
12
_VALID_TASKS = ("classification", "regression")
_VALID_MODEL_HINTS = ("tree", "linear")


def _validate_frame_and_task(df: pd.DataFrame, target: str, task: str) -> None:
    """Run the input checks shared by every public entry point.

    Checks run in a fixed order and the first failing one raises.

    Raises:
        ValueError: if ``df`` has no rows, ``target`` is not one of its
            columns, it has fewer than two columns, it has duplicate column
            names, or ``task`` is not in ``_VALID_TASKS``.
    """
    if len(df) == 0:
        raise ValueError("DataFrame cannot be empty")
    if target not in df.columns:
        raise ValueError(f"target '{target}' not found in DataFrame columns: {list(df.columns)}")
    if len(df.columns) < 2:
        raise ValueError("DataFrame must contain at least one feature column besides the target")
    if df.columns.duplicated().any():
        raise ValueError("DataFrame contains duplicate column names")
    if task not in _VALID_TASKS:
        raise ValueError(f"task must be 'classification' or 'regression', got '{task}'")


def _validate_model_hint(model_hint: str) -> None:
    """Raise ``ValueError`` unless ``model_hint`` is in ``_VALID_MODEL_HINTS``."""
    if model_hint not in _VALID_MODEL_HINTS:
        raise ValueError(f"model_hint must be 'tree' or 'linear', got '{model_hint}'")


def prepare(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    model_hint: str = "tree",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Validate the inputs and run the full pipeline via ``assembler.run_assembler``.

    Args:
        df: Input frame; must have rows, unique column names and at least one
            column besides ``target``.
        target: Name of the target column in ``df``.
        task: ``"classification"`` or ``"regression"``.
        model_hint: ``"tree"`` or ``"linear"``.
        drop_threshold: Forwarded unchanged to ``run_assembler``.
        outlier_method: Forwarded unchanged to ``run_assembler``.
        cardinality_threshold: Forwarded unchanged to ``run_assembler``.

    Returns:
        Whatever ``assembler.run_assembler`` returns for these arguments.

    Raises:
        ValueError: if any input check fails.
    """
    _validate_frame_and_task(df, target, task)
    _validate_model_hint(model_hint)

    return assembler.run_assembler(
        df=df,
        target=target,
        task=task,
        model_hint=model_hint,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
        cardinality_threshold=cardinality_threshold,
    )


def profile(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Run only the profiler stage.

    Returns:
        A ``PrepResult`` holding the *unchanged* input ``df``, ``pipeline=None``
        and a ``Report`` built from the profiler entries.

    Raises:
        ValueError: if any input check fails.
    """
    _validate_frame_and_task(df, target, task)

    # The column profiles themselves are not part of the result, only the entries.
    _, profiler_entries = run_profiler(
        df=df, target=target, task=task, cardinality_threshold=cardinality_threshold
    )
    return PrepResult(
        df=df,
        pipeline=None,
        report=Report(profiler_entries),
    )


def clean(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Run the profiler and then the cleaner.

    Returns:
        A ``PrepResult`` holding the frame returned by ``run_cleaner``,
        ``pipeline=None`` and a ``Report`` of profiler + cleaner entries.

    Raises:
        ValueError: if any input check fails.
    """
    _validate_frame_and_task(df, target, task)

    profiles, profiler_entries = run_profiler(
        df=df, target=target, task=task, cardinality_threshold=cardinality_threshold
    )
    # Surviving profiles and per-column specs are only needed by later stages.
    df_clean, _, cleaner_entries, _ = run_cleaner(
        df=df,
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    return PrepResult(
        df=df_clean,
        pipeline=None,
        report=Report(profiler_entries + cleaner_entries),
    )


def engineer(
    df: pd.DataFrame,
    target: str,
    task: str = "classification",
    model_hint: str = "tree",
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Run profiler, cleaner and engineer without building a reusable pipeline.

    Returns:
        A ``PrepResult`` holding the frame returned by ``run_engineer``,
        ``pipeline=None`` and a ``Report`` of the entries of all three stages.

    Raises:
        ValueError: if any input check fails.
    """
    _validate_frame_and_task(df, target, task)
    _validate_model_hint(model_hint)

    profiles, profiler_entries = run_profiler(
        df=df, target=target, task=task, cardinality_threshold=cardinality_threshold
    )
    df_clean, surviving_profiles, cleaner_entries, _ = run_cleaner(
        df=df,
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    # The engineer works on the cleaned frame and the profiles that survived cleaning.
    df_eng, engineer_entries, _ = run_engineer(
        df=df_clean,
        profiles=surviving_profiles,
        target=target,
        model_hint=model_hint,
        cardinality_threshold=cardinality_threshold,
    )
    return PrepResult(
        df=df_eng,
        pipeline=None,
        report=Report(profiler_entries + cleaner_entries + engineer_entries),
    )
142
+
143
def _sorted_labels(labels) -> list:
    """Return ``labels`` sorted, tolerating labels that are not mutually comparable.

    Column labels may mix types (e.g. ``0`` and ``"x"``) or contain ``None``,
    for which plain ``sorted`` raises ``TypeError``. In that case the labels
    are ordered by type name and then by their string form, which is
    deterministic. Homogeneous labels keep their natural order.
    """
    try:
        return sorted(labels)
    except TypeError:
        return sorted(labels, key=lambda label: (type(label).__name__, str(label)))


def _preview_labels(labels: list, limit: int = 5) -> str:
    """Join the first ``limit`` labels with ``", "`` and append ``...`` if more exist."""
    # str() is required: str.join rejects non-string labels such as int column names.
    shown = ", ".join(str(label) for label in labels[:limit])
    return f"{shown}..." if len(labels) > limit else shown


def _extract_actions_per_column(report: Report | None) -> dict[str, set[str]]:
    """Map each ``entry.column`` in ``report.entries`` to the set of its ``entry.action`` values.

    Returns an empty dict when ``report`` is ``None``.
    """
    if report is None:
        return {}

    actions: dict[str, set[str]] = {}
    for entry in report.entries:
        actions.setdefault(entry.column, set()).add(entry.action)
    return actions


def _compute_decision_diff(actions_a: dict[str, set[str]], actions_b: dict[str, set[str]]) -> list[str]:
    """Return the sorted columns present in both mappings whose action sets differ."""
    shared_cols = actions_a.keys() & actions_b.keys()
    return [col for col in _sorted_labels(shared_cols) if actions_a[col] != actions_b[col]]


def compare(a: PrepResult, b: PrepResult) -> dict:
    """Compare two results, print a short summary and return the comparison.

    Args:
        a: First result; its ``df`` and ``report`` attributes are read.
        b: Second result; its ``df`` and ``report`` attributes are read.

    Returns:
        A dict with keys ``shape_a``, ``shape_b``, ``columns_only_in_a``,
        ``columns_only_in_b``, ``columns_in_both``, ``report_entry_counts_a``,
        ``report_entry_counts_b`` (``None`` when the corresponding report is
        ``None``) and ``decision_diff`` (columns found in both reports whose
        recorded action sets differ; empty if either report is ``None``).
    """
    cols_a = set(a.df.columns)
    cols_b = set(b.df.columns)

    columns_only_in_a = _sorted_labels(cols_a - cols_b)
    columns_only_in_b = _sorted_labels(cols_b - cols_a)
    columns_in_both = _sorted_labels(cols_a & cols_b)

    report_entry_counts_a = a.report.summary_counts() if a.report is not None else None
    report_entry_counts_b = b.report.summary_counts() if b.report is not None else None

    # A missing report yields an empty mapping, so no column is shared and the
    # decision diff is empty without a separate None check.
    actions_a = _extract_actions_per_column(a.report)
    actions_b = _extract_actions_per_column(b.report)
    decision_diff = _compute_decision_diff(actions_a, actions_b)

    diff = {
        "shape_a": a.df.shape,
        "shape_b": b.df.shape,
        "columns_only_in_a": columns_only_in_a,
        "columns_only_in_b": columns_only_in_b,
        "columns_in_both": columns_in_both,
        "report_entry_counts_a": report_entry_counts_a,
        "report_entry_counts_b": report_entry_counts_b,
        "decision_diff": decision_diff,
    }

    # Print summary
    print("=== PreFlight Compare ===")
    print(f"Shape A: {diff['shape_a']}")
    print(f"Shape B: {diff['shape_b']}")

    if columns_only_in_a:
        print(f"Columns only in A ({len(columns_only_in_a)}): {_preview_labels(columns_only_in_a)}")

    if columns_only_in_b:
        print(f"Columns only in B ({len(columns_only_in_b)}): {_preview_labels(columns_only_in_b)}")

    if decision_diff:
        print("Differing decisions in shared columns:")
        for col in decision_diff:
            print(f"  {col}: A={actions_a.get(col, set())} | B={actions_b.get(col, set())}")

    print("=========================")

    return diff
preflight/assembler.py ADDED
@@ -0,0 +1,384 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.exceptions import NotFittedError
6
+
7
+ from preflight.types import ColumnProfile, SemanticType, PrepResult
8
+ from preflight.profiler import run_profiler
9
+ from preflight.report import Report
10
+ from preflight.cleaner import (
11
+ run_cleaner,
12
+ add_missing_indicator,
13
+ normalize_category_values,
14
+ coerce_string_dates_to_datetime,
15
+ group_rare_categories,
16
+ )
17
+ from preflight.engineer import run_engineer, expand_datetime
18
+
19
+ # =====================================================================
20
+ # Architectural Decision: Two-Phase Pipeline Assembly
21
+ # ---------------------------------------------------------------------
22
+ # EngineerTransformer requires the POST-CLEANING column profiles.
23
+ # However, the surviving profiles are only known AFTER CleanerTransformer
24
+ # computes its dynamic drops during `fit()`.
25
+ #
26
+ # To resolve this coupling without breaking standard sklearn Pipeline
27
+ # semantics, the `build_pipeline` function returns an UNFIT pipeline
28
+ # (which would fail or process dropped columns incorrectly if fitted
29
+ # blindly). Instead, `build_pipeline_two_phase` acts as the actual
30
+ # usable constructor: it manually fits the Cleaner stage, reads the
31
+ # resulting `fitted_profiles_`, uses them to construct and fit the
32
+ # Engineer stage, and then returns a fully assembled, pre-fitted
33
+ # Pipeline ready for `transform()`.
34
+ # =====================================================================
35
+
36
class CleanerTransformer(BaseEstimator, TransformerMixin):
    """Pipeline stage that freezes ``run_cleaner`` decisions at fit time and replays them.

    ``fit`` delegates to ``run_cleaner`` and keeps the surviving profiles, the
    report entries and the per-column specs. It then adds two things to the
    specs so that ``transform`` never recomputes statistics on new data: a
    flag for columns that get a missing indicator, and winsorization bounds.
    """

    def __init__(
        self,
        profiles: list[ColumnProfile],
        target: str,
        drop_threshold: float = 0.6,
        outlier_method: str = "iqr",
    ) -> None:
        self.profiles = profiles
        self.target = target
        self.drop_threshold = drop_threshold
        self.outlier_method = outlier_method

    def fit(self, X: pd.DataFrame, y=None) -> "CleanerTransformer":
        """Run ``run_cleaner`` on ``X`` and store everything ``transform`` needs.

        Sets ``fitted_profiles_``, ``report_entries_``, ``specs_``,
        ``columns_to_drop_`` and ``input_columns_``. ``y`` is ignored.
        """
        # Only the metadata is kept; transform() rebuilds the cleaned frame.
        _, self.fitted_profiles_, self.report_entries_, self.specs_ = run_cleaner(
            X, self.profiles, self.target, self.drop_threshold, self.outlier_method
        )

        # Remember the input layout so get_feature_names_out() can mirror
        # transform() exactly (same columns, same order).
        self.input_columns_ = list(X.columns)

        surviving_cols = {p.name for p in self.fitted_profiles_}
        self.columns_to_drop_ = [p.name for p in self.profiles if p.name not in surviving_cols]

        # Augment specs_ with exact bounds and indicators to avoid recomputing on test data
        for p in self.fitted_profiles_:
            col = p.name
            spec = self.specs_.setdefault(col, {})

            # Missing indicator logic from cleaner.py
            if p.missing_rate > 0 and p.semantic_type in (
                SemanticType.CATEGORICAL_LOW,
                SemanticType.NUMERIC_FEATURE,
            ):
                spec["add_missing_indicator"] = True

            # Winsorization bounds logic. A profiled column may be absent from X
            # (e.g. the target when X holds features only), hence the guard.
            if (
                p.semantic_type == SemanticType.NUMERIC_FEATURE
                and "outlier_method" in spec
                and col in X.columns
            ):
                series = X[col]
                method = spec["outlier_method"]
                if method == "iqr":
                    q1 = series.quantile(0.25)
                    q3 = series.quantile(0.75)
                    iqr = q3 - q1
                    spec["lower_bound"] = q1 - 1.5 * iqr
                    spec["upper_bound"] = q3 + 1.5 * iqr
                elif method == "zscore":
                    mean = series.mean()
                    std = series.std()
                    spec["lower_bound"] = mean - 3 * std
                    spec["upper_bound"] = mean + 3 * std

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the frozen cleaning decisions to a copy of ``X``.

        Rows are never removed. Columns listed in ``columns_to_drop_`` are
        dropped when present. The target column and columns missing from ``X``
        are left untouched.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, "specs_"):
            raise NotFittedError("CleanerTransformer is not fitted yet. Call 'fit' before 'transform'.")

        X_out = X.copy()

        # 1. Drop columns (by name, not by recomputing missingness)
        cols_to_drop = [c for c in self.columns_to_drop_ if c in X_out.columns]
        X_out = X_out.drop(columns=cols_to_drop)

        # 2. Duplicate row removal is explicitly skipped here.
        # Removing rows during transform (inference) would silently drop rows the caller expects back.

        for p in self.fitted_profiles_:
            col = p.name
            if col not in X_out.columns or col == self.target:
                continue

            spec = self.specs_.get(col, {})
            stype = p.semantic_type

            # Category normalization applies to both CATEGORICAL_LOW and CATEGORICAL_HIGH
            if stype in (SemanticType.CATEGORICAL_LOW, SemanticType.CATEGORICAL_HIGH):
                X_out[col] = normalize_category_values(X_out[col])

            # Date coercion
            if stype == SemanticType.DATETIME_STRING:
                X_out[col] = coerce_string_dates_to_datetime(X_out[col])

            # Group rare categories
            if stype == SemanticType.CATEGORICAL_HIGH and "rare_categories" in spec:
                X_out[col] = group_rare_categories(X_out[col], spec["rare_categories"])

            # Add missing indicator based on frozen fit-time decision.
            # Must run before imputation, otherwise the indicator is all-False.
            if spec.get("add_missing_indicator"):
                missing_ind = add_missing_indicator(X_out[col])
                X_out[missing_ind.name] = missing_ind

            # Impute values
            if "impute_strategy" in spec:
                X_out[col] = X_out[col].fillna(spec["impute_value"])

            # Winsorize using bounds computed at fit-time
            if stype == SemanticType.NUMERIC_FEATURE and "lower_bound" in spec and "upper_bound" in spec:
                X_out[col] = X_out[col].clip(
                    lower=spec["lower_bound"],
                    upper=spec["upper_bound"],
                )

        return X_out

    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Return the column names ``transform`` produces, in output order.

        With ``set_output(transform="pandas")`` scikit-learn uses these names
        to relabel the frame returned by ``transform``, so they have to match
        it one-to-one. The names are therefore derived the same way
        ``transform`` builds its output: input columns minus dropped ones,
        followed by the missing indicators of the non-target columns that are
        actually present.

        Args:
            input_features: Optional input column names; defaults to the
                columns seen during ``fit``.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, "fitted_profiles_"):
            raise NotFittedError("CleanerTransformer is not fitted yet.")

        source = self.input_columns_ if input_features is None else list(input_features)
        dropped = set(self.columns_to_drop_)
        names = [c for c in source if c not in dropped]
        present = set(names)

        for p in self.fitted_profiles_:
            if p.name not in present or p.name == self.target:
                continue
            if self.specs_.get(p.name, {}).get("add_missing_indicator"):
                # NOTE(review): assumes add_missing_indicator() names its result
                # "<column>_missing" — confirm against cleaner.py.
                indicator = f"{p.name}_missing"
                if indicator not in present:
                    names.append(indicator)
                    present.add(indicator)

        # dtype=object keeps non-string labels intact instead of coercing them to str.
        return np.array(names, dtype=object)
147
+
148
class EngineerTransformer(BaseEstimator, TransformerMixin):
    """Pipeline stage that learns encodings via ``run_engineer`` and replays them.

    ``fit`` runs ``run_engineer`` on the training data and stores its report
    entries, per-column specs and the resulting column layout. ``transform``
    applies the stored specs and reindexes to that layout.
    """

    def __init__(
        self,
        profiles: list[ColumnProfile],
        target: str,
        model_hint: str,
        cardinality_threshold: int = 20,
    ) -> None:
        self.profiles = profiles
        self.target = target
        self.model_hint = model_hint
        self.cardinality_threshold = cardinality_threshold

    @staticmethod
    def _replace_column(frame: pd.DataFrame, col: str, replacement) -> pd.DataFrame:
        """Return ``frame`` without ``col`` and with ``replacement`` columns appended.

        ``DataFrame.join`` matches rows by index label, so an index with
        duplicate labels multiplies rows. When ``replacement`` is row-aligned
        with ``frame`` the columns are attached by position instead. Any other
        situation keeps the original ``join`` behaviour, including its error on
        overlapping column names.
        """
        base = frame.drop(columns=[col])
        row_aligned = (
            isinstance(replacement, pd.DataFrame)
            and replacement.index.equals(base.index)
            and base.columns.intersection(replacement.columns).empty
        )
        if not row_aligned:
            return base.join(replacement)

        merged = pd.concat(
            [base.reset_index(drop=True), replacement.reset_index(drop=True)],
            axis=1,
        )
        merged.index = base.index
        return merged

    def fit(self, X: pd.DataFrame, y=None) -> "EngineerTransformer":
        """Learn the engineering specs from ``X`` (and ``y`` when given).

        Sets ``report_entries_``, ``specs_`` and ``output_columns_``.

        Raises:
            ValueError: if ``y`` is ``None`` while a profile is
                ``CATEGORICAL_HIGH``.
        """
        high_card = [p for p in self.profiles if p.semantic_type == SemanticType.CATEGORICAL_HIGH]
        if y is None and high_card:
            raise ValueError("Target y must be provided for target encoding high-cardinality columns.")

        df_for_engineer = X.copy()
        if y is not None:
            df_for_engineer[self.target] = y

        df_eng, self.report_entries_, self.specs_ = run_engineer(
            df_for_engineer, self.profiles, self.target, self.model_hint, self.cardinality_threshold
        )

        cols = list(df_eng.columns)
        # The target was only attached for run_engineer; it is not an output
        # feature unless the caller passed it inside X.
        if self.target in cols and (y is not None and self.target not in X.columns):
            cols.remove(self.target)
        self.output_columns_ = cols

        if y is not None:
            encoded_specs = [
                self.specs_[p.name]
                for p in high_card
                if isinstance(self.specs_.get(p.name), dict)
            ]
            # Compute the fallback mean only when a spec needs it: y.mean()
            # raises for non-numeric targets such as string class labels.
            if encoded_specs:
                target_values = y if isinstance(y, pd.Series) else pd.Series(y)
                global_mean = target_values.mean()
                for spec in encoded_specs:
                    spec["global_mean"] = global_mean

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted specs to a copy of ``X``.

        The result has exactly ``output_columns_`` in that order; expected
        columns that are absent after encoding are filled with ``False``.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, "specs_"):
            raise NotFittedError("EngineerTransformer is not fitted yet.")

        X_out = X.copy()

        for p in self.profiles:
            col = p.name

            if col not in X_out.columns or col == self.target:
                continue

            spec = self.specs_.get(col, {})
            trans_type = spec.get("transform")

            if trans_type == "passthrough":
                continue

            elif trans_type == "expand_datetime":
                expanded = expand_datetime(X_out[col])
                X_out = self._replace_column(X_out, col, expanded)

            elif trans_type == "ordinal_encode":
                mapping = spec["mapping"]
                # Categories unseen at fit time are encoded as -1.
                X_out[col] = X_out[col].map(mapping).fillna(-1).astype(int)

            elif trans_type == "one_hot_encode":
                dummies = pd.get_dummies(X_out[col], prefix=col, prefix_sep='_')
                X_out = self._replace_column(X_out, col, dummies)

            elif trans_type == "target_encode_cross_fit":
                mapping = spec["mapping"]
                global_mean = spec.get("global_mean", 0.0)
                # Categories unseen at fit time fall back to the global target mean.
                X_out[col] = X_out[col].map(mapping).fillna(global_mean)

            elif trans_type == "linear_numeric":
                if spec.get("log1p"):
                    X_out[col] = np.log1p(X_out[col])

                params = spec["scale_params"]
                mean_val = params["mean"]
                std_val = params["std"]
                X_out[col] = (X_out[col] - mean_val) / std_val

        # Columns known from fit but absent here (e.g. a one-hot level that does
        # not occur in this batch) are added as all-False.
        for expected_col in self.output_columns_:
            if expected_col not in X_out.columns:
                X_out[expected_col] = False

        # Drops columns unknown at fit time and restores the fit-time order.
        X_out = X_out.reindex(columns=self.output_columns_)

        return X_out

    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Return ``output_columns_`` as an array.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, "output_columns_"):
            raise NotFittedError("EngineerTransformer is not fitted yet.")
        return np.array(self.output_columns_)
249
+
250
def build_pipeline(
    profiles: list[ColumnProfile],
    target: str,
    model_hint: str,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> Pipeline:
    """Assemble an *unfitted* cleaner -> engineer ``Pipeline``.

    Both stages receive the same ``profiles``; nothing in this function
    replaces the engineer's profiles with the post-cleaning ones. The returned
    pipeline has pandas output enabled via ``set_output``.
    """
    stages = [
        (
            "cleaner",
            CleanerTransformer(
                profiles=profiles,
                target=target,
                drop_threshold=drop_threshold,
                outlier_method=outlier_method,
            ),
        ),
        (
            "engineer",
            EngineerTransformer(
                profiles=profiles,
                target=target,
                model_hint=model_hint,
                cardinality_threshold=cardinality_threshold,
            ),
        ),
    ]

    assembled = Pipeline(stages)
    assembled.set_output(transform="pandas")
    return assembled
277
+
278
def build_pipeline_two_phase(
    profiles: list[ColumnProfile],
    target: str,
    model_hint: str,
    X: pd.DataFrame,
    y: pd.Series,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> Pipeline:
    """Fit the cleaner, then the engineer, and return them as one ``Pipeline``.

    Phase one fits the cleaner on ``X`` and cleans ``X`` with it. Phase two
    builds the engineer from the cleaner's ``fitted_profiles_`` and fits it on
    the cleaned frame and ``y``. The two already-fitted stages are then wrapped
    in a ``Pipeline`` with pandas output enabled; the pipeline itself is never
    fitted.
    """
    # Phase 1: cleaning decisions and the cleaned training frame.
    cleaning_stage = CleanerTransformer(
        profiles=profiles,
        target=target,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
    )
    cleaned_X = cleaning_stage.fit(X, y).transform(X)

    # Phase 2: the engineer only sees profiles that survived cleaning.
    engineering_stage = EngineerTransformer(
        profiles=cleaning_stage.fitted_profiles_,
        target=target,
        model_hint=model_hint,
        cardinality_threshold=cardinality_threshold,
    )
    engineering_stage.fit(cleaned_X, y)

    assembled = Pipeline(
        [
            ("cleaner", cleaning_stage),
            ("engineer", engineering_stage),
        ]
    )
    assembled.set_output(transform="pandas")
    return assembled
312
+
313
def run_assembler(
    df: pd.DataFrame,
    target: str,
    task: str,
    model_hint: str,
    drop_threshold: float = 0.6,
    outlier_method: str = "iqr",
    cardinality_threshold: int = 20,
) -> PrepResult:
    """Profile ``df``, build the fitted two-phase pipeline and apply it.

    Returns:
        A ``PrepResult`` with the transformed features plus the original
        target column, the fitted pipeline, and a ``Report`` combining the
        profiler, cleaner and engineer entries.
    """
    profiles, profiler_entries = run_profiler(df, target, task, cardinality_threshold)

    # The pipeline works on features only; the target is re-attached afterwards.
    features = df.drop(columns=[target])
    labels = df[target]

    pipeline = build_pipeline_two_phase(
        profiles=profiles,
        target=target,
        model_hint=model_hint,
        X=features,
        y=labels,
        drop_threshold=drop_threshold,
        outlier_method=outlier_method,
        cardinality_threshold=cardinality_threshold,
    )

    final_df = pipeline.transform(features).copy()
    final_df[target] = labels

    # Report entries in stage order: profiler, cleaner, engineer.
    stages = pipeline.named_steps
    all_entries = (
        profiler_entries
        + stages["cleaner"].report_entries_
        + stages["engineer"].report_entries_
    )

    return PrepResult(
        df=final_df,
        pipeline=pipeline,
        report=Report(
            entries=all_entries,
            df=df,
            profiles=profiles,
            target=target,
        ),
    )
367
+
368
def transform_new_data(
    pipeline: Pipeline,
    new_df: pd.DataFrame,
    target: str | None = None
) -> pd.DataFrame:
    """Run ``pipeline.transform`` on a copy of ``new_df``.

    If ``target`` is given and is a column of ``new_df``, that column is
    removed before transforming and written back onto the result afterwards.
    ``new_df`` itself is not modified.
    """
    working = new_df.copy()

    has_target = target is not None and target in working.columns
    held_out = working.pop(target) if has_target else None

    transformed = pipeline.transform(working)

    if held_out is not None:
        transformed[target] = held_out

    return transformed