p2predict 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
p2predict/cli/train.py ADDED
@@ -0,0 +1,659 @@
1
+ import datetime
2
+ import os
3
+ import sys
4
+
5
+ # Check for --json *before* starting any module-level Halo spinner or
6
+ # Rich output — under JSON mode stdout must be exclusively the response
7
+ # document. Light-touch argv sniffing is fine here because Click won't
8
+ # rewrite "--json" in any way that breaks this check.
9
+ _JSON_MODE_FROM_ARGV = "--json" in sys.argv
10
+
11
+ from halo import Halo
12
+
13
+ if not _JSON_MODE_FROM_ARGV:
14
+ spinner = Halo(text="Loading P2Predict", spinner="pong")
15
+ spinner.start()
16
+ else:
17
+ spinner = None
18
+
19
+ import click
20
+ import pandas as pd
21
+ import questionary
22
+ from rich.console import Console
23
+ from rich.prompt import Prompt
24
+
25
+ from p2predict import plotting
26
+ from p2predict.cmdline_io import print_feature_stats, print_feature_weights, print_logo
27
+ from p2predict.hpo_training import hyper_parameter_tuning
28
+ from p2predict.json_output import JSON_SCHEMA_VERSION, emit, emit_error
29
+ from p2predict.model_evals import evaluate_model
30
+ from p2predict.intervals import compute_calibration_residuals
31
+ from p2predict.outliers import (
32
+ POLICIES as OUTLIER_POLICIES,
33
+ apply_feature_outlier_policy,
34
+ apply_outlier_policy,
35
+ )
36
+ from p2predict.feature_selection import (
37
+ find_high_variation_features,
38
+ find_no_variation_features,
39
+ get_most_predictable_features,
40
+ )
41
+ from p2predict.prepare_data import prepare_data
42
+ from p2predict.trained_model_io import SaveModel, Serialize_Trained_Model, load_csv_file
43
+ from p2predict.training import (
44
+ ALGORITHMS,
45
+ auto_train,
46
+ extract_feature_importances,
47
+ resolve_log_target,
48
+ start_training,
49
+ )
50
+ from p2predict.ui_console import print_dataframe
51
+
52
+ if spinner is not None:
53
+ spinner.stop()
54
+
55
+
56
+ def _abort(json_mode: bool, console, code: str, message: str) -> None:
57
+ """Same shape as predict.py — emit JSON error or red Rich abort."""
58
+ if json_mode:
59
+ emit_error("train", code, message)
60
+ console.print(f"Aborted: {message}", style="bold red")
61
+ raise SystemExit(1)
62
+
63
+
64
+ def _outlier_summary_block(summary: dict) -> dict:
65
+ """Coerce the target-side outlier summary into JSON-shaped values."""
66
+ return {
67
+ "policy": summary.get("policy"),
68
+ "applied": summary.get("applied"),
69
+ "n_outliers": int(summary.get("n_outliers", 0)),
70
+ "n_total": int(summary.get("n_total", 0)),
71
+ "lower": (None if pd.isna(summary.get("lower", float("nan")))
72
+ else float(summary.get("lower"))),
73
+ "upper": (None if pd.isna(summary.get("upper", float("nan")))
74
+ else float(summary.get("upper"))),
75
+ }
76
+
77
+
78
+ def _feature_outlier_summary_block(summary: dict) -> dict:
79
+ return {
80
+ "policy": summary.get("policy"),
81
+ "applied": summary.get("applied"),
82
+ "n_outliers_total": int(summary.get("n_outliers_total", 0)),
83
+ "n_total": int(summary.get("n_total", 0)),
84
+ "per_column": {
85
+ col: {
86
+ "n_outliers": int(stats.get("n_outliers", 0)),
87
+ "lower": (None if pd.isna(stats.get("lower", float("nan")))
88
+ else float(stats.get("lower"))),
89
+ "upper": (None if pd.isna(stats.get("upper", float("nan")))
90
+ else float(stats.get("upper"))),
91
+ }
92
+ for col, stats in summary.get("per_column", {}).items()
93
+ },
94
+ }
95
+
96
+
97
@click.command()
@click.option("-i", "--input", type=click.Path(exists=True), default=None,
              help="Path to the CSV file containing the training dataset.")
@click.option("-t", "--target",
              help='Name of the feature to predict (e.g., "Price").')
@click.option("-x", "--expert", is_flag=True, default=None,
              help="Enable Expert Mode for more control over the training process.")
@click.option("-a", "--algorithm", type=click.Choice(list(ALGORITHMS)),
              help="ML algorithm for expert mode.")
@click.option("-v", "--verbose", is_flag=True, default=None,
              help="Enable verbose output.")
@click.option("-c", "--interactive", is_flag=True, default=None,
              help="Enable interactive mode for guided input.")
@click.option("-tf", "--training_features",
              help='Comma-separated list of features (e.g., "Weight,Size,Color").')
@click.option("-b", "--budget", type=click.Choice(["fast", "thorough"]), default="fast",
              help="HPO search budget. 'fast' = small search, 'thorough' = wider search (slower).")
@click.option("--max-features", "max_features", type=click.IntRange(min=2), default=6,
              show_default=True,
              help="Auto mode only: cap on how many top-ranked features auto-selection keeps. "
                   "Default 6 preserves prior behaviour. Pass a higher number (or use -tf to pick "
                   "features explicitly) when ranking suggests more columns are predictive. "
                   "Ignored in expert mode (the user selects features interactively or via -tf).")
@click.option("--tune/--no-tune", default=None,
              help="Expert mode only: run HPO on the chosen algorithm and save the tuned model.")
@click.option("--outliers", type=click.Choice(list(OUTLIER_POLICIES)), default="warn",
              help="How to handle outliers in the target column (Tukey IQR rule). "
                   "'warn' (default) = report only; 'drop' = remove rows; "
                   "'winsorize' = cap values; 'keep' = silent.")
@click.option("--feature-outliers", type=click.Choice(list(OUTLIER_POLICIES)), default="warn",
              help="How to handle outliers in the numerical feature columns "
                   "(Tukey IQR per column). 'drop' removes any row that has an "
                   "outlier in any feature column; 'winsorize' caps each column "
                   "at its own IQR bounds. 'warn' (default) = report only. "
                   "Categorical features are ignored.")
@click.option("--time-column", default=None,
              help="Name of a date/time column. When given, the train/test split and CV "
                   "become chronological (TimeSeriesSplit), which prevents look-ahead bias "
                   "for time-ordered data. The column is excluded from features.")
@click.option("--log-target", "log_target_mode",
              type=click.Choice(["auto", "on", "off"]), default="auto",
              show_default=True,
              help="Override the automatic skew-based decision on whether to wrap "
                   "the target with log/exp. 'auto' = wrap when scipy.stats.skew(y) > 1.0 "
                   "(the prior behaviour). 'on' = always wrap (the right default for "
                   "multiplicative quantities like prices/costs/weights, regardless of "
                   "sample skew — keeps conformal intervals strictly positive and SHAP "
                   "factors multiplicative). 'off' = never wrap. 'on' aborts cleanly "
                   "if any training target is non-positive.")
@click.option("--report", "report", type=click.Path(), default=None,
              help="Write the procurement-style PDF model-quality report to PATH "
                   "after training. Works in both auto and expert mode, and with "
                   "or without --interactive — pass this whenever you want the "
                   "PDF without answering an interactive prompt.")
@click.option("--json", "json_mode", is_flag=True, default=False,
              help="Emit machine-readable JSON to stdout instead of "
                   "Rich-formatted output. Useful for agents and scripts. "
                   "See p2predict.json_output for the schema.")
def train(input, target, expert, algorithm, verbose, interactive, training_features,
          budget, max_features, tune, outliers, feature_outliers, time_column,
          log_target_mode, report, json_mode):
    # Entry point of the `train` CLI command.
    #
    # Flow: validate options -> load the CSV -> drop rows without a target ->
    # apply the target/feature outlier policies -> pick features (auto
    # ranking, interactive checkbox, or -tf) -> split -> train (auto model
    # selection or the expert-chosen algorithm) -> evaluate -> save the model
    # -> optionally write the PDF report -> under --json emit one response
    # document on stdout.
    #
    # Every validation failure goes through _abort(), which exits with
    # status 1. This is deliberately a comment rather than a docstring:
    # click would otherwise render a docstring as the command's --help text.

    # Redirect Rich to /dev/null under --json so any console.print that
    # escapes a guard cannot corrupt the JSON document on stdout.
    if json_mode:
        devnull = open(os.devnull, "w")
        # Close the handle when click tears the invocation context down,
        # which also covers the SystemExit paths raised by _abort(). There
        # is no context when the callback is invoked directly.
        ctx = click.get_current_context(silent=True)
        if ctx is not None:
            ctx.call_on_close(devnull.close)
        console = Console(file=devnull)
    else:
        console = Console()

    response: dict = {
        "schema_version": JSON_SCHEMA_VERSION,
        "command": "train",
    }

    if not json_mode:
        print("")
        print_logo()
        print("")

    mode_label = "Expert mode" if expert else "Auto mode"
    response["mode"] = "expert" if expert else "auto"
    if not json_mode:
        console.print(f"Welcome to P2Predict! '{mode_label}' is active.", style="bold blue")

    # Interactive mode is incompatible with --json (it would prompt).
    if json_mode and interactive:
        _abort(json_mode, console, "interactive_with_json",
               "interactive mode is not supported with --json.")

    if interactive:
        if not input:
            input = questionary.path("Enter CSV file path").ask()
        if not input:
            _abort(json_mode, console, "missing_input",
                   "You must provide an input file.")
    else:
        if not input:
            _abort(json_mode, console, "missing_input",
                   "You must provide --input. Use -c for interactive mode.")
        if not target:
            _abort(json_mode, console, "missing_target",
                   "You must provide --target. Use -c for interactive mode.")

    if expert:
        if interactive and not algorithm:
            algorithm = questionary.select(
                "Please choose an ML algorithm:", choices=list(ALGORITHMS)
            ).ask()
            if not algorithm:
                _abort(json_mode, console, "missing_algorithm",
                       "You must select a training algorithm.")
        elif not interactive:
            if not algorithm:
                _abort(json_mode, console, "missing_algorithm",
                       "You must pre-select --algorithm in expert mode (or use -c).")
            if not training_features:
                _abort(json_mode, console, "missing_features",
                       "You must provide --training_features in expert mode (or use -c).")

    data = load_csv_file(input)
    rows_loaded = int(data.shape[0])

    if not json_mode:
        print("")
        console.print(
            f"Training file '{input}' imported into P2Predict > "
            f"{data.shape[0]} rows x {data.shape[1]} columns loaded."
        )
        print("")

    # Only reachable without a target in interactive mode (checked above).
    if not target:
        target = questionary.select("Enter target column", choices=data.columns.tolist()).ask()
        if not target:
            _abort(json_mode, console, "missing_target",
                   "A target feature is required.")

    if time_column is not None and time_column not in data.columns:
        _abort(json_mode, console, "bad_time_column",
               f"--time-column '{time_column}' not found in CSV.")
    if time_column is not None:
        try:
            data[time_column] = pd.to_datetime(data[time_column])
        except Exception as exc:
            _abort(json_mode, console, "bad_time_column",
                   f"could not parse --time-column '{time_column}': {exc}")
        if not json_mode:
            console.print(
                f"Time-aware mode: train/test split and CV will be chronological on "
                f"'{time_column}'.",
                style="bold blue",
            )

    if target not in data.columns:
        _abort(json_mode, console, "unknown_target",
               f"--target '{target}' not found in CSV.")

    # Drop only rows whose TARGET is NA — those rows can't supervise training
    # and can't be scored. Rows with NAs only in *feature* columns are kept:
    # XGBoost handles them natively and the random_forest/ridge preprocessors
    # impute (see build_preprocessor). This replaces the old blanket
    # df.dropna() at CSV load, which silently discarded ~half the catalogue.
    rows_before_target_drop = int(data.shape[0])
    data = data[data[target].notna()]
    rows_dropped_target_na = rows_before_target_drop - int(data.shape[0])
    if data.empty:
        _abort(json_mode, console, "all_target_na",
               f"every row has a missing value in target column '{target}'.")
    if rows_dropped_target_na > 0 and not json_mode:
        console.print(
            f"Dropped {rows_dropped_target_na} row(s) with a missing "
            f"'{target}' value; {data.shape[0]} rows remain.",
            style="yellow",
        )

    data, outlier_summary = apply_outlier_policy(data, target, policy=outliers)
    if outlier_summary["n_outliers"] > 0 and not json_mode:
        pct = 100.0 * outlier_summary["n_outliers"] / max(outlier_summary["n_total"], 1)
        action_msg = {
            "keep": "kept as-is",
            "warn": "kept as-is — pass --outliers drop or winsorize to mitigate",
            "drop": "dropped",
            "winsorize": "winsorized to the IQR bounds",
        }[outliers]
        console.print(
            f"Outliers in '{target}': {outlier_summary['n_outliers']} of "
            f"{outlier_summary['n_total']} rows ({pct:.1f}%) outside "
            f"[{outlier_summary['lower']:.2f}, {outlier_summary['upper']:.2f}] — {action_msg}.",
            style="bold yellow",
        )
        print("")

    feature_outlier_candidates = [
        c for c in data.columns if c != target and c != time_column
    ]
    data, feature_outlier_summary = apply_feature_outlier_policy(
        data, feature_outlier_candidates, policy=feature_outliers
    )
    if feature_outlier_summary["n_outliers_total"] > 0 and not json_mode:
        pct = (
            100.0 * feature_outlier_summary["n_outliers_total"]
            / max(feature_outlier_summary["n_total"], 1)
        )
        feature_action_msg = {
            "keep": "kept as-is",
            "warn": "kept as-is — pass --feature-outliers drop or winsorize to mitigate",
            "drop": "rows dropped",
            "winsorize": "values winsorized per column",
        }[feature_outliers]
        affected = {
            col: stats for col, stats in feature_outlier_summary["per_column"].items()
            if stats["n_outliers"] > 0
        }
        affected_details = ", ".join(
            f"{col} ({stats['n_outliers']})" for col, stats in affected.items()
        )
        console.print(
            f"Outliers in feature columns: {feature_outlier_summary['n_outliers_total']} "
            f"of {feature_outlier_summary['n_total']} rows ({pct:.1f}%) affected "
            f"[{affected_details}] — {feature_action_msg}.",
            style="bold yellow",
        )
        print("")

    feature_data = data.drop(columns=[time_column]) if time_column else data
    high_vars = find_high_variation_features(feature_data)
    low_vars = find_no_variation_features(feature_data)

    if not json_mode:
        print("")
        console.print("Low-information features detected:")
        console.print(f"No information content: {low_vars}")
        console.print(f"High variation (potentially noisy): {high_vars}")
        print("")

    # feature_data still contains the target column, so it can be flagged
    # too. It must never be offered for removal or auto-dropped: the ranking
    # and training steps below need it.
    removable_low = [c for c in low_vars if c != target]
    removable_high = [c for c in high_vars if c != target]
    if interactive and (removable_low or removable_high):
        to_remove = questionary.checkbox(
            "Which features would you like to remove? ", choices=removable_low + removable_high
        ).ask()
        if to_remove:
            data = data.drop(to_remove, axis=1)
    elif removable_low:
        data = data.drop(removable_low, axis=1)

    feature_data = data.drop(columns=[time_column]) if time_column else data

    if not training_features:
        if expert:
            best_features_ranked = get_most_predictable_features(feature_data, target)
            if not json_mode:
                console.print("Best features detected for prediction:", style="bold white")
                print("")
                print_dataframe(best_features_ranked)

            options_list = [c for c in feature_data.columns.tolist() if c != target]
            selected_columns = questionary.checkbox(
                "Select the features for training: ", choices=options_list
            ).ask()
            if not selected_columns:
                _abort(json_mode, console, "missing_features",
                       "You must select training features.")
        else:
            ranked = get_most_predictable_features(feature_data, target, output_only_headers=True)
            n_ranked = len(ranked)
            cap = max(2, min(n_ranked, max_features))
            selected_columns = ranked.head(cap).tolist()
            if not json_mode:
                console.print(
                    f"Auto-selected features for training: {selected_columns}", style="bold blue"
                )
                if n_ranked > cap:
                    console.print(
                        f"Auto-selected {cap} of {n_ranked} features "
                        f"(use --max-features to override or pass -tf).",
                        style="italic",
                    )
                print("")
    else:
        # Ignore empty entries from stray/trailing commas and collapse
        # duplicates while preserving the order the user gave.
        requested = list(dict.fromkeys(
            name.strip() for name in training_features.split(",") if name.strip()
        ))
        if not requested:
            _abort(json_mode, console, "missing_features",
                   "--training_features did not contain any feature names.")
        missing = [c for c in requested if c not in data.columns]
        if missing:
            _abort(json_mode, console, "unknown_features",
                   f"requested features not in CSV: {missing}")
        selected_columns = requested

    target_column = target

    if time_column is not None and time_column in selected_columns:
        selected_columns = [c for c in selected_columns if c != time_column]
        if not selected_columns:
            # The time column was the only requested feature; there is
            # nothing left to train on.
            _abort(json_mode, console, "missing_features",
                   f"no training features remain after excluding --time-column "
                   f"'{time_column}'.")

    X_train, X_test, y_train, y_test, numerical_cols, categorical_cols = prepare_data(
        data, selected_columns, target_column, time_column=time_column
    )
    time_aware = time_column is not None

    # Resolve --log-target up front so the same decision flows into auto,
    # expert, and the late-arriving expert+interactive tuning branch. The
    # 'on' safety check (y_train > 0) runs here rather than inside
    # resolve_log_target() so the CLI surfaces a friendly --json abort.
    if log_target_mode == "on":
        try:
            y_arr = y_train.to_numpy(dtype=float)
        except Exception:
            # A target that cannot be viewed as floats is treated like a
            # non-positive one and aborted just below.
            y_arr = None
        if y_arr is None or y_arr.size == 0 or (y_arr <= 0).any():
            _abort(json_mode, console, "log_target_non_positive",
                   "--log-target on requires all training targets to be strictly "
                   "positive; found non-positive values in y_train.")
    log_target_override, log_target_decision = resolve_log_target(
        y_train, mode=log_target_mode
    )

    if expert and interactive:
        if questionary.confirm("Plot histograms of the selected features?").ask():
            plotting.plot_histograms(data[selected_columns])
        if not json_mode:
            print("")

    if expert and not json_mode:
        console.print("Numerical feature analysis:", style="bold white")
        print("")
        print_feature_stats(data[list(numerical_cols)])
        print("")

    scores: dict = {}
    if expert:
        if tune is None and interactive:
            tune = questionary.confirm(
                "Run hyperparameter tuning (slower, usually higher accuracy)?"
            ).ask()
        tune = bool(tune)

        if not json_mode:
            inner_spinner = Halo(
                text=f"Training {algorithm} (tune={tune}, budget={budget})...", spinner="pong"
            )
            inner_spinner.start()
        else:
            inner_spinner = None

        model, feature_weights, log_target = start_training(
            X_train, y_train, numerical_cols, categorical_cols, algorithm,
            budget=budget, tune=tune, time_aware=time_aware,
            log_target=log_target_override,
        )
        # The spinner exists only outside --json, so this guard also keeps
        # the plain print() calls off stdout in JSON mode.
        if inner_spinner is not None:
            inner_spinner.stop()
            print_feature_weights(feature_weights)
            print("")
            if log_target:
                console.print(
                    "Note: log-target transform applied (target is positive and skewed).",
                    style="italic",
                )
    else:
        if not json_mode:
            inner_spinner = Halo(
                text=f"Auto-mode model selection (budget={budget})...", spinner="pong"
            )
            inner_spinner.start()
        else:
            inner_spinner = None

        model, algorithm, scores, log_target = auto_train(
            X_train, y_train, numerical_cols, categorical_cols,
            budget=budget, time_aware=time_aware,
            log_target=log_target_override,
        )
        if inner_spinner is not None:
            inner_spinner.stop()
            console.print(f"Selected best algorithm: [bold]{algorithm}[/bold]")
            for algo, score in scores.items():
                console.print(f"  {algo}: CV R² = {round(score, 3)}")
            if log_target:
                console.print(
                    "Note: log-target transform applied (target is positive and skewed).",
                    style="italic",
                )

    if inner_spinner is not None:
        inner_spinner.succeed("Training finished.")
        print("")

    mae, r2, p_value, rmse = evaluate_model(X_test, y_test, model)

    # Quality label, computed once and used in both the Rich and JSON paths.
    # Shared with the MCP layer via p2predict.quality (single source of truth).
    from p2predict.quality import r2_quality_label
    quality_label = r2_quality_label(r2)

    if not json_mode:
        if expert:
            console.print("Model Key Performance Metrics:", style="bold white")
            console.print(f"Model R² Score: {round(r2, 2)}")
            console.print(f"Mean Absolute Error: {round(mae, 2)}")
            console.print(f"RMSE: {round(rmse, 2)}")
            console.print(f"Residual bias p-value: {round(p_value, 4)}")
            print("")
        else:
            console.print("Model Performance Summary:", style="bold white")
            style = {"Excellent": "bold green",
                     "Good": "bold yellow",
                     "Needs Improvement": "bold red"}[quality_label]
            console.print(f"Model Quality: {quality_label}", style=style)
            console.print(f"R² Score: {round(r2 * 100, 1)}%")
            console.print(f"Mean Absolute Error: {round(mae, 2)}")
            console.print(f"RMSE: {round(rmse, 2)}")
            if p_value < 0.05:
                console.print(
                    "Residuals show systematic bias — consider expert mode for tuning.",
                    style="italic bold yellow",
                )
            if quality_label == "Needs Improvement":
                console.print(
                    "Recommendation: try expert mode with --tune, or collect more data.",
                    style="bold",
                )
            print("")

    # Fallback prompt for the legacy expert+interactive flow. Skip it if
    # the user already passed --report PATH — we'll write the report after
    # save and don't want to ask twice.
    if expert and interactive and report is None:
        if questionary.confirm("Generate the model quality PDF report?").ask():
            report = Prompt.ask("Enter PDF name (e.g., report.pdf)")

    if expert and interactive and not tune:
        if questionary.confirm(
            "Run hyperparameter tuning now to try for a better model?"
        ).ask():
            tune_spinner = Halo("Tuning...", spinner="pong")
            tune_spinner.start()
            # Keep the tuned run's results in separate names: they may only
            # replace the current model's values if the tuned model is kept.
            tuned_model, _tuned_score, tuned_log_target = hyper_parameter_tuning(
                X_train=X_train,
                y_train=y_train,
                numerical_cols=numerical_cols,
                categorical_cols=categorical_cols,
                algorithm=algorithm,
                budget=budget,
                time_aware=time_aware,
                log_target=log_target_override,
            )
            tune_spinner.stop()
            mae_t, r2_t, p_value_t, rmse_t = evaluate_model(X_test, y_test, tuned_model)
            console.print(
                f"Tuned R²={round(r2_t, 3)} (was {round(r2, 3)}), "
                f"MAE={round(mae_t, 2)} (was {round(mae, 2)})"
            )
            if r2_t > r2:
                console.print("Keeping tuned model.", style="bold green")
                model = tuned_model
                # Everything persisted or reported below must describe the
                # model that is actually kept.
                log_target = tuned_log_target
                mae, r2, p_value, rmse = mae_t, r2_t, p_value_t, rmse_t
                quality_label = r2_quality_label(r2)
            else:
                console.print("Tuned model did not improve; keeping original.", style="italic")
            print("")

    # Background sample for SHAP's LinearExplainer + conformal calibration
    # for likely-range intervals are persisted alongside the model.
    background_n = min(100, len(X_train))
    background_sample = (
        X_train.sample(n=background_n, random_state=0).reset_index(drop=True)
        if background_n > 0
        else None
    )
    calibration = compute_calibration_residuals(model, X_test, y_test)

    model_metadata = Serialize_Trained_Model(
        algorithm,
        selected_columns,
        target_column,
        model,
        r2,
        log_target=log_target,
        background_sample=background_sample,
        calibration=calibration,
    )

    # Feature importances. Extracted once, reused for both the PDF report
    # and the JSON payload. Don't fail the train CLI if extraction misbehaves
    # — surface it as missing instead.
    try:
        importances = extract_feature_importances(model, X_train)
        importances_block = [
            {"feature": k, "importance": float(v)} for k, v in importances
        ]
    except Exception:
        importances = None
        importances_block = []

    saved_model_path: str | None = None
    if interactive:
        if questionary.confirm("Save the model?").ask():
            model_name = questionary.text("Enter model name (e.g., my_model.model)").ask()
            # ask() returns None when the prompt is cancelled and "" when
            # it is submitted empty; neither is a usable file name.
            if model_name:
                SaveModel(model_metadata, model_name)
                saved_model_path = model_name
            else:
                console.print("No model name entered; model not saved.", style="yellow")
    else:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        model_name = f"models/{algorithm}_{target}_{timestamp}.model"
        SaveModel(model_metadata, model_name)
        saved_model_path = model_name
        if not json_mode:
            console.print(f"Model saved to {model_name}", style="bold green")

    report_path: str | None = None
    if report:
        y_pred_test = model.predict(X_test)
        plotting.plot_results_pdf(
            y_test,
            y_pred_test,
            report,
            target_name=target,
            model_name=algorithm,
            n_train=len(X_train),
            training_date=datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
            feature_importances=importances,
        )
        report_path = report
        if not json_mode:
            console.print(f"PDF report written to {report}", style="bold green")

    if not json_mode:
        print("")
        return

    # ---- JSON path ----
    response.update({
        "input": {
            "csv_path": str(input),
            "rows_loaded": rows_loaded,
            "rows_dropped_target_na": rows_dropped_target_na,
            "rows_used": int(data.shape[0]),
            "rows_after_outlier_handling": int(data.shape[0]),
            "target": target,
        },
        "time_column": time_column,
        "outliers": {
            "target": _outlier_summary_block(outlier_summary),
            "features": _feature_outlier_summary_block(feature_outlier_summary),
        },
        "low_info_features": {
            "no_information": list(low_vars),
            "high_variation": list(high_vars),
        },
        "features_selected": list(selected_columns),
        "algorithm_selected": algorithm,
        "log_target": bool(log_target),
        "log_target_decision": log_target_decision,
        "cv_scores": {k: float(v) for k, v in scores.items()} if scores else {},
        "feature_importances": importances_block,
        "evaluation": {
            "r2": float(r2),
            "mae": float(mae),
            "rmse": float(rmse),
            "residual_bias_p_value": float(p_value),
            "quality_label": quality_label,
        },
        "model_path": saved_model_path,
        "report_path": report_path,
    })
    emit(response)
656
+
657
+
658
+ if __name__ == "__main__":
659
+ train()