ins-pricing 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +48 -22
- ins_pricing/__init__.py +142 -90
- ins_pricing/cli/BayesOpt_entry.py +58 -46
- ins_pricing/cli/BayesOpt_incremental.py +77 -110
- ins_pricing/cli/Explain_Run.py +42 -23
- ins_pricing/cli/Explain_entry.py +551 -577
- ins_pricing/cli/Pricing_Run.py +42 -23
- ins_pricing/cli/bayesopt_entry_runner.py +51 -16
- ins_pricing/cli/utils/bootstrap.py +23 -0
- ins_pricing/cli/utils/cli_common.py +256 -256
- ins_pricing/cli/utils/cli_config.py +379 -360
- ins_pricing/cli/utils/import_resolver.py +375 -358
- ins_pricing/cli/utils/notebook_utils.py +256 -242
- ins_pricing/cli/watchdog_run.py +216 -198
- ins_pricing/frontend/__init__.py +10 -10
- ins_pricing/frontend/app.py +132 -61
- ins_pricing/frontend/config_builder.py +33 -0
- ins_pricing/frontend/example_config.json +11 -0
- ins_pricing/frontend/example_workflows.py +1 -1
- ins_pricing/frontend/runner.py +340 -388
- ins_pricing/governance/__init__.py +20 -20
- ins_pricing/governance/release.py +159 -159
- ins_pricing/modelling/README.md +1 -1
- ins_pricing/modelling/__init__.py +147 -92
- ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
- ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
- ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
- ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
- ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
- ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
- ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
- ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
- ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
- ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
- ins_pricing/modelling/explain/__init__.py +55 -55
- ins_pricing/modelling/explain/metrics.py +27 -174
- ins_pricing/modelling/explain/permutation.py +237 -237
- ins_pricing/modelling/plotting/__init__.py +40 -36
- ins_pricing/modelling/plotting/compat.py +228 -0
- ins_pricing/modelling/plotting/curves.py +572 -572
- ins_pricing/modelling/plotting/diagnostics.py +163 -163
- ins_pricing/modelling/plotting/geo.py +362 -362
- ins_pricing/modelling/plotting/importance.py +121 -121
- ins_pricing/pricing/__init__.py +27 -27
- ins_pricing/pricing/factors.py +67 -56
- ins_pricing/production/__init__.py +35 -25
- ins_pricing/production/{predict.py → inference.py} +140 -57
- ins_pricing/production/monitoring.py +8 -21
- ins_pricing/reporting/__init__.py +11 -11
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/production/test_inference.py +90 -0
- ins_pricing/utils/__init__.py +112 -78
- ins_pricing/utils/device.py +258 -237
- ins_pricing/utils/features.py +53 -0
- ins_pricing/utils/io.py +72 -0
- ins_pricing/utils/logging.py +34 -1
- ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
- ins_pricing/utils/metrics.py +158 -24
- ins_pricing/utils/numerics.py +76 -0
- ins_pricing/utils/paths.py +9 -1
- ins_pricing/utils/profiling.py +8 -4
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
- ins_pricing-0.5.1.dist-info/RECORD +132 -0
- ins_pricing/modelling/core/BayesOpt.py +0 -146
- ins_pricing/modelling/core/__init__.py +0 -1
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
- ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
- ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
- ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
- ins_pricing/modelling/core/bayesopt/utils.py +0 -105
- ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
- ins_pricing/tests/production/test_predict.py +0 -233
- ins_pricing-0.4.5.dist-info/RECORD +0 -130
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
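Two of the renames above change public import paths: `modelling/core/bayesopt` collapses to `modelling/bayesopt`, and `production/predict.py` becomes `production/inference.py`. Below is a minimal compatibility sketch for downstream code that must run against either wheel; it assumes only that the moved modules keep their contents, which the closely matching line counts above suggest:

```python
import importlib

def load_bayesopt_core():
    """Import the bayesopt core module under either the 0.4.5 or 0.5.1 layout."""
    for path in (
        "ins_pricing.modelling.bayesopt.core",       # 0.5.1 layout
        "ins_pricing.modelling.core.bayesopt.core",  # 0.4.5 layout
    ):
        try:
            return importlib.import_module(path)
        except ModuleNotFoundError:
            continue
    raise ImportError("no known ins_pricing bayesopt layout found")

# The same pattern covers production/predict.py -> production/inference.py.
bayesopt_core = load_bayesopt_core()
```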
ins_pricing/modelling/explain/permutation.py
@@ -1,237 +1,237 @@
 from __future__ import annotations
 
 from typing import Callable, Optional, Sequence
 
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
 
-from .metrics import resolve_metric
+from ins_pricing.modelling.explain.metrics import resolve_metric
 
 
 def _compute_feature_importance(
     feat, X_data, y_arr, w_arr, predict_fn, metric_fn,
     baseline_score, higher_is_better, n_repeats, random_state, metric_name,
     return_scores, is_dataframe=True, feat_idx=None
 ):
     """Helper function to compute importance for a single feature (parallelizable)."""
     rng = np.random.default_rng(random_state)
 
     if is_dataframe:
         # Work on a copy for thread safety in parallel execution
         X_work = X_data.copy()
         orig_values = X_work[feat].to_numpy(copy=False).copy()
         scores = []
         for _ in range(n_repeats):
             X_work[feat] = rng.permutation(orig_values)
             pred = predict_fn(X_work)
             score = metric_fn(y_arr, pred, w_arr)
             scores.append(float(score))
     else:
         X_work = X_data.copy()
         orig_col = X_data[:, feat_idx].copy()
         scores = []
         for _ in range(n_repeats):
             X_work[:, feat_idx] = rng.permutation(orig_col)
             pred = predict_fn(X_work)
             score = metric_fn(y_arr, pred, w_arr)
             scores.append(float(score))
 
     scores_arr = np.asarray(scores, dtype=float)
     if higher_is_better:
         delta = baseline_score - scores_arr
     else:
         delta = scores_arr - baseline_score
 
     entry = {
         "feature": feat,
         "importance_mean": float(np.mean(delta)),
         "importance_std": float(np.std(delta)),
         "baseline_score": float(baseline_score),
         "permutation_score_mean": float(np.mean(scores_arr)),
         "metric": metric_name,
     }
     if return_scores:
         entry["permutation_scores"] = scores
     return entry
 
 
 def _prepare_data(X, y, sample_weight, max_rows, rng):
     y_arr = np.asarray(y)
     if y_arr.ndim != 1:
         y_arr = y_arr.reshape(-1)
 
     w_arr = None
     if sample_weight is not None:
         w_arr = np.asarray(sample_weight).reshape(-1)
         if w_arr.shape[0] != y_arr.shape[0]:
             raise ValueError("sample_weight length must match y.")
 
     if isinstance(X, pd.DataFrame):
         X_data = X
         if len(X_data) != len(y_arr):
             raise ValueError("X and y must have the same length.")
         if max_rows and len(X_data) > max_rows:
             idx = rng.choice(len(X_data), size=int(max_rows), replace=False)
             X_data = X_data.iloc[idx].copy()
             y_arr = y_arr[idx]
             if w_arr is not None:
                 w_arr = w_arr[idx]
         return X_data, y_arr, w_arr
 
     X_np = np.asarray(X)
     if X_np.ndim != 2:
         raise ValueError("X must be 2d when not a DataFrame.")
     if X_np.shape[0] != y_arr.shape[0]:
         raise ValueError("X and y must have the same length.")
     if max_rows and X_np.shape[0] > max_rows:
         idx = rng.choice(X_np.shape[0], size=int(max_rows), replace=False)
         X_np = X_np[idx]
         y_arr = y_arr[idx]
         if w_arr is not None:
             w_arr = w_arr[idx]
     return X_np, y_arr, w_arr
 
 
 def permutation_importance(
     predict_fn: Callable,
     X,
     y,
     *,
     sample_weight=None,
     metric: str | Callable = "auto",
     task_type: Optional[str] = None,
     higher_is_better: Optional[bool] = None,
     n_repeats: int = 5,
     random_state: Optional[int] = None,
     max_rows: Optional[int] = 5000,
     features: Optional[Sequence[str]] = None,
     return_scores: bool = False,
     safe_copy: bool = False,
     n_jobs: Optional[int] = None,
 ) -> pd.DataFrame:
     """Permutation importance on tabular data.
 
     predict_fn should accept the same type as X (DataFrame or ndarray).
     Set safe_copy=True if predict_fn mutates its input.
     Set n_jobs to enable parallel processing across features (default: None = sequential).
     """
     rng = np.random.default_rng(random_state)
     n_repeats = max(1, int(n_repeats))
 
     X_data, y_arr, w_arr = _prepare_data(X, y, sample_weight, max_rows, rng)
     metric_fn, higher_is_better, metric_name = resolve_metric(
         metric, task_type=task_type, higher_is_better=higher_is_better
     )
 
     baseline_pred = predict_fn(X_data)
     baseline_score = metric_fn(y_arr, baseline_pred, w_arr)
 
     if isinstance(X_data, pd.DataFrame):
         feature_names = list(X_data.columns)
         if features is not None:
             feature_names = [f for f in features if f in X_data.columns]
 
         # Use parallel processing if n_jobs is specified
         if n_jobs is not None and n_jobs != 1:
             # Generate different random seeds for each feature to ensure reproducibility
             seeds = [random_state + i if random_state is not None else None
                      for i in range(len(feature_names))]
             results = Parallel(n_jobs=n_jobs, prefer="threads")(
                 delayed(_compute_feature_importance)(
                     feat, X_data, y_arr, w_arr, predict_fn, metric_fn,
                     baseline_score, higher_is_better, n_repeats, seed,
                     metric_name, return_scores, is_dataframe=True
                 )
                 for feat, seed in zip(feature_names, seeds)
             )
         else:
             # Sequential processing (original optimized version)
             X_perm = X_data if not safe_copy else X_data.copy()
             results = []
             for feat in feature_names:
                 # Store original values directly without extra copy
                 orig_values = X_perm[feat].to_numpy(copy=False)
                 orig_copy = orig_values.copy() # Only copy the column, not the entire DataFrame
                 scores = []
                 for _ in range(n_repeats):
                     X_perm[feat] = rng.permutation(orig_copy)
                     pred = predict_fn(X_perm)
                     score = metric_fn(y_arr, pred, w_arr)
                     scores.append(float(score))
                 # Restore original column values
                 X_perm[feat] = orig_copy
 
                 scores_arr = np.asarray(scores, dtype=float)
                 if higher_is_better:
                     delta = baseline_score - scores_arr
                 else:
                     delta = scores_arr - baseline_score
                 entry = {
                     "feature": feat,
                     "importance_mean": float(np.mean(delta)),
                     "importance_std": float(np.std(delta)),
                     "baseline_score": float(baseline_score),
                     "permutation_score_mean": float(np.mean(scores_arr)),
                     "metric": metric_name,
                 }
                 if return_scores:
                     entry["permutation_scores"] = scores
                 results.append(entry)
     else:
         if features is not None:
             if len(features) != X_data.shape[1]:
                 raise ValueError("features length must match X columns for ndarray input.")
             feature_names = list(features)
         else:
             feature_names = [f"x{i}" for i in range(X_data.shape[1])]
 
         X_base = np.asarray(X_data)
 
         # Use parallel processing if n_jobs is specified
         if n_jobs is not None and n_jobs != 1:
             seeds = [random_state + i if random_state is not None else None
                      for i in range(len(feature_names))]
             results = Parallel(n_jobs=n_jobs, prefer="threads")(
                 delayed(_compute_feature_importance)(
                     feat, X_base, y_arr, w_arr, predict_fn, metric_fn,
                     baseline_score, higher_is_better, n_repeats, seed,
                     metric_name, return_scores, is_dataframe=False, feat_idx=idx
                 )
                 for idx, (feat, seed) in enumerate(zip(feature_names, seeds))
             )
         else:
             # Sequential processing
             X_perm = X_base.copy()
             results = []
             for idx, feat in enumerate(feature_names):
                 orig_col = X_base[:, idx].copy()
                 scores = []
                 for _ in range(n_repeats):
                     X_perm[:, idx] = rng.permutation(orig_col)
                     pred_input = X_perm.copy() if safe_copy else X_perm
                     pred = predict_fn(pred_input)
                     score = metric_fn(y_arr, pred, w_arr)
                     scores.append(float(score))
                 X_perm[:, idx] = orig_col
 
                 scores_arr = np.asarray(scores, dtype=float)
                 if higher_is_better:
                     delta = baseline_score - scores_arr
                 else:
                     delta = scores_arr - baseline_score
                 entry = {
                     "feature": feat,
                     "importance_mean": float(np.mean(delta)),
                     "importance_std": float(np.std(delta)),
                     "baseline_score": float(baseline_score),
                     "permutation_score_mean": float(np.mean(scores_arr)),
                     "metric": metric_name,
                 }
                 if return_scores:
                     entry["permutation_scores"] = scores
                 results.append(entry)
 
     df = pd.DataFrame(results)
     df = df.sort_values(by="importance_mean", ascending=False).reset_index(drop=True)
     return df
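For orientation, here is a minimal usage sketch of the `permutation_importance` API above. It assumes the 0.5.1 wheel is installed, that `resolve_metric` passes a user-supplied callable through with the `(y_true, y_pred, sample_weight)` calling convention used in the hunk, and that `predict_fn` and `mse` are hypothetical stand-ins, not part of the package:

```python
import numpy as np
import pandas as pd

from ins_pricing.modelling.explain.permutation import permutation_importance

rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.normal(size=500), "b": rng.normal(size=500)})
y = 3.0 * X["a"].to_numpy() + rng.normal(scale=0.1, size=500)  # only "a" is informative

def predict_fn(df: pd.DataFrame) -> np.ndarray:
    # Hypothetical stand-in for a fitted model's predict method.
    return 3.0 * df["a"].to_numpy()

def mse(y_true, y_pred, sample_weight):
    # Lower-is-better metric; weights ignored in this sketch.
    return float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

imp = permutation_importance(
    predict_fn, X, y,
    metric=mse, higher_is_better=False,
    n_repeats=5, random_state=42,
    n_jobs=2,  # joblib threads, per the parallel branch above
)
print(imp[["feature", "importance_mean", "importance_std"]])  # "a" should rank first
```

With `higher_is_better=False`, importance is `permuted_score - baseline_score`, so shuffling the informative column yields a large positive value while the noise column stays near zero.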
ins_pricing/modelling/plotting/__init__.py
@@ -1,45 +1,49 @@
 from __future__ import annotations
 
-from .common import EPS, PlotStyle
-from .curves import (
+from ins_pricing.modelling.plotting.common import EPS, PlotStyle
+from ins_pricing.modelling.plotting.curves import (
     double_lift_table,
     lift_table,
     plot_calibration_curve,
     plot_conversion_lift,
     plot_double_lift_curve,
     plot_ks_curve,
     plot_lift_curve,
     plot_pr_curves,
     plot_roc_curves,
 )
-from .
-from .
[old lines 17-41 are truncated in the extracted diff: the remaining relative imports and most of the old __all__ entries survive only as blank lines and bare quote marks]
+from ins_pricing.modelling.plotting.compat import PlotUtils, plot_dlift_list, plot_lift_list
+from ins_pricing.modelling.plotting.diagnostics import plot_loss_curve, plot_oneway
+from ins_pricing.modelling.plotting.geo import (
+    plot_geo_contour,
+    plot_geo_contour_on_map,
+    plot_geo_heatmap,
+    plot_geo_heatmap_on_map,
+)
+from ins_pricing.modelling.plotting.importance import plot_feature_importance, plot_shap_importance, shap_importance
+
+__all__ = [
+    "EPS",
+    "PlotStyle",
+    "double_lift_table",
+    "lift_table",
+    "plot_calibration_curve",
+    "plot_conversion_lift",
+    "plot_double_lift_curve",
+    "plot_feature_importance",
+    "plot_geo_contour",
+    "plot_geo_contour_on_map",
+    "plot_geo_heatmap",
+    "plot_geo_heatmap_on_map",
+    "plot_ks_curve",
+    "plot_lift_curve",
+    "plot_loss_curve",
+    "plot_oneway",
+    "plot_pr_curves",
     "plot_roc_curves",
     "plot_shap_importance",
     "shap_importance",
+    "PlotUtils",
+    "plot_lift_list",
+    "plot_dlift_list",
 ]
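Given the `__all__` shown above, the plotting names resolve directly from the package in 0.5.1, including the legacy helpers re-exported through the new `compat` module. A small sketch, assuming the 0.5.1 wheel is installed:

```python
from ins_pricing.modelling.plotting import (
    PlotUtils,        # legacy-style helper surfaced via the new compat module
    plot_dlift_list,  # compat wrapper listed in __all__ above
    plot_lift_curve,  # unchanged curve entry point
)
```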