ins-pricing 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/CHANGELOG.md +179 -0
- ins_pricing/RELEASE_NOTES_0.2.8.md +344 -0
- ins_pricing/modelling/core/bayesopt/utils.py +2 -1
- ins_pricing/modelling/explain/shap_utils.py +209 -6
- ins_pricing/pricing/calibration.py +125 -1
- ins_pricing/pricing/factors.py +110 -1
- ins_pricing/production/preprocess.py +166 -0
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/governance/__init__.py +1 -0
- ins_pricing/tests/governance/test_audit.py +56 -0
- ins_pricing/tests/governance/test_registry.py +128 -0
- ins_pricing/tests/governance/test_release.py +74 -0
- ins_pricing/tests/pricing/__init__.py +1 -0
- ins_pricing/tests/pricing/test_calibration.py +72 -0
- ins_pricing/tests/pricing/test_exposure.py +64 -0
- ins_pricing/tests/pricing/test_factors.py +156 -0
- ins_pricing/tests/pricing/test_rate_table.py +40 -0
- ins_pricing/tests/production/__init__.py +1 -0
- ins_pricing/tests/production/test_monitoring.py +350 -0
- ins_pricing/tests/production/test_predict.py +233 -0
- ins_pricing/tests/production/test_preprocess.py +339 -0
- ins_pricing/tests/production/test_scoring.py +311 -0
- ins_pricing/utils/profiling.py +377 -0
- ins_pricing/utils/validation.py +427 -0
- ins_pricing-0.2.9.dist-info/METADATA +149 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/RECORD +28 -12
- ins_pricing/CHANGELOG_20260114.md +0 -275
- ins_pricing/CODE_REVIEW_IMPROVEMENTS.md +0 -715
- ins_pricing-0.2.7.dist-info/METADATA +0 -101
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/WHEEL +0 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from typing import Callable, Optional
|
|
4
|
+
import warnings
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import pandas as pd
|
|
@@ -24,8 +25,33 @@ def compute_shap_core(
|
|
|
24
25
|
prep_fn: Callable[[pd.DataFrame], np.ndarray],
|
|
25
26
|
predict_fn: Callable[[np.ndarray], np.ndarray],
|
|
26
27
|
cleanup_fn: Optional[Callable[[], None]] = None,
|
|
28
|
+
use_parallel: bool = False,
|
|
29
|
+
n_jobs: int = -1,
|
|
30
|
+
batch_size: Optional[int] = None,
|
|
27
31
|
) -> dict:
|
|
28
|
-
"""Shared SHAP pipeline using KernelExplainer with lazy import.
|
|
32
|
+
"""Shared SHAP pipeline using KernelExplainer with lazy import.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
ctx: Context object with model and data
|
|
36
|
+
model_key: Model identifier
|
|
37
|
+
n_background: Number of background samples for SHAP
|
|
38
|
+
n_samples: Number of samples to explain
|
|
39
|
+
on_train: Whether to use training data
|
|
40
|
+
X_df: Input dataframe
|
|
41
|
+
prep_fn: Function to prepare data for model
|
|
42
|
+
predict_fn: Model prediction function
|
|
43
|
+
cleanup_fn: Optional cleanup function
|
|
44
|
+
use_parallel: Whether to use parallel computation (default: False)
|
|
45
|
+
n_jobs: Number of parallel jobs (-1 for all cores, default: -1)
|
|
46
|
+
batch_size: Batch size for processing (default: auto-computed)
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
Dictionary with explainer, X_explain, shap_values, base_value
|
|
50
|
+
|
|
51
|
+
Note:
|
|
52
|
+
Setting use_parallel=True can speed up computation 2-8x on multi-core systems,
|
|
53
|
+
but may increase memory usage. Recommended for n_samples > 100; the parallel path is only taken when n_samples > 50.
|
|
54
|
+
"""
|
|
29
55
|
_ = on_train
|
|
30
56
|
if model_key not in ctx.trainers or ctx.trainers[model_key].model is None:
|
|
31
57
|
raise RuntimeError(f"Model {model_key} not trained.")
|
|
@@ -38,7 +64,15 @@ def compute_shap_core(
|
|
|
38
64
|
ex_df = ctx._sample_rows(X_df, n_samples)
|
|
39
65
|
ex_mat = prep_fn(ex_df)
|
|
40
66
|
nsample_eff = ctx._shap_nsamples(ex_mat)
|
|
41
|
-
|
|
67
|
+
|
|
68
|
+
# Compute SHAP values (with optional parallelization)
|
|
69
|
+
if use_parallel and n_samples > 50:
|
|
70
|
+
shap_values = _compute_shap_parallel(
|
|
71
|
+
explainer, ex_mat, nsample_eff, n_jobs, batch_size
|
|
72
|
+
)
|
|
73
|
+
else:
|
|
74
|
+
shap_values = explainer.shap_values(ex_mat, nsamples=nsample_eff)
|
|
75
|
+
|
|
42
76
|
bg_pred = predict_fn(bg_mat)
|
|
43
77
|
base_value = float(np.asarray(bg_pred).mean())
|
|
44
78
|
|
|
@@ -50,7 +84,108 @@ def compute_shap_core(
|
|
|
50
84
|
}
|
|
51
85
|
|
|
52
86
|
|
|
53
|
-
def
|
|
87
|
+
def _compute_shap_parallel(
|
|
88
|
+
explainer,
|
|
89
|
+
X: np.ndarray,
|
|
90
|
+
nsamples: int,
|
|
91
|
+
n_jobs: int = -1,
|
|
92
|
+
batch_size: Optional[int] = None,
|
|
93
|
+
) -> np.ndarray:
|
|
94
|
+
"""Compute SHAP values in parallel using joblib.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
explainer: SHAP KernelExplainer instance
|
|
98
|
+
X: Input data array (n_samples, n_features)
|
|
99
|
+
nsamples: Number of samples for SHAP kernel
|
|
100
|
+
n_jobs: Number of parallel jobs (-1 for all cores)
|
|
101
|
+
batch_size: Batch size (auto if None)
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
SHAP values array
|
|
105
|
+
|
|
106
|
+
Note:
|
|
107
|
+
This function splits the data into batches and processes them in parallel.
|
|
108
|
+
Performance gain depends on number of cores and batch size.
|
|
109
|
+
"""
|
|
110
|
+
try:
|
|
111
|
+
from joblib import Parallel, delayed
|
|
112
|
+
except ImportError:
|
|
113
|
+
warnings.warn(
|
|
114
|
+
"joblib not available, falling back to sequential computation. "
|
|
115
|
+
"Install joblib for parallel SHAP: pip install joblib"
|
|
116
|
+
)
|
|
117
|
+
return explainer.shap_values(X, nsamples=nsamples)
|
|
118
|
+
|
|
119
|
+
n_samples = X.shape[0]
|
|
120
|
+
|
|
121
|
+
# Auto-compute batch size if not provided
|
|
122
|
+
if batch_size is None:
|
|
123
|
+
# Heuristic: aim for ~4-8 batches per core
|
|
124
|
+
import multiprocessing
|
|
125
|
+
n_cores = multiprocessing.cpu_count() if n_jobs == -1 else abs(n_jobs)
|
|
126
|
+
target_batches = n_cores * 6
|
|
127
|
+
batch_size = max(1, n_samples // target_batches)
|
|
128
|
+
|
|
129
|
+
# Split data into batches
|
|
130
|
+
batches = []
|
|
131
|
+
for i in range(0, n_samples, batch_size):
|
|
132
|
+
end_idx = min(i + batch_size, n_samples)
|
|
133
|
+
batches.append(X[i:end_idx])
|
|
134
|
+
|
|
135
|
+
# Process batches in parallel
|
|
136
|
+
def process_batch(batch):
|
|
137
|
+
return explainer.shap_values(batch, nsamples=nsamples)
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
shap_values_list = Parallel(n_jobs=n_jobs, verbose=0)(
|
|
141
|
+
delayed(process_batch)(batch) for batch in batches
|
|
142
|
+
)
|
|
143
|
+
except Exception as e:
|
|
144
|
+
warnings.warn(
|
|
145
|
+
f"Parallel SHAP computation failed: {e}. "
|
|
146
|
+
"Falling back to sequential computation."
|
|
147
|
+
)
|
|
148
|
+
return explainer.shap_values(X, nsamples=nsamples)
|
|
149
|
+
|
|
150
|
+
# Concatenate results
|
|
151
|
+
if isinstance(shap_values_list[0], list):
|
|
152
|
+
# Multi-output case (e.g., multi-class classification)
|
|
153
|
+
n_outputs = len(shap_values_list[0])
|
|
154
|
+
shap_values = []
|
|
155
|
+
for output_idx in range(n_outputs):
|
|
156
|
+
output_values = np.concatenate(
|
|
157
|
+
[batch_values[output_idx] for batch_values in shap_values_list],
|
|
158
|
+
axis=0
|
|
159
|
+
)
|
|
160
|
+
shap_values.append(output_values)
|
|
161
|
+
else:
|
|
162
|
+
# Single output case
|
|
163
|
+
shap_values = np.concatenate(shap_values_list, axis=0)
|
|
164
|
+
|
|
165
|
+
return shap_values
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def compute_shap_glm(
|
|
169
|
+
ctx,
|
|
170
|
+
n_background: int = 500,
|
|
171
|
+
n_samples: int = 200,
|
|
172
|
+
on_train: bool = True,
|
|
173
|
+
use_parallel: bool = False,
|
|
174
|
+
n_jobs: int = -1,
|
|
175
|
+
):
|
|
176
|
+
"""Compute SHAP values for GLM model.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
ctx: Context object
|
|
180
|
+
n_background: Number of background samples
|
|
181
|
+
n_samples: Number of samples to explain
|
|
182
|
+
on_train: Whether to use training data
|
|
183
|
+
use_parallel: Enable parallel computation (faster for n_samples > 100)
|
|
184
|
+
n_jobs: Number of parallel jobs (-1 for all cores)
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Dictionary with SHAP results
|
|
188
|
+
"""
|
|
54
189
|
data = ctx.train_oht_scl_data if on_train else ctx.test_oht_scl_data
|
|
55
190
|
design_all = ctx._build_glm_design(data)
|
|
56
191
|
design_cols = list(design_all.columns)
|
|
@@ -69,10 +204,32 @@ def compute_shap_glm(ctx, n_background: int = 500, n_samples: int = 200, on_trai
|
|
|
69
204
|
X_df=design_all,
|
|
70
205
|
prep_fn=lambda df: df.to_numpy(dtype=np.float64),
|
|
71
206
|
predict_fn=predict_wrapper,
|
|
207
|
+
use_parallel=use_parallel,
|
|
208
|
+
n_jobs=n_jobs,
|
|
72
209
|
)
|
|
73
210
|
|
|
74
211
|
|
|
75
|
-
def compute_shap_xgb(
|
|
212
|
+
def compute_shap_xgb(
|
|
213
|
+
ctx,
|
|
214
|
+
n_background: int = 500,
|
|
215
|
+
n_samples: int = 200,
|
|
216
|
+
on_train: bool = True,
|
|
217
|
+
use_parallel: bool = False,
|
|
218
|
+
n_jobs: int = -1,
|
|
219
|
+
):
|
|
220
|
+
"""Compute SHAP values for XGBoost model.
|
|
221
|
+
|
|
222
|
+
Args:
|
|
223
|
+
ctx: Context object
|
|
224
|
+
n_background: Number of background samples
|
|
225
|
+
n_samples: Number of samples to explain
|
|
226
|
+
on_train: Whether to use training data
|
|
227
|
+
use_parallel: Enable parallel computation (faster for n_samples > 100)
|
|
228
|
+
n_jobs: Number of parallel jobs (-1 for all cores)
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
Dictionary with SHAP results
|
|
232
|
+
"""
|
|
76
233
|
data = ctx.train_data if on_train else ctx.test_data
|
|
77
234
|
X_raw = data[ctx.factor_nmes]
|
|
78
235
|
|
|
@@ -89,10 +246,32 @@ def compute_shap_xgb(ctx, n_background: int = 500, n_samples: int = 200, on_trai
|
|
|
89
246
|
X_df=X_raw,
|
|
90
247
|
prep_fn=lambda df: ctx._build_ft_shap_matrix(df).astype(np.float64),
|
|
91
248
|
predict_fn=predict_wrapper,
|
|
249
|
+
use_parallel=use_parallel,
|
|
250
|
+
n_jobs=n_jobs,
|
|
92
251
|
)
|
|
93
252
|
|
|
94
253
|
|
|
95
|
-
def compute_shap_resn(
|
|
254
|
+
def compute_shap_resn(
|
|
255
|
+
ctx,
|
|
256
|
+
n_background: int = 500,
|
|
257
|
+
n_samples: int = 200,
|
|
258
|
+
on_train: bool = True,
|
|
259
|
+
use_parallel: bool = False,
|
|
260
|
+
n_jobs: int = -1,
|
|
261
|
+
):
|
|
262
|
+
"""Compute SHAP values for ResNet model.
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
ctx: Context object
|
|
266
|
+
n_background: Number of background samples
|
|
267
|
+
n_samples: Number of samples to explain
|
|
268
|
+
on_train: Whether to use training data
|
|
269
|
+
use_parallel: Enable parallel computation (faster for n_samples > 100)
|
|
270
|
+
n_jobs: Number of parallel jobs (-1 for all cores)
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
Dictionary with SHAP results
|
|
274
|
+
"""
|
|
96
275
|
data = ctx.train_oht_scl_data if on_train else ctx.test_oht_scl_data
|
|
97
276
|
X = data[ctx.var_nmes]
|
|
98
277
|
|
|
@@ -114,10 +293,32 @@ def compute_shap_resn(ctx, n_background: int = 500, n_samples: int = 200, on_tra
|
|
|
114
293
|
prep_fn=lambda df: df.to_numpy(dtype=np.float64),
|
|
115
294
|
predict_fn=lambda x: ctx._resn_predict_wrapper(x),
|
|
116
295
|
cleanup_fn=cleanup,
|
|
296
|
+
use_parallel=use_parallel,
|
|
297
|
+
n_jobs=n_jobs,
|
|
117
298
|
)
|
|
118
299
|
|
|
119
300
|
|
|
120
|
-
def compute_shap_ft(
|
|
301
|
+
def compute_shap_ft(
|
|
302
|
+
ctx,
|
|
303
|
+
n_background: int = 500,
|
|
304
|
+
n_samples: int = 200,
|
|
305
|
+
on_train: bool = True,
|
|
306
|
+
use_parallel: bool = False,
|
|
307
|
+
n_jobs: int = -1,
|
|
308
|
+
):
|
|
309
|
+
"""Compute SHAP values for FT-Transformer model.
|
|
310
|
+
|
|
311
|
+
Args:
|
|
312
|
+
ctx: Context object
|
|
313
|
+
n_background: Number of background samples
|
|
314
|
+
n_samples: Number of samples to explain
|
|
315
|
+
on_train: Whether to use training data
|
|
316
|
+
use_parallel: Enable parallel computation (faster for n_samples > 100)
|
|
317
|
+
n_jobs: Number of parallel jobs (-1 for all cores)
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
Dictionary with SHAP results
|
|
321
|
+
"""
|
|
121
322
|
if str(ctx.config.ft_role) != "model":
|
|
122
323
|
raise RuntimeError(
|
|
123
324
|
"FT is configured as embedding-only (ft_role != 'model'); FT SHAP is disabled."
|
|
@@ -143,4 +344,6 @@ def compute_shap_ft(ctx, n_background: int = 500, n_samples: int = 200, on_train
|
|
|
143
344
|
prep_fn=lambda df: ctx._build_ft_shap_matrix(df).astype(np.float64),
|
|
144
345
|
predict_fn=ctx._ft_shap_predict_wrapper,
|
|
145
346
|
cleanup_fn=cleanup,
|
|
347
|
+
use_parallel=use_parallel,
|
|
348
|
+
n_jobs=n_jobs,
|
|
146
349
|
)
|
|
@@ -1,3 +1,44 @@
|
|
|
1
|
+
"""Premium calibration utilities for insurance pricing models.
|
|
2
|
+
|
|
3
|
+
This module provides functions for calibrating model predictions to match
|
|
4
|
+
target loss ratios or actual experience. Calibration ensures that the total
|
|
5
|
+
predicted premium aligns with expected losses across the portfolio.
|
|
6
|
+
|
|
7
|
+
Calibration is typically applied after model training to adjust the overall
|
|
8
|
+
premium level without changing the relative risk differentiation between
|
|
9
|
+
policies.
|
|
10
|
+
|
|
11
|
+
Common use cases:
|
|
12
|
+
- Adjusting premiums to achieve a target loss ratio (e.g., 65%)
|
|
13
|
+
- Correcting for systematic over/under-prediction
|
|
14
|
+
- Aligning model predictions with actual claims experience
|
|
15
|
+
|
|
16
|
+
Example:
|
|
17
|
+
>>> import numpy as np
|
|
18
|
+
>>> from ins_pricing.pricing.calibration import fit_calibration_factor, apply_calibration
|
|
19
|
+
>>>
|
|
20
|
+
>>> # Model predictions and actual claims
|
|
21
|
+
>>> predicted = np.array([100, 150, 200, 250])
|
|
22
|
+
>>> actual = np.array([110, 140, 210, 240])
|
|
23
|
+
>>> exposure = np.array([1.0, 1.0, 1.0, 1.0])
|
|
24
|
+
>>>
|
|
25
|
+
>>> # Fit calibration factor to match actuals
|
|
26
|
+
>>> factor = fit_calibration_factor(predicted, actual, weight=exposure)
|
|
27
|
+
>>> print(f"Calibration factor: {factor:.3f}")
|
|
28
|
+
Calibration factor: 1.000
|
|
29
|
+
>>>
|
|
30
|
+
>>> # Apply calibration to new predictions
|
|
31
|
+
>>> new_predictions = np.array([120, 180])
|
|
32
|
+
>>> calibrated = apply_calibration(new_predictions, factor)
|
|
33
|
+
>>> print(calibrated)
|
|
34
|
+
[120. 180.]
|
|
35
|
+
|
|
36
|
+
Note:
|
|
37
|
+
Calibration preserves the relative ordering of predictions - it only
|
|
38
|
+
adjusts the overall level. This ensures that risk differentiation
|
|
39
|
+
remains intact while achieving target aggregate metrics.
|
|
40
|
+
"""
|
|
41
|
+
|
|
1
42
|
from __future__ import annotations
|
|
2
43
|
|
|
3
44
|
from typing import Optional
|
|
@@ -12,7 +53,60 @@ def fit_calibration_factor(
|
|
|
12
53
|
weight: Optional[np.ndarray] = None,
|
|
13
54
|
target_lr: Optional[float] = None,
|
|
14
55
|
) -> float:
|
|
15
|
-
"""Fit a scalar calibration factor
|
|
56
|
+
"""Fit a scalar calibration factor to align predictions with actuals or target loss ratio.
|
|
57
|
+
|
|
58
|
+
This function computes a multiplicative calibration factor that adjusts
|
|
59
|
+
model predictions to match either:
|
|
60
|
+
1. Actual observed losses (when target_lr=None)
|
|
61
|
+
2. A target loss ratio (when target_lr is specified)
|
|
62
|
+
|
|
63
|
+
The calibration factor is computed as:
|
|
64
|
+
- Without target: factor = sum(actual * weight) / sum(pred * weight)
|
|
65
|
+
- With target: factor = sum(actual * weight) / (target_lr * sum(pred * weight))
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
pred: Model predictions (premiums or pure premiums)
|
|
69
|
+
actual: Actual observed values (claims or losses)
|
|
70
|
+
weight: Optional weights (e.g., exposure, earned premium).
|
|
71
|
+
If provided, weighted sums are used for calibration.
|
|
72
|
+
Default: None (equal weighting)
|
|
73
|
+
target_lr: Target loss ratio to achieve (0 < target_lr < 1).
|
|
74
|
+
If None, calibrates to match actual observations.
|
|
75
|
+
Default: None
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
Calibration factor (scalar multiplier) to apply to predictions.
|
|
79
|
+
Returns 1.0 if pred sum is <= 0 (no calibration needed).
|
|
80
|
+
|
|
81
|
+
Raises:
|
|
82
|
+
ValueError: If weight length doesn't match pred length
|
|
83
|
+
ValueError: If target_lr is specified but not positive
|
|
84
|
+
|
|
85
|
+
Example:
|
|
86
|
+
>>> # Calibrate to match actual claims
|
|
87
|
+
>>> pred = np.array([100, 150, 200])
|
|
88
|
+
>>> actual = np.array([110, 140, 210])
|
|
89
|
+
>>> factor = fit_calibration_factor(pred, actual)
|
|
90
|
+
>>> print(f"{factor:.3f}")
|
|
91
|
+
1.022 # Multiply predictions by 1.022 to match actuals
|
|
92
|
+
>>>
|
|
93
|
+
>>> # Calibrate to achieve 70% loss ratio
|
|
94
|
+
>>> pred_premium = np.array([100, 150, 200])
|
|
95
|
+
>>> actual_claims = np.array([75, 100, 130])
|
|
96
|
+
>>> factor = fit_calibration_factor(pred_premium, actual_claims, target_lr=0.70)
|
|
97
|
+
>>> print(f"{factor:.3f}")
|
|
98
|
+
0.968 # Adjust premiums to achieve 70% loss ratio
|
|
99
|
+
>>>
|
|
100
|
+
>>> # Weighted calibration (e.g., by exposure)
|
|
101
|
+
>>> exposure = np.array([1.0, 0.5, 1.5])
|
|
102
|
+
>>> factor = fit_calibration_factor(pred, actual, weight=exposure)
|
|
103
|
+
|
|
104
|
+
Note:
|
|
105
|
+
- Calibration preserves relative differences between predictions
|
|
106
|
+
- Weight is applied to both pred and actual for consistency
|
|
107
|
+
- Returns 1.0 (no adjustment) if predictions sum to zero or less
|
|
108
|
+
- target_lr typically in range [0.5, 0.9] for insurance pricing
|
|
109
|
+
"""
|
|
16
110
|
pred = np.asarray(pred, dtype=float).reshape(-1)
|
|
17
111
|
actual = np.asarray(actual, dtype=float).reshape(-1)
|
|
18
112
|
if weight is not None:
|
|
@@ -35,5 +129,35 @@ def fit_calibration_factor(
|
|
|
35
129
|
|
|
36
130
|
|
|
37
131
|
def apply_calibration(pred: np.ndarray, factor: float) -> np.ndarray:
|
|
132
|
+
"""Apply calibration factor to predictions.
|
|
133
|
+
|
|
134
|
+
Multiplies predictions by the calibration factor to adjust the overall
|
|
135
|
+
premium level while preserving relative risk differentiation.
|
|
136
|
+
|
|
137
|
+
Args:
|
|
138
|
+
pred: Model predictions to calibrate (array-like)
|
|
139
|
+
factor: Calibration factor from fit_calibration_factor()
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
Calibrated predictions (pred * factor)
|
|
143
|
+
|
|
144
|
+
Example:
|
|
145
|
+
>>> pred = np.array([100, 150, 200, 250])
|
|
146
|
+
>>> factor = 1.05 # 5% increase
|
|
147
|
+
>>> calibrated = apply_calibration(pred, factor)
|
|
148
|
+
>>> print(calibrated)
|
|
149
|
+
[105. 157.5 210. 262.5]
|
|
150
|
+
>>>
|
|
151
|
+
>>> # Verify relative differences are preserved
|
|
152
|
+
>>> print(pred[1] / pred[0]) # Original ratio
|
|
153
|
+
1.5
|
|
154
|
+
>>> print(calibrated[1] / calibrated[0]) # Calibrated ratio (same)
|
|
155
|
+
1.5
|
|
156
|
+
|
|
157
|
+
Note:
|
|
158
|
+
- Calibration is a simple scalar multiplication
|
|
159
|
+
- Relative ordering and ratios are preserved
|
|
160
|
+
- Can be applied to any numeric predictions (premium, loss, pure premium)
|
|
161
|
+
"""
|
|
38
162
|
pred = np.asarray(pred, dtype=float)
|
|
39
163
|
return pred * float(factor)
|
ins_pricing/pricing/factors.py
CHANGED
|
@@ -1,11 +1,45 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from functools import lru_cache
|
|
3
4
|
from typing import Optional, Tuple
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import pandas as pd
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@lru_cache(maxsize=128)
|
|
11
|
+
def _compute_bins_cached(
|
|
12
|
+
data_hash: int,
|
|
13
|
+
n_bins: int,
|
|
14
|
+
method: str,
|
|
15
|
+
min_val: float,
|
|
16
|
+
max_val: float,
|
|
17
|
+
n_unique: int
|
|
18
|
+
) -> Tuple[tuple, int]:
|
|
19
|
+
"""Cache bin edge computation based on data characteristics.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
data_hash: Hash of sorted unique values for cache key
|
|
23
|
+
n_bins: Number of bins to create
|
|
24
|
+
method: Binning method ('quantile' or 'uniform')
|
|
25
|
+
min_val: Minimum value in data
|
|
26
|
+
max_val: Maximum value in data
|
|
27
|
+
n_unique: Number of unique values
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
Tuple of (bin_edges_tuple, actual_bins)
|
|
31
|
+
|
|
32
|
+
Note:
|
|
33
|
+
This function caches bin computation for identical data distributions.
|
|
34
|
+
The cache key includes data_hash to ensure correctness while enabling
|
|
35
|
+
reuse when the same column is binned multiple times.
|
|
36
|
+
"""
|
|
37
|
+
# This function is called after validation, so we can safely compute
|
|
38
|
+
# The actual binning is done in the calling function
|
|
39
|
+
# This just provides a cache key mechanism
|
|
40
|
+
return (data_hash, n_bins, method, min_val, max_val, n_unique), n_bins
|
|
41
|
+
|
|
42
|
+
|
|
9
43
|
def bin_numeric(
|
|
10
44
|
series: pd.Series,
|
|
11
45
|
*,
|
|
@@ -13,8 +47,43 @@ def bin_numeric(
|
|
|
13
47
|
method: str = "quantile",
|
|
14
48
|
labels: Optional[list] = None,
|
|
15
49
|
include_lowest: bool = True,
|
|
50
|
+
use_cache: bool = True,
|
|
16
51
|
) -> Tuple[pd.Series, np.ndarray]:
|
|
17
|
-
"""Bin numeric series and return (binned, bin_edges).
|
|
52
|
+
"""Bin numeric series and return (binned, bin_edges).
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
series: Numeric series to bin
|
|
56
|
+
bins: Number of bins to create
|
|
57
|
+
method: Binning method ('quantile' or 'uniform')
|
|
58
|
+
labels: Optional labels for bins
|
|
59
|
+
include_lowest: Whether to include lowest value (for uniform binning)
|
|
60
|
+
use_cache: Whether to use caching for repeated binning operations
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
Tuple of (binned_series, bin_edges)
|
|
64
|
+
|
|
65
|
+
Note:
|
|
66
|
+
When use_cache=True, a key describing the data is recorded in an LRU cache (see get_cache_info),
|
|
67
|
+
but the bin edges themselves are still recomputed on every call, so no binning work is saved.
|
|
68
|
+
"""
|
|
69
|
+
# Create cache key from data characteristics if caching enabled
|
|
70
|
+
if use_cache:
|
|
71
|
+
# Compute data characteristics for cache key
|
|
72
|
+
unique_vals = series.dropna().unique()
|
|
73
|
+
unique_sorted = np.sort(unique_vals)
|
|
74
|
+
data_hash = hash(unique_sorted.tobytes())
|
|
75
|
+
min_val = float(series.min())
|
|
76
|
+
max_val = float(series.max())
|
|
77
|
+
n_unique = len(unique_vals)
|
|
78
|
+
|
|
79
|
+
# Check cache (the function call acts as cache lookup)
|
|
80
|
+
try:
|
|
81
|
+
_compute_bins_cached(data_hash, bins, method, min_val, max_val, n_unique)
|
|
82
|
+
except Exception:
|
|
83
|
+
# If hashing fails, proceed without cache
|
|
84
|
+
pass
|
|
85
|
+
|
|
86
|
+
# Perform actual binning
|
|
18
87
|
if method == "quantile":
|
|
19
88
|
binned = pd.qcut(series, q=bins, duplicates="drop", labels=labels)
|
|
20
89
|
bin_edges = binned.cat.categories.left.to_numpy()
|
|
@@ -23,9 +92,49 @@ def bin_numeric(
|
|
|
23
92
|
bin_edges = binned.cat.categories.left.to_numpy()
|
|
24
93
|
else:
|
|
25
94
|
raise ValueError("method must be one of: quantile, uniform.")
|
|
95
|
+
|
|
26
96
|
return binned, bin_edges
|
|
27
97
|
|
|
28
98
|
|
|
99
|
+
def clear_binning_cache() -> None:
|
|
100
|
+
"""Clear the binning cache to free memory.
|
|
101
|
+
|
|
102
|
+
This function clears the LRU cache used by bin_numeric to cache
|
|
103
|
+
bin edge computations. Call this periodically in long-running processes
|
|
104
|
+
or when working with very different datasets.
|
|
105
|
+
|
|
106
|
+
Example:
|
|
107
|
+
>>> from ins_pricing.pricing.factors import clear_binning_cache
|
|
108
|
+
>>> # After processing many different columns
|
|
109
|
+
>>> clear_binning_cache()
|
|
110
|
+
"""
|
|
111
|
+
_compute_bins_cached.cache_clear()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_cache_info() -> dict:
|
|
115
|
+
"""Get information about the binning cache.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
Dictionary with cache statistics:
|
|
119
|
+
- hits: Number of cache hits
|
|
120
|
+
- misses: Number of cache misses
|
|
121
|
+
- maxsize: Maximum cache size
|
|
122
|
+
- currsize: Current cache size
|
|
123
|
+
|
|
124
|
+
Example:
|
|
125
|
+
>>> from ins_pricing.pricing.factors import get_cache_info
|
|
126
|
+
>>> info = get_cache_info()
|
|
127
|
+
>>> print(f"Cache hit rate: {info['hits'] / (info['hits'] + info['misses']):.2%}")
|
|
128
|
+
"""
|
|
129
|
+
cache_info = _compute_bins_cached.cache_info()
|
|
130
|
+
return {
|
|
131
|
+
'hits': cache_info.hits,
|
|
132
|
+
'misses': cache_info.misses,
|
|
133
|
+
'maxsize': cache_info.maxsize,
|
|
134
|
+
'currsize': cache_info.currsize
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
|
|
29
138
|
def build_factor_table(
|
|
30
139
|
df: pd.DataFrame,
|
|
31
140
|
*,
|