ins-pricing 0.3.4__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/docs/LOSS_FUNCTIONS.md +78 -0
- ins_pricing/frontend/QUICKSTART.md +152 -0
- ins_pricing/frontend/README.md +419 -0
- ins_pricing/frontend/__init__.py +10 -0
- ins_pricing/frontend/app.py +941 -0
- ins_pricing/frontend/config_builder.py +352 -0
- ins_pricing/frontend/example_config.json +36 -0
- ins_pricing/frontend/example_workflows.py +979 -0
- ins_pricing/frontend/ft_workflow.py +316 -0
- ins_pricing/frontend/runner.py +388 -0
- ins_pricing/production/predict.py +693 -664
- ins_pricing/setup.py +1 -1
- {ins_pricing-0.3.4.dist-info → ins_pricing-0.4.1.dist-info}/METADATA +1 -1
- {ins_pricing-0.3.4.dist-info → ins_pricing-0.4.1.dist-info}/RECORD +16 -6
- {ins_pricing-0.3.4.dist-info → ins_pricing-0.4.1.dist-info}/WHEEL +1 -1
- {ins_pricing-0.3.4.dist-info → ins_pricing-0.4.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration Builder for Insurance Pricing Models
|
|
3
|
+
Generates complete configuration dictionaries from UI parameters.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import copy
from typing import List, Optional, Dict, Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ConfigBuilder:
|
|
10
|
+
"""Build configuration dictionaries for model training."""
|
|
11
|
+
|
|
12
|
+
def __init__(self) -> None:
    """Initialize the builder with a fresh default configuration template."""
    self.default_config = self._get_default_config()
|
|
14
|
+
|
|
15
|
+
@staticmethod
def _get_default_config() -> Dict[str, Any]:
    """Get default configuration template.

    Returns a freshly built dict on every call (no shared state).  The
    keys below are consumed by the training/runner pipeline; `None`
    generally means "feature disabled / auto-detect".
    """
    return {
        # --- data loading & train/validation splitting ---
        "data_format": "csv",
        "data_path_template": "{model_name}.{ext}",
        "dtype_map": None,
        "binary_resp_nme": None,
        "split_group_col": None,
        "split_time_col": None,
        "split_time_ascending": True,
        # --- cross-validation strategy ---
        "cv_strategy": None,
        "cv_group_col": None,
        "cv_time_col": None,
        "cv_time_ascending": True,
        "cv_splits": None,
        # --- artifact / caching behavior ---
        "plot_path_style": "nested",
        "save_preprocess": False,
        "preprocess_artifact_path": None,
        "bo_sample_limit": None,
        "cache_predictions": False,
        "prediction_cache_dir": None,
        "prediction_cache_format": "parquet",
        # --- plotting (disabled by default) ---
        "plot_curves": False,
        "plot": {
            "enable": False,
            "n_bins": 10,
            "oneway": False,
            "oneway_pred": False,
            "pre_oneway": False,
            "lift_models": [],
            "double_lift": False,
            "double_lift_pairs": []
        },
        # --- environment: pin BLAS/OpenMP to one thread per worker ---
        "env": {
            "OPENBLAS_NUM_THREADS": "1",
            "OMP_NUM_THREADS": "1"
        },
        # --- distributed-training toggles (DDP preferred over DataParallel) ---
        "use_resn_data_parallel": False,
        "use_ft_data_parallel": False,
        "use_gnn_data_parallel": False,
        "use_resn_ddp": True,
        "use_ft_ddp": True,
        "use_gnn_ddp": True,
        "ddp_min_rows": 50000,
        # --- FT-transformer options ---
        "ft_role": "model",
        "ft_feature_prefix": "ft_emb",
        "ft_num_numeric_tokens": None,
        "ft_oof_folds": None,
        "ft_oof_strategy": None,
        "ft_oof_shuffle": True,
        # --- ResNet / ensembling / feature inference ---
        "resn_weight_decay": 0.0001,
        "final_ensemble": False,
        "final_ensemble_k": 3,
        "final_refit": True,
        "infer_categorical_max_unique": 50,
        "infer_categorical_max_ratio": 0.05,
        # --- hyper-parameter optimization (Optuna) ---
        "optuna_study_prefix": "pricing",
        "reuse_best_params": False,
        "best_params_files": {},
        # --- GNN k-NN graph construction ---
        "gnn_use_approx_knn": True,
        "gnn_approx_knn_threshold": 50000,
        "gnn_graph_cache": None,
        "gnn_max_gpu_knn_nodes": 200000,
        "gnn_knn_gpu_mem_ratio": 0.9,
        "gnn_knn_gpu_mem_overhead": 2.0,
        # --- geographic / regional features ---
        "geo_feature_nmes": [],
        "region_province_col": None,
        "region_city_col": None,
        "region_effect_alpha": 0.0,
        "geo_token_hidden_dim": 32,
        "geo_token_layers": 2,
        "geo_token_dropout": 0.1,
        "geo_token_k_neighbors": 10,
        "geo_token_learning_rate": 0.001,
        "geo_token_epochs": 50,
        # --- reporting / drift monitoring (PSI) ---
        "report_output_dir": "./Results/reports",
        "report_group_cols": [],
        "report_time_col": None,
        "report_time_freq": "M",
        "report_time_ascending": True,
        "psi_bins": 10,
        "psi_strategy": "quantile",
        "psi_features": [],
        # --- probability calibration (classification only) ---
        "calibration": {
            "enable": False,
            "method": "sigmoid",
            "max_rows": None,
            "seed": 13
        },
        # --- decision-threshold search (classification only) ---
        "threshold": {
            "enable": False,
            "value": None,
            "metric": "f1",
            "min_positive_rate": None,
            "grid": 99,
            "max_rows": None,
            "seed": 13
        },
        # --- bootstrap confidence intervals for metrics ---
        "bootstrap": {
            "enable": False,
            "metrics": [],
            "n_samples": 200,
            "ci": 0.95,
            "seed": 13
        },
        # --- model registry ---
        "register_model": False,
        "registry_path": "./Results/model_registry.json",
        "registry_tags": {},
        "registry_status": "candidate",
        "data_fingerprint_max_bytes": 10485760,
    }
|
|
127
|
+
|
|
128
|
+
def build_config(
|
|
129
|
+
self,
|
|
130
|
+
data_dir: str,
|
|
131
|
+
model_list: List[str],
|
|
132
|
+
model_categories: List[str],
|
|
133
|
+
target: str,
|
|
134
|
+
weight: str,
|
|
135
|
+
feature_list: List[str],
|
|
136
|
+
categorical_features: List[str],
|
|
137
|
+
task_type: str = "regression",
|
|
138
|
+
prop_test: float = 0.25,
|
|
139
|
+
holdout_ratio: float = 0.25,
|
|
140
|
+
val_ratio: float = 0.25,
|
|
141
|
+
split_strategy: str = "random",
|
|
142
|
+
rand_seed: int = 13,
|
|
143
|
+
epochs: int = 50,
|
|
144
|
+
output_dir: str = "./Results",
|
|
145
|
+
use_gpu: bool = True,
|
|
146
|
+
model_keys: Optional[List[str]] = None,
|
|
147
|
+
max_evals: int = 50,
|
|
148
|
+
xgb_max_depth_max: int = 25,
|
|
149
|
+
xgb_n_estimators_max: int = 500,
|
|
150
|
+
nproc_per_node: int = 2,
|
|
151
|
+
) -> Dict[str, Any]:
|
|
152
|
+
"""
|
|
153
|
+
Build a complete configuration dictionary.
|
|
154
|
+
|
|
155
|
+
Args:
|
|
156
|
+
data_dir: Directory containing data files
|
|
157
|
+
model_list: List of model names
|
|
158
|
+
model_categories: List of model categories
|
|
159
|
+
target: Target column name
|
|
160
|
+
weight: Weight column name
|
|
161
|
+
feature_list: List of feature names
|
|
162
|
+
categorical_features: List of categorical feature names
|
|
163
|
+
task_type: Type of task (regression, binary, multiclass)
|
|
164
|
+
prop_test: Proportion of data for testing
|
|
165
|
+
holdout_ratio: Holdout ratio for validation
|
|
166
|
+
val_ratio: Validation ratio
|
|
167
|
+
split_strategy: Strategy for splitting data
|
|
168
|
+
rand_seed: Random seed for reproducibility
|
|
169
|
+
epochs: Number of training epochs
|
|
170
|
+
output_dir: Directory for output files
|
|
171
|
+
use_gpu: Whether to use GPU
|
|
172
|
+
model_keys: List of model types to train
|
|
173
|
+
max_evals: Maximum number of evaluations for optimization
|
|
174
|
+
xgb_max_depth_max: Maximum depth for XGBoost
|
|
175
|
+
xgb_n_estimators_max: Maximum estimators for XGBoost
|
|
176
|
+
nproc_per_node: Number of processes per node
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
Complete configuration dictionary
|
|
180
|
+
"""
|
|
181
|
+
if model_keys is None:
|
|
182
|
+
model_keys = ["xgb", "resn"]
|
|
183
|
+
|
|
184
|
+
config = self.default_config.copy()
|
|
185
|
+
|
|
186
|
+
# Update with user-provided values
|
|
187
|
+
config.update({
|
|
188
|
+
"data_dir": data_dir,
|
|
189
|
+
"model_list": model_list,
|
|
190
|
+
"model_categories": model_categories,
|
|
191
|
+
"target": target,
|
|
192
|
+
"weight": weight,
|
|
193
|
+
"feature_list": feature_list,
|
|
194
|
+
"categorical_features": categorical_features,
|
|
195
|
+
"task_type": task_type,
|
|
196
|
+
"prop_test": prop_test,
|
|
197
|
+
"holdout_ratio": holdout_ratio,
|
|
198
|
+
"val_ratio": val_ratio,
|
|
199
|
+
"split_strategy": split_strategy,
|
|
200
|
+
"rand_seed": rand_seed,
|
|
201
|
+
"epochs": epochs,
|
|
202
|
+
"output_dir": output_dir,
|
|
203
|
+
"use_gpu": use_gpu,
|
|
204
|
+
"xgb_max_depth_max": xgb_max_depth_max,
|
|
205
|
+
"xgb_n_estimators_max": xgb_n_estimators_max,
|
|
206
|
+
"optuna_storage": f"{output_dir}/optuna/bayesopt.sqlite3",
|
|
207
|
+
"stack_model_keys": model_keys,
|
|
208
|
+
})
|
|
209
|
+
|
|
210
|
+
# Add runner configuration
|
|
211
|
+
config["runner"] = {
|
|
212
|
+
"mode": "entry",
|
|
213
|
+
"model_keys": model_keys,
|
|
214
|
+
"nproc_per_node": nproc_per_node,
|
|
215
|
+
"max_evals": max_evals,
|
|
216
|
+
"plot_curves": False,
|
|
217
|
+
"ft_role": None,
|
|
218
|
+
"use_watchdog": False,
|
|
219
|
+
"idle_seconds": 7200,
|
|
220
|
+
"max_restarts": 50,
|
|
221
|
+
"restart_delay_seconds": 10,
|
|
222
|
+
"incremental_args": [
|
|
223
|
+
"--incremental-dir",
|
|
224
|
+
"./IncrementalBatches",
|
|
225
|
+
"--incremental-template",
|
|
226
|
+
"{model_name}_2025Q1.csv",
|
|
227
|
+
"--merge-keys",
|
|
228
|
+
"policy_id",
|
|
229
|
+
"vehicle_id",
|
|
230
|
+
"--model-keys",
|
|
231
|
+
"glm",
|
|
232
|
+
"xgb",
|
|
233
|
+
"ft",
|
|
234
|
+
"--max-evals",
|
|
235
|
+
"25",
|
|
236
|
+
"--update-base-data"
|
|
237
|
+
]
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return config
|
|
241
|
+
|
|
242
|
+
def build_explain_config(
|
|
243
|
+
self,
|
|
244
|
+
base_config: Dict[str, Any],
|
|
245
|
+
model_keys: Optional[List[str]] = None,
|
|
246
|
+
methods: Optional[List[str]] = None,
|
|
247
|
+
on_train: bool = False,
|
|
248
|
+
permutation_n_repeats: int = 5,
|
|
249
|
+
permutation_max_rows: int = 5000,
|
|
250
|
+
shap_n_background: int = 500,
|
|
251
|
+
shap_n_samples: int = 200,
|
|
252
|
+
) -> Dict[str, Any]:
|
|
253
|
+
"""
|
|
254
|
+
Build or update configuration for explain mode.
|
|
255
|
+
|
|
256
|
+
Args:
|
|
257
|
+
base_config: Base configuration dictionary
|
|
258
|
+
model_keys: Models to explain (e.g., ['xgb', 'resn'])
|
|
259
|
+
methods: Explanation methods (e.g., ['permutation', 'shap'])
|
|
260
|
+
on_train: Whether to run on training set (vs validation)
|
|
261
|
+
permutation_n_repeats: Number of repeats for permutation
|
|
262
|
+
permutation_max_rows: Max rows for permutation
|
|
263
|
+
shap_n_background: Background samples for SHAP
|
|
264
|
+
shap_n_samples: Samples for SHAP explanation
|
|
265
|
+
|
|
266
|
+
Returns:
|
|
267
|
+
Configuration with explain settings
|
|
268
|
+
"""
|
|
269
|
+
config = base_config.copy()
|
|
270
|
+
|
|
271
|
+
if model_keys is None:
|
|
272
|
+
model_keys = ["xgb"]
|
|
273
|
+
if methods is None:
|
|
274
|
+
methods = ["permutation"]
|
|
275
|
+
|
|
276
|
+
# Set runner mode to explain
|
|
277
|
+
runner = config.get('runner', {})
|
|
278
|
+
runner['mode'] = 'explain'
|
|
279
|
+
config['runner'] = runner
|
|
280
|
+
|
|
281
|
+
# Add explain configuration
|
|
282
|
+
explain = {
|
|
283
|
+
"model_keys": model_keys,
|
|
284
|
+
"methods": methods,
|
|
285
|
+
"on_train": on_train,
|
|
286
|
+
"validation_path": None,
|
|
287
|
+
"train_path": None,
|
|
288
|
+
"save_dir": f"{config.get('output_dir', './Results')}/explain",
|
|
289
|
+
"model_dir": None,
|
|
290
|
+
"result_dir": None,
|
|
291
|
+
"permutation": {
|
|
292
|
+
"metric": "auto",
|
|
293
|
+
"n_repeats": permutation_n_repeats,
|
|
294
|
+
"max_rows": permutation_max_rows,
|
|
295
|
+
"random_state": config.get('rand_seed', 13)
|
|
296
|
+
},
|
|
297
|
+
"shap": {
|
|
298
|
+
"n_background": shap_n_background,
|
|
299
|
+
"n_samples": shap_n_samples,
|
|
300
|
+
"save_values": False
|
|
301
|
+
},
|
|
302
|
+
"integrated_gradients": {
|
|
303
|
+
"steps": 50,
|
|
304
|
+
"batch_size": 256,
|
|
305
|
+
"target": None,
|
|
306
|
+
"baseline": None,
|
|
307
|
+
"baseline_num": None,
|
|
308
|
+
"baseline_geo": None,
|
|
309
|
+
"save_values": False
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
config['explain'] = explain
|
|
313
|
+
|
|
314
|
+
return config
|
|
315
|
+
|
|
316
|
+
def validate_config(self, config: Dict[str, Any]) -> tuple[bool, str]:
|
|
317
|
+
"""
|
|
318
|
+
Validate configuration dictionary.
|
|
319
|
+
|
|
320
|
+
Args:
|
|
321
|
+
config: Configuration to validate
|
|
322
|
+
|
|
323
|
+
Returns:
|
|
324
|
+
Tuple of (is_valid, error_message)
|
|
325
|
+
"""
|
|
326
|
+
required_fields = [
|
|
327
|
+
"data_dir",
|
|
328
|
+
"model_list",
|
|
329
|
+
"target",
|
|
330
|
+
"weight",
|
|
331
|
+
"feature_list"
|
|
332
|
+
]
|
|
333
|
+
|
|
334
|
+
for field in required_fields:
|
|
335
|
+
if field not in config:
|
|
336
|
+
return False, f"Missing required field: {field}"
|
|
337
|
+
|
|
338
|
+
if not config[field]:
|
|
339
|
+
return False, f"Empty value for required field: {field}"
|
|
340
|
+
|
|
341
|
+
# Validate model_list and model_categories have same length
|
|
342
|
+
if len(config.get("model_list", [])) != len(config.get("model_categories", [])):
|
|
343
|
+
return False, "model_list and model_categories must have the same length"
|
|
344
|
+
|
|
345
|
+
# Validate categorical features are subset of features
|
|
346
|
+
features = set(config.get("feature_list", []))
|
|
347
|
+
cat_features = set(config.get("categorical_features", []))
|
|
348
|
+
if not cat_features.issubset(features):
|
|
349
|
+
invalid = cat_features - features
|
|
350
|
+
return False, f"Categorical features not in feature_list: {invalid}"
|
|
351
|
+
|
|
352
|
+
return True, "Configuration is valid"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"data_dir": "./Data",
|
|
3
|
+
"model_list": ["od"],
|
|
4
|
+
"model_categories": ["bc"],
|
|
5
|
+
"target": "response",
|
|
6
|
+
"weight": "weights",
|
|
7
|
+
"feature_list": [
|
|
8
|
+
"age_owner",
|
|
9
|
+
"gender_owner",
|
|
10
|
+
"plt_zone",
|
|
11
|
+
"carbrand"
|
|
12
|
+
],
|
|
13
|
+
"categorical_features": [
|
|
14
|
+
"gender_owner",
|
|
15
|
+
"plt_zone",
|
|
16
|
+
"carbrand"
|
|
17
|
+
],
|
|
18
|
+
"task_type": "regression",
|
|
19
|
+
"prop_test": 0.25,
|
|
20
|
+
"holdout_ratio": 0.25,
|
|
21
|
+
"val_ratio": 0.25,
|
|
22
|
+
"split_strategy": "random",
|
|
23
|
+
"rand_seed": 13,
|
|
24
|
+
"epochs": 50,
|
|
25
|
+
"output_dir": "./Results",
|
|
26
|
+
"use_gpu": true,
|
|
27
|
+
"xgb_max_depth_max": 25,
|
|
28
|
+
"xgb_n_estimators_max": 500,
|
|
29
|
+
"optuna_storage": "./Results/optuna/bayesopt.sqlite3",
|
|
30
|
+
"runner": {
|
|
31
|
+
"mode": "entry",
|
|
32
|
+
"model_keys": ["xgb"],
|
|
33
|
+
"nproc_per_node": 1,
|
|
34
|
+
"max_evals": 50
|
|
35
|
+
}
|
|
36
|
+
}
|