ins-pricing 0.3.4__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,352 @@
1
+ """
2
+ Configuration Builder for Insurance Pricing Models
3
+ Generates complete configuration dictionaries from UI parameters.
4
+ """
5
+
6
+ from typing import List, Optional, Dict, Any
7
+
8
+
9
class ConfigBuilder:
    """Build configuration dictionaries for model training.

    The builder holds a default template (see ``_get_default_config``) and
    layers user-supplied UI parameters on top of it to produce a complete,
    self-contained config dict for the training runner.
    """

    def __init__(self):
        # Snapshot of the defaults, kept for backward compatibility with
        # callers that read ``builder.default_config``. build_config() does
        # NOT copy this attribute (it regenerates a fresh template per call),
        # so external mutation of this dict cannot leak into later builds.
        self.default_config = self._get_default_config()

    @staticmethod
    def _get_default_config() -> Dict[str, Any]:
        """Get default configuration template.

        Returns a freshly constructed dict — including fresh nested dicts and
        lists — on every call, so callers may mutate the result freely without
        affecting other configs.
        """
        return {
            "data_format": "csv",
            "data_path_template": "{model_name}.{ext}",
            "dtype_map": None,
            "binary_resp_nme": None,
            "split_group_col": None,
            "split_time_col": None,
            "split_time_ascending": True,
            "cv_strategy": None,
            "cv_group_col": None,
            "cv_time_col": None,
            "cv_time_ascending": True,
            "cv_splits": None,
            "plot_path_style": "nested",
            "save_preprocess": False,
            "preprocess_artifact_path": None,
            "bo_sample_limit": None,
            "cache_predictions": False,
            "prediction_cache_dir": None,
            "prediction_cache_format": "parquet",
            "plot_curves": False,
            "plot": {
                "enable": False,
                "n_bins": 10,
                "oneway": False,
                "oneway_pred": False,
                "pre_oneway": False,
                "lift_models": [],
                "double_lift": False,
                "double_lift_pairs": []
            },
            "env": {
                "OPENBLAS_NUM_THREADS": "1",
                "OMP_NUM_THREADS": "1"
            },
            "use_resn_data_parallel": False,
            "use_ft_data_parallel": False,
            "use_gnn_data_parallel": False,
            "use_resn_ddp": True,
            "use_ft_ddp": True,
            "use_gnn_ddp": True,
            "ddp_min_rows": 50000,
            "ft_role": "model",
            "ft_feature_prefix": "ft_emb",
            "ft_num_numeric_tokens": None,
            "ft_oof_folds": None,
            "ft_oof_strategy": None,
            "ft_oof_shuffle": True,
            "resn_weight_decay": 0.0001,
            "final_ensemble": False,
            "final_ensemble_k": 3,
            "final_refit": True,
            "infer_categorical_max_unique": 50,
            "infer_categorical_max_ratio": 0.05,
            "optuna_study_prefix": "pricing",
            "reuse_best_params": False,
            "best_params_files": {},
            "gnn_use_approx_knn": True,
            "gnn_approx_knn_threshold": 50000,
            "gnn_graph_cache": None,
            "gnn_max_gpu_knn_nodes": 200000,
            "gnn_knn_gpu_mem_ratio": 0.9,
            "gnn_knn_gpu_mem_overhead": 2.0,
            "geo_feature_nmes": [],
            "region_province_col": None,
            "region_city_col": None,
            "region_effect_alpha": 0.0,
            "geo_token_hidden_dim": 32,
            "geo_token_layers": 2,
            "geo_token_dropout": 0.1,
            "geo_token_k_neighbors": 10,
            "geo_token_learning_rate": 0.001,
            "geo_token_epochs": 50,
            "report_output_dir": "./Results/reports",
            "report_group_cols": [],
            "report_time_col": None,
            "report_time_freq": "M",
            "report_time_ascending": True,
            "psi_bins": 10,
            "psi_strategy": "quantile",
            "psi_features": [],
            "calibration": {
                "enable": False,
                "method": "sigmoid",
                "max_rows": None,
                "seed": 13
            },
            "threshold": {
                "enable": False,
                "value": None,
                "metric": "f1",
                "min_positive_rate": None,
                "grid": 99,
                "max_rows": None,
                "seed": 13
            },
            "bootstrap": {
                "enable": False,
                "metrics": [],
                "n_samples": 200,
                "ci": 0.95,
                "seed": 13
            },
            "register_model": False,
            "registry_path": "./Results/model_registry.json",
            "registry_tags": {},
            "registry_status": "candidate",
            "data_fingerprint_max_bytes": 10485760,
        }

    def build_config(
        self,
        data_dir: str,
        model_list: List[str],
        model_categories: List[str],
        target: str,
        weight: str,
        feature_list: List[str],
        categorical_features: List[str],
        task_type: str = "regression",
        prop_test: float = 0.25,
        holdout_ratio: float = 0.25,
        val_ratio: float = 0.25,
        split_strategy: str = "random",
        rand_seed: int = 13,
        epochs: int = 50,
        output_dir: str = "./Results",
        use_gpu: bool = True,
        model_keys: Optional[List[str]] = None,
        max_evals: int = 50,
        xgb_max_depth_max: int = 25,
        xgb_n_estimators_max: int = 500,
        nproc_per_node: int = 2,
    ) -> Dict[str, Any]:
        """
        Build a complete configuration dictionary.

        Args:
            data_dir: Directory containing data files
            model_list: List of model names
            model_categories: List of model categories
            target: Target column name
            weight: Weight column name
            feature_list: List of feature names
            categorical_features: List of categorical feature names
            task_type: Type of task (regression, binary, multiclass)
            prop_test: Proportion of data for testing
            holdout_ratio: Holdout ratio for validation
            val_ratio: Validation ratio
            split_strategy: Strategy for splitting data
            rand_seed: Random seed for reproducibility
            epochs: Number of training epochs
            output_dir: Directory for output files
            use_gpu: Whether to use GPU
            model_keys: List of model types to train (defaults to
                ``["xgb", "resn"]`` when None)
            max_evals: Maximum number of evaluations for optimization
            xgb_max_depth_max: Maximum depth for XGBoost
            xgb_n_estimators_max: Maximum estimators for XGBoost
            nproc_per_node: Number of processes per node

        Returns:
            Complete configuration dictionary (fully independent of the
            builder's template — safe to mutate)
        """
        if model_keys is None:
            model_keys = ["xgb", "resn"]

        # BUGFIX: regenerate the template instead of shallow-copying
        # self.default_config. A shallow ``.copy()`` shares the nested dicts
        # and lists ("plot", "env", "calibration", "threshold", "bootstrap",
        # "best_params_files", ...) between the template and every built
        # config, so mutating one built config would silently corrupt the
        # template and all subsequent builds.
        config = self._get_default_config()

        # Layer the user-provided values on top of the defaults.
        config.update({
            "data_dir": data_dir,
            "model_list": model_list,
            "model_categories": model_categories,
            "target": target,
            "weight": weight,
            "feature_list": feature_list,
            "categorical_features": categorical_features,
            "task_type": task_type,
            "prop_test": prop_test,
            "holdout_ratio": holdout_ratio,
            "val_ratio": val_ratio,
            "split_strategy": split_strategy,
            "rand_seed": rand_seed,
            "epochs": epochs,
            "output_dir": output_dir,
            "use_gpu": use_gpu,
            "xgb_max_depth_max": xgb_max_depth_max,
            "xgb_n_estimators_max": xgb_n_estimators_max,
            "optuna_storage": f"{output_dir}/optuna/bayesopt.sqlite3",
            "stack_model_keys": model_keys,
        })

        # Add runner configuration (subprocess/entry-point settings).
        config["runner"] = {
            "mode": "entry",
            "model_keys": model_keys,
            "nproc_per_node": nproc_per_node,
            "max_evals": max_evals,
            "plot_curves": False,
            "ft_role": None,
            "use_watchdog": False,
            "idle_seconds": 7200,
            "max_restarts": 50,
            "restart_delay_seconds": 10,
            "incremental_args": [
                "--incremental-dir",
                "./IncrementalBatches",
                "--incremental-template",
                "{model_name}_2025Q1.csv",
                "--merge-keys",
                "policy_id",
                "vehicle_id",
                "--model-keys",
                "glm",
                "xgb",
                "ft",
                "--max-evals",
                "25",
                "--update-base-data"
            ]
        }

        return config

    def build_explain_config(
        self,
        base_config: Dict[str, Any],
        model_keys: Optional[List[str]] = None,
        methods: Optional[List[str]] = None,
        on_train: bool = False,
        permutation_n_repeats: int = 5,
        permutation_max_rows: int = 5000,
        shap_n_background: int = 500,
        shap_n_samples: int = 200,
    ) -> Dict[str, Any]:
        """
        Build or update configuration for explain mode.

        Args:
            base_config: Base configuration dictionary (not modified)
            model_keys: Models to explain (e.g., ['xgb', 'resn'];
                defaults to ``["xgb"]`` when None)
            methods: Explanation methods (e.g., ['permutation', 'shap'];
                defaults to ``["permutation"]`` when None)
            on_train: Whether to run on training set (vs validation)
            permutation_n_repeats: Number of repeats for permutation
            permutation_max_rows: Max rows for permutation
            shap_n_background: Background samples for SHAP
            shap_n_samples: Samples for SHAP explanation

        Returns:
            Configuration with explain settings
        """
        config = base_config.copy()

        if model_keys is None:
            model_keys = ["xgb"]
        if methods is None:
            methods = ["permutation"]

        # BUGFIX: copy the runner dict before mutating it. Previously
        # ``runner['mode'] = 'explain'`` wrote directly into base_config's
        # nested runner dict (shared by the shallow copy above), so calling
        # this method flipped the caller's config into explain mode too.
        runner = dict(config.get('runner', {}))
        runner['mode'] = 'explain'
        config['runner'] = runner

        # Add explain configuration.
        explain = {
            "model_keys": model_keys,
            "methods": methods,
            "on_train": on_train,
            "validation_path": None,
            "train_path": None,
            "save_dir": f"{config.get('output_dir', './Results')}/explain",
            "model_dir": None,
            "result_dir": None,
            "permutation": {
                "metric": "auto",
                "n_repeats": permutation_n_repeats,
                "max_rows": permutation_max_rows,
                # Reuse the training seed so importances are reproducible.
                "random_state": config.get('rand_seed', 13)
            },
            "shap": {
                "n_background": shap_n_background,
                "n_samples": shap_n_samples,
                "save_values": False
            },
            "integrated_gradients": {
                "steps": 50,
                "batch_size": 256,
                "target": None,
                "baseline": None,
                "baseline_num": None,
                "baseline_geo": None,
                "save_values": False
            }
        }
        config['explain'] = explain

        return config

    def validate_config(self, config: Dict[str, Any]) -> tuple[bool, str]:
        """
        Validate configuration dictionary.

        Args:
            config: Configuration to validate

        Returns:
            Tuple of (is_valid, error_message)
        """
        required_fields = [
            "data_dir",
            "model_list",
            "target",
            "weight",
            "feature_list"
        ]

        for field in required_fields:
            if field not in config:
                return False, f"Missing required field: {field}"

            # Falsy values (empty string/list, None) are treated as missing.
            if not config[field]:
                return False, f"Empty value for required field: {field}"

        # Validate model_list and model_categories have same length
        if len(config.get("model_list", [])) != len(config.get("model_categories", [])):
            return False, "model_list and model_categories must have the same length"

        # Validate categorical features are subset of features
        features = set(config.get("feature_list", []))
        cat_features = set(config.get("categorical_features", []))
        if not cat_features.issubset(features):
            invalid = cat_features - features
            return False, f"Categorical features not in feature_list: {invalid}"

        return True, "Configuration is valid"
@@ -0,0 +1,36 @@
1
+ {
2
+ "data_dir": "./Data",
3
+ "model_list": ["od"],
4
+ "model_categories": ["bc"],
5
+ "target": "response",
6
+ "weight": "weights",
7
+ "feature_list": [
8
+ "age_owner",
9
+ "gender_owner",
10
+ "plt_zone",
11
+ "carbrand"
12
+ ],
13
+ "categorical_features": [
14
+ "gender_owner",
15
+ "plt_zone",
16
+ "carbrand"
17
+ ],
18
+ "task_type": "regression",
19
+ "prop_test": 0.25,
20
+ "holdout_ratio": 0.25,
21
+ "val_ratio": 0.25,
22
+ "split_strategy": "random",
23
+ "rand_seed": 13,
24
+ "epochs": 50,
25
+ "output_dir": "./Results",
26
+ "use_gpu": true,
27
+ "xgb_max_depth_max": 25,
28
+ "xgb_n_estimators_max": 500,
29
+ "optuna_storage": "./Results/optuna/bayesopt.sqlite3",
30
+ "runner": {
31
+ "mode": "entry",
32
+ "model_keys": ["xgb"],
33
+ "nproc_per_node": 1,
34
+ "max_evals": 50
35
+ }
36
+ }