ins-pricing 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/CHANGELOG.md +179 -0
- ins_pricing/RELEASE_NOTES_0.2.8.md +344 -0
- ins_pricing/modelling/core/bayesopt/utils.py +2 -1
- ins_pricing/modelling/explain/shap_utils.py +209 -6
- ins_pricing/pricing/calibration.py +125 -1
- ins_pricing/pricing/factors.py +110 -1
- ins_pricing/production/preprocess.py +166 -0
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/governance/__init__.py +1 -0
- ins_pricing/tests/governance/test_audit.py +56 -0
- ins_pricing/tests/governance/test_registry.py +128 -0
- ins_pricing/tests/governance/test_release.py +74 -0
- ins_pricing/tests/pricing/__init__.py +1 -0
- ins_pricing/tests/pricing/test_calibration.py +72 -0
- ins_pricing/tests/pricing/test_exposure.py +64 -0
- ins_pricing/tests/pricing/test_factors.py +156 -0
- ins_pricing/tests/pricing/test_rate_table.py +40 -0
- ins_pricing/tests/production/__init__.py +1 -0
- ins_pricing/tests/production/test_monitoring.py +350 -0
- ins_pricing/tests/production/test_predict.py +233 -0
- ins_pricing/tests/production/test_preprocess.py +339 -0
- ins_pricing/tests/production/test_scoring.py +311 -0
- ins_pricing/utils/profiling.py +377 -0
- ins_pricing/utils/validation.py +427 -0
- ins_pricing-0.2.9.dist-info/METADATA +149 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/RECORD +28 -12
- ins_pricing/CHANGELOG_20260114.md +0 -275
- ins_pricing/CODE_REVIEW_IMPROVEMENTS.md +0 -715
- ins_pricing-0.2.7.dist-info/METADATA +0 -101
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/WHEEL +0 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
"""Data validation utilities for ins_pricing.
|
|
2
|
+
|
|
3
|
+
This module provides reusable validation functions to ensure data quality
|
|
4
|
+
and provide clear error messages when validation fails.
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
>>> import pandas as pd
|
|
8
|
+
>>> from ins_pricing.utils.validation import validate_required_columns
|
|
9
|
+
>>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
|
|
10
|
+
>>> validate_required_columns(df, ['a', 'b'], df_name='my_data')
|
|
11
|
+
>>> # Raises DataValidationError if columns missing
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Dict, List, Optional, Union
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from ins_pricing.exceptions import DataValidationError
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def validate_required_columns(
    df: pd.DataFrame,
    required: List[str],
    *,
    df_name: str = "DataFrame"
) -> None:
    """Check that *df* exposes every column named in *required*.

    Args:
        df: DataFrame to check
        required: Column names that must all be present
        df_name: Label used in error messages (default: "DataFrame")

    Raises:
        DataValidationError: If one or more required columns are absent

    Example:
        >>> df = pd.DataFrame({'age': [25, 30], 'premium': [100, 200]})
        >>> validate_required_columns(df, ['age', 'premium'], df_name='policy_data')
        >>> validate_required_columns(df, ['age', 'claim'], df_name='policy_data')
        Traceback (most recent call last):
        ...
        DataValidationError: policy_data missing required columns: ['claim']...
    """
    present = set(df.columns)
    absent = [name for name in required if name not in present]
    if not absent:
        return
    # Cap the listing at 50 names so the message stays readable on wide frames.
    preview = list(df.columns)[:50]
    raise DataValidationError(
        f"{df_name} missing required columns: {absent}. "
        f"Available columns (first 50): {preview}"
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_column_types(
    df: pd.DataFrame,
    type_spec: Dict[str, Union[type, str]],
    *,
    coerce: bool = False,
    df_name: str = "DataFrame"
) -> pd.DataFrame:
    """Check (and optionally convert) column dtypes against a specification.

    Args:
        df: DataFrame to check
        type_spec: Maps column names to an expected type. Accepts Python
            types (int, float, str) or pandas dtype strings such as
            'int64', 'float64', 'object', 'category'. Columns missing
            from *df* are skipped (presence is the job of
            validate_required_columns).
        coerce: When True, attempt to cast mismatched columns instead of
            raising (default: False)
        df_name: Label used in error messages

    Returns:
        The validated DataFrame (a copy when coerce=True, so the caller's
        frame is never mutated)

    Raises:
        DataValidationError: On a dtype mismatch with coerce=False, or when
            a requested cast fails

    Example:
        >>> df = pd.DataFrame({'age': ['25', '30'], 'premium': [100.0, 200.0]})
        >>> df = validate_column_types(
        ...     df,
        ...     {'age': 'int64', 'premium': 'float64'},
        ...     coerce=True
        ... )
        >>> df['age'].dtype
        dtype('int64')
    """
    # Plain Python types map onto their canonical pandas dtype names.
    py_to_dtype = {int: 'int64', float: 'float64', str: 'object'}

    out = df.copy() if coerce else df

    for col, spec in type_spec.items():
        if col not in out.columns:
            continue

        actual = out[col].dtype
        if isinstance(spec, type):
            target = py_to_dtype.get(spec, str(spec))
        else:
            target = spec

        # Accept either spelling of the current dtype (repr or .name).
        if str(actual) == target or actual.name == target:
            continue

        if not coerce:
            raise DataValidationError(
                f"{df_name}: Column '{col}' has type {actual}, "
                f"expected {target}"
            )
        try:
            out[col] = out[col].astype(target)
        except (ValueError, TypeError) as e:
            raise DataValidationError(
                f"{df_name}: Cannot convert column '{col}' from "
                f"{actual} to {target}: {e}"
            )

    return out
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def validate_value_range(
    df: pd.DataFrame,
    col: str,
    *,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    df_name: str = "DataFrame"
) -> None:
    """Check that every value in a numeric column lies within bounds.

    Bounds are inclusive; pass None to leave a side unbounded. NaN values
    compare False against both bounds, so they never count as violations.

    Args:
        df: DataFrame to check
        col: Column name to inspect
        min_val: Minimum allowed value (inclusive), None for no minimum
        max_val: Maximum allowed value (inclusive), None for no maximum
        df_name: Label used in error messages

    Raises:
        DataValidationError: If any value falls outside the bounds, if the
            column is missing, or if it is not numeric

    Example:
        >>> df = pd.DataFrame({'age': [25, 30, 150]})
        >>> validate_value_range(df, 'age', min_val=0, max_val=120)
        Traceback (most recent call last):
        ...
        DataValidationError: ...values outside range [0, 120]...
    """
    if col not in df.columns:
        raise DataValidationError(
            f"{df_name}: Column '{col}' not found for range validation"
        )
    if not pd.api.types.is_numeric_dtype(df[col]):
        raise DataValidationError(
            f"{df_name}: Column '{col}' is not numeric, cannot validate range"
        )

    series = df[col]
    problems = []

    if min_val is not None:
        too_low = series < min_val
        if too_low.any():
            problems.append(
                f"{too_low.sum()} values below minimum {min_val} "
                f"(min found: {series[too_low].min()})"
            )

    if max_val is not None:
        too_high = series > max_val
        if too_high.any():
            problems.append(
                f"{too_high.sum()} values above maximum {max_val} "
                f"(max found: {series[too_high].max()})"
            )

    if problems:
        raise DataValidationError(
            f"{df_name}: Column '{col}' has values outside range "
            f"[{min_val}, {max_val}]: {'; '.join(problems)}"
        )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def validate_no_nulls(
    df: pd.DataFrame,
    columns: List[str],
    *,
    df_name: str = "DataFrame"
) -> None:
    """Check that none of the given columns contain null values.

    Args:
        df: DataFrame to check
        columns: Column names to scan for nulls
        df_name: Label used in error messages

    Raises:
        DataValidationError: If any listed column holds nulls, or if a
            listed column is missing from the frame

    Example:
        >>> df = pd.DataFrame({'age': [25, None, 30], 'premium': [100, 200, 300]})
        >>> validate_no_nulls(df, ['age', 'premium'])
        Traceback (most recent call last):
        ...
        DataValidationError: ...contains null values: age (1 nulls)...
    """
    offenders = []

    for name in columns:
        if name not in df.columns:
            raise DataValidationError(
                f"{df_name}: Column '{name}' not found for null validation"
            )
        n_missing = df[name].isna().sum()
        if n_missing:
            offenders.append(f"{name} ({n_missing} nulls)")

    if offenders:
        raise DataValidationError(
            f"{df_name} contains null values: {', '.join(offenders)}"
        )
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def validate_categorical_values(
    df: pd.DataFrame,
    col: str,
    allowed_values: List[str],
    *,
    df_name: str = "DataFrame"
) -> None:
    """Validate that categorical column contains only allowed values.

    Nulls are ignored (use validate_no_nulls to forbid them). Membership is
    tested against a set for O(1) lookups, so wide allow-lists stay cheap.

    Args:
        df: DataFrame to validate
        col: Column name to check
        allowed_values: List of allowed values
        df_name: Name of DataFrame for error messages

    Raises:
        DataValidationError: If column contains values not in allowed_values,
            or if the column is missing

    Example:
        >>> df = pd.DataFrame({'gender': ['M', 'F', 'X']})
        >>> validate_categorical_values(df, 'gender', ['M', 'F'])
        Traceback (most recent call last):
        ...
        DataValidationError: ...contains invalid values: ['X']...
    """
    if col not in df.columns:
        raise DataValidationError(
            f"{df_name}: Column '{col}' not found for categorical validation"
        )

    unique_values = df[col].dropna().unique()
    # Set membership is O(1) vs O(len(allowed_values)) per unique value.
    # Fall back to the original list if any allowed value is unhashable.
    try:
        allowed = set(allowed_values)
    except TypeError:
        allowed = allowed_values
    invalid_values = [v for v in unique_values if v not in allowed]

    if invalid_values:
        raise DataValidationError(
            f"{df_name}: Column '{col}' contains invalid values: {invalid_values}. "
            f"Allowed values: {allowed_values}"
        )
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def validate_positive(
    df: pd.DataFrame,
    columns: List[str],
    *,
    allow_zero: bool = False,
    df_name: str = "DataFrame"
) -> None:
    """Check that the given numeric columns hold only positive values.

    Args:
        df: DataFrame to check
        columns: Column names to inspect
        allow_zero: When True, zero is accepted and only negatives are
            rejected (default: False)
        df_name: Label used in error messages

    Raises:
        DataValidationError: If a column holds non-positive (or negative)
            values, is missing, or is not numeric

    Example:
        >>> df = pd.DataFrame({'premium': [100, -50, 200], 'exposure': [1, 0, 2]})
        >>> validate_positive(df, ['premium', 'exposure'])
        Traceback (most recent call last):
        ...
        DataValidationError: ...contains non-positive values...
    """
    # The descriptor depends only on allow_zero, so compute it once.
    kind = "negative" if allow_zero else "non-positive"
    offenders = []

    for name in columns:
        if name not in df.columns:
            raise DataValidationError(
                f"{df_name}: Column '{name}' not found for positivity validation"
            )
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise DataValidationError(
                f"{df_name}: Column '{name}' is not numeric"
            )

        series = df[name]
        bad = (series < 0) if allow_zero else (series <= 0)
        if bad.any():
            offenders.append(
                f"{name} ({bad.sum()} {kind} values, min: {series[bad].min()})"
            )

    if offenders:
        raise DataValidationError(
            f"{df_name} contains {kind} values: {', '.join(offenders)}"
        )
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def validate_dataframe_not_empty(
    df: pd.DataFrame,
    *,
    df_name: str = "DataFrame"
) -> None:
    """Check that *df* has at least one row.

    Note: only the row count matters here — a frame with rows but zero
    columns still passes (which is why this checks len(df), not df.empty).

    Args:
        df: DataFrame to check
        df_name: Label used in error messages

    Raises:
        DataValidationError: If the frame has zero rows

    Example:
        >>> df = pd.DataFrame()
        >>> validate_dataframe_not_empty(df, df_name='train_data')
        Traceback (most recent call last):
        ...
        DataValidationError: train_data is empty (0 rows)
    """
    if not len(df):
        raise DataValidationError(f"{df_name} is empty (0 rows)")
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def validate_date_range(
    df: pd.DataFrame,
    col: str,
    *,
    min_date: Optional[pd.Timestamp] = None,
    max_date: Optional[pd.Timestamp] = None,
    df_name: str = "DataFrame"
) -> None:
    """Check that a datetime column's values fall within bounds.

    Bounds are inclusive; pass None to leave a side unbounded.

    Args:
        df: DataFrame to check
        col: Column name to inspect (must be a datetime dtype)
        min_date: Minimum allowed date, None for no minimum
        max_date: Maximum allowed date, None for no maximum
        df_name: Label used in error messages

    Raises:
        DataValidationError: If dates fall outside the bounds, if the column
            is missing, or if it is not a datetime dtype

    Example:
        >>> df = pd.DataFrame({'policy_date': pd.to_datetime(['2020-01-01', '2025-01-01'])})
        >>> validate_date_range(
        ...     df, 'policy_date',
        ...     min_date=pd.Timestamp('2020-01-01'),
        ...     max_date=pd.Timestamp('2023-12-31')
        ... )
        Traceback (most recent call last):
        ...
        DataValidationError: ...dates outside range...
        """
    if col not in df.columns:
        raise DataValidationError(
            f"{df_name}: Column '{col}' not found for date validation"
        )
    if not pd.api.types.is_datetime64_any_dtype(df[col]):
        raise DataValidationError(
            f"{df_name}: Column '{col}' is not datetime type"
        )

    series = df[col]
    problems = []

    if min_date is not None:
        too_early = series < min_date
        if too_early.any():
            problems.append(
                f"{too_early.sum()} dates before {min_date} "
                f"(earliest: {series[too_early].min()})"
            )

    if max_date is not None:
        too_late = series > max_date
        if too_late.any():
            problems.append(
                f"{too_late.sum()} dates after {max_date} "
                f"(latest: {series[too_late].max()})"
            )

    if problems:
        raise DataValidationError(
            f"{df_name}: Column '{col}' has dates outside range: "
            f"{'; '.join(problems)}"
        )
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ins_pricing
|
|
3
|
+
Version: 0.2.9
|
|
4
|
+
Summary: Reusable modelling, pricing, governance, and reporting utilities.
|
|
5
|
+
Author: meishi125478
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Keywords: pricing,insurance,bayesopt,ml
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: License :: Other/Proprietary License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
Requires-Dist: numpy>=1.20
|
|
17
|
+
Requires-Dist: pandas>=1.4
|
|
18
|
+
Provides-Extra: bayesopt
|
|
19
|
+
Requires-Dist: torch>=1.13; extra == "bayesopt"
|
|
20
|
+
Requires-Dist: optuna>=3.0; extra == "bayesopt"
|
|
21
|
+
Requires-Dist: xgboost>=1.6; extra == "bayesopt"
|
|
22
|
+
Requires-Dist: scikit-learn>=1.1; extra == "bayesopt"
|
|
23
|
+
Requires-Dist: statsmodels>=0.13; extra == "bayesopt"
|
|
24
|
+
Requires-Dist: joblib>=1.2; extra == "bayesopt"
|
|
25
|
+
Requires-Dist: matplotlib>=3.5; extra == "bayesopt"
|
|
26
|
+
Provides-Extra: plotting
|
|
27
|
+
Requires-Dist: matplotlib>=3.5; extra == "plotting"
|
|
28
|
+
Requires-Dist: scikit-learn>=1.1; extra == "plotting"
|
|
29
|
+
Provides-Extra: explain
|
|
30
|
+
Requires-Dist: torch>=1.13; extra == "explain"
|
|
31
|
+
Requires-Dist: shap>=0.41; extra == "explain"
|
|
32
|
+
Requires-Dist: scikit-learn>=1.1; extra == "explain"
|
|
33
|
+
Provides-Extra: geo
|
|
34
|
+
Requires-Dist: contextily>=1.3; extra == "geo"
|
|
35
|
+
Requires-Dist: matplotlib>=3.5; extra == "geo"
|
|
36
|
+
Provides-Extra: gnn
|
|
37
|
+
Requires-Dist: torch>=1.13; extra == "gnn"
|
|
38
|
+
Requires-Dist: pynndescent>=0.5; extra == "gnn"
|
|
39
|
+
Requires-Dist: torch-geometric>=2.3; extra == "gnn"
|
|
40
|
+
Provides-Extra: all
|
|
41
|
+
Requires-Dist: torch>=1.13; extra == "all"
|
|
42
|
+
Requires-Dist: optuna>=3.0; extra == "all"
|
|
43
|
+
Requires-Dist: xgboost>=1.6; extra == "all"
|
|
44
|
+
Requires-Dist: scikit-learn>=1.1; extra == "all"
|
|
45
|
+
Requires-Dist: statsmodels>=0.13; extra == "all"
|
|
46
|
+
Requires-Dist: joblib>=1.2; extra == "all"
|
|
47
|
+
Requires-Dist: matplotlib>=3.5; extra == "all"
|
|
48
|
+
Requires-Dist: shap>=0.41; extra == "all"
|
|
49
|
+
Requires-Dist: contextily>=1.3; extra == "all"
|
|
50
|
+
Requires-Dist: pynndescent>=0.5; extra == "all"
|
|
51
|
+
Requires-Dist: torch-geometric>=2.3; extra == "all"
|
|
52
|
+
|
|
53
|
+
# Insurance-Pricing
|
|
54
|
+
|
|
55
|
+
A reusable toolkit for insurance modeling, pricing, governance, and reporting.
|
|
56
|
+
|
|
57
|
+
## Overview
|
|
58
|
+
|
|
59
|
+
Insurance-Pricing (ins_pricing) is an enterprise-grade Python library designed for machine learning model training, pricing calculations, and model governance workflows in the insurance industry.
|
|
60
|
+
|
|
61
|
+
### Core Modules
|
|
62
|
+
|
|
63
|
+
| Module | Description |
|
|
64
|
+
|--------|-------------|
|
|
65
|
+
| **modelling** | ML model training (GLM, XGBoost, ResNet, FT-Transformer, GNN) and model interpretability (SHAP, permutation importance) |
|
|
66
|
+
| **pricing** | Factor table construction, numeric binning, premium calibration, exposure calculation, PSI monitoring |
|
|
67
|
+
| **production** | Model prediction, batch scoring, data drift detection, production metrics monitoring |
|
|
68
|
+
| **governance** | Model registry, version management, approval workflows, audit logging |
|
|
69
|
+
| **reporting** | Report generation (Markdown format), report scheduling |
|
|
70
|
+
| **utils** | Data validation, performance profiling, device management, logging configuration |
|
|
71
|
+
|
|
72
|
+
### Quick Start
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
# Model training with Bayesian optimization
|
|
76
|
+
from ins_pricing import bayesopt as ropt
|
|
77
|
+
|
|
78
|
+
model = ropt.BayesOptModel(
|
|
79
|
+
train_data, test_data,
|
|
80
|
+
model_name='my_model',
|
|
81
|
+
resp_nme='target',
|
|
82
|
+
weight_nme='weight',
|
|
83
|
+
factor_nmes=feature_list,
|
|
84
|
+
cate_list=categorical_features,
|
|
85
|
+
)
|
|
86
|
+
model.bayesopt_xgb(max_evals=100) # Train XGBoost
|
|
87
|
+
model.bayesopt_resnet(max_evals=50) # Train ResNet
|
|
88
|
+
model.bayesopt_ft(max_evals=50) # Train FT-Transformer
|
|
89
|
+
|
|
90
|
+
# Pricing: build factor table
|
|
91
|
+
from ins_pricing.pricing import build_factor_table
|
|
92
|
+
factors = build_factor_table(
|
|
93
|
+
df,
|
|
94
|
+
factor_col='age_band',
|
|
95
|
+
loss_col='claim_amount',
|
|
96
|
+
exposure_col='exposure',
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Production: batch scoring
|
|
100
|
+
from ins_pricing.production import batch_score
|
|
101
|
+
scores = batch_score(model.trainers['xgb'].predict, df)
|
|
102
|
+
|
|
103
|
+
# Model governance
|
|
104
|
+
from ins_pricing.governance import ModelRegistry
|
|
105
|
+
registry = ModelRegistry('models.json')
|
|
106
|
+
registry.register(model_name, version, metrics=metrics)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Project Structure
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
ins_pricing/
|
|
113
|
+
├── cli/ # Command-line entry points
|
|
114
|
+
├── modelling/
|
|
115
|
+
│ ├── core/bayesopt/ # ML model training core
|
|
116
|
+
│ ├── explain/ # Model interpretability
|
|
117
|
+
│ └── plotting/ # Model visualization
|
|
118
|
+
├── pricing/ # Insurance pricing module
|
|
119
|
+
├── production/ # Production deployment module
|
|
120
|
+
├── governance/ # Model governance
|
|
121
|
+
├── reporting/ # Report generation
|
|
122
|
+
├── utils/ # Utilities
|
|
123
|
+
└── tests/ # Test suite
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Installation
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# Basic installation
|
|
130
|
+
pip install ins_pricing
|
|
131
|
+
|
|
132
|
+
# Full installation (all optional dependencies)
|
|
133
|
+
pip install ins_pricing[all]
|
|
134
|
+
|
|
135
|
+
# Install specific extras
|
|
136
|
+
pip install ins_pricing[bayesopt] # Model training
|
|
137
|
+
pip install ins_pricing[explain] # Model explanation
|
|
138
|
+
pip install ins_pricing[plotting] # Visualization
|
|
139
|
+
pip install ins_pricing[gnn] # Graph neural networks
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Requirements
|
|
143
|
+
|
|
144
|
+
- Python >= 3.9
|
|
145
|
+
- Core dependencies: numpy >= 1.20, pandas >= 1.4
|
|
146
|
+
|
|
147
|
+
### License
|
|
148
|
+
|
|
149
|
+
Proprietary
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
ins_pricing/
|
|
2
|
-
ins_pricing/CODE_REVIEW_IMPROVEMENTS.md,sha256=eGQ1qtl8l84VvxvAnvzcV9Kk_FVPthAg1poq92k6vZU,17852
|
|
1
|
+
ins_pricing/CHANGELOG.md,sha256=Z423KtXu_4_cL0T2C947K8F5hHjulRvoc3lRjH5nCVw,7618
|
|
3
2
|
ins_pricing/README.md,sha256=pMOO1cU06oBfvm5d8gvAsQsJr9bfb6AKdpXlrx0AAxw,2727
|
|
3
|
+
ins_pricing/RELEASE_NOTES_0.2.8.md,sha256=KIJzk1jbZbZPKjwnkPSDHO_2Ipv3SP3CzCNDdf07jI0,9331
|
|
4
4
|
ins_pricing/__init__.py,sha256=46j1wCdLVrgrofeBwKl-3NXTxzjbTv-w3KjW-dyKGiY,2622
|
|
5
5
|
ins_pricing/exceptions.py,sha256=5fZavPV4zNJ7wPC75L215KkHXX9pRrfDAYZOdSKJMGo,4778
|
|
6
|
-
ins_pricing/setup.py,sha256=
|
|
6
|
+
ins_pricing/setup.py,sha256=YXo6D3Tjgo6DwXI2SOufrvgYbHY2oCnT2dStGzA5Fuc,1702
|
|
7
7
|
ins_pricing/cli/BayesOpt_entry.py,sha256=X3AiNQQh5ARcjVMM2vOKWPYPDIId40n_RPZA76pTGl4,558
|
|
8
8
|
ins_pricing/cli/BayesOpt_incremental.py,sha256=_Klr5vvNoq_TbgwrH_T3f0a6cHmA9iVJMViiji6ahJY,35927
|
|
9
9
|
ins_pricing/cli/Explain_Run.py,sha256=gEPQjqHiXyXlCTKjUzwSvbAn5_h74ABgb_sEGs-YHVE,664
|
|
@@ -34,7 +34,7 @@ ins_pricing/modelling/core/bayesopt/config_preprocess.py,sha256=2UpkcmhEk2V3uLv2
|
|
|
34
34
|
ins_pricing/modelling/core/bayesopt/core.py,sha256=ewChb0s-on3ZWIcbCpJTJAaeHeo28nBzsFyvQ-zWHio,40926
|
|
35
35
|
ins_pricing/modelling/core/bayesopt/model_explain_mixin.py,sha256=jCk1zPpwgwBBCndaq-A0_cQnc4RHueh2p5cAuE9ArTo,11620
|
|
36
36
|
ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py,sha256=lD0rUvWV4eWatmTzMrmAUm2Flj8uAOa3R9S2JyYV94k,21807
|
|
37
|
-
ins_pricing/modelling/core/bayesopt/utils.py,sha256=
|
|
37
|
+
ins_pricing/modelling/core/bayesopt/utils.py,sha256=5RKizpR3j6KwR87WqqaXPtgjQXWPW4vM75sIkx38SSM,57924
|
|
38
38
|
ins_pricing/modelling/core/bayesopt/models/__init__.py,sha256=vFFCkGnO6rm50TbxR6QekKKQjq-NW4UFwog6fng8-p8,700
|
|
39
39
|
ins_pricing/modelling/core/bayesopt/models/model_ft_components.py,sha256=0I0NiDf1D3cOhTRQwatsNTw9Julmxv5v3HZV8fTrvcQ,10989
|
|
40
40
|
ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py,sha256=IJoBOYHo5uzLEpo4DZj1PsPXojmCKbMaoRudhB9IGBY,36531
|
|
@@ -51,7 +51,7 @@ ins_pricing/modelling/explain/__init__.py,sha256=CPoGzGu8TTO3FOXjxoXC13VkuIDCf3Y
|
|
|
51
51
|
ins_pricing/modelling/explain/gradients.py,sha256=9TqCws_p49nFxVMcjVxe4KCZ7frezeL0uV_LCdoM5yo,11088
|
|
52
52
|
ins_pricing/modelling/explain/metrics.py,sha256=K_xOY7ZrHWhbJ79RNB7eXN3VXeTe8vq68ZLH2BlZufA,5389
|
|
53
53
|
ins_pricing/modelling/explain/permutation.py,sha256=TeVlhz7FWyPsaZwEP3BSLa0BZgdOq74coZqnadkGaPw,9333
|
|
54
|
-
ins_pricing/modelling/explain/shap_utils.py,sha256=
|
|
54
|
+
ins_pricing/modelling/explain/shap_utils.py,sha256=70zRIHPPdoECFOFQeBTRxLZF-6sjaGJBNMIRS4_kmVI,10462
|
|
55
55
|
ins_pricing/modelling/plotting/__init__.py,sha256=BBQKcE7IYUYObFrjpSnfNS6rmzc80Lae7oEqxKz-vEk,1058
|
|
56
56
|
ins_pricing/modelling/plotting/common.py,sha256=_kFq7JMA0LnKIp4bqAFvr-24VaHjj9pegDMm1qP9_7Y,1439
|
|
57
57
|
ins_pricing/modelling/plotting/curves.py,sha256=fEnoeHAVvLPI5HXadNLUoWygmHtNWCngynLdZK868NQ,18398
|
|
@@ -60,22 +60,26 @@ ins_pricing/modelling/plotting/geo.py,sha256=sRJTYOcAphNFM-oww4qbw9MoZneBCJtur96
|
|
|
60
60
|
ins_pricing/modelling/plotting/importance.py,sha256=xs3l9uW_rCrakoA__fanIph6DK2jN_DugsKASAzteJU,3666
|
|
61
61
|
ins_pricing/pricing/README.md,sha256=PEcyw5oDkqJHOqnNdzBdbbpZwG4SOlnhMwY-owwQ0GI,1064
|
|
62
62
|
ins_pricing/pricing/__init__.py,sha256=XFplK3zkxPyNQZJd1Gn6_VvpwHLedMqxAd_Vn9tqsTE,881
|
|
63
|
-
ins_pricing/pricing/calibration.py,sha256=
|
|
63
|
+
ins_pricing/pricing/calibration.py,sha256=cx9fbDoOnNEMGPH6Js1EDMfVwy9J7zf_90yuNwD7W6I,6196
|
|
64
64
|
ins_pricing/pricing/data_quality.py,sha256=8FecBE60cABsTZE7HETuoKCEOXIrirGAFgg5wQCZrmU,4043
|
|
65
65
|
ins_pricing/pricing/exposure.py,sha256=rw8kKZ_1QdeGTCm13ck9NXrRBPt6TunxRw2s_qkHYkg,2575
|
|
66
|
-
ins_pricing/pricing/factors.py,sha256=
|
|
66
|
+
ins_pricing/pricing/factors.py,sha256=1cqDqdXRLb9-yt-x60f0lPWdgAAOvk7slnawVIKcWDo,6573
|
|
67
67
|
ins_pricing/pricing/monitoring.py,sha256=GnfF2g1C9KzDks9ELBykfAd0zzVBUvjuTWoffa7aGbI,613
|
|
68
68
|
ins_pricing/pricing/rate_table.py,sha256=llDW95i7gR6cCtGFwcGqgpgFvOOPCURaJWmuQw1oce8,2473
|
|
69
69
|
ins_pricing/production/__init__.py,sha256=plUjyiwxrzHDDgXKezyGp9UHOg7Mav4f0ryXYtNmbfs,885
|
|
70
70
|
ins_pricing/production/drift.py,sha256=q_oE_h2NbVETTBkh9QUu8Y68ERuFFcrfKpOb3zBcvsA,383
|
|
71
71
|
ins_pricing/production/monitoring.py,sha256=A6Hyc5WSKhFkDZOIrqmFteuDee75CdcwdTq644vrk-U,4836
|
|
72
72
|
ins_pricing/production/predict.py,sha256=mJog-RGLHxIJxx4oh0D1gbhJwBZPQs1z1P6YTDfWtfg,21791
|
|
73
|
-
ins_pricing/production/preprocess.py,sha256=
|
|
73
|
+
ins_pricing/production/preprocess.py,sha256=cl20X0rVcKNCjVJswB8SdHffMgox6Qga4Ac29L6pW5g,9404
|
|
74
74
|
ins_pricing/production/scoring.py,sha256=yFmMmbYb7w_RC4uZOCMnAjLMRcjXQWIuT1nsfu-bwuc,1379
|
|
75
75
|
ins_pricing/reporting/README.md,sha256=kTVdB6pNewwh1HlCHrI2SzWTgprtQoQprLRQ2qLdgNA,486
|
|
76
76
|
ins_pricing/reporting/__init__.py,sha256=Se5Cdomv9_RwkIDizkw1yx4iCMcjhjTHb4pZK6K895c,242
|
|
77
77
|
ins_pricing/reporting/report_builder.py,sha256=53ZFqGUx2isAoigT5IDwvXkek67zN7-6IgKeGpJhO7c,2241
|
|
78
78
|
ins_pricing/reporting/scheduler.py,sha256=9koG_1cmWvLqrS66uzMJuAlYI2VTkynV19ssB2TtcKU,1336
|
|
79
|
+
ins_pricing/tests/governance/__init__.py,sha256=5Nxg4_dIxY_J58_x2QOXrrRgw6L51Md0Wnt5Up-chqg,39
|
|
80
|
+
ins_pricing/tests/governance/test_audit.py,sha256=ubybXSTVILPN4VxQ2fMnG6oPNv4LjJJE3EsQ53NYdLU,1702
|
|
81
|
+
ins_pricing/tests/governance/test_registry.py,sha256=TvkNMLHViNuopjjho6oETwZ9d6MNaNM1xbL6URPDKSk,4602
|
|
82
|
+
ins_pricing/tests/governance/test_release.py,sha256=Cdo6prZ0xlioAP2AYHodzgASEIa6ZCLjbXW9Me2RGKk,2347
|
|
79
83
|
ins_pricing/tests/modelling/conftest.py,sha256=0KUXnkTgIGEIsf0J4uzIx5Kq4JkDyFo81Mv0qvIzW9k,180
|
|
80
84
|
ins_pricing/tests/modelling/test_cross_val_generic.py,sha256=iLZOFmdyrycB15lFWoQphkFlEjzZTozQXTLVOHLw2Qg,1721
|
|
81
85
|
ins_pricing/tests/modelling/test_distributed_utils.py,sha256=9cStpDw7jPdQwmm0Po-G2tB04uzSR1CoOUZMLuB61yI,466
|
|
@@ -85,12 +89,24 @@ ins_pricing/tests/modelling/test_graph_cache.py,sha256=QEI5cLLtQ9_zwRR50KqUf8qxo
|
|
|
85
89
|
ins_pricing/tests/modelling/test_plotting.py,sha256=4gJax72l40fQrjyJQLOgUmaT6xn6zXpujEaFNeHVwGw,1911
|
|
86
90
|
ins_pricing/tests/modelling/test_plotting_library.py,sha256=SB5RjKTaPydK848V0xpqEaJtEWhRv6ZfnHmnnzjaPh4,4079
|
|
87
91
|
ins_pricing/tests/modelling/test_preprocessor.py,sha256=FqbKltV803Pd-ZY1xBc4XF1T-INDuUliaVcMIDPmBxI,1438
|
|
92
|
+
ins_pricing/tests/pricing/__init__.py,sha256=SVfgUYBlCmc4wjYLMRX5xPFgQZxTS3aHBOA_Cx1aJg4,36
|
|
93
|
+
ins_pricing/tests/pricing/test_calibration.py,sha256=hLZuSWOH4t9WKcQ-2srvYp4P5ldr1Yh1dhl7s61vMp8,2420
|
|
94
|
+
ins_pricing/tests/pricing/test_exposure.py,sha256=CrpSncVce-PGt2XzjOX6qV0SA22vKPUv1u8RlKQjt_g,2054
|
|
95
|
+
ins_pricing/tests/pricing/test_factors.py,sha256=NTE7lz1RWChhoRt2K5003DoNRqG_Gu4X1Aauy2NexOg,5093
|
|
96
|
+
ins_pricing/tests/pricing/test_rate_table.py,sha256=ICHfAQsC9TaxXbQVKM5AvBaJXRTVY723Vaz1XOWNMW8,1250
|
|
97
|
+
ins_pricing/tests/production/__init__.py,sha256=WFWlvBVdjg-E-nKaiJ8VTKNELYufJusufpij1p1xwso,39
|
|
98
|
+
ins_pricing/tests/production/test_monitoring.py,sha256=jettbaVLH4a3efLWeiQ6FFukGEw7mmz6_AeYhYX0caQ,11409
|
|
99
|
+
ins_pricing/tests/production/test_predict.py,sha256=PNamNzb-KNbLtT8xbBkwWOcmJC1gXvxj2nnc7dR5fyc,8193
|
|
100
|
+
ins_pricing/tests/production/test_preprocess.py,sha256=tsHYANwJjNlaSo8O4qiwqBvMOMtwmtZymRFm6UrODrE,11084
|
|
101
|
+
ins_pricing/tests/production/test_scoring.py,sha256=fKz2tJomodrRt333apCrjtyJCwg9RHRbWm0lvcU6xm0,9848
|
|
88
102
|
ins_pricing/utils/__init__.py,sha256=ovtolxOvlYp_1SOxZ35OPBdn7JB2O4idzRSQgIlzCvc,2339
|
|
89
103
|
ins_pricing/utils/device.py,sha256=fePvqSaOkzHMBbrHCXAOCKRwdcR8YtiGI5K8Q3ljXJc,7543
|
|
90
104
|
ins_pricing/utils/logging.py,sha256=_AKB4ErmvygwGLtu7Ai7ESemj6Hh8FTgh4cs8j_gVW4,2258
|
|
91
105
|
ins_pricing/utils/metrics.py,sha256=zhKYgXgki8RDscjP_GO2lEgzrtMIZCqOX_aLpQzdw6k,8390
|
|
92
106
|
ins_pricing/utils/paths.py,sha256=o_tBiclFvBci4cYg9WANwKPxrMcglEdOjDP-EZgGjdQ,8749
|
|
93
|
-
ins_pricing
|
|
94
|
-
ins_pricing
|
|
95
|
-
ins_pricing-0.2.
|
|
96
|
-
ins_pricing-0.2.
|
|
107
|
+
ins_pricing/utils/profiling.py,sha256=kmbykHLcYywlZxAf_aVU8HXID3zOvUcBoO5Q58AijhA,11132
|
|
108
|
+
ins_pricing/utils/validation.py,sha256=4Tw9VUJPk0N-WO3YUqZP-xXRl1Xpubkm0vi3WzzZrv4,13348
|
|
109
|
+
ins_pricing-0.2.9.dist-info/METADATA,sha256=uQLsM3-erzbt48qF7nt7_Orf18CF-aKkk4_5B4_yeMk,5397
|
|
110
|
+
ins_pricing-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
111
|
+
ins_pricing-0.2.9.dist-info/top_level.txt,sha256=haZuNQpHKNBEPZx3NjLnHp8pV3I_J9QG8-HyJn00FA0,12
|
|
112
|
+
ins_pricing-0.2.9.dist-info/RECORD,,
|