ins-pricing 0.1.11-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- ins_pricing/README.md +9 -6
- ins_pricing/__init__.py +3 -11
- ins_pricing/cli/BayesOpt_entry.py +24 -0
- ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
- ins_pricing/cli/Explain_Run.py +25 -0
- ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
- ins_pricing/cli/Pricing_Run.py +25 -0
- ins_pricing/cli/__init__.py +1 -0
- ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
- ins_pricing/cli/utils/__init__.py +1 -0
- ins_pricing/cli/utils/cli_common.py +320 -0
- ins_pricing/cli/utils/cli_config.py +375 -0
- ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
- {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
- ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
- ins_pricing/docs/modelling/README.md +34 -0
- ins_pricing/modelling/__init__.py +57 -6
- ins_pricing/modelling/core/__init__.py +1 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
- ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
- ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
- ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
- ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
- ins_pricing/modelling/core/evaluation.py +115 -0
- ins_pricing/production/__init__.py +4 -0
- ins_pricing/production/preprocess.py +71 -0
- ins_pricing/setup.py +10 -5
- {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
- ins_pricing-0.2.0.dist-info/RECORD +125 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
- ins_pricing/modelling/BayesOpt_entry.py +0 -633
- ins_pricing/modelling/Explain_Run.py +0 -36
- ins_pricing/modelling/Pricing_Run.py +0 -36
- ins_pricing/modelling/README.md +0 -33
- ins_pricing/modelling/bayesopt/models.py +0 -2196
- ins_pricing/modelling/bayesopt/trainers.py +0 -2446
- ins_pricing/modelling/cli_common.py +0 -136
- ins_pricing/modelling/tests/test_plotting.py +0 -63
- ins_pricing/modelling/watchdog_run.py +0 -211
- ins_pricing-0.1.11.dist-info/RECORD +0 -169
- ins_pricing_gemini/__init__.py +0 -23
- ins_pricing_gemini/governance/__init__.py +0 -20
- ins_pricing_gemini/governance/approval.py +0 -93
- ins_pricing_gemini/governance/audit.py +0 -37
- ins_pricing_gemini/governance/registry.py +0 -99
- ins_pricing_gemini/governance/release.py +0 -159
- ins_pricing_gemini/modelling/Explain_Run.py +0 -36
- ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
- ins_pricing_gemini/modelling/__init__.py +0 -151
- ins_pricing_gemini/modelling/cli_common.py +0 -141
- ins_pricing_gemini/modelling/config.py +0 -249
- ins_pricing_gemini/modelling/config_preprocess.py +0 -254
- ins_pricing_gemini/modelling/core.py +0 -741
- ins_pricing_gemini/modelling/data_container.py +0 -42
- ins_pricing_gemini/modelling/explain/__init__.py +0 -55
- ins_pricing_gemini/modelling/explain/gradients.py +0 -334
- ins_pricing_gemini/modelling/explain/metrics.py +0 -176
- ins_pricing_gemini/modelling/explain/permutation.py +0 -155
- ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
- ins_pricing_gemini/modelling/features.py +0 -215
- ins_pricing_gemini/modelling/model_manager.py +0 -148
- ins_pricing_gemini/modelling/model_plotting.py +0 -463
- ins_pricing_gemini/modelling/models.py +0 -2203
- ins_pricing_gemini/modelling/notebook_utils.py +0 -294
- ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
- ins_pricing_gemini/modelling/plotting/common.py +0 -63
- ins_pricing_gemini/modelling/plotting/curves.py +0 -572
- ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
- ins_pricing_gemini/modelling/plotting/geo.py +0 -362
- ins_pricing_gemini/modelling/plotting/importance.py +0 -121
- ins_pricing_gemini/modelling/run_logging.py +0 -133
- ins_pricing_gemini/modelling/tests/conftest.py +0 -8
- ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
- ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
- ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
- ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
- ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
- ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
- ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
- ins_pricing_gemini/modelling/trainers.py +0 -2447
- ins_pricing_gemini/modelling/utils.py +0 -1020
- ins_pricing_gemini/pricing/__init__.py +0 -27
- ins_pricing_gemini/pricing/calibration.py +0 -39
- ins_pricing_gemini/pricing/data_quality.py +0 -117
- ins_pricing_gemini/pricing/exposure.py +0 -85
- ins_pricing_gemini/pricing/factors.py +0 -91
- ins_pricing_gemini/pricing/monitoring.py +0 -99
- ins_pricing_gemini/pricing/rate_table.py +0 -78
- ins_pricing_gemini/production/__init__.py +0 -21
- ins_pricing_gemini/production/drift.py +0 -30
- ins_pricing_gemini/production/monitoring.py +0 -143
- ins_pricing_gemini/production/scoring.py +0 -40
- ins_pricing_gemini/reporting/__init__.py +0 -11
- ins_pricing_gemini/reporting/report_builder.py +0 -72
- ins_pricing_gemini/reporting/scheduler.py +0 -45
- ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
- ins_pricing_gemini/scripts/Explain_entry.py +0 -545
- ins_pricing_gemini/scripts/__init__.py +0 -1
- ins_pricing_gemini/scripts/train.py +0 -568
- ins_pricing_gemini/setup.py +0 -55
- ins_pricing_gemini/smoke_test.py +0 -28
- /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
- /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
- /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
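The rename entries above imply an import-path migration in 0.2.0: the `bayesopt` package moved under `ins_pricing.modelling.core`, and the CLI helpers moved under `ins_pricing.cli.utils`. A hedged compatibility sketch follows (paths are read off the listing above; whether these top-level imports succeed depends on each package's `__init__`):

```python
# Import-path shim reflecting the 0.1.11 -> 0.2.0 restructure shown in the
# file list above. Module paths are taken from the listing; treat the exact
# importable symbols as assumptions.
try:
    # 0.2.0 layout: bayesopt under modelling/core, CLI helpers under cli/utils
    from ins_pricing.modelling.core.bayesopt import core as bayesopt_core
    from ins_pricing.cli.utils import cli_common
except ImportError:
    # 0.1.11 layout
    from ins_pricing.modelling.bayesopt import core as bayesopt_core
    from ins_pricing.modelling import cli_common
```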
ins_pricing_gemini/pricing/__init__.py (removed)

```diff
@@ -1,27 +0,0 @@
-from __future__ import annotations
-
-from .calibration import apply_calibration, fit_calibration_factor
-from .data_quality import detect_leakage, profile_columns, validate_schema
-from .exposure import aggregate_policy_level, build_frequency_severity, compute_exposure
-from .factors import bin_numeric, build_factor_table
-from .monitoring import population_stability_index, psi_report
-from .rate_table import RateTable, apply_factor_tables, compute_base_rate, rate_premium
-
-__all__ = [
-    "apply_calibration",
-    "fit_calibration_factor",
-    "detect_leakage",
-    "profile_columns",
-    "validate_schema",
-    "aggregate_policy_level",
-    "build_frequency_severity",
-    "compute_exposure",
-    "bin_numeric",
-    "build_factor_table",
-    "population_stability_index",
-    "psi_report",
-    "RateTable",
-    "apply_factor_tables",
-    "compute_base_rate",
-    "rate_premium",
-]
```
ins_pricing_gemini/pricing/calibration.py (removed)

```diff
@@ -1,39 +0,0 @@
-from __future__ import annotations
-
-from typing import Optional
-
-import numpy as np
-
-
-def fit_calibration_factor(
-    pred: np.ndarray,
-    actual: np.ndarray,
-    *,
-    weight: Optional[np.ndarray] = None,
-    target_lr: Optional[float] = None,
-) -> float:
-    """Fit a scalar calibration factor for premiums or pure premiums."""
-    pred = np.asarray(pred, dtype=float).reshape(-1)
-    actual = np.asarray(actual, dtype=float).reshape(-1)
-    if weight is not None:
-        weight = np.asarray(weight, dtype=float).reshape(-1)
-        if weight.shape[0] != pred.shape[0]:
-            raise ValueError("weight length must match pred length.")
-        pred = pred * weight
-        actual = actual * weight
-
-    pred_sum = float(np.sum(pred))
-    actual_sum = float(np.sum(actual))
-    if pred_sum <= 0:
-        return 1.0
-
-    if target_lr is None:
-        return actual_sum / pred_sum
-    if target_lr <= 0:
-        raise ValueError("target_lr must be positive.")
-    return actual_sum / (target_lr * pred_sum)
-
-
-def apply_calibration(pred: np.ndarray, factor: float) -> np.ndarray:
-    pred = np.asarray(pred, dtype=float)
-    return pred * float(factor)
```
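For readers tracing the removal: the calibration helper simply rescales predictions so their total matches actual losses, optionally hitting a target loss ratio. A self-contained sketch of the same arithmetic on invented numbers (not an import of the deleted module):

```python
import numpy as np

# Toy data (invented): model predictions vs. observed losses.
pred = np.array([100.0, 150.0, 250.0])    # sum = 500
actual = np.array([120.0, 160.0, 270.0])  # sum = 550

# Same math as the deleted fit_calibration_factor: actual_sum / pred_sum.
factor = actual.sum() / pred.sum()  # 550 / 500 = 1.1
calibrated = pred * factor          # apply_calibration equivalent

# With a target loss ratio the factor becomes actual_sum / (target_lr * pred_sum),
# so that actual / (factor * pred) == target_lr.
target_lr = 0.65
factor_lr = actual.sum() / (target_lr * pred.sum())
assert np.isclose(actual.sum() / (factor_lr * pred.sum()), target_lr)
```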
ins_pricing_gemini/pricing/data_quality.py (removed)

```diff
@@ -1,117 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, Dict, Iterable, Optional
-
-import numpy as np
-import pandas as pd
-
-
-def _dtype_matches(actual: np.dtype, expected) -> bool:
-    if callable(expected):
-        return bool(expected(actual))
-    if isinstance(expected, (list, tuple, set)):
-        return any(_dtype_matches(actual, item) for item in expected)
-    try:
-        expected_dtype = np.dtype(expected)
-    except Exception:
-        return False
-    if pd.api.types.is_categorical_dtype(actual) and expected_dtype == np.dtype("category"):
-        return True
-    if pd.api.types.is_string_dtype(actual) and expected_dtype.kind in {"U", "S", "O"}:
-        return True
-    if np.issubdtype(actual, expected_dtype):
-        return True
-    return pd.api.types.is_dtype_equal(actual, expected_dtype)
-
-
-def validate_schema(
-    df: pd.DataFrame,
-    required_cols: Iterable[str],
-    dtypes: Optional[Dict[str, object]] = None,
-    *,
-    raise_on_error: bool = True,
-) -> Dict[str, object]:
-    """Validate required columns and optional dtypes."""
-    required = list(required_cols)
-    missing = [col for col in required if col not in df.columns]
-    dtype_mismatch: Dict[str, Dict[str, str]] = {}
-    if dtypes:
-        for col, expected in dtypes.items():
-            if col not in df.columns:
-                continue
-            actual = df[col].dtype
-            if not _dtype_matches(actual, expected):
-                dtype_mismatch[col] = {
-                    "expected": str(expected),
-                    "actual": str(actual),
-                }
-
-    ok = not missing and not dtype_mismatch
-    result = {"ok": ok, "missing": missing, "dtype_mismatch": dtype_mismatch}
-    if raise_on_error and not ok:
-        raise ValueError(f"Schema validation failed: {result}")
-    return result
-
-
-def profile_columns(
-    df: pd.DataFrame, cols: Optional[Iterable[str]] = None
-) -> pd.DataFrame:
-    """Basic column profiling for missing/uniques and numeric stats."""
-    columns = list(cols) if cols is not None else list(df.columns)
-    rows = []
-    for col in columns:
-        series = df[col]
-        n = len(series)
-        missing_ratio = float(series.isna().mean()) if n else 0.0
-        nunique = int(series.nunique(dropna=True))
-        unique_ratio = float(nunique / n) if n else 0.0
-        entry = {
-            "column": col,
-            "dtype": str(series.dtype),
-            "missing_ratio": missing_ratio,
-            "n_unique": nunique,
-            "unique_ratio": unique_ratio,
-        }
-        if pd.api.types.is_numeric_dtype(series):
-            entry.update(
-                {
-                    "min": float(series.min(skipna=True)),
-                    "max": float(series.max(skipna=True)),
-                    "mean": float(series.mean(skipna=True)),
-                }
-            )
-        rows.append(entry)
-    return pd.DataFrame(rows)
-
-
-def detect_leakage(
-    df: pd.DataFrame,
-    target_col: str,
-    *,
-    exclude_cols: Optional[Iterable[str]] = None,
-    corr_threshold: float = 0.995,
-) -> pd.DataFrame:
-    """Detect simple leakage via identical columns or very high correlation."""
-    if target_col not in df.columns:
-        raise ValueError("target_col not found.")
-    exclude = set(exclude_cols or [])
-    exclude.add(target_col)
-    target = df[target_col]
-    results = []
-    for col in df.columns:
-        if col in exclude:
-            continue
-        series = df[col]
-        reason = None
-        score = None
-        if series.equals(target):
-            reason = "identical"
-            score = 1.0
-        elif pd.api.types.is_numeric_dtype(series) and pd.api.types.is_numeric_dtype(target):
-            corr = series.corr(target)
-            if pd.notna(corr) and abs(corr) >= corr_threshold:
-                reason = "high_corr"
-                score = float(corr)
-        if reason:
-            results.append({"feature": col, "reason": reason, "score": score})
-    return pd.DataFrame(results).sort_values(by="score", ascending=False).reset_index(drop=True)
```
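A usage sketch for the deleted data-quality helpers. This assumes the old 0.1.11 wheel is installed (where `ins_pricing_gemini.pricing.data_quality` still exists); the sample frame is invented:

```python
import pandas as pd
from ins_pricing_gemini.pricing.data_quality import validate_schema, detect_leakage

df = pd.DataFrame({
    "premium": [100.0, 200.0, 150.0],
    "claims": [80.0, 90.0, 200.0],
    "claims_copy": [80.0, 90.0, 200.0],  # deliberately leaky duplicate column
})

# Missing columns / dtype mismatches raise by default; raise_on_error=False
# returns {"ok": ..., "missing": ..., "dtype_mismatch": ...} instead.
report = validate_schema(df, ["premium", "claims"], {"premium": "float64"},
                         raise_on_error=False)

# Flags claims_copy as "identical" (score 1.0); near-duplicates are caught
# via abs(corr) >= corr_threshold (default 0.995).
leaks = detect_leakage(df, target_col="claims")
```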
ins_pricing_gemini/pricing/exposure.py (removed)

```diff
@@ -1,85 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, Optional
-
-import numpy as np
-import pandas as pd
-
-
-def compute_exposure(
-    df: pd.DataFrame,
-    start_col: str,
-    end_col: str,
-    *,
-    unit: str = "year",
-    inclusive: bool = False,
-    clip_min: Optional[float] = 0.0,
-    clip_max: Optional[float] = None,
-) -> pd.Series:
-    """Compute exposure from start/end date columns."""
-    start = pd.to_datetime(df[start_col])
-    end = pd.to_datetime(df[end_col])
-    delta_days = (end - start).dt.days.astype(float)
-    if inclusive:
-        delta_days = delta_days + 1.0
-    if unit == "day":
-        exposure = delta_days
-    elif unit == "month":
-        exposure = delta_days / 30.0
-    elif unit == "year":
-        exposure = delta_days / 365.25
-    else:
-        raise ValueError("unit must be one of: day, month, year.")
-
-    exposure = exposure.replace([np.inf, -np.inf], np.nan).fillna(0.0)
-    if clip_min is not None:
-        exposure = exposure.clip(lower=clip_min)
-    if clip_max is not None:
-        exposure = exposure.clip(upper=clip_max)
-    return exposure
-
-
-def aggregate_policy_level(
-    df: pd.DataFrame,
-    policy_keys: Iterable[str],
-    *,
-    exposure_col: str,
-    claim_count_col: Optional[str] = None,
-    claim_amount_col: Optional[str] = None,
-    weight_col: Optional[str] = None,
-) -> pd.DataFrame:
-    """Aggregate event-level rows to policy-level records."""
-    agg = {exposure_col: "sum"}
-    if claim_count_col:
-        agg[claim_count_col] = "sum"
-    if claim_amount_col:
-        agg[claim_amount_col] = "sum"
-    if weight_col:
-        agg[weight_col] = "sum"
-    grouped = df.groupby(list(policy_keys), dropna=False).agg(agg).reset_index()
-    return grouped
-
-
-def build_frequency_severity(
-    df: pd.DataFrame,
-    *,
-    exposure_col: str,
-    claim_count_col: str,
-    claim_amount_col: str,
-    zero_severity: float = 0.0,
-) -> pd.DataFrame:
-    """Compute frequency, severity and pure premium from counts and losses."""
-    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
-    counts = df[claim_count_col].to_numpy(dtype=float, copy=False)
-    amounts = df[claim_amount_col].to_numpy(dtype=float, copy=False)
-
-    with np.errstate(divide="ignore", invalid="ignore"):
-        frequency = np.where(exposure > 0, counts / exposure, 0.0)
-        severity = np.where(counts > 0, amounts / counts, zero_severity)
-    pure_premium = frequency * severity
-
-    out = df.copy()
-    out["frequency"] = frequency
-    out["severity"] = severity
-    out["pure_premium"] = pure_premium
-    return out
```
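The exposure/frequency/severity pipeline above reduces to a few vectorized divisions. A self-contained restatement on invented rows (mirroring the deleted helpers rather than importing them):

```python
import numpy as np
import pandas as pd

# Toy policy rows (invented).
df = pd.DataFrame({
    "start": ["2023-01-01", "2023-07-01"],
    "end": ["2024-01-01", "2024-01-01"],
    "claim_count": [2, 0],
    "claim_amount": [5000.0, 0.0],
})

# unit="year": day count divided by 365.25, as in the deleted compute_exposure.
exposure = (pd.to_datetime(df["end"]) - pd.to_datetime(df["start"])).dt.days / 365.25

# Guard divisions by zero exactly as build_frequency_severity did.
with np.errstate(divide="ignore", invalid="ignore"):
    frequency = np.where(exposure > 0, df["claim_count"] / exposure, 0.0)
    severity = np.where(df["claim_count"] > 0,
                        df["claim_amount"] / df["claim_count"], 0.0)
pure_premium = frequency * severity  # row 0: ~2.0 claims/yr * 2500 = ~5000/yr
```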
ins_pricing_gemini/pricing/factors.py (removed)

```diff
@@ -1,91 +0,0 @@
-from __future__ import annotations
-
-from typing import Optional, Tuple
-
-import numpy as np
-import pandas as pd
-
-
-def bin_numeric(
-    series: pd.Series,
-    *,
-    bins: int = 10,
-    method: str = "quantile",
-    labels: Optional[list] = None,
-    include_lowest: bool = True,
-) -> Tuple[pd.Series, np.ndarray]:
-    """Bin numeric series and return (binned, bin_edges)."""
-    if method == "quantile":
-        binned = pd.qcut(series, q=bins, duplicates="drop", labels=labels)
-        bin_edges = binned.cat.categories.left.to_numpy()
-    elif method == "uniform":
-        binned = pd.cut(series, bins=bins, include_lowest=include_lowest, labels=labels)
-        bin_edges = binned.cat.categories.left.to_numpy()
-    else:
-        raise ValueError("method must be one of: quantile, uniform.")
-    return binned, bin_edges
-
-
-def build_factor_table(
-    df: pd.DataFrame,
-    *,
-    factor_col: str,
-    loss_col: str,
-    exposure_col: str,
-    weight_col: Optional[str] = None,
-    base_rate: Optional[float] = None,
-    smoothing: float = 0.0,
-    min_exposure: Optional[float] = None,
-) -> pd.DataFrame:
-    """Build a factor table with rate and relativity."""
-    if weight_col and weight_col in df.columns:
-        weights = df[weight_col].to_numpy(dtype=float, copy=False)
-    else:
-        weights = None
-
-    loss = df[loss_col].to_numpy(dtype=float, copy=False)
-    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
-
-    if weights is not None:
-        loss = loss * weights
-        exposure = exposure * weights
-
-    data = pd.DataFrame(
-        {
-            "factor": df[factor_col],
-            "loss": loss,
-            "exposure": exposure,
-        }
-    )
-    grouped = data.groupby("factor", dropna=False).agg({"loss": "sum", "exposure": "sum"})
-    grouped = grouped.reset_index().rename(columns={"factor": "level"})
-
-    if base_rate is None:
-        total_loss = float(grouped["loss"].sum())
-        total_exposure = float(grouped["exposure"].sum())
-        base_rate = total_loss / total_exposure if total_exposure > 0 else 0.0
-
-    exposure_vals = grouped["exposure"].to_numpy(dtype=float, copy=False)
-    loss_vals = grouped["loss"].to_numpy(dtype=float, copy=False)
-
-    with np.errstate(divide="ignore", invalid="ignore"):
-        rate = np.where(
-            exposure_vals > 0,
-            (loss_vals + smoothing * base_rate) / (exposure_vals + smoothing),
-            0.0,
-        )
-        relativity = np.where(base_rate > 0, rate / base_rate, 1.0)
-
-    grouped["rate"] = rate
-    grouped["relativity"] = relativity
-    grouped["base_rate"] = float(base_rate)
-
-    if min_exposure is not None:
-        low_exposure = grouped["exposure"] < float(min_exposure)
-        grouped.loc[low_exposure, "relativity"] = 1.0
-        grouped.loc[low_exposure, "rate"] = float(base_rate)
-        grouped["is_low_exposure"] = low_exposure
-    else:
-        grouped["is_low_exposure"] = False
-
-    return grouped
```
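`build_factor_table` shrinks thin levels toward the base rate via `rate = (loss + smoothing * base_rate) / (exposure + smoothing)`. A worked instance with invented numbers:

```python
# Smoothing formula from the deleted build_factor_table; numbers are invented.
base_rate = 0.10            # portfolio loss per unit exposure
loss, exposure = 3.0, 10.0  # one factor level: raw rate 3.0 / 10.0 = 0.30
smoothing = 20.0            # pseudo-exposure pulling thin levels toward base

rate = (loss + smoothing * base_rate) / (exposure + smoothing)  # 5.0 / 30.0
relativity = rate / base_rate
print(rate, relativity)  # 0.1667, 1.667 -- vs. 0.30 and 3.0 unsmoothed
```

With `smoothing = 0.0` the formula reduces to the raw empirical rate, so the parameter acts as a simple credibility weight.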
ins_pricing_gemini/pricing/monitoring.py (removed)

```diff
@@ -1,99 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, Optional
-
-import numpy as np
-import pandas as pd
-
-
-def psi_numeric(
-    expected: np.ndarray,
-    actual: np.ndarray,
-    *,
-    bins: int = 10,
-    strategy: str = "quantile",
-    eps: float = 1e-6,
-) -> float:
-    expected = np.asarray(expected, dtype=float)
-    actual = np.asarray(actual, dtype=float)
-    expected = expected[~np.isnan(expected)]
-    actual = actual[~np.isnan(actual)]
-    if expected.size == 0 or actual.size == 0:
-        return 0.0
-
-    if strategy == "quantile":
-        quantiles = np.linspace(0, 1, bins + 1)
-        bin_edges = np.quantile(expected, quantiles)
-        bin_edges = np.unique(bin_edges)
-    elif strategy == "uniform":
-        min_val = min(expected.min(), actual.min())
-        max_val = max(expected.max(), actual.max())
-        bin_edges = np.linspace(min_val, max_val, bins + 1)
-    else:
-        raise ValueError("strategy must be one of: quantile, uniform.")
-
-    if bin_edges.size < 2:
-        return 0.0
-
-    exp_counts, _ = np.histogram(expected, bins=bin_edges)
-    act_counts, _ = np.histogram(actual, bins=bin_edges)
-    exp_pct = exp_counts / max(exp_counts.sum(), 1)
-    act_pct = act_counts / max(act_counts.sum(), 1)
-    exp_pct = np.clip(exp_pct, eps, 1.0)
-    act_pct = np.clip(act_pct, eps, 1.0)
-    return float(np.sum((act_pct - exp_pct) * np.log(act_pct / exp_pct)))
-
-
-def psi_categorical(
-    expected: Iterable,
-    actual: Iterable,
-    *,
-    eps: float = 1e-6,
-) -> float:
-    expected = pd.Series(expected)
-    actual = pd.Series(actual)
-    categories = pd.Index(expected.dropna().unique()).union(actual.dropna().unique())
-    if categories.empty:
-        return 0.0
-    exp_counts = expected.value_counts().reindex(categories, fill_value=0)
-    act_counts = actual.value_counts().reindex(categories, fill_value=0)
-    exp_pct = exp_counts / max(exp_counts.sum(), 1)
-    act_pct = act_counts / max(act_counts.sum(), 1)
-    exp_pct = np.clip(exp_pct.to_numpy(dtype=float), eps, 1.0)
-    act_pct = np.clip(act_pct.to_numpy(dtype=float), eps, 1.0)
-    return float(np.sum((act_pct - exp_pct) * np.log(act_pct / exp_pct)))
-
-
-def population_stability_index(
-    expected: np.ndarray,
-    actual: np.ndarray,
-    *,
-    bins: int = 10,
-    strategy: str = "quantile",
-) -> float:
-    if pd.api.types.is_numeric_dtype(expected) and pd.api.types.is_numeric_dtype(actual):
-        return psi_numeric(expected, actual, bins=bins, strategy=strategy)
-    return psi_categorical(expected, actual)
-
-
-def psi_report(
-    expected_df: pd.DataFrame,
-    actual_df: pd.DataFrame,
-    *,
-    features: Optional[Iterable[str]] = None,
-    bins: int = 10,
-    strategy: str = "quantile",
-) -> pd.DataFrame:
-    feats = list(features) if features is not None else list(expected_df.columns)
-    rows = []
-    for feat in feats:
-        if feat not in expected_df.columns or feat not in actual_df.columns:
-            continue
-        psi = population_stability_index(
-            expected_df[feat].to_numpy(),
-            actual_df[feat].to_numpy(),
-            bins=bins,
-            strategy=strategy,
-        )
-        rows.append({"feature": feat, "psi": psi})
-    return pd.DataFrame(rows).sort_values(by="psi", ascending=False).reset_index(drop=True)
```
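The PSI functions above all reduce to `sum((act% - exp%) * ln(act% / exp%))` over shared bins. A tiny hand-check with invented bin percentages:

```python
import numpy as np

# Two-bin PSI by hand, matching the deleted psi_numeric logic after binning.
exp_pct = np.array([0.5, 0.5])  # baseline distribution
act_pct = np.array([0.7, 0.3])  # shifted distribution
psi = np.sum((act_pct - exp_pct) * np.log(act_pct / exp_pct))
print(round(float(psi), 4))  # 0.1695; a common rule of thumb flags drift above ~0.2
```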
ins_pricing_gemini/pricing/rate_table.py (removed)

```diff
@@ -1,78 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Dict, Optional
-
-import numpy as np
-import pandas as pd
-
-
-def compute_base_rate(
-    df: pd.DataFrame,
-    *,
-    loss_col: str,
-    exposure_col: str,
-    weight_col: Optional[str] = None,
-) -> float:
-    """Compute base rate as loss / exposure."""
-    loss = df[loss_col].to_numpy(dtype=float, copy=False)
-    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
-    if weight_col and weight_col in df.columns:
-        weight = df[weight_col].to_numpy(dtype=float, copy=False)
-        loss = loss * weight
-        exposure = exposure * weight
-    total_exposure = float(np.sum(exposure))
-    if total_exposure <= 0:
-        return 0.0
-    return float(np.sum(loss) / total_exposure)
-
-
-def apply_factor_tables(
-    df: pd.DataFrame,
-    factor_tables: Dict[str, pd.DataFrame],
-    *,
-    default_relativity: float = 1.0,
-) -> np.ndarray:
-    """Apply factor relativities and return a multiplicative factor."""
-    multiplier = np.ones(len(df), dtype=float)
-    for factor, table in factor_tables.items():
-        if factor not in df.columns:
-            raise ValueError(f"Missing factor column: {factor}")
-        if "level" not in table.columns or "relativity" not in table.columns:
-            raise ValueError("Factor table must include 'level' and 'relativity'.")
-        mapping = table.set_index("level")["relativity"]
-        rel = df[factor].map(mapping).fillna(default_relativity).to_numpy(dtype=float)
-        multiplier *= rel
-    return multiplier
-
-
-def rate_premium(
-    df: pd.DataFrame,
-    *,
-    exposure_col: str,
-    base_rate: float,
-    factor_tables: Dict[str, pd.DataFrame],
-    default_relativity: float = 1.0,
-) -> np.ndarray:
-    """Compute premium using base rate and factor tables."""
-    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
-    factors = apply_factor_tables(
-        df, factor_tables, default_relativity=default_relativity
-    )
-    return exposure * float(base_rate) * factors
-
-
-@dataclass
-class RateTable:
-    base_rate: float
-    factor_tables: Dict[str, pd.DataFrame]
-    default_relativity: float = 1.0
-
-    def score(self, df: pd.DataFrame, *, exposure_col: str) -> np.ndarray:
-        return rate_premium(
-            df,
-            exposure_col=exposure_col,
-            base_rate=self.base_rate,
-            factor_tables=self.factor_tables,
-            default_relativity=self.default_relativity,
-        )
```
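`RateTable.score` was multiplicative rating: exposure times base rate times the product of per-factor relativities. A self-contained sketch with an invented one-factor table (restating the deleted logic rather than importing it):

```python
import pandas as pd

# Invented portfolio and factor table in the 'level'/'relativity' shape the
# deleted apply_factor_tables expected.
policies = pd.DataFrame({"region": ["north", "south"], "exposure": [1.0, 0.5]})
region_table = pd.DataFrame({"level": ["north", "south"],
                             "relativity": [1.2, 0.9]})

base_rate = 100.0  # loss cost per unit exposure
rel = policies["region"].map(region_table.set_index("level")["relativity"])
premium = policies["exposure"].to_numpy() * base_rate * rel.to_numpy()
print(premium)  # [120.  45.] -- exposure * base_rate * relativity
```

Unknown levels fell back to `default_relativity` (1.0), which is what the `fillna` in the deleted code implemented.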
ins_pricing_gemini/production/__init__.py (removed)

```diff
@@ -1,21 +0,0 @@
-from __future__ import annotations
-
-from .drift import psi_report
-from .monitoring import (
-    classification_metrics,
-    group_metrics,
-    loss_ratio,
-    metrics_report,
-    regression_metrics,
-)
-from .scoring import batch_score
-
-__all__ = [
-    "psi_report",
-    "classification_metrics",
-    "group_metrics",
-    "loss_ratio",
-    "metrics_report",
-    "regression_metrics",
-    "batch_score",
-]
```
ins_pricing_gemini/production/drift.py (removed)

```diff
@@ -1,30 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable, Optional
-
-import pandas as pd
-
-try:
-    from ins_pricing.pricing.monitoring import psi_report as _psi_report
-except Exception:  # pragma: no cover - optional import
-    _psi_report = None
-
-
-def psi_report(
-    expected_df: pd.DataFrame,
-    actual_df: pd.DataFrame,
-    *,
-    features: Optional[Iterable[str]] = None,
-    bins: int = 10,
-    strategy: str = "quantile",
-) -> pd.DataFrame:
-    """Population Stability Index report for drift monitoring."""
-    if _psi_report is None:
-        raise RuntimeError("psi_report requires ins_pricing.pricing.monitoring.")
-    return _psi_report(
-        expected_df,
-        actual_df,
-        features=features,
-        bins=bins,
-        strategy=strategy,
-    )
```