ins-pricing 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +60 -0
- ins_pricing/__init__.py +102 -0
- ins_pricing/governance/README.md +18 -0
- ins_pricing/governance/__init__.py +20 -0
- ins_pricing/governance/approval.py +93 -0
- ins_pricing/governance/audit.py +37 -0
- ins_pricing/governance/registry.py +99 -0
- ins_pricing/governance/release.py +159 -0
- ins_pricing/modelling/BayesOpt.py +146 -0
- ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
- ins_pricing/modelling/BayesOpt_entry.py +575 -0
- ins_pricing/modelling/BayesOpt_incremental.py +731 -0
- ins_pricing/modelling/Explain_Run.py +36 -0
- ins_pricing/modelling/Explain_entry.py +539 -0
- ins_pricing/modelling/Pricing_Run.py +36 -0
- ins_pricing/modelling/README.md +33 -0
- ins_pricing/modelling/__init__.py +44 -0
- ins_pricing/modelling/bayesopt/__init__.py +98 -0
- ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
- ins_pricing/modelling/bayesopt/core.py +1476 -0
- ins_pricing/modelling/bayesopt/models.py +2196 -0
- ins_pricing/modelling/bayesopt/trainers.py +2446 -0
- ins_pricing/modelling/bayesopt/utils.py +1021 -0
- ins_pricing/modelling/cli_common.py +136 -0
- ins_pricing/modelling/explain/__init__.py +55 -0
- ins_pricing/modelling/explain/gradients.py +334 -0
- ins_pricing/modelling/explain/metrics.py +176 -0
- ins_pricing/modelling/explain/permutation.py +155 -0
- ins_pricing/modelling/explain/shap_utils.py +146 -0
- ins_pricing/modelling/notebook_utils.py +284 -0
- ins_pricing/modelling/plotting/__init__.py +45 -0
- ins_pricing/modelling/plotting/common.py +63 -0
- ins_pricing/modelling/plotting/curves.py +572 -0
- ins_pricing/modelling/plotting/diagnostics.py +139 -0
- ins_pricing/modelling/plotting/geo.py +362 -0
- ins_pricing/modelling/plotting/importance.py +121 -0
- ins_pricing/modelling/run_logging.py +133 -0
- ins_pricing/modelling/tests/conftest.py +8 -0
- ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
- ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
- ins_pricing/modelling/tests/test_explain.py +56 -0
- ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
- ins_pricing/modelling/tests/test_graph_cache.py +33 -0
- ins_pricing/modelling/tests/test_plotting.py +63 -0
- ins_pricing/modelling/tests/test_plotting_library.py +150 -0
- ins_pricing/modelling/tests/test_preprocessor.py +48 -0
- ins_pricing/modelling/watchdog_run.py +211 -0
- ins_pricing/pricing/README.md +44 -0
- ins_pricing/pricing/__init__.py +27 -0
- ins_pricing/pricing/calibration.py +39 -0
- ins_pricing/pricing/data_quality.py +117 -0
- ins_pricing/pricing/exposure.py +85 -0
- ins_pricing/pricing/factors.py +91 -0
- ins_pricing/pricing/monitoring.py +99 -0
- ins_pricing/pricing/rate_table.py +78 -0
- ins_pricing/production/__init__.py +21 -0
- ins_pricing/production/drift.py +30 -0
- ins_pricing/production/monitoring.py +143 -0
- ins_pricing/production/scoring.py +40 -0
- ins_pricing/reporting/README.md +20 -0
- ins_pricing/reporting/__init__.py +11 -0
- ins_pricing/reporting/report_builder.py +72 -0
- ins_pricing/reporting/scheduler.py +45 -0
- ins_pricing/setup.py +41 -0
- ins_pricing v2/__init__.py +23 -0
- ins_pricing v2/governance/__init__.py +20 -0
- ins_pricing v2/governance/approval.py +93 -0
- ins_pricing v2/governance/audit.py +37 -0
- ins_pricing v2/governance/registry.py +99 -0
- ins_pricing v2/governance/release.py +159 -0
- ins_pricing v2/modelling/Explain_Run.py +36 -0
- ins_pricing v2/modelling/Pricing_Run.py +36 -0
- ins_pricing v2/modelling/__init__.py +151 -0
- ins_pricing v2/modelling/cli_common.py +141 -0
- ins_pricing v2/modelling/config.py +249 -0
- ins_pricing v2/modelling/config_preprocess.py +254 -0
- ins_pricing v2/modelling/core.py +741 -0
- ins_pricing v2/modelling/data_container.py +42 -0
- ins_pricing v2/modelling/explain/__init__.py +55 -0
- ins_pricing v2/modelling/explain/gradients.py +334 -0
- ins_pricing v2/modelling/explain/metrics.py +176 -0
- ins_pricing v2/modelling/explain/permutation.py +155 -0
- ins_pricing v2/modelling/explain/shap_utils.py +146 -0
- ins_pricing v2/modelling/features.py +215 -0
- ins_pricing v2/modelling/model_manager.py +148 -0
- ins_pricing v2/modelling/model_plotting.py +463 -0
- ins_pricing v2/modelling/models.py +2203 -0
- ins_pricing v2/modelling/notebook_utils.py +294 -0
- ins_pricing v2/modelling/plotting/__init__.py +45 -0
- ins_pricing v2/modelling/plotting/common.py +63 -0
- ins_pricing v2/modelling/plotting/curves.py +572 -0
- ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
- ins_pricing v2/modelling/plotting/geo.py +362 -0
- ins_pricing v2/modelling/plotting/importance.py +121 -0
- ins_pricing v2/modelling/run_logging.py +133 -0
- ins_pricing v2/modelling/tests/conftest.py +8 -0
- ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
- ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
- ins_pricing v2/modelling/tests/test_explain.py +56 -0
- ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
- ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
- ins_pricing v2/modelling/tests/test_plotting.py +63 -0
- ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
- ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
- ins_pricing v2/modelling/trainers.py +2447 -0
- ins_pricing v2/modelling/utils.py +1020 -0
- ins_pricing v2/modelling/watchdog_run.py +211 -0
- ins_pricing v2/pricing/__init__.py +27 -0
- ins_pricing v2/pricing/calibration.py +39 -0
- ins_pricing v2/pricing/data_quality.py +117 -0
- ins_pricing v2/pricing/exposure.py +85 -0
- ins_pricing v2/pricing/factors.py +91 -0
- ins_pricing v2/pricing/monitoring.py +99 -0
- ins_pricing v2/pricing/rate_table.py +78 -0
- ins_pricing v2/production/__init__.py +21 -0
- ins_pricing v2/production/drift.py +30 -0
- ins_pricing v2/production/monitoring.py +143 -0
- ins_pricing v2/production/scoring.py +40 -0
- ins_pricing v2/reporting/__init__.py +11 -0
- ins_pricing v2/reporting/report_builder.py +72 -0
- ins_pricing v2/reporting/scheduler.py +45 -0
- ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
- ins_pricing v2/scripts/Explain_entry.py +545 -0
- ins_pricing v2/scripts/__init__.py +1 -0
- ins_pricing v2/scripts/train.py +568 -0
- ins_pricing v2/setup.py +55 -0
- ins_pricing v2/smoke_test.py +28 -0
- ins_pricing-0.1.6.dist-info/METADATA +78 -0
- ins_pricing-0.1.6.dist-info/RECORD +169 -0
- ins_pricing-0.1.6.dist-info/WHEEL +5 -0
- ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
- user_packages/__init__.py +105 -0
- user_packages legacy/BayesOpt.py +5659 -0
- user_packages legacy/BayesOpt_entry.py +513 -0
- user_packages legacy/BayesOpt_incremental.py +685 -0
- user_packages legacy/Pricing_Run.py +36 -0
- user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
- user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
- user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
- user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
- user_packages legacy/Try/BayesOpt legacy.py +3280 -0
- user_packages legacy/Try/BayesOpt.py +838 -0
- user_packages legacy/Try/BayesOptAll.py +1569 -0
- user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
- user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
- user_packages legacy/Try/BayesOptSearch.py +830 -0
- user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
- user_packages legacy/Try/BayesOptV1.py +1911 -0
- user_packages legacy/Try/BayesOptV10.py +2973 -0
- user_packages legacy/Try/BayesOptV11.py +3001 -0
- user_packages legacy/Try/BayesOptV12.py +3001 -0
- user_packages legacy/Try/BayesOptV2.py +2065 -0
- user_packages legacy/Try/BayesOptV3.py +2209 -0
- user_packages legacy/Try/BayesOptV4.py +2342 -0
- user_packages legacy/Try/BayesOptV5.py +2372 -0
- user_packages legacy/Try/BayesOptV6.py +2759 -0
- user_packages legacy/Try/BayesOptV7.py +2832 -0
- user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
- user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
- user_packages legacy/Try/BayesOptV9.py +2927 -0
- user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
- user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
- user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
- user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
- user_packages legacy/Try/xgbbayesopt.py +523 -0
- user_packages legacy/__init__.py +19 -0
- user_packages legacy/cli_common.py +124 -0
- user_packages legacy/notebook_utils.py +228 -0
- user_packages legacy/watchdog_run.py +202 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from .run_logging import configure_run_logging # type: ignore
|
|
13
|
+
except Exception: # pragma: no cover
|
|
14
|
+
try:
|
|
15
|
+
from run_logging import configure_run_logging # type: ignore
|
|
16
|
+
except Exception: # pragma: no cover
|
|
17
|
+
configure_run_logging = None # type: ignore
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _split_argv(argv: List[str]) -> tuple[List[str], List[str]]:
    """Split *argv* at the first ``--`` into (watchdog options, wrapped command).

    Raises:
        ValueError: when no ``--`` separator is present.
    """
    try:
        sep = argv.index("--")
    except ValueError:
        raise ValueError("Missing '--' separator before the command to run.")
    return argv[:sep], argv[sep + 1 :]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _kill_process_tree(pid: int) -> None:
    """Best-effort termination of *pid* and its descendants; never raises."""
    # Guard against bogus/sentinel pids.
    if pid <= 0:
        return
    if os.name == "nt":
        # Windows: taskkill /T walks the child tree, /F forces termination.
        # check=False: a vanished process is fine, this is best-effort.
        subprocess.run(
            ["taskkill", "/PID", str(pid), "/T", "/F"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
        return
    # POSIX: the child is started in its own session (see run_with_watchdog),
    # so signalling the process group reaches the whole tree.
    try:
        # 15 = SIGTERM (graceful), then 9 = SIGKILL after a short grace period.
        os.killpg(pid, 15)
        time.sleep(2)
        # NOTE: if the group already exited this raises and we fall through
        # to the except branch, which is harmless.
        os.killpg(pid, 9)
    except Exception:
        try:
            # Fall back to killing just the single process.
            os.kill(pid, 9)
        except Exception:
            pass
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _reader_thread(
    proc: subprocess.Popen, last_output_ts: dict, prefix: str = ""
) -> None:
    """Pump the child's merged stdout to our stdout, stamping activity time.

    ``last_output_ts["ts"]`` is refreshed on every line so the watchdog loop
    can measure how long the child has been silent.
    """
    stream = proc.stdout
    assert stream is not None
    out = sys.stdout
    for chunk in stream:
        last_output_ts["ts"] = time.time()
        if prefix:
            out.write(prefix)
        out.write(chunk)
        out.flush()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _parse_args(before_cmd: List[str], cmd: List[str]) -> argparse.Namespace:
    """Parse watchdog options (everything before ``--``).

    Errors out via the parser when the wrapped command is empty.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Run a command under a simple watchdog: if there is no stdout/stderr "
            "output for N seconds, kill the whole process tree and restart. "
            "Designed to pair with optuna_storage so BayesOpt can resume."
        )
    )
    parser.add_argument(
        "--idle-seconds",
        type=int,
        default=7200,
        help="Restart if there is no output for this many seconds (default: 7200).",
    )
    parser.add_argument(
        "--max-restarts",
        type=int,
        default=50,
        help="Maximum restart attempts (default: 50).",
    )
    parser.add_argument(
        "--restart-delay-seconds",
        type=int,
        default=10,
        help="Delay between restarts (default: 10).",
    )
    parser.add_argument(
        "--stop-on-nonzero-exit",
        action="store_true",
        help="If the command exits non-zero, stop instead of restarting.",
    )
    parsed = parser.parse_args(before_cmd)
    if not cmd:
        parser.error("Empty command after '--'.")
    return parsed
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def run_with_watchdog(
    cmd: List[str],
    idle_seconds: int,
    max_restarts: int,
    restart_delay_seconds: int,
    stop_on_nonzero_exit: bool,
) -> int:
    """Run *cmd*, restarting it whenever it stays silent for *idle_seconds*.

    Returns the child's final exit code: 0 on clean completion, otherwise
    the last non-zero code once restarts are exhausted (or immediately when
    ``stop_on_nonzero_exit`` is set and the exit was not an idle-kill).
    """
    # Sanitize knobs: at least 1s idle window, non-negative counters.
    idle_seconds = max(1, int(idle_seconds))
    max_restarts = max(0, int(max_restarts))
    restart_delay_seconds = max(0, int(restart_delay_seconds))

    attempt = 0
    while True:
        attempt += 1
        print(
            f"[watchdog] start attempt={attempt} idle_seconds={idle_seconds} cmd={cmd}",
            flush=True,
        )

        # Put the child in its own process group/session so the whole tree
        # can be killed at once (see _kill_process_tree).
        creationflags = 0
        start_new_session = False
        if os.name == "nt":
            creationflags = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0)
        else:
            start_new_session = True

        # stderr is merged into stdout so a single reader thread sees all
        # output; bufsize=1 gives line buffering in text mode.
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            universal_newlines=True,
            creationflags=creationflags,
            start_new_session=start_new_session,
        )

        # Shared mutable timestamp updated by the reader thread on each line.
        last_output_ts: dict = {"ts": time.time()}
        reader = threading.Thread(
            target=_reader_thread,
            args=(proc, last_output_ts),
            kwargs={"prefix": ""},
            daemon=True,
        )
        reader.start()

        killed_for_idle = False
        exit_code: Optional[int] = None
        # Poll loop: exit when the child finishes or goes idle too long.
        while True:
            exit_code = proc.poll()
            if exit_code is not None:
                break
            idle_for = time.time() - float(last_output_ts["ts"])
            if idle_for > idle_seconds:
                killed_for_idle = True
                print(
                    f"[watchdog] idle>{idle_seconds}s (idle_for={int(idle_for)}s), killing pid={proc.pid}",
                    flush=True,
                )
                _kill_process_tree(proc.pid)
                break
            time.sleep(5)

        # Reap the child; escalate to a tree-kill if it refuses to die.
        try:
            proc.wait(timeout=30)
        except Exception:
            _kill_process_tree(proc.pid)

        # exit_code is None when we broke out via the idle-kill path;
        # `or 1` also maps a (theoretical) 0 after a kill to failure.
        if exit_code is None:
            exit_code = proc.poll() or 1

        if exit_code == 0:
            print("[watchdog] finished with exit_code=0", flush=True)
            return 0

        # A genuine non-zero exit (not our idle-kill) may be fatal by config.
        if stop_on_nonzero_exit and not killed_for_idle:
            print(
                f"[watchdog] command exited non-zero (exit_code={exit_code}); stop.",
                flush=True,
            )
            return int(exit_code)

        # attempt 1 is the initial run, so max_restarts+1 total attempts.
        if attempt > max_restarts + 1:
            print(
                f"[watchdog] exceeded max_restarts={max_restarts}; last exit_code={exit_code}",
                flush=True,
            )
            return int(exit_code)

        print(
            f"[watchdog] restart in {restart_delay_seconds}s (exit_code={exit_code}, killed_for_idle={killed_for_idle})",
            flush=True,
        )
        if restart_delay_seconds:
            time.sleep(restart_delay_seconds)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: ``watchdog [options] -- command ...``.

    Returns the wrapped command's final exit code.
    """
    if configure_run_logging:
        configure_run_logging(prefix="watchdog")
    raw = list(sys.argv[1:] if argv is None else argv)
    watchdog_opts, wrapped_cmd = _split_argv(raw)
    opts = _parse_args(watchdog_opts, wrapped_cmd)
    return run_with_watchdog(
        cmd=wrapped_cmd,
        idle_seconds=opts.idle_seconds,
        max_restarts=opts.max_restarts,
        restart_delay_seconds=opts.restart_delay_seconds,
        stop_on_nonzero_exit=bool(opts.stop_on_nonzero_exit),
    )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
if __name__ == "__main__":
    # Propagate the watchdog's exit code to the shell.
    sys.exit(main())
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .calibration import apply_calibration, fit_calibration_factor
|
|
4
|
+
from .data_quality import detect_leakage, profile_columns, validate_schema
|
|
5
|
+
from .exposure import aggregate_policy_level, build_frequency_severity, compute_exposure
|
|
6
|
+
from .factors import bin_numeric, build_factor_table
|
|
7
|
+
from .monitoring import population_stability_index, psi_report
|
|
8
|
+
from .rate_table import RateTable, apply_factor_tables, compute_base_rate, rate_premium
|
|
9
|
+
|
|
10
|
+
# Explicit public API of the pricing package, re-exported from the
# calibration / data_quality / exposure / factors / monitoring / rate_table
# submodules imported above.
__all__ = [
    "apply_calibration",
    "fit_calibration_factor",
    "detect_leakage",
    "profile_columns",
    "validate_schema",
    "aggregate_policy_level",
    "build_frequency_severity",
    "compute_exposure",
    "bin_numeric",
    "build_factor_table",
    "population_stability_index",
    "psi_report",
    "RateTable",
    "apply_factor_tables",
    "compute_base_rate",
    "rate_premium",
]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def fit_calibration_factor(
    pred: np.ndarray,
    actual: np.ndarray,
    *,
    weight: Optional[np.ndarray] = None,
    target_lr: Optional[float] = None,
) -> float:
    """Fit a scalar calibration factor for premiums or pure premiums.

    Without ``target_lr`` the factor rebalances totals (sum(actual)/sum(pred));
    with ``target_lr`` it is chosen so the implied loss ratio hits the target.
    Returns 1.0 when the (weighted) predicted total is not positive.

    Raises:
        ValueError: on weight/pred length mismatch or non-positive target_lr.
    """
    pred_vec = np.asarray(pred, dtype=float).reshape(-1)
    actual_vec = np.asarray(actual, dtype=float).reshape(-1)
    if weight is not None:
        weight_vec = np.asarray(weight, dtype=float).reshape(-1)
        if weight_vec.shape[0] != pred_vec.shape[0]:
            raise ValueError("weight length must match pred length.")
        pred_vec = pred_vec * weight_vec
        actual_vec = actual_vec * weight_vec

    total_pred = float(pred_vec.sum())
    total_actual = float(actual_vec.sum())
    if total_pred <= 0:
        return 1.0
    if target_lr is None:
        return total_actual / total_pred
    if target_lr <= 0:
        raise ValueError("target_lr must be positive.")
    return total_actual / (target_lr * total_pred)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def apply_calibration(pred: np.ndarray, factor: float) -> np.ndarray:
    """Scale predictions by a scalar calibration *factor* (float result array)."""
    return np.asarray(pred, dtype=float) * float(factor)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Callable, Dict, Iterable, Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _dtype_matches(actual: np.dtype, expected) -> bool:
    """Return True when the observed dtype satisfies the expected spec.

    ``expected`` may be a predicate, a collection of alternatives, or anything
    ``np.dtype`` accepts; the string "category" matches pandas categoricals.

    Fix: ``np.dtype("category")`` raises TypeError (categorical is a pandas
    extension dtype, not a numpy one), so the previous categorical comparison
    crashed whenever *actual* was categorical and never matched
    ``expected="category"``. Categorical is now handled before the numpy
    conversion, and ``np.issubdtype`` is guarded against extension dtypes.
    """
    if callable(expected):
        return bool(expected(actual))
    if isinstance(expected, (list, tuple, set)):
        return any(_dtype_matches(actual, item) for item in expected)
    # Handle categoricals before np.dtype(): "category" is pandas-only.
    if isinstance(actual, pd.CategoricalDtype):
        return str(expected) == "category"
    try:
        expected_dtype = np.dtype(expected)
    except Exception:
        return False
    if pd.api.types.is_string_dtype(actual) and expected_dtype.kind in {"U", "S", "O"}:
        return True
    try:
        if np.issubdtype(actual, expected_dtype):
            return True
    except Exception:
        # Extension dtypes are outside the numpy hierarchy; fall through.
        pass
    return pd.api.types.is_dtype_equal(actual, expected_dtype)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def validate_schema(
    df: pd.DataFrame,
    required_cols: Iterable[str],
    dtypes: Optional[Dict[str, object]] = None,
    *,
    raise_on_error: bool = True,
) -> Dict[str, object]:
    """Check that *df* has the required columns and, optionally, dtypes.

    Returns ``{"ok", "missing", "dtype_mismatch"}``. Raises ValueError on any
    failure unless ``raise_on_error`` is False. Columns listed in ``dtypes``
    but absent from *df* are ignored (only ``required_cols`` drives "missing").
    """
    missing = [name for name in required_cols if name not in df.columns]
    dtype_mismatch: Dict[str, Dict[str, str]] = {}
    for col, expected in (dtypes or {}).items():
        if col not in df.columns:
            continue
        observed = df[col].dtype
        if not _dtype_matches(observed, expected):
            dtype_mismatch[col] = {
                "expected": str(expected),
                "actual": str(observed),
            }

    result = {
        "ok": not missing and not dtype_mismatch,
        "missing": missing,
        "dtype_mismatch": dtype_mismatch,
    }
    if raise_on_error and not result["ok"]:
        raise ValueError(f"Schema validation failed: {result}")
    return result
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def profile_columns(
    df: pd.DataFrame, cols: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """Profile columns: dtype, missing/unique ratios, plus numeric min/max/mean.

    One output row per column (all columns when *cols* is None); the min/max/
    mean fields are present only for numeric columns.
    """
    selected = list(df.columns) if cols is None else list(cols)
    records = []
    for name in selected:
        col = df[name]
        size = len(col)
        distinct = int(col.nunique(dropna=True))
        record = {
            "column": name,
            "dtype": str(col.dtype),
            "missing_ratio": float(col.isna().mean()) if size else 0.0,
            "n_unique": distinct,
            "unique_ratio": float(distinct / size) if size else 0.0,
        }
        if pd.api.types.is_numeric_dtype(col):
            record["min"] = float(col.min(skipna=True))
            record["max"] = float(col.max(skipna=True))
            record["mean"] = float(col.mean(skipna=True))
        records.append(record)
    return pd.DataFrame(records)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def detect_leakage(
    df: pd.DataFrame,
    target_col: str,
    *,
    exclude_cols: Optional[Iterable[str]] = None,
    corr_threshold: float = 0.995,
) -> pd.DataFrame:
    """Detect simple leakage via identical columns or very high correlation.

    Returns a frame with columns ``feature``/``reason``/``score`` sorted by
    score descending. Fix: when nothing is flagged, the previous code called
    ``sort_values(by="score")`` on an empty, column-less DataFrame and raised
    KeyError; an empty frame with the expected columns is now returned.

    Raises:
        ValueError: when *target_col* is not a column of *df*.
    """
    if target_col not in df.columns:
        raise ValueError("target_col not found.")
    exclude = set(exclude_cols or [])
    exclude.add(target_col)
    target = df[target_col]
    results = []
    for col in df.columns:
        if col in exclude:
            continue
        series = df[col]
        reason = None
        score = None
        if series.equals(target):
            reason = "identical"
            score = 1.0
        elif pd.api.types.is_numeric_dtype(series) and pd.api.types.is_numeric_dtype(target):
            corr = series.corr(target)
            if pd.notna(corr) and abs(corr) >= corr_threshold:
                reason = "high_corr"
                score = float(corr)
        if reason:
            results.append({"feature": col, "reason": reason, "score": score})
    if not results:
        # Keep a stable schema even when no leakage is found.
        return pd.DataFrame(columns=["feature", "reason", "score"])
    return pd.DataFrame(results).sort_values(by="score", ascending=False).reset_index(drop=True)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compute_exposure(
    df: pd.DataFrame,
    start_col: str,
    end_col: str,
    *,
    unit: str = "year",
    inclusive: bool = False,
    clip_min: Optional[float] = 0.0,
    clip_max: Optional[float] = None,
) -> pd.Series:
    """Compute exposure from start/end date columns.

    ``unit`` is "day", "month" (30-day months) or "year" (365.25-day years);
    ``inclusive`` counts the end date as covered. NaN/inf become 0 and the
    result is clipped to [clip_min, clip_max] where those bounds are given.
    """
    start_dates = pd.to_datetime(df[start_col])
    end_dates = pd.to_datetime(df[end_col])
    days = (end_dates - start_dates).dt.days.astype(float)
    if inclusive:
        days = days + 1.0
    divisors = {"day": 1.0, "month": 30.0, "year": 365.25}
    if unit not in divisors:
        raise ValueError("unit must be one of: day, month, year.")
    exposure = days / divisors[unit]

    exposure = exposure.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    if clip_min is not None:
        exposure = exposure.clip(lower=clip_min)
    if clip_max is not None:
        exposure = exposure.clip(upper=clip_max)
    return exposure
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def aggregate_policy_level(
    df: pd.DataFrame,
    policy_keys: Iterable[str],
    *,
    exposure_col: str,
    claim_count_col: Optional[str] = None,
    claim_amount_col: Optional[str] = None,
    weight_col: Optional[str] = None,
) -> pd.DataFrame:
    """Aggregate event-level rows to policy-level records (sums per key group)."""
    candidates = [exposure_col, claim_count_col, claim_amount_col, weight_col]
    spec = {name: "sum" for name in candidates if name}
    return df.groupby(list(policy_keys), dropna=False).agg(spec).reset_index()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def build_frequency_severity(
    df: pd.DataFrame,
    *,
    exposure_col: str,
    claim_count_col: str,
    claim_amount_col: str,
    zero_severity: float = 0.0,
) -> pd.DataFrame:
    """Compute frequency, severity and pure premium from counts and losses.

    Rows with zero exposure get frequency 0; rows with zero claims get
    ``zero_severity``. Returns a copy of *df* with three added columns.
    """
    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
    n_claims = df[claim_count_col].to_numpy(dtype=float, copy=False)
    losses = df[claim_amount_col].to_numpy(dtype=float, copy=False)

    with np.errstate(divide="ignore", invalid="ignore"):
        freq = np.where(exposure > 0, n_claims / exposure, 0.0)
        sev = np.where(n_claims > 0, losses / n_claims, zero_severity)

    result = df.copy()
    result["frequency"] = freq
    result["severity"] = sev
    result["pure_premium"] = freq * sev
    return result
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def bin_numeric(
    series: pd.Series,
    *,
    bins: int = 10,
    method: str = "quantile",
    labels: Optional[list] = None,
    include_lowest: bool = True,
) -> Tuple[pd.Series, np.ndarray]:
    """Bin a numeric series and return ``(binned, left_bin_edges)``.

    Fix: the previous implementation read ``binned.cat.categories.left``,
    which raises when ``labels`` is supplied (the categories are then the
    label values, not Interval objects). ``retbins=True`` yields the edges
    regardless of labelling; the rightmost edge is dropped so the result is
    still one left edge per bin.

    Raises:
        ValueError: for an unknown ``method``.
    """
    if method == "quantile":
        binned, edges = pd.qcut(
            series, q=bins, duplicates="drop", labels=labels, retbins=True
        )
    elif method == "uniform":
        binned, edges = pd.cut(
            series, bins=bins, include_lowest=include_lowest, labels=labels, retbins=True
        )
    else:
        raise ValueError("method must be one of: quantile, uniform.")
    return binned, np.asarray(edges, dtype=float)[:-1]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_factor_table(
    df: pd.DataFrame,
    *,
    factor_col: str,
    loss_col: str,
    exposure_col: str,
    weight_col: Optional[str] = None,
    base_rate: Optional[float] = None,
    smoothing: float = 0.0,
    min_exposure: Optional[float] = None,
) -> pd.DataFrame:
    """Build a per-level factor table with smoothed rate and relativity.

    Losses and exposure (optionally weighted) are summed per level of
    *factor_col*; each level's rate is shrunk toward ``base_rate`` by
    ``smoothing`` pseudo-exposure units. Levels below ``min_exposure`` are
    reset to the base rate (relativity 1.0) and flagged.
    """
    loss_values = df[loss_col].to_numpy(dtype=float, copy=False)
    exposure_values = df[exposure_col].to_numpy(dtype=float, copy=False)
    if weight_col and weight_col in df.columns:
        w = df[weight_col].to_numpy(dtype=float, copy=False)
        loss_values = loss_values * w
        exposure_values = exposure_values * w

    per_level = (
        pd.DataFrame(
            {
                "factor": df[factor_col],
                "loss": loss_values,
                "exposure": exposure_values,
            }
        )
        .groupby("factor", dropna=False)
        .agg({"loss": "sum", "exposure": "sum"})
        .reset_index()
        .rename(columns={"factor": "level"})
    )

    if base_rate is None:
        total_exposure = float(per_level["exposure"].sum())
        total_loss = float(per_level["loss"].sum())
        base_rate = total_loss / total_exposure if total_exposure > 0 else 0.0

    exp_arr = per_level["exposure"].to_numpy(dtype=float, copy=False)
    loss_arr = per_level["loss"].to_numpy(dtype=float, copy=False)
    with np.errstate(divide="ignore", invalid="ignore"):
        rate = np.where(
            exp_arr > 0,
            (loss_arr + smoothing * base_rate) / (exp_arr + smoothing),
            0.0,
        )
    relativity = np.where(base_rate > 0, rate / base_rate, 1.0)

    per_level["rate"] = rate
    per_level["relativity"] = relativity
    per_level["base_rate"] = float(base_rate)

    if min_exposure is None:
        per_level["is_low_exposure"] = False
    else:
        low = per_level["exposure"] < float(min_exposure)
        per_level.loc[low, "relativity"] = 1.0
        per_level.loc[low, "rate"] = float(base_rate)
        per_level["is_low_exposure"] = low
    return per_level
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def psi_numeric(
    expected: np.ndarray,
    actual: np.ndarray,
    *,
    bins: int = 10,
    strategy: str = "quantile",
    eps: float = 1e-6,
) -> float:
    """Population stability index between two numeric samples.

    Bin edges come from the expected sample's quantiles ("quantile") or span
    the combined range ("uniform"). NaNs are dropped; returns 0.0 for empty
    input or when the binning collapses below two edges. Proportions are
    clipped to ``eps`` to keep the log finite.
    """
    exp_vals = np.asarray(expected, dtype=float)
    act_vals = np.asarray(actual, dtype=float)
    exp_vals = exp_vals[~np.isnan(exp_vals)]
    act_vals = act_vals[~np.isnan(act_vals)]
    if not exp_vals.size or not act_vals.size:
        return 0.0

    if strategy == "quantile":
        edges = np.unique(np.quantile(exp_vals, np.linspace(0, 1, bins + 1)))
    elif strategy == "uniform":
        lo = min(exp_vals.min(), act_vals.min())
        hi = max(exp_vals.max(), act_vals.max())
        edges = np.linspace(lo, hi, bins + 1)
    else:
        raise ValueError("strategy must be one of: quantile, uniform.")

    if edges.size < 2:
        return 0.0

    exp_hist = np.histogram(exp_vals, bins=edges)[0]
    act_hist = np.histogram(act_vals, bins=edges)[0]
    exp_frac = np.clip(exp_hist / max(exp_hist.sum(), 1), eps, 1.0)
    act_frac = np.clip(act_hist / max(act_hist.sum(), 1), eps, 1.0)
    return float(np.sum((act_frac - exp_frac) * np.log(act_frac / exp_frac)))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def psi_categorical(
    expected: Iterable,
    actual: Iterable,
    *,
    eps: float = 1e-6,
) -> float:
    """PSI over categorical levels (union of levels seen in either sample).

    Returns 0.0 when no non-null levels exist; proportions are clipped to
    ``eps`` to keep the log finite for levels absent from one side.
    """
    exp_series = pd.Series(expected)
    act_series = pd.Series(actual)
    levels = pd.Index(exp_series.dropna().unique()).union(act_series.dropna().unique())
    if levels.empty:
        return 0.0
    exp_n = exp_series.value_counts().reindex(levels, fill_value=0)
    act_n = act_series.value_counts().reindex(levels, fill_value=0)
    exp_frac = np.clip((exp_n / max(exp_n.sum(), 1)).to_numpy(dtype=float), eps, 1.0)
    act_frac = np.clip((act_n / max(act_n.sum(), 1)).to_numpy(dtype=float), eps, 1.0)
    return float(np.sum((act_frac - exp_frac) * np.log(act_frac / exp_frac)))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def population_stability_index(
    expected: np.ndarray,
    actual: np.ndarray,
    *,
    bins: int = 10,
    strategy: str = "quantile",
) -> float:
    """Dispatch to numeric or categorical PSI based on the inputs' dtypes."""
    both_numeric = pd.api.types.is_numeric_dtype(expected) and pd.api.types.is_numeric_dtype(
        actual
    )
    if both_numeric:
        return psi_numeric(expected, actual, bins=bins, strategy=strategy)
    return psi_categorical(expected, actual)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def psi_report(
    expected_df: pd.DataFrame,
    actual_df: pd.DataFrame,
    *,
    features: Optional[Iterable[str]] = None,
    bins: int = 10,
    strategy: str = "quantile",
) -> pd.DataFrame:
    """Per-feature PSI between two frames, sorted by PSI descending.

    Features absent from either frame are skipped. Fix: when no feature is
    shared, the previous code called ``sort_values(by="psi")`` on an empty,
    column-less DataFrame and raised KeyError; an empty frame with the
    expected columns is now returned instead.
    """
    feats = list(features) if features is not None else list(expected_df.columns)
    rows = []
    for feat in feats:
        if feat not in expected_df.columns or feat not in actual_df.columns:
            continue
        psi = population_stability_index(
            expected_df[feat].to_numpy(),
            actual_df[feat].to_numpy(),
            bins=bins,
            strategy=strategy,
        )
        rows.append({"feature": feat, "psi": psi})
    if not rows:
        # Keep a stable schema even when no feature overlaps.
        return pd.DataFrame(columns=["feature", "psi"])
    return pd.DataFrame(rows).sort_values(by="psi", ascending=False).reset_index(drop=True)
|