ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,211 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import threading
8
+ import time
9
+ from typing import List, Optional
10
+
11
+ try:
12
+ from .run_logging import configure_run_logging # type: ignore
13
+ except Exception: # pragma: no cover
14
+ try:
15
+ from run_logging import configure_run_logging # type: ignore
16
+ except Exception: # pragma: no cover
17
+ configure_run_logging = None # type: ignore
18
+
19
+
20
+ def _split_argv(argv: List[str]) -> tuple[List[str], List[str]]:
21
+ if "--" not in argv:
22
+ raise ValueError("Missing '--' separator before the command to run.")
23
+ idx = argv.index("--")
24
+ return argv[:idx], argv[idx + 1 :]
25
+
26
+
27
+ def _kill_process_tree(pid: int) -> None:
28
+ if pid <= 0:
29
+ return
30
+ if os.name == "nt":
31
+ subprocess.run(
32
+ ["taskkill", "/PID", str(pid), "/T", "/F"],
33
+ stdout=subprocess.DEVNULL,
34
+ stderr=subprocess.DEVNULL,
35
+ check=False,
36
+ )
37
+ return
38
+ try:
39
+ os.killpg(pid, 15)
40
+ time.sleep(2)
41
+ os.killpg(pid, 9)
42
+ except Exception:
43
+ try:
44
+ os.kill(pid, 9)
45
+ except Exception:
46
+ pass
47
+
48
+
49
+ def _reader_thread(
50
+ proc: subprocess.Popen, last_output_ts: dict, prefix: str = ""
51
+ ) -> None:
52
+ assert proc.stdout is not None
53
+ for line in proc.stdout:
54
+ last_output_ts["ts"] = time.time()
55
+ if prefix:
56
+ sys.stdout.write(prefix)
57
+ sys.stdout.write(line)
58
+ sys.stdout.flush()
59
+
60
+
61
+ def _parse_args(before_cmd: List[str], cmd: List[str]) -> argparse.Namespace:
62
+ parser = argparse.ArgumentParser(
63
+ description=(
64
+ "Run a command under a simple watchdog: if there is no stdout/stderr "
65
+ "output for N seconds, kill the whole process tree and restart. "
66
+ "Designed to pair with optuna_storage so BayesOpt can resume."
67
+ )
68
+ )
69
+ parser.add_argument(
70
+ "--idle-seconds",
71
+ type=int,
72
+ default=7200,
73
+ help="Restart if there is no output for this many seconds (default: 7200).",
74
+ )
75
+ parser.add_argument(
76
+ "--max-restarts",
77
+ type=int,
78
+ default=50,
79
+ help="Maximum restart attempts (default: 50).",
80
+ )
81
+ parser.add_argument(
82
+ "--restart-delay-seconds",
83
+ type=int,
84
+ default=10,
85
+ help="Delay between restarts (default: 10).",
86
+ )
87
+ parser.add_argument(
88
+ "--stop-on-nonzero-exit",
89
+ action="store_true",
90
+ help="If the command exits non-zero, stop instead of restarting.",
91
+ )
92
+ args = parser.parse_args(before_cmd)
93
+ if not cmd:
94
+ parser.error("Empty command after '--'.")
95
+ return args
96
+
97
+
98
def run_with_watchdog(
    cmd: List[str],
    idle_seconds: int,
    max_restarts: int,
    restart_delay_seconds: int,
    stop_on_nonzero_exit: bool,
) -> int:
    """Run *cmd*, restarting it whenever its output goes silent too long.

    The child's stdout/stderr are merged and streamed through a reader
    thread that timestamps every line. If no line arrives for
    *idle_seconds*, the whole process tree is killed and the command is
    restarted (up to *max_restarts* extra attempts). Returns the final
    exit code (0 on clean completion).
    """
    # Clamp inputs to sane minimums.
    idle_seconds = max(1, int(idle_seconds))
    max_restarts = max(0, int(max_restarts))
    restart_delay_seconds = max(0, int(restart_delay_seconds))

    attempt = 0
    while True:
        attempt += 1
        print(
            f"[watchdog] start attempt={attempt} idle_seconds={idle_seconds} cmd={cmd}",
            flush=True,
        )

        # Put the child in its own process group/session so the entire
        # tree can be killed at once (see _kill_process_tree).
        creationflags = 0
        start_new_session = False
        if os.name == "nt":
            creationflags = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0)
        else:
            start_new_session = True

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so one stream tracks liveness
            text=True,
            bufsize=1,  # line-buffered, so idle detection is per line
            universal_newlines=True,
            creationflags=creationflags,
            start_new_session=start_new_session,
        )

        # Shared mutable timestamp updated by the reader thread.
        last_output_ts: dict = {"ts": time.time()}
        reader = threading.Thread(
            target=_reader_thread,
            args=(proc, last_output_ts),
            kwargs={"prefix": ""},
            daemon=True,
        )
        reader.start()

        killed_for_idle = False
        exit_code: Optional[int] = None
        # Poll the child every 5s; break on exit or on idle timeout.
        while True:
            exit_code = proc.poll()
            if exit_code is not None:
                break
            idle_for = time.time() - float(last_output_ts["ts"])
            if idle_for > idle_seconds:
                killed_for_idle = True
                print(
                    f"[watchdog] idle>{idle_seconds}s (idle_for={int(idle_for)}s), killing pid={proc.pid}",
                    flush=True,
                )
                _kill_process_tree(proc.pid)
                break
            time.sleep(5)

        # Reap the child; force-kill if it refuses to die within 30s.
        try:
            proc.wait(timeout=30)
        except Exception:
            _kill_process_tree(proc.pid)

        if exit_code is None:
            # Killed-for-idle path: treat a missing/zero code as failure (1).
            exit_code = proc.poll() or 1

        if exit_code == 0:
            print("[watchdog] finished with exit_code=0", flush=True)
            return 0

        # A genuine non-zero exit (not our kill) may stop the loop.
        if stop_on_nonzero_exit and not killed_for_idle:
            print(
                f"[watchdog] command exited non-zero (exit_code={exit_code}); stop.",
                flush=True,
            )
            return int(exit_code)

        # attempt 1 is the initial run, so allow max_restarts + 1 attempts.
        if attempt > max_restarts + 1:
            print(
                f"[watchdog] exceeded max_restarts={max_restarts}; last exit_code={exit_code}",
                flush=True,
            )
            return int(exit_code)

        print(
            f"[watchdog] restart in {restart_delay_seconds}s (exit_code={exit_code}, killed_for_idle={killed_for_idle})",
            flush=True,
        )
        if restart_delay_seconds:
            time.sleep(restart_delay_seconds)
193
+
194
+
195
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: configure logging, split argv at '--', run the watchdog."""
    if configure_run_logging:
        configure_run_logging(prefix="watchdog")
    raw = list(sys.argv[1:] if argv is None else argv)
    before_cmd, cmd = _split_argv(raw)
    opts = _parse_args(before_cmd, cmd)
    return run_with_watchdog(
        cmd=cmd,
        idle_seconds=opts.idle_seconds,
        max_restarts=opts.max_restarts,
        restart_delay_seconds=opts.restart_delay_seconds,
        stop_on_nonzero_exit=bool(opts.stop_on_nonzero_exit),
    )


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,27 @@
1
"""Public API of the ``pricing`` subpackage.

Re-exports the calibration, data-quality, exposure, factor-table,
monitoring and rate-table helpers so callers can import them directly
from ``pricing`` instead of the individual modules.
"""
from __future__ import annotations

from .calibration import apply_calibration, fit_calibration_factor
from .data_quality import detect_leakage, profile_columns, validate_schema
from .exposure import aggregate_policy_level, build_frequency_severity, compute_exposure
from .factors import bin_numeric, build_factor_table
from .monitoring import population_stability_index, psi_report
from .rate_table import RateTable, apply_factor_tables, compute_base_rate, rate_premium

# Explicit public surface; keep in sync with the imports above.
__all__ = [
    "apply_calibration",
    "fit_calibration_factor",
    "detect_leakage",
    "profile_columns",
    "validate_schema",
    "aggregate_policy_level",
    "build_frequency_severity",
    "compute_exposure",
    "bin_numeric",
    "build_factor_table",
    "population_stability_index",
    "psi_report",
    "RateTable",
    "apply_factor_tables",
    "compute_base_rate",
    "rate_premium",
]
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+
7
+
8
def fit_calibration_factor(
    pred: np.ndarray,
    actual: np.ndarray,
    *,
    weight: Optional[np.ndarray] = None,
    target_lr: Optional[float] = None,
) -> float:
    """Fit a scalar calibration factor for premiums or pure premiums.

    Parameters
    ----------
    pred : array-like of predicted premiums / pure premiums.
    actual : array-like of observed losses, same length as *pred*.
    weight : optional per-row weights applied to both series.
    target_lr : optional target loss ratio; when given, the factor scales
        predictions so that actual / (factor * pred) equals *target_lr*.

    Returns
    -------
    float factor; 1.0 when the prediction total is non-positive.

    Raises
    ------
    ValueError if lengths mismatch or *target_lr* is non-positive.
    """
    pred = np.asarray(pred, dtype=float).reshape(-1)
    actual = np.asarray(actual, dtype=float).reshape(-1)
    # Fix: a length mismatch previously went unnoticed because the two
    # series are only ever summed, never combined element-wise.
    if actual.shape[0] != pred.shape[0]:
        raise ValueError("actual length must match pred length.")
    if weight is not None:
        weight = np.asarray(weight, dtype=float).reshape(-1)
        if weight.shape[0] != pred.shape[0]:
            raise ValueError("weight length must match pred length.")
        pred = pred * weight
        actual = actual * weight

    pred_sum = float(np.sum(pred))
    actual_sum = float(np.sum(actual))
    if pred_sum <= 0:
        # Nothing meaningful to scale against; identity factor.
        return 1.0

    if target_lr is None:
        return actual_sum / pred_sum
    if target_lr <= 0:
        raise ValueError("target_lr must be positive.")
    return actual_sum / (target_lr * pred_sum)
35
+
36
+
37
def apply_calibration(pred: np.ndarray, factor: float) -> np.ndarray:
    """Scale predictions by a fitted calibration *factor*."""
    return np.asarray(pred, dtype=float) * float(factor)
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Callable, Dict, Iterable, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def _dtype_matches(actual: np.dtype, expected) -> bool:
10
+ if callable(expected):
11
+ return bool(expected(actual))
12
+ if isinstance(expected, (list, tuple, set)):
13
+ return any(_dtype_matches(actual, item) for item in expected)
14
+ try:
15
+ expected_dtype = np.dtype(expected)
16
+ except Exception:
17
+ return False
18
+ if pd.api.types.is_categorical_dtype(actual) and expected_dtype == np.dtype("category"):
19
+ return True
20
+ if pd.api.types.is_string_dtype(actual) and expected_dtype.kind in {"U", "S", "O"}:
21
+ return True
22
+ if np.issubdtype(actual, expected_dtype):
23
+ return True
24
+ return pd.api.types.is_dtype_equal(actual, expected_dtype)
25
+
26
+
27
def validate_schema(
    df: pd.DataFrame,
    required_cols: Iterable[str],
    dtypes: Optional[Dict[str, object]] = None,
    *,
    raise_on_error: bool = True,
) -> Dict[str, object]:
    """Validate that *df* has the required columns and (optionally) dtypes.

    Returns a dict with keys ``ok``, ``missing`` and ``dtype_mismatch``.
    When *raise_on_error* is set and a problem was found, raises
    ``ValueError`` instead of returning.
    """
    required = list(required_cols)
    missing = [name for name in required if name not in df.columns]

    dtype_mismatch: Dict[str, Dict[str, str]] = {}
    for col, expected in (dtypes or {}).items():
        if col not in df.columns:
            # Missing columns are reported by the check above, not here.
            continue
        actual = df[col].dtype
        if not _dtype_matches(actual, expected):
            dtype_mismatch[col] = {
                "expected": str(expected),
                "actual": str(actual),
            }

    ok = not (missing or dtype_mismatch)
    result = {"ok": ok, "missing": missing, "dtype_mismatch": dtype_mismatch}
    if raise_on_error and not ok:
        raise ValueError(f"Schema validation failed: {result}")
    return result
54
+
55
+
56
def profile_columns(
    df: pd.DataFrame, cols: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """Profile columns: dtype, missingness, cardinality, numeric min/max/mean."""
    selected = list(df.columns) if cols is None else list(cols)
    records = []
    for name in selected:
        series = df[name]
        total = len(series)
        distinct = int(series.nunique(dropna=True))
        record = {
            "column": name,
            "dtype": str(series.dtype),
            # Guard the ratios against empty frames (division by zero).
            "missing_ratio": float(series.isna().mean()) if total else 0.0,
            "n_unique": distinct,
            "unique_ratio": float(distinct / total) if total else 0.0,
        }
        if pd.api.types.is_numeric_dtype(series):
            record["min"] = float(series.min(skipna=True))
            record["max"] = float(series.max(skipna=True))
            record["mean"] = float(series.mean(skipna=True))
        records.append(record)
    return pd.DataFrame(records)
85
+
86
+
87
def detect_leakage(
    df: pd.DataFrame,
    target_col: str,
    *,
    exclude_cols: Optional[Iterable[str]] = None,
    corr_threshold: float = 0.995,
) -> pd.DataFrame:
    """Detect simple leakage via identical columns or very high correlation.

    Returns a DataFrame with columns ``feature``/``reason``/``score``
    sorted by descending score; empty (but with those columns) when no
    suspicious feature is found.

    Raises
    ------
    ValueError if *target_col* is not in *df*.
    """
    if target_col not in df.columns:
        raise ValueError("target_col not found.")
    exclude = set(exclude_cols or [])
    exclude.add(target_col)
    target = df[target_col]
    results = []
    for col in df.columns:
        if col in exclude:
            continue
        series = df[col]
        reason = None
        score = None
        if series.equals(target):
            reason = "identical"
            score = 1.0
        elif pd.api.types.is_numeric_dtype(series) and pd.api.types.is_numeric_dtype(target):
            corr = series.corr(target)
            if pd.notna(corr) and abs(corr) >= corr_threshold:
                reason = "high_corr"
                score = float(corr)
        if reason:
            results.append({"feature": col, "reason": reason, "score": score})
    # Fix: build with explicit columns so sort_values works even when no
    # leakage was found (pd.DataFrame([]) has no "score" -> KeyError).
    report = pd.DataFrame(results, columns=["feature", "reason", "score"])
    return report.sort_values(by="score", ascending=False).reset_index(drop=True)
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
def compute_exposure(
    df: pd.DataFrame,
    start_col: str,
    end_col: str,
    *,
    unit: str = "year",
    inclusive: bool = False,
    clip_min: Optional[float] = 0.0,
    clip_max: Optional[float] = None,
) -> pd.Series:
    """Compute exposure from start/end date columns.

    ``unit`` selects days, 30-day months or 365.25-day years; *inclusive*
    counts the end date as a covered day. Invalid/NaT spans become 0.0,
    and the result is optionally clipped to [clip_min, clip_max].
    """
    start = pd.to_datetime(df[start_col])
    end = pd.to_datetime(df[end_col])
    days = (end - start).dt.days.astype(float)
    if inclusive:
        days = days + 1.0
    divisors = {"day": 1.0, "month": 30.0, "year": 365.25}
    if unit not in divisors:
        raise ValueError("unit must be one of: day, month, year.")
    exposure = days / divisors[unit]

    # Normalise non-finite spans, then apply the optional bounds.
    exposure = exposure.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    if clip_min is not None:
        exposure = exposure.clip(lower=clip_min)
    if clip_max is not None:
        exposure = exposure.clip(upper=clip_max)
    return exposure
40
+
41
+
42
def aggregate_policy_level(
    df: pd.DataFrame,
    policy_keys: Iterable[str],
    *,
    exposure_col: str,
    claim_count_col: Optional[str] = None,
    claim_amount_col: Optional[str] = None,
    weight_col: Optional[str] = None,
) -> pd.DataFrame:
    """Aggregate event-level rows to policy-level records (sums per key)."""
    spec = {exposure_col: "sum"}
    # Only sum the optional measures that were actually supplied.
    for optional_col in (claim_count_col, claim_amount_col, weight_col):
        if optional_col:
            spec[optional_col] = "sum"
    return df.groupby(list(policy_keys), dropna=False).agg(spec).reset_index()
61
+
62
+
63
def build_frequency_severity(
    df: pd.DataFrame,
    *,
    exposure_col: str,
    claim_count_col: str,
    claim_amount_col: str,
    zero_severity: float = 0.0,
) -> pd.DataFrame:
    """Derive frequency, severity and pure premium columns from counts/losses.

    Rows with zero exposure get frequency 0; rows with zero claims get
    *zero_severity*. The input frame is not modified.
    """
    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)
    counts = df[claim_count_col].to_numpy(dtype=float, copy=False)
    amounts = df[claim_amount_col].to_numpy(dtype=float, copy=False)

    # Silence 0/0 warnings; np.where selects the safe fallback anyway.
    with np.errstate(divide="ignore", invalid="ignore"):
        freq = np.where(exposure > 0, counts / exposure, 0.0)
        sev = np.where(counts > 0, amounts / counts, zero_severity)

    result = df.copy()
    result["frequency"] = freq
    result["severity"] = sev
    result["pure_premium"] = freq * sev
    return result
@@ -0,0 +1,91 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
def bin_numeric(
    series: pd.Series,
    *,
    bins: int = 10,
    method: str = "quantile",
    labels: Optional[list] = None,
    include_lowest: bool = True,
) -> Tuple[pd.Series, np.ndarray]:
    """Bin a numeric series and return (binned, left bin edges).

    Fix: when custom *labels* are supplied, the categories are the labels
    rather than intervals, so ``binned.cat.categories.left`` raised
    AttributeError. We now request ``retbins=True`` and take the edges
    from there in that case; the labels-free path is unchanged.
    """
    if method == "quantile":
        binned, edges = pd.qcut(
            series, q=bins, duplicates="drop", labels=labels, retbins=True
        )
    elif method == "uniform":
        binned, edges = pd.cut(
            series, bins=bins, include_lowest=include_lowest, labels=labels, retbins=True
        )
    else:
        raise ValueError("method must be one of: quantile, uniform.")
    if labels is None:
        # Interval categories: keep the original left-edge extraction.
        bin_edges = binned.cat.categories.left.to_numpy()
    else:
        # Label categories carry no interval info; derive left edges
        # from the raw bin boundaries instead.
        bin_edges = np.asarray(edges)[:-1]
    return binned, bin_edges
27
+
28
+
29
def build_factor_table(
    df: pd.DataFrame,
    *,
    factor_col: str,
    loss_col: str,
    exposure_col: str,
    weight_col: Optional[str] = None,
    base_rate: Optional[float] = None,
    smoothing: float = 0.0,
    min_exposure: Optional[float] = None,
) -> pd.DataFrame:
    """Build a factor table with rate and relativity.

    Aggregates loss and exposure by the levels of *factor_col* and
    computes a (optionally credibility-smoothed) rate per level plus its
    relativity against *base_rate*. When *base_rate* is not supplied it
    defaults to the portfolio-wide loss/exposure ratio. Levels with less
    than *min_exposure* exposure are pinned to relativity 1.0 / the base
    rate and flagged via ``is_low_exposure``.
    """
    # Optional row weights scale both loss and exposure before grouping.
    if weight_col and weight_col in df.columns:
        weights = df[weight_col].to_numpy(dtype=float, copy=False)
    else:
        weights = None

    loss = df[loss_col].to_numpy(dtype=float, copy=False)
    exposure = df[exposure_col].to_numpy(dtype=float, copy=False)

    if weights is not None:
        loss = loss * weights
        exposure = exposure * weights

    data = pd.DataFrame(
        {
            "factor": df[factor_col],
            "loss": loss,
            "exposure": exposure,
        }
    )
    # dropna=False keeps a level for missing factor values.
    grouped = data.groupby("factor", dropna=False).agg({"loss": "sum", "exposure": "sum"})
    grouped = grouped.reset_index().rename(columns={"factor": "level"})

    if base_rate is None:
        # Default base rate: overall loss per unit of exposure.
        total_loss = float(grouped["loss"].sum())
        total_exposure = float(grouped["exposure"].sum())
        base_rate = total_loss / total_exposure if total_exposure > 0 else 0.0

    exposure_vals = grouped["exposure"].to_numpy(dtype=float, copy=False)
    loss_vals = grouped["loss"].to_numpy(dtype=float, copy=False)

    # Smoothing acts like adding `smoothing` units of exposure at the
    # base rate (simple credibility shrinkage toward base_rate).
    with np.errstate(divide="ignore", invalid="ignore"):
        rate = np.where(
            exposure_vals > 0,
            (loss_vals + smoothing * base_rate) / (exposure_vals + smoothing),
            0.0,
        )
        relativity = np.where(base_rate > 0, rate / base_rate, 1.0)

    grouped["rate"] = rate
    grouped["relativity"] = relativity
    grouped["base_rate"] = float(base_rate)

    if min_exposure is not None:
        # Thin levels are not credible: pin them to the base rate.
        low_exposure = grouped["exposure"] < float(min_exposure)
        grouped.loc[low_exposure, "relativity"] = 1.0
        grouped.loc[low_exposure, "rate"] = float(base_rate)
        grouped["is_low_exposure"] = low_exposure
    else:
        grouped["is_low_exposure"] = False

    return grouped
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
def psi_numeric(
    expected: np.ndarray,
    actual: np.ndarray,
    *,
    bins: int = 10,
    strategy: str = "quantile",
    eps: float = 1e-6,
) -> float:
    """Population Stability Index between two numeric samples.

    Bins are derived from *expected* (quantile strategy) or from the
    combined range (uniform strategy); NaNs are dropped first. Returns
    0.0 when either sample is empty or the edges collapse.
    """
    exp = np.asarray(expected, dtype=float)
    act = np.asarray(actual, dtype=float)
    exp = exp[~np.isnan(exp)]
    act = act[~np.isnan(act)]
    if exp.size == 0 or act.size == 0:
        return 0.0

    if strategy == "quantile":
        # Duplicate quantiles collapse for skewed data; keep unique edges.
        edges = np.unique(np.quantile(exp, np.linspace(0, 1, bins + 1)))
    elif strategy == "uniform":
        lo = min(exp.min(), act.min())
        hi = max(exp.max(), act.max())
        edges = np.linspace(lo, hi, bins + 1)
    else:
        raise ValueError("strategy must be one of: quantile, uniform.")

    if edges.size < 2:
        return 0.0

    exp_hist, _ = np.histogram(exp, bins=edges)
    act_hist, _ = np.histogram(act, bins=edges)
    # Clip fractions away from zero so the log stays finite.
    exp_frac = np.clip(exp_hist / max(exp_hist.sum(), 1), eps, 1.0)
    act_frac = np.clip(act_hist / max(act_hist.sum(), 1), eps, 1.0)
    return float(np.sum((act_frac - exp_frac) * np.log(act_frac / exp_frac)))
45
+
46
+
47
def psi_categorical(
    expected: Iterable,
    actual: Iterable,
    *,
    eps: float = 1e-6,
) -> float:
    """Population Stability Index between two categorical samples.

    Levels are the union of non-null values in both samples; returns 0.0
    when there are no levels at all.
    """
    exp = pd.Series(expected)
    act = pd.Series(actual)
    levels = pd.Index(exp.dropna().unique()).union(act.dropna().unique())
    if levels.empty:
        return 0.0
    exp_counts = exp.value_counts().reindex(levels, fill_value=0)
    act_counts = act.value_counts().reindex(levels, fill_value=0)
    # Clip fractions away from zero so the log stays finite.
    exp_frac = np.clip((exp_counts / max(exp_counts.sum(), 1)).to_numpy(dtype=float), eps, 1.0)
    act_frac = np.clip((act_counts / max(act_counts.sum(), 1)).to_numpy(dtype=float), eps, 1.0)
    return float(np.sum((act_frac - exp_frac) * np.log(act_frac / exp_frac)))
65
+
66
+
67
def population_stability_index(
    expected: np.ndarray,
    actual: np.ndarray,
    *,
    bins: int = 10,
    strategy: str = "quantile",
) -> float:
    """Dispatch to the numeric or categorical PSI implementation."""
    both_numeric = pd.api.types.is_numeric_dtype(expected) and pd.api.types.is_numeric_dtype(actual)
    if both_numeric:
        return psi_numeric(expected, actual, bins=bins, strategy=strategy)
    return psi_categorical(expected, actual)
77
+
78
+
79
def psi_report(
    expected_df: pd.DataFrame,
    actual_df: pd.DataFrame,
    *,
    features: Optional[Iterable[str]] = None,
    bins: int = 10,
    strategy: str = "quantile",
) -> pd.DataFrame:
    """Per-feature PSI between two frames, sorted by descending PSI.

    Features absent from either frame are skipped. Returns an empty
    frame with columns ``feature``/``psi`` when nothing overlaps.
    """
    feats = list(features) if features is not None else list(expected_df.columns)
    rows = []
    for feat in feats:
        if feat not in expected_df.columns or feat not in actual_df.columns:
            continue
        psi = population_stability_index(
            expected_df[feat].to_numpy(),
            actual_df[feat].to_numpy(),
            bins=bins,
            strategy=strategy,
        )
        rows.append({"feature": feat, "psi": psi})
    # Fix: build with explicit columns so sort_values works even when no
    # feature overlapped (pd.DataFrame([]) has no "psi" -> KeyError).
    report = pd.DataFrame(rows, columns=["feature", "psi"])
    return report.sort_values(by="psi", ascending=False).reset_index(drop=True)
+ return pd.DataFrame(rows).sort_values(by="psi", ascending=False).reset_index(drop=True)