ins-pricing 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +48 -22
- ins_pricing/__init__.py +142 -90
- ins_pricing/cli/BayesOpt_entry.py +58 -46
- ins_pricing/cli/BayesOpt_incremental.py +77 -110
- ins_pricing/cli/Explain_Run.py +42 -23
- ins_pricing/cli/Explain_entry.py +551 -577
- ins_pricing/cli/Pricing_Run.py +42 -23
- ins_pricing/cli/bayesopt_entry_runner.py +51 -16
- ins_pricing/cli/utils/bootstrap.py +23 -0
- ins_pricing/cli/utils/cli_common.py +256 -256
- ins_pricing/cli/utils/cli_config.py +379 -360
- ins_pricing/cli/utils/import_resolver.py +375 -358
- ins_pricing/cli/utils/notebook_utils.py +256 -242
- ins_pricing/cli/watchdog_run.py +216 -198
- ins_pricing/frontend/__init__.py +10 -10
- ins_pricing/frontend/app.py +132 -61
- ins_pricing/frontend/config_builder.py +33 -0
- ins_pricing/frontend/example_config.json +11 -0
- ins_pricing/frontend/example_workflows.py +1 -1
- ins_pricing/frontend/runner.py +340 -388
- ins_pricing/governance/__init__.py +20 -20
- ins_pricing/governance/release.py +159 -159
- ins_pricing/modelling/README.md +1 -1
- ins_pricing/modelling/__init__.py +147 -92
- ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
- ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
- ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
- ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
- ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
- ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
- ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
- ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
- ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
- ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
- ins_pricing/modelling/explain/__init__.py +55 -55
- ins_pricing/modelling/explain/metrics.py +27 -174
- ins_pricing/modelling/explain/permutation.py +237 -237
- ins_pricing/modelling/plotting/__init__.py +40 -36
- ins_pricing/modelling/plotting/compat.py +228 -0
- ins_pricing/modelling/plotting/curves.py +572 -572
- ins_pricing/modelling/plotting/diagnostics.py +163 -163
- ins_pricing/modelling/plotting/geo.py +362 -362
- ins_pricing/modelling/plotting/importance.py +121 -121
- ins_pricing/pricing/__init__.py +27 -27
- ins_pricing/pricing/factors.py +67 -56
- ins_pricing/production/__init__.py +35 -25
- ins_pricing/production/{predict.py → inference.py} +140 -57
- ins_pricing/production/monitoring.py +8 -21
- ins_pricing/reporting/__init__.py +11 -11
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/production/test_inference.py +90 -0
- ins_pricing/utils/__init__.py +112 -78
- ins_pricing/utils/device.py +258 -237
- ins_pricing/utils/features.py +53 -0
- ins_pricing/utils/io.py +72 -0
- ins_pricing/utils/logging.py +34 -1
- ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
- ins_pricing/utils/metrics.py +158 -24
- ins_pricing/utils/numerics.py +76 -0
- ins_pricing/utils/paths.py +9 -1
- ins_pricing/utils/profiling.py +8 -4
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
- ins_pricing-0.5.1.dist-info/RECORD +132 -0
- ins_pricing/modelling/core/BayesOpt.py +0 -146
- ins_pricing/modelling/core/__init__.py +0 -1
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
- ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
- ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
- ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
- ins_pricing/modelling/core/bayesopt/utils.py +0 -105
- ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
- ins_pricing/tests/production/test_predict.py +0 -233
- ins_pricing-0.4.5.dist-info/RECORD +0 -130
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -1,121 +1,121 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Mapping, Optional, Sequence, Tuple
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
|
|
8
|
-
from .common import PlotStyle, finalize_figure, plt
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def _to_series(
|
|
12
|
-
importance: Mapping[str, float]
|
|
13
|
-
| Sequence[Tuple[str, float]]
|
|
14
|
-
| pd.Series
|
|
15
|
-
| np.ndarray,
|
|
16
|
-
feature_names: Optional[Sequence[str]] = None,
|
|
17
|
-
) -> pd.Series:
|
|
18
|
-
if isinstance(importance, pd.Series):
|
|
19
|
-
return importance.copy()
|
|
20
|
-
if isinstance(importance, Mapping):
|
|
21
|
-
return pd.Series(dict(importance))
|
|
22
|
-
if isinstance(importance, np.ndarray):
|
|
23
|
-
if feature_names is None:
|
|
24
|
-
raise ValueError("feature_names is required when importance is an array.")
|
|
25
|
-
return pd.Series(importance, index=list(feature_names))
|
|
26
|
-
return pd.Series(dict(importance))
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def shap_importance(
|
|
30
|
-
shap_values: np.ndarray,
|
|
31
|
-
feature_names: Sequence[str],
|
|
32
|
-
) -> pd.Series:
|
|
33
|
-
if shap_values.ndim == 3:
|
|
34
|
-
shap_values = shap_values[0]
|
|
35
|
-
if shap_values.ndim != 2:
|
|
36
|
-
raise ValueError("shap_values should be 2d (n_samples, n_features).")
|
|
37
|
-
scores = np.abs(shap_values).mean(axis=0)
|
|
38
|
-
return pd.Series(scores, index=list(feature_names))
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def plot_feature_importance(
|
|
42
|
-
importance: Mapping[str, float]
|
|
43
|
-
| Sequence[Tuple[str, float]]
|
|
44
|
-
| pd.Series
|
|
45
|
-
| np.ndarray,
|
|
46
|
-
*,
|
|
47
|
-
feature_names: Optional[Sequence[str]] = None,
|
|
48
|
-
top_n: int = 30,
|
|
49
|
-
title: str = "Feature Importance",
|
|
50
|
-
sort_by: str = "abs",
|
|
51
|
-
descending: bool = True,
|
|
52
|
-
show_values: bool = False,
|
|
53
|
-
ax: Optional[plt.Axes] = None,
|
|
54
|
-
show: bool = False,
|
|
55
|
-
save_path: Optional[str] = None,
|
|
56
|
-
style: Optional[PlotStyle] = None,
|
|
57
|
-
) -> plt.Figure:
|
|
58
|
-
style = style or PlotStyle()
|
|
59
|
-
series = _to_series(importance, feature_names=feature_names)
|
|
60
|
-
series = series.replace([np.inf, -np.inf], np.nan).dropna()
|
|
61
|
-
|
|
62
|
-
if sort_by not in {"abs", "value"}:
|
|
63
|
-
raise ValueError("sort_by must be 'abs' or 'value'.")
|
|
64
|
-
sort_key = series.abs() if sort_by == "abs" else series
|
|
65
|
-
series = series.loc[sort_key.sort_values(ascending=not descending).index]
|
|
66
|
-
|
|
67
|
-
if top_n > 0:
|
|
68
|
-
series = series.head(int(top_n))
|
|
69
|
-
|
|
70
|
-
created_fig = ax is None
|
|
71
|
-
if created_fig:
|
|
72
|
-
height = max(3.0, 0.3 * len(series))
|
|
73
|
-
fig, ax = plt.subplots(figsize=(style.figsize[0], height))
|
|
74
|
-
else:
|
|
75
|
-
fig = ax.figure
|
|
76
|
-
|
|
77
|
-
y_pos = np.arange(len(series))
|
|
78
|
-
ax.barh(y_pos, series.values, color=style.palette[0])
|
|
79
|
-
ax.set_yticks(y_pos)
|
|
80
|
-
ax.set_yticklabels(series.index, fontsize=style.tick_size)
|
|
81
|
-
ax.invert_yaxis()
|
|
82
|
-
ax.set_title(title, fontsize=style.title_size)
|
|
83
|
-
ax.tick_params(axis="x", labelsize=style.tick_size)
|
|
84
|
-
if style.grid:
|
|
85
|
-
ax.grid(True, axis="x", linestyle=style.grid_style, alpha=style.grid_alpha)
|
|
86
|
-
|
|
87
|
-
if show_values:
|
|
88
|
-
for idx, val in enumerate(series.values):
|
|
89
|
-
ax.text(val, idx, f" {val:.3f}", va="center", fontsize=style.tick_size)
|
|
90
|
-
|
|
91
|
-
if created_fig:
|
|
92
|
-
finalize_figure(fig, save_path=save_path, show=show, style=style)
|
|
93
|
-
|
|
94
|
-
return fig
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def plot_shap_importance(
|
|
98
|
-
shap_values: np.ndarray,
|
|
99
|
-
feature_names: Sequence[str],
|
|
100
|
-
*,
|
|
101
|
-
top_n: int = 30,
|
|
102
|
-
title: str = "SHAP Importance",
|
|
103
|
-
show_values: bool = False,
|
|
104
|
-
ax: Optional[plt.Axes] = None,
|
|
105
|
-
show: bool = False,
|
|
106
|
-
save_path: Optional[str] = None,
|
|
107
|
-
style: Optional[PlotStyle] = None,
|
|
108
|
-
) -> plt.Figure:
|
|
109
|
-
series = shap_importance(shap_values, feature_names)
|
|
110
|
-
return plot_feature_importance(
|
|
111
|
-
series,
|
|
112
|
-
top_n=top_n,
|
|
113
|
-
title=title,
|
|
114
|
-
sort_by="abs",
|
|
115
|
-
descending=True,
|
|
116
|
-
show_values=show_values,
|
|
117
|
-
ax=ax,
|
|
118
|
-
show=show,
|
|
119
|
-
save_path=save_path,
|
|
120
|
-
style=style,
|
|
121
|
-
)
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Mapping, Optional, Sequence, Tuple
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from ins_pricing.modelling.plotting.common import PlotStyle, finalize_figure, plt
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _to_series(
|
|
12
|
+
importance: Mapping[str, float]
|
|
13
|
+
| Sequence[Tuple[str, float]]
|
|
14
|
+
| pd.Series
|
|
15
|
+
| np.ndarray,
|
|
16
|
+
feature_names: Optional[Sequence[str]] = None,
|
|
17
|
+
) -> pd.Series:
|
|
18
|
+
if isinstance(importance, pd.Series):
|
|
19
|
+
return importance.copy()
|
|
20
|
+
if isinstance(importance, Mapping):
|
|
21
|
+
return pd.Series(dict(importance))
|
|
22
|
+
if isinstance(importance, np.ndarray):
|
|
23
|
+
if feature_names is None:
|
|
24
|
+
raise ValueError("feature_names is required when importance is an array.")
|
|
25
|
+
return pd.Series(importance, index=list(feature_names))
|
|
26
|
+
return pd.Series(dict(importance))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def shap_importance(
|
|
30
|
+
shap_values: np.ndarray,
|
|
31
|
+
feature_names: Sequence[str],
|
|
32
|
+
) -> pd.Series:
|
|
33
|
+
if shap_values.ndim == 3:
|
|
34
|
+
shap_values = shap_values[0]
|
|
35
|
+
if shap_values.ndim != 2:
|
|
36
|
+
raise ValueError("shap_values should be 2d (n_samples, n_features).")
|
|
37
|
+
scores = np.abs(shap_values).mean(axis=0)
|
|
38
|
+
return pd.Series(scores, index=list(feature_names))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def plot_feature_importance(
|
|
42
|
+
importance: Mapping[str, float]
|
|
43
|
+
| Sequence[Tuple[str, float]]
|
|
44
|
+
| pd.Series
|
|
45
|
+
| np.ndarray,
|
|
46
|
+
*,
|
|
47
|
+
feature_names: Optional[Sequence[str]] = None,
|
|
48
|
+
top_n: int = 30,
|
|
49
|
+
title: str = "Feature Importance",
|
|
50
|
+
sort_by: str = "abs",
|
|
51
|
+
descending: bool = True,
|
|
52
|
+
show_values: bool = False,
|
|
53
|
+
ax: Optional[plt.Axes] = None,
|
|
54
|
+
show: bool = False,
|
|
55
|
+
save_path: Optional[str] = None,
|
|
56
|
+
style: Optional[PlotStyle] = None,
|
|
57
|
+
) -> plt.Figure:
|
|
58
|
+
style = style or PlotStyle()
|
|
59
|
+
series = _to_series(importance, feature_names=feature_names)
|
|
60
|
+
series = series.replace([np.inf, -np.inf], np.nan).dropna()
|
|
61
|
+
|
|
62
|
+
if sort_by not in {"abs", "value"}:
|
|
63
|
+
raise ValueError("sort_by must be 'abs' or 'value'.")
|
|
64
|
+
sort_key = series.abs() if sort_by == "abs" else series
|
|
65
|
+
series = series.loc[sort_key.sort_values(ascending=not descending).index]
|
|
66
|
+
|
|
67
|
+
if top_n > 0:
|
|
68
|
+
series = series.head(int(top_n))
|
|
69
|
+
|
|
70
|
+
created_fig = ax is None
|
|
71
|
+
if created_fig:
|
|
72
|
+
height = max(3.0, 0.3 * len(series))
|
|
73
|
+
fig, ax = plt.subplots(figsize=(style.figsize[0], height))
|
|
74
|
+
else:
|
|
75
|
+
fig = ax.figure
|
|
76
|
+
|
|
77
|
+
y_pos = np.arange(len(series))
|
|
78
|
+
ax.barh(y_pos, series.values, color=style.palette[0])
|
|
79
|
+
ax.set_yticks(y_pos)
|
|
80
|
+
ax.set_yticklabels(series.index, fontsize=style.tick_size)
|
|
81
|
+
ax.invert_yaxis()
|
|
82
|
+
ax.set_title(title, fontsize=style.title_size)
|
|
83
|
+
ax.tick_params(axis="x", labelsize=style.tick_size)
|
|
84
|
+
if style.grid:
|
|
85
|
+
ax.grid(True, axis="x", linestyle=style.grid_style, alpha=style.grid_alpha)
|
|
86
|
+
|
|
87
|
+
if show_values:
|
|
88
|
+
for idx, val in enumerate(series.values):
|
|
89
|
+
ax.text(val, idx, f" {val:.3f}", va="center", fontsize=style.tick_size)
|
|
90
|
+
|
|
91
|
+
if created_fig:
|
|
92
|
+
finalize_figure(fig, save_path=save_path, show=show, style=style)
|
|
93
|
+
|
|
94
|
+
return fig
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def plot_shap_importance(
|
|
98
|
+
shap_values: np.ndarray,
|
|
99
|
+
feature_names: Sequence[str],
|
|
100
|
+
*,
|
|
101
|
+
top_n: int = 30,
|
|
102
|
+
title: str = "SHAP Importance",
|
|
103
|
+
show_values: bool = False,
|
|
104
|
+
ax: Optional[plt.Axes] = None,
|
|
105
|
+
show: bool = False,
|
|
106
|
+
save_path: Optional[str] = None,
|
|
107
|
+
style: Optional[PlotStyle] = None,
|
|
108
|
+
) -> plt.Figure:
|
|
109
|
+
series = shap_importance(shap_values, feature_names)
|
|
110
|
+
return plot_feature_importance(
|
|
111
|
+
series,
|
|
112
|
+
top_n=top_n,
|
|
113
|
+
title=title,
|
|
114
|
+
sort_by="abs",
|
|
115
|
+
descending=True,
|
|
116
|
+
show_values=show_values,
|
|
117
|
+
ax=ax,
|
|
118
|
+
show=show,
|
|
119
|
+
save_path=save_path,
|
|
120
|
+
style=style,
|
|
121
|
+
)
|
ins_pricing/pricing/__init__.py
CHANGED
|
@@ -1,27 +1,27 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from .calibration import apply_calibration, fit_calibration_factor
|
|
4
|
-
from .data_quality import detect_leakage, profile_columns, validate_schema
|
|
5
|
-
from .exposure import aggregate_policy_level, build_frequency_severity, compute_exposure
|
|
6
|
-
from .factors import bin_numeric, build_factor_table
|
|
7
|
-
from .monitoring import population_stability_index, psi_report
|
|
8
|
-
from .rate_table import RateTable, apply_factor_tables, compute_base_rate, rate_premium
|
|
9
|
-
|
|
10
|
-
__all__ = [
|
|
11
|
-
"apply_calibration",
|
|
12
|
-
"fit_calibration_factor",
|
|
13
|
-
"detect_leakage",
|
|
14
|
-
"profile_columns",
|
|
15
|
-
"validate_schema",
|
|
16
|
-
"aggregate_policy_level",
|
|
17
|
-
"build_frequency_severity",
|
|
18
|
-
"compute_exposure",
|
|
19
|
-
"bin_numeric",
|
|
20
|
-
"build_factor_table",
|
|
21
|
-
"population_stability_index",
|
|
22
|
-
"psi_report",
|
|
23
|
-
"RateTable",
|
|
24
|
-
"apply_factor_tables",
|
|
25
|
-
"compute_base_rate",
|
|
26
|
-
"rate_premium",
|
|
27
|
-
]
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ins_pricing.pricing.calibration import apply_calibration, fit_calibration_factor
|
|
4
|
+
from ins_pricing.pricing.data_quality import detect_leakage, profile_columns, validate_schema
|
|
5
|
+
from ins_pricing.pricing.exposure import aggregate_policy_level, build_frequency_severity, compute_exposure
|
|
6
|
+
from ins_pricing.pricing.factors import bin_numeric, build_factor_table
|
|
7
|
+
from ins_pricing.pricing.monitoring import population_stability_index, psi_report
|
|
8
|
+
from ins_pricing.pricing.rate_table import RateTable, apply_factor_tables, compute_base_rate, rate_premium
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"apply_calibration",
|
|
12
|
+
"fit_calibration_factor",
|
|
13
|
+
"detect_leakage",
|
|
14
|
+
"profile_columns",
|
|
15
|
+
"validate_schema",
|
|
16
|
+
"aggregate_policy_level",
|
|
17
|
+
"build_frequency_severity",
|
|
18
|
+
"compute_exposure",
|
|
19
|
+
"bin_numeric",
|
|
20
|
+
"build_factor_table",
|
|
21
|
+
"population_stability_index",
|
|
22
|
+
"psi_report",
|
|
23
|
+
"RateTable",
|
|
24
|
+
"apply_factor_tables",
|
|
25
|
+
"compute_base_rate",
|
|
26
|
+
"rate_premium",
|
|
27
|
+
]
|
ins_pricing/pricing/factors.py
CHANGED
|
@@ -1,43 +1,46 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
import hashlib
|
|
4
5
|
from typing import Optional, Tuple
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
7
8
|
import pandas as pd
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
method: str,
|
|
15
|
-
min_val: float,
|
|
16
|
-
max_val: float,
|
|
17
|
-
n_unique: int
|
|
18
|
-
) -> Tuple[tuple, int]:
|
|
19
|
-
"""Cache bin edge computation based on data characteristics.
|
|
11
|
+
_BIN_CACHE_MAXSIZE = 128
|
|
12
|
+
_BIN_CACHE: "OrderedDict[tuple, np.ndarray]" = OrderedDict()
|
|
13
|
+
_BIN_CACHE_HITS = 0
|
|
14
|
+
_BIN_CACHE_MISSES = 0
|
|
20
15
|
|
|
21
|
-
Args:
|
|
22
|
-
data_hash: Hash of sorted unique values for cache key
|
|
23
|
-
n_bins: Number of bins to create
|
|
24
|
-
method: Binning method ('quantile' or 'uniform')
|
|
25
|
-
min_val: Minimum value in data
|
|
26
|
-
max_val: Maximum value in data
|
|
27
|
-
n_unique: Number of unique values
|
|
28
16
|
|
|
29
|
-
|
|
30
|
-
|
|
17
|
+
def _cache_key(series: pd.Series, n_bins: int, method: str) -> Optional[tuple]:
|
|
18
|
+
try:
|
|
19
|
+
values = series.dropna().to_numpy(dtype=float, copy=False)
|
|
20
|
+
if values.size == 0:
|
|
21
|
+
return None
|
|
22
|
+
values = np.sort(values)
|
|
23
|
+
digest = hashlib.blake2b(values.tobytes(), digest_size=16).hexdigest()
|
|
24
|
+
return (digest, int(values.size), int(n_bins), str(method))
|
|
25
|
+
except Exception:
|
|
26
|
+
return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _cache_get(key: tuple) -> Optional[np.ndarray]:
|
|
30
|
+
global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
|
|
31
|
+
if key in _BIN_CACHE:
|
|
32
|
+
_BIN_CACHE_HITS += 1
|
|
33
|
+
_BIN_CACHE.move_to_end(key)
|
|
34
|
+
return _BIN_CACHE[key].copy()
|
|
35
|
+
_BIN_CACHE_MISSES += 1
|
|
36
|
+
return None
|
|
31
37
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# The actual binning is done in the calling function
|
|
39
|
-
# This just provides a cache key mechanism
|
|
40
|
-
return (data_hash, n_bins, method, min_val, max_val, n_unique), n_bins
|
|
38
|
+
|
|
39
|
+
def _cache_set(key: tuple, edges: np.ndarray) -> None:
|
|
40
|
+
_BIN_CACHE[key] = np.asarray(edges, dtype=float)
|
|
41
|
+
_BIN_CACHE.move_to_end(key)
|
|
42
|
+
if len(_BIN_CACHE) > _BIN_CACHE_MAXSIZE:
|
|
43
|
+
_BIN_CACHE.popitem(last=False)
|
|
41
44
|
|
|
42
45
|
|
|
43
46
|
def bin_numeric(
|
|
@@ -66,34 +69,40 @@ def bin_numeric(
|
|
|
66
69
|
When use_cache=True, identical distributions will reuse cached bin edges,
|
|
67
70
|
improving performance when the same column is binned multiple times.
|
|
68
71
|
"""
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
# Check cache (the function call acts as cache lookup)
|
|
80
|
-
try:
|
|
81
|
-
_compute_bins_cached(data_hash, bins, method, min_val, max_val, n_unique)
|
|
82
|
-
except Exception:
|
|
83
|
-
# If hashing fails, proceed without cache
|
|
84
|
-
pass
|
|
72
|
+
cache_key = _cache_key(series, bins, method) if use_cache else None
|
|
73
|
+
bin_edges_full: Optional[np.ndarray] = None
|
|
74
|
+
|
|
75
|
+
if cache_key is not None:
|
|
76
|
+
bin_edges_full = _cache_get(cache_key)
|
|
77
|
+
|
|
78
|
+
if bin_edges_full is not None:
|
|
79
|
+
binned = pd.cut(series, bins=bin_edges_full, include_lowest=True, labels=labels)
|
|
80
|
+
return binned, np.asarray(bin_edges_full[:-1], dtype=float)
|
|
85
81
|
|
|
86
82
|
# Perform actual binning
|
|
87
83
|
if method == "quantile":
|
|
88
|
-
binned = pd.qcut(
|
|
89
|
-
|
|
84
|
+
binned, bin_edges_full = pd.qcut(
|
|
85
|
+
series,
|
|
86
|
+
q=bins,
|
|
87
|
+
duplicates="drop",
|
|
88
|
+
labels=labels,
|
|
89
|
+
retbins=True,
|
|
90
|
+
)
|
|
90
91
|
elif method == "uniform":
|
|
91
|
-
binned = pd.cut(
|
|
92
|
-
|
|
92
|
+
binned, bin_edges_full = pd.cut(
|
|
93
|
+
series,
|
|
94
|
+
bins=bins,
|
|
95
|
+
include_lowest=include_lowest,
|
|
96
|
+
labels=labels,
|
|
97
|
+
retbins=True,
|
|
98
|
+
)
|
|
93
99
|
else:
|
|
94
100
|
raise ValueError("method must be one of: quantile, uniform.")
|
|
95
101
|
|
|
96
|
-
|
|
102
|
+
if cache_key is not None and bin_edges_full is not None:
|
|
103
|
+
_cache_set(cache_key, np.asarray(bin_edges_full, dtype=float))
|
|
104
|
+
|
|
105
|
+
return binned, np.asarray(bin_edges_full[:-1], dtype=float)
|
|
97
106
|
|
|
98
107
|
|
|
99
108
|
def clear_binning_cache() -> None:
|
|
@@ -108,7 +117,10 @@ def clear_binning_cache() -> None:
|
|
|
108
117
|
>>> # After processing many different columns
|
|
109
118
|
>>> clear_binning_cache()
|
|
110
119
|
"""
|
|
111
|
-
|
|
120
|
+
global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
|
|
121
|
+
_BIN_CACHE.clear()
|
|
122
|
+
_BIN_CACHE_HITS = 0
|
|
123
|
+
_BIN_CACHE_MISSES = 0
|
|
112
124
|
|
|
113
125
|
|
|
114
126
|
def get_cache_info() -> dict:
|
|
@@ -126,12 +138,11 @@ def get_cache_info() -> dict:
|
|
|
126
138
|
>>> info = get_cache_info()
|
|
127
139
|
>>> print(f"Cache hit rate: {info['hits'] / (info['hits'] + info['misses']):.2%}")
|
|
128
140
|
"""
|
|
129
|
-
cache_info = _compute_bins_cached.cache_info()
|
|
130
141
|
return {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
142
|
+
"hits": _BIN_CACHE_HITS,
|
|
143
|
+
"misses": _BIN_CACHE_MISSES,
|
|
144
|
+
"maxsize": _BIN_CACHE_MAXSIZE,
|
|
145
|
+
"currsize": len(_BIN_CACHE),
|
|
135
146
|
}
|
|
136
147
|
|
|
137
148
|
|
|
@@ -1,35 +1,45 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from .drift import psi_report
|
|
4
|
-
from .monitoring import (
|
|
5
|
-
classification_metrics,
|
|
6
|
-
group_metrics,
|
|
7
|
-
loss_ratio,
|
|
8
|
-
metrics_report,
|
|
9
|
-
regression_metrics,
|
|
10
|
-
)
|
|
11
|
-
from .scoring import batch_score
|
|
12
|
-
from .preprocess import apply_preprocess_artifacts, load_preprocess_artifacts, prepare_raw_features
|
|
13
|
-
from .
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ins_pricing.production.drift import psi_report
|
|
4
|
+
from ins_pricing.production.monitoring import (
|
|
5
|
+
classification_metrics,
|
|
6
|
+
group_metrics,
|
|
7
|
+
loss_ratio,
|
|
8
|
+
metrics_report,
|
|
9
|
+
regression_metrics,
|
|
10
|
+
)
|
|
11
|
+
from ins_pricing.production.scoring import batch_score
|
|
12
|
+
from ins_pricing.production.preprocess import apply_preprocess_artifacts, load_preprocess_artifacts, prepare_raw_features
|
|
13
|
+
from ins_pricing.production.inference import (
|
|
14
|
+
Predictor,
|
|
15
|
+
ModelSpec,
|
|
16
|
+
PredictorRegistry,
|
|
17
|
+
register_model_loader,
|
|
18
|
+
load_predictor,
|
|
14
19
|
SavedModelPredictor,
|
|
15
20
|
load_best_params,
|
|
16
21
|
load_predictor_from_config,
|
|
17
22
|
load_saved_model,
|
|
18
23
|
predict_from_config,
|
|
19
24
|
)
|
|
20
|
-
|
|
21
|
-
__all__ = [
|
|
22
|
-
"psi_report",
|
|
23
|
-
"classification_metrics",
|
|
24
|
-
"group_metrics",
|
|
25
|
-
"loss_ratio",
|
|
26
|
-
"metrics_report",
|
|
27
|
-
"regression_metrics",
|
|
28
|
-
"batch_score",
|
|
29
|
-
"apply_preprocess_artifacts",
|
|
30
|
-
"load_preprocess_artifacts",
|
|
31
|
-
"prepare_raw_features",
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"psi_report",
|
|
28
|
+
"classification_metrics",
|
|
29
|
+
"group_metrics",
|
|
30
|
+
"loss_ratio",
|
|
31
|
+
"metrics_report",
|
|
32
|
+
"regression_metrics",
|
|
33
|
+
"batch_score",
|
|
34
|
+
"apply_preprocess_artifacts",
|
|
35
|
+
"load_preprocess_artifacts",
|
|
36
|
+
"prepare_raw_features",
|
|
32
37
|
"SavedModelPredictor",
|
|
38
|
+
"Predictor",
|
|
39
|
+
"ModelSpec",
|
|
40
|
+
"PredictorRegistry",
|
|
41
|
+
"register_model_loader",
|
|
42
|
+
"load_predictor",
|
|
33
43
|
"load_best_params",
|
|
34
44
|
"load_predictor_from_config",
|
|
35
45
|
"load_saved_model",
|