ins-pricing 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ins_pricing/CHANGELOG.md +179 -0
  2. ins_pricing/RELEASE_NOTES_0.2.8.md +344 -0
  3. ins_pricing/modelling/core/bayesopt/utils.py +2 -1
  4. ins_pricing/modelling/explain/shap_utils.py +209 -6
  5. ins_pricing/pricing/calibration.py +125 -1
  6. ins_pricing/pricing/factors.py +110 -1
  7. ins_pricing/production/preprocess.py +166 -0
  8. ins_pricing/setup.py +1 -1
  9. ins_pricing/tests/governance/__init__.py +1 -0
  10. ins_pricing/tests/governance/test_audit.py +56 -0
  11. ins_pricing/tests/governance/test_registry.py +128 -0
  12. ins_pricing/tests/governance/test_release.py +74 -0
  13. ins_pricing/tests/pricing/__init__.py +1 -0
  14. ins_pricing/tests/pricing/test_calibration.py +72 -0
  15. ins_pricing/tests/pricing/test_exposure.py +64 -0
  16. ins_pricing/tests/pricing/test_factors.py +156 -0
  17. ins_pricing/tests/pricing/test_rate_table.py +40 -0
  18. ins_pricing/tests/production/__init__.py +1 -0
  19. ins_pricing/tests/production/test_monitoring.py +350 -0
  20. ins_pricing/tests/production/test_predict.py +233 -0
  21. ins_pricing/tests/production/test_preprocess.py +339 -0
  22. ins_pricing/tests/production/test_scoring.py +311 -0
  23. ins_pricing/utils/profiling.py +377 -0
  24. ins_pricing/utils/validation.py +427 -0
  25. ins_pricing-0.2.9.dist-info/METADATA +149 -0
  26. {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/RECORD +28 -12
  27. ins_pricing/CHANGELOG_20260114.md +0 -275
  28. ins_pricing/CODE_REVIEW_IMPROVEMENTS.md +0 -715
  29. ins_pricing-0.2.7.dist-info/METADATA +0 -101
  30. {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/WHEEL +0 -0
  31. {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/top_level.txt +0 -0
ins_pricing/tests/pricing/test_factors.py
@@ -0,0 +1,156 @@
+ """Tests for pricing factors module."""
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ from ins_pricing.exceptions import DataValidationError
+
+
+ @pytest.fixture
+ def sample_policy_data():
+     """Sample insurance policy data."""
+     np.random.seed(42)
+     return pd.DataFrame({
+         "policy_id": range(1000),
+         "age": np.random.randint(18, 80, 1000),
+         "gender": np.random.choice(["M", "F"], 1000),
+         "region": np.random.choice(["North", "South", "East", "West"], 1000),
+         "vehicle_age": np.random.randint(0, 15, 1000),
+         "claim_amount": np.random.exponential(500, 1000),
+         "exposure": np.random.uniform(0.5, 1.0, 1000),
+         "premium": np.random.uniform(200, 1000, 1000)
+     })
+
+
+ class TestFactorTableConstruction:
+     """Test factor table construction."""
+
+     def test_build_univariate_factor_table(self, sample_policy_data):
+         """Test building a univariate factor table."""
+         from ins_pricing.pricing.factors import build_factor_table
+
+         factor_table = build_factor_table(
+             df=sample_policy_data,
+             factor_col="age",
+             loss_col="claim_amount",
+             exposure_col="exposure",
+             method="quantile",
+             n_bins=10
+         )
+
+         assert len(factor_table) <= 10
+         assert "age_bin" in factor_table.columns
+         assert "relativity" in factor_table.columns
+         assert "claim_count" in factor_table.columns
+
+     def test_equal_width_binning(self, sample_policy_data):
+         """Test equal width binning strategy."""
+         from ins_pricing.pricing.factors import build_factor_table
+
+         factor_table = build_factor_table(
+             df=sample_policy_data,
+             factor_col="vehicle_age",
+             loss_col="claim_amount",
+             exposure_col="exposure",
+             method="equal_width",
+             n_bins=5
+         )
+
+         assert len(factor_table) == 5
+
+     def test_categorical_factor_table(self, sample_policy_data):
+         """Test factor table for categorical variables."""
+         from ins_pricing.pricing.factors import build_factor_table
+
+         factor_table = build_factor_table(
+             df=sample_policy_data,
+             factor_col="region",
+             loss_col="claim_amount",
+             exposure_col="exposure",
+             method="categorical"
+         )
+
+         assert set(factor_table["region"]) == set(sample_policy_data["region"].unique())
+         assert "relativity" in factor_table.columns
+
+
+ class TestFactorSmoothing:
+     """Test factor smoothing techniques."""
+
+     def test_credibility_weighting(self):
+         """Test credibility-weighted smoothing."""
+         from ins_pricing.pricing.factors import apply_credibility_smoothing
+
+         raw_factors = pd.DataFrame({
+             "bin": ["A", "B", "C"],
+             "relativity": [1.2, 0.8, 1.5],
+             "exposure": [100, 500, 50]  # C has low credibility
+         })
+
+         smoothed = apply_credibility_smoothing(raw_factors, base_relativity=1.0)
+
+         # Low exposure bin should be pulled toward base
+         assert abs(smoothed.loc[2, "relativity"] - 1.0) < abs(raw_factors.loc[2, "relativity"] - 1.0)
+
+     def test_neighbor_smoothing(self):
+         """Test smoothing using neighboring bins."""
+         from ins_pricing.pricing.factors import apply_neighbor_smoothing
+
+         factors = pd.DataFrame({
+             "bin": [1, 2, 3, 4, 5],
+             "relativity": [1.0, 1.2, 2.5, 1.4, 1.5]  # Bin 3 is outlier
+         })
+
+         smoothed = apply_neighbor_smoothing(factors)
+
+         # Outlier should be smoothed
+         assert smoothed.loc[2, "relativity"] < factors.loc[2, "relativity"]
+
+
+ class TestFactorApplication:
+     """Test applying factors to new data."""
+
+     def test_apply_factors_to_policies(self, sample_policy_data):
+         """Test applying factor table to policies."""
+         from ins_pricing.pricing.factors import build_factor_table, apply_factors
+
+         # Build factor table
+         age_factors = build_factor_table(
+             df=sample_policy_data,
+             factor_col="age",
+             loss_col="claim_amount",
+             exposure_col="exposure",
+             n_bins=5
+         )
+
+         # Apply to new data
+         result = apply_factors(sample_policy_data, age_factors, factor_col="age")
+
+         assert "age_relativity" in result.columns
+         assert result["age_relativity"].notna().all()
+
+
+ @pytest.mark.parametrize("method,n_bins", [
+     ("quantile", 5),
+     ("quantile", 10),
+     ("equal_width", 5),
+     ("equal_width", 10),
+ ])
+ class TestBinningMethods:
+     """Test different binning methods."""
+
+     def test_binning_produces_expected_bins(self, sample_policy_data, method, n_bins):
+         """Test that binning produces expected number of bins."""
+         from ins_pricing.pricing.factors import build_factor_table
+
+         factor_table = build_factor_table(
+             df=sample_policy_data,
+             factor_col="age",
+             loss_col="claim_amount",
+             exposure_col="exposure",
+             method=method,
+             n_bins=n_bins
+         )
+
+         assert len(factor_table) <= n_bins
ins_pricing/tests/pricing/test_rate_table.py
@@ -0,0 +1,40 @@
+ """Tests for rate table module."""
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+
+ class TestRateTableGeneration:
+     """Test rate table generation."""
+
+     def test_generate_multidimensional_rate_table(self):
+         """Test generating rate table with multiple dimensions."""
+         from ins_pricing.pricing.rate_table import generate_rate_table
+
+         factors = {
+             "age": pd.DataFrame({"age_band": ["18-25", "26-35", "36+"], "relativity": [1.5, 1.0, 0.8]}),
+             "region": pd.DataFrame({"region": ["North", "South"], "relativity": [1.2, 0.9]})
+         }
+
+         rate_table = generate_rate_table(factors, base_rate=100)
+
+         assert len(rate_table) == 3 * 2  # 3 age bands × 2 regions
+         assert "rate" in rate_table.columns
+
+     def test_rate_lookup(self):
+         """Test looking up rate for specific characteristics."""
+         from ins_pricing.pricing.rate_table import lookup_rate
+
+         rate_table = pd.DataFrame({
+             "age_band": ["18-25", "26-35"],
+             "region": ["North", "North"],
+             "rate": [150, 120]
+         })
+
+         rate = lookup_rate(
+             rate_table,
+             characteristics={"age_band": "18-25", "region": "North"}
+         )
+
+         assert rate == 150
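`ins_pricing.pricing.rate_table` is likewise only visible here through its tests. The first test implies a multiplicative rate table: one row per combination of factor levels, with the rate equal to the base rate times the product of the per-factor relativities. A sketch under that assumption (the name `generate_rate_table_sketch` is hypothetical):

```python
# Hypothetical sketch of a multiplicative rate-table build, matching the
# 3 age bands x 2 regions = 6 rows asserted in the test above.
import itertools
import pandas as pd


def generate_rate_table_sketch(factors, base_rate):
    # Each factor table contributes a list of (column, level, relativity) tuples.
    level_sets = []
    for table in factors.values():
        level_col = [c for c in table.columns if c != "relativity"][0]
        level_sets.append([(level_col, level, rel)
                           for level, rel in zip(table[level_col], table["relativity"])])

    # Cross-join all factor levels and multiply the relativities onto the base rate.
    rows = []
    for combo in itertools.product(*level_sets):
        row = {col: level for col, level, _ in combo}
        rate = base_rate
        for _, _, rel in combo:
            rate *= rel
        row["rate"] = rate
        rows.append(row)
    return pd.DataFrame(rows)
```

With the factors from `test_generate_multidimensional_rate_table`, an "18-25" / "North" cell would price at 100 × 1.5 × 1.2 = 180 under this sketch.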
ins_pricing/tests/production/__init__.py
@@ -0,0 +1 @@
+ """Tests for the production module."""
ins_pricing/tests/production/test_monitoring.py
@@ -0,0 +1,350 @@
+ """Tests for production monitoring module."""
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+ from datetime import datetime, timedelta
+ from unittest.mock import Mock, patch
+
+ from ins_pricing.exceptions import DataValidationError
+
+
+ @pytest.fixture
+ def sample_production_data():
+     """Sample production data with timestamps."""
+     dates = pd.date_range(start='2024-01-01', periods=100, freq='D')
+     return pd.DataFrame({
+         "date": dates,
+         "prediction": np.random.uniform(100, 500, 100),
+         "actual": np.random.uniform(100, 500, 100),
+         "feature_1": np.random.uniform(0, 1, 100),
+         "feature_2": np.random.choice(['A', 'B', 'C'], 100)
+     })
+
+
+ @pytest.fixture
+ def training_distribution():
+     """Reference training data distribution."""
+     return pd.DataFrame({
+         "feature_1": np.random.uniform(0, 1, 1000),
+         "feature_2": np.random.choice(['A', 'B', 'C'], 1000, p=[0.5, 0.3, 0.2])
+     })
+
+
+ class TestDriftDetection:
+     """Test data drift detection."""
+
+     def test_psi_calculation(self, training_distribution, sample_production_data):
+         """Test Population Stability Index (PSI) calculation."""
+         from ins_pricing.production.monitoring import calculate_psi
+
+         psi = calculate_psi(
+             expected=training_distribution['feature_1'],
+             actual=sample_production_data['feature_1'],
+             buckets=10
+         )
+
+         assert isinstance(psi, (int, float, np.number))
+         assert psi >= 0
+
+     def test_psi_drift_detected(self):
+         """Test PSI detects significant drift."""
+         from ins_pricing.production.monitoring import calculate_psi
+
+         # Create distributions with significant drift
+         expected = np.random.uniform(0, 1, 1000)
+         actual = np.random.uniform(0.5, 1.5, 1000)  # Shifted distribution
+
+         psi = calculate_psi(expected, actual, buckets=10)
+
+         # PSI > 0.2 typically indicates significant drift
+         assert psi > 0.1
+
+     def test_psi_no_drift(self):
+         """Test PSI when no drift present."""
+         from ins_pricing.production.monitoring import calculate_psi
+
+         # Same distribution
+         distribution = np.random.uniform(0, 1, 1000)
+         expected = distribution[:500]
+         actual = distribution[500:]
+
+         psi = calculate_psi(expected, actual, buckets=10)
+
+         # Should be very low PSI
+         assert psi < 0.1
+
+     def test_categorical_drift(self, training_distribution, sample_production_data):
+         """Test drift detection for categorical features."""
+         from ins_pricing.production.monitoring import categorical_drift
+
+         drift_score = categorical_drift(
+             expected=training_distribution['feature_2'],
+             actual=sample_production_data['feature_2']
+         )
+
+         assert isinstance(drift_score, (int, float, np.number))
+         assert drift_score >= 0
+
+     def test_ks_test_drift(self):
+         """Test Kolmogorov-Smirnov test for drift."""
+         from ins_pricing.production.monitoring import ks_test
+
+         expected = np.random.normal(0, 1, 1000)
+         actual = np.random.normal(0.5, 1, 1000)  # Shifted mean
+
+         statistic, p_value = ks_test(expected, actual)
+
+         assert 0 <= statistic <= 1
+         assert 0 <= p_value <= 1
+
+
+ class TestPerformanceMonitoring:
+     """Test model performance monitoring."""
+
+     def test_rolling_metrics(self, sample_production_data):
+         """Test calculation of rolling performance metrics."""
+         from ins_pricing.production.monitoring import rolling_metrics
+
+         metrics = rolling_metrics(
+             df=sample_production_data,
+             actual_col='actual',
+             pred_col='prediction',
+             window=7
+         )
+
+         assert 'rolling_mae' in metrics.columns
+         assert 'rolling_mse' in metrics.columns
+         assert len(metrics) == len(sample_production_data)
+
+     def test_performance_degradation_alert(self, sample_production_data):
+         """Test alerting on performance degradation."""
+         from ins_pricing.production.monitoring import check_performance_degradation
+
+         # Simulate degrading predictions
+         sample_production_data.loc[50:, 'prediction'] = \
+             sample_production_data.loc[50:, 'actual'] * 2  # Make worse
+
+         is_degraded = check_performance_degradation(
+             df=sample_production_data,
+             actual_col='actual',
+             pred_col='prediction',
+             threshold=0.2  # 20% worse
+         )
+
+         assert isinstance(is_degraded, bool)
+
+     def test_metric_comparison(self):
+         """Test comparison of current vs baseline metrics."""
+         from ins_pricing.production.monitoring import compare_metrics
+
+         baseline = {'mse': 100, 'mae': 8, 'r2': 0.85}
+         current = {'mse': 150, 'mae': 10, 'r2': 0.75}
+
+         comparison = compare_metrics(baseline, current)
+
+         assert 'mse_change' in comparison
+         assert 'mae_change' in comparison
+         assert 'r2_change' in comparison
+
+
+ class TestDataQualityChecks:
+     """Test data quality monitoring."""
+
+     def test_missing_value_detection(self):
+         """Test detection of missing values in production data."""
+         from ins_pricing.production.monitoring import check_missing_values
+
+         data = pd.DataFrame({
+             "col1": [1, 2, np.nan, 4],
+             "col2": [1, 2, 3, 4],
+             "col3": [np.nan, np.nan, 3, 4]
+         })
+
+         missing_report = check_missing_values(data)
+
+         assert 'col1' in missing_report
+         assert 'col3' in missing_report
+         assert missing_report['col1']['count'] == 1
+         assert missing_report['col3']['count'] == 2
+
+     def test_outlier_detection(self):
+         """Test outlier detection in production data."""
+         from ins_pricing.production.monitoring import detect_outliers
+
+         data = pd.Series([1, 2, 3, 4, 5, 100, 2, 3, 4, 5])  # 100 is outlier
+
+         outliers = detect_outliers(data, method='iqr')
+
+         assert len(outliers) > 0
+         assert 100 in data[outliers].values
+
+     def test_schema_validation(self):
+         """Test schema validation for production data."""
+         from ins_pricing.production.monitoring import validate_schema
+
+         expected_schema = {
+             "feature_1": "float64",
+             "feature_2": "object",
+             "prediction": "float64"
+         }
+
+         data = pd.DataFrame({
+             "feature_1": [1.0, 2.0],
+             "feature_2": ["A", "B"],
+             "prediction": [100.0, 200.0]
+         })
+
+         is_valid = validate_schema(data, expected_schema)
+
+         assert is_valid
+
+     def test_schema_validation_failure(self):
+         """Test schema validation catches type mismatches."""
+         from ins_pricing.production.monitoring import validate_schema
+
+         expected_schema = {
+             "feature_1": "float64",
+             "feature_2": "int64"  # Expect int
+         }
+
+         data = pd.DataFrame({
+             "feature_1": [1.0, 2.0],
+             "feature_2": ["A", "B"]  # Actually string
+         })
+
+         is_valid = validate_schema(data, expected_schema)
+
+         assert not is_valid
+
+
+ class TestAlertingSystem:
+     """Test monitoring alert generation."""
+
+     def test_drift_alert(self):
+         """Test alert generation for drift detection."""
+         from ins_pricing.production.monitoring import generate_drift_alert
+
+         alert = generate_drift_alert(
+             feature='age',
+             psi=0.35,
+             threshold=0.25
+         )
+
+         assert alert['alert_type'] == 'drift'
+         assert alert['feature'] == 'age'
+         assert alert['severity'] == 'high'
+
+     def test_performance_alert(self):
+         """Test alert generation for performance degradation."""
+         from ins_pricing.production.monitoring import generate_performance_alert
+
+         alert = generate_performance_alert(
+             metric='mae',
+             baseline=10.0,
+             current=15.0,
+             threshold=0.2
+         )
+
+         assert alert['alert_type'] == 'performance'
+         assert alert['metric'] == 'mae'
+
+     @patch('ins_pricing.production.monitoring.send_email')
+     def test_send_alert_email(self, mock_send):
+         """Test sending alert via email."""
+         from ins_pricing.production.monitoring import send_alert
+
+         alert = {
+             'alert_type': 'drift',
+             'feature': 'age',
+             'severity': 'high'
+         }
+
+         send_alert(alert, recipients=['team@example.com'])
+
+         mock_send.assert_called_once()
+
+     @patch('ins_pricing.production.monitoring.log_to_monitoring_system')
+     def test_log_alert(self, mock_log):
+         """Test logging alert to monitoring system."""
+         from ins_pricing.production.monitoring import log_alert
+
+         alert = {'alert_type': 'performance', 'severity': 'medium'}
+
+         log_alert(alert)
+
+         mock_log.assert_called_once()
+
+
+ class TestMonitoringDashboard:
+     """Test monitoring dashboard data preparation."""
+
+     def test_dashboard_metrics(self, sample_production_data):
+         """Test preparation of dashboard metrics."""
+         from ins_pricing.production.monitoring import prepare_dashboard_metrics
+
+         metrics = prepare_dashboard_metrics(
+             df=sample_production_data,
+             actual_col='actual',
+             pred_col='prediction',
+             date_col='date'
+         )
+
+         assert 'daily_predictions' in metrics
+         assert 'daily_mae' in metrics
+         assert 'daily_mse' in metrics
+
+     def test_feature_distribution_summary(self, sample_production_data):
+         """Test feature distribution summary for dashboard."""
+         from ins_pricing.production.monitoring import feature_distribution_summary
+
+         summary = feature_distribution_summary(
+             sample_production_data,
+             features=['feature_1', 'feature_2']
+         )
+
+         assert 'feature_1' in summary
+         assert 'feature_2' in summary
+         assert 'mean' in summary['feature_1']
+         assert 'std' in summary['feature_1']
+
+
+ class TestBatchMonitoring:
+     """Test batch monitoring functionality."""
+
+     def test_monitor_batch_predictions(self, sample_production_data, training_distribution):
+         """Test monitoring a batch of predictions."""
+         from ins_pricing.production.monitoring import monitor_batch
+
+         report = monitor_batch(
+             production_data=sample_production_data,
+             reference_data=training_distribution,
+             features=['feature_1', 'feature_2']
+         )
+
+         assert 'drift_scores' in report
+         assert 'quality_checks' in report
+         assert 'alerts' in report
+
+     def test_scheduled_monitoring(self):
+         """Test scheduled monitoring execution."""
+         from ins_pricing.production.monitoring import run_scheduled_monitoring
+
+         with patch('ins_pricing.production.monitoring.load_production_data') as mock_load:
+             with patch('ins_pricing.production.monitoring.monitor_batch') as mock_monitor:
+                 mock_load.return_value = pd.DataFrame()
+                 mock_monitor.return_value = {'status': 'ok'}
+
+                 result = run_scheduled_monitoring(config={'schedule': 'daily'})
+
+                 assert result['status'] == 'ok'
+
+
+ @pytest.mark.integration
+ class TestMonitoringIntegration:
+     """Integration tests for monitoring pipeline."""
+
+     def test_full_monitoring_pipeline(self):
+         """Test complete monitoring pipeline."""
+         # Would require full setup with real data
+         pass
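The drift tests lean on the Population Stability Index with the usual rules of thumb (below roughly 0.1 means stable, above roughly 0.2 means significant drift). The actual `calculate_psi` in `ins_pricing/production/monitoring.py` is not shown in this diff; the sketch below is only the standard formula, sum over buckets of (actual% − expected%) · ln(actual% / expected%), with buckets cut from the training distribution. The name `calculate_psi_sketch` and the `eps` guard are assumptions for illustration.

```python
# Hypothetical PSI sketch; the real ins_pricing.production.monitoring.calculate_psi
# may differ in bucketing and edge handling.
import numpy as np


def calculate_psi_sketch(expected, actual, buckets=10, eps=1e-6):
    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # Interior bucket edges from the expected (training) distribution; values
    # outside the training range fall into the first or last bucket.
    edges = np.quantile(expected, np.linspace(0, 1, buckets + 1))[1:-1]
    expected_counts = np.bincount(np.searchsorted(edges, expected), minlength=buckets)
    actual_counts = np.bincount(np.searchsorted(edges, actual), minlength=buckets)

    # Clip bucket proportions so empty buckets do not produce log(0) or 0/0.
    expected_pct = np.clip(expected_counts / len(expected), eps, None)
    actual_pct = np.clip(actual_counts / len(actual), eps, None)

    return float(np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)))
```

Under this formula, two halves of the same sample (as in `test_psi_no_drift`) score close to zero, while shifting a uniform distribution by half its range (as in `test_psi_drift_detected`) pushes PSI well past the 0.2 "significant drift" level, which is consistent with the thresholds the tests assert.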