ins-pricing 0.2.7-py3-none-any.whl → 0.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/CHANGELOG.md +179 -0
- ins_pricing/RELEASE_NOTES_0.2.8.md +344 -0
- ins_pricing/modelling/core/bayesopt/utils.py +2 -1
- ins_pricing/modelling/explain/shap_utils.py +209 -6
- ins_pricing/pricing/calibration.py +125 -1
- ins_pricing/pricing/factors.py +110 -1
- ins_pricing/production/preprocess.py +166 -0
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/governance/__init__.py +1 -0
- ins_pricing/tests/governance/test_audit.py +56 -0
- ins_pricing/tests/governance/test_registry.py +128 -0
- ins_pricing/tests/governance/test_release.py +74 -0
- ins_pricing/tests/pricing/__init__.py +1 -0
- ins_pricing/tests/pricing/test_calibration.py +72 -0
- ins_pricing/tests/pricing/test_exposure.py +64 -0
- ins_pricing/tests/pricing/test_factors.py +156 -0
- ins_pricing/tests/pricing/test_rate_table.py +40 -0
- ins_pricing/tests/production/__init__.py +1 -0
- ins_pricing/tests/production/test_monitoring.py +350 -0
- ins_pricing/tests/production/test_predict.py +233 -0
- ins_pricing/tests/production/test_preprocess.py +339 -0
- ins_pricing/tests/production/test_scoring.py +311 -0
- ins_pricing/utils/profiling.py +377 -0
- ins_pricing/utils/validation.py +427 -0
- ins_pricing-0.2.9.dist-info/METADATA +149 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/RECORD +28 -12
- ins_pricing/CHANGELOG_20260114.md +0 -275
- ins_pricing/CODE_REVIEW_IMPROVEMENTS.md +0 -715
- ins_pricing-0.2.7.dist-info/METADATA +0 -101
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/WHEEL +0 -0
- {ins_pricing-0.2.7.dist-info → ins_pricing-0.2.9.dist-info}/top_level.txt +0 -0
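The listing above can be verified independently by downloading both wheels and comparing their contents. Below is a minimal sketch of that check using only pip and the standard library; the "old"/"new" directory names are illustrative, and the sketch compares file sets and sizes rather than producing a full line diff.

# Sketch: reproduce this wheel-to-wheel file listing locally.
# Assumes `pip` is available on PATH; "old"/"new" are illustrative names.
import subprocess
import zipfile
from pathlib import Path

def wheel_members(wheel: Path) -> dict:
    """Map each file inside the wheel to its uncompressed size."""
    with zipfile.ZipFile(wheel) as zf:
        return {info.filename: info.file_size for info in zf.infolist()}

for version, dest in [("0.2.7", "old"), ("0.2.9", "new")]:
    subprocess.run(
        ["pip", "download", f"ins-pricing=={version}", "--no-deps", "-d", dest],
        check=True,
    )

old = wheel_members(next(Path("old").glob("*.whl")))
new = wheel_members(next(Path("new").glob("*.whl")))

for name in sorted(new.keys() - old.keys()):
    print(f"added:   {name}")
for name in sorted(old.keys() - new.keys()):
    print(f"removed: {name}")
for name in sorted(old.keys() & new.keys()):
    if old[name] != new[name]:
        print(f"changed: {name}")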
ins_pricing/tests/production/test_predict.py
@@ -0,0 +1,233 @@
+"""Tests for production prediction module."""
+
+import json
+from pathlib import Path
+from unittest.mock import Mock, patch, MagicMock
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ins_pricing.exceptions import ConfigurationError, ModelLoadError, PredictionError
+
+
+# Skip this module's tests when production.predict (or its heavy dependencies) cannot be imported
+pytest.importorskip("ins_pricing.production.predict", reason="predict module not available")
+
+
+@pytest.fixture
+def sample_config():
+    """Sample configuration for predictor."""
+    return {
+        "model_name": "test_model",
+        "task_type": "regression",
+        "base_dir": "/tmp/models",
+        "feature_names": ["age", "premium", "region"],
+        "model_type": "xgboost"
+    }
+
+
+@pytest.fixture
+def sample_data():
+    """Sample input data for predictions."""
+    return pd.DataFrame({
+        "age": [25, 30, 35, 40],
+        "premium": [100.0, 150.0, 200.0, 250.0],
+        "region": ["A", "B", "A", "C"]
+    })
+
+
+class TestConfigValidation:
+    """Test configuration validation for predictors."""
+
+    def test_missing_config_file(self, tmp_path):
+        """Test error when config file doesn't exist."""
+        from ins_pricing.production.predict import load_predictor_from_config
+
+        config_path = tmp_path / "nonexistent.json"
+
+        with pytest.raises((FileNotFoundError, ModelLoadError)):
+            load_predictor_from_config(config_path)
+
+    def test_invalid_json_config(self, tmp_path):
+        """Test error when config file contains invalid JSON."""
+        from ins_pricing.production.predict import load_predictor_from_config
+
+        config_path = tmp_path / "invalid.json"
+        config_path.write_text("{ invalid json }")
+
+        with pytest.raises((ConfigurationError, json.JSONDecodeError)):
+            load_predictor_from_config(config_path)
+
+    def test_missing_required_fields(self, tmp_path):
+        """Test error when required config fields are missing."""
+        from ins_pricing.production.predict import load_predictor_from_config
+
+        config_path = tmp_path / "incomplete.json"
+        config_path.write_text(json.dumps({"model_name": "test"}))
+
+        with pytest.raises(ConfigurationError):
+            load_predictor_from_config(config_path)
+
+
+class TestPredictorLoading:
+    """Test predictor loading functionality."""
+
+    @patch('ins_pricing.production.predict.load_model')
+    def test_load_valid_predictor(self, mock_load_model, tmp_path, sample_config):
+        """Test loading a valid predictor."""
+        from ins_pricing.production.predict import load_predictor_from_config
+
+        # Setup
+        config_path = tmp_path / "config.json"
+        config_path.write_text(json.dumps(sample_config))
+        mock_load_model.return_value = Mock()
+
+        # Execute
+        predictor = load_predictor_from_config(config_path)
+
+        # Verify
+        assert predictor is not None
+        assert predictor['config']['model_name'] == "test_model"
+
+    def test_load_missing_model_file(self, tmp_path, sample_config):
+        """Test error when model file is missing."""
+        from ins_pricing.production.predict import load_predictor_from_config
+
+        config_path = tmp_path / "config.json"
+        config_path.write_text(json.dumps(sample_config))
+
+        with pytest.raises(ModelLoadError):
+            load_predictor_from_config(config_path)
+
+
+class TestPrediction:
+    """Test prediction functionality."""
+
+    @patch('ins_pricing.production.predict.load_model')
+    def test_predict_on_valid_data(self, mock_load_model, sample_data):
+        """Test prediction on valid input data."""
+        from ins_pricing.production.predict import predict
+
+        # Setup mock model
+        mock_model = Mock()
+        mock_model.predict.return_value = np.array([100, 150, 200, 250])
+        mock_load_model.return_value = mock_model
+
+        predictor = {
+            'model': mock_model,
+            'config': {'feature_names': ["age", "premium", "region"]}
+        }
+
+        # Execute
+        predictions = predict(predictor, sample_data)
+
+        # Verify
+        assert len(predictions) == len(sample_data)
+        assert all(isinstance(p, (int, float, np.number)) for p in predictions)
+
+    def test_predict_missing_features(self, sample_data):
+        """Test error when input data is missing required features."""
+        from ins_pricing.production.predict import predict
+        from ins_pricing.utils.validation import validate_required_columns
+
+        predictor = {
+            'model': Mock(),
+            'config': {'feature_names': ["age", "premium", "region", "missing_col"]}
+        }
+
+        # Should raise validation error for missing column
+        with pytest.raises(PredictionError):
+            validate_required_columns(
+                sample_data,
+                predictor['config']['feature_names'],
+                df_name="input_data"
+            )
+
+    def test_predict_empty_dataframe(self):
+        """Test prediction on empty DataFrame."""
+        from ins_pricing.production.predict import predict
+        from ins_pricing.utils.validation import validate_dataframe_not_empty
+
+        empty_df = pd.DataFrame()
+        predictor = {'model': Mock(), 'config': {}}
+
+        with pytest.raises(PredictionError):
+            validate_dataframe_not_empty(empty_df, df_name="input_data")
+
+
+class TestBatchScoring:
+    """Test batch scoring functionality."""
+
+    @patch('ins_pricing.production.predict.load_predictor_from_config')
+    @patch('ins_pricing.production.predict.predict')
+    def test_batch_score_success(self, mock_predict, mock_load, sample_data, tmp_path):
+        """Test successful batch scoring."""
+        from ins_pricing.production.predict import batch_score
+
+        # Setup
+        mock_load.return_value = {'model': Mock(), 'config': {}}
+        mock_predict.return_value = np.array([100, 150, 200, 250])
+
+        output_path = tmp_path / "predictions.csv"
+
+        # Execute
+        batch_score(
+            config_path=tmp_path / "config.json",
+            input_data=sample_data,
+            output_path=output_path
+        )
+
+        # Verify
+        assert output_path.exists()
+        results = pd.read_csv(output_path)
+        assert "predictions" in results.columns
+
+    def test_batch_score_large_data(self, tmp_path):
+        """Test batch scoring with large dataset."""
+        from ins_pricing.production.predict import batch_score
+
+        # Create large dataset
+        large_data = pd.DataFrame({
+            "age": np.random.randint(20, 70, size=10000),
+            "premium": np.random.uniform(100, 500, size=10000),
+            "region": np.random.choice(["A", "B", "C"], size=10000)
+        })
+
+        with patch('ins_pricing.production.predict.load_predictor_from_config') as mock_load:
+            with patch('ins_pricing.production.predict.predict') as mock_predict:
+                mock_load.return_value = {'model': Mock(), 'config': {}}
+                mock_predict.return_value = np.random.uniform(50, 300, size=10000)
+
+                output_path = tmp_path / "large_predictions.csv"
+                batch_score(
+                    config_path=tmp_path / "config.json",
+                    input_data=large_data,
+                    output_path=output_path
+                )
+
+                assert output_path.exists()
+
+
+class TestModelVersioning:
+    """Test model versioning functionality."""
+
+    def test_version_compatibility_check(self):
+        """Test version compatibility checking."""
+        # Test that predictor checks model version compatibility
+        pass  # Implement based on actual version checking logic
+
+    def test_load_different_model_versions(self):
+        """Test loading different versions of the same model."""
+        pass  # Implement based on actual versioning system
+
+
+@pytest.mark.integration
+class TestPredictionIntegration:
+    """Integration tests for prediction pipeline."""
+
+    @pytest.mark.skipif(not Path("test_models").exists(), reason="Test models not available")
+    def test_end_to_end_prediction(self):
+        """Test complete prediction pipeline from config to output."""
+        # This would require actual model artifacts
+        pass
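The tests above pin down the contract of the predict module without showing its implementation: a predictor is a dict with 'model' and 'config' keys, predict returns one value per input row, and batch_score writes a CSV with a "predictions" column. As a reading aid, here is a minimal, self-contained sketch of that surface, inferred purely from the assertions; REQUIRED_FIELDS, the load_model stub, and the fallback to data.columns are assumptions, not the packaged code.

# Minimal sketch of the API these tests exercise; inferred from the
# assertions above, not copied from the wheel.
import json
from pathlib import Path

import pandas as pd

from ins_pricing.exceptions import ConfigurationError, ModelLoadError

REQUIRED_FIELDS = ("model_name", "task_type", "base_dir")  # assumed field set

def load_model(config):
    """Stub: the tests patch this symbol; the packaged loader is not shown."""
    raise ModelLoadError(f"no model artifact for {config.get('model_name')}")

def load_predictor_from_config(config_path):
    """Read and validate the JSON config, then attach the loaded model."""
    path = Path(config_path)
    if not path.exists():
        raise ModelLoadError(f"config not found: {path}")
    config = json.loads(path.read_text())  # may raise json.JSONDecodeError
    missing = [f for f in REQUIRED_FIELDS if f not in config]
    if missing:
        raise ConfigurationError(f"missing config fields: {missing}")
    return {"model": load_model(config), "config": config}

def predict(predictor, data: pd.DataFrame):
    """Score one value per input row with the wrapped model."""
    features = predictor["config"].get("feature_names", list(data.columns))
    return predictor["model"].predict(data[features])

def batch_score(config_path, input_data: pd.DataFrame, output_path):
    """Load a predictor, score input_data, and write a predictions CSV."""
    predictor = load_predictor_from_config(config_path)
    out = input_data.copy()
    out["predictions"] = predict(predictor, input_data)
    out.to_csv(output_path, index=False)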
ins_pricing/tests/production/test_preprocess.py
@@ -0,0 +1,339 @@
+"""Tests for production preprocessing module."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from unittest.mock import Mock, patch
+
+from ins_pricing.exceptions import PreprocessingError, DataValidationError
+
+
+@pytest.fixture
+def sample_raw_data():
+    """Sample raw input data."""
+    return pd.DataFrame({
+        "age": [25, 30, 35, 40, 45],
+        "gender": ["M", "F", "M", "F", "M"],
+        "region": ["North", "South", "East", "West", "North"],
+        "premium": [100.0, 150.0, 200.0, 250.0, 300.0],
+        "coverage": ["Basic", "Premium", "Basic", "Premium", "Premium"]
+    })
+
+
+class TestFeatureEngineering:
+    """Test feature engineering transformations."""
+
+    def test_create_age_bands(self, sample_raw_data):
+        """Test age banding transformation."""
+        from ins_pricing.production.preprocess import create_age_bands
+
+        df = create_age_bands(sample_raw_data, 'age', bins=[0, 30, 40, 100])
+
+        assert 'age_band' in df.columns
+        assert df['age_band'].dtype == 'object' or pd.api.types.is_categorical_dtype(df['age_band'])
+
+    def test_encode_categorical(self, sample_raw_data):
+        """Test categorical encoding."""
+        from ins_pricing.production.preprocess import encode_categorical
+
+        df = encode_categorical(
+            sample_raw_data,
+            columns=['gender', 'region'],
+            method='onehot'
+        )
+
+        # Check that encoded columns exist
+        assert any('gender_' in col for col in df.columns)
+        assert any('region_' in col for col in df.columns)
+
+    def test_scale_numeric_features(self, sample_raw_data):
+        """Test numeric feature scaling."""
+        from ins_pricing.production.preprocess import scale_features
+
+        df = scale_features(
+            sample_raw_data,
+            columns=['premium'],
+            method='standard'
+        )
+
+        # Check that scaled values have mean ~0 and std ~1
+        assert abs(df['premium'].mean()) < 0.1
+        assert abs(df['premium'].std() - 1.0) < 0.1
+
+    def test_create_interaction_features(self, sample_raw_data):
+        """Test interaction feature creation."""
+        from ins_pricing.production.preprocess import create_interactions
+
+        df = create_interactions(
+            sample_raw_data,
+            feature_pairs=[('age', 'premium')]
+        )
+
+        assert 'age_x_premium' in df.columns
+
+    def test_polynomial_features(self, sample_raw_data):
+        """Test polynomial feature generation."""
+        from ins_pricing.production.preprocess import create_polynomial_features
+
+        df = create_polynomial_features(
+            sample_raw_data,
+            columns=['age'],
+            degree=2
+        )
+
+        assert 'age_squared' in df.columns
+
+
+class TestDataCleaning:
+    """Test data cleaning operations."""
+
+    def test_handle_missing_values(self):
+        """Test missing value handling."""
+        from ins_pricing.production.preprocess import handle_missing
+
+        df = pd.DataFrame({
+            "col1": [1, 2, np.nan, 4],
+            "col2": [np.nan, 2, 3, 4],
+            "col3": ["A", "B", None, "D"]
+        })
+
+        cleaned = handle_missing(df, strategy='mean', columns=['col1', 'col2'])
+
+        assert cleaned['col1'].isna().sum() == 0
+        assert cleaned['col2'].isna().sum() == 0
+
+    def test_remove_outliers(self, sample_raw_data):
+        """Test outlier removal."""
+        from ins_pricing.production.preprocess import remove_outliers
+
+        # Add outlier
+        data_with_outlier = sample_raw_data.copy()
+        data_with_outlier.loc[0, 'premium'] = 10000  # Extreme value
+
+        cleaned = remove_outliers(data_with_outlier, column='premium', method='iqr')
+
+        assert len(cleaned) < len(data_with_outlier)
+
+    def test_deduplicate(self):
+        """Test duplicate removal."""
+        from ins_pricing.production.preprocess import deduplicate
+
+        df = pd.DataFrame({
+            "id": [1, 2, 2, 3],
+            "value": [10, 20, 20, 30]
+        })
+
+        deduped = deduplicate(df)
+
+        assert len(deduped) == 3
+
+    def test_fix_data_types(self):
+        """Test data type corrections."""
+        from ins_pricing.production.preprocess import fix_data_types
+
+        df = pd.DataFrame({
+            "age": ["25", "30", "35"],  # String instead of int
+            "premium": [100, 200, 300]
+        })
+
+        fixed = fix_data_types(df, type_spec={'age': 'int64'})
+
+        assert fixed['age'].dtype == np.int64
+
+
+class TestFeatureSelection:
+    """Test feature selection operations."""
+
+    def test_select_by_importance(self):
+        """Test feature selection by importance."""
+        from ins_pricing.production.preprocess import select_features_by_importance
+
+        X = pd.DataFrame(np.random.rand(100, 10))
+        y = pd.Series(np.random.rand(100))
+
+        selected = select_features_by_importance(X, y, n_features=5)
+
+        assert selected.shape[1] == 5
+
+    def test_remove_low_variance(self):
+        """Test removal of low variance features."""
+        from ins_pricing.production.preprocess import remove_low_variance
+
+        df = pd.DataFrame({
+            "constant": [1, 1, 1, 1],  # Zero variance
+            "low_var": [1, 1, 1, 2],  # Low variance
+            "high_var": [1, 5, 10, 20]  # High variance
+        })
+
+        filtered = remove_low_variance(df, threshold=0.01)
+
+        assert 'constant' not in filtered.columns
+        assert 'high_var' in filtered.columns
+
+    def test_remove_correlated_features(self):
+        """Test removal of highly correlated features."""
+        from ins_pricing.production.preprocess import remove_correlated
+
+        # Create correlated features
+        df = pd.DataFrame({
+            "feature_1": np.random.rand(100),
+            "feature_2": np.random.rand(100)
+        })
+        df['feature_3'] = df['feature_1'] * 1.1  # Highly correlated
+
+        filtered = remove_correlated(df, threshold=0.95)
+
+        assert filtered.shape[1] < 3
+
+
+class TestPipelineValidation:
+    """Test preprocessing pipeline validation."""
+
+    def test_validate_input_schema(self, sample_raw_data):
+        """Test input schema validation."""
+        from ins_pricing.production.preprocess import validate_input_schema
+
+        expected_schema = {
+            "age": "int64",
+            "gender": "object",
+            "premium": "float64"
+        }
+
+        # Should not raise
+        validate_input_schema(sample_raw_data, expected_schema)
+
+    def test_validate_input_schema_failure(self):
+        """Test input schema validation catches errors."""
+        from ins_pricing.production.preprocess import validate_input_schema
+        from ins_pricing.utils.validation import validate_column_types
+
+        df = pd.DataFrame({
+            "age": ["not_a_number", "25"],  # Wrong type
+            "premium": [100.0, 200.0]
+        })
+
+        expected_schema = {"age": "int64", "premium": "float64"}
+
+        with pytest.raises(DataValidationError):
+            validate_column_types(df, expected_schema, coerce=False)
+
+    def test_validate_feature_range(self, sample_raw_data):
+        """Test feature value range validation."""
+        from ins_pricing.utils.validation import validate_value_range
+
+        # Age should be positive
+        validate_value_range(sample_raw_data, 'age', min_val=0, max_val=120)
+
+        # Premium should be positive
+        validate_value_range(sample_raw_data, 'premium', min_val=0)
+
+
+class TestPreprocessorState:
+    """Test preprocessor state management."""
+
+    def test_save_preprocessor_state(self, sample_raw_data, tmp_path):
+        """Test saving preprocessor state."""
+        from ins_pricing.production.preprocess import Preprocessor
+
+        preprocessor = Preprocessor()
+        preprocessor.fit(sample_raw_data)
+
+        state_path = tmp_path / "preprocessor_state.pkl"
+        preprocessor.save(state_path)
+
+        assert state_path.exists()
+
+    def test_load_preprocessor_state(self, tmp_path):
+        """Test loading preprocessor state."""
+        from ins_pricing.production.preprocess import Preprocessor
+
+        # Create and save
+        preprocessor = Preprocessor()
+        state_path = tmp_path / "preprocessor_state.pkl"
+        preprocessor.save(state_path)
+
+        # Load
+        loaded = Preprocessor.load(state_path)
+
+        assert loaded is not None
+
+    def test_preprocessor_consistency(self, sample_raw_data, tmp_path):
+        """Test that loaded preprocessor produces same results."""
+        from ins_pricing.production.preprocess import Preprocessor
+
+        # Fit and transform
+        preprocessor = Preprocessor()
+        preprocessor.fit(sample_raw_data)
+        result1 = preprocessor.transform(sample_raw_data)
+
+        # Save, load, transform
+        state_path = tmp_path / "preprocessor_state.pkl"
+        preprocessor.save(state_path)
+        loaded = Preprocessor.load(state_path)
+        result2 = loaded.transform(sample_raw_data)
+
+        # Results should be identical
+        pd.testing.assert_frame_equal(result1, result2)
+
+
+class TestTransformationPipeline:
+    """Test complete transformation pipeline."""
+
+    def test_full_pipeline(self, sample_raw_data):
+        """Test complete preprocessing pipeline."""
+        from ins_pricing.production.preprocess import PreprocessingPipeline
+
+        pipeline = PreprocessingPipeline([
+            ('handle_missing', {'strategy': 'mean'}),
+            ('encode_categorical', {'columns': ['gender', 'region']}),
+            ('scale_features', {'columns': ['premium'], 'method': 'standard'})
+        ])
+
+        transformed = pipeline.fit_transform(sample_raw_data)
+
+        assert transformed is not None
+        assert len(transformed) == len(sample_raw_data)
+
+    def test_pipeline_error_handling(self):
+        """Test pipeline handles errors gracefully."""
+        from ins_pricing.production.preprocess import PreprocessingPipeline
+
+        df = pd.DataFrame({
+            "col1": [1, 2, 3],
+            "col2": ["A", "B", "C"]
+        })
+
+        # Try to scale a categorical column (should fail)
+        pipeline = PreprocessingPipeline([
+            ('scale_features', {'columns': ['col2'], 'method': 'standard'})
+        ])
+
+        with pytest.raises(PreprocessingError):
+            pipeline.fit_transform(df)
+
+
+@pytest.mark.performance
+class TestPreprocessingPerformance:
+    """Test preprocessing performance."""
+
+    def test_large_dataset_preprocessing(self):
+        """Test preprocessing on large dataset."""
+        from ins_pricing.production.preprocess import Preprocessor
+
+        n = 100_000
+        large_df = pd.DataFrame({
+            "age": np.random.randint(18, 80, n),
+            "premium": np.random.uniform(100, 1000, n),
+            "region": np.random.choice(['A', 'B', 'C', 'D'], n)
+        })
+
+        preprocessor = Preprocessor()
+
+        import time
+        start = time.time()
+        preprocessor.fit(large_df)
+        transformed = preprocessor.transform(large_df)
+        elapsed = time.time() - start
+
+        assert len(transformed) == n
+        assert elapsed < 5.0  # Should complete in under 5 seconds
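As with the predict module, the preprocessing helpers appear here only through their contracts. The sketch below shows two of them written solely to satisfy the assertions above; the packaged implementations in preprocess.py may differ.

# Sketch of two helpers, written only to match the behavior the tests
# assert; not the packaged implementation.
import pandas as pd

def create_age_bands(df: pd.DataFrame, column: str, bins) -> pd.DataFrame:
    """Add an 'age_band' column by cutting `column` at `bins`."""
    out = df.copy()
    # pd.cut yields a categorical column, which satisfies the dtype
    # assertion in test_create_age_bands.
    out["age_band"] = pd.cut(out[column], bins=bins)
    return out

def remove_low_variance(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Drop columns whose variance is at or below `threshold`."""
    keep = [c for c in df.columns if df[c].var() > threshold]
    return df[keep]

Under test_remove_low_variance's data, the zero-variance "constant" column is dropped while "high_var" survives, which is exactly what the test checks; whether borderline columns like "low_var" are kept depends on the threshold.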