ins-pricing 0.2.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. ins_pricing/CHANGELOG.md +93 -0
  2. ins_pricing/README.md +11 -0
  3. ins_pricing/cli/bayesopt_entry_runner.py +626 -499
  4. ins_pricing/cli/utils/evaluation_context.py +320 -0
  5. ins_pricing/cli/utils/import_resolver.py +350 -0
  6. ins_pricing/modelling/core/bayesopt/PHASE2_REFACTORING_SUMMARY.md +449 -0
  7. ins_pricing/modelling/core/bayesopt/PHASE3_REFACTORING_SUMMARY.md +406 -0
  8. ins_pricing/modelling/core/bayesopt/REFACTORING_SUMMARY.md +247 -0
  9. ins_pricing/modelling/core/bayesopt/config_components.py +351 -0
  10. ins_pricing/modelling/core/bayesopt/config_preprocess.py +3 -4
  11. ins_pricing/modelling/core/bayesopt/core.py +153 -94
  12. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +118 -31
  13. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +294 -139
  14. ins_pricing/modelling/core/bayesopt/utils/__init__.py +86 -0
  15. ins_pricing/modelling/core/bayesopt/utils/constants.py +183 -0
  16. ins_pricing/modelling/core/bayesopt/utils/distributed_utils.py +186 -0
  17. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +126 -0
  18. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +540 -0
  19. ins_pricing/modelling/core/bayesopt/utils/torch_trainer_mixin.py +587 -0
  20. ins_pricing/modelling/core/bayesopt/utils.py +98 -1496
  21. ins_pricing/modelling/core/bayesopt/utils_backup.py +1503 -0
  22. ins_pricing/setup.py +1 -1
  23. {ins_pricing-0.2.9.dist-info → ins_pricing-0.3.0.dist-info}/METADATA +162 -149
  24. {ins_pricing-0.2.9.dist-info → ins_pricing-0.3.0.dist-info}/RECORD +26 -13
  25. {ins_pricing-0.2.9.dist-info → ins_pricing-0.3.0.dist-info}/WHEEL +0 -0
  26. {ins_pricing-0.2.9.dist-info → ins_pricing-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,86 @@
1
+ """Backward compatibility re-exports from refactored utils modules.
2
+
3
+ This module ensures all existing imports continue to work:
4
+ from ins_pricing.modelling.core.bayesopt.utils import EPS, IOUtils, ...
5
+
6
+ The utils.py file has been split into focused modules for better maintainability:
7
+ - constants.py: EPS, set_global_seed, etc.
8
+ - io_utils.py: IOUtils for file I/O
9
+ - distributed_utils.py: DistributedUtils, TrainingUtils for DDP
10
+ - torch_trainer_mixin.py: TorchTrainerMixin for PyTorch training
11
+ - metrics_and_devices.py: Metrics, GPU/device management, CV strategies, plotting
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ # Constants and simple utilities
17
+ from .constants import (
18
+ EPS,
19
+ set_global_seed,
20
+ ensure_parent_dir,
21
+ compute_batch_size,
22
+ tweedie_loss,
23
+ infer_factor_and_cate_list,
24
+ )
25
+
26
+ # I/O utilities
27
+ from .io_utils import (
28
+ IOUtils,
29
+ csv_to_dict,
30
+ )
31
+
32
+ # Distributed training
33
+ from .distributed_utils import (
34
+ DistributedUtils,
35
+ TrainingUtils,
36
+ free_cuda,
37
+ )
38
+
39
+ # PyTorch training mixin
40
+ from .torch_trainer_mixin import (
41
+ TorchTrainerMixin,
42
+ )
43
+
44
+ # Metrics, devices, CV, and plotting
45
+ from .metrics_and_devices import (
46
+ get_logger,
47
+ MetricFactory,
48
+ GPUMemoryManager,
49
+ DeviceManager,
50
+ CVStrategyResolver,
51
+ PlotUtils,
52
+ split_data,
53
+ plot_lift_list,
54
+ plot_dlift_list,
55
+ _OrderedSplitter,
56
+ )
57
+
58
+ __all__ = [
59
+ # Constants
60
+ 'EPS',
61
+ 'set_global_seed',
62
+ 'ensure_parent_dir',
63
+ 'compute_batch_size',
64
+ 'tweedie_loss',
65
+ 'infer_factor_and_cate_list',
66
+ # I/O
67
+ 'IOUtils',
68
+ 'csv_to_dict',
69
+ # Distributed
70
+ 'DistributedUtils',
71
+ 'TrainingUtils',
72
+ 'free_cuda',
73
+ # PyTorch
74
+ 'TorchTrainerMixin',
75
+ # Utilities
76
+ 'get_logger',
77
+ 'MetricFactory',
78
+ 'GPUMemoryManager',
79
+ 'DeviceManager',
80
+ 'CVStrategyResolver',
81
+ 'PlotUtils',
82
+ 'split_data',
83
+ 'plot_lift_list',
84
+ 'plot_dlift_list',
85
+ '_OrderedSplitter',
86
+ ]
@@ -0,0 +1,183 @@
1
+ """Core constants and simple utility functions.
2
+
3
+ This module contains:
4
+ - EPS constant for numerical stability
5
+ - set_global_seed() for reproducibility
6
+ - ensure_parent_dir() for file operations
7
+ - compute_batch_size() for adaptive batching
8
+ - tweedie_loss() for regression loss
9
+ - infer_factor_and_cate_list() for automatic feature detection
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import random
15
+ from pathlib import Path
16
+ from typing import List, Optional, Tuple
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import torch
21
+
22
+ # Constants
23
+ # =============================================================================
24
+ EPS = 1e-8
25
+ """Small epsilon value for numerical stability."""
26
+
27
+
28
+ # Simple utility functions
29
+ # =============================================================================
30
+
31
def set_global_seed(seed: int) -> None:
    """Seed every RNG source (Python, NumPy, PyTorch) for reproducible runs.

    Args:
        seed: Random seed value
    """
    # Seed each library's global generator with the same value.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # Seed all visible CUDA devices as well, when present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
42
+
43
+
44
def ensure_parent_dir(file_path: str) -> None:
    """Create the parent directory of *file_path* if it does not exist.

    Args:
        file_path: Path to file whose parent directory should be created
    """
    parent = Path(file_path).parent
    if not parent.exists():
        # exist_ok guards against a concurrent creation between the check
        # above and the mkdir call.
        parent.mkdir(parents=True, exist_ok=True)
53
+
54
+
55
def compute_batch_size(data_size: int, learning_rate: float,
                       batch_num: int, minimum: int) -> int:
    """Compute adaptive batch size based on data size and learning rate.

    The estimate scales the per-batch sample count by sqrt(lr / 1e-4), then
    clamps the result into [max(1, minimum), data_size].

    Args:
        data_size: Total number of samples
        learning_rate: Learning rate value
        batch_num: Target number of batches
        minimum: Minimum batch size

    Returns:
        Computed batch size
    """
    lr_scale = (learning_rate / 1e-4) ** 0.5
    per_batch = data_size / max(batch_num, 1)
    candidate = max(minimum, int(lr_scale * per_batch))
    # Never exceed the dataset size, and always return at least 1.
    return max(1, min(data_size, candidate))
71
+
72
+
73
def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
    """Compute Tweedie deviance loss for PyTorch.

    Reference: https://scikit-learn.org/stable/modules/model_evaluation.html

    Args:
        pred: Predicted values (tensor)
        target: True values (tensor)
        p: Tweedie power parameter (1.0-2.0)
        eps: Small epsilon for numerical stability
        max_clip: Maximum value for clipping

    Returns:
        Tweedie negative log-likelihood (tensor)
    """
    # Clamp predictions to positive values for stability
    pred_clamped = torch.clamp(pred, min=eps)

    if p == 1:
        # Poisson deviance: 2 * (y*log(y/mu) - y + mu)
        term1 = target * torch.log(target / pred_clamped + eps)
        # BUGFIX: was `-target + pred_clamped`, which (combined below as
        # term1 - term2) produced y*log(y/mu) + y - mu — the linear terms
        # with flipped sign, so the gradient at y == mu was -2 instead of 0.
        term2 = target - pred_clamped
        term3 = 0
    elif p == 0:
        # Gaussian deviance: (y - mu)^2
        term1 = 0.5 * torch.pow(target - pred_clamped, 2)
        term2 = 0
        term3 = 0
    elif p == 2:
        # Gamma deviance: 2 * (log(mu/y) + y/mu - 1)
        term1 = torch.log(pred_clamped / target + eps)
        term2 = -target / pred_clamped + 1
        term3 = 0
    else:
        # General Tweedie deviance for other powers
        term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
        term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
        term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)

    # Deviance = 2 * (term1 - term2 + term3); replace non-finite values so a
    # single degenerate sample cannot poison the batch loss.
    return torch.nan_to_num(
        2 * (term1 - term2 + term3),
        nan=eps,
        posinf=max_clip,
        neginf=-max_clip
    )
118
+
119
+
120
def infer_factor_and_cate_list(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        resp_nme: str,
        weight_nme: str,
        binary_resp_nme: Optional[str] = None,
        factor_nmes: Optional[List[str]] = None,
        cate_list: Optional[List[str]] = None,
        infer_categorical_max_unique: int = 50,
        infer_categorical_max_ratio: float = 0.05
) -> Tuple[List[str], List[str]]:
    """Infer factor_nmes/cate_list when feature names are not provided.

    Rules:
    - factor_nmes: start from shared train/test columns, exclude target/weight/(optional binary target).
    - cate_list: object/category/bool plus low-cardinality integer columns.
    - Always intersect with shared train/test columns to avoid mismatches.

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        resp_nme: Response/target column name
        weight_nme: Sample weight column name
        binary_resp_nme: Optional binary response column name
        factor_nmes: Optional list of feature column names
        cate_list: Optional list of categorical feature names
        infer_categorical_max_unique: Max unique values for categorical inference
        infer_categorical_max_ratio: Max ratio of unique/total for categorical inference

    Returns:
        Tuple of (factor_nmes, cate_list)
    """
    skip = {resp_nme, weight_nme}
    if binary_resp_nme:
        skip.add(binary_resp_nme)

    # Columns present in both frames, preserving train_df column order.
    shared = set(test_df.columns)
    common_cols = [c for c in train_df.columns if c in shared]
    allowed = set(common_cols)

    candidates = common_cols if factor_nmes is None else factor_nmes
    factors = [c for c in candidates if c in allowed and c not in skip]

    # Caller supplied categorical names: keep their order, drop unknowns.
    if cate_list is not None:
        factor_set = set(factors)
        return factors, [c for c in cate_list if c in factor_set]

    row_count = max(1, len(train_df))

    def _looks_categorical(series: pd.Series) -> bool:
        # bool/object/category dtypes are categorical by definition.
        if (pd.api.types.is_bool_dtype(series) or
                pd.api.types.is_object_dtype(series) or
                isinstance(series.dtype, pd.CategoricalDtype)):
            return True
        # Low-cardinality integers are treated as encoded categories.
        if pd.api.types.is_integer_dtype(series):
            uniques = int(series.nunique(dropna=True))
            return (uniques <= infer_categorical_max_unique or
                    (uniques / row_count) <= infer_categorical_max_ratio)
        return False

    cats = [c for c in factors if _looks_categorical(train_df[c])]
    return factors, cats
@@ -0,0 +1,186 @@
1
+ """Distributed training utilities for PyTorch DDP.
2
+
3
+ This module contains:
4
+ - DistributedUtils for DDP setup and process coordination
5
+ - TrainingUtils for CUDA memory management
6
+ - free_cuda() for legacy compatibility
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import gc
12
+ import os
13
+ from datetime import timedelta
14
+ from typing import Optional
15
+
16
+ import torch
17
+ import torch.distributed as dist
18
+
19
+
20
+ def _select_ddp_backend() -> str:
21
+ """Select the appropriate DDP backend based on system capabilities.
22
+
23
+ Returns:
24
+ "nccl" if CUDA is available and NCCL is supported (non-Windows),
25
+ otherwise "gloo"
26
+ """
27
+ if not torch.cuda.is_available():
28
+ return "gloo"
29
+
30
+ if os.name == "nt": # Windows doesn't support NCCL
31
+ return "gloo"
32
+
33
+ try:
34
+ nccl_available = getattr(dist, "is_nccl_available", lambda: False)()
35
+ return "nccl" if nccl_available else "gloo"
36
+ except Exception:
37
+ return "gloo"
38
+
39
+
40
+ def _get_ddp_timeout() -> timedelta:
41
+ """Get the DDP timeout from environment variable.
42
+
43
+ Returns:
44
+ timedelta for DDP timeout (default: 1800 seconds)
45
+ """
46
+ timeout_seconds = int(os.environ.get("BAYESOPT_DDP_TIMEOUT_SECONDS", "1800"))
47
+ return timedelta(seconds=max(1, timeout_seconds))
48
+
49
+
50
def _cache_ddp_state(local_rank: int, rank: int, world_size: int) -> tuple:
    """Store the active DDP state on DistributedUtils and return it."""
    DistributedUtils._cached_state = (True, local_rank, rank, world_size)
    return DistributedUtils._cached_state
55
+
56
+
57
class DistributedUtils:
    """Utilities for distributed data parallel training.

    This class provides methods for:
    - Initializing DDP process groups
    - Checking process rank and world size
    - Cleanup after distributed training
    """

    # Cached (success, local_rank, rank, world_size) tuple populated by
    # setup_ddp() via _cache_ddp_state(); cleared by cleanup_ddp().
    _cached_state: Optional[tuple] = None

    @staticmethod
    def setup_ddp():
        """Initialize the DDP process group for distributed training.

        Returns:
            Tuple of (success, local_rank, rank, world_size)
        """
        # Return cached state if already initialized; if another component
        # initialized the group first, rebuild the cache from the live group.
        if dist.is_initialized():
            if DistributedUtils._cached_state is None:
                DistributedUtils._cached_state = _cache_ddp_state(
                    int(os.environ.get("LOCAL_RANK", 0)),
                    dist.get_rank(),
                    dist.get_world_size(),
                )
            return DistributedUtils._cached_state

        # Check for required environment variables (normally set by torchrun).
        if 'RANK' not in os.environ or 'WORLD_SIZE' not in os.environ:
            print(
                f">>> DDP Setup Failed: RANK or WORLD_SIZE not found in env. "
                f"Keys found: {list(os.environ.keys())}"
            )
            print(">>> Hint: launch with torchrun --nproc_per_node=<N> <script.py>")
            # Fall back to single-process defaults: (failed, 0, 0, world=1).
            return False, 0, 0, 1

        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        local_rank = int(os.environ.get("LOCAL_RANK", 0))

        # Windows CUDA DDP is not supported
        if os.name == "nt" and torch.cuda.is_available() and world_size > 1:
            print(
                ">>> DDP Setup Disabled: Windows CUDA DDP is not supported. "
                "Falling back to single process."
            )
            return False, 0, 0, 1

        # Set CUDA device for this process before init_process_group so the
        # backend binds to the correct GPU.
        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)

        # Initialize process group using env:// rendezvous (reads MASTER_ADDR
        # etc. from the environment).
        backend = _select_ddp_backend()
        timeout = _get_ddp_timeout()

        dist.init_process_group(backend=backend, init_method="env://", timeout=timeout)
        print(
            f">>> DDP Initialized ({backend}, timeout={timeout.total_seconds():.0f}s): "
            f"Rank {rank}/{world_size}, Local Rank {local_rank}"
        )

        return _cache_ddp_state(local_rank, rank, world_size)

    @staticmethod
    def cleanup_ddp():
        """Destroy the DDP process group and clear cached state."""
        if dist.is_initialized():
            dist.destroy_process_group()
        # Always drop the cache so a later setup_ddp() starts fresh.
        DistributedUtils._cached_state = None

    @staticmethod
    def is_main_process():
        """Check if current process is rank 0 (main process).

        Returns:
            True if main process or DDP not initialized
        """
        return not dist.is_initialized() or dist.get_rank() == 0

    @staticmethod
    def world_size() -> int:
        """Get the total number of processes in the distributed group.

        Returns:
            World size (1 if DDP not initialized)
        """
        return dist.get_world_size() if dist.is_initialized() else 1
146
+
147
+
148
class TrainingUtils:
    """General training utilities including CUDA management."""

    @staticmethod
    def free_cuda() -> None:
        """Release CUDA memory and clear cache.

        This performs aggressive cleanup:
        1. Move all PyTorch models to CPU
        2. Run garbage collection
        3. Clear CUDA cache
        """
        print(">>> Moving all models to CPU...")
        # Best-effort sweep: anything exposing a callable .to() is asked to
        # move to CPU; failures on unrelated objects are ignored.
        for candidate in gc.get_objects():
            try:
                mover = getattr(candidate, "to", None)
                if callable(mover):
                    mover("cpu")
            except Exception:
                pass

        print(">>> Releasing tensor/optimizer/DataLoader references...")
        gc.collect()

        print(">>> Clearing CUDA cache...")
        if not torch.cuda.is_available():
            print(">>> CUDA not available; cleanup skipped.")
            return
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print(">>> CUDA memory released.")
178
+
179
+
180
+ # Backward compatibility function wrapper
181
def free_cuda():
    """Legacy module-level wrapper delegating to TrainingUtils.free_cuda()."""
    TrainingUtils.free_cuda()
@@ -0,0 +1,126 @@
1
+ """File I/O and parameter loading utilities.
2
+
3
+ This module contains:
4
+ - IOUtils class for loading parameters from JSON/CSV/TSV files
5
+ - csv_to_dict() for CSV file handling
6
+ - File path sanitization utilities
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import csv
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List
15
+
16
+ import pandas as pd
17
+
18
+
19
+ class IOUtils:
20
+ """File and path utilities for model parameters and configs."""
21
+
22
+ @staticmethod
23
+ def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
24
+ """Load CSV file as list of dictionaries.
25
+
26
+ Args:
27
+ file_path: Path to CSV file
28
+
29
+ Returns:
30
+ List of dictionaries, one per row
31
+ """
32
+ with open(file_path, mode='r', encoding='utf-8') as file:
33
+ reader = csv.DictReader(file)
34
+ return [
35
+ dict(filter(lambda item: item[0] != '', row.items()))
36
+ for row in reader
37
+ ]
38
+
39
+ @staticmethod
40
+ def ensure_parent_dir(file_path: str) -> None:
41
+ """Create parent directories when missing.
42
+
43
+ Args:
44
+ file_path: Path to file whose parent directory should be created
45
+ """
46
+ directory = Path(file_path).parent
47
+ if directory and not directory.exists():
48
+ directory.mkdir(parents=True, exist_ok=True)
49
+
50
+ @staticmethod
51
+ def _sanitize_params_dict(params: Dict[str, Any]) -> Dict[str, Any]:
52
+ """Filter index-like columns such as "Unnamed: 0" from pandas I/O.
53
+
54
+ Args:
55
+ params: Parameter dictionary
56
+
57
+ Returns:
58
+ Sanitized parameter dictionary
59
+ """
60
+ return {
61
+ k: v
62
+ for k, v in (params or {}).items()
63
+ if k and not str(k).startswith("Unnamed")
64
+ }
65
+
66
+ @staticmethod
67
+ def load_params_file(path: str) -> Dict[str, Any]:
68
+ """Load parameter dict from JSON/CSV/TSV files.
69
+
70
+ Supported formats:
71
+ - JSON: accept dict or {"best_params": {...}} wrapper
72
+ - CSV/TSV: read the first row as params
73
+
74
+ Args:
75
+ path: Path to parameter file
76
+
77
+ Returns:
78
+ Parameter dictionary
79
+
80
+ Raises:
81
+ FileNotFoundError: If file doesn't exist
82
+ ValueError: If file format is unsupported or invalid
83
+ """
84
+ file_path = Path(path).expanduser().resolve()
85
+ if not file_path.exists():
86
+ raise FileNotFoundError(f"params file not found: {file_path}")
87
+
88
+ suffix = file_path.suffix.lower()
89
+
90
+ if suffix == ".json":
91
+ payload = json.loads(file_path.read_text(
92
+ encoding="utf-8", errors="replace"))
93
+ if isinstance(payload, dict) and "best_params" in payload:
94
+ payload = payload.get("best_params") or {}
95
+ if not isinstance(payload, dict):
96
+ raise ValueError(
97
+ f"Invalid JSON params file (expect dict): {file_path}")
98
+ return IOUtils._sanitize_params_dict(dict(payload))
99
+
100
+ if suffix in (".csv", ".tsv"):
101
+ df = pd.read_csv(file_path, sep="\t" if suffix == ".tsv" else ",")
102
+ if df.empty:
103
+ raise ValueError(f"Empty params file: {file_path}")
104
+ params = df.iloc[0].to_dict()
105
+ return IOUtils._sanitize_params_dict(params)
106
+
107
+ raise ValueError(
108
+ f"Unsupported params file type '{suffix}': {file_path}")
109
+
110
+
111
+ # Backward compatibility function wrapper
112
def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
    """Legacy module-level alias for IOUtils.csv_to_dict.

    Args:
        file_path: Path to CSV file

    Returns:
        List of dictionaries, one per row
    """
    return IOUtils.csv_to_dict(file_path)
122
+
123
+
124
def ensure_parent_dir(file_path: str) -> None:
    """Legacy module-level alias for IOUtils.ensure_parent_dir."""
    IOUtils.ensure_parent_dir(file_path)