ins-pricing 0.4.5-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +58 -46
  4. ins_pricing/cli/BayesOpt_incremental.py +77 -110
  5. ins_pricing/cli/Explain_Run.py +42 -23
  6. ins_pricing/cli/Explain_entry.py +551 -577
  7. ins_pricing/cli/Pricing_Run.py +42 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +51 -16
  9. ins_pricing/cli/utils/bootstrap.py +23 -0
  10. ins_pricing/cli/utils/cli_common.py +256 -256
  11. ins_pricing/cli/utils/cli_config.py +379 -360
  12. ins_pricing/cli/utils/import_resolver.py +375 -358
  13. ins_pricing/cli/utils/notebook_utils.py +256 -242
  14. ins_pricing/cli/watchdog_run.py +216 -198
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/app.py +132 -61
  17. ins_pricing/frontend/config_builder.py +33 -0
  18. ins_pricing/frontend/example_config.json +11 -0
  19. ins_pricing/frontend/example_workflows.py +1 -1
  20. ins_pricing/frontend/runner.py +340 -388
  21. ins_pricing/governance/__init__.py +20 -20
  22. ins_pricing/governance/release.py +159 -159
  23. ins_pricing/modelling/README.md +1 -1
  24. ins_pricing/modelling/__init__.py +147 -92
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
  37. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
  39. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
  40. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
  42. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
  43. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
  44. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  45. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  46. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
  47. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  48. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  49. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  50. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
  51. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  52. ins_pricing/modelling/explain/__init__.py +55 -55
  53. ins_pricing/modelling/explain/metrics.py +27 -174
  54. ins_pricing/modelling/explain/permutation.py +237 -237
  55. ins_pricing/modelling/plotting/__init__.py +40 -36
  56. ins_pricing/modelling/plotting/compat.py +228 -0
  57. ins_pricing/modelling/plotting/curves.py +572 -572
  58. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  59. ins_pricing/modelling/plotting/geo.py +362 -362
  60. ins_pricing/modelling/plotting/importance.py +121 -121
  61. ins_pricing/pricing/__init__.py +27 -27
  62. ins_pricing/pricing/factors.py +67 -56
  63. ins_pricing/production/__init__.py +35 -25
  64. ins_pricing/production/{predict.py → inference.py} +140 -57
  65. ins_pricing/production/monitoring.py +8 -21
  66. ins_pricing/reporting/__init__.py +11 -11
  67. ins_pricing/setup.py +1 -1
  68. ins_pricing/tests/production/test_inference.py +90 -0
  69. ins_pricing/utils/__init__.py +112 -78
  70. ins_pricing/utils/device.py +258 -237
  71. ins_pricing/utils/features.py +53 -0
  72. ins_pricing/utils/io.py +72 -0
  73. ins_pricing/utils/logging.py +34 -1
  74. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  75. ins_pricing/utils/metrics.py +158 -24
  76. ins_pricing/utils/numerics.py +76 -0
  77. ins_pricing/utils/paths.py +9 -1
  78. ins_pricing/utils/profiling.py +8 -4
  79. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
  80. ins_pricing-0.5.1.dist-info/RECORD +132 -0
  81. ins_pricing/modelling/core/BayesOpt.py +0 -146
  82. ins_pricing/modelling/core/__init__.py +0 -1
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  92. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  93. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
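Most of the churn below is restructuring rather than new behaviour: the modelling/core/bayesopt tree moves up to modelling/bayesopt (items 25-50), production/predict.py becomes production/inference.py with its test module renamed to match (items 64, 68, 90), and shared helpers (losses, metrics, io, features, numerics) are consolidated under ins_pricing/utils (items 71-78). For downstream code this is mostly an import-path migration; a minimal sketch, assuming these modules import cleanly under the new paths shown in the file list (verify against the 0.5.1 package before relying on it):

    # Hypothetical import migration for 0.4.5 -> 0.5.1 (paths from the file list).
    # 0.4.5:
    #   from ins_pricing.modelling.core.bayesopt import core
    #   from ins_pricing.production import predict
    # 0.5.1:
    from ins_pricing.modelling.bayesopt import core        # "core/" level removed
    from ins_pricing.production import inference           # predict.py renamed
    from ins_pricing.utils import DeviceManager, GPUMemoryManager  # consolidated device helpers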
ins_pricing/utils/device.py
@@ -1,76 +1,91 @@
- """Device management utilities for PyTorch models.
-
+ """Device management utilities for PyTorch models.
+
  This module consolidates GPU/CPU device management logic from:
- - modelling/core/bayesopt/utils.py
- - modelling/core/bayesopt/trainers/trainer_base.py
- - production/predict.py
-
- Example:
-     >>> from ins_pricing.utils import DeviceManager, GPUMemoryManager
-     >>> device = DeviceManager.get_best_device()
-     >>> DeviceManager.move_to_device(model, device)
-     >>> with GPUMemoryManager.cleanup_context():
-     ...     model.train()
- """
-
- from __future__ import annotations
-
- import gc
- import os
- from contextlib import contextmanager
- from typing import Any, Dict, Optional
-
- try:
-     import torch
-     import torch.nn as nn
-     from torch.nn.parallel import DistributedDataParallel as DDP
-
-     TORCH_AVAILABLE = True
- except ImportError:
-     TORCH_AVAILABLE = False
-     torch = None
-     nn = None
-     DDP = None
-
- from .logging import get_logger
-
-
- # =============================================================================
- # GPU Memory Manager
- # =============================================================================
-
-
- class GPUMemoryManager:
-     """Context manager for GPU memory management and cleanup.
-
-     This class consolidates GPU memory cleanup logic that was previously
-     scattered across multiple trainer files.
-
-     Example:
-         >>> with GPUMemoryManager.cleanup_context():
-         ...     model.train()
-         ...     # Memory cleaned up after exiting context
-
-         >>> # Or use directly:
-         >>> GPUMemoryManager.clean()
-     """
-
-     _logger = get_logger("ins_pricing.gpu")
-
-     @classmethod
-     def clean(cls, verbose: bool = False) -> None:
+ - modelling/bayesopt/utils.py
+ - modelling/bayesopt/trainers/trainer_base.py
+ - production/inference.py
+
+ Example:
+     >>> from ins_pricing.utils import DeviceManager, GPUMemoryManager
+     >>> device = DeviceManager.get_best_device()
+     >>> DeviceManager.move_to_device(model, device)
+     >>> with GPUMemoryManager.cleanup_context():
+     ...     model.train()
+ """
+
+ from __future__ import annotations
+
+ import gc
+ import os
+ from contextlib import contextmanager
+ from typing import Any, Dict, Optional
+
+ try:
+     import torch
+     import torch.nn as nn
+     from torch.nn.parallel import DistributedDataParallel as DDP
+
+     TORCH_AVAILABLE = True
+ except ImportError:
+     TORCH_AVAILABLE = False
+     torch = None
+     nn = None
+     DDP = None
+
+ from ins_pricing.utils.logging import get_logger
+
+
+ # =============================================================================
+ # GPU Memory Manager
+ # =============================================================================
+
+
+ class GPUMemoryManager:
+     """Context manager for GPU memory management and cleanup.
+
+     This class consolidates GPU memory cleanup logic that was previously
+     scattered across multiple trainer files.
+
+     Example:
+         >>> with GPUMemoryManager.cleanup_context():
+         ...     model.train()
+         ...     # Memory cleaned up after exiting context
+
+         >>> # Or use directly:
+         >>> GPUMemoryManager.clean()
+     """
+
+     _logger = get_logger("ins_pricing.gpu")
+
+     @classmethod
+     def clean(
+         cls,
+         verbose: bool = False,
+         *,
+         synchronize: bool = True,
+         empty_cache: bool = True,
+     ) -> None:
          """Clean up GPU memory.

          Args:
              verbose: If True, log cleanup details
+             synchronize: If True, synchronize CUDA device after cleanup
+             empty_cache: If True, clear CUDA cache
          """
          gc.collect()

          if TORCH_AVAILABLE and torch.cuda.is_available():
-             torch.cuda.empty_cache()
-             torch.cuda.synchronize()
+             if empty_cache:
+                 torch.cuda.empty_cache()
+             if synchronize:
+                 torch.cuda.synchronize()
              if verbose:
-                 cls._logger.debug("CUDA cache cleared and synchronized")
+                 if empty_cache and synchronize:
+                     cls._logger.debug("CUDA cache cleared and synchronized")
+                 elif empty_cache:
+                     cls._logger.debug("CUDA cache cleared")
+                 elif synchronize:
+                     cls._logger.debug("CUDA synchronized")

          # Optional: Force IPC collect for multi-process scenarios
          if os.environ.get("BAYESOPT_CUDA_IPC_COLLECT", "0") == "1":
@@ -80,177 +95,183 @@ class GPUMemoryManager:
                      cls._logger.debug("CUDA IPC collect performed")
              except Exception:
                  pass
-
-     @classmethod
-     @contextmanager
-     def cleanup_context(cls, verbose: bool = False):
-         """Context manager that cleans GPU memory on exit.
-
-         Args:
-             verbose: If True, log cleanup details
-
-         Yields:
-             None
-         """
-         try:
-             yield
-         finally:
-             cls.clean(verbose=verbose)
-
-     @classmethod
-     def move_model_to_cpu(cls, model: Any) -> Any:
-         """Move a model to CPU and clean GPU memory.
-
-         Args:
-             model: PyTorch model to move
-
-         Returns:
-             Model on CPU
-         """
-         if model is not None and hasattr(model, "to"):
-             model.to("cpu")
-         cls.clean()
-         return model
-
-     @classmethod
-     def get_memory_info(cls) -> Dict[str, Any]:
-         """Get current GPU memory usage information.
-
-         Returns:
-             Dictionary with memory info (allocated, reserved, free)
-         """
-         if not TORCH_AVAILABLE or not torch.cuda.is_available():
-             return {"available": False}
-
-         try:
-             allocated = torch.cuda.memory_allocated()
-             reserved = torch.cuda.memory_reserved()
-             free, total = torch.cuda.mem_get_info()
-             return {
-                 "available": True,
-                 "allocated_mb": allocated // (1024 * 1024),
-                 "reserved_mb": reserved // (1024 * 1024),
-                 "free_mb": free // (1024 * 1024),
-                 "total_mb": total // (1024 * 1024),
-             }
-         except Exception:
-             return {"available": False}
-
-
- # =============================================================================
- # Device Manager
- # =============================================================================
-
-
- class DeviceManager:
-     """Unified device management for model and tensor placement.
-
-     This class consolidates device detection and model movement logic
-     that was previously duplicated across trainer_base.py and predict.py.
-
-     Example:
-         >>> device = DeviceManager.get_best_device()
-         >>> model = DeviceManager.move_to_device(model)
-     """
-
-     _logger = get_logger("ins_pricing.device")
-     _cached_device: Optional[Any] = None  # torch.device when available
-
-     @classmethod
-     def get_best_device(cls, prefer_cuda: bool = True) -> Any:
-         """Get the best available device.
-
-         Args:
-             prefer_cuda: If True, prefer CUDA over MPS
-
-         Returns:
-             Best available torch.device
-         """
-         if not TORCH_AVAILABLE:
-             return None
-
-         if cls._cached_device is not None:
-             return cls._cached_device
-
-         if prefer_cuda and torch.cuda.is_available():
-             cls._cached_device = torch.device("cuda")
-         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-             cls._cached_device = torch.device("mps")
-         else:
-             cls._cached_device = torch.device("cpu")
-
-         cls._logger.debug(f"Selected device: {cls._cached_device}")
-         return cls._cached_device
-
-     @classmethod
-     def move_to_device(cls, model_obj: Any, device: Optional[Any] = None) -> None:
-         """Move a model object to the specified device.
-
-         Handles sklearn-style wrappers that have .ft, .resnet, or .gnn attributes.
-
-         Args:
-             model_obj: Model object to move (may be sklearn wrapper)
-             device: Target device (defaults to best available)
-         """
-         if model_obj is None:
-             return
-
-         device = device or cls.get_best_device()
-         if device is None:
-             return
-
-         # Update device attribute if present
-         if hasattr(model_obj, "device"):
-             model_obj.device = device
-
-         # Move the main model
-         if hasattr(model_obj, "to"):
-             model_obj.to(device)
-
-         # Move nested submodules (sklearn wrappers)
-         for attr_name in ("ft", "resnet", "gnn"):
-             submodule = getattr(model_obj, attr_name, None)
-             if submodule is not None and hasattr(submodule, "to"):
-                 submodule.to(device)
-
-     @classmethod
-     def unwrap_module(cls, module: Any) -> Any:
-         """Unwrap DDP or DataParallel wrapper to get the base module.
-
-         Args:
-             module: Potentially wrapped PyTorch module
-
-         Returns:
-             Unwrapped base module
-         """
-         if not TORCH_AVAILABLE:
-             return module
-
-         if isinstance(module, (DDP, nn.DataParallel)):
-             return module.module
-         return module
-
-     @classmethod
-     def reset_cache(cls) -> None:
-         """Reset cached device selection."""
-         cls._cached_device = None
-
-     @classmethod
-     def is_cuda_available(cls) -> bool:
-         """Check if CUDA is available.
-
-         Returns:
-             True if CUDA is available
-         """
-         return TORCH_AVAILABLE and torch.cuda.is_available()
-
-     @classmethod
-     def is_mps_available(cls) -> bool:
-         """Check if MPS (Apple Silicon) is available.
-
-         Returns:
-             True if MPS is available
-         """
-         if not TORCH_AVAILABLE:
-             return False
-         return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+
+     @classmethod
+     @contextmanager
+     def cleanup_context(
+         cls,
+         verbose: bool = False,
+         *,
+         synchronize: bool = True,
+         empty_cache: bool = True,
+     ):
+         """Context manager that cleans GPU memory on exit.
+
+         Args:
+             verbose: If True, log cleanup details
+
+         Yields:
+             None
+         """
+         try:
+             yield
+         finally:
+             cls.clean(verbose=verbose, synchronize=synchronize, empty_cache=empty_cache)
+
+     @classmethod
+     def move_model_to_cpu(cls, model: Any) -> Any:
+         """Move a model to CPU and clean GPU memory.
+
+         Args:
+             model: PyTorch model to move
+
+         Returns:
+             Model on CPU
+         """
+         if model is not None and hasattr(model, "to"):
+             model.to("cpu")
+         cls.clean()
+         return model
+
+     @classmethod
+     def get_memory_info(cls) -> Dict[str, Any]:
+         """Get current GPU memory usage information.
+
+         Returns:
+             Dictionary with memory info (allocated, reserved, free)
+         """
+         if not TORCH_AVAILABLE or not torch.cuda.is_available():
+             return {"available": False}
+
+         try:
+             allocated = torch.cuda.memory_allocated()
+             reserved = torch.cuda.memory_reserved()
+             free, total = torch.cuda.mem_get_info()
+             return {
+                 "available": True,
+                 "allocated_mb": allocated // (1024 * 1024),
+                 "reserved_mb": reserved // (1024 * 1024),
+                 "free_mb": free // (1024 * 1024),
+                 "total_mb": total // (1024 * 1024),
+             }
+         except Exception:
+             return {"available": False}
+
+
+ # =============================================================================
+ # Device Manager
+ # =============================================================================
+
+
+ class DeviceManager:
+     """Unified device management for model and tensor placement.
+
+     This class consolidates device detection and model movement logic
+     that was previously duplicated across trainer_base.py and predict.py.
+
+     Example:
+         >>> device = DeviceManager.get_best_device()
+         >>> model = DeviceManager.move_to_device(model)
+     """
+
+     _logger = get_logger("ins_pricing.device")
+     _cached_device: Optional[Any] = None  # torch.device when available
+
+     @classmethod
+     def get_best_device(cls, prefer_cuda: bool = True) -> Any:
+         """Get the best available device.
+
+         Args:
+             prefer_cuda: If True, prefer CUDA over MPS
+
+         Returns:
+             Best available torch.device
+         """
+         if not TORCH_AVAILABLE:
+             return None
+
+         if cls._cached_device is not None:
+             return cls._cached_device
+
+         if prefer_cuda and torch.cuda.is_available():
+             cls._cached_device = torch.device("cuda")
+         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+             cls._cached_device = torch.device("mps")
+         else:
+             cls._cached_device = torch.device("cpu")
+
+         cls._logger.debug(f"Selected device: {cls._cached_device}")
+         return cls._cached_device
+
+     @classmethod
+     def move_to_device(cls, model_obj: Any, device: Optional[Any] = None) -> None:
+         """Move a model object to the specified device.
+
+         Handles sklearn-style wrappers that have .ft, .resnet, or .gnn attributes.
+
+         Args:
+             model_obj: Model object to move (may be sklearn wrapper)
+             device: Target device (defaults to best available)
+         """
+         if model_obj is None:
+             return
+
+         device = device or cls.get_best_device()
+         if device is None:
+             return
+
+         # Update device attribute if present
+         if hasattr(model_obj, "device"):
+             model_obj.device = device
+
+         # Move the main model
+         if hasattr(model_obj, "to"):
+             model_obj.to(device)
+
+         # Move nested submodules (sklearn wrappers)
+         for attr_name in ("ft", "resnet", "gnn"):
+             submodule = getattr(model_obj, attr_name, None)
+             if submodule is not None and hasattr(submodule, "to"):
+                 submodule.to(device)
+
+     @classmethod
+     def unwrap_module(cls, module: Any) -> Any:
+         """Unwrap DDP or DataParallel wrapper to get the base module.
+
+         Args:
+             module: Potentially wrapped PyTorch module
+
+         Returns:
+             Unwrapped base module
+         """
+         if not TORCH_AVAILABLE:
+             return module
+
+         if isinstance(module, (DDP, nn.DataParallel)):
+             return module.module
+         return module
+
+     @classmethod
+     def reset_cache(cls) -> None:
+         """Reset cached device selection."""
+         cls._cached_device = None
+
+     @classmethod
+     def is_cuda_available(cls) -> bool:
+         """Check if CUDA is available.
+
+         Returns:
+             True if CUDA is available
+         """
+         return TORCH_AVAILABLE and torch.cuda.is_available()
+
+     @classmethod
+     def is_mps_available(cls) -> bool:
+         """Check if MPS (Apple Silicon) is available.
+
+         Returns:
+             True if MPS is available
+         """
+         if not TORCH_AVAILABLE:
+             return False
+         return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
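The reworked clean() stays backward compatible: the new synchronize and empty_cache switches are keyword-only and default to the 0.4.5 behaviour, and cleanup_context() now forwards both flags to clean() on exit. A minimal usage sketch, assuming only the API visible in the diff above:

    from ins_pricing.utils import DeviceManager, GPUMemoryManager

    # Release cached CUDA memory but skip the blocking torch.cuda.synchronize()
    # inside a hot loop; the whole call is a no-op when CUDA is unavailable.
    GPUMemoryManager.clean(verbose=True, synchronize=False)

    # On exit the context forwards its flags to clean(); the defaults
    # (empty_cache=True, synchronize=True) reproduce the old behaviour.
    with GPUMemoryManager.cleanup_context():
        device = DeviceManager.get_best_device()      # cached after the first call
        model = None                                  # hypothetical sklearn-style wrapper
        DeviceManager.move_to_device(model, device)   # returns early for None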
ins_pricing/utils/features.py
@@ -0,0 +1,53 @@
+ """Feature inference helpers shared across training and production."""
+
+ from __future__ import annotations
+
+ from typing import List, Optional, Tuple
+
+ import pandas as pd
+
+
+ def infer_factor_and_cate_list(
+     train_df: pd.DataFrame,
+     test_df: pd.DataFrame,
+     resp_nme: str,
+     weight_nme: str,
+     *,
+     binary_resp_nme: Optional[str] = None,
+     factor_nmes: Optional[List[str]] = None,
+     cate_list: Optional[List[str]] = None,
+     infer_categorical_max_unique: int = 50,
+     infer_categorical_max_ratio: float = 0.05,
+ ) -> Tuple[List[str], List[str]]:
+     """Infer factor_nmes/cate_list when feature names are not provided."""
+     excluded = {resp_nme, weight_nme}
+     if binary_resp_nme:
+         excluded.add(binary_resp_nme)
+
+     common_cols = [c for c in train_df.columns if c in test_df.columns]
+     if factor_nmes is None:
+         factors = [c for c in common_cols if c not in excluded]
+     else:
+         factors = [c for c in factor_nmes if c in common_cols and c not in excluded]
+
+     if cate_list is not None:
+         cats = [c for c in cate_list if c in factors]
+         return factors, cats
+
+     n_rows = max(1, len(train_df))
+     cats: List[str] = []
+     for col in factors:
+         s = train_df[col]
+         if (
+             pd.api.types.is_bool_dtype(s)
+             or pd.api.types.is_object_dtype(s)
+             or isinstance(s.dtype, pd.CategoricalDtype)
+         ):
+             cats.append(col)
+             continue
+         if pd.api.types.is_integer_dtype(s):
+             nunique = int(s.nunique(dropna=True))
+             if nunique <= infer_categorical_max_unique or (nunique / n_rows) <= infer_categorical_max_ratio:
+                 cats.append(col)
+
+     return factors, cats
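The new ins_pricing/utils/features.py centralises the feature-inference heuristic shared by training and production: bool, object, and categorical dtypes always count as categorical, while integer columns count only when their cardinality is small in absolute (at most infer_categorical_max_unique = 50 distinct values) or relative (at most infer_categorical_max_ratio = 5% of rows) terms; floats stay numeric. A small sketch with hypothetical column names:

    import pandas as pd

    from ins_pricing.utils.features import infer_factor_and_cate_list

    train = pd.DataFrame({
        "region": ["N", "S", "N", "E"],     # object dtype -> always categorical
        "veh_age": [1, 2, 3, 2],            # int, 3 uniques (<= 50) -> categorical
        "premium": [0.9, 1.1, 1.0, 1.3],    # float -> numeric factor
        "loss": [0.0, 120.0, 0.0, 40.0],    # response column, excluded
        "exposure": [1.0, 1.0, 0.5, 1.0],   # weight column, excluded
    })
    factors, cats = infer_factor_and_cate_list(
        train, train, resp_nme="loss", weight_nme="exposure"
    )
    print(factors)  # ['region', 'veh_age', 'premium']
    print(cats)     # ['region', 'veh_age']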
ins_pricing/utils/io.py
@@ -0,0 +1,72 @@
+ """File and path helpers shared across ins_pricing."""
+
+ from __future__ import annotations
+
+ import csv
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ import pandas as pd
+
+
+ def ensure_parent_dir(file_path: str) -> None:
+     """Create parent directories when missing."""
+     directory = Path(file_path).parent
+     if directory and not directory.exists():
+         directory.mkdir(parents=True, exist_ok=True)
+
+
+ class IOUtils:
+     """File and path utilities for model parameters and configs."""
+
+     @staticmethod
+     def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
+         """Load CSV file as list of dictionaries."""
+         with open(file_path, mode="r", encoding="utf-8") as file:
+             reader = csv.DictReader(file)
+             return [dict(filter(lambda item: item[0] != "", row.items())) for row in reader]
+
+     @staticmethod
+     def ensure_parent_dir(file_path: str) -> None:
+         """Create parent directories when missing."""
+         ensure_parent_dir(file_path)
+
+     @staticmethod
+     def _sanitize_params_dict(params: Dict[str, Any]) -> Dict[str, Any]:
+         """Filter index-like columns such as "Unnamed: 0" from pandas I/O."""
+         return {
+             k: v
+             for k, v in (params or {}).items()
+             if k and not str(k).startswith("Unnamed")
+         }
+
+     @staticmethod
+     def load_params_file(path: str) -> Dict[str, Any]:
+         """Load parameter dict from JSON/CSV/TSV files."""
+         file_path = Path(path).expanduser().resolve()
+         if not file_path.exists():
+             raise FileNotFoundError(f"params file not found: {file_path}")
+
+         suffix = file_path.suffix.lower()
+         if suffix == ".json":
+             payload = json.loads(file_path.read_text(encoding="utf-8", errors="replace"))
+             if isinstance(payload, dict) and "best_params" in payload:
+                 payload = payload.get("best_params") or {}
+             if not isinstance(payload, dict):
+                 raise ValueError(f"Invalid JSON params file (expect dict): {file_path}")
+             return IOUtils._sanitize_params_dict(dict(payload))
+
+         if suffix in (".csv", ".tsv"):
+             df = pd.read_csv(file_path, sep="\t" if suffix == ".tsv" else ",")
+             if df.empty:
+                 raise ValueError(f"Empty params file: {file_path}")
+             params = df.iloc[0].to_dict()
+             return IOUtils._sanitize_params_dict(params)
+
+         raise ValueError(f"Unsupported params file type '{suffix}': {file_path}")
+
+
+ def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
+     """Load CSV file as list of dictionaries (legacy function)."""
+     return IOUtils.csv_to_dict(file_path)
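load_params_file() gives BayesOpt outputs and hand-edited configs a single loader: a JSON dict is used as-is (unwrapping a top-level "best_params" key when present), a CSV/TSV contributes its first row, and index-like "Unnamed: ..." keys left behind by pandas round-trips are dropped in every case. A small sketch, assuming the module path introduced in this diff:

    import json
    import tempfile
    from pathlib import Path

    from ins_pricing.utils.io import IOUtils

    params_path = Path(tempfile.mkdtemp()) / "best.json"
    params_path.write_text(json.dumps(
        {"best_params": {"max_depth": 6, "eta": 0.1, "Unnamed: 0": 0}}
    ))

    params = IOUtils.load_params_file(str(params_path))
    print(params)  # {'max_depth': 6, 'eta': 0.1} -- "Unnamed: 0" dropped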