ins-pricing 0.5.0-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. ins_pricing/cli/BayesOpt_entry.py +15 -5
  2. ins_pricing/cli/BayesOpt_incremental.py +43 -10
  3. ins_pricing/cli/Explain_Run.py +16 -5
  4. ins_pricing/cli/Explain_entry.py +29 -8
  5. ins_pricing/cli/Pricing_Run.py +16 -5
  6. ins_pricing/cli/bayesopt_entry_runner.py +45 -12
  7. ins_pricing/cli/utils/bootstrap.py +23 -0
  8. ins_pricing/cli/utils/cli_config.py +34 -15
  9. ins_pricing/cli/utils/import_resolver.py +14 -14
  10. ins_pricing/cli/utils/notebook_utils.py +120 -106
  11. ins_pricing/cli/watchdog_run.py +15 -5
  12. ins_pricing/frontend/app.py +132 -61
  13. ins_pricing/frontend/config_builder.py +33 -0
  14. ins_pricing/frontend/example_config.json +11 -0
  15. ins_pricing/frontend/runner.py +340 -388
  16. ins_pricing/modelling/README.md +1 -1
  17. ins_pricing/modelling/bayesopt/README.md +29 -11
  18. ins_pricing/modelling/bayesopt/config_components.py +12 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +50 -13
  20. ins_pricing/modelling/bayesopt/core.py +47 -19
  21. ins_pricing/modelling/bayesopt/model_plotting_mixin.py +20 -14
  22. ins_pricing/modelling/bayesopt/models/model_ft_components.py +349 -342
  23. ins_pricing/modelling/bayesopt/models/model_ft_trainer.py +11 -5
  24. ins_pricing/modelling/bayesopt/models/model_gnn.py +20 -14
  25. ins_pricing/modelling/bayesopt/models/model_resn.py +9 -3
  26. ins_pricing/modelling/bayesopt/trainers/trainer_base.py +62 -50
  27. ins_pricing/modelling/bayesopt/trainers/trainer_ft.py +61 -53
  28. ins_pricing/modelling/bayesopt/trainers/trainer_glm.py +9 -3
  29. ins_pricing/modelling/bayesopt/trainers/trainer_gnn.py +40 -32
  30. ins_pricing/modelling/bayesopt/trainers/trainer_resn.py +36 -24
  31. ins_pricing/modelling/bayesopt/trainers/trainer_xgb.py +240 -37
  32. ins_pricing/modelling/bayesopt/utils/distributed_utils.py +193 -186
  33. ins_pricing/modelling/bayesopt/utils/torch_trainer_mixin.py +23 -10
  34. ins_pricing/pricing/factors.py +67 -56
  35. ins_pricing/setup.py +1 -1
  36. ins_pricing/utils/__init__.py +7 -6
  37. ins_pricing/utils/device.py +45 -24
  38. ins_pricing/utils/logging.py +34 -1
  39. ins_pricing/utils/profiling.py +8 -4
  40. {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +182 -182
  41. {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.1.dist-info}/RECORD +43 -42
  42. {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  43. {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
ins_pricing/modelling/bayesopt/utils/distributed_utils.py CHANGED
@@ -1,186 +1,193 @@
- """Distributed training utilities for PyTorch DDP.
-
- This module contains:
- - DistributedUtils for DDP setup and process coordination
- - TrainingUtils for CUDA memory management
- - free_cuda() for legacy compatibility
- """
-
- from __future__ import annotations
-
- import gc
- import os
- from datetime import timedelta
- from typing import Optional
-
- import torch
- import torch.distributed as dist
-
-
- def _select_ddp_backend() -> str:
-     """Select the appropriate DDP backend based on system capabilities.
-
-     Returns:
-         "nccl" if CUDA is available and NCCL is supported (non-Windows),
-         otherwise "gloo"
-     """
-     if not torch.cuda.is_available():
-         return "gloo"
-
-     if os.name == "nt":  # Windows doesn't support NCCL
-         return "gloo"
-
-     try:
-         nccl_available = getattr(dist, "is_nccl_available", lambda: False)()
-         return "nccl" if nccl_available else "gloo"
-     except Exception:
-         return "gloo"
-
-
- def _get_ddp_timeout() -> timedelta:
-     """Get the DDP timeout from environment variable.
-
-     Returns:
-         timedelta for DDP timeout (default: 1800 seconds)
-     """
-     timeout_seconds = int(os.environ.get("BAYESOPT_DDP_TIMEOUT_SECONDS", "1800"))
-     return timedelta(seconds=max(1, timeout_seconds))
-
-
- def _cache_ddp_state(local_rank: int, rank: int, world_size: int) -> tuple:
-     """Cache and return DDP state tuple."""
-     state = (True, local_rank, rank, world_size)
-     DistributedUtils._cached_state = state
-     return state
-
-
- class DistributedUtils:
-     """Utilities for distributed data parallel training.
-
-     This class provides methods for:
-     - Initializing DDP process groups
-     - Checking process rank and world size
-     - Cleanup after distributed training
-     """
-
-     _cached_state: Optional[tuple] = None
-
-     @staticmethod
-     def setup_ddp():
-         """Initialize the DDP process group for distributed training.
-
-         Returns:
-             Tuple of (success, local_rank, rank, world_size)
-         """
-         # Return cached state if already initialized
-         if dist.is_initialized():
-             if DistributedUtils._cached_state is None:
-                 DistributedUtils._cached_state = _cache_ddp_state(
-                     int(os.environ.get("LOCAL_RANK", 0)),
-                     dist.get_rank(),
-                     dist.get_world_size(),
-                 )
-             return DistributedUtils._cached_state
-
-         # Check for required environment variables
-         if 'RANK' not in os.environ or 'WORLD_SIZE' not in os.environ:
-             print(
-                 f">>> DDP Setup Failed: RANK or WORLD_SIZE not found in env. "
-                 f"Keys found: {list(os.environ.keys())}"
-             )
-             print(">>> Hint: launch with torchrun --nproc_per_node=<N> <script.py>")
-             return False, 0, 0, 1
-
-         rank = int(os.environ["RANK"])
-         world_size = int(os.environ["WORLD_SIZE"])
-         local_rank = int(os.environ.get("LOCAL_RANK", 0))
-
-         # Windows CUDA DDP is not supported
-         if os.name == "nt" and torch.cuda.is_available() and world_size > 1:
-             print(
-                 ">>> DDP Setup Disabled: Windows CUDA DDP is not supported. "
-                 "Falling back to single process."
-             )
-             return False, 0, 0, 1
-
-         # Set CUDA device for this process
-         if torch.cuda.is_available():
-             torch.cuda.set_device(local_rank)
-
-         # Initialize process group
-         backend = _select_ddp_backend()
-         timeout = _get_ddp_timeout()
-
-         dist.init_process_group(backend=backend, init_method="env://", timeout=timeout)
-         print(
-             f">>> DDP Initialized ({backend}, timeout={timeout.total_seconds():.0f}s): "
-             f"Rank {rank}/{world_size}, Local Rank {local_rank}"
-         )
-
-         return _cache_ddp_state(local_rank, rank, world_size)
-
-     @staticmethod
-     def cleanup_ddp():
-         """Destroy the DDP process group and clear cached state."""
-         if dist.is_initialized():
-             dist.destroy_process_group()
-         DistributedUtils._cached_state = None
-
-     @staticmethod
-     def is_main_process():
-         """Check if current process is rank 0 (main process).
-
-         Returns:
-             True if main process or DDP not initialized
-         """
-         return not dist.is_initialized() or dist.get_rank() == 0
-
-     @staticmethod
-     def world_size() -> int:
-         """Get the total number of processes in the distributed group.
-
-         Returns:
-             World size (1 if DDP not initialized)
-         """
-         return dist.get_world_size() if dist.is_initialized() else 1
-
-
- class TrainingUtils:
-     """General training utilities including CUDA management."""
-
-     @staticmethod
-     def free_cuda() -> None:
-         """Release CUDA memory and clear cache.
-
-         This performs aggressive cleanup:
-         1. Move all PyTorch models to CPU
-         2. Run garbage collection
-         3. Clear CUDA cache
-         """
-         print(">>> Moving all models to CPU...")
-         for obj in gc.get_objects():
-             try:
-                 if hasattr(obj, "to") and callable(obj.to):
-                     obj.to("cpu")
-             except Exception:
-                 pass
-
-         print(">>> Releasing tensor/optimizer/DataLoader references...")
-         gc.collect()
-
-         print(">>> Clearing CUDA cache...")
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-             torch.cuda.synchronize()
-             print(">>> CUDA memory released.")
-         else:
-             print(">>> CUDA not available; cleanup skipped.")
-
-
- # Backward compatibility function wrapper
- def free_cuda():
-     """Legacy function wrapper for CUDA memory cleanup.
-
-     This function calls TrainingUtils.free_cuda() for backward compatibility.
-     """
-     TrainingUtils.free_cuda()
+ """Distributed training utilities for PyTorch DDP.
+
+ This module contains:
+ - DistributedUtils for DDP setup and process coordination
+ - TrainingUtils for CUDA memory management
+ - free_cuda() for legacy compatibility
+ """
+
+ from __future__ import annotations
+
+ import gc
+ import os
+ from datetime import timedelta
+ from typing import Optional
+
+ import torch
+ import torch.distributed as dist
+ from ins_pricing.utils import get_logger, log_print
+
+ _logger = get_logger("ins_pricing.modelling.bayesopt.utils.distributed_utils")
+
+
+ def _log(*args, **kwargs) -> None:
+     log_print(_logger, *args, **kwargs)
+
+
+ def _select_ddp_backend() -> str:
+     """Select the appropriate DDP backend based on system capabilities.
+
+     Returns:
+         "nccl" if CUDA is available and NCCL is supported (non-Windows),
+         otherwise "gloo"
+     """
+     if not torch.cuda.is_available():
+         return "gloo"
+
+     if os.name == "nt":  # Windows doesn't support NCCL
+         return "gloo"
+
+     try:
+         nccl_available = getattr(dist, "is_nccl_available", lambda: False)()
+         return "nccl" if nccl_available else "gloo"
+     except Exception:
+         return "gloo"
+
+
+ def _get_ddp_timeout() -> timedelta:
+     """Get the DDP timeout from environment variable.
+
+     Returns:
+         timedelta for DDP timeout (default: 1800 seconds)
+     """
+     timeout_seconds = int(os.environ.get("BAYESOPT_DDP_TIMEOUT_SECONDS", "1800"))
+     return timedelta(seconds=max(1, timeout_seconds))
+
+
+ def _cache_ddp_state(local_rank: int, rank: int, world_size: int) -> tuple:
+     """Cache and return DDP state tuple."""
+     state = (True, local_rank, rank, world_size)
+     DistributedUtils._cached_state = state
+     return state
+
+
+ class DistributedUtils:
+     """Utilities for distributed data parallel training.
+
+     This class provides methods for:
+     - Initializing DDP process groups
+     - Checking process rank and world size
+     - Cleanup after distributed training
+     """
+
+     _cached_state: Optional[tuple] = None
+
+     @staticmethod
+     def setup_ddp():
+         """Initialize the DDP process group for distributed training.
+
+         Returns:
+             Tuple of (success, local_rank, rank, world_size)
+         """
+         # Return cached state if already initialized
+         if dist.is_initialized():
+             if DistributedUtils._cached_state is None:
+                 DistributedUtils._cached_state = _cache_ddp_state(
+                     int(os.environ.get("LOCAL_RANK", 0)),
+                     dist.get_rank(),
+                     dist.get_world_size(),
+                 )
+             return DistributedUtils._cached_state
+
+         # Check for required environment variables
+         if 'RANK' not in os.environ or 'WORLD_SIZE' not in os.environ:
+             _log(
+                 f">>> DDP Setup Failed: RANK or WORLD_SIZE not found in env. "
+                 f"Keys found: {list(os.environ.keys())}"
+             )
+             _log(">>> Hint: launch with torchrun --nproc_per_node=<N> <script.py>")
+             return False, 0, 0, 1
+
+         rank = int(os.environ["RANK"])
+         world_size = int(os.environ["WORLD_SIZE"])
+         local_rank = int(os.environ.get("LOCAL_RANK", 0))
+
+         # Windows CUDA DDP is not supported
+         if os.name == "nt" and torch.cuda.is_available() and world_size > 1:
+             _log(
+                 ">>> DDP Setup Disabled: Windows CUDA DDP is not supported. "
+                 "Falling back to single process."
+             )
+             return False, 0, 0, 1
+
+         # Set CUDA device for this process
+         if torch.cuda.is_available():
+             torch.cuda.set_device(local_rank)
+
+         # Initialize process group
+         backend = _select_ddp_backend()
+         timeout = _get_ddp_timeout()
+
+         dist.init_process_group(backend=backend, init_method="env://", timeout=timeout)
+         _log(
+             f">>> DDP Initialized ({backend}, timeout={timeout.total_seconds():.0f}s): "
+             f"Rank {rank}/{world_size}, Local Rank {local_rank}"
+         )
+
+         return _cache_ddp_state(local_rank, rank, world_size)
+
+     @staticmethod
+     def cleanup_ddp():
+         """Destroy the DDP process group and clear cached state."""
+         if dist.is_initialized():
+             dist.destroy_process_group()
+         DistributedUtils._cached_state = None
+
+     @staticmethod
+     def is_main_process():
+         """Check if current process is rank 0 (main process).
+
+         Returns:
+             True if main process or DDP not initialized
+         """
+         return not dist.is_initialized() or dist.get_rank() == 0
+
+     @staticmethod
+     def world_size() -> int:
+         """Get the total number of processes in the distributed group.
+
+         Returns:
+             World size (1 if DDP not initialized)
+         """
+         return dist.get_world_size() if dist.is_initialized() else 1
+
+
+ class TrainingUtils:
+     """General training utilities including CUDA management."""
+
+     @staticmethod
+     def free_cuda() -> None:
+         """Release CUDA memory and clear cache.
+
+         This performs aggressive cleanup:
+         1. Move all PyTorch models to CPU
+         2. Run garbage collection
+         3. Clear CUDA cache
+         """
+         _log(">>> Moving all models to CPU...")
+         for obj in gc.get_objects():
+             try:
+                 if hasattr(obj, "to") and callable(obj.to):
+                     obj.to("cpu")
+             except Exception:
+                 pass
+
+         _log(">>> Releasing tensor/optimizer/DataLoader references...")
+         gc.collect()
+
+         _log(">>> Clearing CUDA cache...")
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.synchronize()
+             _log(">>> CUDA memory released.")
+         else:
+             _log(">>> CUDA not available; cleanup skipped.")
+
+
+ # Backward compatibility function wrapper
+ def free_cuda():
+     """Legacy function wrapper for CUDA memory cleanup.
+
+     This function calls TrainingUtils.free_cuda() for backward compatibility.
+     """
+     TrainingUtils.free_cuda()
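The recurring change in this release, seen in this file and again in the torch_trainer_mixin.py diff below, is that console output now goes through a shared logger: each module creates a named logger with get_logger and a small _log() wrapper that forwards print-style arguments to log_print. The implementations of get_logger and log_print in ins_pricing.utils are not shown in this diff, so the following is only a minimal sketch, under assumptions, of what such a pair of helpers could look like:

# Hypothetical sketch only -- the real get_logger/log_print in ins_pricing.utils
# are not part of this diff. This illustrates one plausible shape of the API
# used above: print()-style arguments forwarded to a logging.Logger.
import logging
import sys

def get_logger(name: str) -> logging.Logger:
    # Reuse a named logger; attach a stdout handler only once.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

def log_print(logger: logging.Logger, *args, **kwargs) -> None:
    # Accept print()-style arguments (sep, flush, ...) and emit one INFO record.
    sep = kwargs.pop("sep", " ")
    kwargs.pop("flush", None)  # flush matters for print(), not for logging handlers
    logger.info(sep.join(str(arg) for arg in args))

With a wrapper of this shape, existing call sites such as _log(">>> DDP Initialized ...", flush=True) keep their print-like signature while the output becomes filterable by logger name and level.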
ins_pricing/modelling/bayesopt/utils/torch_trainer_mixin.py CHANGED
@@ -51,7 +51,14 @@ except Exception:
      plot_loss_curve_common = None
 
  # Import from other utils modules
- from ins_pricing.utils import EPS, compute_batch_size, tweedie_loss, ensure_parent_dir
+ from ins_pricing.utils import (
+     EPS,
+     compute_batch_size,
+     tweedie_loss,
+     ensure_parent_dir,
+     get_logger,
+     log_print,
+ )
  from ins_pricing.utils.losses import (
      infer_loss_name_from_model_name,
      loss_requires_positive,
@@ -60,13 +67,19 @@ from ins_pricing.utils.losses import (
  )
  from ins_pricing.modelling.bayesopt.utils.distributed_utils import DistributedUtils
 
+ _logger = get_logger("ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin")
+
+
+ def _log(*args, **kwargs) -> None:
+     log_print(_logger, *args, **kwargs)
+
 
  def _plot_skip(label: str) -> None:
      """Print message when plot is skipped due to missing matplotlib."""
      if _MPL_IMPORT_ERROR is not None:
-         print(f"[Plot] Skip {label}: matplotlib unavailable ({_MPL_IMPORT_ERROR}).", flush=True)
+         _log(f"[Plot] Skip {label}: matplotlib unavailable ({_MPL_IMPORT_ERROR}).", flush=True)
      else:
-         print(f"[Plot] Skip {label}: matplotlib unavailable.", flush=True)
+         _log(f"[Plot] Skip {label}: matplotlib unavailable.", flush=True)
 
 
  class TorchTrainerMixin:
@@ -121,7 +134,7 @@ class TorchTrainerMixin:
          mps_available = bool(getattr(torch.backends, "mps", None) and torch.backends.mps.is_available())
          ddp_enabled = bool(getattr(self, "is_ddp_enabled", False))
          data_parallel = bool(getattr(self, "use_data_parallel", False))
-         print(
+         _log(
              f">>> Resource summary: device={device}, device_type={device_type}, "
              f"cpu_count={cpu_count}, cuda_count={cuda_count}, mps={mps_available}, "
              f"ddp={ddp_enabled}, data_parallel={data_parallel}, profile={profile}"
@@ -229,7 +242,7 @@ class TorchTrainerMixin:
              return batch_size
          max_batch = max(1, int(budget // per_sample))
          if max_batch < batch_size:
-             print(
+             _log(
                  f">>> Memory cap: batch_size {batch_size} -> {max_batch} "
                  f"(per_sample~{sample_bytes}B, budget~{budget // (1024**2)}MB)"
              )
@@ -289,7 +302,7 @@ class TorchTrainerMixin:
          device_count = torch.cuda.device_count()
          if device_count > 1:
              min_bs = min_bs * device_count
-             print(
+             _log(
                  f">>> Multi-GPU detected: {device_count} devices. Adjusted min_bs to {min_bs}.")
 
          if data_size > large_threshold:
@@ -329,7 +342,7 @@ class TorchTrainerMixin:
          if workers > 0:
              prefetch_factor = 4 if profile == "throughput" else 2
          persistent = workers > 0 and profile != "memory_saving"
-         print(
+         _log(
              f">>> DataLoader config: Batch Size={batch_size}, Accum Steps={accum_steps}, "
              f"Workers={workers}, Prefetch={prefetch_factor or 'off'}, Profile={profile}")
          sampler = None
@@ -559,13 +572,13 @@ class TorchTrainerMixin:
          if should_log_epoch:
              elapsed = int(time.time() - epoch_start_ts)
              if val_weighted_loss is None:
-                 print(
+                 _log(
                      f"[Training] Epoch {epoch}/{getattr(self, 'epochs', 1)} "
                      f"train_loss={float(train_epoch_loss):.6f} elapsed={elapsed}s",
                      flush=True,
                  )
              else:
-                 print(
+                 _log(
                      f"[Training] Epoch {epoch}/{getattr(self, 'epochs', 1)} "
                      f"train_loss={float(train_epoch_loss):.6f} "
                      f"val_loss={float(val_weighted_loss):.6f} elapsed={elapsed}s",
@@ -620,4 +633,4 @@ class TorchTrainerMixin:
          plt.tight_layout()
          plt.savefig(save_path, dpi=300)
          plt.close(fig)
-         print(f"[Training] Loss curve saved to {save_path}")
+         _log(f"[Training] Loss curve saved to {save_path}")
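One of the hunks above caps the batch size by an estimated memory budget: max_batch = max(1, int(budget // per_sample)). As a quick worked example of that arithmetic (the numbers below are made up for illustration; the real budget and per-sample estimates come from the trainer itself):

# Illustrative numbers only -- not taken from the package.
budget_bytes = 2 * 1024**3        # assume a ~2 GiB memory budget
per_sample_bytes = 4 * 1024       # assume ~4 KiB of device memory per sample
requested_batch_size = 1_000_000

max_batch = max(1, int(budget_bytes // per_sample_bytes))   # 524288
batch_size = min(requested_batch_size, max_batch)
print(f">>> Memory cap: batch_size {requested_batch_size} -> {batch_size} "
      f"(per_sample~{per_sample_bytes}B, budget~{budget_bytes // (1024**2)}MB)")
# >>> Memory cap: batch_size 1000000 -> 524288 (per_sample~4096B, budget~2048MB)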
ins_pricing/pricing/factors.py CHANGED
@@ -1,43 +1,46 @@
  from __future__ import annotations
 
- from functools import lru_cache
+ from collections import OrderedDict
+ import hashlib
  from typing import Optional, Tuple
 
  import numpy as np
  import pandas as pd
 
 
- @lru_cache(maxsize=128)
- def _compute_bins_cached(
-     data_hash: int,
-     n_bins: int,
-     method: str,
-     min_val: float,
-     max_val: float,
-     n_unique: int
- ) -> Tuple[tuple, int]:
-     """Cache bin edge computation based on data characteristics.
+ _BIN_CACHE_MAXSIZE = 128
+ _BIN_CACHE: "OrderedDict[tuple, np.ndarray]" = OrderedDict()
+ _BIN_CACHE_HITS = 0
+ _BIN_CACHE_MISSES = 0
 
-     Args:
-         data_hash: Hash of sorted unique values for cache key
-         n_bins: Number of bins to create
-         method: Binning method ('quantile' or 'uniform')
-         min_val: Minimum value in data
-         max_val: Maximum value in data
-         n_unique: Number of unique values
 
-     Returns:
-         Tuple of (bin_edges_tuple, actual_bins)
+ def _cache_key(series: pd.Series, n_bins: int, method: str) -> Optional[tuple]:
+     try:
+         values = series.dropna().to_numpy(dtype=float, copy=False)
+         if values.size == 0:
+             return None
+         values = np.sort(values)
+         digest = hashlib.blake2b(values.tobytes(), digest_size=16).hexdigest()
+         return (digest, int(values.size), int(n_bins), str(method))
+     except Exception:
+         return None
+
+
+ def _cache_get(key: tuple) -> Optional[np.ndarray]:
+     global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
+     if key in _BIN_CACHE:
+         _BIN_CACHE_HITS += 1
+         _BIN_CACHE.move_to_end(key)
+         return _BIN_CACHE[key].copy()
+     _BIN_CACHE_MISSES += 1
+     return None
 
-     Note:
-         This function caches bin computation for identical data distributions.
-         The cache key includes data_hash to ensure correctness while enabling
-         reuse when the same column is binned multiple times.
-     """
-     # This function is called after validation, so we can safely compute
-     # The actual binning is done in the calling function
-     # This just provides a cache key mechanism
-     return (data_hash, n_bins, method, min_val, max_val, n_unique), n_bins
+
+ def _cache_set(key: tuple, edges: np.ndarray) -> None:
+     _BIN_CACHE[key] = np.asarray(edges, dtype=float)
+     _BIN_CACHE.move_to_end(key)
+     if len(_BIN_CACHE) > _BIN_CACHE_MAXSIZE:
+         _BIN_CACHE.popitem(last=False)
 
 
  def bin_numeric(
@@ -66,34 +69,40 @@ def bin_numeric(
      When use_cache=True, identical distributions will reuse cached bin edges,
      improving performance when the same column is binned multiple times.
      """
-     # Create cache key from data characteristics if caching enabled
-     if use_cache:
-         # Compute data characteristics for cache key
-         unique_vals = series.dropna().unique()
-         unique_sorted = np.sort(unique_vals)
-         data_hash = hash(unique_sorted.tobytes())
-         min_val = float(series.min())
-         max_val = float(series.max())
-         n_unique = len(unique_vals)
-
-         # Check cache (the function call acts as cache lookup)
-         try:
-             _compute_bins_cached(data_hash, bins, method, min_val, max_val, n_unique)
-         except Exception:
-             # If hashing fails, proceed without cache
-             pass
+     cache_key = _cache_key(series, bins, method) if use_cache else None
+     bin_edges_full: Optional[np.ndarray] = None
+
+     if cache_key is not None:
+         bin_edges_full = _cache_get(cache_key)
+
+     if bin_edges_full is not None:
+         binned = pd.cut(series, bins=bin_edges_full, include_lowest=True, labels=labels)
+         return binned, np.asarray(bin_edges_full[:-1], dtype=float)
 
      # Perform actual binning
      if method == "quantile":
-         binned = pd.qcut(series, q=bins, duplicates="drop", labels=labels)
-         bin_edges = binned.cat.categories.left.to_numpy()
+         binned, bin_edges_full = pd.qcut(
+             series,
+             q=bins,
+             duplicates="drop",
+             labels=labels,
+             retbins=True,
+         )
      elif method == "uniform":
-         binned = pd.cut(series, bins=bins, include_lowest=include_lowest, labels=labels)
-         bin_edges = binned.cat.categories.left.to_numpy()
+         binned, bin_edges_full = pd.cut(
+             series,
+             bins=bins,
+             include_lowest=include_lowest,
+             labels=labels,
+             retbins=True,
+         )
      else:
          raise ValueError("method must be one of: quantile, uniform.")
 
-     return binned, bin_edges
+     if cache_key is not None and bin_edges_full is not None:
+         _cache_set(cache_key, np.asarray(bin_edges_full, dtype=float))
+
+     return binned, np.asarray(bin_edges_full[:-1], dtype=float)
 
 
  def clear_binning_cache() -> None:
@@ -108,7 +117,10 @@ def clear_binning_cache() -> None:
      >>> # After processing many different columns
      >>> clear_binning_cache()
      """
-     _compute_bins_cached.cache_clear()
+     global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
+     _BIN_CACHE.clear()
+     _BIN_CACHE_HITS = 0
+     _BIN_CACHE_MISSES = 0
 
 
  def get_cache_info() -> dict:
@@ -126,12 +138,11 @@
      >>> info = get_cache_info()
      >>> print(f"Cache hit rate: {info['hits'] / (info['hits'] + info['misses']):.2%}")
      """
-     cache_info = _compute_bins_cached.cache_info()
      return {
-         'hits': cache_info.hits,
-         'misses': cache_info.misses,
-         'maxsize': cache_info.maxsize,
-         'currsize': cache_info.currsize
+         "hits": _BIN_CACHE_HITS,
+         "misses": _BIN_CACHE_MISSES,
+         "maxsize": _BIN_CACHE_MAXSIZE,
+         "currsize": len(_BIN_CACHE),
      }
 
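The binning change replaces the lru_cache placeholder with an explicit OrderedDict cache keyed by a BLAKE2b digest of the sorted values, so bin edges are actually reused and evicted LRU-style beyond 128 entries. A usage sketch under assumptions: the module path is inferred from the file list above (ins_pricing/pricing/factors.py), and bin_numeric's full signature is not shown in the diff, so the keyword arguments below only mirror the names visible in the hunks.

# Sketch under assumptions -- argument names mirror only what the diff shows.
import numpy as np
import pandas as pd
from ins_pricing.pricing.factors import bin_numeric, clear_binning_cache, get_cache_info

s = pd.Series(np.random.default_rng(0).gamma(2.0, 500.0, size=10_000))

binned_a, edges_a = bin_numeric(s, bins=10, method="quantile", labels=False, use_cache=True)
binned_b, edges_b = bin_numeric(s, bins=10, method="quantile", labels=False, use_cache=True)

info = get_cache_info()
# Expect one miss (first call computes edges) and one hit (second call reuses them).
print(info["hits"], info["misses"], info["currsize"])
clear_binning_cache()   # drop cached edges and reset the hit/miss counters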
ins_pricing/setup.py CHANGED
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
  setup(
      name="ins_pricing",
-     version="0.5.0",
+     version="0.5.1",
      description="Reusable modelling, pricing, governance, and reporting utilities.",
      author="meishi125478",
      license="Proprietary",