ins-pricing 0.5.0__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/cli/BayesOpt_entry.py +15 -5
- ins_pricing/cli/BayesOpt_incremental.py +43 -10
- ins_pricing/cli/Explain_Run.py +16 -5
- ins_pricing/cli/Explain_entry.py +29 -8
- ins_pricing/cli/Pricing_Run.py +16 -5
- ins_pricing/cli/bayesopt_entry_runner.py +45 -12
- ins_pricing/cli/utils/bootstrap.py +23 -0
- ins_pricing/cli/utils/cli_config.py +34 -15
- ins_pricing/cli/utils/import_resolver.py +14 -14
- ins_pricing/cli/utils/notebook_utils.py +120 -106
- ins_pricing/cli/watchdog_run.py +15 -5
- ins_pricing/frontend/app.py +132 -61
- ins_pricing/frontend/config_builder.py +33 -0
- ins_pricing/frontend/example_config.json +11 -0
- ins_pricing/frontend/runner.py +340 -388
- ins_pricing/modelling/README.md +1 -1
- ins_pricing/modelling/__init__.py +10 -10
- ins_pricing/modelling/bayesopt/README.md +29 -11
- ins_pricing/modelling/bayesopt/config_components.py +12 -0
- ins_pricing/modelling/bayesopt/config_preprocess.py +50 -13
- ins_pricing/modelling/bayesopt/core.py +47 -19
- ins_pricing/modelling/bayesopt/model_plotting_mixin.py +20 -14
- ins_pricing/modelling/bayesopt/models/model_ft_components.py +349 -342
- ins_pricing/modelling/bayesopt/models/model_ft_trainer.py +11 -5
- ins_pricing/modelling/bayesopt/models/model_gnn.py +20 -14
- ins_pricing/modelling/bayesopt/models/model_resn.py +9 -3
- ins_pricing/modelling/bayesopt/trainers/trainer_base.py +62 -50
- ins_pricing/modelling/bayesopt/trainers/trainer_ft.py +61 -53
- ins_pricing/modelling/bayesopt/trainers/trainer_glm.py +9 -3
- ins_pricing/modelling/bayesopt/trainers/trainer_gnn.py +40 -32
- ins_pricing/modelling/bayesopt/trainers/trainer_resn.py +36 -24
- ins_pricing/modelling/bayesopt/trainers/trainer_xgb.py +240 -37
- ins_pricing/modelling/bayesopt/utils/distributed_utils.py +193 -186
- ins_pricing/modelling/bayesopt/utils/torch_trainer_mixin.py +23 -10
- ins_pricing/pricing/factors.py +67 -56
- ins_pricing/setup.py +1 -1
- ins_pricing/utils/__init__.py +7 -6
- ins_pricing/utils/device.py +45 -24
- ins_pricing/utils/logging.py +34 -1
- ins_pricing/utils/profiling.py +8 -4
- {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.3.dist-info}/METADATA +182 -182
- {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.3.dist-info}/RECORD +44 -43
- {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.3.dist-info}/WHEEL +0 -0
- {ins_pricing-0.5.0.dist-info → ins_pricing-0.5.3.dist-info}/top_level.txt +0 -0

ins_pricing/modelling/bayesopt/utils/distributed_utils.py
CHANGED

@@ -1,186 +1,193 @@
- [186 lines of the previous implementation removed; the old side is not fully rendered in this view]
+"""Distributed training utilities for PyTorch DDP.
+
+This module contains:
+- DistributedUtils for DDP setup and process coordination
+- TrainingUtils for CUDA memory management
+- free_cuda() for legacy compatibility
+"""
+
+from __future__ import annotations
+
+import gc
+import os
+from datetime import timedelta
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from ins_pricing.utils import get_logger, log_print
+
+_logger = get_logger("ins_pricing.modelling.bayesopt.utils.distributed_utils")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+
+def _select_ddp_backend() -> str:
+    """Select the appropriate DDP backend based on system capabilities.
+
+    Returns:
+        "nccl" if CUDA is available and NCCL is supported (non-Windows),
+        otherwise "gloo"
+    """
+    if not torch.cuda.is_available():
+        return "gloo"
+
+    if os.name == "nt":  # Windows doesn't support NCCL
+        return "gloo"
+
+    try:
+        nccl_available = getattr(dist, "is_nccl_available", lambda: False)()
+        return "nccl" if nccl_available else "gloo"
+    except Exception:
+        return "gloo"
+
+
+def _get_ddp_timeout() -> timedelta:
+    """Get the DDP timeout from environment variable.
+
+    Returns:
+        timedelta for DDP timeout (default: 1800 seconds)
+    """
+    timeout_seconds = int(os.environ.get("BAYESOPT_DDP_TIMEOUT_SECONDS", "1800"))
+    return timedelta(seconds=max(1, timeout_seconds))
+
+
+def _cache_ddp_state(local_rank: int, rank: int, world_size: int) -> tuple:
+    """Cache and return DDP state tuple."""
+    state = (True, local_rank, rank, world_size)
+    DistributedUtils._cached_state = state
+    return state
+
+
+class DistributedUtils:
+    """Utilities for distributed data parallel training.
+
+    This class provides methods for:
+    - Initializing DDP process groups
+    - Checking process rank and world size
+    - Cleanup after distributed training
+    """
+
+    _cached_state: Optional[tuple] = None
+
+    @staticmethod
+    def setup_ddp():
+        """Initialize the DDP process group for distributed training.
+
+        Returns:
+            Tuple of (success, local_rank, rank, world_size)
+        """
+        # Return cached state if already initialized
+        if dist.is_initialized():
+            if DistributedUtils._cached_state is None:
+                DistributedUtils._cached_state = _cache_ddp_state(
+                    int(os.environ.get("LOCAL_RANK", 0)),
+                    dist.get_rank(),
+                    dist.get_world_size(),
+                )
+            return DistributedUtils._cached_state
+
+        # Check for required environment variables
+        if 'RANK' not in os.environ or 'WORLD_SIZE' not in os.environ:
+            _log(
+                f">>> DDP Setup Failed: RANK or WORLD_SIZE not found in env. "
+                f"Keys found: {list(os.environ.keys())}"
+            )
+            _log(">>> Hint: launch with torchrun --nproc_per_node=<N> <script.py>")
+            return False, 0, 0, 1
+
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+
+        # Windows CUDA DDP is not supported
+        if os.name == "nt" and torch.cuda.is_available() and world_size > 1:
+            _log(
+                ">>> DDP Setup Disabled: Windows CUDA DDP is not supported. "
+                "Falling back to single process."
+            )
+            return False, 0, 0, 1
+
+        # Set CUDA device for this process
+        if torch.cuda.is_available():
+            torch.cuda.set_device(local_rank)
+
+        # Initialize process group
+        backend = _select_ddp_backend()
+        timeout = _get_ddp_timeout()
+
+        dist.init_process_group(backend=backend, init_method="env://", timeout=timeout)
+        _log(
+            f">>> DDP Initialized ({backend}, timeout={timeout.total_seconds():.0f}s): "
+            f"Rank {rank}/{world_size}, Local Rank {local_rank}"
+        )
+
+        return _cache_ddp_state(local_rank, rank, world_size)
+
+    @staticmethod
+    def cleanup_ddp():
+        """Destroy the DDP process group and clear cached state."""
+        if dist.is_initialized():
+            dist.destroy_process_group()
+        DistributedUtils._cached_state = None
+
+    @staticmethod
+    def is_main_process():
+        """Check if current process is rank 0 (main process).
+
+        Returns:
+            True if main process or DDP not initialized
+        """
+        return not dist.is_initialized() or dist.get_rank() == 0
+
+    @staticmethod
+    def world_size() -> int:
+        """Get the total number of processes in the distributed group.
+
+        Returns:
+            World size (1 if DDP not initialized)
+        """
+        return dist.get_world_size() if dist.is_initialized() else 1
+
+
+class TrainingUtils:
+    """General training utilities including CUDA management."""
+
+    @staticmethod
+    def free_cuda() -> None:
+        """Release CUDA memory and clear cache.
+
+        This performs aggressive cleanup:
+        1. Move all PyTorch models to CPU
+        2. Run garbage collection
+        3. Clear CUDA cache
+        """
+        _log(">>> Moving all models to CPU...")
+        for obj in gc.get_objects():
+            try:
+                if hasattr(obj, "to") and callable(obj.to):
+                    obj.to("cpu")
+            except Exception:
+                pass
+
+        _log(">>> Releasing tensor/optimizer/DataLoader references...")
+        gc.collect()
+
+        _log(">>> Clearing CUDA cache...")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            _log(">>> CUDA memory released.")
+        else:
+            _log(">>> CUDA not available; cleanup skipped.")
+
+
+# Backward compatibility function wrapper
+def free_cuda():
+    """Legacy function wrapper for CUDA memory cleanup.
+
+    This function calls TrainingUtils.free_cuda() for backward compatibility.
+    """
+    TrainingUtils.free_cuda()
ins_pricing/modelling/bayesopt/utils/torch_trainer_mixin.py
CHANGED

@@ -51,7 +51,14 @@ except Exception:
     plot_loss_curve_common = None
 
 # Import from other utils modules
-from ins_pricing.utils import
+from ins_pricing.utils import (
+    EPS,
+    compute_batch_size,
+    tweedie_loss,
+    ensure_parent_dir,
+    get_logger,
+    log_print,
+)
 from ins_pricing.utils.losses import (
     infer_loss_name_from_model_name,
     loss_requires_positive,

@@ -60,13 +67,19 @@ from ins_pricing.utils.losses import (
 )
 from ins_pricing.modelling.bayesopt.utils.distributed_utils import DistributedUtils
 
+_logger = get_logger("ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
 
 def _plot_skip(label: str) -> None:
     """Print message when plot is skipped due to missing matplotlib."""
     if _MPL_IMPORT_ERROR is not None:
-
+        _log(f"[Plot] Skip {label}: matplotlib unavailable ({_MPL_IMPORT_ERROR}).", flush=True)
     else:
-
+        _log(f"[Plot] Skip {label}: matplotlib unavailable.", flush=True)
 
 
 class TorchTrainerMixin:

@@ -121,7 +134,7 @@ class TorchTrainerMixin:
         mps_available = bool(getattr(torch.backends, "mps", None) and torch.backends.mps.is_available())
         ddp_enabled = bool(getattr(self, "is_ddp_enabled", False))
         data_parallel = bool(getattr(self, "use_data_parallel", False))
-
+        _log(
             f">>> Resource summary: device={device}, device_type={device_type}, "
             f"cpu_count={cpu_count}, cuda_count={cuda_count}, mps={mps_available}, "
             f"ddp={ddp_enabled}, data_parallel={data_parallel}, profile={profile}"

@@ -229,7 +242,7 @@ class TorchTrainerMixin:
             return batch_size
         max_batch = max(1, int(budget // per_sample))
         if max_batch < batch_size:
-
+            _log(
                 f">>> Memory cap: batch_size {batch_size} -> {max_batch} "
                 f"(per_sample~{sample_bytes}B, budget~{budget // (1024**2)}MB)"
             )

@@ -289,7 +302,7 @@ class TorchTrainerMixin:
         device_count = torch.cuda.device_count()
         if device_count > 1:
             min_bs = min_bs * device_count
-
+            _log(
                 f">>> Multi-GPU detected: {device_count} devices. Adjusted min_bs to {min_bs}.")
 
         if data_size > large_threshold:

@@ -329,7 +342,7 @@ class TorchTrainerMixin:
         if workers > 0:
             prefetch_factor = 4 if profile == "throughput" else 2
         persistent = workers > 0 and profile != "memory_saving"
-
+        _log(
             f">>> DataLoader config: Batch Size={batch_size}, Accum Steps={accum_steps}, "
             f"Workers={workers}, Prefetch={prefetch_factor or 'off'}, Profile={profile}")
         sampler = None

@@ -559,13 +572,13 @@ class TorchTrainerMixin:
         if should_log_epoch:
             elapsed = int(time.time() - epoch_start_ts)
             if val_weighted_loss is None:
-
+                _log(
                     f"[Training] Epoch {epoch}/{getattr(self, 'epochs', 1)} "
                     f"train_loss={float(train_epoch_loss):.6f} elapsed={elapsed}s",
                     flush=True,
                 )
             else:
-
+                _log(
                     f"[Training] Epoch {epoch}/{getattr(self, 'epochs', 1)} "
                     f"train_loss={float(train_epoch_loss):.6f} "
                     f"val_loss={float(val_weighted_loss):.6f} elapsed={elapsed}s",

@@ -620,4 +633,4 @@ class TorchTrainerMixin:
         plt.tight_layout()
         plt.savefig(save_path, dpi=300)
         plt.close(fig)
-
+        _log(f"[Training] Loss curve saved to {save_path}")
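
The recurring change in this file is mechanical: bare print-style calls are replaced by a module-level _log() shim that forwards to the shared logging helpers. A small sketch of the same pattern applied to a hypothetical module is shown below; only get_logger(name) and log_print(logger, *args, **kwargs) are taken from this diff, and the module name is made up.

from ins_pricing.utils import get_logger, log_print

# Hypothetical module adopting the same shim; the logger name is illustrative.
_logger = get_logger("ins_pricing.examples.some_module")


def _log(*args, **kwargs) -> None:
    # Print-compatible signature: call sites keep their f-strings and flush= kwargs,
    # but messages go through the package logger instead of raw stdout.
    log_print(_logger, *args, **kwargs)


_log(">>> DataLoader config: Batch Size=512, Workers=4", flush=True)

Because the shim keeps a print-compatible signature, the per-call change throughout the trainer mixin is just the function name, which is why most hunks above are one-line swaps.
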
ins_pricing/pricing/factors.py
CHANGED

@@ -1,43 +1,46 @@
 from __future__ import annotations
 
-from
+from collections import OrderedDict
+import hashlib
 from typing import Optional, Tuple
 
 import numpy as np
 import pandas as pd
 
 
-
-
-
-
-    method: str,
-    min_val: float,
-    max_val: float,
-    n_unique: int
-) -> Tuple[tuple, int]:
-    """Cache bin edge computation based on data characteristics.
+_BIN_CACHE_MAXSIZE = 128
+_BIN_CACHE: "OrderedDict[tuple, np.ndarray]" = OrderedDict()
+_BIN_CACHE_HITS = 0
+_BIN_CACHE_MISSES = 0
 
-    Args:
-        data_hash: Hash of sorted unique values for cache key
-        n_bins: Number of bins to create
-        method: Binning method ('quantile' or 'uniform')
-        min_val: Minimum value in data
-        max_val: Maximum value in data
-        n_unique: Number of unique values
 
-
-
+def _cache_key(series: pd.Series, n_bins: int, method: str) -> Optional[tuple]:
+    try:
+        values = series.dropna().to_numpy(dtype=float, copy=False)
+        if values.size == 0:
+            return None
+        values = np.sort(values)
+        digest = hashlib.blake2b(values.tobytes(), digest_size=16).hexdigest()
+        return (digest, int(values.size), int(n_bins), str(method))
+    except Exception:
+        return None
+
+
+def _cache_get(key: tuple) -> Optional[np.ndarray]:
+    global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
+    if key in _BIN_CACHE:
+        _BIN_CACHE_HITS += 1
+        _BIN_CACHE.move_to_end(key)
+        return _BIN_CACHE[key].copy()
+    _BIN_CACHE_MISSES += 1
+    return None
 
-
-
-
-
-
-
-    # The actual binning is done in the calling function
-    # This just provides a cache key mechanism
-    return (data_hash, n_bins, method, min_val, max_val, n_unique), n_bins
+
+def _cache_set(key: tuple, edges: np.ndarray) -> None:
+    _BIN_CACHE[key] = np.asarray(edges, dtype=float)
+    _BIN_CACHE.move_to_end(key)
+    if len(_BIN_CACHE) > _BIN_CACHE_MAXSIZE:
+        _BIN_CACHE.popitem(last=False)
 
 
 def bin_numeric(

@@ -66,34 +69,40 @@ def bin_numeric(
     When use_cache=True, identical distributions will reuse cached bin edges,
     improving performance when the same column is binned multiple times.
     """
-
-
-
-
-
-
-
-
-
-
-    # Check cache (the function call acts as cache lookup)
-    try:
-        _compute_bins_cached(data_hash, bins, method, min_val, max_val, n_unique)
-    except Exception:
-        # If hashing fails, proceed without cache
-        pass
+    cache_key = _cache_key(series, bins, method) if use_cache else None
+    bin_edges_full: Optional[np.ndarray] = None
+
+    if cache_key is not None:
+        bin_edges_full = _cache_get(cache_key)
+
+    if bin_edges_full is not None:
+        binned = pd.cut(series, bins=bin_edges_full, include_lowest=True, labels=labels)
+        return binned, np.asarray(bin_edges_full[:-1], dtype=float)
 
     # Perform actual binning
     if method == "quantile":
-        binned = pd.qcut(
-
+        binned, bin_edges_full = pd.qcut(
+            series,
+            q=bins,
+            duplicates="drop",
+            labels=labels,
+            retbins=True,
+        )
     elif method == "uniform":
-        binned = pd.cut(
-
+        binned, bin_edges_full = pd.cut(
+            series,
+            bins=bins,
+            include_lowest=include_lowest,
+            labels=labels,
+            retbins=True,
+        )
     else:
         raise ValueError("method must be one of: quantile, uniform.")
 
-
+    if cache_key is not None and bin_edges_full is not None:
+        _cache_set(cache_key, np.asarray(bin_edges_full, dtype=float))
+
+    return binned, np.asarray(bin_edges_full[:-1], dtype=float)
 
 
 def clear_binning_cache() -> None:

@@ -108,7 +117,10 @@ def clear_binning_cache() -> None:
     >>> # After processing many different columns
     >>> clear_binning_cache()
     """
-
+    global _BIN_CACHE_HITS, _BIN_CACHE_MISSES
+    _BIN_CACHE.clear()
+    _BIN_CACHE_HITS = 0
+    _BIN_CACHE_MISSES = 0
 
 
 def get_cache_info() -> dict:

@@ -126,12 +138,11 @@ def get_cache_info() -> dict:
     >>> info = get_cache_info()
     >>> print(f"Cache hit rate: {info['hits'] / (info['hits'] + info['misses']):.2%}")
     """
-    cache_info = _compute_bins_cached.cache_info()
     return {
-
-
-
-
+        "hits": _BIN_CACHE_HITS,
+        "misses": _BIN_CACHE_MISSES,
+        "maxsize": _BIN_CACHE_MAXSIZE,
+        "currsize": len(_BIN_CACHE),
     }
 
 
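
From the caller's perspective this change swaps the old cache-key helper for an explicit OrderedDict LRU keyed by a blake2b digest of the sorted column values, with hit/miss counters exposed through get_cache_info(). A brief usage sketch follows; the sample data and the exact keyword names of bin_numeric are assumptions inferred from the parameters visible in this hunk.

import pandas as pd

from ins_pricing.pricing.factors import bin_numeric, clear_binning_cache, get_cache_info

s = pd.Series([1.0, 2.5, 3.7, 4.2, 5.9, 7.3, 8.8, 10.1])

# First call computes quantile edges and stores them under a digest of the sorted values.
binned_a, edges_a = bin_numeric(s, bins=4, method="quantile", use_cache=True)
# A second call over identically distributed values is served from the cache.
binned_b, edges_b = bin_numeric(s.copy(), bins=4, method="quantile", use_cache=True)

print(get_cache_info())  # expected shape: {'hits': ..., 'misses': ..., 'maxsize': 128, 'currsize': ...}
clear_binning_cache()    # empties the cache and resets the hit/miss counters
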
ins_pricing/setup.py
CHANGED