ins-pricing 0.1.11-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- ins_pricing/README.md +9 -6
- ins_pricing/__init__.py +3 -11
- ins_pricing/cli/BayesOpt_entry.py +24 -0
- ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
- ins_pricing/cli/Explain_Run.py +25 -0
- ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
- ins_pricing/cli/Pricing_Run.py +25 -0
- ins_pricing/cli/__init__.py +1 -0
- ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
- ins_pricing/cli/utils/__init__.py +1 -0
- ins_pricing/cli/utils/cli_common.py +320 -0
- ins_pricing/cli/utils/cli_config.py +375 -0
- ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
- {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
- ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
- ins_pricing/docs/modelling/README.md +34 -0
- ins_pricing/modelling/__init__.py +57 -6
- ins_pricing/modelling/core/__init__.py +1 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
- ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
- ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
- ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
- ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
- ins_pricing/modelling/core/evaluation.py +115 -0
- ins_pricing/production/__init__.py +4 -0
- ins_pricing/production/preprocess.py +71 -0
- ins_pricing/setup.py +10 -5
- {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
- ins_pricing-0.2.0.dist-info/RECORD +125 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
- ins_pricing/modelling/BayesOpt_entry.py +0 -633
- ins_pricing/modelling/Explain_Run.py +0 -36
- ins_pricing/modelling/Pricing_Run.py +0 -36
- ins_pricing/modelling/README.md +0 -33
- ins_pricing/modelling/bayesopt/models.py +0 -2196
- ins_pricing/modelling/bayesopt/trainers.py +0 -2446
- ins_pricing/modelling/cli_common.py +0 -136
- ins_pricing/modelling/tests/test_plotting.py +0 -63
- ins_pricing/modelling/watchdog_run.py +0 -211
- ins_pricing-0.1.11.dist-info/RECORD +0 -169
- ins_pricing_gemini/__init__.py +0 -23
- ins_pricing_gemini/governance/__init__.py +0 -20
- ins_pricing_gemini/governance/approval.py +0 -93
- ins_pricing_gemini/governance/audit.py +0 -37
- ins_pricing_gemini/governance/registry.py +0 -99
- ins_pricing_gemini/governance/release.py +0 -159
- ins_pricing_gemini/modelling/Explain_Run.py +0 -36
- ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
- ins_pricing_gemini/modelling/__init__.py +0 -151
- ins_pricing_gemini/modelling/cli_common.py +0 -141
- ins_pricing_gemini/modelling/config.py +0 -249
- ins_pricing_gemini/modelling/config_preprocess.py +0 -254
- ins_pricing_gemini/modelling/core.py +0 -741
- ins_pricing_gemini/modelling/data_container.py +0 -42
- ins_pricing_gemini/modelling/explain/__init__.py +0 -55
- ins_pricing_gemini/modelling/explain/gradients.py +0 -334
- ins_pricing_gemini/modelling/explain/metrics.py +0 -176
- ins_pricing_gemini/modelling/explain/permutation.py +0 -155
- ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
- ins_pricing_gemini/modelling/features.py +0 -215
- ins_pricing_gemini/modelling/model_manager.py +0 -148
- ins_pricing_gemini/modelling/model_plotting.py +0 -463
- ins_pricing_gemini/modelling/models.py +0 -2203
- ins_pricing_gemini/modelling/notebook_utils.py +0 -294
- ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
- ins_pricing_gemini/modelling/plotting/common.py +0 -63
- ins_pricing_gemini/modelling/plotting/curves.py +0 -572
- ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
- ins_pricing_gemini/modelling/plotting/geo.py +0 -362
- ins_pricing_gemini/modelling/plotting/importance.py +0 -121
- ins_pricing_gemini/modelling/run_logging.py +0 -133
- ins_pricing_gemini/modelling/tests/conftest.py +0 -8
- ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
- ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
- ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
- ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
- ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
- ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
- ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
- ins_pricing_gemini/modelling/trainers.py +0 -2447
- ins_pricing_gemini/modelling/utils.py +0 -1020
- ins_pricing_gemini/pricing/__init__.py +0 -27
- ins_pricing_gemini/pricing/calibration.py +0 -39
- ins_pricing_gemini/pricing/data_quality.py +0 -117
- ins_pricing_gemini/pricing/exposure.py +0 -85
- ins_pricing_gemini/pricing/factors.py +0 -91
- ins_pricing_gemini/pricing/monitoring.py +0 -99
- ins_pricing_gemini/pricing/rate_table.py +0 -78
- ins_pricing_gemini/production/__init__.py +0 -21
- ins_pricing_gemini/production/drift.py +0 -30
- ins_pricing_gemini/production/monitoring.py +0 -143
- ins_pricing_gemini/production/scoring.py +0 -40
- ins_pricing_gemini/reporting/__init__.py +0 -11
- ins_pricing_gemini/reporting/report_builder.py +0 -72
- ins_pricing_gemini/reporting/scheduler.py +0 -45
- ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
- ins_pricing_gemini/scripts/Explain_entry.py +0 -545
- ins_pricing_gemini/scripts/__init__.py +0 -1
- ins_pricing_gemini/scripts/train.py +0 -568
- ins_pricing_gemini/setup.py +0 -55
- ins_pricing_gemini/smoke_test.py +0 -28
- /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
- /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
- /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
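Most of this release is a re-layout rather than new behavior: the bayesopt core moves from `ins_pricing/modelling/bayesopt/` to `ins_pricing/modelling/core/bayesopt/`, the monolithic `models.py` and `trainers.py` are split into per-model and per-trainer files, CLI entry points consolidate under `ins_pricing/cli/`, and the parallel `ins_pricing_gemini` tree is deleted (hence `top_level.txt` drops one entry). A hedged sketch of the import migration this implies for downstream code, inferred only from the file moves above; the actual 0.2.0 public API is not shown in this diff and should be verified against the installed wheel:

```python
# Hypothetical 0.1.11 -> 0.2.0 import migration, inferred from the renames in
# this diff. Module paths are assumptions, not confirmed against the wheel.

# 0.1.11 layout (removed/moved in 0.2.0):
#   from ins_pricing.modelling.bayesopt import core
#   from ins_pricing.modelling import cli_common, notebook_utils, run_logging

# 0.2.0 layout (per the moves above):
from ins_pricing.modelling.core.bayesopt import core           # modelling/bayesopt -> modelling/core/bayesopt
from ins_pricing.cli.utils import cli_common, notebook_utils   # modelling/* -> cli/utils/*
from ins_pricing.cli.utils import run_logging                  # moved verbatim (+0 -0)
```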

--- ins_pricing_gemini/modelling/cli_common.py
+++ /dev/null
@@ -1,141 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
-try:
-    from .config import BayesOptConfig
-except ImportError:
-    from ins_pricing.modelling.config import BayesOptConfig
-
-
-
-PLOT_MODEL_LABELS: Dict[str, Tuple[str, str]] = {
-    "glm": ("GLM", "pred_glm"),
-    "xgb": ("Xgboost", "pred_xgb"),
-    "resn": ("ResNet", "pred_resn"),
-    "ft": ("FTTransformer", "pred_ft"),
-    "gnn": ("GNN", "pred_gnn"),
-}
-
-PYTORCH_TRAINERS = {"resn", "ft", "gnn"}
-
-
-def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
-    seen = set()
-    unique: List[str] = []
-    for item in items:
-        if item not in seen:
-            unique.append(item)
-            seen.add(item)
-    return unique
-
-
-def build_model_names(prefixes: Sequence[str], suffixes: Sequence[str]) -> List[str]:
-    names: List[str] = []
-    for suffix in suffixes:
-        names.extend(f"{prefix}_{suffix}" for prefix in prefixes)
-    return names
-
-
-def parse_model_pairs(raw_pairs: List) -> List[Tuple[str, str]]:
-    pairs: List[Tuple[str, str]] = []
-    for pair in raw_pairs:
-        if isinstance(pair, (list, tuple)) and len(pair) == 2:
-            pairs.append((str(pair[0]), str(pair[1])))
-        elif isinstance(pair, str):
-            parts = [p.strip() for p in pair.split(",") if p.strip()]
-            if len(parts) == 2:
-                pairs.append((parts[0], parts[1]))
-    return pairs
-
-
-def resolve_path(value: Optional[str], base_dir: Path) -> Optional[Path]:
-    if value is None:
-        return None
-    if not isinstance(value, str) or not value.strip():
-        return None
-    p = Path(value)
-    if p.is_absolute():
-        return p
-    return (base_dir / p).resolve()
-
-
-def resolve_config_path(raw: str, script_dir: Path) -> Path:
-    candidate = Path(raw)
-    if candidate.exists():
-        return candidate.resolve()
-    candidate2 = (script_dir / raw)
-    if candidate2.exists():
-        return candidate2.resolve()
-    raise FileNotFoundError(
-        f"Config file not found: {raw}. Tried: {Path(raw).resolve()} and {candidate2.resolve()}"
-    )
-
-
-def load_config_json(path: Path, required_keys: Sequence[str]) -> BayesOptConfig:
-    cfg_dict = json.loads(path.read_text(encoding="utf-8"))
-    missing = [key for key in required_keys if key not in cfg_dict]
-    if missing:
-        raise ValueError(f"Missing required keys in {path}: {missing}")
-    return BayesOptConfig.from_legacy_dict(cfg_dict)
-
-
-def set_env(env_overrides: Dict[str, Any]) -> None:
-    """Apply environment variables from config.json.
-
-    Notes (DDP/Optuna hang debugging):
-    - You can add these keys into config.json's `env` to debug distributed hangs:
-      - `TORCH_DISTRIBUTED_DEBUG=DETAIL`
-      - `NCCL_DEBUG=INFO`
-      - `BAYESOPT_DDP_BARRIER_DEBUG=1`
-      - `BAYESOPT_DDP_BARRIER_TIMEOUT=300`
-      - `BAYESOPT_CUDA_SYNC=1` (optional; can slow down)
-      - `BAYESOPT_CUDA_IPC_COLLECT=1` (optional; can slow down)
-    - This function uses `os.environ.setdefault`, so a value already set in the
-      shell will take precedence over config.json.
-    """
-    for key, value in (env_overrides or {}).items():
-        os.environ.setdefault(str(key), str(value))
-
-
-def _looks_like_url(value: str) -> bool:
-    value = str(value)
-    return "://" in value
-
-
-def normalize_config_paths(cfg: Dict[str, Any], config_path: Path) -> Dict[str, Any]:
-    """Resolve relative paths against the config.json directory.
-
-    Fields handled:
-    - data_dir / output_dir / optuna_storage / gnn_graph_cache
-    - best_params_files (dict: model_key -> path)
-    """
-    base_dir = config_path.parent
-    out = dict(cfg)
-
-    for key in ("data_dir", "output_dir", "gnn_graph_cache"):
-        if key in out and isinstance(out.get(key), str):
-            resolved = resolve_path(out.get(key), base_dir)
-            if resolved is not None:
-                out[key] = str(resolved)
-
-    storage = out.get("optuna_storage")
-    if isinstance(storage, str) and storage.strip():
-        if not _looks_like_url(storage):
-            resolved = resolve_path(storage, base_dir)
-            if resolved is not None:
-                out["optuna_storage"] = str(resolved)
-
-    best_files = out.get("best_params_files")
-    if isinstance(best_files, dict):
-        resolved_map: Dict[str, str] = {}
-        for mk, path_str in best_files.items():
-            if not isinstance(path_str, str):
-                continue
-            resolved = resolve_path(path_str, base_dir)
-            resolved_map[str(mk)] = str(resolved) if resolved is not None else str(path_str)
-        out["best_params_files"] = resolved_map
-
-    return out
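These removed helpers formed a small config-loading pipeline: find the JSON config, apply `env` overrides, resolve relative paths against the config directory, and build a `BayesOptConfig`. A minimal sketch of how they composed under the removed 0.1.x layout; the `config.json` name and its keys here are illustrative:

```python
# Illustrative composition of the removed helpers; assumes the 0.1.x package
# is installed and a config.json with the keys shown sits next to the script.
import json
from pathlib import Path

from ins_pricing_gemini.modelling.cli_common import (
    resolve_config_path, set_env, normalize_config_paths,
)
from ins_pricing_gemini.modelling.config import BayesOptConfig

script_dir = Path(__file__).resolve().parent
cfg_path = resolve_config_path("config.json", script_dir)   # tries CWD first, then script dir

raw = json.loads(cfg_path.read_text(encoding="utf-8"))
set_env(raw.get("env", {}))                   # setdefault: values already set in the shell win
raw = normalize_config_paths(raw, cfg_path)   # data_dir/output_dir/... resolved vs config dir
cfg = BayesOptConfig.from_legacy_dict(raw)    # load_config_json() wraps this plus key validation
```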

--- ins_pricing_gemini/modelling/config.py
+++ /dev/null
@@ -1,249 +0,0 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional
-from pydantic import BaseModel, Field, validator
-
-class DataConfig(BaseModel):
-    resp_nme: str
-    weight_nme: str
-    factor_nmes: List[str]
-    cate_list: Optional[List[str]] = None
-    binary_resp_nme: Optional[str] = None
-    task_type: str = 'regression'
-    prop_test: float = 0.25
-    rand_seed: Optional[int] = None
-
-class DistributedConfig(BaseModel):
-    use_gpu: bool = True
-    use_resn_data_parallel: bool = False
-    use_ft_data_parallel: bool = False
-    use_gnn_data_parallel: bool = False
-    use_resn_ddp: bool = False
-    use_ft_ddp: bool = False
-    use_gnn_ddp: bool = False
-    # DDP Timeout settings can be passed via env, but good to have here if needed
-
-class GNNConfig(BaseModel):
-    use_approx_knn: bool = True
-    approx_knn_threshold: int = 50000
-    graph_cache: Optional[str] = None
-    max_gpu_knn_nodes: Optional[int] = 200000
-    knn_gpu_mem_ratio: float = 0.9
-    knn_gpu_mem_overhead: float = 2.0
-
-class RegionConfig(BaseModel):
-    province_col: Optional[str] = None
-    city_col: Optional[str] = None
-    effect_alpha: float = 50.0
-
-class GeoTokenConfig(BaseModel):
-    feature_nmes: Optional[List[str]] = None
-    hidden_dim: int = 32
-    layers: int = 2
-    dropout: float = 0.1
-    k_neighbors: int = 10
-    learning_rate: float = 1e-3
-    epochs: int = 50
-
-class OptunaConfig(BaseModel):
-    storage: Optional[str] = None
-    study_prefix: Optional[str] = None
-    best_params_files: Optional[Dict[str, str]] = None
-    reuse_best_params: bool = False
-
-class FTConfig(BaseModel):
-    role: str = "model"  # "model", "embedding", "unsupervised_embedding"
-    feature_prefix: str = "ft_emb"
-    num_numeric_tokens: Optional[int] = None
-
-class BayesOptConfig(BaseModel):
-    # Core Data & Task
-    data: DataConfig
-
-    # Model Names & Meta
-    model_nme: str
-
-    # Training Hyperparameters
-    epochs: int = 100
-    xgb_max_depth_max: int = 25
-    xgb_n_estimators_max: int = 500
-    resn_weight_decay: float = 1e-4
-
-    # Sub-component Configs
-    dist: DistributedConfig = Field(default_factory=DistributedConfig)
-    gnn: GNNConfig = Field(default_factory=GNNConfig)
-    region: RegionConfig = Field(default_factory=RegionConfig)
-    geo: GeoTokenConfig = Field(default_factory=GeoTokenConfig)
-    optuna: OptunaConfig = Field(default_factory=OptunaConfig)
-    ft: FTConfig = Field(default_factory=FTConfig)
-
-    # Ensemble & output
-    output_dir: Optional[str] = None
-    final_ensemble: bool = False
-    final_ensemble_k: int = 3
-    final_refit: bool = True
-
-    # Flattened accessors for backward compatibility
-    @property
-    def resp_nme(self): return self.data.resp_nme
-    @property
-    def weight_nme(self): return self.data.weight_nme
-    @property
-    def factor_nmes(self): return self.data.factor_nmes
-    @property
-    def task_type(self): return self.data.task_type
-    @property
-    def cate_list(self): return self.data.cate_list
-    @property
-    def binary_resp_nme(self): return self.data.binary_resp_nme
-    @property
-    def prop_test(self): return self.data.prop_test
-    @property
-    def rand_seed(self): return self.data.rand_seed
-
-    @property
-    def use_gpu(self): return self.dist.use_gpu
-    @property
-    def use_resn_data_parallel(self): return self.dist.use_resn_data_parallel
-    @property
-    def use_ft_data_parallel(self): return self.dist.use_ft_data_parallel
-    @property
-    def use_gnn_data_parallel(self): return self.dist.use_gnn_data_parallel
-    @property
-    def use_resn_ddp(self): return self.dist.use_resn_ddp
-    @property
-    def use_ft_ddp(self): return self.dist.use_ft_ddp
-    @property
-    def use_gnn_ddp(self): return self.dist.use_gnn_ddp
-
-    @property
-    def gnn_use_approx_knn(self): return self.gnn.use_approx_knn
-    @property
-    def gnn_approx_knn_threshold(self): return self.gnn.approx_knn_threshold
-    @property
-    def gnn_graph_cache(self): return self.gnn.graph_cache
-    @property
-    def gnn_max_gpu_knn_nodes(self): return self.gnn.max_gpu_knn_nodes
-    @property
-    def gnn_knn_gpu_mem_ratio(self): return self.gnn.knn_gpu_mem_ratio
-    @property
-    def gnn_knn_gpu_mem_overhead(self): return self.gnn.knn_gpu_mem_overhead
-
-    @property
-    def region_province_col(self): return self.region.province_col
-    @property
-    def region_city_col(self): return self.region.city_col
-    @property
-    def region_effect_alpha(self): return self.region.effect_alpha
-
-    @property
-    def geo_feature_nmes(self): return self.geo.feature_nmes
-    @property
-    def geo_token_hidden_dim(self): return self.geo.hidden_dim
-    @property
-    def geo_token_layers(self): return self.geo.layers
-    @property
-    def geo_token_dropout(self): return self.geo.dropout
-    @property
-    def geo_token_k_neighbors(self): return self.geo.k_neighbors
-    @property
-    def geo_token_learning_rate(self): return self.geo.learning_rate
-    @property
-    def geo_token_epochs(self): return self.geo.epochs
-
-    @property
-    def optuna_storage(self): return self.optuna.storage
-    @property
-    def optuna_study_prefix(self): return self.optuna.study_prefix
-    @property
-    def best_params_files(self): return self.optuna.best_params_files
-    @property
-    def reuse_best_params(self): return self.optuna.reuse_best_params
-
-    @property
-    def ft_role(self): return self.ft.role
-    @property
-    def ft_feature_prefix(self): return self.ft.feature_prefix
-    @property
-    def ft_num_numeric_tokens(self): return self.ft.num_numeric_tokens
-
-    @classmethod
-    def from_legacy_dict(cls, d: Dict[str, Any]) -> 'BayesOptConfig':
-        """Map flat dictionary to nested Pydantic structure."""
-        data = DataConfig(
-            resp_nme=d.get('resp_nme'),
-            weight_nme=d.get('weight_nme'),
-            factor_nmes=d.get('factor_nmes', []),
-            cate_list=d.get('cate_list'),
-            binary_resp_nme=d.get('binary_resp_nme'),
-            task_type=d.get('task_type', 'regression'),
-            prop_test=d.get('prop_test', 0.25),
-            rand_seed=d.get('rand_seed')
-        )
-
-        dist = DistributedConfig(
-            use_gpu=d.get('use_gpu', True),
-            use_resn_data_parallel=d.get('use_resn_data_parallel', False),
-            use_ft_data_parallel=d.get('use_ft_data_parallel', False),
-            use_gnn_data_parallel=d.get('use_gnn_data_parallel', False),
-            use_resn_ddp=d.get('use_resn_ddp', False),
-            use_ft_ddp=d.get('use_ft_ddp', False),
-            use_gnn_ddp=d.get('use_gnn_ddp', False),
-        )
-
-        gnn = GNNConfig(
-            use_approx_knn=d.get('gnn_use_approx_knn', True),
-            approx_knn_threshold=d.get('gnn_approx_knn_threshold', 50000),
-            graph_cache=d.get('gnn_graph_cache'),
-            max_gpu_knn_nodes=d.get('gnn_max_gpu_knn_nodes', 200000),
-            knn_gpu_mem_ratio=d.get('gnn_knn_gpu_mem_ratio', 0.9),
-            knn_gpu_mem_overhead=d.get('gnn_knn_gpu_mem_overhead', 2.0),
-        )
-
-        region = RegionConfig(
-            province_col=d.get('region_province_col'),
-            city_col=d.get('region_city_col'),
-            effect_alpha=d.get('region_effect_alpha', 50.0)
-        )
-
-        geo = GeoTokenConfig(
-            feature_nmes=d.get('geo_feature_nmes'),
-            hidden_dim=d.get('geo_token_hidden_dim', 32),
-            layers=d.get('geo_token_layers', 2),
-            dropout=d.get('geo_token_dropout', 0.1),
-            k_neighbors=d.get('geo_token_k_neighbors', 10),
-            learning_rate=d.get('geo_token_learning_rate', 1e-3),
-            epochs=d.get('geo_token_epochs', 50)
-        )
-
-        optuna = OptunaConfig(
-            storage=d.get('optuna_storage'),
-            study_prefix=d.get('optuna_study_prefix'),
-            best_params_files=d.get('best_params_files'),
-            reuse_best_params=d.get('reuse_best_params', False)
-        )
-
-        ft = FTConfig(
-            role=d.get('ft_role', 'model'),
-            feature_prefix=d.get('ft_feature_prefix', 'ft_emb'),
-            num_numeric_tokens=d.get('ft_num_numeric_tokens')
-        )
-
-        return cls(
-            data=data,
-            model_nme=d.get('model_nme', 'model'),
-            epochs=d.get('epochs', 100),
-            xgb_max_depth_max=d.get('xgb_max_depth_max', 25),
-            xgb_n_estimators_max=d.get('xgb_n_estimators_max', 500),
-            resn_weight_decay=d.get('resn_weight_decay', 1e-4),
-            dist=dist,
-            gnn=gnn,
-            region=region,
-            geo=geo,
-            optuna=optuna,
-            ft=ft,
-            output_dir=d.get('output_dir'),
-            final_ensemble=d.get('final_ensemble', False),
-            final_ensemble_k=d.get('final_ensemble_k', 3),
-            final_refit=d.get('final_refit', True)
-        )
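This removed class was mostly a compatibility shim: `from_legacy_dict` lifts a flat 0.1.x-style dict into the nested sub-configs, and the `@property` block re-exposes every nested field under its old flat name, so callers written against either shape keep working. A small sketch of that round trip (the values are illustrative):

```python
# Minimal round trip through the removed legacy mapping: flat keys in,
# nested model out, flat property access preserved.
from ins_pricing_gemini.modelling.config import BayesOptConfig

flat = {
    "resp_nme": "loss",
    "weight_nme": "exposure",
    "factor_nmes": ["age", "region"],
    "model_nme": "motor_freq",
    "use_ft_ddp": True,
    "gnn_graph_cache": "cache/graph.pt",
}
cfg = BayesOptConfig.from_legacy_dict(flat)

assert cfg.data.resp_nme == "loss"              # nested access
assert cfg.resp_nme == "loss"                   # flattened backward-compat property
assert cfg.dist.use_ft_ddp is True              # flat key routed into DistributedConfig
assert cfg.gnn_graph_cache == "cache/graph.pt"  # property reads self.gnn.graph_cache
```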

--- ins_pricing_gemini/modelling/config_preprocess.py
+++ /dev/null
@@ -1,254 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from dataclasses import dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import numpy as np
-import pandas as pd
-from sklearn.preprocessing import StandardScaler
-
-from .utils import IOUtils
-from .config import BayesOptConfig
-
-
-# NOTE: Some CSV exports may contain invisible BOM characters or leading/trailing
-# spaces in column names. Pandas requires exact matches, so we normalize a few
-# "required" column names (response/weight/binary response) before validating.
-
-
-def _clean_column_name(name: Any) -> Any:
-    if not isinstance(name, str):
-        return name
-    return name.replace("\ufeff", "").strip()
-
-
-def _normalize_required_columns(
-    df: pd.DataFrame, required: List[Optional[str]], *, df_label: str
-) -> None:
-    required_names = [r for r in required if isinstance(r, str) and r.strip()]
-    if not required_names:
-        return
-
-    mapping: Dict[Any, Any] = {}
-    existing = set(df.columns)
-    for col in df.columns:
-        cleaned = _clean_column_name(col)
-        if cleaned != col and cleaned not in existing:
-            mapping[col] = cleaned
-    if mapping:
-        df.rename(columns=mapping, inplace=True)
-
-    existing = set(df.columns)
-    for req in required_names:
-        if req in existing:
-            continue
-        candidates = [
-            col
-            for col in df.columns
-            if isinstance(col, str) and _clean_column_name(col).lower() == req.lower()
-        ]
-        if len(candidates) == 1 and req not in existing:
-            df.rename(columns={candidates[0]: req}, inplace=True)
-            existing = set(df.columns)
-        elif len(candidates) > 1:
-            raise KeyError(
-                f"{df_label} has multiple columns matching required {req!r} "
-                f"(case/space-insensitive): {candidates}"
-            )
-
-
-# ===== Core components and training wrappers =================================
-
-# =============================================================================
-# Config, preprocessing, and trainer base types
-# =============================================================================
-# BayesOptConfig is now imported from .config
-
-
-
-class OutputManager:
-    # Centralize output paths for plots, results, and models.
-
-    def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
-        self.root = Path(root or os.getcwd())
-        self.model_name = model_name
-        self.plot_dir = self.root / 'plot'
-        self.result_dir = self.root / 'Results'
-        self.model_dir = self.root / 'model'
-
-    def _prepare(self, path: Path) -> str:
-        IOUtils.ensure_parent_dir(str(path))
-        return str(path)
-
-    def plot_path(self, filename: str) -> str:
-        return self._prepare(self.plot_dir / filename)
-
-    def result_path(self, filename: str) -> str:
-        return self._prepare(self.result_dir / filename)
-
-    def model_path(self, filename: str) -> str:
-        return self._prepare(self.model_dir / filename)
-
-
-class VersionManager:
-    """Lightweight versioning: save config and best-params snapshots for traceability."""
-
-    def __init__(self, output: OutputManager) -> None:
-        self.output = output
-        self.version_dir = Path(self.output.result_dir) / "versions"
-        IOUtils.ensure_parent_dir(str(self.version_dir))
-
-    def save(self, tag: str, payload: Dict[str, Any]) -> str:
-        safe_tag = tag.replace(" ", "_")
-        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
-        path = self.version_dir / f"{ts}_{safe_tag}.json"
-        IOUtils.ensure_parent_dir(str(path))
-        with open(path, "w", encoding="utf-8") as f:
-            json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
-        print(f"[Version] Saved snapshot: {path}")
-        return str(path)
-
-    def load_latest(self, tag: str) -> Optional[Dict[str, Any]]:
-        """Load the latest snapshot for a tag (sorted by timestamp prefix)."""
-        safe_tag = tag.replace(" ", "_")
-        pattern = f"*_{safe_tag}.json"
-        candidates = sorted(self.version_dir.glob(pattern))
-        if not candidates:
-            return None
-        path = candidates[-1]
-        try:
-            return json.loads(path.read_text(encoding="utf-8"))
-        except Exception as exc:
-            print(f"[Version] Failed to load snapshot {path}: {exc}")
-            return None
-
-
-class DatasetPreprocessor:
-    # Prepare shared train/test views for trainers.
-
-    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
-                 config: BayesOptConfig) -> None:
-        self.config = config
-        self.train_data = train_df.copy(deep=False)
-        self.test_data = test_df.copy(deep=False)
-        self.num_features: List[str] = []
-        self.train_oht_data: Optional[pd.DataFrame] = None
-        self.test_oht_data: Optional[pd.DataFrame] = None
-        self.train_oht_scl_data: Optional[pd.DataFrame] = None
-        self.test_oht_scl_data: Optional[pd.DataFrame] = None
-        self.var_nmes: List[str] = []
-        self.cat_categories_for_shap: Dict[str, List[Any]] = {}
-
-    def run(self) -> "DatasetPreprocessor":
-        """Run preprocessing: categorical encoding, target clipping, numeric scaling."""
-        cfg = self.config
-        _normalize_required_columns(
-            self.train_data,
-            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
-            df_label="Train data",
-        )
-        _normalize_required_columns(
-            self.test_data,
-            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
-            df_label="Test data",
-        )
-        missing_train = [
-            col for col in (cfg.resp_nme, cfg.weight_nme)
-            if col not in self.train_data.columns
-        ]
-        if missing_train:
-            raise KeyError(
-                f"Train data missing required columns: {missing_train}. "
-                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
-            )
-        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.train_data.columns:
-            raise KeyError(
-                f"Train data missing binary response column: {cfg.binary_resp_nme}. "
-                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
-            )
-
-        test_has_resp = cfg.resp_nme in self.test_data.columns
-        test_has_weight = cfg.weight_nme in self.test_data.columns
-        test_has_binary = bool(
-            cfg.binary_resp_nme and cfg.binary_resp_nme in self.test_data.columns
-        )
-        if not test_has_weight:
-            self.test_data[cfg.weight_nme] = 1.0
-        if not test_has_resp:
-            self.test_data[cfg.resp_nme] = np.nan
-        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.test_data.columns:
-            self.test_data[cfg.binary_resp_nme] = np.nan
-
-        # Precompute weighted actuals for plots and validation checks.
-        self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
-            self.train_data[cfg.weight_nme]
-        if test_has_resp:
-            self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
-                self.test_data[cfg.weight_nme]
-        if cfg.binary_resp_nme:
-            self.train_data.loc[:, 'w_binary_act'] = self.train_data[cfg.binary_resp_nme] * \
-                self.train_data[cfg.weight_nme]
-            if test_has_binary:
-                self.test_data.loc[:, 'w_binary_act'] = self.test_data[cfg.binary_resp_nme] * \
-                    self.test_data[cfg.weight_nme]
-        # High-quantile clipping absorbs outliers; removing it lets extremes dominate loss.
-        q99 = self.train_data[cfg.resp_nme].quantile(0.999)
-        self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
-            upper=q99)
-        cate_list = list(cfg.cate_list or [])
-        if cate_list:
-            for cate in cate_list:
-                self.train_data[cate] = self.train_data[cate].astype(
-                    'category')
-                self.test_data[cate] = self.test_data[cate].astype('category')
-                cats = self.train_data[cate].cat.categories
-                self.cat_categories_for_shap[cate] = list(cats)
-        self.num_features = [
-            nme for nme in cfg.factor_nmes if nme not in cate_list]
-        train_oht = self.train_data[cfg.factor_nmes +
-                                    [cfg.weight_nme] + [cfg.resp_nme]].copy()
-        test_oht = self.test_data[cfg.factor_nmes +
-                                  [cfg.weight_nme] + [cfg.resp_nme]].copy()
-        train_oht = pd.get_dummies(
-            train_oht,
-            columns=cate_list,
-            drop_first=True,
-            dtype=np.int8
-        )
-        test_oht = pd.get_dummies(
-            test_oht,
-            columns=cate_list,
-            drop_first=True,
-            dtype=np.int8
-        )
-
-        # Fill missing dummy columns when reindexing to align train/test columns.
-        test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
-
-        # Keep unscaled one-hot data for fold-specific scaling to avoid leakage.
-        self.train_oht_data = train_oht
-        self.test_oht_data = test_oht
-
-        train_oht_scaled = train_oht.copy(deep=False)
-        test_oht_scaled = test_oht.copy(deep=False)
-        for num_chr in self.num_features:
-            # Scale per column so features are on comparable ranges for NN stability.
-            scaler = StandardScaler()
-            train_oht_scaled[num_chr] = scaler.fit_transform(
-                train_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
-            test_oht_scaled[num_chr] = scaler.transform(
-                test_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
-        # Fill missing dummy columns when reindexing to align train/test columns.
-        test_oht_scaled = test_oht_scaled.reindex(
-            columns=train_oht_scaled.columns, fill_value=0)
-        self.train_oht_scl_data = train_oht_scaled
-        self.test_oht_scl_data = test_oht_scaled
-        excluded = {cfg.weight_nme, cfg.resp_nme}
-        self.var_nmes = [
-            col for col in train_oht_scaled.columns if col not in excluded
-        ]
-        return self
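`DatasetPreprocessor.run()` was the shared featurization step the trainers consumed: it normalizes column names, fills a missing test response/weight, clips the response at the 99.9th percentile, one-hot encodes categoricals with aligned train/test columns, and standard-scales numerics, keeping both scaled and unscaled views. A toy run under the removed 0.1.x layout; the data and column names below are illustrative:

```python
# Toy end-to-end run of the removed DatasetPreprocessor; assumes the 0.1.x
# modules are importable. All data here is made up for illustration.
import pandas as pd

from ins_pricing_gemini.modelling.config import BayesOptConfig
from ins_pricing_gemini.modelling.config_preprocess import DatasetPreprocessor

train = pd.DataFrame({
    "loss":     [0.0, 120.0, 35.0, 900.0],
    "exposure": [1.0, 0.5, 1.0, 0.8],
    "age":      [30, 45, 52, 28],
    "region":   ["N", "S", "N", "E"],
})
test = train.drop(columns=["loss"])  # run() fills the missing response with NaN

cfg = BayesOptConfig.from_legacy_dict({
    "resp_nme": "loss",
    "weight_nme": "exposure",
    "factor_nmes": ["age", "region"],
    "cate_list": ["region"],
})

prep = DatasetPreprocessor(train, test, cfg).run()
print(prep.var_nmes)            # e.g. ['age', 'region_N', 'region_S'] after drop_first
print(prep.train_oht_scl_data)  # scaled one-hot view used by the NN trainers
```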