ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. ins_pricing/README.md +9 -6
  2. ins_pricing/__init__.py +3 -11
  3. ins_pricing/cli/BayesOpt_entry.py +24 -0
  4. ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
  5. ins_pricing/cli/Explain_Run.py +25 -0
  6. ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
  7. ins_pricing/cli/Pricing_Run.py +25 -0
  8. ins_pricing/cli/__init__.py +1 -0
  9. ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
  10. ins_pricing/cli/utils/__init__.py +1 -0
  11. ins_pricing/cli/utils/cli_common.py +320 -0
  12. ins_pricing/cli/utils/cli_config.py +375 -0
  13. ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
  14. {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
  15. ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
  16. ins_pricing/docs/modelling/README.md +34 -0
  17. ins_pricing/modelling/__init__.py +57 -6
  18. ins_pricing/modelling/core/__init__.py +1 -0
  19. ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
  20. ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
  21. ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
  22. ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
  23. ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
  24. ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
  25. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
  26. ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
  27. ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
  28. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
  29. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
  30. ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
  31. ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
  32. ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
  33. ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
  34. ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
  35. ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
  36. ins_pricing/modelling/core/evaluation.py +115 -0
  37. ins_pricing/production/__init__.py +4 -0
  38. ins_pricing/production/preprocess.py +71 -0
  39. ins_pricing/setup.py +10 -5
  40. {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
  41. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
  42. ins_pricing-0.2.0.dist-info/RECORD +125 -0
  43. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
  44. ins_pricing/modelling/BayesOpt_entry.py +0 -633
  45. ins_pricing/modelling/Explain_Run.py +0 -36
  46. ins_pricing/modelling/Pricing_Run.py +0 -36
  47. ins_pricing/modelling/README.md +0 -33
  48. ins_pricing/modelling/bayesopt/models.py +0 -2196
  49. ins_pricing/modelling/bayesopt/trainers.py +0 -2446
  50. ins_pricing/modelling/cli_common.py +0 -136
  51. ins_pricing/modelling/tests/test_plotting.py +0 -63
  52. ins_pricing/modelling/watchdog_run.py +0 -211
  53. ins_pricing-0.1.11.dist-info/RECORD +0 -169
  54. ins_pricing_gemini/__init__.py +0 -23
  55. ins_pricing_gemini/governance/__init__.py +0 -20
  56. ins_pricing_gemini/governance/approval.py +0 -93
  57. ins_pricing_gemini/governance/audit.py +0 -37
  58. ins_pricing_gemini/governance/registry.py +0 -99
  59. ins_pricing_gemini/governance/release.py +0 -159
  60. ins_pricing_gemini/modelling/Explain_Run.py +0 -36
  61. ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
  62. ins_pricing_gemini/modelling/__init__.py +0 -151
  63. ins_pricing_gemini/modelling/cli_common.py +0 -141
  64. ins_pricing_gemini/modelling/config.py +0 -249
  65. ins_pricing_gemini/modelling/config_preprocess.py +0 -254
  66. ins_pricing_gemini/modelling/core.py +0 -741
  67. ins_pricing_gemini/modelling/data_container.py +0 -42
  68. ins_pricing_gemini/modelling/explain/__init__.py +0 -55
  69. ins_pricing_gemini/modelling/explain/gradients.py +0 -334
  70. ins_pricing_gemini/modelling/explain/metrics.py +0 -176
  71. ins_pricing_gemini/modelling/explain/permutation.py +0 -155
  72. ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
  73. ins_pricing_gemini/modelling/features.py +0 -215
  74. ins_pricing_gemini/modelling/model_manager.py +0 -148
  75. ins_pricing_gemini/modelling/model_plotting.py +0 -463
  76. ins_pricing_gemini/modelling/models.py +0 -2203
  77. ins_pricing_gemini/modelling/notebook_utils.py +0 -294
  78. ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
  79. ins_pricing_gemini/modelling/plotting/common.py +0 -63
  80. ins_pricing_gemini/modelling/plotting/curves.py +0 -572
  81. ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
  82. ins_pricing_gemini/modelling/plotting/geo.py +0 -362
  83. ins_pricing_gemini/modelling/plotting/importance.py +0 -121
  84. ins_pricing_gemini/modelling/run_logging.py +0 -133
  85. ins_pricing_gemini/modelling/tests/conftest.py +0 -8
  86. ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
  87. ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
  88. ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
  89. ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
  90. ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
  91. ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
  92. ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
  93. ins_pricing_gemini/modelling/trainers.py +0 -2447
  94. ins_pricing_gemini/modelling/utils.py +0 -1020
  95. ins_pricing_gemini/pricing/__init__.py +0 -27
  96. ins_pricing_gemini/pricing/calibration.py +0 -39
  97. ins_pricing_gemini/pricing/data_quality.py +0 -117
  98. ins_pricing_gemini/pricing/exposure.py +0 -85
  99. ins_pricing_gemini/pricing/factors.py +0 -91
  100. ins_pricing_gemini/pricing/monitoring.py +0 -99
  101. ins_pricing_gemini/pricing/rate_table.py +0 -78
  102. ins_pricing_gemini/production/__init__.py +0 -21
  103. ins_pricing_gemini/production/drift.py +0 -30
  104. ins_pricing_gemini/production/monitoring.py +0 -143
  105. ins_pricing_gemini/production/scoring.py +0 -40
  106. ins_pricing_gemini/reporting/__init__.py +0 -11
  107. ins_pricing_gemini/reporting/report_builder.py +0 -72
  108. ins_pricing_gemini/reporting/scheduler.py +0 -45
  109. ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
  110. ins_pricing_gemini/scripts/Explain_entry.py +0 -545
  111. ins_pricing_gemini/scripts/__init__.py +0 -1
  112. ins_pricing_gemini/scripts/train.py +0 -568
  113. ins_pricing_gemini/setup.py +0 -55
  114. ins_pricing_gemini/smoke_test.py +0 -28
  115. /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
  116. /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
  117. /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
  118. /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
  119. /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
  120. /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
  121. /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
  122. /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
  123. /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
  124. /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
  125. /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
  126. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
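Note: the bulk of this release is a relayout rather than new behavior: CLI entry points move from `ins_pricing/modelling/` to `ins_pricing/cli/`, the BayesOpt core moves under `ins_pricing/modelling/core/`, tests move to `ins_pricing/tests/`, and the parallel `ins_pricing_gemini` tree is removed. Assuming module import paths mirror the file moves above (the wheel's re-exports are not shown in this diff), downstream code can bridge both layouts with a guarded import, as in this hypothetical sketch:

```python
# Hypothetical compatibility shim for the 0.1.11 -> 0.2.0 relayout.
# Paths are inferred from the file moves listed above; verify against
# the installed wheel before relying on them.
try:
    # 0.2.0 layout: CLI helpers now live under ins_pricing.cli.utils
    from ins_pricing.cli.utils.cli_common import resolve_config_path
except ImportError:
    # 0.1.x layout
    from ins_pricing.modelling.cli_common import resolve_config_path
```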
ins_pricing_gemini/modelling/cli_common.py (deleted, 141 lines)
@@ -1,141 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
-try:
-    from .config import BayesOptConfig
-except ImportError:
-    from ins_pricing.modelling.config import BayesOptConfig
-
-
-PLOT_MODEL_LABELS: Dict[str, Tuple[str, str]] = {
-    "glm": ("GLM", "pred_glm"),
-    "xgb": ("Xgboost", "pred_xgb"),
-    "resn": ("ResNet", "pred_resn"),
-    "ft": ("FTTransformer", "pred_ft"),
-    "gnn": ("GNN", "pred_gnn"),
-}
-
-PYTORCH_TRAINERS = {"resn", "ft", "gnn"}
-
-
-def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
-    seen = set()
-    unique: List[str] = []
-    for item in items:
-        if item not in seen:
-            unique.append(item)
-            seen.add(item)
-    return unique
-
-
-def build_model_names(prefixes: Sequence[str], suffixes: Sequence[str]) -> List[str]:
-    names: List[str] = []
-    for suffix in suffixes:
-        names.extend(f"{prefix}_{suffix}" for prefix in prefixes)
-    return names
-
-
-def parse_model_pairs(raw_pairs: List) -> List[Tuple[str, str]]:
-    pairs: List[Tuple[str, str]] = []
-    for pair in raw_pairs:
-        if isinstance(pair, (list, tuple)) and len(pair) == 2:
-            pairs.append((str(pair[0]), str(pair[1])))
-        elif isinstance(pair, str):
-            parts = [p.strip() for p in pair.split(",") if p.strip()]
-            if len(parts) == 2:
-                pairs.append((parts[0], parts[1]))
-    return pairs
-
-
-def resolve_path(value: Optional[str], base_dir: Path) -> Optional[Path]:
-    if value is None:
-        return None
-    if not isinstance(value, str) or not value.strip():
-        return None
-    p = Path(value)
-    if p.is_absolute():
-        return p
-    return (base_dir / p).resolve()
-
-
-def resolve_config_path(raw: str, script_dir: Path) -> Path:
-    candidate = Path(raw)
-    if candidate.exists():
-        return candidate.resolve()
-    candidate2 = (script_dir / raw)
-    if candidate2.exists():
-        return candidate2.resolve()
-    raise FileNotFoundError(
-        f"Config file not found: {raw}. Tried: {Path(raw).resolve()} and {candidate2.resolve()}"
-    )
-
-
-def load_config_json(path: Path, required_keys: Sequence[str]) -> BayesOptConfig:
-    cfg_dict = json.loads(path.read_text(encoding="utf-8"))
-    missing = [key for key in required_keys if key not in cfg_dict]
-    if missing:
-        raise ValueError(f"Missing required keys in {path}: {missing}")
-    return BayesOptConfig.from_legacy_dict(cfg_dict)
-
-
-def set_env(env_overrides: Dict[str, Any]) -> None:
-    """Apply environment variables from config.json.
-
-    Notes (DDP/Optuna hang debugging):
-    - You can add these keys into config.json's `env` to debug distributed hangs:
-      - `TORCH_DISTRIBUTED_DEBUG=DETAIL`
-      - `NCCL_DEBUG=INFO`
-      - `BAYESOPT_DDP_BARRIER_DEBUG=1`
-      - `BAYESOPT_DDP_BARRIER_TIMEOUT=300`
-      - `BAYESOPT_CUDA_SYNC=1` (optional; can slow down)
-      - `BAYESOPT_CUDA_IPC_COLLECT=1` (optional; can slow down)
-    - This function uses `os.environ.setdefault`, so a value already set in the
-      shell will take precedence over config.json.
-    """
-    for key, value in (env_overrides or {}).items():
-        os.environ.setdefault(str(key), str(value))
-
-
-def _looks_like_url(value: str) -> bool:
-    value = str(value)
-    return "://" in value
-
-
-def normalize_config_paths(cfg: Dict[str, Any], config_path: Path) -> Dict[str, Any]:
-    """Resolve relative paths against the config.json directory.
-
-    Fields handled:
-    - data_dir / output_dir / optuna_storage / gnn_graph_cache
-    - best_params_files (dict: model_key -> path)
-    """
-    base_dir = config_path.parent
-    out = dict(cfg)
-
-    for key in ("data_dir", "output_dir", "gnn_graph_cache"):
-        if key in out and isinstance(out.get(key), str):
-            resolved = resolve_path(out.get(key), base_dir)
-            if resolved is not None:
-                out[key] = str(resolved)
-
-    storage = out.get("optuna_storage")
-    if isinstance(storage, str) and storage.strip():
-        if not _looks_like_url(storage):
-            resolved = resolve_path(storage, base_dir)
-            if resolved is not None:
-                out["optuna_storage"] = str(resolved)
-
-    best_files = out.get("best_params_files")
-    if isinstance(best_files, dict):
-        resolved_map: Dict[str, str] = {}
-        for mk, path_str in best_files.items():
-            if not isinstance(path_str, str):
-                continue
-            resolved = resolve_path(path_str, base_dir)
-            resolved_map[str(mk)] = str(resolved) if resolved is not None else str(path_str)
-        out["best_params_files"] = resolved_map
-
-    return out
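Note: the path-resolution rule the deleted helpers above implement (relative entries in config.json resolve against the config file's directory; URL-like values and absolute paths pass through untouched) can be sketched standalone. This is an illustration using only the standard library, not an import from the package; the file locations are hypothetical:

```python
# Standalone sketch of the rule implemented by resolve_path() /
# normalize_config_paths() above; not the package's own API.
from pathlib import Path

def resolve_like_cli_common(value: str, config_path: Path) -> str:
    if "://" in value:        # URL-like (e.g. an Optuna storage URL): leave untouched
        return value
    p = Path(value)
    if p.is_absolute():       # absolute paths pass through
        return str(p)
    # relative paths anchor at the directory containing config.json
    return str((config_path.parent / p).resolve())

cfg = Path("/proj/run1/config.json")                        # hypothetical location
print(resolve_like_cli_common("output", cfg))               # -> /proj/run1/output
print(resolve_like_cli_common("sqlite:///optuna.db", cfg))  # -> unchanged
```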
ins_pricing_gemini/modelling/config.py (deleted, 249 lines)
@@ -1,249 +0,0 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional
-from pydantic import BaseModel, Field, validator
-
-class DataConfig(BaseModel):
-    resp_nme: str
-    weight_nme: str
-    factor_nmes: List[str]
-    cate_list: Optional[List[str]] = None
-    binary_resp_nme: Optional[str] = None
-    task_type: str = 'regression'
-    prop_test: float = 0.25
-    rand_seed: Optional[int] = None
-
-class DistributedConfig(BaseModel):
-    use_gpu: bool = True
-    use_resn_data_parallel: bool = False
-    use_ft_data_parallel: bool = False
-    use_gnn_data_parallel: bool = False
-    use_resn_ddp: bool = False
-    use_ft_ddp: bool = False
-    use_gnn_ddp: bool = False
-    # DDP Timeout settings can be passed via env, but good to have here if needed
-
-class GNNConfig(BaseModel):
-    use_approx_knn: bool = True
-    approx_knn_threshold: int = 50000
-    graph_cache: Optional[str] = None
-    max_gpu_knn_nodes: Optional[int] = 200000
-    knn_gpu_mem_ratio: float = 0.9
-    knn_gpu_mem_overhead: float = 2.0
-
-class RegionConfig(BaseModel):
-    province_col: Optional[str] = None
-    city_col: Optional[str] = None
-    effect_alpha: float = 50.0
-
-class GeoTokenConfig(BaseModel):
-    feature_nmes: Optional[List[str]] = None
-    hidden_dim: int = 32
-    layers: int = 2
-    dropout: float = 0.1
-    k_neighbors: int = 10
-    learning_rate: float = 1e-3
-    epochs: int = 50
-
-class OptunaConfig(BaseModel):
-    storage: Optional[str] = None
-    study_prefix: Optional[str] = None
-    best_params_files: Optional[Dict[str, str]] = None
-    reuse_best_params: bool = False
-
-class FTConfig(BaseModel):
-    role: str = "model"  # "model", "embedding", "unsupervised_embedding"
-    feature_prefix: str = "ft_emb"
-    num_numeric_tokens: Optional[int] = None
-
-class BayesOptConfig(BaseModel):
-    # Core Data & Task
-    data: DataConfig
-
-    # Model Names & Meta
-    model_nme: str
-
-    # Training Hyperparameters
-    epochs: int = 100
-    xgb_max_depth_max: int = 25
-    xgb_n_estimators_max: int = 500
-    resn_weight_decay: float = 1e-4
-
-    # Sub-component Configs
-    dist: DistributedConfig = Field(default_factory=DistributedConfig)
-    gnn: GNNConfig = Field(default_factory=GNNConfig)
-    region: RegionConfig = Field(default_factory=RegionConfig)
-    geo: GeoTokenConfig = Field(default_factory=GeoTokenConfig)
-    optuna: OptunaConfig = Field(default_factory=OptunaConfig)
-    ft: FTConfig = Field(default_factory=FTConfig)
-
-    # Ensemble & output
-    output_dir: Optional[str] = None
-    final_ensemble: bool = False
-    final_ensemble_k: int = 3
-    final_refit: bool = True
-
-    # Flattened accessors for backward compatibility
-    @property
-    def resp_nme(self): return self.data.resp_nme
-    @property
-    def weight_nme(self): return self.data.weight_nme
-    @property
-    def factor_nmes(self): return self.data.factor_nmes
-    @property
-    def task_type(self): return self.data.task_type
-    @property
-    def cate_list(self): return self.data.cate_list
-    @property
-    def binary_resp_nme(self): return self.data.binary_resp_nme
-    @property
-    def prop_test(self): return self.data.prop_test
-    @property
-    def rand_seed(self): return self.data.rand_seed
-
-    @property
-    def use_gpu(self): return self.dist.use_gpu
-    @property
-    def use_resn_data_parallel(self): return self.dist.use_resn_data_parallel
-    @property
-    def use_ft_data_parallel(self): return self.dist.use_ft_data_parallel
-    @property
-    def use_gnn_data_parallel(self): return self.dist.use_gnn_data_parallel
-    @property
-    def use_resn_ddp(self): return self.dist.use_resn_ddp
-    @property
-    def use_ft_ddp(self): return self.dist.use_ft_ddp
-    @property
-    def use_gnn_ddp(self): return self.dist.use_gnn_ddp
-
-    @property
-    def gnn_use_approx_knn(self): return self.gnn.use_approx_knn
-    @property
-    def gnn_approx_knn_threshold(self): return self.gnn.approx_knn_threshold
-    @property
-    def gnn_graph_cache(self): return self.gnn.graph_cache
-    @property
-    def gnn_max_gpu_knn_nodes(self): return self.gnn.max_gpu_knn_nodes
-    @property
-    def gnn_knn_gpu_mem_ratio(self): return self.gnn.knn_gpu_mem_ratio
-    @property
-    def gnn_knn_gpu_mem_overhead(self): return self.gnn.knn_gpu_mem_overhead
-
-    @property
-    def region_province_col(self): return self.region.province_col
-    @property
-    def region_city_col(self): return self.region.city_col
-    @property
-    def region_effect_alpha(self): return self.region.effect_alpha
-
-    @property
-    def geo_feature_nmes(self): return self.geo.feature_nmes
-    @property
-    def geo_token_hidden_dim(self): return self.geo.hidden_dim
-    @property
-    def geo_token_layers(self): return self.geo.layers
-    @property
-    def geo_token_dropout(self): return self.geo.dropout
-    @property
-    def geo_token_k_neighbors(self): return self.geo.k_neighbors
-    @property
-    def geo_token_learning_rate(self): return self.geo.learning_rate
-    @property
-    def geo_token_epochs(self): return self.geo.epochs
-
-    @property
-    def optuna_storage(self): return self.optuna.storage
-    @property
-    def optuna_study_prefix(self): return self.optuna.study_prefix
-    @property
-    def best_params_files(self): return self.optuna.best_params_files
-    @property
-    def reuse_best_params(self): return self.optuna.reuse_best_params
-
-    @property
-    def ft_role(self): return self.ft.role
-    @property
-    def ft_feature_prefix(self): return self.ft.feature_prefix
-    @property
-    def ft_num_numeric_tokens(self): return self.ft.num_numeric_tokens
-
-    @classmethod
-    def from_legacy_dict(cls, d: Dict[str, Any]) -> 'BayesOptConfig':
-        """Map flat dictionary to nested Pydantic structure."""
-        data = DataConfig(
-            resp_nme=d.get('resp_nme'),
-            weight_nme=d.get('weight_nme'),
-            factor_nmes=d.get('factor_nmes', []),
-            cate_list=d.get('cate_list'),
-            binary_resp_nme=d.get('binary_resp_nme'),
-            task_type=d.get('task_type', 'regression'),
-            prop_test=d.get('prop_test', 0.25),
-            rand_seed=d.get('rand_seed')
-        )
-
-        dist = DistributedConfig(
-            use_gpu=d.get('use_gpu', True),
-            use_resn_data_parallel=d.get('use_resn_data_parallel', False),
-            use_ft_data_parallel=d.get('use_ft_data_parallel', False),
-            use_gnn_data_parallel=d.get('use_gnn_data_parallel', False),
-            use_resn_ddp=d.get('use_resn_ddp', False),
-            use_ft_ddp=d.get('use_ft_ddp', False),
-            use_gnn_ddp=d.get('use_gnn_ddp', False),
-        )
-
-        gnn = GNNConfig(
-            use_approx_knn=d.get('gnn_use_approx_knn', True),
-            approx_knn_threshold=d.get('gnn_approx_knn_threshold', 50000),
-            graph_cache=d.get('gnn_graph_cache'),
-            max_gpu_knn_nodes=d.get('gnn_max_gpu_knn_nodes', 200000),
-            knn_gpu_mem_ratio=d.get('gnn_knn_gpu_mem_ratio', 0.9),
-            knn_gpu_mem_overhead=d.get('gnn_knn_gpu_mem_overhead', 2.0),
-        )
-
-        region = RegionConfig(
-            province_col=d.get('region_province_col'),
-            city_col=d.get('region_city_col'),
-            effect_alpha=d.get('region_effect_alpha', 50.0)
-        )
-
-        geo = GeoTokenConfig(
-            feature_nmes=d.get('geo_feature_nmes'),
-            hidden_dim=d.get('geo_token_hidden_dim', 32),
-            layers=d.get('geo_token_layers', 2),
-            dropout=d.get('geo_token_dropout', 0.1),
-            k_neighbors=d.get('geo_token_k_neighbors', 10),
-            learning_rate=d.get('geo_token_learning_rate', 1e-3),
-            epochs=d.get('geo_token_epochs', 50)
-        )
-
-        optuna = OptunaConfig(
-            storage=d.get('optuna_storage'),
-            study_prefix=d.get('optuna_study_prefix'),
-            best_params_files=d.get('best_params_files'),
-            reuse_best_params=d.get('reuse_best_params', False)
-        )
-
-        ft = FTConfig(
-            role=d.get('ft_role', 'model'),
-            feature_prefix=d.get('ft_feature_prefix', 'ft_emb'),
-            num_numeric_tokens=d.get('ft_num_numeric_tokens')
-        )
-
-        return cls(
-            data=data,
-            model_nme=d.get('model_nme', 'model'),
-            epochs=d.get('epochs', 100),
-            xgb_max_depth_max=d.get('xgb_max_depth_max', 25),
-            xgb_n_estimators_max=d.get('xgb_n_estimators_max', 500),
-            resn_weight_decay=d.get('resn_weight_decay', 1e-4),
-            dist=dist,
-            gnn=gnn,
-            region=region,
-            geo=geo,
-            optuna=optuna,
-            ft=ft,
-            output_dir=d.get('output_dir'),
-            final_ensemble=d.get('final_ensemble', False),
-            final_ensemble_k=d.get('final_ensemble_k', 3),
-            final_refit=d.get('final_refit', True)
-        )
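Note: the `from_legacy_dict` mapping above is what let flat 0.1.x config.json files drive the nested Pydantic model, while the property shims kept old attribute names working. A minimal round-trip sketch, using the 0.1.x import path (this module is removed in 0.2.0) and hypothetical field values:

```python
# Hypothetical round-trip through the legacy mapping shown above.
from ins_pricing.modelling.config import BayesOptConfig  # 0.1.x path

legacy = {
    "resp_nme": "loss_cost",
    "weight_nme": "exposure",
    "factor_nmes": ["age", "region"],
    "cate_list": ["region"],
    "model_nme": "motor_demo",
    "use_gpu": False,
    "optuna_storage": "sqlite:///optuna.db",
}
cfg = BayesOptConfig.from_legacy_dict(legacy)

assert cfg.data.resp_nme == "loss_cost"  # nested access
assert cfg.resp_nme == "loss_cost"       # flattened accessor still works
assert cfg.use_gpu is False              # routed through cfg.dist
assert cfg.optuna_storage == "sqlite:///optuna.db"  # routed through cfg.optuna
```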
ins_pricing_gemini/modelling/config_preprocess.py (deleted, 254 lines)
@@ -1,254 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from dataclasses import dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import numpy as np
-import pandas as pd
-from sklearn.preprocessing import StandardScaler
-
-from .utils import IOUtils
-from .config import BayesOptConfig
-
-
-# NOTE: Some CSV exports may contain invisible BOM characters or leading/trailing
-# spaces in column names. Pandas requires exact matches, so we normalize a few
-# "required" column names (response/weight/binary response) before validating.
-
-
-def _clean_column_name(name: Any) -> Any:
-    if not isinstance(name, str):
-        return name
-    return name.replace("\ufeff", "").strip()
-
-
-def _normalize_required_columns(
-    df: pd.DataFrame, required: List[Optional[str]], *, df_label: str
-) -> None:
-    required_names = [r for r in required if isinstance(r, str) and r.strip()]
-    if not required_names:
-        return
-
-    mapping: Dict[Any, Any] = {}
-    existing = set(df.columns)
-    for col in df.columns:
-        cleaned = _clean_column_name(col)
-        if cleaned != col and cleaned not in existing:
-            mapping[col] = cleaned
-    if mapping:
-        df.rename(columns=mapping, inplace=True)
-
-    existing = set(df.columns)
-    for req in required_names:
-        if req in existing:
-            continue
-        candidates = [
-            col
-            for col in df.columns
-            if isinstance(col, str) and _clean_column_name(col).lower() == req.lower()
-        ]
-        if len(candidates) == 1 and req not in existing:
-            df.rename(columns={candidates[0]: req}, inplace=True)
-            existing = set(df.columns)
-        elif len(candidates) > 1:
-            raise KeyError(
-                f"{df_label} has multiple columns matching required {req!r} "
-                f"(case/space-insensitive): {candidates}"
-            )
-
-
-# ===== Core components and training wrappers =================================
-
-# =============================================================================
-# Config, preprocessing, and trainer base types
-# =============================================================================
-# BayesOptConfig is now imported from .config
-
-
-class OutputManager:
-    # Centralize output paths for plots, results, and models.
-
-    def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
-        self.root = Path(root or os.getcwd())
-        self.model_name = model_name
-        self.plot_dir = self.root / 'plot'
-        self.result_dir = self.root / 'Results'
-        self.model_dir = self.root / 'model'
-
-    def _prepare(self, path: Path) -> str:
-        IOUtils.ensure_parent_dir(str(path))
-        return str(path)
-
-    def plot_path(self, filename: str) -> str:
-        return self._prepare(self.plot_dir / filename)
-
-    def result_path(self, filename: str) -> str:
-        return self._prepare(self.result_dir / filename)
-
-    def model_path(self, filename: str) -> str:
-        return self._prepare(self.model_dir / filename)
-
-
-class VersionManager:
-    """Lightweight versioning: save config and best-params snapshots for traceability."""
-
-    def __init__(self, output: OutputManager) -> None:
-        self.output = output
-        self.version_dir = Path(self.output.result_dir) / "versions"
-        IOUtils.ensure_parent_dir(str(self.version_dir))
-
-    def save(self, tag: str, payload: Dict[str, Any]) -> str:
-        safe_tag = tag.replace(" ", "_")
-        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
-        path = self.version_dir / f"{ts}_{safe_tag}.json"
-        IOUtils.ensure_parent_dir(str(path))
-        with open(path, "w", encoding="utf-8") as f:
-            json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
-        print(f"[Version] Saved snapshot: {path}")
-        return str(path)
-
-    def load_latest(self, tag: str) -> Optional[Dict[str, Any]]:
-        """Load the latest snapshot for a tag (sorted by timestamp prefix)."""
-        safe_tag = tag.replace(" ", "_")
-        pattern = f"*_{safe_tag}.json"
-        candidates = sorted(self.version_dir.glob(pattern))
-        if not candidates:
-            return None
-        path = candidates[-1]
-        try:
-            return json.loads(path.read_text(encoding="utf-8"))
-        except Exception as exc:
-            print(f"[Version] Failed to load snapshot {path}: {exc}")
-            return None
-
-
-class DatasetPreprocessor:
-    # Prepare shared train/test views for trainers.
-
-    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
-                 config: BayesOptConfig) -> None:
-        self.config = config
-        self.train_data = train_df.copy(deep=False)
-        self.test_data = test_df.copy(deep=False)
-        self.num_features: List[str] = []
-        self.train_oht_data: Optional[pd.DataFrame] = None
-        self.test_oht_data: Optional[pd.DataFrame] = None
-        self.train_oht_scl_data: Optional[pd.DataFrame] = None
-        self.test_oht_scl_data: Optional[pd.DataFrame] = None
-        self.var_nmes: List[str] = []
-        self.cat_categories_for_shap: Dict[str, List[Any]] = {}
-
-    def run(self) -> "DatasetPreprocessor":
-        """Run preprocessing: categorical encoding, target clipping, numeric scaling."""
-        cfg = self.config
-        _normalize_required_columns(
-            self.train_data,
-            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
-            df_label="Train data",
-        )
-        _normalize_required_columns(
-            self.test_data,
-            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
-            df_label="Test data",
-        )
-        missing_train = [
-            col for col in (cfg.resp_nme, cfg.weight_nme)
-            if col not in self.train_data.columns
-        ]
-        if missing_train:
-            raise KeyError(
-                f"Train data missing required columns: {missing_train}. "
-                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
-            )
-        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.train_data.columns:
-            raise KeyError(
-                f"Train data missing binary response column: {cfg.binary_resp_nme}. "
-                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
-            )
-
-        test_has_resp = cfg.resp_nme in self.test_data.columns
-        test_has_weight = cfg.weight_nme in self.test_data.columns
-        test_has_binary = bool(
-            cfg.binary_resp_nme and cfg.binary_resp_nme in self.test_data.columns
-        )
-        if not test_has_weight:
-            self.test_data[cfg.weight_nme] = 1.0
-        if not test_has_resp:
-            self.test_data[cfg.resp_nme] = np.nan
-        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.test_data.columns:
-            self.test_data[cfg.binary_resp_nme] = np.nan
-
-        # Precompute weighted actuals for plots and validation checks.
-        self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
-            self.train_data[cfg.weight_nme]
-        if test_has_resp:
-            self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
-                self.test_data[cfg.weight_nme]
-        if cfg.binary_resp_nme:
-            self.train_data.loc[:, 'w_binary_act'] = self.train_data[cfg.binary_resp_nme] * \
-                self.train_data[cfg.weight_nme]
-            if test_has_binary:
-                self.test_data.loc[:, 'w_binary_act'] = self.test_data[cfg.binary_resp_nme] * \
-                    self.test_data[cfg.weight_nme]
-        # High-quantile clipping absorbs outliers; removing it lets extremes dominate loss.
-        q99 = self.train_data[cfg.resp_nme].quantile(0.999)
-        self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
-            upper=q99)
-        cate_list = list(cfg.cate_list or [])
-        if cate_list:
-            for cate in cate_list:
-                self.train_data[cate] = self.train_data[cate].astype(
-                    'category')
-                self.test_data[cate] = self.test_data[cate].astype('category')
-                cats = self.train_data[cate].cat.categories
-                self.cat_categories_for_shap[cate] = list(cats)
-        self.num_features = [
-            nme for nme in cfg.factor_nmes if nme not in cate_list]
-        train_oht = self.train_data[cfg.factor_nmes +
-                                    [cfg.weight_nme] + [cfg.resp_nme]].copy()
-        test_oht = self.test_data[cfg.factor_nmes +
-                                  [cfg.weight_nme] + [cfg.resp_nme]].copy()
-        train_oht = pd.get_dummies(
-            train_oht,
-            columns=cate_list,
-            drop_first=True,
-            dtype=np.int8
-        )
-        test_oht = pd.get_dummies(
-            test_oht,
-            columns=cate_list,
-            drop_first=True,
-            dtype=np.int8
-        )
-
-        # Fill missing dummy columns when reindexing to align train/test columns.
-        test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
-
-        # Keep unscaled one-hot data for fold-specific scaling to avoid leakage.
-        self.train_oht_data = train_oht
-        self.test_oht_data = test_oht
-
-        train_oht_scaled = train_oht.copy(deep=False)
-        test_oht_scaled = test_oht.copy(deep=False)
-        for num_chr in self.num_features:
-            # Scale per column so features are on comparable ranges for NN stability.
-            scaler = StandardScaler()
-            train_oht_scaled[num_chr] = scaler.fit_transform(
-                train_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
-            test_oht_scaled[num_chr] = scaler.transform(
-                test_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
-        # Fill missing dummy columns when reindexing to align train/test columns.
-        test_oht_scaled = test_oht_scaled.reindex(
-            columns=train_oht_scaled.columns, fill_value=0)
-        self.train_oht_scl_data = train_oht_scaled
-        self.test_oht_scl_data = test_oht_scaled
-        excluded = {cfg.weight_nme, cfg.resp_nme}
-        self.var_nmes = [
-            col for col in train_oht_scaled.columns if col not in excluded
-        ]
-        return self
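Note: taken together, the deleted preprocessing path was: normalize column names, validate required columns, derive weighted actuals, clip the response at the 99.9th percentile, one-hot encode categoricals with `drop_first`, standardize numerics, and reindex test columns to match train. A minimal driver sketch, with invented column names and the 0.1.x import paths (both modules are removed in 0.2.0):

```python
# Hypothetical driver for the DatasetPreprocessor shown above.
import pandas as pd
from ins_pricing.modelling.config import BayesOptConfig            # 0.1.x path
from ins_pricing.modelling.config_preprocess import DatasetPreprocessor  # 0.1.x path

train = pd.DataFrame({
    "age": [25, 40, 33, 58],
    "region": ["N", "S", "N", "E"],
    "exposure": [1.0, 0.5, 1.0, 0.8],
    "loss_cost": [0.0, 120.0, 35.0, 10.0],
})
test = train.drop(columns=["loss_cost"]).head(2)  # response may be absent in test

cfg = BayesOptConfig.from_legacy_dict({
    "resp_nme": "loss_cost",
    "weight_nme": "exposure",
    "factor_nmes": ["age", "region"],
    "cate_list": ["region"],
    "model_nme": "demo",
})
prep = DatasetPreprocessor(train, test, cfg).run()
print(prep.var_nmes)                 # scaled numeric + one-hot dummy columns
print(prep.test_oht_scl_data.shape)  # aligned to the train columns via reindex
```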