ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +9 -6
- ins_pricing/__init__.py +3 -11
- ins_pricing/cli/BayesOpt_entry.py +24 -0
- ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
- ins_pricing/cli/Explain_Run.py +25 -0
- ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
- ins_pricing/cli/Pricing_Run.py +25 -0
- ins_pricing/cli/__init__.py +1 -0
- ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
- ins_pricing/cli/utils/__init__.py +1 -0
- ins_pricing/cli/utils/cli_common.py +320 -0
- ins_pricing/cli/utils/cli_config.py +375 -0
- ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
- {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
- ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
- ins_pricing/docs/modelling/README.md +34 -0
- ins_pricing/modelling/__init__.py +57 -6
- ins_pricing/modelling/core/__init__.py +1 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
- ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
- ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
- ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
- ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
- ins_pricing/modelling/core/evaluation.py +115 -0
- ins_pricing/production/__init__.py +4 -0
- ins_pricing/production/preprocess.py +71 -0
- ins_pricing/setup.py +10 -5
- {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
- ins_pricing-0.2.0.dist-info/RECORD +125 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
- ins_pricing/modelling/BayesOpt_entry.py +0 -633
- ins_pricing/modelling/Explain_Run.py +0 -36
- ins_pricing/modelling/Pricing_Run.py +0 -36
- ins_pricing/modelling/README.md +0 -33
- ins_pricing/modelling/bayesopt/models.py +0 -2196
- ins_pricing/modelling/bayesopt/trainers.py +0 -2446
- ins_pricing/modelling/cli_common.py +0 -136
- ins_pricing/modelling/tests/test_plotting.py +0 -63
- ins_pricing/modelling/watchdog_run.py +0 -211
- ins_pricing-0.1.11.dist-info/RECORD +0 -169
- ins_pricing_gemini/__init__.py +0 -23
- ins_pricing_gemini/governance/__init__.py +0 -20
- ins_pricing_gemini/governance/approval.py +0 -93
- ins_pricing_gemini/governance/audit.py +0 -37
- ins_pricing_gemini/governance/registry.py +0 -99
- ins_pricing_gemini/governance/release.py +0 -159
- ins_pricing_gemini/modelling/Explain_Run.py +0 -36
- ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
- ins_pricing_gemini/modelling/__init__.py +0 -151
- ins_pricing_gemini/modelling/cli_common.py +0 -141
- ins_pricing_gemini/modelling/config.py +0 -249
- ins_pricing_gemini/modelling/config_preprocess.py +0 -254
- ins_pricing_gemini/modelling/core.py +0 -741
- ins_pricing_gemini/modelling/data_container.py +0 -42
- ins_pricing_gemini/modelling/explain/__init__.py +0 -55
- ins_pricing_gemini/modelling/explain/gradients.py +0 -334
- ins_pricing_gemini/modelling/explain/metrics.py +0 -176
- ins_pricing_gemini/modelling/explain/permutation.py +0 -155
- ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
- ins_pricing_gemini/modelling/features.py +0 -215
- ins_pricing_gemini/modelling/model_manager.py +0 -148
- ins_pricing_gemini/modelling/model_plotting.py +0 -463
- ins_pricing_gemini/modelling/models.py +0 -2203
- ins_pricing_gemini/modelling/notebook_utils.py +0 -294
- ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
- ins_pricing_gemini/modelling/plotting/common.py +0 -63
- ins_pricing_gemini/modelling/plotting/curves.py +0 -572
- ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
- ins_pricing_gemini/modelling/plotting/geo.py +0 -362
- ins_pricing_gemini/modelling/plotting/importance.py +0 -121
- ins_pricing_gemini/modelling/run_logging.py +0 -133
- ins_pricing_gemini/modelling/tests/conftest.py +0 -8
- ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
- ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
- ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
- ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
- ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
- ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
- ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
- ins_pricing_gemini/modelling/trainers.py +0 -2447
- ins_pricing_gemini/modelling/utils.py +0 -1020
- ins_pricing_gemini/pricing/__init__.py +0 -27
- ins_pricing_gemini/pricing/calibration.py +0 -39
- ins_pricing_gemini/pricing/data_quality.py +0 -117
- ins_pricing_gemini/pricing/exposure.py +0 -85
- ins_pricing_gemini/pricing/factors.py +0 -91
- ins_pricing_gemini/pricing/monitoring.py +0 -99
- ins_pricing_gemini/pricing/rate_table.py +0 -78
- ins_pricing_gemini/production/__init__.py +0 -21
- ins_pricing_gemini/production/drift.py +0 -30
- ins_pricing_gemini/production/monitoring.py +0 -143
- ins_pricing_gemini/production/scoring.py +0 -40
- ins_pricing_gemini/reporting/__init__.py +0 -11
- ins_pricing_gemini/reporting/report_builder.py +0 -72
- ins_pricing_gemini/reporting/scheduler.py +0 -45
- ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
- ins_pricing_gemini/scripts/Explain_entry.py +0 -545
- ins_pricing_gemini/scripts/__init__.py +0 -1
- ins_pricing_gemini/scripts/train.py +0 -568
- ins_pricing_gemini/setup.py +0 -55
- ins_pricing_gemini/smoke_test.py +0 -28
- /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
- /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
- /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
ins_pricing/README.md
CHANGED
|
@@ -7,13 +7,13 @@ between modelling, production, governance, and reporting.
|
|
|
7
7
|
|
|
8
8
|
## Architecture
|
|
9
9
|
|
|
10
|
+
- `cli/`: CLI entry points + shared utilities.
|
|
10
11
|
- `modelling/`
|
|
11
|
-
- `
|
|
12
|
+
- `core/`: BayesOpt training core (GLM / XGB / ResNet / FT / GNN).
|
|
12
13
|
- `plotting/`: model-agnostic curves and geo visualizations.
|
|
13
14
|
- `explain/`: permutation, gradients, and SHAP helpers.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
- `Pricing_Run.py`: lightweight pricing orchestration.
|
|
15
|
+
- `docs/modelling/`: modelling documentation.
|
|
16
|
+
- `examples/modelling/`: demo configs + notebooks (repo only; not packaged).
|
|
17
17
|
- `pricing/`: factor tables, calibration, exposure, monitoring.
|
|
18
18
|
- `production/`: scoring, metrics, drift/PSI.
|
|
19
19
|
- `governance/`: registry, release, audit, approval workflow.
|
|
@@ -23,7 +23,7 @@ between modelling, production, governance, and reporting.
|
|
|
23
23
|
|
|
24
24
|
1. Model training
|
|
25
25
|
- Python API: `from ins_pricing.modelling import BayesOptModel`
|
|
26
|
-
- CLI: `python ins_pricing/
|
|
26
|
+
- CLI: `python ins_pricing/cli/BayesOpt_entry.py --config-json ...`
|
|
27
27
|
2. Evaluation & visualization
|
|
28
28
|
- Curves: `from ins_pricing.plotting import curves`
|
|
29
29
|
- Importance: `from ins_pricing.plotting import importance`
|
|
@@ -42,7 +42,10 @@ between modelling, production, governance, and reporting.
|
|
|
42
42
|
|
|
43
43
|
- `ins_pricing` exposes lightweight lazy imports so that `pricing/production/governance`
|
|
44
44
|
can be used without installing heavy ML dependencies.
|
|
45
|
-
-
|
|
45
|
+
- Migration note: CLI entry points now live under `ins_pricing/cli/` and demo assets are under
|
|
46
|
+
`ins_pricing/examples/modelling/`. Update any scripts that referenced `ins_pricing/modelling/cli/*` or
|
|
47
|
+
`ins_pricing/modelling/examples/*`.
|
|
48
|
+
- Demo notebooks/configs live in the repo under `ins_pricing/examples/modelling/` and are not shipped in the PyPI package.
|
|
46
49
|
- Heavy dependencies are only required when you import or use the related modules:
|
|
47
50
|
- BayesOpt: `torch`, `optuna`, `xgboost`, etc.
|
|
48
51
|
- Explain: `torch` (gradients), `shap` (SHAP).
|
ins_pricing/__init__.py
CHANGED
|
@@ -22,22 +22,14 @@ _MODELLING_EXPORTS = {
|
|
|
22
22
|
}
|
|
23
23
|
|
|
24
24
|
_LAZY_SUBMODULES = {
|
|
25
|
-
"bayesopt": "ins_pricing.modelling.bayesopt",
|
|
25
|
+
"bayesopt": "ins_pricing.modelling.core.bayesopt",
|
|
26
26
|
"plotting": "ins_pricing.modelling.plotting",
|
|
27
27
|
"explain": "ins_pricing.modelling.explain",
|
|
28
|
-
"BayesOpt": "ins_pricing.modelling.BayesOpt",
|
|
29
|
-
"BayesOpt_entry": "ins_pricing.modelling.BayesOpt_entry",
|
|
30
|
-
"BayesOpt_incremental": "ins_pricing.modelling.BayesOpt_incremental",
|
|
31
|
-
"Explain_entry": "ins_pricing.modelling.Explain_entry",
|
|
32
|
-
"Explain_Run": "ins_pricing.modelling.Explain_Run",
|
|
33
|
-
"Pricing_Run": "ins_pricing.modelling.Pricing_Run",
|
|
34
|
-
"cli_common": "ins_pricing.modelling.cli_common",
|
|
35
|
-
"notebook_utils": "ins_pricing.modelling.notebook_utils",
|
|
36
|
-
"watchdog_run": "ins_pricing.modelling.watchdog_run",
|
|
28
|
+
"BayesOpt": "ins_pricing.modelling.core.BayesOpt",
|
|
37
29
|
}
|
|
38
30
|
|
|
39
31
|
_PACKAGE_PATHS = {
|
|
40
|
-
"bayesopt": Path(__file__).resolve().parent / "modelling" / "bayesopt",
|
|
32
|
+
"bayesopt": Path(__file__).resolve().parent / "modelling" / "core" / "bayesopt",
|
|
41
33
|
"plotting": Path(__file__).resolve().parent / "modelling" / "plotting",
|
|
42
34
|
"explain": Path(__file__).resolve().parent / "modelling" / "explain",
|
|
43
35
|
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Thin wrapper for the BayesOpt CLI entry point.
|
|
2
|
+
|
|
3
|
+
The main implementation lives in bayesopt_entry_runner.py.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
if __package__ in {None, ""}:
|
|
12
|
+
repo_root = Path(__file__).resolve().parents[2]
|
|
13
|
+
if str(repo_root) not in sys.path:
|
|
14
|
+
sys.path.insert(0, str(repo_root))
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from .bayesopt_entry_runner import main
|
|
18
|
+
except Exception: # pragma: no cover
|
|
19
|
+
from ins_pricing.cli.bayesopt_entry_runner import main
|
|
20
|
+
|
|
21
|
+
__all__ = ["main"]
|
|
22
|
+
|
|
23
|
+
if __name__ == "__main__":
|
|
24
|
+
main()
|
|
@@ -8,8 +8,8 @@ of per-model incremental CSVs or a single incremental file when updating
|
|
|
8
8
|
one dataset.
|
|
9
9
|
|
|
10
10
|
Example:
|
|
11
|
-
python ins_pricing/
|
|
12
|
-
--config-json ins_pricing/modelling/
|
|
11
|
+
python ins_pricing/cli/BayesOpt_incremental.py \
|
|
12
|
+
--config-json ins_pricing/examples/modelling/config_incremental_template.json \
|
|
13
13
|
--incremental-dir ./incremental_batches \
|
|
14
14
|
--merge-keys policy_id vehicle_id \
|
|
15
15
|
--model-keys glm xgb resn --plot-curves
|
|
@@ -17,80 +17,114 @@ Example:
|
|
|
17
17
|
|
|
18
18
|
from __future__ import annotations
|
|
19
19
|
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
import sys
|
|
22
|
+
|
|
23
|
+
if __package__ in {None, ""}:
|
|
24
|
+
repo_root = Path(__file__).resolve().parents[2]
|
|
25
|
+
if str(repo_root) not in sys.path:
|
|
26
|
+
sys.path.insert(0, str(repo_root))
|
|
27
|
+
|
|
20
28
|
import argparse
|
|
21
29
|
import json
|
|
22
30
|
from dataclasses import asdict
|
|
23
31
|
from datetime import datetime
|
|
24
|
-
from pathlib import Path
|
|
25
32
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
26
33
|
|
|
27
34
|
import pandas as pd
|
|
28
|
-
from sklearn.model_selection import train_test_split
|
|
29
35
|
|
|
30
36
|
try:
|
|
31
|
-
from
|
|
32
|
-
from .cli_common import ( # type: ignore
|
|
37
|
+
from .. import bayesopt as ropt # type: ignore
|
|
38
|
+
from .utils.cli_common import ( # type: ignore
|
|
33
39
|
PLOT_MODEL_LABELS,
|
|
34
40
|
PYTORCH_TRAINERS,
|
|
35
41
|
build_model_names,
|
|
36
42
|
dedupe_preserve_order,
|
|
37
|
-
|
|
38
|
-
normalize_config_paths,
|
|
43
|
+
load_dataset,
|
|
39
44
|
parse_model_pairs,
|
|
40
|
-
|
|
45
|
+
resolve_data_path,
|
|
41
46
|
resolve_path,
|
|
42
|
-
|
|
47
|
+
split_train_test,
|
|
48
|
+
)
|
|
49
|
+
from .utils.cli_config import ( # type: ignore
|
|
50
|
+
add_config_json_arg,
|
|
51
|
+
resolve_and_load_config,
|
|
52
|
+
resolve_data_config,
|
|
53
|
+
resolve_split_config,
|
|
54
|
+
resolve_runtime_config,
|
|
55
|
+
resolve_output_dirs,
|
|
43
56
|
)
|
|
44
57
|
except Exception: # pragma: no cover
|
|
45
58
|
try:
|
|
46
59
|
import bayesopt as ropt # type: ignore
|
|
47
|
-
from cli_common import ( # type: ignore
|
|
60
|
+
from utils.cli_common import ( # type: ignore
|
|
48
61
|
PLOT_MODEL_LABELS,
|
|
49
62
|
PYTORCH_TRAINERS,
|
|
50
63
|
build_model_names,
|
|
51
64
|
dedupe_preserve_order,
|
|
52
|
-
|
|
53
|
-
normalize_config_paths,
|
|
65
|
+
load_dataset,
|
|
54
66
|
parse_model_pairs,
|
|
55
|
-
|
|
67
|
+
resolve_data_path,
|
|
56
68
|
resolve_path,
|
|
57
|
-
|
|
69
|
+
split_train_test,
|
|
70
|
+
)
|
|
71
|
+
from utils.cli_config import ( # type: ignore
|
|
72
|
+
add_config_json_arg,
|
|
73
|
+
resolve_and_load_config,
|
|
74
|
+
resolve_data_config,
|
|
75
|
+
resolve_split_config,
|
|
76
|
+
resolve_runtime_config,
|
|
77
|
+
resolve_output_dirs,
|
|
58
78
|
)
|
|
59
79
|
except Exception:
|
|
60
80
|
try:
|
|
61
|
-
import ins_pricing.bayesopt as ropt # type: ignore
|
|
62
|
-
from ins_pricing.cli_common import ( # type: ignore
|
|
81
|
+
import ins_pricing.modelling.core.bayesopt as ropt # type: ignore
|
|
82
|
+
from ins_pricing.cli.utils.cli_common import ( # type: ignore
|
|
63
83
|
PLOT_MODEL_LABELS,
|
|
64
84
|
PYTORCH_TRAINERS,
|
|
65
85
|
build_model_names,
|
|
66
86
|
dedupe_preserve_order,
|
|
67
|
-
|
|
68
|
-
normalize_config_paths,
|
|
87
|
+
load_dataset,
|
|
69
88
|
parse_model_pairs,
|
|
70
|
-
|
|
89
|
+
resolve_data_path,
|
|
71
90
|
resolve_path,
|
|
72
|
-
|
|
91
|
+
split_train_test,
|
|
92
|
+
)
|
|
93
|
+
from ins_pricing.cli.utils.cli_config import ( # type: ignore
|
|
94
|
+
add_config_json_arg,
|
|
95
|
+
resolve_and_load_config,
|
|
96
|
+
resolve_data_config,
|
|
97
|
+
resolve_split_config,
|
|
98
|
+
resolve_runtime_config,
|
|
99
|
+
resolve_output_dirs,
|
|
73
100
|
)
|
|
74
101
|
except Exception:
|
|
75
102
|
import BayesOpt as ropt # type: ignore
|
|
76
|
-
from cli_common import ( # type: ignore
|
|
103
|
+
from utils.cli_common import ( # type: ignore
|
|
77
104
|
PLOT_MODEL_LABELS,
|
|
78
105
|
PYTORCH_TRAINERS,
|
|
79
106
|
build_model_names,
|
|
80
107
|
dedupe_preserve_order,
|
|
81
|
-
|
|
82
|
-
normalize_config_paths,
|
|
108
|
+
load_dataset,
|
|
83
109
|
parse_model_pairs,
|
|
84
|
-
|
|
110
|
+
resolve_data_path,
|
|
85
111
|
resolve_path,
|
|
86
|
-
|
|
112
|
+
split_train_test,
|
|
113
|
+
)
|
|
114
|
+
from utils.cli_config import ( # type: ignore
|
|
115
|
+
add_config_json_arg,
|
|
116
|
+
resolve_and_load_config,
|
|
117
|
+
resolve_data_config,
|
|
118
|
+
resolve_split_config,
|
|
119
|
+
resolve_runtime_config,
|
|
120
|
+
resolve_output_dirs,
|
|
87
121
|
)
|
|
88
122
|
|
|
89
123
|
try:
|
|
90
|
-
from .run_logging import configure_run_logging # type: ignore
|
|
124
|
+
from .utils.run_logging import configure_run_logging # type: ignore
|
|
91
125
|
except Exception: # pragma: no cover
|
|
92
126
|
try:
|
|
93
|
-
from run_logging import configure_run_logging # type: ignore
|
|
127
|
+
from utils.run_logging import configure_run_logging # type: ignore
|
|
94
128
|
except Exception: # pragma: no cover
|
|
95
129
|
configure_run_logging = None # type: ignore
|
|
96
130
|
|
|
@@ -103,10 +137,9 @@ def _parse_args() -> argparse.Namespace:
|
|
|
103
137
|
parser = argparse.ArgumentParser(
|
|
104
138
|
description="Incrementally retrain BayesOpt models using new batches of data."
|
|
105
139
|
)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
help="Path to the JSON config that BayesOpt_entry.py uses."
|
|
140
|
+
add_config_json_arg(
|
|
141
|
+
parser,
|
|
142
|
+
help_text="Path to the JSON config that cli/BayesOpt_entry.py uses.",
|
|
110
143
|
)
|
|
111
144
|
parser.add_argument(
|
|
112
145
|
"--model-names",
|
|
@@ -358,6 +391,15 @@ def _coerce_scalar(value: Any) -> Any:
|
|
|
358
391
|
return value
|
|
359
392
|
|
|
360
393
|
|
|
394
|
+
def _infer_format_from_path(path: Path) -> str:
|
|
395
|
+
suffix = path.suffix.lower()
|
|
396
|
+
if suffix in {".parquet", ".pq"}:
|
|
397
|
+
return "parquet"
|
|
398
|
+
if suffix in {".feather", ".ft"}:
|
|
399
|
+
return "feather"
|
|
400
|
+
return "csv"
|
|
401
|
+
|
|
402
|
+
|
|
361
403
|
def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
|
|
362
404
|
label = trainer.label.lower()
|
|
363
405
|
result_dir = Path(model.output_manager.result_dir)
|
|
@@ -393,10 +435,10 @@ def _to_serializable(obj: Any) -> Any:
|
|
|
393
435
|
class IncrementalUpdateRunner:
|
|
394
436
|
def __init__(self, args: argparse.Namespace) -> None:
|
|
395
437
|
self.args = args
|
|
396
|
-
script_dir = Path(__file__).resolve().
|
|
397
|
-
self.config_path =
|
|
398
|
-
|
|
399
|
-
|
|
438
|
+
script_dir = Path(__file__).resolve().parents[1]
|
|
439
|
+
self.config_path, self.cfg = resolve_and_load_config(
|
|
440
|
+
args.config_json,
|
|
441
|
+
script_dir,
|
|
400
442
|
required_keys=[
|
|
401
443
|
"data_dir",
|
|
402
444
|
"model_list",
|
|
@@ -407,26 +449,57 @@ class IncrementalUpdateRunner:
|
|
|
407
449
|
"categorical_features",
|
|
408
450
|
],
|
|
409
451
|
)
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
452
|
+
data_dir, data_format, data_path_template, dtype_map = resolve_data_config(
|
|
453
|
+
self.cfg,
|
|
454
|
+
self.config_path,
|
|
455
|
+
create_data_dir=True,
|
|
456
|
+
)
|
|
457
|
+
self.data_dir = data_dir
|
|
458
|
+
self.data_format = data_format
|
|
459
|
+
self.data_path_template = data_path_template
|
|
460
|
+
self.dtype_map = dtype_map
|
|
461
|
+
split_cfg = resolve_split_config(self.cfg)
|
|
462
|
+
runtime_cfg = resolve_runtime_config(self.cfg)
|
|
463
|
+
output_cfg = resolve_output_dirs(
|
|
464
|
+
self.cfg,
|
|
465
|
+
self.config_path,
|
|
466
|
+
output_override=args.output_dir,
|
|
467
|
+
)
|
|
468
|
+
self.runtime_cfg = runtime_cfg
|
|
469
|
+
self.prop_test = args.prop_test if args.prop_test is not None else split_cfg["prop_test"]
|
|
415
470
|
self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
|
|
416
471
|
self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
|
|
472
|
+
self.split_strategy = split_cfg["split_strategy"]
|
|
473
|
+
self.split_group_col = split_cfg["split_group_col"]
|
|
474
|
+
self.split_time_col = split_cfg["split_time_col"]
|
|
475
|
+
self.split_time_ascending = split_cfg["split_time_ascending"]
|
|
476
|
+
self.cv_strategy = split_cfg["cv_strategy"]
|
|
477
|
+
self.cv_group_col = split_cfg["cv_group_col"]
|
|
478
|
+
self.cv_time_col = split_cfg["cv_time_col"]
|
|
479
|
+
self.cv_time_ascending = split_cfg["cv_time_ascending"]
|
|
480
|
+
self.cv_splits = split_cfg["cv_splits"]
|
|
481
|
+
self.ft_oof_folds = split_cfg["ft_oof_folds"]
|
|
482
|
+
self.ft_oof_strategy = split_cfg["ft_oof_strategy"]
|
|
483
|
+
self.ft_oof_shuffle = split_cfg["ft_oof_shuffle"]
|
|
484
|
+
self.save_preprocess = runtime_cfg["save_preprocess"]
|
|
485
|
+
self.preprocess_artifact_path = runtime_cfg["preprocess_artifact_path"]
|
|
486
|
+
self.bo_sample_limit = runtime_cfg["bo_sample_limit"]
|
|
487
|
+
self.cache_predictions = runtime_cfg["cache_predictions"]
|
|
488
|
+
self.prediction_cache_dir = runtime_cfg["prediction_cache_dir"]
|
|
489
|
+
self.prediction_cache_format = runtime_cfg["prediction_cache_format"]
|
|
490
|
+
self.plot_path_style = runtime_cfg["plot_path_style"]
|
|
491
|
+
self.xgb_max_depth_max = runtime_cfg["xgb_max_depth_max"]
|
|
492
|
+
self.xgb_n_estimators_max = runtime_cfg["xgb_n_estimators_max"]
|
|
493
|
+
self.optuna_storage = runtime_cfg["optuna_storage"]
|
|
494
|
+
self.optuna_study_prefix = runtime_cfg["optuna_study_prefix"]
|
|
495
|
+
self.best_params_files = runtime_cfg["best_params_files"]
|
|
496
|
+
self.reuse_best_params = runtime_cfg["reuse_best_params"]
|
|
417
497
|
self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
|
|
418
498
|
self.model_names = self._resolve_model_names(args.model_names)
|
|
419
499
|
self.merge_keys = list(args.merge_keys or [])
|
|
420
500
|
self.timestamp_col = args.timestamp_col
|
|
421
501
|
self.timestamp_ascending = not args.timestamp_descending
|
|
422
|
-
output_root =
|
|
423
|
-
if isinstance(output_root, Path) and not output_root.is_absolute():
|
|
424
|
-
output_root = (self.config_path.parent / output_root).resolve()
|
|
425
|
-
if isinstance(output_root, str) and output_root.strip():
|
|
426
|
-
resolved = resolve_path(output_root, self.config_path.parent)
|
|
427
|
-
if resolved is not None:
|
|
428
|
-
output_root = str(resolved)
|
|
429
|
-
self.output_root = output_root
|
|
502
|
+
self.output_root = output_cfg["output_dir"]
|
|
430
503
|
|
|
431
504
|
self.incremental_dir = None
|
|
432
505
|
if args.incremental_dir is not None:
|
|
@@ -465,10 +538,18 @@ class IncrementalUpdateRunner:
|
|
|
465
538
|
if not path or not path.exists():
|
|
466
539
|
return None, None
|
|
467
540
|
try:
|
|
468
|
-
df =
|
|
541
|
+
df = load_dataset(
|
|
542
|
+
path,
|
|
543
|
+
data_format="auto",
|
|
544
|
+
dtype_map=self.dtype_map,
|
|
545
|
+
low_memory=False,
|
|
546
|
+
)
|
|
469
547
|
except pd.errors.EmptyDataError:
|
|
470
548
|
_log(f"Incremental file {path} is empty; treating as no-op.")
|
|
471
549
|
return None, path
|
|
550
|
+
except Exception as exc:
|
|
551
|
+
_log(f"Failed to load incremental file {path}: {exc}")
|
|
552
|
+
return None, path
|
|
472
553
|
return df, path
|
|
473
554
|
|
|
474
555
|
def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
|
|
@@ -507,7 +588,15 @@ class IncrementalUpdateRunner:
|
|
|
507
588
|
|
|
508
589
|
def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
|
|
509
590
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
510
|
-
|
|
591
|
+
fmt = str(self.data_format or "csv").lower()
|
|
592
|
+
if fmt == "auto":
|
|
593
|
+
fmt = _infer_format_from_path(dest)
|
|
594
|
+
if fmt == "parquet":
|
|
595
|
+
df.to_parquet(dest, index=False)
|
|
596
|
+
elif fmt == "feather":
|
|
597
|
+
df.reset_index(drop=True).to_feather(dest)
|
|
598
|
+
else:
|
|
599
|
+
df.to_csv(dest, index=False)
|
|
511
600
|
_log(f"Wrote {len(df)} rows to {dest} ({reason}).")
|
|
512
601
|
|
|
513
602
|
def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
@@ -515,12 +604,19 @@ class IncrementalUpdateRunner:
|
|
|
515
604
|
raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
|
|
516
605
|
if len(merged) < 2:
|
|
517
606
|
raise ValueError("Need at least two rows to form a train/test split.")
|
|
518
|
-
train_df, test_df =
|
|
607
|
+
train_df, test_df = split_train_test(
|
|
519
608
|
merged,
|
|
520
|
-
|
|
521
|
-
|
|
609
|
+
holdout_ratio=self.prop_test,
|
|
610
|
+
strategy=self.split_strategy,
|
|
611
|
+
group_col=self.split_group_col,
|
|
612
|
+
time_col=self.split_time_col,
|
|
613
|
+
time_ascending=self.split_time_ascending,
|
|
614
|
+
rand_seed=self.rand_seed,
|
|
615
|
+
reset_index_mode="always",
|
|
616
|
+
ratio_label="prop_test",
|
|
617
|
+
validate_ratio=False,
|
|
522
618
|
)
|
|
523
|
-
return train_df
|
|
619
|
+
return train_df, test_df
|
|
524
620
|
|
|
525
621
|
def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
|
|
526
622
|
requested = self.args.model_keys
|
|
@@ -555,6 +651,7 @@ class IncrementalUpdateRunner:
|
|
|
555
651
|
prop_test=self.prop_test,
|
|
556
652
|
rand_seed=self.rand_seed,
|
|
557
653
|
epochs=self.epochs,
|
|
654
|
+
use_gpu=bool(self.cfg.get("use_gpu", True)),
|
|
558
655
|
use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
|
|
559
656
|
use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
|
|
560
657
|
use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
|
|
@@ -562,27 +659,52 @@ class IncrementalUpdateRunner:
|
|
|
562
659
|
use_ft_ddp=self.cfg.get("use_ft_ddp", False),
|
|
563
660
|
use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
|
|
564
661
|
output_dir=str(self.output_root) if self.output_root else None,
|
|
565
|
-
xgb_max_depth_max=
|
|
566
|
-
xgb_n_estimators_max=
|
|
662
|
+
xgb_max_depth_max=self.xgb_max_depth_max,
|
|
663
|
+
xgb_n_estimators_max=self.xgb_n_estimators_max,
|
|
567
664
|
resn_weight_decay=self.cfg.get("resn_weight_decay"),
|
|
568
665
|
final_ensemble=bool(self.cfg.get("final_ensemble", False)),
|
|
569
666
|
final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
|
|
570
667
|
final_refit=bool(self.cfg.get("final_refit", True)),
|
|
571
|
-
optuna_storage=self.
|
|
572
|
-
optuna_study_prefix=self.
|
|
573
|
-
best_params_files=self.
|
|
574
|
-
reuse_best_params=
|
|
668
|
+
optuna_storage=self.optuna_storage,
|
|
669
|
+
optuna_study_prefix=self.optuna_study_prefix,
|
|
670
|
+
best_params_files=self.best_params_files,
|
|
671
|
+
reuse_best_params=self.reuse_best_params,
|
|
575
672
|
gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
|
|
576
673
|
gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
|
|
577
674
|
gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
|
|
578
675
|
gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
|
|
579
676
|
gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
|
|
580
677
|
gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
|
|
678
|
+
region_province_col=self.cfg.get("region_province_col"),
|
|
679
|
+
region_city_col=self.cfg.get("region_city_col"),
|
|
680
|
+
region_effect_alpha=self.cfg.get("region_effect_alpha"),
|
|
681
|
+
geo_feature_nmes=self.cfg.get("geo_feature_nmes"),
|
|
682
|
+
geo_token_hidden_dim=self.cfg.get("geo_token_hidden_dim"),
|
|
683
|
+
geo_token_layers=self.cfg.get("geo_token_layers"),
|
|
684
|
+
geo_token_dropout=self.cfg.get("geo_token_dropout"),
|
|
685
|
+
geo_token_k_neighbors=self.cfg.get("geo_token_k_neighbors"),
|
|
686
|
+
geo_token_learning_rate=self.cfg.get("geo_token_learning_rate"),
|
|
687
|
+
geo_token_epochs=self.cfg.get("geo_token_epochs"),
|
|
581
688
|
ft_role=str(self.cfg.get("ft_role", "model")),
|
|
582
689
|
ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
|
|
583
690
|
ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
|
|
584
691
|
infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
|
|
585
692
|
infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
|
|
693
|
+
cv_strategy=self.cv_strategy or self.split_strategy,
|
|
694
|
+
cv_group_col=self.cv_group_col or self.split_group_col,
|
|
695
|
+
cv_time_col=self.cv_time_col or self.split_time_col,
|
|
696
|
+
cv_time_ascending=self.cv_time_ascending,
|
|
697
|
+
cv_splits=self.cv_splits,
|
|
698
|
+
ft_oof_folds=self.ft_oof_folds,
|
|
699
|
+
ft_oof_strategy=self.ft_oof_strategy,
|
|
700
|
+
ft_oof_shuffle=self.ft_oof_shuffle,
|
|
701
|
+
save_preprocess=self.save_preprocess,
|
|
702
|
+
preprocess_artifact_path=self.preprocess_artifact_path,
|
|
703
|
+
plot_path_style=self.plot_path_style,
|
|
704
|
+
bo_sample_limit=self.bo_sample_limit,
|
|
705
|
+
cache_predictions=self.cache_predictions,
|
|
706
|
+
prediction_cache_dir=self.prediction_cache_dir,
|
|
707
|
+
prediction_cache_format=self.prediction_cache_format,
|
|
586
708
|
)
|
|
587
709
|
|
|
588
710
|
if self.plot_requested and not self.args.dry_run:
|
|
@@ -689,7 +811,12 @@ class IncrementalUpdateRunner:
|
|
|
689
811
|
_log(f"Finished incremental update for {total_trained} dataset(s).")
|
|
690
812
|
|
|
691
813
|
def _process_single_model(self, model_name: str) -> int:
|
|
692
|
-
base_path =
|
|
814
|
+
base_path = resolve_data_path(
|
|
815
|
+
self.data_dir,
|
|
816
|
+
model_name,
|
|
817
|
+
data_format=self.data_format,
|
|
818
|
+
path_template=self.data_path_template,
|
|
819
|
+
)
|
|
693
820
|
if not base_path.exists():
|
|
694
821
|
_log(f"Base dataset {base_path} not found; skipping {model_name}.")
|
|
695
822
|
self.summary_records.append({
|
|
@@ -698,7 +825,12 @@ class IncrementalUpdateRunner:
|
|
|
698
825
|
})
|
|
699
826
|
return 0
|
|
700
827
|
|
|
701
|
-
base_df =
|
|
828
|
+
base_df = load_dataset(
|
|
829
|
+
base_path,
|
|
830
|
+
data_format=self.data_format,
|
|
831
|
+
dtype_map=self.dtype_map,
|
|
832
|
+
low_memory=False,
|
|
833
|
+
)
|
|
702
834
|
inc_df, inc_path = self._load_incremental_df(model_name)
|
|
703
835
|
if inc_df is None and self.incremental_dir and self.args.strict_incremental and not self.args.train_without_incremental:
|
|
704
836
|
raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
|
|
@@ -711,7 +843,8 @@ class IncrementalUpdateRunner:
|
|
|
711
843
|
if self.args.update_base_data and not self.args.dry_run:
|
|
712
844
|
self._write_dataset(merged_df, base_path, "update_base_data")
|
|
713
845
|
if self.args.persist_merged_dir and not self.args.dry_run:
|
|
714
|
-
|
|
846
|
+
suffix = base_path.suffix or ".csv"
|
|
847
|
+
dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}{suffix}"
|
|
715
848
|
self._write_dataset(merged_df, dest, "persist_merged_dir")
|
|
716
849
|
|
|
717
850
|
if not self._should_train(new_rows):
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from .utils.notebook_utils import run_from_config, run_from_config_cli # type: ignore
|
|
8
|
+
except Exception: # pragma: no cover
|
|
9
|
+
from utils.notebook_utils import run_from_config, run_from_config_cli # type: ignore
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run(config_json: str | Path) -> None:
|
|
13
|
+
"""Run explain by config.json (runner.mode=explain)."""
|
|
14
|
+
run_from_config(config_json)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main(argv: Optional[list[str]] = None) -> None:
|
|
18
|
+
run_from_config_cli(
|
|
19
|
+
"Explain_Run: run explain by config.json (runner.mode=explain).",
|
|
20
|
+
argv,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
if __name__ == "__main__":
|
|
25
|
+
main()
|