proscore 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proscore/__init__.py +469 -0
- proscore/__main__.py +68 -0
- proscore/_data/__init__.py +93 -0
- proscore/_pipeline_config.py +1133 -0
- proscore/binning/__init__.py +8 -0
- proscore/binning/_adjust.py +280 -0
- proscore/binning/_base.py +42 -0
- proscore/binning/_binning.py +774 -0
- proscore/binning/_categorical.py +112 -0
- proscore/binning/_chi.py +197 -0
- proscore/binning/_distance.py +23 -0
- proscore/binning/_frequency.py +38 -0
- proscore/binning/_tree.py +34 -0
- proscore/binning/_woe.py +76 -0
- proscore/evaluate/__init__.py +19 -0
- proscore/evaluate/_metrics.py +331 -0
- proscore/inspect/__init__.py +9 -0
- proscore/inspect/_correlation.py +117 -0
- proscore/inspect/_detect.py +213 -0
- proscore/inspect/_quality.py +394 -0
- proscore/inspect/_stability.py +259 -0
- proscore/modeling/__init__.py +3 -0
- proscore/modeling/_scorecard.py +213 -0
- proscore/monitor/__init__.py +9 -0
- proscore/monitor/_monitor.py +549 -0
- proscore/report/__init__.py +5 -0
- proscore/report/_builder.py +1132 -0
- proscore/selection/__init__.py +11 -0
- proscore/selection/_filter.py +448 -0
- proscore/selection/_screen.py +83 -0
- proscore/selection/_stepwise.py +623 -0
- proscore/transform/__init__.py +3 -0
- proscore/transform/_woe.py +255 -0
- proscore/utils/__init__.py +62 -0
- proscore/utils/_config.py +5 -0
- proscore/utils/_exceptions.py +14 -0
- proscore/utils/_presets.py +135 -0
- proscore/utils/_psi.py +49 -0
- proscore/viz/__init__.py +15 -0
- proscore/viz/_plots.py +269 -0
- proscore-0.1.0.dist-info/METADATA +192 -0
- proscore-0.1.0.dist-info/RECORD +46 -0
- proscore-0.1.0.dist-info/WHEEL +5 -0
- proscore-0.1.0.dist-info/entry_points.txt +2 -0
- proscore-0.1.0.dist-info/licenses/LICENSE +21 -0
- proscore-0.1.0.dist-info/top_level.txt +1 -0
proscore/__init__.py
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""ProScore — Scorecard modelling toolkit."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from proscore import inspect
|
|
8
|
+
from proscore._pipeline_config import PipelineConfig, run_pipeline
|
|
9
|
+
from proscore.binning import Binning, BinningProcess
|
|
10
|
+
from proscore.evaluate import evaluate as _evaluate
|
|
11
|
+
from proscore.modeling import ScoreCard
|
|
12
|
+
from proscore.selection import Filter, StepwiseSelector, assess_screen
|
|
13
|
+
from proscore.transform import WOETransformer
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ProScore:
    """
    Chain-style entry point for the full scorecard pipeline.

    Every pipeline step returns ``self`` so that calls can be chained.  When a
    screening step leaves nothing to model, the pipeline is marked as halted
    (see :attr:`halted_`); later steps then emit a warning and return without
    doing any work instead of raising.

    Recommended order (aligned with modular notebooks)::

        ps = ProScore()
        ps.read(train=..., test=..., oot=..., target="bad_flag") \\
          .detect() \\
          .prefilter(max_corr=0.75, max_vif=10) \\
          .bin(method="chi", n_bins=5) \\
          .refine(iv_range=(0.02, None), max_psi=0.25) \\
          .transform() \\
          .select(method="stepwise") \\
          .fit(odds=50, pdo=10) \\
          .scorecard() \\
          .evaluate()
    """

    def __init__(self) -> None:
        # Inputs, populated by read().
        self.target: str = ""
        self.id_col: str | None = None
        self.train_df: pd.DataFrame | None = None
        self.test_df: pd.DataFrame | None = None
        self.oot_df: pd.DataFrame | None = None

        # Per-step results; each stays None until its step has run.
        self.detect_result: pd.DataFrame | None = None
        self.quality_result: pd.DataFrame | None = None
        self._prefilter: Filter | None = None
        self._filter: Filter | None = None
        self._binner: Binning | BinningProcess | None = None
        self._transformer: WOETransformer | None = None
        self._selector: StepwiseSelector | None = None
        self._scorecard: ScoreCard | None = None
        self.eval_result: dict | None = None

        # Halt / screening bookkeeping.
        self._halted: bool = False
        self._halt_message: str = ""
        self._refine_skipped: bool = False
        self._screen_outcomes: list = []

    # ── helpers ───────────────────────────────────────────────────────────────

    def _non_feature_columns(self) -> set:
        """Column labels that must never be used as features (target, id)."""
        skip = {self.target}
        if self.id_col is not None:
            skip.add(self.id_col)
        return skip

    def _train_X(self, features: list[str]) -> pd.DataFrame:
        """Return the requested feature columns of the training frame."""
        return self.train_df[features]

    def _train_y(self) -> pd.Series:
        """Return the target column of the training frame."""
        return self.train_df[self.target]

    def _test_X(self, features: list[str]) -> pd.DataFrame | None:
        """Return the requested test columns, or ``None`` without a test set."""
        if self.test_df is None:
            return None
        return self.test_df[features]

    def _test_y(self) -> pd.Series | None:
        """Return the test target, or ``None`` without a test set."""
        if self.test_df is None:
            return None
        return self.test_df[self.target]

    def _oot_X(self, features: list[str]) -> pd.DataFrame | None:
        """Return the requested OOT columns, or ``None`` without an OOT set."""
        if self.oot_df is None:
            return None
        return self.oot_df[features]

    def _oot_y(self) -> pd.Series | None:
        """Return the OOT target, or ``None`` without an OOT set."""
        if self.oot_df is None:
            return None
        return self.oot_df[self.target]

    def _categorical_features(self) -> list[str]:
        """Non-numeric training columns, excluding the target and ``id_col``."""
        skip = self._non_feature_columns()
        return [
            c for c in self.train_df.columns
            if c not in skip and not pd.api.types.is_numeric_dtype(self.train_df[c])
        ]

    def _initial_features(self) -> list[str]:
        """Numeric columns — the Filter candidate pool.

        The target and ``id_col`` are excluded so that an identifier column
        never enters screening, binning or modelling.
        """
        skip = self._non_feature_columns()
        return [
            c for c in self.train_df.columns
            if c not in skip and pd.api.types.is_numeric_dtype(self.train_df[c])
        ]

    def _numeric_after_prefilter(self) -> list[str]:
        """Numeric features that survived :meth:`prefilter`.

        Falls back to :meth:`_initial_features` only when prefilter has not
        been run.  An *empty* survivor list is returned as-is: treating it as
        "not run" would silently reinstate every numeric column.
        """
        if self._prefilter is not None:
            return list(self._prefilter.support_)
        return self._initial_features()

    def _current_numeric(self) -> list[str]:
        """Numeric features after refine (or prefilter / fallback).

        Precedence: selector support (restricted to numeric columns), then the
        refine filter's support, then the prefilter result.  As with
        :meth:`_numeric_after_prefilter`, an empty refine result is honoured
        rather than replaced by the wider pool.
        """
        if self._selector is not None:
            # Build the lookup once instead of once per selected feature.
            numeric = set(self._initial_features())
            return [f for f in self._selector.support_ if f in numeric]
        if self._filter is not None:
            return list(self._filter.support_)
        return self._numeric_after_prefilter()

    def _features_for_binning_fit(self) -> list[str]:
        """Columns passed to :meth:`bin` — prefilter survivors + categoricals."""
        return self._numeric_after_prefilter() + self._categorical_features()

    def _features_for_modeling(self) -> list[str]:
        """Columns for WOE / stepwise — refine survivors + categoricals."""
        return self._current_numeric() + self._categorical_features()

    # ── read ──────────────────────────────────────────────────────────────────

    def read(
        self,
        train: pd.DataFrame,
        *,
        target: str,
        test: pd.DataFrame | None = None,
        oot: pd.DataFrame | None = None,
        id_col: str | None = None,
    ) -> ProScore:
        """Load data (already split into train / test / oot).

        All validation happens before any attribute is assigned, so a failed
        call leaves the instance exactly as it was.

        Raises
        ------
        KeyError
            If *target* is not a column of *train*.
        ValueError
            If *test* or *oot* does not have exactly the same column set as
            *train* (column order is not compared).
        """
        if target not in train.columns:
            raise KeyError(f"Target column {target!r} not found in train DataFrame")

        base_cols = set(train.columns)
        if test is not None and set(test.columns) != base_cols:
            raise ValueError("test columns must exactly match train columns")
        if oot is not None and set(oot.columns) != base_cols:
            raise ValueError("oot columns must exactly match train columns")

        self.train_df = train
        self.test_df = test
        self.oot_df = oot
        self.target = target
        self.id_col = id_col
        return self

    # ── inspect (train only) ──────────────────────────────────────────────────

    def detect(self, **kwargs) -> ProScore:
        """Run ``inspect.detect`` on the training frame; store ``detect_result``."""
        _check_read(self)
        self.detect_result = inspect.detect(self.train_df, target=self.target, **kwargs)
        return self

    def quality(self, **kwargs) -> ProScore:
        """Run ``inspect.quality`` on the training frame; store ``quality_result``."""
        _check_read(self)
        self.quality_result = inspect.quality(self.train_df, target=self.target, **kwargs)
        return self

    # ── prefilter (coarse — no bin_table required) ───────────────────────────

    def prefilter(self, **kwargs) -> ProScore:
        """
        Coarse feature screen on numeric columns (no binning required).

        Typical kwargs: ``max_missing_rate``, ``max_one_value_rate``,
        ``min_auc``, ``max_corr``, ``max_vif``.  Leave ``iv_range`` and
        ``max_psi`` unset (``None``) until :meth:`refine`.

        The pipeline is halted when the screen outcome is not ok and there are
        no categorical columns left to carry on with.
        """
        _check_read(self)
        features = self._initial_features()
        self._prefilter = Filter(**kwargs)
        self._prefilter.fit(
            self._train_X(features), self._train_y(),
            X_test=self._test_X(features),  # already None without a test set
            bin_table=None,
        )
        outcome = assess_screen(
            self._prefilter.support_,
            stage="prefilter",
            n_candidates=len(features),
        )
        self._screen_outcomes.append(outcome)
        if not outcome.ok and not self._categorical_features():
            self._halted = True
            self._halt_message = outcome.message
        return self

    # ── bin (train — survivors of prefilter + categoricals) ───────────────────

    def bin(self, method: str = "chi", n_bins: int = 10, **kwargs) -> ProScore:
        """Fit the binner on prefilter survivors plus categorical columns.

        A truthy ``feature_config`` keyword selects :class:`BinningProcess`
        (with *method* / *n_bins* as its defaults); otherwise a single
        :class:`Binning` is used.  Remaining kwargs go to the chosen class.
        Halts the pipeline when there is no column to bin.
        """
        _check_read(self)
        if self._halted:
            _warn_halted(self)
            return self
        features = self._features_for_binning_fit()
        if not features:
            self._halted = True
            self._halt_message = "分箱阶段无可选特征(数值与类别均为空)。"
            _warn_halted(self)
            return self
        X = pd.concat([self.train_df[features], self.train_df[self.target]], axis=1)

        feature_config = kwargs.pop("feature_config", None)
        if feature_config:
            self._binner = BinningProcess(
                feature_config=feature_config,
                default_method=method,
                default_n_bins=n_bins,
                **kwargs,
            )
        else:
            self._binner = Binning(method=method, n_bins=n_bins, **kwargs)
        self._binner.fit(X, y=self.target)
        return self

    # ── refine (fine — IV/PSI from bin_table_; requires :meth:`bin`) ─────────

    def refine(self, **kwargs) -> ProScore:
        """
        Fine screen on numeric columns that passed :meth:`prefilter`.

        Pass ``iv_range``, ``max_psi`` (Train vs Test), etc.  Uses
        :attr:`Binning.bin_table_` from the preceding :meth:`bin` call.

        When no numeric feature is left after prefilter the step is skipped
        with a ``FeatureScreenWarning``.  The pipeline is halted when the
        screen outcome is not ok and no modelling feature remains.
        """
        _check_read(self)
        if self._halted:
            _warn_halted(self)
            return self
        _check_binner(self)
        features = self._numeric_after_prefilter()
        if not features:
            import warnings
            from proscore.selection._screen import FeatureScreenWarning

            warnings.warn(
                "refine skipped: no numeric features after prefilter.",
                FeatureScreenWarning,
                stacklevel=2,
            )
            self._refine_skipped = True
            self._filter = None
            return self
        self._refine_skipped = False
        self._filter = Filter(**kwargs)
        self._filter.fit(
            self._train_X(features), self._train_y(),
            X_test=self._test_X(features),  # already None without a test set
            bin_table=self._binner.bin_table_,
        )
        outcome = assess_screen(
            self._filter.support_,
            stage="refine",
            n_candidates=len(features),
        )
        self._screen_outcomes.append(outcome)
        if not outcome.ok and not self._features_for_modeling():
            self._halted = True
            self._halt_message = outcome.message
        return self

    def filter(self, **kwargs) -> ProScore:
        """Alias for :meth:`refine` (backward compatibility)."""
        return self.refine(**kwargs)

    # ── transform ─────────────────────────────────────────────────────────────

    def transform(self, unseen_strategy: str = "worst", **kwargs) -> ProScore:
        """Fit a :class:`WOETransformer` on the bin tables of modelling features.

        Requires :meth:`bin`, plus :meth:`refine` or at least
        :meth:`prefilter`.  Halts the pipeline when no modelling feature is
        left.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_binner(self)
        _check_refine(self)
        features = self._features_for_modeling()
        if not features:
            self._halted = True
            self._halt_message = "无可建模特征(数值+类别均为空)。"
            _warn_halted(self)
            return self
        self._transformer = WOETransformer(unseen_strategy=unseen_strategy, **kwargs)
        tables = {k: v for k, v in self._binner.bin_table_.items() if k in features}
        self._transformer.fit(tables)
        return self

    # ── select ────────────────────────────────────────────────────────────────

    def select(self, method: str = "stepwise", **kwargs) -> ProScore:
        """Run :class:`StepwiseSelector` on the WOE-transformed features.

        *method* is accepted for API symmetry but is not inspected here; a
        ``StepwiseSelector`` is always used.  ``force_in`` is taken out of
        *kwargs* and passed to ``fit``; the rest go to the constructor.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_transformer(self)
        features = self._features_for_modeling()
        train_woe = self._transformer.transform(self._train_X(features))
        train_woe[self.target] = self._train_y().values

        test_woe = None
        y_test = None
        if self.test_df is not None:
            test_woe = self._transformer.transform(self._test_X(features))
            y_test = self._test_y().values

        force_in = kwargs.pop("force_in", None)
        self._selector = StepwiseSelector(**kwargs)
        self._selector.fit(
            train_woe, self._train_y(), candidates=features,
            force_in=force_in, X_test=test_woe, y_test=y_test,
        )
        return self

    # ── fit / scorecard / evaluate ────────────────────────────────────────────

    def fit(self, odds: float = 50, pdo: float = 10, base_score: float = 600, **kwargs) -> ProScore:
        """Fit a :class:`ScoreCard` on the WOE values of the selected features.

        *odds*, *pdo*, *base_score* and any extra kwargs are forwarded to the
        ``ScoreCard`` constructor.  Requires :meth:`select`.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_selector(self)
        features = self._selector.support_
        train_woe = self._transformer.transform(self._train_X(features))
        train_woe[self.target] = self._train_y().values

        self._scorecard = ScoreCard(odds=odds, pdo=pdo, base_score=base_score, **kwargs)
        self._scorecard.fit(train_woe, y=self.target, features=features)
        return self

    def scorecard(self) -> ProScore:
        """Build the score table from the bin tables of the selected features.

        Like the earlier steps, a halted pipeline warns and returns instead of
        raising, so the documented call chain can run to its end.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_scorecard(self)
        features = self._selector.support_
        tables = {k: v for k, v in self._binner.bin_table_.items() if k in features}
        self._scorecard.scorecard(tables)
        return self

    def evaluate(self, n_bins: int = 10) -> ProScore:
        """Evaluate the fitted model on train and, when present, test / OOT.

        The result is stored in ``eval_result``.  A halted pipeline warns and
        returns without evaluating.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_scorecard(self)
        features = self._selector.support_

        train_woe = self._transformer.transform(self._train_X(features))

        test_woe = None
        if self.test_df is not None:
            test_woe = self._transformer.transform(self._test_X(features))

        oot_woe = None
        if self.oot_df is not None:
            oot_woe = self._transformer.transform(self._oot_X(features))

        self.eval_result = _evaluate(
            self._scorecard.model_,
            train_woe, self._train_y(),
            X_test=test_woe, y_test=self._test_y(),
            X_oot=oot_woe, y_oot=self._oot_y(),
            features=features, n_bins=n_bins,
        )
        return self

    # ── properties ────────────────────────────────────────────────────────────

    @property
    def prefilter_(self) -> Filter | None:
        """Coarse filter result (:meth:`prefilter`)."""
        return self._prefilter

    @property
    def filter_(self) -> Filter | None:
        """Fine filter result (:meth:`refine`); same object as ``refine_``."""
        return self._filter

    @property
    def refine_(self) -> Filter | None:
        """Fine filter result (:meth:`refine`); same object as ``filter_``."""
        return self._filter

    @property
    def binner_(self) -> Binning | BinningProcess | None:
        """Binner created by :meth:`bin`, or ``None``."""
        return self._binner

    @property
    def transformer_(self) -> WOETransformer | None:
        """WOE transformer created by :meth:`transform`, or ``None``."""
        return self._transformer

    @property
    def selector_(self) -> StepwiseSelector | None:
        """Selector created by :meth:`select`, or ``None``."""
        return self._selector

    @property
    def scorecard_(self) -> ScoreCard | None:
        """Scorecard created by :meth:`fit`, or ``None``."""
        return self._scorecard

    @property
    def support_(self) -> list[str]:
        """Features chosen by :meth:`select`; empty before selection."""
        if self._selector is None:
            return []
        return self._selector.support_

    @property
    def bin_tables_(self):
        """``bin_table_`` of the binner; raises ``RuntimeError`` before :meth:`bin`."""
        if self._binner is None:
            raise RuntimeError("Call bin() first.")
        return self._binner.bin_table_

    @property
    def score_table_(self):
        """``score_table_`` of the scorecard; raises ``RuntimeError`` before :meth:`fit`."""
        if self._scorecard is None:
            raise RuntimeError("Call fit() first.")
        return self._scorecard.score_table_

    @property
    def model_(self):
        """``model_`` of the scorecard; raises ``RuntimeError`` before :meth:`fit`."""
        if self._scorecard is None:
            raise RuntimeError("Call fit() first.")
        return self._scorecard.model_

    @property
    def halted_(self) -> bool:
        """``True`` when the pipeline should stop (no modelling features)."""
        return self._halted

    @property
    def halt_message_(self) -> str:
        """Message recorded when the pipeline was halted (empty otherwise)."""
        return self._halt_message

    @property
    def screen_outcomes_(self) -> list:
        """Copy of the screening outcomes recorded by prefilter / refine."""
        return list(self._screen_outcomes)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _warn_halted(ps: ProScore) -> None:
    """Emit a ``FeatureScreenWarning`` carrying the pipeline's halt message."""
    from warnings import warn

    from proscore.selection._screen import FeatureScreenWarning

    message = f"Pipeline halted: {ps._halt_message}"
    # stacklevel=3 skips this helper and the function that called it, so the
    # warning is attributed one frame further up the call stack.
    warn(message, FeatureScreenWarning, stacklevel=3)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _check_read(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless training data has been loaded."""
    if ps.train_df is not None:
        return
    raise RuntimeError("Call read() first.")
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _check_binner(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless a binner has been created."""
    if ps._binner is not None:
        return
    raise RuntimeError("Call bin() first.")
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def _check_refine(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless some feature screen has been run.

    Any one of the following is enough to pass:

    * a refine filter exists (``refine()`` completed normally);
    * ``_refine_skipped`` is set (``refine()`` ran but had nothing to do);
    * a prefilter exists (refine was left out on purpose).
    """
    screened = (
        ps._filter is not None
        or getattr(ps, "_refine_skipped", False)
        or ps._prefilter is not None
    )
    if not screened:
        raise RuntimeError("Call refine() first — or at least prefilter().")
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def _check_transformer(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless a WOE transformer has been created."""
    if ps._transformer is not None:
        return
    raise RuntimeError("Call transform() first.")
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _check_selector(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless a selector has been created."""
    if ps._selector is not None:
        return
    raise RuntimeError("Call select() first.")
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _check_scorecard(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless a scorecard has been created."""
    if ps._scorecard is not None:
        return
    raise RuntimeError("Call fit() first.")
|
proscore/__main__.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""ProScore CLI entry point.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
proscore run pipeline.xlsx [--output-script script.py]
|
|
6
|
+
proscore template [output_dir]
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> None:
    """Dispatch the ``proscore`` command line to its sub-command.

    * no arguments: print the module usage text and exit with status 0;
    * ``run`` / ``template``: hand the remaining arguments to the handler;
    * ``-h`` / ``--help``: print the usage text;
    * anything else: report the unknown command and the usage text on
      stderr (where this CLI's other errors go) and exit with status 1.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(0)

    cmd = args[0]

    if cmd == "run":
        _cmd_run(args[1:])
    elif cmd == "template":
        _cmd_template(args[1:])
    elif cmd in ("-h", "--help"):
        print(__doc__)
    else:
        # Diagnostics belong on stderr so they do not pollute piped stdout.
        print(f"Unknown command: {cmd!r}", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        sys.exit(1)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _cmd_run(args: list[str]) -> None:
    """Handle ``proscore run``: execute a pipeline described by an Excel config.

    Parameters
    ----------
    args : list[str]
        Command-line arguments that follow the ``run`` sub-command.

    A ``ValueError`` from the pipeline is reported as a configuration error.
    Any other exception is reported as a runtime error together with its
    traceback.  Both cases exit with status 1.
    """
    import argparse

    # Without an explicit prog, argparse derives the usage line from argv[0]
    # and would omit the "run" sub-command.
    parser = argparse.ArgumentParser(
        prog="proscore run",
        description="Run a pipeline from Excel config",
    )
    parser.add_argument("config", help="Path to pipeline.xlsx")
    parser.add_argument("--output-script", "-o", default=None,
                        help="Also generate a self-contained Python script")
    opts = parser.parse_args(args)

    from proscore._pipeline_config import run_pipeline

    try:
        run_pipeline(opts.config, output_script=opts.output_script)
    except ValueError as e:
        print(f"配置错误: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: show the message and the full traceback.
        print(f"运行错误: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _cmd_template(args: list[str]) -> None:
    """Handle ``proscore template``: generate a template and print its path.

    The first argument, when given, is the output directory; otherwise the
    current directory (``"."``) is used.
    """
    from proscore._pipeline_config import generate_template

    target_dir = "." if not args else args[0]
    generated = generate_template(target_dir)
    print(f"模板已生成: {generated}")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Run the CLI when this module is executed as a script (this file is the
# package's ``__main__.py``).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Data loading and validation utilities for ProScore."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DataReader:
    """
    Load and validate a DataFrame for the scorecard pipeline.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    target : str
        Target column name.
    id_col : str, optional
        Primary-key / identifier column (excluded from modelling).

    Raises
    ------
    KeyError
        If *target*, or a given *id_col*, is not a column of *df*.
    """

    # Column layout of summary(); declared once so that an empty summary
    # still carries the same columns.
    _SUMMARY_COLUMNS = ("variable", "dtype", "n_missing", "missing_pct", "n_unique")

    def __init__(
        self,
        df: pd.DataFrame,
        target: str,
        id_col: str | None = None,
    ):
        if target not in df.columns:
            raise KeyError(f"Target column {target!r} not found in DataFrame")
        if id_col is not None and id_col not in df.columns:
            raise KeyError(f"ID column {id_col!r} not found in DataFrame")

        self.df = df
        self.target = target
        self.id_col = id_col

    @property
    def features_(self) -> list[str]:
        """Column names available for modelling (excludes target and id)."""
        skip = {self.target}
        # Same ``is not None`` test as __init__, so a falsy but valid column
        # label (e.g. an empty string) is excluded as well.
        if self.id_col is not None:
            skip.add(self.id_col)
        return [c for c in self.df.columns if c not in skip]

    @property
    def X(self) -> pd.DataFrame:
        """Feature DataFrame (excludes target and id)."""
        return self.df[self.features_]

    @property
    def y(self) -> pd.Series:
        """Target Series."""
        return self.df[self.target]

    @property
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_features) — *n_features* excludes *target* and *id_col*."""
        return (len(self.df), len(self.features_))

    def summary(self) -> pd.DataFrame:
        """Quick overview: dtype, missing rate, unique count per column.

        One row per feature, sorted by ``missing_pct`` in descending order.
        With no feature columns an empty frame with the usual columns is
        returned.
        """
        rows = []
        for col in self.features_:
            s = self.df[col]
            rows.append({
                "variable": col,
                "dtype": str(s.dtype),
                "n_missing": int(s.isna().sum()),
                "missing_pct": round(s.isna().mean() * 100, 2),
                "n_unique": int(s.nunique()),
            })
        # Explicit columns keep sort_values() from raising KeyError when
        # ``rows`` is empty.
        table = pd.DataFrame(rows, columns=list(self._SUMMARY_COLUMNS))
        return table.sort_values("missing_pct", ascending=False).reset_index(drop=True)

    def __repr__(self) -> str:
        return (
            f"DataReader(n_rows={len(self.df)}, n_features={len(self.features_)}, "
            f"target={self.target!r}, id={self.id_col!r})"
        )

    # ── factory ────────────────────────────────────────────────────────────

    @classmethod
    def from_csv(
        cls,
        path: str,
        target: str,
        id_col: str | None = None,
        **kwargs,
    ) -> DataReader:
        """Create a DataReader from a CSV file.

        Extra keyword arguments are forwarded to ``pandas.read_csv``.
        """
        df = pd.read_csv(path, **kwargs)
        return cls(df, target=target, id_col=id_col)
|