proscore 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. proscore/__init__.py +469 -0
  2. proscore/__main__.py +68 -0
  3. proscore/_data/__init__.py +93 -0
  4. proscore/_pipeline_config.py +1133 -0
  5. proscore/binning/__init__.py +8 -0
  6. proscore/binning/_adjust.py +280 -0
  7. proscore/binning/_base.py +42 -0
  8. proscore/binning/_binning.py +774 -0
  9. proscore/binning/_categorical.py +112 -0
  10. proscore/binning/_chi.py +197 -0
  11. proscore/binning/_distance.py +23 -0
  12. proscore/binning/_frequency.py +38 -0
  13. proscore/binning/_tree.py +34 -0
  14. proscore/binning/_woe.py +76 -0
  15. proscore/evaluate/__init__.py +19 -0
  16. proscore/evaluate/_metrics.py +331 -0
  17. proscore/inspect/__init__.py +9 -0
  18. proscore/inspect/_correlation.py +117 -0
  19. proscore/inspect/_detect.py +213 -0
  20. proscore/inspect/_quality.py +394 -0
  21. proscore/inspect/_stability.py +259 -0
  22. proscore/modeling/__init__.py +3 -0
  23. proscore/modeling/_scorecard.py +213 -0
  24. proscore/monitor/__init__.py +9 -0
  25. proscore/monitor/_monitor.py +549 -0
  26. proscore/report/__init__.py +5 -0
  27. proscore/report/_builder.py +1132 -0
  28. proscore/selection/__init__.py +11 -0
  29. proscore/selection/_filter.py +448 -0
  30. proscore/selection/_screen.py +83 -0
  31. proscore/selection/_stepwise.py +623 -0
  32. proscore/transform/__init__.py +3 -0
  33. proscore/transform/_woe.py +255 -0
  34. proscore/utils/__init__.py +62 -0
  35. proscore/utils/_config.py +5 -0
  36. proscore/utils/_exceptions.py +14 -0
  37. proscore/utils/_presets.py +135 -0
  38. proscore/utils/_psi.py +49 -0
  39. proscore/viz/__init__.py +15 -0
  40. proscore/viz/_plots.py +269 -0
  41. proscore-0.1.0.dist-info/METADATA +192 -0
  42. proscore-0.1.0.dist-info/RECORD +46 -0
  43. proscore-0.1.0.dist-info/WHEEL +5 -0
  44. proscore-0.1.0.dist-info/entry_points.txt +2 -0
  45. proscore-0.1.0.dist-info/licenses/LICENSE +21 -0
  46. proscore-0.1.0.dist-info/top_level.txt +1 -0
proscore/__init__.py ADDED
@@ -0,0 +1,469 @@
1
+ """ProScore — Scorecard modelling toolkit."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from proscore import inspect
8
+ from proscore._pipeline_config import PipelineConfig, run_pipeline
9
+ from proscore.binning import Binning, BinningProcess
10
+ from proscore.evaluate import evaluate as _evaluate
11
+ from proscore.modeling import ScoreCard
12
+ from proscore.selection import Filter, StepwiseSelector, assess_screen
13
+ from proscore.transform import WOETransformer
14
+
15
+ __version__ = "0.1.0"
16
+
17
+
18
class ProScore:
    """
    Chain-style entry point for the full scorecard pipeline.

    Every stage method returns ``self`` so calls can be chained. When a
    screening stage leaves no usable features the instance is marked as
    halted (see :attr:`halted_`); subsequent stages then emit a warning and
    return without doing any work instead of raising.

    Recommended order (aligned with modular notebooks)::

        ps = ProScore()
        ps.read(train=..., test=..., oot=..., target="bad_flag") \\
          .detect() \\
          .prefilter(max_corr=0.75, max_vif=10) \\
          .bin(method="chi", n_bins=5) \\
          .refine(iv_range=(0.02, None), max_psi=0.25) \\
          .transform() \\
          .select(method="stepwise") \\
          .fit(odds=50, pdo=10) \\
          .scorecard() \\
          .evaluate()
    """

    def __init__(self) -> None:
        # Data supplied through read().
        self.target: str = ""
        self.id_col: str | None = None
        self.train_df: pd.DataFrame | None = None
        self.test_df: pd.DataFrame | None = None
        self.oot_df: pd.DataFrame | None = None

        # Per-stage results / fitted objects (None until the stage has run).
        self.detect_result: pd.DataFrame | None = None
        self.quality_result: pd.DataFrame | None = None
        self._prefilter: Filter | None = None
        self._filter: Filter | None = None
        self._binner: Binning | BinningProcess | None = None
        self._transformer: WOETransformer | None = None
        self._selector: StepwiseSelector | None = None
        self._scorecard: ScoreCard | None = None
        self.eval_result: dict | None = None

        # Pipeline control state.
        self._halted: bool = False
        self._halt_message: str = ""
        self._refine_skipped: bool = False
        self._screen_outcomes: list = []

    # ── helpers ───────────────────────────────────────────────────────────────

    def _excluded_columns(self) -> set:
        """Columns that are never features: the target and, if set, the id."""
        excluded = {self.target}
        if self.id_col is not None:
            excluded.add(self.id_col)
        return excluded

    def _train_X(self, features: list[str]) -> pd.DataFrame:
        """Return the given feature columns of the training frame."""
        return self.train_df[features]

    def _train_y(self) -> pd.Series:
        """Return the target column of the training frame."""
        return self.train_df[self.target]

    def _test_X(self, features: list[str]) -> pd.DataFrame | None:
        """Return the given feature columns of the test frame, or ``None``."""
        if self.test_df is None:
            return None
        return self.test_df[features]

    def _test_y(self) -> pd.Series | None:
        """Return the target column of the test frame, or ``None``."""
        if self.test_df is None:
            return None
        return self.test_df[self.target]

    def _oot_X(self, features: list[str]) -> pd.DataFrame | None:
        """Return the given feature columns of the OOT frame, or ``None``."""
        if self.oot_df is None:
            return None
        return self.oot_df[features]

    def _oot_y(self) -> pd.Series | None:
        """Return the target column of the OOT frame, or ``None``."""
        if self.oot_df is None:
            return None
        return self.oot_df[self.target]

    def _categorical_features(self) -> list[str]:
        """Non-numeric training columns, excluding the target and id column."""
        excluded = self._excluded_columns()
        return [
            c for c in self.train_df.columns
            if c not in excluded
            and not pd.api.types.is_numeric_dtype(self.train_df[c])
        ]

    def _initial_features(self) -> list[str]:
        """Numeric columns — the Filter candidate pool (target/id excluded)."""
        excluded = self._excluded_columns()
        return [
            c for c in self.train_df.columns
            if c not in excluded
            and pd.api.types.is_numeric_dtype(self.train_df[c])
        ]

    def _numeric_after_prefilter(self) -> list[str]:
        """Numeric features surviving :meth:`prefilter`.

        Falls back to every numeric column only when prefilter has not
        produced a result. An *empty* survivor list is returned as-is:
        treating it as "not run" would silently re-admit every feature the
        screen had just removed.
        """
        survivors = getattr(self._prefilter, "support_", None)
        if survivors is not None:
            return list(survivors)
        return self._initial_features()

    def _current_numeric(self) -> list[str]:
        """Numeric features after the latest of select / refine / prefilter."""
        if self._selector is not None:
            # Build the lookup once instead of once per selected feature.
            numeric = set(self._initial_features())
            return [f for f in self._selector.support_ if f in numeric]
        refined = getattr(self._filter, "support_", None)
        if refined is not None:
            # An empty list means refine removed everything — honour it.
            return list(refined)
        return self._numeric_after_prefilter()

    def _features_for_binning_fit(self) -> list[str]:
        """Columns passed to :meth:`bin` — prefilter survivors + categoricals."""
        return self._numeric_after_prefilter() + self._categorical_features()

    def _features_for_modeling(self) -> list[str]:
        """Columns for WOE / stepwise — refine survivors + categoricals."""
        return self._current_numeric() + self._categorical_features()

    # ── read ──────────────────────────────────────────────────────────────────

    def read(
        self,
        train: pd.DataFrame,
        *,
        target: str,
        test: pd.DataFrame | None = None,
        oot: pd.DataFrame | None = None,
        id_col: str | None = None,
    ) -> ProScore:
        """Load data (already split into train / test / oot).

        Parameters
        ----------
        train : pd.DataFrame
            Training data; must contain ``target``.
        target : str
            Name of the target column.
        test, oot : pd.DataFrame, optional
            Hold-out frames; their column sets must equal that of ``train``.
        id_col : str, optional
            Identifier column; it is excluded from the feature pools.

        Raises
        ------
        ValueError
            If ``target`` is missing from ``train`` or a hold-out frame has a
            different column set. Nothing is stored on the instance in that
            case.
        """
        # Validate before touching any state so a failed read() cannot leave
        # the instance half-initialised.
        if target not in train.columns:
            raise ValueError(f"target column {target!r} not found in train columns")
        base_cols = set(train.columns)
        if test is not None and set(test.columns) != base_cols:
            raise ValueError("test columns must exactly match train columns")
        if oot is not None and set(oot.columns) != base_cols:
            raise ValueError("oot columns must exactly match train columns")

        self.train_df = train
        self.test_df = test
        self.oot_df = oot
        self.target = target
        self.id_col = id_col
        return self

    # ── inspect (train only) ──────────────────────────────────────────────────

    def detect(self, **kwargs) -> ProScore:
        """Run ``inspect.detect`` on the training frame; store ``detect_result``."""
        _check_read(self)
        self.detect_result = inspect.detect(self.train_df, target=self.target, **kwargs)
        return self

    def quality(self, **kwargs) -> ProScore:
        """Run ``inspect.quality`` on the training frame; store ``quality_result``."""
        _check_read(self)
        self.quality_result = inspect.quality(self.train_df, target=self.target, **kwargs)
        return self

    # ── prefilter (coarse — no bin_table required) ───────────────────────────

    def prefilter(self, **kwargs) -> ProScore:
        """
        Coarse feature screen on numeric columns (no binning required).

        Typical kwargs: ``max_missing_rate``, ``max_one_value_rate``,
        ``min_auc``, ``max_corr``, ``max_vif``.  Leave ``iv_range`` and
        ``max_psi`` unset (``None``) until :meth:`refine`.

        The pipeline is halted when the screen outcome is not ok and there
        are no categorical columns to fall back on.
        """
        _check_read(self)
        features = self._initial_features()
        self._prefilter = Filter(**kwargs)
        self._prefilter.fit(
            self._train_X(features), self._train_y(),
            X_test=self._test_X(features),  # already None without a test set
            bin_table=None,
        )
        outcome = assess_screen(
            self._prefilter.support_,
            stage="prefilter",
            n_candidates=len(features),
        )
        self._screen_outcomes.append(outcome)
        if not outcome.ok and not self._categorical_features():
            self._halted = True
            self._halt_message = outcome.message
        return self

    # ── bin (train — survivors of prefilter + categoricals) ───────────────────

    def bin(self, method: str = "chi", n_bins: int = 10, **kwargs) -> ProScore:
        """Fit the binner on prefilter survivors plus categorical columns.

        A truthy ``feature_config`` keyword selects :class:`BinningProcess`
        (with ``method`` / ``n_bins`` as its defaults); otherwise a single
        :class:`Binning` is used. Remaining kwargs go to the constructor.
        """
        _check_read(self)
        if self._halted:
            _warn_halted(self)
            return self
        features = self._features_for_binning_fit()
        if not features:
            self._halted = True
            self._halt_message = "分箱阶段无可选特征（数值与类别均为空）。"
            _warn_halted(self)
            return self
        X = pd.concat([self.train_df[features], self.train_df[self.target]], axis=1)

        feature_config = kwargs.pop("feature_config", None)
        if feature_config:
            self._binner = BinningProcess(
                feature_config=feature_config,
                default_method=method,
                default_n_bins=n_bins,
                **kwargs,
            )
        else:
            self._binner = Binning(method=method, n_bins=n_bins, **kwargs)
        self._binner.fit(X, y=self.target)
        return self

    # ── refine (fine — IV/PSI from bin_table_; requires :meth:`bin`) ─────────

    def refine(self, **kwargs) -> ProScore:
        """
        Fine screen on numeric columns that passed :meth:`prefilter`.

        Pass ``iv_range``, ``max_psi`` (Train vs Test), etc.  Uses
        :attr:`Binning.bin_table_` from the preceding :meth:`bin` call.

        When no numeric feature is left after prefilter the stage is skipped
        with a warning. The pipeline is halted when the outcome is not ok
        and no modelling feature (numeric or categorical) remains.
        """
        _check_read(self)
        if self._halted:
            _warn_halted(self)
            return self
        _check_binner(self)
        features = self._numeric_after_prefilter()
        if not features:
            import warnings
            from proscore.selection._screen import FeatureScreenWarning

            warnings.warn(
                "refine skipped: no numeric features after prefilter.",
                FeatureScreenWarning,
                stacklevel=2,
            )
            self._refine_skipped = True
            self._filter = None
            return self
        self._refine_skipped = False
        self._filter = Filter(**kwargs)
        self._filter.fit(
            self._train_X(features), self._train_y(),
            X_test=self._test_X(features),  # already None without a test set
            bin_table=self._binner.bin_table_,
        )
        outcome = assess_screen(
            self._filter.support_,
            stage="refine",
            n_candidates=len(features),
        )
        self._screen_outcomes.append(outcome)
        if not outcome.ok and not self._features_for_modeling():
            self._halted = True
            self._halt_message = outcome.message
        return self

    def filter(self, **kwargs) -> ProScore:
        """Alias for :meth:`refine` (backward compatibility)."""
        return self.refine(**kwargs)

    # ── transform ─────────────────────────────────────────────────────────────

    def transform(self, unseen_strategy: str = "worst", **kwargs) -> ProScore:
        """Fit the WOE transformer on the bin tables of the modelling features."""
        if self._halted:
            _warn_halted(self)
            return self
        _check_binner(self)
        _check_refine(self)
        features = self._features_for_modeling()
        if not features:
            self._halted = True
            self._halt_message = "无可建模特征（数值+类别均为空）。"
            _warn_halted(self)
            return self
        self._transformer = WOETransformer(unseen_strategy=unseen_strategy, **kwargs)
        tables = {k: v for k, v in self._binner.bin_table_.items() if k in features}
        self._transformer.fit(tables)
        return self

    # ── select ────────────────────────────────────────────────────────────────

    def select(self, method: str = "stepwise", **kwargs) -> ProScore:
        """Run stepwise selection on WOE-transformed modelling features.

        ``method`` is accepted for API symmetry but is not consulted here:
        a :class:`StepwiseSelector` is always used. ``force_in`` is taken
        from kwargs and forwarded to ``fit``; the remaining kwargs go to the
        selector constructor.
        """
        if self._halted:
            _warn_halted(self)
            return self
        _check_transformer(self)
        features = self._features_for_modeling()
        train_woe = self._transformer.transform(self._train_X(features))
        train_woe[self.target] = self._train_y().values

        test_woe = None
        y_test = None
        if self.test_df is not None:
            test_woe = self._transformer.transform(self._test_X(features))
            y_test = self._test_y().values

        force_in = kwargs.pop("force_in", None)
        self._selector = StepwiseSelector(**kwargs)
        self._selector.fit(
            train_woe, self._train_y(), candidates=features,
            force_in=force_in, X_test=test_woe, y_test=y_test,
        )
        return self

    # ── fit / scorecard / evaluate ────────────────────────────────────────────

    def fit(self, odds: float = 50, pdo: float = 10, base_score: float = 600, **kwargs) -> ProScore:
        """Fit the :class:`ScoreCard` on the selector's surviving features."""
        if self._halted:
            _warn_halted(self)
            return self
        _check_selector(self)
        features = self._selector.support_
        train_woe = self._transformer.transform(self._train_X(features))
        train_woe[self.target] = self._train_y().values

        self._scorecard = ScoreCard(odds=odds, pdo=pdo, base_score=base_score, **kwargs)
        self._scorecard.fit(train_woe, y=self.target, features=features)
        return self

    def scorecard(self) -> ProScore:
        """Build the score table from the bin tables of the selected features."""
        if self._halted:
            # Keep a halted chain flowing like the earlier stages do, rather
            # than raising "Call fit() first." for a fit() that was skipped.
            _warn_halted(self)
            return self
        _check_scorecard(self)
        features = self._selector.support_
        tables = {k: v for k, v in self._binner.bin_table_.items() if k in features}
        self._scorecard.scorecard(tables)
        return self

    def evaluate(self, n_bins: int = 10) -> ProScore:
        """Evaluate the fitted model on train and, when present, test / OOT."""
        if self._halted:
            # Same rationale as scorecard(): warn instead of raising.
            _warn_halted(self)
            return self
        _check_scorecard(self)
        features = self._selector.support_

        train_woe = self._transformer.transform(self._train_X(features))

        test_woe = None
        if self.test_df is not None:
            test_woe = self._transformer.transform(self._test_X(features))

        oot_woe = None
        if self.oot_df is not None:
            oot_woe = self._transformer.transform(self._oot_X(features))

        self.eval_result = _evaluate(
            self._scorecard.model_,
            train_woe, self._train_y(),
            X_test=test_woe, y_test=self._test_y(),
            X_oot=oot_woe, y_oot=self._oot_y(),
            features=features, n_bins=n_bins,
        )
        return self

    # ── properties ────────────────────────────────────────────────────────────

    @property
    def prefilter_(self) -> Filter | None:
        """Coarse filter result (:meth:`prefilter`)."""
        return self._prefilter

    @property
    def filter_(self) -> Filter | None:
        """Fine filter result (:meth:`refine`); same object as ``refine_``."""
        return self._filter

    @property
    def refine_(self) -> Filter | None:
        """Fine filter result (:meth:`refine`); same object as ``filter_``."""
        return self._filter

    @property
    def binner_(self) -> Binning | BinningProcess | None:
        """Binner created by :meth:`bin`, or ``None``."""
        return self._binner

    @property
    def transformer_(self) -> WOETransformer | None:
        """WOE transformer created by :meth:`transform`, or ``None``."""
        return self._transformer

    @property
    def selector_(self) -> StepwiseSelector | None:
        """Selector created by :meth:`select`, or ``None``."""
        return self._selector

    @property
    def scorecard_(self) -> ScoreCard | None:
        """Scorecard created by :meth:`fit`, or ``None``."""
        return self._scorecard

    @property
    def support_(self) -> list[str]:
        """Features kept by the selector; empty before :meth:`select`."""
        if self._selector is None:
            return []
        return self._selector.support_

    @property
    def bin_tables_(self):
        """The binner's ``bin_table_``; raises ``RuntimeError`` before :meth:`bin`."""
        if self._binner is None:
            raise RuntimeError("Call bin() first.")
        return self._binner.bin_table_

    @property
    def score_table_(self):
        """The scorecard's ``score_table_``; raises ``RuntimeError`` before :meth:`fit`."""
        if self._scorecard is None:
            raise RuntimeError("Call fit() first.")
        return self._scorecard.score_table_

    @property
    def model_(self):
        """The scorecard's ``model_``; raises ``RuntimeError`` before :meth:`fit`."""
        if self._scorecard is None:
            raise RuntimeError("Call fit() first.")
        return self._scorecard.model_

    @property
    def halted_(self) -> bool:
        """``True`` when the pipeline should stop (no modelling features)."""
        return self._halted

    @property
    def halt_message_(self) -> str:
        """Message recorded when the pipeline was halted (empty otherwise)."""
        return self._halt_message

    @property
    def screen_outcomes_(self) -> list:
        """Copy of the screen outcomes recorded by prefilter / refine."""
        return list(self._screen_outcomes)
424
+
425
+
426
def _warn_halted(ps: ProScore) -> None:
    """Emit a ``FeatureScreenWarning`` carrying the pipeline's halt message.

    ``stacklevel=3`` attributes the warning two frames above this helper.
    """
    import warnings
    from proscore.selection._screen import FeatureScreenWarning

    message = f"Pipeline halted: {ps._halt_message}"
    warnings.warn(message, FeatureScreenWarning, stacklevel=3)
435
+
436
+
437
def _check_read(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless ``ps.train_df`` has been set."""
    if ps.train_df is not None:
        return
    raise RuntimeError("Call read() first.")
440
+
441
+
442
def _check_binner(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless ``ps._binner`` has been set."""
    if ps._binner is not None:
        return
    raise RuntimeError("Call bin() first.")
445
+
446
+
447
def _check_refine(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless a screening stage has been run.

    Any one of the following is accepted (checked in this order):

    * ``ps._filter`` is set,
    * ``ps._refine_skipped`` is truthy,
    * ``ps._prefilter`` is set.
    """
    screened = (
        ps._filter is not None
        or getattr(ps, "_refine_skipped", False)
        or ps._prefilter is not None
    )
    if not screened:
        raise RuntimeError("Call refine() first — or at least prefilter().")
455
+
456
+
457
def _check_transformer(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless ``ps._transformer`` has been set."""
    if ps._transformer is not None:
        return
    raise RuntimeError("Call transform() first.")
460
+
461
+
462
def _check_selector(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless ``ps._selector`` has been set."""
    if ps._selector is not None:
        return
    raise RuntimeError("Call select() first.")
465
+
466
+
467
def _check_scorecard(ps: ProScore) -> None:
    """Raise ``RuntimeError`` unless ``ps._scorecard`` has been set."""
    if ps._scorecard is not None:
        return
    raise RuntimeError("Call fit() first.")
proscore/__main__.py ADDED
@@ -0,0 +1,68 @@
1
+ """ProScore CLI entry point.
2
+
3
+ Usage::
4
+
5
+ proscore run pipeline.xlsx [--output-script script.py]
6
+ proscore template [output_dir]
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import sys
12
+ from pathlib import Path
13
+
14
+
15
def main() -> None:
    """Dispatch the ``proscore`` command line to its sub-command handler.

    Reads ``sys.argv[1:]``:

    * no arguments — print the module usage text and exit with status 0;
    * ``run`` / ``template`` — forward the remaining arguments to
      ``_cmd_run`` / ``_cmd_template``;
    * ``-h`` / ``--help`` — print the usage text and return;
    * anything else — report the unknown command and the usage text on
      stderr and exit with status 1.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(0)

    cmd, rest = args[0], args[1:]

    if cmd == "run":
        _cmd_run(rest)
    elif cmd == "template":
        _cmd_template(rest)
    elif cmd in ("-h", "--help"):
        print(__doc__)
    else:
        # Diagnostics go to stderr so they never mix into piped stdout,
        # matching how _cmd_run reports its errors.
        print(f"Unknown command: {cmd!r}", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        sys.exit(1)
34
+
35
+
36
def _cmd_run(args: list[str]) -> None:
    """Handle ``proscore run``: parse options, then execute the pipeline.

    Accepts a positional ``config`` path and an optional
    ``--output-script`` / ``-o`` path. A ``ValueError`` from the pipeline is
    reported as a configuration error; any other exception is reported as a
    runtime error together with its traceback. Both cases exit with
    status 1.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Run a pipeline from Excel config")
    cli.add_argument("config", help="Path to pipeline.xlsx")
    cli.add_argument(
        "--output-script",
        "-o",
        default=None,
        help="Also generate a self-contained Python script",
    )
    opts = cli.parse_args(args)

    # Imported only after argument parsing succeeded, as in the CLI's
    # original ordering.
    from proscore._pipeline_config import run_pipeline

    try:
        run_pipeline(opts.config, output_script=opts.output_script)
    except ValueError as e:
        print(f"配置错误: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: show the full traceback, then fail.
        import traceback

        print(f"运行错误: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)
57
+
58
+
59
def _cmd_template(args: list[str]) -> None:
    """Handle ``proscore template``: generate a template and print its path.

    The first argument, when given, is the output directory; otherwise the
    current directory (``"."``) is used.
    """
    from proscore._pipeline_config import generate_template

    out_dir = "."
    if args:
        out_dir = args[0]

    path = generate_template(out_dir)
    print(f"模板已生成: {path}")
65
+
66
+
67
+ if __name__ == "__main__":
68
+ main()
proscore/_data/__init__.py ADDED
@@ -0,0 +1,93 @@
1
+ """Data loading and validation utilities for ProScore."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
class DataReader:
    """
    Load and validate a DataFrame for the scorecard pipeline.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    target : str
        Target column name.
    id_col : str, optional
        Primary-key / identifier column (excluded from modelling).

    Raises
    ------
    KeyError
        If ``target`` or a non-``None`` ``id_col`` is not a column of ``df``.
    """

    # Column order of the frame returned by summary(). Also used to build a
    # well-formed (empty) frame when there are no feature columns.
    _SUMMARY_COLUMNS = ("variable", "dtype", "n_missing", "missing_pct", "n_unique")

    def __init__(
        self,
        df: pd.DataFrame,
        target: str,
        id_col: str | None = None,
    ):
        if target not in df.columns:
            raise KeyError(f"Target column {target!r} not found in DataFrame")
        if id_col is not None and id_col not in df.columns:
            raise KeyError(f"ID column {id_col!r} not found in DataFrame")

        self.df = df
        self.target = target
        self.id_col = id_col

    @property
    def features_(self) -> list[str]:
        """Column names available for modelling (excludes target and id)."""
        skip = {self.target}
        # ``is not None`` mirrors the constructor check, so a falsy but valid
        # label (e.g. the integer column ``0``) is still excluded.
        if self.id_col is not None:
            skip.add(self.id_col)
        return [c for c in self.df.columns if c not in skip]

    @property
    def X(self) -> pd.DataFrame:
        """Feature DataFrame (excludes target and id)."""
        return self.df[self.features_]

    @property
    def y(self) -> pd.Series:
        """Target Series."""
        return self.df[self.target]

    @property
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_features) — *n_features* excludes *target* and *id_col*."""
        return (len(self.df), len(self.features_))

    def summary(self) -> pd.DataFrame:
        """Quick overview: dtype, missing rate, unique count per column.

        Returns one row per feature column, sorted by ``missing_pct``
        descending. With no feature columns an empty frame that still has
        the summary columns is returned.
        """
        rows = []
        for col in self.features_:
            s = self.df[col]
            missing = s.isna()
            rows.append({
                "variable": col,
                "dtype": str(s.dtype),
                "n_missing": int(missing.sum()),
                "missing_pct": round(missing.mean() * 100, 2),
                "n_unique": int(s.nunique()),
            })
        # Passing the columns explicitly keeps "missing_pct" present even
        # when ``rows`` is empty; otherwise sort_values raises KeyError.
        table = pd.DataFrame(rows, columns=list(self._SUMMARY_COLUMNS))
        return table.sort_values("missing_pct", ascending=False).reset_index(drop=True)

    def __repr__(self) -> str:
        return (
            f"DataReader(n_rows={len(self.df)}, n_features={len(self.features_)}, "
            f"target={self.target!r}, id={self.id_col!r})"
        )

    # ── factory ────────────────────────────────────────────────────────────

    @classmethod
    def from_csv(
        cls,
        path: str,
        target: str,
        id_col: str | None = None,
        **kwargs,
    ) -> DataReader:
        """Create a DataReader from a CSV file.

        Extra keyword arguments are forwarded to ``pd.read_csv``.
        """
        df = pd.read_csv(path, **kwargs)
        return cls(df, target=target, id_col=id_col)