AutoCarver 7.2.2__tar.gz → 7.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/base_carver.py +120 -3
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_combination_evaluators.py +367 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_target_rates.py +4 -3
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/continuous_combination_evaluators.py +363 -3
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/continuous_target_rates.py +5 -4
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/combination_evaluator.py +105 -14
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/testing.py +25 -7
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/categorical_discretizer.py +24 -7
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/ordinal_discretizer.py +18 -3
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/quantitative_discretizer.py +22 -20
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/base_discretizer.py +12 -3
- autocarver-7.2.6/AutoCarver/discretizers/utils/frequency_ci.py +74 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/features.py +2 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/PKG-INFO +1 -1
- {autocarver-7.2.2 → autocarver-7.2.6}/pyproject.toml +1 -1
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/binary_carver.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/continuous_carver.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/multiclass_carver.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/pretty_print.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/combinations.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/target_rate.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/config.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/discretizer.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/chained_discretizer.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/qualitative_discretizer.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/continuous_discretizer.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/multiprocessing.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/type_discretizers.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/categorical_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/ordinal_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/qualitative_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/datetime_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/numerical_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/quantitative_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/base_feature.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/grouped_list.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/serialization.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/classification_selector.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/base_filters.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/qualitative_filters.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/quantitative_filters.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/base_measures.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/qualitative_measures.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/quantitative_measures.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/regression_selector.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/base_selector.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/pretty_print.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/__init__.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/attributes.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/dependencies.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/extend_docstring.py +0 -0
- {autocarver-7.2.2 → autocarver-7.2.6}/README.md +0 -0
|
@@ -58,6 +58,7 @@ def _carve_feature_worker(
|
|
|
58
58
|
max_n_mod: int,
|
|
59
59
|
min_freq: float,
|
|
60
60
|
dropna: bool,
|
|
61
|
+
min_freq_alpha: float,
|
|
61
62
|
) -> tuple[BaseFeature, bool]:
|
|
62
63
|
"""Picklable worker: scores best combination for a single feature.
|
|
63
64
|
|
|
@@ -70,11 +71,43 @@ def _carve_feature_worker(
|
|
|
70
71
|
# workers never print per-feature progress; the parent prints a single banner
|
|
71
72
|
evaluator.verbose = False
|
|
72
73
|
best = evaluator.get_best_combination(
|
|
73
|
-
feature,
|
|
74
|
+
feature,
|
|
75
|
+
xagg,
|
|
76
|
+
xagg_dev,
|
|
77
|
+
max_n_mod=max_n_mod,
|
|
78
|
+
min_freq=min_freq,
|
|
79
|
+
dropna=dropna,
|
|
80
|
+
min_freq_alpha=min_freq_alpha,
|
|
74
81
|
)
|
|
75
82
|
return feature, best is not None
|
|
76
83
|
|
|
77
84
|
|
|
85
|
+
def _drop_reason_from_history(history: pd.DataFrame) -> str:
|
|
86
|
+
"""Synthesizes a human-readable drop reason from a dropped feature's history.
|
|
87
|
+
|
|
88
|
+
Picks the most frequent failing-test message across ``train``/``dev`` blocks
|
|
89
|
+
of historized non-viable combinations.
|
|
90
|
+
"""
|
|
91
|
+
if history.empty:
|
|
92
|
+
return "No combination historized"
|
|
93
|
+
|
|
94
|
+
info_counts: dict[str, int] = {}
|
|
95
|
+
for _, row in history.iterrows():
|
|
96
|
+
if bool(row.get("viable", False)):
|
|
97
|
+
continue
|
|
98
|
+
for block_key in ("train", "dev"):
|
|
99
|
+
block = row.get(block_key)
|
|
100
|
+
if isinstance(block, dict):
|
|
101
|
+
msg = block.get("info") or ""
|
|
102
|
+
if msg:
|
|
103
|
+
info_counts[msg] = info_counts.get(msg, 0) + 1
|
|
104
|
+
|
|
105
|
+
if not info_counts:
|
|
106
|
+
return "No robust combination"
|
|
107
|
+
msg, _ = max(info_counts.items(), key=lambda kv: kv[1])
|
|
108
|
+
return f"No robust combination ({msg})"
|
|
109
|
+
|
|
110
|
+
|
|
78
111
|
def _replace_feature_in_features(features: Features, updated: BaseFeature) -> None:
|
|
79
112
|
"""Swaps an existing feature (by version) for the worker-returned copy."""
|
|
80
113
|
if isinstance(updated, CategoricalFeature):
|
|
@@ -164,6 +197,22 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
164
197
|
combination_evaluator.verbose = self.config.verbose
|
|
165
198
|
self.combination_evaluator: CombinationEvaluator = combination_evaluator
|
|
166
199
|
|
|
200
|
+
# features dropped by the carver because no robust combination was found.
|
|
201
|
+
# Kept (not cleared on re-fit) so users can inspect why each dropped via
|
|
202
|
+
# the marker columns added to ``summary`` / ``history``.
|
|
203
|
+
self.dropped_features: list[BaseFeature] = []
|
|
204
|
+
|
|
205
|
+
@property
def half_min_freq(self) -> float:
    """Return :attr:`min_freq` divided by two.

    This is the frequency floor handed to ``discretize`` before the
    combination search, so that the evaluator gets finer-grained modalities
    to recombine. The halving is owned by the carver; the discretizers
    themselves are described as comparing against the value they receive.
    """
    halved = self.min_freq / 2
    return halved
|
|
215
|
+
|
|
167
216
|
@property
|
|
168
217
|
def pretty_print(self) -> bool:
|
|
169
218
|
"""Returns the pretty_print attribute"""
|
|
@@ -173,8 +222,57 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
173
222
|
content = super().to_json(light_mode)
|
|
174
223
|
content["max_n_mod"] = self.max_n_mod
|
|
175
224
|
content["combination_evaluator"] = self.combination_evaluator.to_json()
|
|
225
|
+
content["dropped_features"] = [f.to_json(light_mode) for f in self.dropped_features]
|
|
176
226
|
return content
|
|
177
227
|
|
|
228
|
+
@property
def summary(self) -> pd.DataFrame:
    """Per-feature carving summary, including the features that were dropped.

    Rows of the features still held in ``self.features`` come first, followed
    by the rows of every feature in ``self.dropped_features``. Two marker
    columns are added to each row:

    - ``dropped`` (bool): ``False`` for kept features, ``True`` for dropped ones.
    - ``dropped_reason`` (str | None): ``None`` for kept features, otherwise
      the message synthesized by ``_drop_reason_from_history`` from the
      dropped feature's history.

    Returns:
        An empty frame when there is no row at all; otherwise the rows
        indexed by ``feature``, then every column that is neither a value
        nor a marker column, then ``label``.
    """
    records: list[dict] = [
        {**entry, "dropped": False, "dropped_reason": None}
        for kept in self.features
        for entry in kept.summary
    ]
    for discarded in self.dropped_features:
        # the reason is computed once per feature and shared by all its rows
        why = _drop_reason_from_history(discarded.history)
        records.extend(
            {**entry, "dropped": True, "dropped_reason": why} for entry in discarded.summary
        )

    summaries = pd.DataFrame(records)
    if summaries.empty:
        return summaries

    # columns that stay as data; everything else becomes an index level
    not_index = {
        "feature",
        "label",
        "content",
        "target_mean",
        "frequency",
        "dropped",
        "dropped_reason",
    }
    middle_levels = [column for column in summaries.columns if column not in not_index]
    return summaries.set_index(["feature", *middle_levels, "label"])
|
|
256
|
+
|
|
257
|
+
@property
def history(self) -> pd.DataFrame:
    """Combination history of the carved features plus the dropped ones.

    The history of ``self.features`` (when not empty) is tagged with
    ``dropped=False``. Each dropped feature with at least one historized row
    contributes its own history, tagged with ``dropped=True`` and with a
    ``feature`` column set to ``str(feature)``.

    Returns:
        The concatenation of all parts with a fresh integer index, or an
        empty frame when there is nothing to report.
    """
    carved = self.features.history
    parts: list[pd.DataFrame] = [] if carved.empty else [carved.assign(dropped=False)]

    for discarded in self.dropped_features:
        past = discarded.history
        if len(past) == 0:
            continue
        parts.append(past.assign(feature=str(discarded), dropped=True))

    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
|
|
275
|
+
|
|
178
276
|
def _prepare_samples(self, samples: Samples) -> Samples:
|
|
179
277
|
"""Validates format and content of X and y."""
|
|
180
278
|
if samples.train.y is None:
|
|
@@ -186,7 +284,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
186
284
|
|
|
187
285
|
# discretizing features at half min_freq so the carver has a finer
|
|
188
286
|
# granularity to combine when forming optimal groups
|
|
189
|
-
samples = discretize(self.features, samples, self.
|
|
287
|
+
samples = discretize(self.features, samples, self.half_min_freq, self.config)
|
|
190
288
|
|
|
191
289
|
# setting dropna to True for filling up nans
|
|
192
290
|
self.features.dropna = True
|
|
@@ -282,6 +380,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
282
380
|
max_n_mod=self.max_n_mod,
|
|
283
381
|
min_freq=self.min_freq,
|
|
284
382
|
dropna=self.config.dropna,
|
|
383
|
+
min_freq_alpha=self.config.min_freq_alpha,
|
|
285
384
|
)
|
|
286
385
|
|
|
287
386
|
with Pool(processes=self.config.n_jobs) as pool:
|
|
@@ -294,6 +393,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
294
393
|
"increasing the size of X_dev or dropping the feature (X not "
|
|
295
394
|
"representative of X_dev for this feature)."
|
|
296
395
|
)
|
|
396
|
+
self.dropped_features.append(updated_feature)
|
|
297
397
|
self.features.remove(updated_feature.version)
|
|
298
398
|
|
|
299
399
|
@abstractmethod
|
|
@@ -324,7 +424,13 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
324
424
|
|
|
325
425
|
# getting best combination
|
|
326
426
|
best_combination = self.combination_evaluator.get_best_combination(
|
|
327
|
-
feature,
|
|
427
|
+
feature,
|
|
428
|
+
xagg,
|
|
429
|
+
xagg_dev,
|
|
430
|
+
max_n_mod=self.max_n_mod,
|
|
431
|
+
min_freq=self.min_freq,
|
|
432
|
+
dropna=self.config.dropna,
|
|
433
|
+
min_freq_alpha=self.config.min_freq_alpha,
|
|
328
434
|
)
|
|
329
435
|
|
|
330
436
|
# printing carved distribution, for found, suitable combination
|
|
@@ -343,6 +449,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
343
449
|
f"WARNING: No robust combination for {feature}. Consider increasing the size of "
|
|
344
450
|
"X_dev or dropping the feature (X not representative of X_dev for this feature)."
|
|
345
451
|
)
|
|
452
|
+
self.dropped_features.append(feature)
|
|
346
453
|
self.features.remove(feature.version)
|
|
347
454
|
|
|
348
455
|
def _print_xagg(
|
|
@@ -461,6 +568,16 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
461
568
|
config=config,
|
|
462
569
|
)
|
|
463
570
|
instance.is_fitted = is_fitted
|
|
571
|
+
|
|
572
|
+
# deserializing dropped_features (mirrors Features.load type-dispatch)
|
|
573
|
+
for fjson in data.pop("dropped_features", []):
|
|
574
|
+
if fjson.get("is_categorical"):
|
|
575
|
+
instance.dropped_features.append(CategoricalFeature.load(fjson))
|
|
576
|
+
elif fjson.get("is_ordinal"):
|
|
577
|
+
instance.dropped_features.append(OrdinalFeature.load(fjson))
|
|
578
|
+
elif fjson.get("is_quantitative"):
|
|
579
|
+
instance.dropped_features.append(QuantitativeFeature.load(fjson))
|
|
580
|
+
|
|
464
581
|
return instance
|
|
465
582
|
|
|
466
583
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Module for binary combination evaluators."""
|
|
2
2
|
|
|
3
|
+
import math
|
|
3
4
|
from abc import ABC
|
|
4
5
|
from collections.abc import Iterable, Iterator
|
|
5
6
|
|
|
@@ -12,9 +13,11 @@ from AutoCarver.combinations.binary.binary_target_rates import BinaryTargetRate,
|
|
|
12
13
|
from AutoCarver.combinations.utils.combination_evaluator import (
|
|
13
14
|
AggregatedSample,
|
|
14
15
|
CombinationEvaluator,
|
|
16
|
+
_nan_fanout_variants,
|
|
15
17
|
)
|
|
16
18
|
from AutoCarver.combinations.utils.combinations import combination_formatter
|
|
17
19
|
from AutoCarver.combinations.utils.target_rate import TargetRate
|
|
20
|
+
from AutoCarver.features import GroupedList
|
|
18
21
|
|
|
19
22
|
|
|
20
23
|
class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
|
|
@@ -25,6 +28,9 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
|
|
|
25
28
|
# narrow inherited attribute: binary evaluators always carry a BinaryTargetRate
|
|
26
29
|
# (enforced by _init_target_rate).
|
|
27
30
|
target_rate: BinaryTargetRate
|
|
31
|
+
# narrow inherited `sort_by: str | None`: concrete binary subclasses
|
|
32
|
+
# (TschuprowtCombinations, CramervCombinations) always set this to a str.
|
|
33
|
+
sort_by: str
|
|
28
34
|
|
|
29
35
|
def _init_target_rate(self, target_rate: TargetRate[pd.DataFrame] | None) -> BinaryTargetRate:
|
|
30
36
|
"""Initializes target rate."""
|
|
@@ -194,6 +200,191 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
|
|
|
194
200
|
tol=tol,
|
|
195
201
|
)
|
|
196
202
|
|
|
203
|
+
def _get_best_combination_non_nan(self) -> dict | None:
    """Search the best viable consecutive combination with a top-K DP.

    Instead of enumerating every consecutive combination, candidates come
    from :func:`_top_k_partitions_chi2_dp`, which returns the ``top_k`` best
    consecutive partitions ranked by ``self.sort_by`` (descending).

    **Progressive search.** The first pass asks for
    ``self.dp_top_k_initial`` candidates. When ``_walk_for_viable`` finds no
    viable candidate and reports having walked at least ``top_k`` entries,
    ``top_k`` is doubled and the DP is run again; the walk resumes from the
    position it had reached. The loop ends as soon as a viable candidate is
    found, or when fewer than ``top_k`` entries were walked — i.e. the DP
    had no more partitions to offer.

    Returns:
        The viable combination (as returned by ``_walk_for_viable``) or
        ``None`` when there is none, or when the train sample has at most
        one row.

    Raises:
        RuntimeError: when the feature has no labels.
    """
    labels = self.feature.labels
    if labels is None:
        raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
    raw_labels = GroupedList(labels[:])

    if self.feature.has_nan:
        if self.feature.dropna:
            raw_labels.remove(self.feature.nan)
        # NOTE(review): nesting inferred from the has_nan=True/dropna=False
        # case described below (nan kept in labels but absent from xagg) —
        # confirm this call sits outside the ``dropna`` test.
        self.samples.dropna(self.feature.nan)

    # nothing to combine with at most one row
    if self.samples.train.shape[0] <= 1:
        return None

    self._historize_raw_combination()

    # Counts are aligned on raw_labels, not on the crosstab index: rows of the
    # crosstab that are not in raw_labels are left out, and labels missing
    # from the crosstab count for zero observations.
    train_xagg = self.samples.train.xagg
    column0 = train_xagg.iloc[:, 0].to_numpy(dtype=float)
    column1 = train_xagg.iloc[:, 1].to_numpy(dtype=float)
    row_of = {modality: row for row, modality in enumerate(train_xagg.index)}
    raw_index = list(raw_labels)
    rows = [row_of.get(modality) for modality in raw_index]
    n0_per_mod = np.array([0.0 if row is None else column0[row] for row in rows], dtype=float)
    n1_per_mod = np.array([0.0 if row is None else column1[row] for row in rows], dtype=float)

    # progressive top-K with doubling
    top_k = self.dp_top_k_initial
    walked = 0
    while True:
        associations = _top_k_partitions_chi2_dp(
            n0_per_mod,
            n1_per_mod,
            max_n_mod=self.max_n_mod,
            raw_index=raw_index,
            sort_by=self.sort_by,
            top_k=top_k,
        )
        viable, walked = self._walk_for_viable(associations, start=walked)
        # stop on success, or once the DP returned fewer entries than asked
        if viable is not None or walked < top_k:
            break
        top_k *= 2

    self._apply_best_combination(viable)
    return viable
|
|
288
|
+
|
|
289
|
+
def _get_best_combination_with_nan(self, best_combination: dict | None) -> dict | None:
    """Search the best viable combination once NaNs are placed, using the DP.

    Steps:

    1. run :func:`_top_k_partitions_chi2_dp` on the non-nan labels only, to
       get the top-K base consecutive partitions;
    2. let :func:`_score_nan_variants_chi2` fan every base partition out
       across NaN placements and score each variant against the counts of
       **all** rows of ``samples.train.xagg`` (nan row included);
    3. walk the scored variants with ``_walk_nan_variants`` for the first
       viable one; the shared ``historized`` set is passed along so that
       variants already seen with a smaller ``top_k`` can be recognised;
    4. when nothing is viable and the DP returned as many partitions as
       requested, double ``top_k`` and start again.

    When ``self.dropna``, ``self.feature.has_nan`` and a non-``None``
    ``best_combination`` are not all satisfied, the parent implementation is
    used instead.

    Args:
        best_combination: best combination found on non-nan modalities.

    Returns:
        The viable combination with NaNs placed, or ``None``.

    Raises:
        RuntimeError: when the feature has no labels.
    """
    if not self.dropna or not self.feature.has_nan or best_combination is None:
        return super()._get_best_combination_with_nan(best_combination)

    if self.verbose:
        print(f"[{self.__name__}] Grouping NaNs")

    labels = self.feature.labels
    if labels is None:
        raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
    nan_label = self.feature.nan
    raw_labels = GroupedList(labels[:])
    raw_labels.remove(nan_label)

    # counts of every row of the crosstab (the nan row is expected in there)
    train_xagg = self.samples.train.xagg
    n0_per_mod = train_xagg.iloc[:, 0].to_numpy(dtype=float)
    n1_per_mod = train_xagg.iloc[:, 1].to_numpy(dtype=float)
    n_obs = float(n0_per_mod.sum() + n1_per_mod.sum())
    mod_to_pos: dict = {modality: row for row, modality in enumerate(train_xagg.index)}
    n_mod = len(mod_to_pos)
    tol = 1e-10

    # counts restricted to non-nan labels, in raw_labels order, for the base DP
    non_nan_index = list(raw_labels)
    non_nan_rows = [mod_to_pos[modality] for modality in non_nan_index]
    n0_non_nan = np.array([n0_per_mod[row] for row in non_nan_rows], dtype=float)
    n1_non_nan = np.array([n1_per_mod[row] for row in non_nan_rows], dtype=float)

    historized: set[tuple] = set()
    base_top_k = self.dp_top_k_initial
    while True:
        base_partitions = _top_k_partitions_chi2_dp(
            n0_non_nan,
            n1_non_nan,
            max_n_mod=self.max_n_mod,
            raw_index=non_nan_index,
            sort_by=self.sort_by,
            top_k=base_top_k,
            tol=tol,
        )
        scored = _score_nan_variants_chi2(
            base_partitions=base_partitions,
            nan_label=nan_label,
            raw_labels=non_nan_index,
            max_n_mod=self.max_n_mod,
            n0_per_mod=n0_per_mod,
            n1_per_mod=n1_per_mod,
            n_obs=n_obs,
            mod_to_pos=mod_to_pos,
            n_mod=n_mod,
            tol=tol,
            sort_by=self.sort_by,
        )
        viable = self._walk_nan_variants(scored, historized)
        # stop on success, or once the DP has no more base partitions to give
        if viable is not None or len(base_partitions) < base_top_k:
            break
        base_top_k *= 2

    self._apply_best_combination(viable)
    return viable
|
|
387
|
+
|
|
197
388
|
|
|
198
389
|
class TschuprowtCombinations(BinaryCombinationEvaluator):
|
|
199
390
|
"""Tschuprow's T based combination evaluation toolkit"""
|
|
@@ -408,3 +599,179 @@ def _chi2_assoc_batch(
|
|
|
408
599
|
"cramerv": float(cramerv_q[b]),
|
|
409
600
|
"tschuprowt": float(tt_q[b]),
|
|
410
601
|
}
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _top_k_partitions_chi2_dp( # noqa: C901
|
|
605
|
+
n0_per_mod: np.ndarray,
|
|
606
|
+
n1_per_mod: np.ndarray,
|
|
607
|
+
*,
|
|
608
|
+
max_n_mod: int,
|
|
609
|
+
raw_index: list,
|
|
610
|
+
sort_by: str = "tschuprowt",
|
|
611
|
+
top_k: int = 1000,
|
|
612
|
+
tol: float = 1e-10,
|
|
613
|
+
) -> list[dict]:
|
|
614
|
+
"""Top-K consecutive-segmentation partitions ranked by a chi²-derived metric.
|
|
615
|
+
|
|
616
|
+
Binary analogue of
|
|
617
|
+
:func:`AutoCarver.combinations.continuous.continuous_combination_evaluators._top_k_partitions_kruskal_dp`.
|
|
618
|
+
|
|
619
|
+
The per-segment chi² cell contribution
|
|
620
|
+
|
|
621
|
+
.. math::
|
|
622
|
+
|
|
623
|
+
c_g = (n_{0,g} + \\tau - E_{0,g})^2 / E_{0,g}
|
|
624
|
+
+ (n_{1,g} + \\tau - E_{1,g})^2 / E_{1,g}
|
|
625
|
+
|
|
626
|
+
(with Yates correction iff the combination has exactly 2 groups) is
|
|
627
|
+
additive across groups **given a fixed number of groups k**: the column
|
|
628
|
+
marginals ``C[c] = N_c + k·τ`` and total ``N = N₀ + N₁ + 2k·τ`` depend
|
|
629
|
+
only on ``k``, not on the split positions. So we run a separate interval-DP
|
|
630
|
+
per ``k ∈ [2, K]`` and merge.
|
|
631
|
+
|
|
632
|
+
Returns a list of ``{combination, index_to_groupby, cramerv, tschuprowt}``
|
|
633
|
+
sorted by ``sort_by`` desc — mirrors the yield shape of
|
|
634
|
+
:meth:`_compute_associations` so it drops into the streaming pipeline.
|
|
635
|
+
|
|
636
|
+
Complexity: O(K² · n_mod² · top_k · log top_k). Independent of the
|
|
637
|
+
combination count (which can reach ~8 M at ``n_mod=40, max_n_mod=7``).
|
|
638
|
+
|
|
639
|
+
Edge cases (mirror :func:`_chi2_assoc_for_combination`):
|
|
640
|
+
|
|
641
|
+
* ``max_n_mod < 2`` or ``n_mod < 2``: returns ``[]``.
|
|
642
|
+
* ``sort_by`` must be ``"cramerv"`` or ``"tschuprowt"``.
|
|
643
|
+
"""
|
|
644
|
+
if sort_by not in ("cramerv", "tschuprowt"):
|
|
645
|
+
raise ValueError(f"sort_by must be 'cramerv' or 'tschuprowt', got {sort_by!r}")
|
|
646
|
+
|
|
647
|
+
n_mod = len(raw_index)
|
|
648
|
+
K = min(max_n_mod, n_mod)
|
|
649
|
+
if K < 2:
|
|
650
|
+
return []
|
|
651
|
+
|
|
652
|
+
n0_prefix = np.concatenate([[0.0], np.cumsum(n0_per_mod.astype(np.float64))])
|
|
653
|
+
n1_prefix = np.concatenate([[0.0], np.cumsum(n1_per_mod.astype(np.float64))])
|
|
654
|
+
N0_total = float(n0_prefix[-1])
|
|
655
|
+
N1_total = float(n1_prefix[-1])
|
|
656
|
+
n_obs = N0_total + N1_total
|
|
657
|
+
|
|
658
|
+
# Collected across all k: (sort_key, cramerv_q, tschuprowt_q, splits)
|
|
659
|
+
all_entries: list[tuple[float, float, float, tuple[int, ...]]] = []
|
|
660
|
+
|
|
661
|
+
for k_groups in range(2, K + 1):
|
|
662
|
+
C0 = N0_total + k_groups * tol
|
|
663
|
+
C1 = N1_total + k_groups * tol
|
|
664
|
+
N_with_tol = N0_total + N1_total + 2.0 * k_groups * tol
|
|
665
|
+
yates = k_groups == 2
|
|
666
|
+
|
|
667
|
+
def seg_cost(
|
|
668
|
+
i: int, j: int, _C0: float = C0, _C1: float = C1, _N: float = N_with_tol, _yates: bool = yates
|
|
669
|
+
) -> float:
|
|
670
|
+
obs0 = (n0_prefix[j] - n0_prefix[i]) + tol
|
|
671
|
+
obs1 = (n1_prefix[j] - n1_prefix[i]) + tol
|
|
672
|
+
R = obs0 + obs1
|
|
673
|
+
E0 = R * _C0 / _N
|
|
674
|
+
E1 = R * _C1 / _N
|
|
675
|
+
if _yates:
|
|
676
|
+
d0 = E0 - obs0
|
|
677
|
+
d1 = E1 - obs1
|
|
678
|
+
obs0 = obs0 + (1.0 if d0 > 0 else (-1.0 if d0 < 0 else 0.0)) * min(0.5, abs(d0))
|
|
679
|
+
obs1 = obs1 + (1.0 if d1 > 0 else (-1.0 if d1 < 0 else 0.0)) * min(0.5, abs(d1))
|
|
680
|
+
return (obs0 - E0) ** 2 / E0 + (obs1 - E1) ** 2 / E1
|
|
681
|
+
|
|
682
|
+
# dp[g][j] holds up to ``top_k`` (chi2_partial, splits_tuple) pairs sorted
|
|
683
|
+
# desc, where splits_tuple = (0, s_1, ..., s_{g-1}, j). g = number of
|
|
684
|
+
# groups in the prefix, j = right boundary.
|
|
685
|
+
dp: list[list[list[tuple[float, tuple[int, ...]]]]] = [
|
|
686
|
+
[[] for _ in range(n_mod + 1)] for _ in range(k_groups + 1)
|
|
687
|
+
]
|
|
688
|
+
for j in range(1, n_mod + 1):
|
|
689
|
+
dp[1][j] = [(seg_cost(0, j), (0, j))]
|
|
690
|
+
for g in range(2, k_groups + 1):
|
|
691
|
+
for j in range(g, n_mod + 1):
|
|
692
|
+
candidates: list[tuple[float, tuple[int, ...]]] = []
|
|
693
|
+
for i in range(g - 1, j):
|
|
694
|
+
c = seg_cost(i, j)
|
|
695
|
+
for prev_s, prev_splits in dp[g - 1][i]:
|
|
696
|
+
candidates.append((prev_s + c, prev_splits + (j,)))
|
|
697
|
+
if candidates:
|
|
698
|
+
candidates.sort(key=lambda x: x[0], reverse=True)
|
|
699
|
+
dp[g][j] = candidates[:top_k]
|
|
700
|
+
|
|
701
|
+
# Translate chi² → cramerv (quantised) → tschuprowt (quantised). Matches
|
|
702
|
+
# :func:`_chi2_assoc_for_combination` cell-for-cell.
|
|
703
|
+
denom = (k_groups - 1) ** 0.25 # k_groups ≥ 2 here, so denom > 0
|
|
704
|
+
for chi2, splits in dp[k_groups][n_mod]:
|
|
705
|
+
cramerv_raw = (chi2 / n_obs) ** 0.5
|
|
706
|
+
cramerv_q = round(cramerv_raw / tol) * tol
|
|
707
|
+
tt_raw = cramerv_q / denom
|
|
708
|
+
tt_q = round(tt_raw / tol) * tol
|
|
709
|
+
sort_key = tt_q if sort_by == "tschuprowt" else cramerv_q
|
|
710
|
+
all_entries.append((sort_key, cramerv_q, tt_q, splits))
|
|
711
|
+
|
|
712
|
+
all_entries.sort(key=lambda x: x[0], reverse=True)
|
|
713
|
+
all_entries = all_entries[:top_k]
|
|
714
|
+
|
|
715
|
+
out: list[dict] = []
|
|
716
|
+
for _, cv, tt, splits in all_entries:
|
|
717
|
+
combination = [list(raw_index[splits[g] : splits[g + 1]]) for g in range(len(splits) - 1)]
|
|
718
|
+
out.append(
|
|
719
|
+
{
|
|
720
|
+
"combination": combination,
|
|
721
|
+
"index_to_groupby": combination_formatter(combination),
|
|
722
|
+
"cramerv": float(cv),
|
|
723
|
+
"tschuprowt": float(tt),
|
|
724
|
+
}
|
|
725
|
+
)
|
|
726
|
+
return out
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _score_nan_variants_chi2(
    *,
    base_partitions: list[dict],
    nan_label: str,
    raw_labels: list,
    max_n_mod: int,
    n0_per_mod: np.ndarray,
    n1_per_mod: np.ndarray,
    n_obs: float,
    mod_to_pos: dict,
    n_mod: int,
    tol: float,
    sort_by: str,
) -> list[dict]:
    """Score every NaN fan-out variant and rank them by ``sort_by``.

    Variants are produced by ``_nan_fanout_variants`` from the base
    partitions; each one is scored with ``_chi2_assoc_for_combination``,
    which yields its Cramér's V and Tschuprow's T.

    Args:
        base_partitions: base partitions over non-nan labels.
        nan_label: label of the nan modality.
        raw_labels: ordered non-nan labels.
        max_n_mod: maximum number of groups of a variant.
        n0_per_mod: per-modality counts of the first target column.
        n1_per_mod: per-modality counts of the second target column.
        n_obs: total number of observations.
        mod_to_pos: position of each modality in the count arrays.
        n_mod: number of modalities.
        tol: tolerance forwarded to the scoring function.
        sort_by: key of the metric used for ranking.

    Returns:
        Dicts with keys ``combination``, ``index_to_groupby``, ``cramerv`` and
        ``tschuprowt``, sorted by ``sort_by`` descending; variants whose
        metric is ``None`` or NaN come last.
    """
    variants: list[dict] = []
    for variant in _nan_fanout_variants(base_partitions, nan_label, raw_labels, max_n_mod):
        groupby = combination_formatter(variant)
        cramerv, tschuprowt = _chi2_assoc_for_combination(
            n0_per_mod=n0_per_mod,
            n1_per_mod=n1_per_mod,
            n_obs=n_obs,
            mod_to_pos=mod_to_pos,
            n_mod=n_mod,
            index_to_groupby=groupby,
            tol=tol,
        )
        variants.append(
            {
                "combination": variant,
                "index_to_groupby": groupby,
                "cramerv": cramerv,
                "tschuprowt": tschuprowt,
            }
        )

    def _rank(entry: dict) -> float:
        """Sorting key; an undefined metric ranks below any real value."""
        value = entry[sort_by]
        undefined = value is None or (isinstance(value, float) and math.isnan(value))
        return float("-inf") if undefined else float(value)

    # stable sort: equally-ranked variants keep their fan-out order
    return sorted(variants, key=_rank, reverse=True)
|
|
@@ -33,14 +33,15 @@ class BinaryTargetRate(TargetRate[pd.DataFrame], ABC):
|
|
|
33
33
|
"""
|
|
34
34
|
# checking for an xtab
|
|
35
35
|
if xagg is not None:
|
|
36
|
-
# frequency per modality
|
|
37
|
-
|
|
36
|
+
# count + frequency per modality (count carried for CI-based viability tests)
|
|
37
|
+
count = xagg.sum(axis=1)
|
|
38
|
+
frequency = count / count.sum()
|
|
38
39
|
|
|
39
40
|
# computing target rate. `_compute` expects pd.DataFrame (Generic
|
|
40
41
|
# XAgg=DataFrame); compute()'s wide signature is for LSP matching,
|
|
41
42
|
# callers always pass a crosstab here.
|
|
42
43
|
return pd.DataFrame(
|
|
43
|
-
{self.__name__: self._compute(xagg), "frequency": frequency} # type: ignore
|
|
44
|
+
{self.__name__: self._compute(xagg), "frequency": frequency, "count": count} # type: ignore
|
|
44
45
|
)
|
|
45
46
|
return None
|
|
46
47
|
|