AutoCarver 7.2.2__tar.gz → 7.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/base_carver.py +120 -3
  2. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_combination_evaluators.py +367 -0
  3. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_target_rates.py +4 -3
  4. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/continuous_combination_evaluators.py +363 -3
  5. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/continuous_target_rates.py +5 -4
  6. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/combination_evaluator.py +105 -14
  7. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/testing.py +25 -7
  8. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/categorical_discretizer.py +24 -7
  9. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/ordinal_discretizer.py +18 -3
  10. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/quantitative_discretizer.py +22 -20
  11. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/base_discretizer.py +12 -3
  12. autocarver-7.2.6/AutoCarver/discretizers/utils/frequency_ci.py +74 -0
  13. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/features.py +2 -0
  14. {autocarver-7.2.2 → autocarver-7.2.6}/PKG-INFO +1 -1
  15. {autocarver-7.2.2 → autocarver-7.2.6}/pyproject.toml +1 -1
  16. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/__init__.py +0 -0
  17. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/__init__.py +0 -0
  18. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/binary_carver.py +0 -0
  19. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/continuous_carver.py +0 -0
  20. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/multiclass_carver.py +0 -0
  21. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/__init__.py +0 -0
  22. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/pretty_print.py +0 -0
  23. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/__init__.py +0 -0
  24. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/__init__.py +0 -0
  25. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/continuous/__init__.py +0 -0
  26. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/__init__.py +0 -0
  27. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/combinations.py +0 -0
  28. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/utils/target_rate.py +0 -0
  29. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/config.py +0 -0
  30. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/__init__.py +0 -0
  31. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/discretizer.py +0 -0
  32. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/__init__.py +0 -0
  33. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/chained_discretizer.py +0 -0
  34. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/qualitatives/qualitative_discretizer.py +0 -0
  35. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/__init__.py +0 -0
  36. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/quantitatives/continuous_discretizer.py +0 -0
  37. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/__init__.py +0 -0
  38. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/multiprocessing.py +0 -0
  39. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/discretizers/utils/type_discretizers.py +0 -0
  40. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/__init__.py +0 -0
  41. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/__init__.py +0 -0
  42. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/categorical_feature.py +0 -0
  43. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/ordinal_feature.py +0 -0
  44. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/qualitatives/qualitative_feature.py +0 -0
  45. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/__init__.py +0 -0
  46. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/datetime_feature.py +0 -0
  47. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/numerical_feature.py +0 -0
  48. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/quantitatives/quantitative_feature.py +0 -0
  49. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/__init__.py +0 -0
  50. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/base_feature.py +0 -0
  51. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/grouped_list.py +0 -0
  52. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/features/utils/serialization.py +0 -0
  53. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/__init__.py +0 -0
  54. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/classification_selector.py +0 -0
  55. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/__init__.py +0 -0
  56. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/base_filters.py +0 -0
  57. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/qualitative_filters.py +0 -0
  58. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/filters/quantitative_filters.py +0 -0
  59. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/__init__.py +0 -0
  60. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/base_measures.py +0 -0
  61. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/qualitative_measures.py +0 -0
  62. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/measures/quantitative_measures.py +0 -0
  63. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/regression_selector.py +0 -0
  64. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/__init__.py +0 -0
  65. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/base_selector.py +0 -0
  66. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/selectors/utils/pretty_print.py +0 -0
  67. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/__init__.py +0 -0
  68. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/attributes.py +0 -0
  69. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/dependencies.py +0 -0
  70. {autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/utils/extend_docstring.py +0 -0
  71. {autocarver-7.2.2 → autocarver-7.2.6}/README.md +0 -0
@@ -58,6 +58,7 @@ def _carve_feature_worker(
58
58
  max_n_mod: int,
59
59
  min_freq: float,
60
60
  dropna: bool,
61
+ min_freq_alpha: float,
61
62
  ) -> tuple[BaseFeature, bool]:
62
63
  """Picklable worker: scores best combination for a single feature.
63
64
 
@@ -70,11 +71,43 @@ def _carve_feature_worker(
70
71
  # workers never print per-feature progress; the parent prints a single banner
71
72
  evaluator.verbose = False
72
73
  best = evaluator.get_best_combination(
73
- feature, xagg, xagg_dev, max_n_mod=max_n_mod, min_freq=min_freq, dropna=dropna
74
+ feature,
75
+ xagg,
76
+ xagg_dev,
77
+ max_n_mod=max_n_mod,
78
+ min_freq=min_freq,
79
+ dropna=dropna,
80
+ min_freq_alpha=min_freq_alpha,
74
81
  )
75
82
  return feature, best is not None
76
83
 
77
84
 
85
+ def _drop_reason_from_history(history: pd.DataFrame) -> str:
86
+ """Synthesizes a human-readable drop reason from a dropped feature's history.
87
+
88
+ Picks the most frequent failing-test message across ``train``/``dev`` blocks
89
+ of historized non-viable combinations.
90
+ """
91
+ if history.empty:
92
+ return "No combination historized"
93
+
94
+ info_counts: dict[str, int] = {}
95
+ for _, row in history.iterrows():
96
+ if bool(row.get("viable", False)):
97
+ continue
98
+ for block_key in ("train", "dev"):
99
+ block = row.get(block_key)
100
+ if isinstance(block, dict):
101
+ msg = block.get("info") or ""
102
+ if msg:
103
+ info_counts[msg] = info_counts.get(msg, 0) + 1
104
+
105
+ if not info_counts:
106
+ return "No robust combination"
107
+ msg, _ = max(info_counts.items(), key=lambda kv: kv[1])
108
+ return f"No robust combination ({msg})"
109
+
110
+
78
111
  def _replace_feature_in_features(features: Features, updated: BaseFeature) -> None:
79
112
  """Swaps an existing feature (by version) for the worker-returned copy."""
80
113
  if isinstance(updated, CategoricalFeature):
@@ -164,6 +197,22 @@ class BaseCarver(BaseDiscretizer, ABC):
164
197
  combination_evaluator.verbose = self.config.verbose
165
198
  self.combination_evaluator: CombinationEvaluator = combination_evaluator
166
199
 
200
+ # features dropped by the carver because no robust combination was found.
201
+ # Kept (not cleared on re-fit) so users can inspect why each dropped via
202
+ # the marker columns added to ``summary`` / ``history``.
203
+ self.dropped_features: list[BaseFeature] = []
204
+
205
@property
def half_min_freq(self) -> float:
    """Return half of :attr:`min_freq`.

    This is the frequency floor the carver hands to ``discretize`` before the
    combination search, so that the evaluator has finer-grained modalities to
    recombine. The halving lives on the carver rather than inside the
    individual discretizers.
    """
    halved_floor = self.min_freq / 2
    return halved_floor
215
+
167
216
  @property
168
217
  def pretty_print(self) -> bool:
169
218
  """Returns the pretty_print attribute"""
@@ -173,8 +222,57 @@ class BaseCarver(BaseDiscretizer, ABC):
173
222
  content = super().to_json(light_mode)
174
223
  content["max_n_mod"] = self.max_n_mod
175
224
  content["combination_evaluator"] = self.combination_evaluator.to_json()
225
+ content["dropped_features"] = [f.to_json(light_mode) for f in self.dropped_features]
176
226
  return content
177
227
 
228
@property
def summary(self) -> pd.DataFrame:
    """Per-feature carving summary, including the features the carver dropped.

    Rows of the kept features come first, rows of ``dropped_features`` follow.
    Two marker columns are added to every row:

    - ``dropped`` (bool): ``True`` only for rows of a dropped feature.
    - ``dropped_reason`` (str | None): ``None`` for kept features, otherwise
      the text produced by ``_drop_reason_from_history`` from the dropped
      feature's history.

    The result is indexed by ``feature``, then every column that is not a
    value/marker column, then ``label``. An empty frame is returned as is.
    """
    kept_rows = [
        {**entry, "dropped": False, "dropped_reason": None}
        for kept in self.features
        for entry in kept.summary
    ]

    dropped_rows: list[dict] = []
    for dropped in self.dropped_features:
        # the reason is computed once per feature and shared by all its rows
        why = _drop_reason_from_history(dropped.history)
        dropped_rows.extend({**entry, "dropped": True, "dropped_reason": why} for entry in dropped.summary)

    table = pd.DataFrame(kept_rows + dropped_rows)
    if table.empty:
        return table

    # columns that stay as values; everything else becomes an index level
    value_columns = {"feature", "label", "content", "target_mean", "frequency", "dropped", "dropped_reason"}
    middle_levels = [name for name in table.columns if name not in value_columns]
    return table.set_index(["feature", *middle_levels, "label"])
256
+
257
@property
def history(self) -> pd.DataFrame:
    """Combination history of the kept features followed by the dropped ones.

    Rows coming from ``self.features.history`` are tagged ``dropped=False``.
    Each dropped feature with at least one history row contributes its rows
    tagged ``dropped=True``, with ``feature`` set to ``str(feature)``.
    Returns an empty frame when there is nothing to report.
    """
    kept_history = self.features.history
    parts: list[pd.DataFrame] = [] if kept_history.empty else [kept_history.assign(dropped=False)]

    for dropped in self.dropped_features:
        dropped_history = dropped.history
        if len(dropped_history):
            parts.append(dropped_history.assign(feature=str(dropped), dropped=True))

    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
275
+
178
276
  def _prepare_samples(self, samples: Samples) -> Samples:
179
277
  """Validates format and content of X and y."""
180
278
  if samples.train.y is None:
@@ -186,7 +284,7 @@ class BaseCarver(BaseDiscretizer, ABC):
186
284
 
187
285
  # discretizing features at half min_freq so the carver has a finer
188
286
  # granularity to combine when forming optimal groups
189
- samples = discretize(self.features, samples, self.min_freq / 2, self.config)
287
+ samples = discretize(self.features, samples, self.half_min_freq, self.config)
190
288
 
191
289
  # setting dropna to True for filling up nans
192
290
  self.features.dropna = True
@@ -282,6 +380,7 @@ class BaseCarver(BaseDiscretizer, ABC):
282
380
  max_n_mod=self.max_n_mod,
283
381
  min_freq=self.min_freq,
284
382
  dropna=self.config.dropna,
383
+ min_freq_alpha=self.config.min_freq_alpha,
285
384
  )
286
385
 
287
386
  with Pool(processes=self.config.n_jobs) as pool:
@@ -294,6 +393,7 @@ class BaseCarver(BaseDiscretizer, ABC):
294
393
  "increasing the size of X_dev or dropping the feature (X not "
295
394
  "representative of X_dev for this feature)."
296
395
  )
396
+ self.dropped_features.append(updated_feature)
297
397
  self.features.remove(updated_feature.version)
298
398
 
299
399
  @abstractmethod
@@ -324,7 +424,13 @@ class BaseCarver(BaseDiscretizer, ABC):
324
424
 
325
425
  # getting best combination
326
426
  best_combination = self.combination_evaluator.get_best_combination(
327
- feature, xagg, xagg_dev, max_n_mod=self.max_n_mod, min_freq=self.min_freq, dropna=self.config.dropna
427
+ feature,
428
+ xagg,
429
+ xagg_dev,
430
+ max_n_mod=self.max_n_mod,
431
+ min_freq=self.min_freq,
432
+ dropna=self.config.dropna,
433
+ min_freq_alpha=self.config.min_freq_alpha,
328
434
  )
329
435
 
330
436
  # printing carved distribution, for found, suitable combination
@@ -343,6 +449,7 @@ class BaseCarver(BaseDiscretizer, ABC):
343
449
  f"WARNING: No robust combination for {feature}. Consider increasing the size of "
344
450
  "X_dev or dropping the feature (X not representative of X_dev for this feature)."
345
451
  )
452
+ self.dropped_features.append(feature)
346
453
  self.features.remove(feature.version)
347
454
 
348
455
  def _print_xagg(
@@ -461,6 +568,16 @@ class BaseCarver(BaseDiscretizer, ABC):
461
568
  config=config,
462
569
  )
463
570
  instance.is_fitted = is_fitted
571
+
572
+ # deserializing dropped_features (mirrors Features.load type-dispatch)
573
+ for fjson in data.pop("dropped_features", []):
574
+ if fjson.get("is_categorical"):
575
+ instance.dropped_features.append(CategoricalFeature.load(fjson))
576
+ elif fjson.get("is_ordinal"):
577
+ instance.dropped_features.append(OrdinalFeature.load(fjson))
578
+ elif fjson.get("is_quantitative"):
579
+ instance.dropped_features.append(QuantitativeFeature.load(fjson))
580
+
464
581
  return instance
465
582
 
466
583
 
@@ -1,5 +1,6 @@
1
1
  """Module for binary combination evaluators."""
2
2
 
3
+ import math
3
4
  from abc import ABC
4
5
  from collections.abc import Iterable, Iterator
5
6
 
@@ -12,9 +13,11 @@ from AutoCarver.combinations.binary.binary_target_rates import BinaryTargetRate,
12
13
  from AutoCarver.combinations.utils.combination_evaluator import (
13
14
  AggregatedSample,
14
15
  CombinationEvaluator,
16
+ _nan_fanout_variants,
15
17
  )
16
18
  from AutoCarver.combinations.utils.combinations import combination_formatter
17
19
  from AutoCarver.combinations.utils.target_rate import TargetRate
20
+ from AutoCarver.features import GroupedList
18
21
 
19
22
 
20
23
  class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
@@ -25,6 +28,9 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
25
28
  # narrow inherited attribute: binary evaluators always carry a BinaryTargetRate
26
29
  # (enforced by _init_target_rate).
27
30
  target_rate: BinaryTargetRate
31
+ # narrow inherited `sort_by: str | None`: concrete binary subclasses
32
+ # (TschuprowtCombinations, CramervCombinations) always set this to a str.
33
+ sort_by: str
28
34
 
29
35
  def _init_target_rate(self, target_rate: TargetRate[pd.DataFrame] | None) -> BinaryTargetRate:
30
36
  """Initializes target rate."""
@@ -194,6 +200,191 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
194
200
  tol=tol,
195
201
  )
196
202
 
203
def _get_best_combination_non_nan(self) -> dict | None:
    """Search the best viable combination of non-nan labels with the chi² DP.

    Candidates come from :func:`_top_k_partitions_chi2_dp`, which ranks
    consecutive partitions by ``self.sort_by`` in descending order.

    The search is progressive: it starts with ``self.dp_top_k_initial``
    candidates and hands them to ``_walk_for_viable``, resuming from the
    position reached on the previous round. When no viable candidate is
    found and the walk position has reached the requested ``top_k``, the
    budget is doubled and the DP is run again. A walk position below the
    budget is taken to mean the DP had nothing more to offer, so the search
    stops without a winner.
    NOTE(review): this relies on ``_walk_for_viable`` returning the number of
    candidates consumed so far — confirm against its definition.

    Returns:
        The first viable combination, or ``None`` when the train sample has
        at most one row or no candidate is viable. Except for the
        at-most-one-row early exit, the outcome (possibly ``None``) is also
        passed to ``_apply_best_combination``.

    Raises:
        RuntimeError: when the feature's labels are not populated.
    """
    labels = self.feature.labels
    if labels is None:
        raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
    candidate_labels = GroupedList(labels[:])

    # nan is set aside only when the feature carries it and asks to drop it
    if self.feature.has_nan and self.feature.dropna:
        candidate_labels.remove(self.feature.nan)
        self.samples.dropna(self.feature.nan)

    if self.samples.train.shape[0] <= 1:
        return None

    self._historize_raw_combination()

    crosstab = self.samples.train.xagg
    row_of = {modality: row for row, modality in enumerate(crosstab.index)}
    ordered_labels = list(candidate_labels)

    def _aligned_counts(column: int) -> np.ndarray:
        """Counts of one crosstab column in label order; absent labels count as 0."""
        column_counts = crosstab.iloc[:, column].to_numpy(dtype=float)
        return np.fromiter(
            (column_counts[row_of[label]] if label in row_of else 0.0 for label in ordered_labels),
            dtype=float,
            count=len(ordered_labels),
        )

    # crosstab rows without a matching label are simply left out
    n0_per_mod = _aligned_counts(0)
    n1_per_mod = _aligned_counts(1)

    budget = self.dp_top_k_initial
    cursor = 0
    while True:
        ranked = _top_k_partitions_chi2_dp(
            n0_per_mod,
            n1_per_mod,
            max_n_mod=self.max_n_mod,
            raw_index=ordered_labels,
            sort_by=self.sort_by,
            top_k=budget,
        )
        winner, cursor = self._walk_for_viable(ranked, start=cursor)
        # stop on success, or when the walk ended before reaching the budget
        if winner is not None or cursor < budget:
            break
        budget *= 2

    self._apply_best_combination(winner)
    return winner
288
+
289
def _get_best_combination_with_nan(self, best_combination: dict | None) -> dict | None:
    """Search the best viable placement of the nan modality with the chi² DP.

    Unless ``self.dropna`` is set, the feature has nan and a non-nan
    ``best_combination`` was found, the parent implementation is used.

    Otherwise:

    1. :func:`_top_k_partitions_chi2_dp` ranks consecutive partitions of the
       non-nan labels, using the counts restricted to those labels;
    2. :func:`_score_nan_variants_chi2` fans each base partition out over the
       nan placements and scores every variant against the full
       per-modality counts (nan row included);
    3. ``_walk_nan_variants`` looks for the first viable variant; the
       ``historized`` set is shared across rounds so it can recognise
       variants already seen with a smaller budget;
    4. when nothing is viable and the DP filled the requested budget, the
       budget is doubled and the search repeats. A DP result shorter than
       the budget means every consecutive partition has been produced.

    Returns:
        The first viable variant, or ``None``; on the DP path the outcome is
        also passed to ``_apply_best_combination``.

    Raises:
        RuntimeError: when the feature's labels are not populated.
    """
    if not self.dropna or not self.feature.has_nan or best_combination is None:
        return super()._get_best_combination_with_nan(best_combination)

    if self.verbose:
        print(f"[{self.__name__}] Grouping NaNs")

    labels = self.feature.labels
    if labels is None:
        raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
    non_nan_labels = GroupedList(labels[:])
    non_nan_labels.remove(self.feature.nan)
    nan_label = self.feature.nan

    # full per-modality counts: the nan row is expected to be in the crosstab here
    crosstab = self.samples.train.xagg
    n0_per_mod = crosstab.iloc[:, 0].to_numpy(dtype=float)
    n1_per_mod = crosstab.iloc[:, 1].to_numpy(dtype=float)
    n_obs = float(n0_per_mod.sum() + n1_per_mod.sum())
    mod_to_pos: dict = {modality: row for row, modality in enumerate(crosstab.index)}
    n_mod = len(mod_to_pos)
    tol = 1e-10

    non_nan_index = list(non_nan_labels)

    def _restrict(full_counts: np.ndarray) -> np.ndarray:
        """Counts of the non-nan labels, in label order."""
        return np.fromiter(
            (full_counts[mod_to_pos[label]] for label in non_nan_index),
            dtype=float,
            count=len(non_nan_index),
        )

    n0_non_nan = _restrict(n0_per_mod)
    n1_non_nan = _restrict(n1_per_mod)

    historized: set[tuple] = set()
    budget = self.dp_top_k_initial
    while True:
        base_partitions = _top_k_partitions_chi2_dp(
            n0_non_nan,
            n1_non_nan,
            max_n_mod=self.max_n_mod,
            raw_index=non_nan_index,
            sort_by=self.sort_by,
            top_k=budget,
            tol=tol,
        )
        scored_variants = _score_nan_variants_chi2(
            base_partitions=base_partitions,
            nan_label=nan_label,
            raw_labels=non_nan_index,
            max_n_mod=self.max_n_mod,
            n0_per_mod=n0_per_mod,
            n1_per_mod=n1_per_mod,
            n_obs=n_obs,
            mod_to_pos=mod_to_pos,
            n_mod=n_mod,
            tol=tol,
            sort_by=self.sort_by,
        )
        winner = self._walk_nan_variants(scored_variants, historized)
        # stop on success, or once the DP returned fewer partitions than asked
        if winner is not None or len(base_partitions) < budget:
            break
        budget *= 2

    self._apply_best_combination(winner)
    return winner
387
+
197
388
 
198
389
  class TschuprowtCombinations(BinaryCombinationEvaluator):
199
390
  """Tschuprow's T based combination evaluation toolkit"""
@@ -408,3 +599,179 @@ def _chi2_assoc_batch(
408
599
  "cramerv": float(cramerv_q[b]),
409
600
  "tschuprowt": float(tt_q[b]),
410
601
  }
602
+
603
+
604
+ def _top_k_partitions_chi2_dp( # noqa: C901
605
+ n0_per_mod: np.ndarray,
606
+ n1_per_mod: np.ndarray,
607
+ *,
608
+ max_n_mod: int,
609
+ raw_index: list,
610
+ sort_by: str = "tschuprowt",
611
+ top_k: int = 1000,
612
+ tol: float = 1e-10,
613
+ ) -> list[dict]:
614
+ """Top-K consecutive-segmentation partitions ranked by a chi²-derived metric.
615
+
616
+ Binary analogue of
617
+ :func:`AutoCarver.combinations.continuous.continuous_combination_evaluators._top_k_partitions_kruskal_dp`.
618
+
619
+ The per-segment chi² cell contribution
620
+
621
+ .. math::
622
+
623
+ c_g = (n_{0,g} + \\tau - E_{0,g})^2 / E_{0,g}
624
+ + (n_{1,g} + \\tau - E_{1,g})^2 / E_{1,g}
625
+
626
+ (with Yates correction iff the combination has exactly 2 groups) is
627
+ additive across groups **given a fixed number of groups k**: the column
628
+ marginals ``C[c] = N_c + k·τ`` and total ``N = N₀ + N₁ + 2k·τ`` depend
629
+ only on ``k``, not on the split positions. So we run a separate interval-DP
630
+ per ``k ∈ [2, K]`` and merge.
631
+
632
+ Returns a list of ``{combination, index_to_groupby, cramerv, tschuprowt}``
633
+ sorted by ``sort_by`` desc — mirrors the yield shape of
634
+ :meth:`_compute_associations` so it drops into the streaming pipeline.
635
+
636
+ Complexity: O(K² · n_mod² · top_k · log top_k). Independent of the
637
+ combination count (which can reach ~8 M at ``n_mod=40, max_n_mod=7``).
638
+
639
+ Edge cases (mirror :func:`_chi2_assoc_for_combination`):
640
+
641
+ * ``max_n_mod < 2`` or ``n_mod < 2``: returns ``[]``.
642
+ * ``sort_by`` must be ``"cramerv"`` or ``"tschuprowt"``.
643
+ """
644
+ if sort_by not in ("cramerv", "tschuprowt"):
645
+ raise ValueError(f"sort_by must be 'cramerv' or 'tschuprowt', got {sort_by!r}")
646
+
647
+ n_mod = len(raw_index)
648
+ K = min(max_n_mod, n_mod)
649
+ if K < 2:
650
+ return []
651
+
652
+ n0_prefix = np.concatenate([[0.0], np.cumsum(n0_per_mod.astype(np.float64))])
653
+ n1_prefix = np.concatenate([[0.0], np.cumsum(n1_per_mod.astype(np.float64))])
654
+ N0_total = float(n0_prefix[-1])
655
+ N1_total = float(n1_prefix[-1])
656
+ n_obs = N0_total + N1_total
657
+
658
+ # Collected across all k: (sort_key, cramerv_q, tschuprowt_q, splits)
659
+ all_entries: list[tuple[float, float, float, tuple[int, ...]]] = []
660
+
661
+ for k_groups in range(2, K + 1):
662
+ C0 = N0_total + k_groups * tol
663
+ C1 = N1_total + k_groups * tol
664
+ N_with_tol = N0_total + N1_total + 2.0 * k_groups * tol
665
+ yates = k_groups == 2
666
+
667
+ def seg_cost(
668
+ i: int, j: int, _C0: float = C0, _C1: float = C1, _N: float = N_with_tol, _yates: bool = yates
669
+ ) -> float:
670
+ obs0 = (n0_prefix[j] - n0_prefix[i]) + tol
671
+ obs1 = (n1_prefix[j] - n1_prefix[i]) + tol
672
+ R = obs0 + obs1
673
+ E0 = R * _C0 / _N
674
+ E1 = R * _C1 / _N
675
+ if _yates:
676
+ d0 = E0 - obs0
677
+ d1 = E1 - obs1
678
+ obs0 = obs0 + (1.0 if d0 > 0 else (-1.0 if d0 < 0 else 0.0)) * min(0.5, abs(d0))
679
+ obs1 = obs1 + (1.0 if d1 > 0 else (-1.0 if d1 < 0 else 0.0)) * min(0.5, abs(d1))
680
+ return (obs0 - E0) ** 2 / E0 + (obs1 - E1) ** 2 / E1
681
+
682
+ # dp[g][j] holds up to ``top_k`` (chi2_partial, splits_tuple) pairs sorted
683
+ # desc, where splits_tuple = (0, s_1, ..., s_{g-1}, j). g = number of
684
+ # groups in the prefix, j = right boundary.
685
+ dp: list[list[list[tuple[float, tuple[int, ...]]]]] = [
686
+ [[] for _ in range(n_mod + 1)] for _ in range(k_groups + 1)
687
+ ]
688
+ for j in range(1, n_mod + 1):
689
+ dp[1][j] = [(seg_cost(0, j), (0, j))]
690
+ for g in range(2, k_groups + 1):
691
+ for j in range(g, n_mod + 1):
692
+ candidates: list[tuple[float, tuple[int, ...]]] = []
693
+ for i in range(g - 1, j):
694
+ c = seg_cost(i, j)
695
+ for prev_s, prev_splits in dp[g - 1][i]:
696
+ candidates.append((prev_s + c, prev_splits + (j,)))
697
+ if candidates:
698
+ candidates.sort(key=lambda x: x[0], reverse=True)
699
+ dp[g][j] = candidates[:top_k]
700
+
701
+ # Translate chi² → cramerv (quantised) → tschuprowt (quantised). Matches
702
+ # :func:`_chi2_assoc_for_combination` cell-for-cell.
703
+ denom = (k_groups - 1) ** 0.25 # k_groups ≥ 2 here, so denom > 0
704
+ for chi2, splits in dp[k_groups][n_mod]:
705
+ cramerv_raw = (chi2 / n_obs) ** 0.5
706
+ cramerv_q = round(cramerv_raw / tol) * tol
707
+ tt_raw = cramerv_q / denom
708
+ tt_q = round(tt_raw / tol) * tol
709
+ sort_key = tt_q if sort_by == "tschuprowt" else cramerv_q
710
+ all_entries.append((sort_key, cramerv_q, tt_q, splits))
711
+
712
+ all_entries.sort(key=lambda x: x[0], reverse=True)
713
+ all_entries = all_entries[:top_k]
714
+
715
+ out: list[dict] = []
716
+ for _, cv, tt, splits in all_entries:
717
+ combination = [list(raw_index[splits[g] : splits[g + 1]]) for g in range(len(splits) - 1)]
718
+ out.append(
719
+ {
720
+ "combination": combination,
721
+ "index_to_groupby": combination_formatter(combination),
722
+ "cramerv": float(cv),
723
+ "tschuprowt": float(tt),
724
+ }
725
+ )
726
+ return out
727
+
728
+
729
def _score_nan_variants_chi2(
    *,
    base_partitions: list[dict],
    nan_label: str,
    raw_labels: list,
    max_n_mod: int,
    n0_per_mod: np.ndarray,
    n1_per_mod: np.ndarray,
    n_obs: float,
    mod_to_pos: dict,
    n_mod: int,
    tol: float,
    sort_by: str,
) -> list[dict]:
    """Score every nan fan-out variant with the closed-form chi² metrics.

    Variants are produced by ``_nan_fanout_variants`` from ``base_partitions``
    and each one is scored with ``_chi2_assoc_for_combination`` (Cramér's V
    and Tschuprow's T).

    Returns:
        One ``{"combination", "index_to_groupby", "cramerv", "tschuprowt"}``
        dict per variant, sorted by ``sort_by`` in descending order; a
        ``None`` or NaN score ranks last.
    """

    def _scored(variant: list) -> dict:
        groupby = combination_formatter(variant)
        cramerv, tschuprowt = _chi2_assoc_for_combination(
            n0_per_mod=n0_per_mod,
            n1_per_mod=n1_per_mod,
            n_obs=n_obs,
            mod_to_pos=mod_to_pos,
            n_mod=n_mod,
            index_to_groupby=groupby,
            tol=tol,
        )
        return {
            "combination": variant,
            "index_to_groupby": groupby,
            "cramerv": cramerv,
            "tschuprowt": tschuprowt,
        }

    def _rank(entry: dict) -> float:
        score = entry[sort_by]
        unusable = score is None or (isinstance(score, float) and math.isnan(score))
        return float("-inf") if unusable else float(score)

    variants = _nan_fanout_variants(base_partitions, nan_label, raw_labels, max_n_mod)
    return sorted([_scored(variant) for variant in variants], key=_rank, reverse=True)
@@ -33,14 +33,15 @@ class BinaryTargetRate(TargetRate[pd.DataFrame], ABC):
33
33
  """
34
34
  # checking for an xtab
35
35
  if xagg is not None:
36
- # frequency per modality
37
- frequency = xagg.sum(axis=1) / xagg.sum().sum()
36
+ # count + frequency per modality (count carried for CI-based viability tests)
37
+ count = xagg.sum(axis=1)
38
+ frequency = count / count.sum()
38
39
 
39
40
  # computing target rate. `_compute` expects pd.DataFrame (Generic
40
41
  # XAgg=DataFrame); compute()'s wide signature is for LSP matching,
41
42
  # callers always pass a crosstab here.
42
43
  return pd.DataFrame(
43
- {self.__name__: self._compute(xagg), "frequency": frequency} # type: ignore
44
+ {self.__name__: self._compute(xagg), "frequency": frequency, "count": count} # type: ignore
44
45
  )
45
46
  return None
46
47