PyPI - AutoCarver - Versions diffs - 7.2.2__tar.gz → 7.2.6__tar.gz - Mend

AutoCarver 7.2.2__tar.gz → 7.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

{autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/carvers/utils/base_carver.py RENAMED Viewed

@@ -58,6 +58,7 @@ def _carve_feature_worker(
     max_n_mod: int,
     min_freq: float,
     dropna: bool,
+    min_freq_alpha: float,
 ) -> tuple[BaseFeature, bool]:
     """Picklable worker: scores best combination for a single feature.
@@ -70,11 +71,43 @@ def _carve_feature_worker(
     # workers never print per-feature progress; the parent prints a single banner
     evaluator.verbose = False
     best = evaluator.get_best_combination(
-        feature, xagg, xagg_dev, max_n_mod=max_n_mod, min_freq=min_freq, dropna=dropna
+        feature,
+        xagg,
+        xagg_dev,
+        max_n_mod=max_n_mod,
+        min_freq=min_freq,
+        dropna=dropna,
+        min_freq_alpha=min_freq_alpha,
     )
     return feature, best is not None
+def _drop_reason_from_history(history: pd.DataFrame) -> str:
+    """Synthesizes a human-readable drop reason from a dropped feature's history.
+    Picks the most frequent failing-test message across ``train``/``dev`` blocks
+    of historized non-viable combinations.
+    """
+    if history.empty:
+        return "No combination historized"
+    info_counts: dict[str, int] = {}
+    for _, row in history.iterrows():
+        if bool(row.get("viable", False)):
+            continue
+        for block_key in ("train", "dev"):
+            block = row.get(block_key)
+            if isinstance(block, dict):
+                msg = block.get("info") or ""
+                if msg:
+                    info_counts[msg] = info_counts.get(msg, 0) + 1
+    if not info_counts:
+        return "No robust combination"
+    msg, _ = max(info_counts.items(), key=lambda kv: kv[1])
+    return f"No robust combination ({msg})"
 def _replace_feature_in_features(features: Features, updated: BaseFeature) -> None:
     """Swaps an existing feature (by version) for the worker-returned copy."""
     if isinstance(updated, CategoricalFeature):
@@ -164,6 +197,22 @@ class BaseCarver(BaseDiscretizer, ABC):
         combination_evaluator.verbose = self.config.verbose
         self.combination_evaluator: CombinationEvaluator = combination_evaluator
+        # features dropped by the carver because no robust combination was found.
+        # Kept (not cleared on re-fit) so users can inspect why each dropped via
+        # the marker columns added to ``summary`` / ``history``.
+        self.dropped_features: list[BaseFeature] = []
+    @property
+    def half_min_freq(self) -> float:
+        """Half of :attr:`min_freq` — the tolerant frequency floor the carver
+        applies when discretizing prior to combination search. Halving here gives
+        the combination evaluator a finer granularity to recombine, while the
+        underlying discretizers themselves compare directly against ``min_freq``
+        (with a 1-row tolerance). Owning the halving in the carver — rather than
+        inside individual discretizers — keeps the per-discretizer semantic uniform.
+        """
+        return self.min_freq / 2
     @property
     def pretty_print(self) -> bool:
         """Returns the pretty_print attribute"""
@@ -173,8 +222,57 @@ class BaseCarver(BaseDiscretizer, ABC):
         content = super().to_json(light_mode)
         content["max_n_mod"] = self.max_n_mod
         content["combination_evaluator"] = self.combination_evaluator.to_json()
+        content["dropped_features"] = [f.to_json(light_mode) for f in self.dropped_features]
         return content
+    @property
+    def summary(self) -> pd.DataFrame:
+        """Per-feature carving summary, extended with one block per dropped feature.
+        Rows from features that the carver dropped (no robust combination on
+        train and/or dev) are appended at the end with two marker columns:
+        - ``dropped`` (bool): ``True`` for dropped features, ``False`` otherwise.
+        - ``dropped_reason`` (str | None): synthesized from the feature's history
+          — the dominant failing test message across attempted combinations.
+        """
+        rows: list[dict] = []
+        for feature in self.features:
+            for row in feature.summary:
+                rows.append({**row, "dropped": False, "dropped_reason": None})
+        for feature in self.dropped_features:
+            reason = _drop_reason_from_history(feature.history)
+            for row in feature.summary:
+                rows.append({**row, "dropped": True, "dropped_reason": reason})
+        summaries = pd.DataFrame(rows)
+        if summaries.empty:
+            return summaries
+        excluded = {"feature", "label", "content", "target_mean", "frequency", "dropped", "dropped_reason"}
+        indices = [col for col in summaries.columns if col not in excluded]
+        indices = ["feature"] + indices + ["label"]
+        return summaries.set_index(indices)
+    @property
+    def history(self) -> pd.DataFrame:
+        """Combined combination-history of carved + dropped features.
+        Dropped features' rows are appended with ``dropped=True``; carved
+        features' rows get ``dropped=False``.
+        """
+        frames: list[pd.DataFrame] = []
+        current = self.features.history
+        if not current.empty:
+            frames.append(current.assign(dropped=False))
+        for feature in self.dropped_features:
+            df = feature.history
+            if len(df) > 0:
+                frames.append(df.assign(feature=str(feature), dropped=True))
+        if not frames:
+            return pd.DataFrame()
+        return pd.concat(frames, ignore_index=True)
     def _prepare_samples(self, samples: Samples) -> Samples:
         """Validates format and content of X and y."""
         if samples.train.y is None:
@@ -186,7 +284,7 @@ class BaseCarver(BaseDiscretizer, ABC):
         # discretizing features at half min_freq so the carver has a finer
         # granularity to combine when forming optimal groups
-        samples = discretize(self.features, samples, self.min_freq / 2, self.config)
+        samples = discretize(self.features, samples, self.half_min_freq, self.config)
         # setting dropna to True for filling up nans
         self.features.dropna = True
@@ -282,6 +380,7 @@ class BaseCarver(BaseDiscretizer, ABC):
             max_n_mod=self.max_n_mod,
             min_freq=self.min_freq,
             dropna=self.config.dropna,
+            min_freq_alpha=self.config.min_freq_alpha,
         )
         with Pool(processes=self.config.n_jobs) as pool:
@@ -294,6 +393,7 @@ class BaseCarver(BaseDiscretizer, ABC):
                         "increasing the size of X_dev or dropping the feature (X not "
                         "representative of X_dev for this feature)."
                     )
+                    self.dropped_features.append(updated_feature)
                     self.features.remove(updated_feature.version)
     @abstractmethod
@@ -324,7 +424,13 @@ class BaseCarver(BaseDiscretizer, ABC):
         # getting best combination
         best_combination = self.combination_evaluator.get_best_combination(
-            feature, xagg, xagg_dev, max_n_mod=self.max_n_mod, min_freq=self.min_freq, dropna=self.config.dropna
+            feature,
+            xagg,
+            xagg_dev,
+            max_n_mod=self.max_n_mod,
+            min_freq=self.min_freq,
+            dropna=self.config.dropna,
+            min_freq_alpha=self.config.min_freq_alpha,
         )
         # printing carved distribution, for found, suitable combination
@@ -343,6 +449,7 @@ class BaseCarver(BaseDiscretizer, ABC):
                 f"WARNING: No robust combination for {feature}. Consider increasing the size of "
                 "X_dev or dropping the feature (X not representative of X_dev for this feature)."
             )
+            self.dropped_features.append(feature)
             self.features.remove(feature.version)
     def _print_xagg(
@@ -461,6 +568,16 @@ class BaseCarver(BaseDiscretizer, ABC):
             config=config,
         )
         instance.is_fitted = is_fitted
+        # deserializing dropped_features (mirrors Features.load type-dispatch)
+        for fjson in data.pop("dropped_features", []):
+            if fjson.get("is_categorical"):
+                instance.dropped_features.append(CategoricalFeature.load(fjson))
+            elif fjson.get("is_ordinal"):
+                instance.dropped_features.append(OrdinalFeature.load(fjson))
+            elif fjson.get("is_quantitative"):
+                instance.dropped_features.append(QuantitativeFeature.load(fjson))
         return instance

{autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_combination_evaluators.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Module for binary combination evaluators."""
+import math
 from abc import ABC
 from collections.abc import Iterable, Iterator
@@ -12,9 +13,11 @@ from AutoCarver.combinations.binary.binary_target_rates import BinaryTargetRate,
 from AutoCarver.combinations.utils.combination_evaluator import (
     AggregatedSample,
     CombinationEvaluator,
+    _nan_fanout_variants,
 )
 from AutoCarver.combinations.utils.combinations import combination_formatter
 from AutoCarver.combinations.utils.target_rate import TargetRate
+from AutoCarver.features import GroupedList
 class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
@@ -25,6 +28,9 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
     # narrow inherited attribute: binary evaluators always carry a BinaryTargetRate
     # (enforced by _init_target_rate).
     target_rate: BinaryTargetRate
+    # narrow inherited `sort_by: str | None`: concrete binary subclasses
+    # (TschuprowtCombinations, CramervCombinations) always set this to a str.
+    sort_by: str
     def _init_target_rate(self, target_rate: TargetRate[pd.DataFrame] | None) -> BinaryTargetRate:
         """Initializes target rate."""
@@ -194,6 +200,191 @@ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
                 tol=tol,
             )
+    def _get_best_combination_non_nan(self) -> dict | None:
+        """DP-based override with progressive top-K.
+        Replaces ``consecutive_combinations + _compute_associations`` with the
+        interval-DP in :func:`_top_k_partitions_chi2_dp`, which returns the
+        top-K consecutive partitions ranked by ``self.sort_by`` desc.
+        **Progressive search.** Starts with ``top_k = self.dp_top_k_initial``.
+        If the viability walk doesn't find a viable candidate within that
+        top-K, doubles ``top_k`` and re-runs DP — walking only the new
+        entries from where we left off. Repeats until either a viable is
+        found or DP exhausts every consecutive partition (signalled by
+        ``len(result) < top_k``). Total work bounded by ~2× a single DP run
+        at the final top_k.
+        This makes the search **exhaustive in the worst case**, matching the
+        legacy enumerate-and-score path's correctness while keeping the
+        common case (viable found in top ~100) essentially free. Mirrors
+        :meth:`ContinuousCombinationEvaluator._get_best_combination_non_nan`.
+        The NaN-fan-out path (:meth:`_get_best_combination_with_nan`) still
+        goes through the legacy enumerate-and-score loop.
+        """
+        feature_labels = self.feature.labels
+        if feature_labels is None:
+            raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
+        raw_labels = GroupedList(feature_labels[:])
+        if self.feature.has_nan:
+            if self.feature.dropna:
+                raw_labels.remove(self.feature.nan)
+            self.samples.dropna(self.feature.nan)
+        if self.samples.train.shape[0] <= 1:
+            return None
+        self._historize_raw_combination()
+        # Iterate over raw_labels (mirrors the parent's
+        # ``consecutive_combinations(raw_labels, ...)`` enumeration). When
+        # raw_labels and raw_xagg.index diverge (edge-case fixtures with
+        # has_nan=False + nan row in xagg, or has_nan=True/dropna=False),
+        # rows present in raw_xagg but not in raw_labels are excluded; labels
+        # present in raw_labels but not in raw_xagg get zero counts — matching
+        # the legacy ``_grouper``'s ``groupby.get(idx, idx)`` semantics where
+        # an unmapped row produces no contribution.
+        raw_xagg = self.samples.train.xagg
+        all_n0 = raw_xagg.iloc[:, 0].to_numpy(dtype=float)
+        all_n1 = raw_xagg.iloc[:, 1].to_numpy(dtype=float)
+        xagg_pos = {m: i for i, m in enumerate(raw_xagg.index)}
+        raw_index = list(raw_labels)
+        n0_per_mod = np.fromiter(
+            (all_n0[xagg_pos[m]] if m in xagg_pos else 0.0 for m in raw_index),
+            dtype=float,
+            count=len(raw_index),
+        )
+        n1_per_mod = np.fromiter(
+            (all_n1[xagg_pos[m]] if m in xagg_pos else 0.0 for m in raw_index),
+            dtype=float,
+            count=len(raw_index),
+        )
+        # Progressive top-K with doubling. See docstring.
+        top_k = self.dp_top_k_initial
+        walked = 0
+        viable: dict | None = None
+        associations: list[dict] = []
+        while True:
+            associations = _top_k_partitions_chi2_dp(
+                n0_per_mod,
+                n1_per_mod,
+                max_n_mod=self.max_n_mod,
+                raw_index=raw_index,
+                sort_by=self.sort_by,
+                top_k=top_k,
+            )
+            viable, walked = self._walk_for_viable(associations, start=walked)
+            if viable is not None:
+                break
+            if walked < top_k:
+                break  # DP exhausted every consecutive partition; no viable exists
+            top_k *= 2
+        self._apply_best_combination(viable)
+        return viable
+    def _get_best_combination_with_nan(self, best_combination: dict | None) -> dict | None:
+        """DP-based override with NaN fan-out.
+        Mirrors :meth:`ContinuousCombinationEvaluator._get_best_combination_with_nan`:
+        1. DP top-K base consecutive partitions over the non-nan labels
+           (:func:`_top_k_partitions_chi2_dp` on a restricted view of the
+           per-modality ``(n0, n1)`` counts);
+        2. fan each base out across NaN placements exactly like
+           :func:`nan_combinations` (nan folded into each group, then nan
+           as its own group when ``len(base) < max_n_mod``, plus the final
+           ``[all_non_nan, [nan]]`` partition);
+        3. re-score every variant in closed form with
+           :func:`_chi2_assoc_for_combination` against the **full** per-modality
+           counts (the nan row is in ``samples.train.xagg`` because
+           :meth:`_get_best_combination_non_nan`'s ``_apply_best_combination``
+           rebuilt it from raw);
+        4. walk the sorted variants for the first viable, with progressive
+           top-K doubling on the base DP — dedup'd via a per-partition seen
+           set so combinations carried over from a smaller ``top_k`` are not
+           re-tested / re-historized.
+        Falls back to the parent implementation when the guard condition
+        (``self.dropna and feature.has_nan and best_combination is not None``)
+        is not met — matches the legacy short-circuit behaviour.
+        """
+        if not (self.dropna and self.feature.has_nan and best_combination is not None):
+            return super()._get_best_combination_with_nan(best_combination)
+        if self.verbose:
+            print(f"[{self.__name__}] Grouping NaNs")
+        feature_labels = self.feature.labels
+        if feature_labels is None:
+            raise RuntimeError(f"[{self.__name__}] feature labels are not populated")
+        raw_labels = GroupedList(feature_labels[:])
+        raw_labels.remove(self.feature.nan)
+        nan_label = self.feature.nan
+        # Full per-modality (n0, n1) — nan row is in xagg because
+        # _apply_best_combination on the non-nan winner rebuilt it from raw.
+        raw_xagg = self.samples.train.xagg
+        n0_per_mod = raw_xagg.iloc[:, 0].to_numpy(dtype=float)
+        n1_per_mod = raw_xagg.iloc[:, 1].to_numpy(dtype=float)
+        n_obs = float(n0_per_mod.sum() + n1_per_mod.sum())
+        mod_to_pos: dict = {m: i for i, m in enumerate(raw_xagg.index)}
+        n_mod = len(mod_to_pos)
+        tol = 1e-10
+        # Non-nan subset, aligned to raw_labels order, for the base DP.
+        non_nan_index = list(raw_labels)
+        n0_non_nan = np.fromiter(
+            (n0_per_mod[mod_to_pos[m]] for m in non_nan_index),
+            dtype=float,
+            count=len(non_nan_index),
+        )
+        n1_non_nan = np.fromiter(
+            (n1_per_mod[mod_to_pos[m]] for m in non_nan_index),
+            dtype=float,
+            count=len(non_nan_index),
+        )
+        historized: set[tuple] = set()
+        base_top_k = self.dp_top_k_initial
+        viable: dict | None = None
+        while True:
+            base_partitions = _top_k_partitions_chi2_dp(
+                n0_non_nan,
+                n1_non_nan,
+                max_n_mod=self.max_n_mod,
+                raw_index=non_nan_index,
+                sort_by=self.sort_by,
+                top_k=base_top_k,
+                tol=tol,
+            )
+            scored = _score_nan_variants_chi2(
+                base_partitions=base_partitions,
+                nan_label=nan_label,
+                raw_labels=non_nan_index,
+                max_n_mod=self.max_n_mod,
+                n0_per_mod=n0_per_mod,
+                n1_per_mod=n1_per_mod,
+                n_obs=n_obs,
+                mod_to_pos=mod_to_pos,
+                n_mod=n_mod,
+                tol=tol,
+                sort_by=self.sort_by,
+            )
+            viable = self._walk_nan_variants(scored, historized)
+            if viable is not None:
+                break
+            if len(base_partitions) < base_top_k:
+                break  # DP exhausted every consecutive partition
+            base_top_k *= 2
+        self._apply_best_combination(viable)
+        return viable
 class TschuprowtCombinations(BinaryCombinationEvaluator):
     """Tschuprow's T based combination evaluation toolkit"""
@@ -408,3 +599,179 @@ def _chi2_assoc_batch(
             "cramerv": float(cramerv_q[b]),
             "tschuprowt": float(tt_q[b]),
         }
+def _top_k_partitions_chi2_dp(  # noqa: C901
+    n0_per_mod: np.ndarray,
+    n1_per_mod: np.ndarray,
+    *,
+    max_n_mod: int,
+    raw_index: list,
+    sort_by: str = "tschuprowt",
+    top_k: int = 1000,
+    tol: float = 1e-10,
+) -> list[dict]:
+    """Top-K consecutive-segmentation partitions ranked by a chi²-derived metric.
+    Binary analogue of
+    :func:`AutoCarver.combinations.continuous.continuous_combination_evaluators._top_k_partitions_kruskal_dp`.
+    The per-segment chi² cell contribution
+    .. math::
+        c_g = (n_{0,g} + \\tau - E_{0,g})^2 / E_{0,g}
+              + (n_{1,g} + \\tau - E_{1,g})^2 / E_{1,g}
+    (with Yates correction iff the combination has exactly 2 groups) is
+    additive across groups **given a fixed number of groups k**: the column
+    marginals ``C[c] = N_c + k·τ`` and total ``N = N₀ + N₁ + 2k·τ`` depend
+    only on ``k``, not on the split positions. So we run a separate interval-DP
+    per ``k ∈ [2, K]`` and merge.
+    Returns a list of ``{combination, index_to_groupby, cramerv, tschuprowt}``
+    sorted by ``sort_by`` desc — mirrors the yield shape of
+    :meth:`_compute_associations` so it drops into the streaming pipeline.
+    Complexity: O(K² · n_mod² · top_k · log top_k). Independent of the
+    combination count (which can reach ~8 M at ``n_mod=40, max_n_mod=7``).
+    Edge cases (mirror :func:`_chi2_assoc_for_combination`):
+    * ``max_n_mod < 2`` or ``n_mod < 2``: returns ``[]``.
+    * ``sort_by`` must be ``"cramerv"`` or ``"tschuprowt"``.
+    """
+    if sort_by not in ("cramerv", "tschuprowt"):
+        raise ValueError(f"sort_by must be 'cramerv' or 'tschuprowt', got {sort_by!r}")
+    n_mod = len(raw_index)
+    K = min(max_n_mod, n_mod)
+    if K < 2:
+        return []
+    n0_prefix = np.concatenate([[0.0], np.cumsum(n0_per_mod.astype(np.float64))])
+    n1_prefix = np.concatenate([[0.0], np.cumsum(n1_per_mod.astype(np.float64))])
+    N0_total = float(n0_prefix[-1])
+    N1_total = float(n1_prefix[-1])
+    n_obs = N0_total + N1_total
+    # Collected across all k: (sort_key, cramerv_q, tschuprowt_q, splits)
+    all_entries: list[tuple[float, float, float, tuple[int, ...]]] = []
+    for k_groups in range(2, K + 1):
+        C0 = N0_total + k_groups * tol
+        C1 = N1_total + k_groups * tol
+        N_with_tol = N0_total + N1_total + 2.0 * k_groups * tol
+        yates = k_groups == 2
+        def seg_cost(
+            i: int, j: int, _C0: float = C0, _C1: float = C1, _N: float = N_with_tol, _yates: bool = yates
+        ) -> float:
+            obs0 = (n0_prefix[j] - n0_prefix[i]) + tol
+            obs1 = (n1_prefix[j] - n1_prefix[i]) + tol
+            R = obs0 + obs1
+            E0 = R * _C0 / _N
+            E1 = R * _C1 / _N
+            if _yates:
+                d0 = E0 - obs0
+                d1 = E1 - obs1
+                obs0 = obs0 + (1.0 if d0 > 0 else (-1.0 if d0 < 0 else 0.0)) * min(0.5, abs(d0))
+                obs1 = obs1 + (1.0 if d1 > 0 else (-1.0 if d1 < 0 else 0.0)) * min(0.5, abs(d1))
+            return (obs0 - E0) ** 2 / E0 + (obs1 - E1) ** 2 / E1
+        # dp[g][j] holds up to ``top_k`` (chi2_partial, splits_tuple) pairs sorted
+        # desc, where splits_tuple = (0, s_1, ..., s_{g-1}, j). g = number of
+        # groups in the prefix, j = right boundary.
+        dp: list[list[list[tuple[float, tuple[int, ...]]]]] = [
+            [[] for _ in range(n_mod + 1)] for _ in range(k_groups + 1)
+        ]
+        for j in range(1, n_mod + 1):
+            dp[1][j] = [(seg_cost(0, j), (0, j))]
+        for g in range(2, k_groups + 1):
+            for j in range(g, n_mod + 1):
+                candidates: list[tuple[float, tuple[int, ...]]] = []
+                for i in range(g - 1, j):
+                    c = seg_cost(i, j)
+                    for prev_s, prev_splits in dp[g - 1][i]:
+                        candidates.append((prev_s + c, prev_splits + (j,)))
+                if candidates:
+                    candidates.sort(key=lambda x: x[0], reverse=True)
+                    dp[g][j] = candidates[:top_k]
+        # Translate chi² → cramerv (quantised) → tschuprowt (quantised). Matches
+        # :func:`_chi2_assoc_for_combination` cell-for-cell.
+        denom = (k_groups - 1) ** 0.25  # k_groups ≥ 2 here, so denom > 0
+        for chi2, splits in dp[k_groups][n_mod]:
+            cramerv_raw = (chi2 / n_obs) ** 0.5
+            cramerv_q = round(cramerv_raw / tol) * tol
+            tt_raw = cramerv_q / denom
+            tt_q = round(tt_raw / tol) * tol
+            sort_key = tt_q if sort_by == "tschuprowt" else cramerv_q
+            all_entries.append((sort_key, cramerv_q, tt_q, splits))
+    all_entries.sort(key=lambda x: x[0], reverse=True)
+    all_entries = all_entries[:top_k]
+    out: list[dict] = []
+    for _, cv, tt, splits in all_entries:
+        combination = [list(raw_index[splits[g] : splits[g + 1]]) for g in range(len(splits) - 1)]
+        out.append(
+            {
+                "combination": combination,
+                "index_to_groupby": combination_formatter(combination),
+                "cramerv": float(cv),
+                "tschuprowt": float(tt),
+            }
+        )
+    return out
+def _score_nan_variants_chi2(
+    *,
+    base_partitions: list[dict],
+    nan_label: str,
+    raw_labels: list,
+    max_n_mod: int,
+    n0_per_mod: np.ndarray,
+    n1_per_mod: np.ndarray,
+    n_obs: float,
+    mod_to_pos: dict,
+    n_mod: int,
+    tol: float,
+    sort_by: str,
+) -> list[dict]:
+    """Score every NaN-fanout variant via closed-form chi² (Cramér's V +
+    Tschuprow's T), sorted by ``sort_by`` desc.
+    Uses :func:`_chi2_assoc_for_combination` per variant — bit-identical to
+    the legacy ``chi2_contingency`` path on the per-variant crosstab.
+    """
+    scored: list[dict] = []
+    for variant in _nan_fanout_variants(base_partitions, nan_label, raw_labels, max_n_mod):
+        index_to_groupby = combination_formatter(variant)
+        cv, tt = _chi2_assoc_for_combination(
+            n0_per_mod=n0_per_mod,
+            n1_per_mod=n1_per_mod,
+            n_obs=n_obs,
+            mod_to_pos=mod_to_pos,
+            n_mod=n_mod,
+            index_to_groupby=index_to_groupby,
+            tol=tol,
+        )
+        scored.append(
+            {
+                "combination": variant,
+                "index_to_groupby": index_to_groupby,
+                "cramerv": cv,
+                "tschuprowt": tt,
+            }
+        )
+    def _key(a: dict) -> float:
+        v = a[sort_by]
+        if v is None or (isinstance(v, float) and math.isnan(v)):
+            return float("-inf")
+        return float(v)
+    scored.sort(key=_key, reverse=True)
+    return scored

{autocarver-7.2.2 → autocarver-7.2.6}/AutoCarver/combinations/binary/binary_target_rates.py RENAMED Viewed

@@ -33,14 +33,15 @@ class BinaryTargetRate(TargetRate[pd.DataFrame], ABC):
         """
         # checking for an xtab
         if xagg is not None:
-            # frequency per modality
-            frequency = xagg.sum(axis=1) / xagg.sum().sum()
+            # count + frequency per modality (count carried for CI-based viability tests)
+            count = xagg.sum(axis=1)
+            frequency = count / count.sum()
             # computing target rate. `_compute` expects pd.DataFrame (Generic
             # XAgg=DataFrame); compute()'s wide signature is for LSP matching,
             # callers always pass a crosstab here.
             return pd.DataFrame(
-                {self.__name__: self._compute(xagg), "frequency": frequency}  # type: ignore
+                {self.__name__: self._compute(xagg), "frequency": frequency, "count": count}  # type: ignore
             )
         return None

AutoCarver 7.2.2__tar.gz → 7.2.6__tar.gz

AutoCarver 7.2.2__tar.gz → 7.2.6__tar.gz