AutoCarver 7.2.0__tar.gz → 7.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/binary_carver.py +3 -3
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/continuous_carver.py +4 -4
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/multiclass_carver.py +11 -6
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/base_carver.py +105 -14
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/pretty_print.py +3 -3
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/binary_combination_evaluators.py +11 -4
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/binary_target_rates.py +13 -4
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/continuous/continuous_combination_evaluators.py +136 -5
- autocarver-7.2.2/AutoCarver/combinations/continuous/continuous_target_rates.py +186 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/combination_evaluator.py +61 -32
- autocarver-7.2.2/AutoCarver/combinations/utils/target_rate.py +52 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/discretizer.py +2 -2
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/categorical_discretizer.py +4 -4
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/chained_discretizer.py +4 -8
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/ordinal_discretizer.py +4 -9
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/qualitative_discretizer.py +3 -7
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/continuous_discretizer.py +49 -14
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/quantitative_discretizer.py +4 -6
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/base_discretizer.py +57 -13
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/type_discretizers.py +4 -4
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/base_selector.py +7 -28
- {autocarver-7.2.0 → autocarver-7.2.2}/PKG-INFO +2 -3
- {autocarver-7.2.0 → autocarver-7.2.2}/pyproject.toml +3 -10
- autocarver-7.2.0/AutoCarver/combinations/continuous/continuous_target_rates.py +0 -96
- autocarver-7.2.0/AutoCarver/combinations/utils/target_rate.py +0 -30
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/continuous/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/combinations.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/testing.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/config.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/multiprocessing.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/features.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/categorical_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/ordinal_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/qualitative_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/datetime_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/numerical_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/quantitative_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/base_feature.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/grouped_list.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/serialization.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/classification_selector.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/base_filters.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/qualitative_filters.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/quantitative_filters.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/base_measures.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/qualitative_measures.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/quantitative_measures.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/regression_selector.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/pretty_print.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/__init__.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/attributes.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/dependencies.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/extend_docstring.py +0 -0
- {autocarver-7.2.0 → autocarver-7.2.2}/README.md +0 -0
|
@@ -65,7 +65,7 @@ class BinaryCarver(BaseCarver):
|
|
|
65
65
|
config=config,
|
|
66
66
|
)
|
|
67
67
|
|
|
68
|
-
def
|
|
68
|
+
def _prepare_samples(self, samples: Samples) -> Samples:
|
|
69
69
|
"""Validates format and content of X and y."""
|
|
70
70
|
if samples.train.y is None:
|
|
71
71
|
raise ValueError(f"[{self.__name__}] y must be provided")
|
|
@@ -73,9 +73,9 @@ class BinaryCarver(BaseCarver):
|
|
|
73
73
|
if not ((0 in y_values) and (1 in y_values)) or len(y_values) != 2:
|
|
74
74
|
raise ValueError(f"[{self.__name__}] y must be a binary Series of 0 and 1 (int or float, not object)")
|
|
75
75
|
|
|
76
|
-
return super().
|
|
76
|
+
return super()._prepare_samples(samples)
|
|
77
77
|
|
|
78
|
-
def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.DataFrame | None]:
|
|
78
|
+
def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
|
|
79
79
|
"""Computes crosstabs for specified features and ensures that the crosstab is ordered
|
|
80
80
|
according to the known labels"""
|
|
81
81
|
# checking for empty datasets (dev)
|
|
@@ -63,7 +63,7 @@ class ContinuousCarver(BaseCarver):
|
|
|
63
63
|
config=config,
|
|
64
64
|
)
|
|
65
65
|
|
|
66
|
-
def
|
|
66
|
+
def _prepare_samples(self, samples: Samples) -> Samples:
|
|
67
67
|
"""Validates format and content of X and y."""
|
|
68
68
|
if samples.train.y is None:
|
|
69
69
|
raise ValueError(f"[{self.__name__}] y must be provided")
|
|
@@ -74,9 +74,9 @@ class ContinuousCarver(BaseCarver):
|
|
|
74
74
|
if len(y_values) <= 2:
|
|
75
75
|
raise ValueError(f"[{self.__name__}] provided y is binary, consider using BinaryCarver instead.")
|
|
76
76
|
|
|
77
|
-
return super().
|
|
77
|
+
return super()._prepare_samples(samples)
|
|
78
78
|
|
|
79
|
-
def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.DataFrame | None]:
|
|
79
|
+
def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
|
|
80
80
|
"""Computes y values for modalities of specified features and ensures the ordering
|
|
81
81
|
according to the known labels"""
|
|
82
82
|
# checking for empty datasets
|
|
@@ -98,4 +98,4 @@ def get_target_values_by_modality(X: pd.DataFrame, y: pd.Series, feature: BaseFe
|
|
|
98
98
|
|
|
99
99
|
# reindexing to ensure the right order (labels may be None pre-fit; pandas
|
|
100
100
|
# treats None as "no reindex" so the original ordering is kept)
|
|
101
|
-
return yval.reindex(feature.labels, fill_value=[]) # type: ignore
|
|
101
|
+
return yval.reindex(feature.labels, fill_value=[]) # type: ignore
|
|
@@ -53,7 +53,7 @@ class MulticlassCarver(BinaryCarver):
|
|
|
53
53
|
if self.config.copy:
|
|
54
54
|
print("WARNING: can't set copy=True for MulticlassCarver (no inplace DataFrame.assign).")
|
|
55
55
|
|
|
56
|
-
def
|
|
56
|
+
def _prepare_samples(self, samples: Samples) -> Samples:
|
|
57
57
|
"""Validates format and content of X and y."""
|
|
58
58
|
# converting target to str (y is required by Carver.fit)
|
|
59
59
|
if samples.train.y is None:
|
|
@@ -93,11 +93,11 @@ class MulticlassCarver(BinaryCarver):
|
|
|
93
93
|
samples = Samples(train=Sample(X, y), dev=Sample(X_dev, y_dev))
|
|
94
94
|
|
|
95
95
|
# preparing datasets and checking for wrong values
|
|
96
|
-
samples = self.
|
|
96
|
+
samples = self._prepare_samples(samples)
|
|
97
97
|
|
|
98
|
-
# getting distinct y classes (
|
|
99
|
-
|
|
100
|
-
y_classes = sorted(samples.train.y.unique().tolist())[1:] #
|
|
98
|
+
# getting distinct y classes (_prepare_samples raises if y is missing)
|
|
99
|
+
# removing one of the classes
|
|
100
|
+
y_classes = sorted(samples.train.y.unique().tolist())[1:] # type: ignore
|
|
101
101
|
|
|
102
102
|
# adding versionned features
|
|
103
103
|
self.features.add_feature_versions(y_classes)
|
|
@@ -125,7 +125,12 @@ class MulticlassCarver(BinaryCarver):
|
|
|
125
125
|
config=replace(self.config, copy=True),
|
|
126
126
|
)
|
|
127
127
|
|
|
128
|
-
binary_carver.fit_transform(
|
|
128
|
+
binary_carver.fit_transform(
|
|
129
|
+
samples.train.X,
|
|
130
|
+
train_y_class,
|
|
131
|
+
X_dev=samples.dev.X if samples.dev.has_X else None,
|
|
132
|
+
y_dev=dev_y_class,
|
|
133
|
+
)
|
|
129
134
|
|
|
130
135
|
# filtering out dropped features whilst keeping other version tags
|
|
131
136
|
kept_features = binary_carver.features.versions
|
|
@@ -5,6 +5,8 @@ for any task.
|
|
|
5
5
|
import json
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from dataclasses import dataclass, field, replace
|
|
8
|
+
from functools import partial
|
|
9
|
+
from multiprocessing import Pool
|
|
8
10
|
from typing import Self
|
|
9
11
|
|
|
10
12
|
import pandas as pd
|
|
@@ -19,6 +21,8 @@ from AutoCarver.combinations import (
|
|
|
19
21
|
from AutoCarver.discretizers import BaseDiscretizer, Discretizer, Sample
|
|
20
22
|
from AutoCarver.discretizers.utils.base_discretizer import DiscretizerConfig
|
|
21
23
|
from AutoCarver.features import BaseFeature, Features
|
|
24
|
+
from AutoCarver.features.qualitatives import CategoricalFeature, OrdinalFeature
|
|
25
|
+
from AutoCarver.features.quantitatives import QuantitativeFeature
|
|
22
26
|
from AutoCarver.utils import extend_docstring, has_idisplay
|
|
23
27
|
|
|
24
28
|
# trying to import extra dependencies
|
|
@@ -37,16 +41,63 @@ class Samples:
|
|
|
37
41
|
dev (Sample): The development sample, containing features (X) and target (y).
|
|
38
42
|
"""
|
|
39
43
|
|
|
40
|
-
train: Sample = field(default_factory=
|
|
41
|
-
dev: Sample = field(default_factory=
|
|
44
|
+
train: Sample = field(default_factory=Sample)
|
|
45
|
+
dev: Sample = field(default_factory=Sample)
|
|
42
46
|
|
|
43
47
|
def fillna(self, features: Features) -> None:
|
|
44
48
|
"""fills up nans in X and X_dev"""
|
|
45
49
|
self.train.X = features.fillna(self.train.X)
|
|
46
|
-
if self.dev.
|
|
50
|
+
if self.dev.has_X:
|
|
47
51
|
self.dev.X = features.fillna(self.dev.X)
|
|
48
52
|
|
|
49
53
|
|
|
54
|
+
def _carve_feature_worker(
    payload: tuple[BaseFeature, pd.Series | pd.DataFrame | None, pd.Series | pd.DataFrame | None],
    *,
    evaluator: CombinationEvaluator,
    max_n_mod: int,
    min_freq: float,
    dropna: bool,
) -> tuple[BaseFeature, bool]:
    """Picklable worker: scores the best combination for a single feature.

    Intended as a process-pool task: each task receives a pickled copy of
    ``evaluator`` and a single ``(feature, xagg, xagg_dev)`` triple, so
    mutations stay local to the worker process. The parent reattaches the
    returned (mutated) feature to its ``Features`` container.

    Parameters
    ----------
    payload : tuple
        ``(feature, xagg, xagg_dev)`` for the feature to carve.
    evaluator : CombinationEvaluator
        Evaluator whose ``get_best_combination`` is called on the payload.
    max_n_mod : int
        Forwarded to ``get_best_combination``.
    min_freq : float
        Forwarded to ``get_best_combination``.
    dropna : bool
        Forwarded to ``get_best_combination``.

    Returns
    -------
    tuple[BaseFeature, bool]
        The feature handed in, and ``True`` when ``get_best_combination``
        returned something other than ``None``.
    """
    feature, xagg, xagg_dev = payload

    # workers never print per-feature progress; the parent prints a single banner.
    # The previous setting is restored afterwards so that calling this function
    # in-process (no pickling involved) does not permanently silence the
    # caller's evaluator.
    previous_verbose = getattr(evaluator, "verbose", False)
    evaluator.verbose = False
    try:
        best = evaluator.get_best_combination(
            feature, xagg, xagg_dev, max_n_mod=max_n_mod, min_freq=min_freq, dropna=dropna
        )
    finally:
        evaluator.verbose = previous_verbose

    return feature, best is not None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _replace_feature_in_features(features: Features, updated: BaseFeature) -> None:
    """Swaps an existing feature (matched by version) for the worker-returned copy.

    The container searched is chosen from the type of ``updated``, testing
    ``CategoricalFeature`` first, then ``OrdinalFeature``, then
    ``QuantitativeFeature``; only the first matching container is searched.

    Parameters
    ----------
    features : Features
        Container holding the feature to replace.
    updated : BaseFeature
        Replacement feature; its ``version`` identifies the entry to swap.

    Raises
    ------
    KeyError
        If ``updated`` is of none of the handled types, or no feature with
        the same ``version`` exists in the selected container.
    """
    # NOTE(review): the item assignment below only takes effect if these
    # attributes expose the live underlying lists (not copies) - confirm
    # against the Features implementation.
    if isinstance(updated, CategoricalFeature):
        container = features.categoricals
    elif isinstance(updated, OrdinalFeature):
        container = features.ordinals
    elif isinstance(updated, QuantitativeFeature):
        container = features.quantitatives
    else:
        container = ()  # unhandled type: nothing to search, falls through to KeyError

    for position, existing in enumerate(container):
        if existing.version == updated.version:
            container[position] = updated
            return

    raise KeyError(f"[BaseCarver] feature {updated.version!r} not in Features")
|
|
99
|
+
|
|
100
|
+
|
|
50
101
|
class BaseCarver(BaseDiscretizer, ABC):
|
|
51
102
|
"""Automatic carving of continuous, discrete, categorical and ordinal
|
|
52
103
|
features that maximizes association with a binary or continuous target.
|
|
@@ -124,14 +175,14 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
124
175
|
content["combination_evaluator"] = self.combination_evaluator.to_json()
|
|
125
176
|
return content
|
|
126
177
|
|
|
127
|
-
def
|
|
178
|
+
def _prepare_samples(self, samples: Samples) -> Samples:
|
|
128
179
|
"""Validates format and content of X and y."""
|
|
129
180
|
if samples.train.y is None:
|
|
130
181
|
raise ValueError(f"[{self.__name__}] y must be provided, got {samples.train.y}")
|
|
131
182
|
|
|
132
183
|
# Checking for binary target and copying X
|
|
133
|
-
samples.train = super().
|
|
134
|
-
samples.dev = super().
|
|
184
|
+
samples.train = super()._prepare_sample(samples.train)
|
|
185
|
+
samples.dev = super()._prepare_sample(samples.dev)
|
|
135
186
|
|
|
136
187
|
# discretizing features at half min_freq so the carver has a finer
|
|
137
188
|
# granularity to combine when forming optimal groups
|
|
@@ -145,7 +196,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
145
196
|
|
|
146
197
|
return samples
|
|
147
198
|
|
|
148
|
-
def fit( #
|
|
199
|
+
def fit( # type: ignore
|
|
149
200
|
self,
|
|
150
201
|
X: pd.DataFrame,
|
|
151
202
|
y: pd.Series,
|
|
@@ -184,7 +235,7 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
184
235
|
samples = Samples(Sample(X, y), Sample(X_dev, y_dev))
|
|
185
236
|
|
|
186
237
|
# preparing datasets and checking for wrong values
|
|
187
|
-
samples = self.
|
|
238
|
+
samples = self._prepare_samples(samples)
|
|
188
239
|
|
|
189
240
|
# logging if requested
|
|
190
241
|
super()._log_if_verbose("---------\n------")
|
|
@@ -196,16 +247,55 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
196
247
|
# getting all features to carve (features are removed from self.features)
|
|
197
248
|
all_features = self.features.versions
|
|
198
249
|
|
|
199
|
-
# carving each feature
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
250
|
+
# carving each feature (parallel across features when n_jobs > 1)
|
|
251
|
+
if self.config.n_jobs > 1 and len(all_features) > 1:
|
|
252
|
+
self._carve_features_parallel(all_features, xaggs, xaggs_dev)
|
|
253
|
+
else:
|
|
254
|
+
for n, feature in enumerate(all_features):
|
|
255
|
+
num_iter = f"{n + 1}/{len(all_features)}" # logging iteration number
|
|
256
|
+
self._carve_feature(self.features(feature), xaggs, xaggs_dev, num_iter)
|
|
203
257
|
|
|
204
258
|
# discretizing features based on each feature's values_order
|
|
205
259
|
super().fit(X, y)
|
|
206
260
|
|
|
207
261
|
return self
|
|
208
262
|
|
|
263
|
+
def _carve_features_parallel(
    self,
    all_features: list[str],
    xaggs: dict[str, pd.Series | pd.DataFrame | None],
    xaggs_dev: dict[str, pd.Series | pd.DataFrame | None],
) -> None:
    """Dispatches ``_carve_feature_worker`` across a process pool, one task per feature.

    Per-feature workers receive only the feature instance + its xagg /
    xagg_dev entry (not the full dicts). Verbose per-feature logging is
    silenced in the workers; a single banner is printed when verbose is on.

    Features for which the worker found a combination are swapped back into
    ``self.features``; the others trigger a warning and are removed.

    Parameters
    ----------
    all_features : list[str]
        Versions of the features to carve.
    xaggs : dict
        Train aggregation per feature version.
    xaggs_dev : dict
        Dev aggregation per feature version.
    """
    # nothing to carve: do not pay for starting a pool
    if not all_features:
        return

    # never start more processes than there are tasks to hand out
    n_workers = min(self.config.n_jobs, len(all_features))

    if self.config.verbose:
        print(f"--- [{self.__name__}] Carving {len(all_features)} features on {n_workers} workers")

    payloads = [(self.features(version), xaggs[version], xaggs_dev[version]) for version in all_features]
    worker = partial(
        _carve_feature_worker,
        evaluator=self.combination_evaluator,
        max_n_mod=self.max_n_mod,
        min_freq=self.min_freq,
        dropna=self.config.dropna,
    )

    with Pool(processes=n_workers) as pool:
        # results arrive in completion order; each carries its own feature
        for updated_feature, viable in pool.imap_unordered(worker, payloads):
            if viable:
                _replace_feature_in_features(self.features, updated_feature)
            else:
                print(
                    f"WARNING: No robust combination for {updated_feature}. Consider "
                    "increasing the size of X_dev or dropping the feature (X not "
                    "representative of X_dev for this feature)."
                )
                self.features.remove(updated_feature.version)
|
|
298
|
+
|
|
209
299
|
@abstractmethod
|
|
210
300
|
def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
|
|
211
301
|
"""Helper that aggregates X by y into per-feature crosstabs or means
|
|
@@ -239,10 +329,11 @@ class BaseCarver(BaseDiscretizer, ABC):
|
|
|
239
329
|
|
|
240
330
|
# printing carved distribution, for found, suitable combination
|
|
241
331
|
if best_combination is not None:
|
|
332
|
+
dev_sample = self.combination_evaluator.samples.dev
|
|
242
333
|
self._print_xagg(
|
|
243
334
|
feature,
|
|
244
335
|
xagg=self.combination_evaluator.samples.train.xagg,
|
|
245
|
-
xagg_dev=
|
|
336
|
+
xagg_dev=dev_sample.xagg if dev_sample.has_xagg else None,
|
|
246
337
|
message="Carved distribution",
|
|
247
338
|
)
|
|
248
339
|
|
|
@@ -392,7 +483,7 @@ def discretize(
|
|
|
392
483
|
samples.train.X = discretizer.fit_transform(**samples.train)
|
|
393
484
|
|
|
394
485
|
# applying discretizer on X_dev if provided
|
|
395
|
-
if samples.dev.
|
|
486
|
+
if samples.dev.has_X:
|
|
396
487
|
samples.dev.X = discretizer.transform(**samples.dev)
|
|
397
488
|
|
|
398
489
|
return samples
|
|
@@ -44,14 +44,14 @@ def prettier_xagg(
|
|
|
44
44
|
|
|
45
45
|
# adding custom caption/title
|
|
46
46
|
if caption is not None:
|
|
47
|
-
nicer_xagg = nicer_xagg.set_caption(caption)
|
|
47
|
+
nicer_xagg = nicer_xagg.set_caption(caption) # type: ignore
|
|
48
48
|
|
|
49
49
|
# hiding index for dev
|
|
50
50
|
if hide_index:
|
|
51
|
-
nicer_xagg.hide(axis="index")
|
|
51
|
+
nicer_xagg.hide(axis="index") # type: ignore
|
|
52
52
|
|
|
53
53
|
# converting to html
|
|
54
|
-
nicer_xagg = nicer_xagg._repr_html_() #
|
|
54
|
+
nicer_xagg = nicer_xagg._repr_html_() # type: ignore
|
|
55
55
|
|
|
56
56
|
return nicer_xagg
|
|
57
57
|
|
|
@@ -14,15 +14,19 @@ from AutoCarver.combinations.utils.combination_evaluator import (
|
|
|
14
14
|
CombinationEvaluator,
|
|
15
15
|
)
|
|
16
16
|
from AutoCarver.combinations.utils.combinations import combination_formatter
|
|
17
|
+
from AutoCarver.combinations.utils.target_rate import TargetRate
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
class BinaryCombinationEvaluator(CombinationEvaluator, ABC):
|
|
20
|
+
class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
|
|
20
21
|
"""Binary combination evaluator class."""
|
|
21
22
|
|
|
22
23
|
is_y_binary = True
|
|
23
24
|
_target_rate_classes: list[type[BinaryTargetRate]] = [TargetMean, OddsRatio, Woe]
|
|
25
|
+
# narrow inherited attribute: binary evaluators always carry a BinaryTargetRate
|
|
26
|
+
# (enforced by _init_target_rate).
|
|
27
|
+
target_rate: BinaryTargetRate
|
|
24
28
|
|
|
25
|
-
def _init_target_rate(self, target_rate:
|
|
29
|
+
def _init_target_rate(self, target_rate: TargetRate[pd.DataFrame] | None) -> BinaryTargetRate:
|
|
26
30
|
"""Initializes target rate."""
|
|
27
31
|
if target_rate is None:
|
|
28
32
|
return TargetMean()
|
|
@@ -31,8 +35,11 @@ class BinaryCombinationEvaluator(CombinationEvaluator, ABC):
|
|
|
31
35
|
return target_rate
|
|
32
36
|
|
|
33
37
|
def _association_measure(
|
|
34
|
-
self,
|
|
35
|
-
|
|
38
|
+
self,
|
|
39
|
+
xagg: AggregatedSample | pd.Series | pd.DataFrame,
|
|
40
|
+
n_obs: int | None = None,
|
|
41
|
+
tol: float = 1e-10,
|
|
42
|
+
) -> dict[str, float | None]:
|
|
36
43
|
"""Computes measures of association between feature and target by crosstab.
|
|
37
44
|
|
|
38
45
|
Used for the raw (one-shot) distribution. The hot per-combination loop
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""set of target rates for binary classification"""
|
|
2
2
|
|
|
3
3
|
from abc import ABC
|
|
4
|
+
from typing import overload
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import pandas as pd
|
|
@@ -8,12 +9,16 @@ import pandas as pd
|
|
|
8
9
|
from AutoCarver.combinations.utils import TargetRate
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
class BinaryTargetRate(TargetRate, ABC):
|
|
12
|
+
class BinaryTargetRate(TargetRate[pd.DataFrame], ABC):
|
|
12
13
|
"""Binary target rate class."""
|
|
13
14
|
|
|
14
15
|
__name__ = "binary_target_rate"
|
|
15
16
|
|
|
16
|
-
|
|
17
|
+
@overload
|
|
18
|
+
def compute(self, xagg: pd.Series | pd.DataFrame) -> pd.DataFrame: ...
|
|
19
|
+
@overload
|
|
20
|
+
def compute(self, xagg: None) -> None: ...
|
|
21
|
+
def compute(self, xagg: pd.Series | pd.DataFrame | None) -> pd.DataFrame | None:
|
|
17
22
|
"""Computes the target rate.
|
|
18
23
|
|
|
19
24
|
Parameters
|
|
@@ -31,8 +36,12 @@ class BinaryTargetRate(TargetRate, ABC):
|
|
|
31
36
|
# frequency per modality
|
|
32
37
|
frequency = xagg.sum(axis=1) / xagg.sum().sum()
|
|
33
38
|
|
|
34
|
-
# computing target rate
|
|
35
|
-
|
|
39
|
+
# computing target rate. `_compute` expects pd.DataFrame (Generic
|
|
40
|
+
# XAgg=DataFrame); compute()'s wide signature is for LSP matching,
|
|
41
|
+
# callers always pass a crosstab here.
|
|
42
|
+
return pd.DataFrame(
|
|
43
|
+
{self.__name__: self._compute(xagg), "frequency": frequency} # type: ignore
|
|
44
|
+
)
|
|
36
45
|
return None
|
|
37
46
|
|
|
38
47
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from abc import ABC
|
|
4
4
|
from collections.abc import Iterable, Iterator
|
|
5
|
+
from typing import Any
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
7
8
|
import pandas as pd
|
|
@@ -11,15 +12,20 @@ from tqdm import tqdm
|
|
|
11
12
|
from AutoCarver.combinations.continuous.continuous_target_rates import ContinuousTargetRate, TargetMean, TargetMedian
|
|
12
13
|
from AutoCarver.combinations.utils.combination_evaluator import AggregatedSample, CombinationEvaluator
|
|
13
14
|
from AutoCarver.combinations.utils.combinations import combination_formatter
|
|
15
|
+
from AutoCarver.combinations.utils.target_rate import TargetRate
|
|
16
|
+
from AutoCarver.combinations.utils.testing import Keys, is_viable, test_viability
|
|
14
17
|
|
|
15
18
|
|
|
16
|
-
class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
|
|
19
|
+
class ContinuousCombinationEvaluator(CombinationEvaluator[pd.Series], ABC):
|
|
17
20
|
"""Continuous combination evaluator class."""
|
|
18
21
|
|
|
19
22
|
is_y_continuous = True
|
|
20
23
|
_target_rate_classes: list[type[ContinuousTargetRate]] = [TargetMean, TargetMedian]
|
|
24
|
+
# narrow the inherited `target_rate: TargetRate` annotation — continuous
|
|
25
|
+
# carvers always carry a ContinuousTargetRate (enforced by _init_target_rate).
|
|
26
|
+
target_rate: ContinuousTargetRate
|
|
21
27
|
|
|
22
|
-
def _init_target_rate(self, target_rate:
|
|
28
|
+
def _init_target_rate(self, target_rate: TargetRate[pd.Series] | None) -> ContinuousTargetRate:
|
|
23
29
|
"""Initializes target rate."""
|
|
24
30
|
if target_rate is None:
|
|
25
31
|
return TargetMean()
|
|
@@ -28,7 +34,10 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
|
|
|
28
34
|
return target_rate
|
|
29
35
|
|
|
30
36
|
def _association_measure(
|
|
31
|
-
self,
|
|
37
|
+
self,
|
|
38
|
+
xagg: AggregatedSample | pd.Series | pd.DataFrame,
|
|
39
|
+
n_obs: int | None = None,
|
|
40
|
+
tol: float = 1e-10,
|
|
32
41
|
) -> dict[str, float | None]:
|
|
33
42
|
"""Computes measures of association between feature and quantitative target.
|
|
34
43
|
|
|
@@ -134,13 +143,29 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
|
|
|
134
143
|
:meth:`_association_measure`).
|
|
135
144
|
"""
|
|
136
145
|
raw_xagg = self.samples.train.xagg
|
|
137
|
-
# Pre-rank y once for the whole feature
|
|
138
|
-
R_per_mod, n_per_mod, N, tie_corr = _modality_rank_stats(raw_xagg)
|
|
146
|
+
# Pre-rank y once for the whole feature.
|
|
147
|
+
R_per_mod, n_per_mod, N, tie_corr = _modality_rank_stats(raw_xagg) # type: ignore
|
|
139
148
|
|
|
140
149
|
# Map modality label -> position in R_per_mod / n_per_mod
|
|
141
150
|
mod_to_pos: dict = {m: i for i, m in enumerate(raw_xagg.index)}
|
|
142
151
|
n_mod = len(mod_to_pos)
|
|
143
152
|
|
|
153
|
+
# Cache per-modality (n, sum_y) for the viability fast path.
|
|
154
|
+
# Resets each time _compute_associations runs so the nan-pass refreshes
|
|
155
|
+
# the cache after _apply_best_combination changes samples.train.xagg.
|
|
156
|
+
sum_y_per_mod = _modality_sum_y(raw_xagg) # type: ignore
|
|
157
|
+
# Why: heterogeneous-value dict; annotate `Any` so downstream readers (line 203-204
|
|
158
|
+
# and _get_dev_modality_stats) can narrow to the per-key concrete type without ty
|
|
159
|
+
# unioning across all value types.
|
|
160
|
+
self._train_modality_stats: dict[str, Any] = {
|
|
161
|
+
"n_per_mod": n_per_mod.astype(float),
|
|
162
|
+
"sum_y_per_mod": sum_y_per_mod,
|
|
163
|
+
"mod_to_pos": mod_to_pos,
|
|
164
|
+
"n_mod": n_mod,
|
|
165
|
+
}
|
|
166
|
+
self._dev_modality_stats: dict[str, Any] | None = None # lazy; aligned to train's mod_to_pos
|
|
167
|
+
self._dev_modality_stats_id: int | None = None
|
|
168
|
+
|
|
144
169
|
batch: list[dict] = []
|
|
145
170
|
for grouped_xagg in tqdm(grouped_xaggs, desc="Computing associations", disable=not self.verbose):
|
|
146
171
|
batch.append(grouped_xagg)
|
|
@@ -166,6 +191,100 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
|
|
|
166
191
|
n_mod=n_mod,
|
|
167
192
|
)
|
|
168
193
|
|
|
194
|
+
def _get_dev_modality_stats(self) -> dict | None:
    """Lazily build per-modality ``(n, sum_y)`` for the dev sample.

    The arrays are aligned to ``self._train_modality_stats['mod_to_pos']``
    (zeros for modalities absent from dev; dev modalities unknown to train
    are skipped).

    The cache is tied to the dev xagg *object*: reassigning
    ``samples.dev`` between viability iterations triggers a fresh
    computation. The source object is kept referenced and compared by
    identity, because a bare ``id()`` can be reused once the previous
    object has been garbage-collected.

    Returns
    -------
    dict | None
        Dict with keys ``"n_per_mod"``, ``"sum_y_per_mod"``, ``"mod_to_pos"``
        and ``"n_mod"``; ``None`` when no dev xagg is set or when the train
        statistics have not been computed yet (callers then fall back to
        their legacy path).
    """
    if not self.samples.dev.has_xagg:
        return None

    # same defensive lookup as _test_viability_train: without train stats
    # there is nothing to align to
    train_stats = getattr(self, "_train_modality_stats", None)
    if train_stats is None:
        return None

    dev_xagg = self.samples.dev.xagg
    cached = getattr(self, "_dev_modality_stats", None)
    if cached is not None and getattr(self, "_dev_modality_stats_source", None) is dev_xagg:
        return cached

    mod_to_pos: dict = train_stats["mod_to_pos"]
    n_mod: int = train_stats["n_mod"]

    n = np.zeros(n_mod, dtype=float)
    sum_y = np.zeros(n_mod, dtype=float)
    for mod, vals in dev_xagg.items():
        pos = mod_to_pos.get(mod)
        if pos is None:
            continue  # dev has a modality train doesn't - skip
        arr = np.asarray(vals, dtype=float)
        n[pos] = arr.size
        sum_y[pos] = float(arr.sum())

    self._dev_modality_stats = {
        "n_per_mod": n,
        "sum_y_per_mod": sum_y,
        "mod_to_pos": mod_to_pos,
        "n_mod": n_mod,
    }
    # pin the source object so its identity stays unique while cached
    self._dev_modality_stats_source = dev_xagg
    self._dev_modality_stats_id = id(dev_xagg)
    return self._dev_modality_stats
|
|
231
|
+
|
|
232
|
+
def _test_viability_train(self, combination: dict) -> dict:
    """Checks train viability of ``combination``, preferring the closed-form path.

    When cached train modality statistics exist and the active target rate's
    ``compute_from_stats`` yields rates, viability is tested directly on
    those rates. Otherwise (no cached statistics, or ``compute_from_stats``
    returned ``None``) the parent implementation is used.
    """
    cached_stats = getattr(self, "_train_modality_stats", None)

    rates = None
    if cached_stats is not None:
        rates = self.target_rate.compute_from_stats(
            stats=cached_stats,
            index_to_groupby=combination["index_to_groupby"],
        )

    if rates is None:
        # legacy route: delegate to the parent implementation
        return super()._test_viability_train(combination)

    return test_viability(rates, self.min_freq, self.target_rate.__name__)
|
|
246
|
+
|
|
247
|
+
def _get_viable_combination(self, associations: list[dict]) -> dict | None:
    """Selects the viable combination and fills in its grouped xagg if missing.

    The selection itself is delegated to the parent class. If the returned
    combination carries no ``"xagg"`` entry, the grouped train xagg is built
    once here, for the winner only, rather than for every candidate.
    """
    winner = super()._get_viable_combination(associations)

    # nothing selected, or the grouped xagg is already there: pass through
    if winner is None or winner.get("xagg") is not None:
        return winner

    # `index_to_groupby` may have been popped from the combination earlier
    # (during historization); in that case rebuild it from the still-present
    # `combination` list-of-groups.
    groupby_mapping = winner.get("index_to_groupby")
    if groupby_mapping is None:
        groupby_mapping = combination_formatter(winner["combination"])

    winner["xagg"] = self._grouper(self.samples.train, groupby_mapping)
    return winner
|
|
267
|
+
|
|
268
|
+
def _test_viability_dev(self, test_results: dict, combination: dict) -> dict:
    """Checks dev viability of ``combination``, preferring the closed-form path.

    Returns ``test_results`` extended with a ``"dev"`` entry whose viability
    is ``None`` when the train test was not viable or no dev xagg is set.
    Otherwise dev rates are computed from the cached dev modality statistics
    and tested against the train target rate; if those statistics or rates
    are unavailable, the parent implementation is used.
    """
    viable_key = Keys.VIABLE.value

    # dev is only tested for combinations viable on train, and only if dev exists
    if not (test_results[viable_key] and self.samples.dev.has_xagg):
        return {**test_results, "dev": {viable_key: None}}

    dev_rates = None
    dev_stats = self._get_dev_modality_stats()
    if dev_stats is not None:
        dev_rates = self.target_rate.compute_from_stats(
            stats=dev_stats,
            index_to_groupby=combination["index_to_groupby"],
        )

    if dev_rates is None:
        # legacy route: delegate to the parent implementation
        return super()._test_viability_dev(test_results, combination)

    rate_name = self.target_rate.__name__
    train_target_rate = test_results["train_rates"][rate_name]
    dev_results = test_viability(dev_rates, self.min_freq, rate_name, train_target_rate)

    # dev results are layered over the train results before the final verdict
    merged = {**test_results, **dev_results}
    merged[viable_key] = is_viable(merged)
    return merged
|
|
287
|
+
|
|
169
288
|
|
|
170
289
|
class KruskalCombinations(ContinuousCombinationEvaluator):
|
|
171
290
|
"""Kruskal-Wallis' H based combination evaluation toolkit"""
|
|
@@ -186,6 +305,18 @@ _KRUSKAL_BATCH_SIZE = 1024
|
|
|
186
305
|
# ---------------------------------------------------------------------------
|
|
187
306
|
|
|
188
307
|
|
|
308
|
+
def _modality_sum_y(raw_xagg: pd.Series) -> np.ndarray:
    """Sum of the y values of each modality, in ``raw_xagg.index`` order.

    Each element of ``raw_xagg`` is converted to a float array and summed.
    The result feeds the viability fast path, which derives group target
    means in closed form (``sum_y_g / n_g``) instead of applying ``np.mean``
    to Python lists of y values per candidate.

    Returns
    -------
    np.ndarray
        1-D float array with one sum per modality.
    """
    totals = np.empty(len(raw_xagg), dtype=float)
    for position, y_values in enumerate(raw_xagg.values):
        totals[position] = float(np.asarray(y_values, dtype=float).sum())
    return totals
|
|
318
|
+
|
|
319
|
+
|
|
189
320
|
def _modality_rank_stats(
|
|
190
321
|
raw_xagg: pd.Series,
|
|
191
322
|
) -> tuple[np.ndarray | None, np.ndarray, int, float | None]:
|