AutoCarver 7.2.0__tar.gz → 7.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/binary_carver.py +3 -3
  2. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/continuous_carver.py +4 -4
  3. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/multiclass_carver.py +11 -6
  4. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/base_carver.py +105 -14
  5. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/pretty_print.py +3 -3
  6. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/binary_combination_evaluators.py +11 -4
  7. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/binary_target_rates.py +13 -4
  8. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/continuous/continuous_combination_evaluators.py +136 -5
  9. autocarver-7.2.2/AutoCarver/combinations/continuous/continuous_target_rates.py +186 -0
  10. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/combination_evaluator.py +61 -32
  11. autocarver-7.2.2/AutoCarver/combinations/utils/target_rate.py +52 -0
  12. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/discretizer.py +2 -2
  13. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/categorical_discretizer.py +4 -4
  14. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/chained_discretizer.py +4 -8
  15. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/ordinal_discretizer.py +4 -9
  16. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/qualitative_discretizer.py +3 -7
  17. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/continuous_discretizer.py +49 -14
  18. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/quantitative_discretizer.py +4 -6
  19. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/base_discretizer.py +57 -13
  20. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/type_discretizers.py +4 -4
  21. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/base_selector.py +7 -28
  22. {autocarver-7.2.0 → autocarver-7.2.2}/PKG-INFO +2 -3
  23. {autocarver-7.2.0 → autocarver-7.2.2}/pyproject.toml +3 -10
  24. autocarver-7.2.0/AutoCarver/combinations/continuous/continuous_target_rates.py +0 -96
  25. autocarver-7.2.0/AutoCarver/combinations/utils/target_rate.py +0 -30
  26. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/__init__.py +0 -0
  27. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/__init__.py +0 -0
  28. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/carvers/utils/__init__.py +0 -0
  29. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/__init__.py +0 -0
  30. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/binary/__init__.py +0 -0
  31. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/continuous/__init__.py +0 -0
  32. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/__init__.py +0 -0
  33. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/combinations.py +0 -0
  34. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/combinations/utils/testing.py +0 -0
  35. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/config.py +0 -0
  36. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/__init__.py +0 -0
  37. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/qualitatives/__init__.py +0 -0
  38. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/quantitatives/__init__.py +0 -0
  39. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/__init__.py +0 -0
  40. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/discretizers/utils/multiprocessing.py +0 -0
  41. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/__init__.py +0 -0
  42. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/features.py +0 -0
  43. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/__init__.py +0 -0
  44. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/categorical_feature.py +0 -0
  45. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/ordinal_feature.py +0 -0
  46. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/qualitatives/qualitative_feature.py +0 -0
  47. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/__init__.py +0 -0
  48. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/datetime_feature.py +0 -0
  49. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/numerical_feature.py +0 -0
  50. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/quantitatives/quantitative_feature.py +0 -0
  51. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/__init__.py +0 -0
  52. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/base_feature.py +0 -0
  53. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/grouped_list.py +0 -0
  54. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/features/utils/serialization.py +0 -0
  55. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/__init__.py +0 -0
  56. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/classification_selector.py +0 -0
  57. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/__init__.py +0 -0
  58. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/base_filters.py +0 -0
  59. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/qualitative_filters.py +0 -0
  60. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/filters/quantitative_filters.py +0 -0
  61. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/__init__.py +0 -0
  62. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/base_measures.py +0 -0
  63. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/qualitative_measures.py +0 -0
  64. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/measures/quantitative_measures.py +0 -0
  65. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/regression_selector.py +0 -0
  66. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/__init__.py +0 -0
  67. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/selectors/utils/pretty_print.py +0 -0
  68. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/__init__.py +0 -0
  69. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/attributes.py +0 -0
  70. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/dependencies.py +0 -0
  71. {autocarver-7.2.0 → autocarver-7.2.2}/AutoCarver/utils/extend_docstring.py +0 -0
  72. {autocarver-7.2.0 → autocarver-7.2.2}/README.md +0 -0
@@ -65,7 +65,7 @@ class BinaryCarver(BaseCarver):
65
65
  config=config,
66
66
  )
67
67
 
68
- def _prepare_data(self, samples: Samples) -> Samples:
68
+ def _prepare_samples(self, samples: Samples) -> Samples:
69
69
  """Validates format and content of X and y."""
70
70
  if samples.train.y is None:
71
71
  raise ValueError(f"[{self.__name__}] y must be provided")
@@ -73,9 +73,9 @@ class BinaryCarver(BaseCarver):
73
73
  if not ((0 in y_values) and (1 in y_values)) or len(y_values) != 2:
74
74
  raise ValueError(f"[{self.__name__}] y must be a binary Series of 0 and 1 (int or float, not object)")
75
75
 
76
- return super()._prepare_data(samples)
76
+ return super()._prepare_samples(samples)
77
77
 
78
- def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.DataFrame | None]:
78
+ def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
79
79
  """Computes crosstabs for specified features and ensures that the crosstab is ordered
80
80
  according to the known labels"""
81
81
  # checking for empty datasets (dev)
@@ -63,7 +63,7 @@ class ContinuousCarver(BaseCarver):
63
63
  config=config,
64
64
  )
65
65
 
66
- def _prepare_data(self, samples: Samples) -> Samples:
66
+ def _prepare_samples(self, samples: Samples) -> Samples:
67
67
  """Validates format and content of X and y."""
68
68
  if samples.train.y is None:
69
69
  raise ValueError(f"[{self.__name__}] y must be provided")
@@ -74,9 +74,9 @@ class ContinuousCarver(BaseCarver):
74
74
  if len(y_values) <= 2:
75
75
  raise ValueError(f"[{self.__name__}] provided y is binary, consider using BinaryCarver instead.")
76
76
 
77
- return super()._prepare_data(samples)
77
+ return super()._prepare_samples(samples)
78
78
 
79
- def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.DataFrame | None]:
79
+ def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
80
80
  """Computes y values for modalities of specified features and ensures the ordering
81
81
  according to the known labels"""
82
82
  # checking for empty datasets
@@ -98,4 +98,4 @@ def get_target_values_by_modality(X: pd.DataFrame, y: pd.Series, feature: BaseFe
98
98
 
99
99
  # reindexing to ensure the right order (labels may be None pre-fit; pandas
100
100
  # treats None as "no reindex" so the original ordering is kept)
101
- return yval.reindex(feature.labels, fill_value=[]) # type: ignore[arg-type]
101
+ return yval.reindex(feature.labels, fill_value=[]) # type: ignore
@@ -53,7 +53,7 @@ class MulticlassCarver(BinaryCarver):
53
53
  if self.config.copy:
54
54
  print("WARNING: can't set copy=True for MulticlassCarver (no inplace DataFrame.assign).")
55
55
 
56
- def _prepare_data(self, samples: Samples) -> Samples:
56
+ def _prepare_samples(self, samples: Samples) -> Samples:
57
57
  """Validates format and content of X and y."""
58
58
  # converting target to str (y is required by Carver.fit)
59
59
  if samples.train.y is None:
@@ -93,11 +93,11 @@ class MulticlassCarver(BinaryCarver):
93
93
  samples = Samples(train=Sample(X, y), dev=Sample(X_dev, y_dev))
94
94
 
95
95
  # preparing datasets and checking for wrong values
96
- samples = self._prepare_data(samples)
96
+ samples = self._prepare_samples(samples)
97
97
 
98
- # getting distinct y classes (_prepare_data raises if y is missing)
99
- assert samples.train.y is not None
100
- y_classes = sorted(samples.train.y.unique().tolist())[1:] # removing one of the classes
98
+ # getting distinct y classes (_prepare_samples raises if y is missing)
99
+ # removing one of the classes
100
+ y_classes = sorted(samples.train.y.unique().tolist())[1:] # type: ignore
101
101
 
102
102
  # adding versionned features
103
103
  self.features.add_feature_versions(y_classes)
@@ -125,7 +125,12 @@ class MulticlassCarver(BinaryCarver):
125
125
  config=replace(self.config, copy=True),
126
126
  )
127
127
 
128
- binary_carver.fit_transform(samples.train.X, train_y_class, X_dev=samples.dev.X, y_dev=dev_y_class)
128
+ binary_carver.fit_transform(
129
+ samples.train.X,
130
+ train_y_class,
131
+ X_dev=samples.dev.X if samples.dev.has_X else None,
132
+ y_dev=dev_y_class,
133
+ )
129
134
 
130
135
  # filtering out dropped features whilst keeping other version tags
131
136
  kept_features = binary_carver.features.versions
@@ -5,6 +5,8 @@ for any task.
5
5
  import json
6
6
  from abc import ABC, abstractmethod
7
7
  from dataclasses import dataclass, field, replace
8
+ from functools import partial
9
+ from multiprocessing import Pool
8
10
  from typing import Self
9
11
 
10
12
  import pandas as pd
@@ -19,6 +21,8 @@ from AutoCarver.combinations import (
19
21
  from AutoCarver.discretizers import BaseDiscretizer, Discretizer, Sample
20
22
  from AutoCarver.discretizers.utils.base_discretizer import DiscretizerConfig
21
23
  from AutoCarver.features import BaseFeature, Features
24
+ from AutoCarver.features.qualitatives import CategoricalFeature, OrdinalFeature
25
+ from AutoCarver.features.quantitatives import QuantitativeFeature
22
26
  from AutoCarver.utils import extend_docstring, has_idisplay
23
27
 
24
28
  # trying to import extra dependencies
@@ -37,16 +41,63 @@ class Samples:
37
41
  dev (Sample): The development sample, containing features (X) and target (y).
38
42
  """
39
43
 
40
- train: Sample = field(default_factory=lambda: Sample(X=None))
41
- dev: Sample = field(default_factory=lambda: Sample(X=None))
44
+ train: Sample = field(default_factory=Sample)
45
+ dev: Sample = field(default_factory=Sample)
42
46
 
43
47
  def fillna(self, features: Features) -> None:
44
48
  """fills up nans in X and X_dev"""
45
49
  self.train.X = features.fillna(self.train.X)
46
- if self.dev.X is not None:
50
+ if self.dev.has_X:
47
51
  self.dev.X = features.fillna(self.dev.X)
48
52
 
49
53
 
54
+ def _carve_feature_worker(
55
+ payload: tuple[BaseFeature, pd.Series | pd.DataFrame | None, pd.Series | pd.DataFrame | None],
56
+ *,
57
+ evaluator: CombinationEvaluator,
58
+ max_n_mod: int,
59
+ min_freq: float,
60
+ dropna: bool,
61
+ ) -> tuple[BaseFeature, bool]:
62
+ """Picklable worker: scores best combination for a single feature.
63
+
64
+ Each pool task receives a pickled deep copy of ``evaluator`` and a single
65
+ ``(feature, xagg, xagg_dev)`` triple; mutations stay local to the worker
66
+ process. The parent reattaches the returned (mutated) feature to its
67
+ ``Features`` container.
68
+ """
69
+ feature, xagg, xagg_dev = payload
70
+ # workers never print per-feature progress; the parent prints a single banner
71
+ evaluator.verbose = False
72
+ best = evaluator.get_best_combination(
73
+ feature, xagg, xagg_dev, max_n_mod=max_n_mod, min_freq=min_freq, dropna=dropna
74
+ )
75
+ return feature, best is not None
76
+
77
+
78
+ def _replace_feature_in_features(features: Features, updated: BaseFeature) -> None:
79
+ """Swaps an existing feature (by version) for the worker-returned copy."""
80
+ if isinstance(updated, CategoricalFeature):
81
+ categoricals = features.categoricals
82
+ for i, existing in enumerate(categoricals):
83
+ if existing.version == updated.version:
84
+ categoricals[i] = updated
85
+ return
86
+ elif isinstance(updated, OrdinalFeature):
87
+ ordinals = features.ordinals
88
+ for i, existing in enumerate(ordinals):
89
+ if existing.version == updated.version:
90
+ ordinals[i] = updated
91
+ return
92
+ elif isinstance(updated, QuantitativeFeature):
93
+ quantitatives = features.quantitatives
94
+ for i, existing in enumerate(quantitatives):
95
+ if existing.version == updated.version:
96
+ quantitatives[i] = updated
97
+ return
98
+ raise KeyError(f"[BaseCarver] feature {updated.version!r} not in Features")
99
+
100
+
50
101
  class BaseCarver(BaseDiscretizer, ABC):
51
102
  """Automatic carving of continuous, discrete, categorical and ordinal
52
103
  features that maximizes association with a binary or continuous target.
@@ -124,14 +175,14 @@ class BaseCarver(BaseDiscretizer, ABC):
124
175
  content["combination_evaluator"] = self.combination_evaluator.to_json()
125
176
  return content
126
177
 
127
- def _prepare_data(self, samples: Samples) -> Samples:
178
+ def _prepare_samples(self, samples: Samples) -> Samples:
128
179
  """Validates format and content of X and y."""
129
180
  if samples.train.y is None:
130
181
  raise ValueError(f"[{self.__name__}] y must be provided, got {samples.train.y}")
131
182
 
132
183
  # Checking for binary target and copying X
133
- samples.train = super()._prepare_data(samples.train)
134
- samples.dev = super()._prepare_data(samples.dev)
184
+ samples.train = super()._prepare_sample(samples.train)
185
+ samples.dev = super()._prepare_sample(samples.dev)
135
186
 
136
187
  # discretizing features at half min_freq so the carver has a finer
137
188
  # granularity to combine when forming optimal groups
@@ -145,7 +196,7 @@ class BaseCarver(BaseDiscretizer, ABC):
145
196
 
146
197
  return samples
147
198
 
148
- def fit( # pylint: disable=W0222
199
+ def fit( # type: ignore
149
200
  self,
150
201
  X: pd.DataFrame,
151
202
  y: pd.Series,
@@ -184,7 +235,7 @@ class BaseCarver(BaseDiscretizer, ABC):
184
235
  samples = Samples(Sample(X, y), Sample(X_dev, y_dev))
185
236
 
186
237
  # preparing datasets and checking for wrong values
187
- samples = self._prepare_data(samples)
238
+ samples = self._prepare_samples(samples)
188
239
 
189
240
  # logging if requested
190
241
  super()._log_if_verbose("---------\n------")
@@ -196,16 +247,55 @@ class BaseCarver(BaseDiscretizer, ABC):
196
247
  # getting all features to carve (features are removed from self.features)
197
248
  all_features = self.features.versions
198
249
 
199
- # carving each feature
200
- for n, feature in enumerate(all_features):
201
- num_iter = f"{n + 1}/{len(all_features)}" # logging iteration number
202
- self._carve_feature(self.features(feature), xaggs, xaggs_dev, num_iter)
250
+ # carving each feature (parallel across features when n_jobs > 1)
251
+ if self.config.n_jobs > 1 and len(all_features) > 1:
252
+ self._carve_features_parallel(all_features, xaggs, xaggs_dev)
253
+ else:
254
+ for n, feature in enumerate(all_features):
255
+ num_iter = f"{n + 1}/{len(all_features)}" # logging iteration number
256
+ self._carve_feature(self.features(feature), xaggs, xaggs_dev, num_iter)
203
257
 
204
258
  # discretizing features based on each feature's values_order
205
259
  super().fit(X, y)
206
260
 
207
261
  return self
208
262
 
263
+ def _carve_features_parallel(
264
+ self,
265
+ all_features: list[str],
266
+ xaggs: dict[str, pd.Series | pd.DataFrame | None],
267
+ xaggs_dev: dict[str, pd.Series | pd.DataFrame | None],
268
+ ) -> None:
269
+ """Dispatches ``_carve_feature`` across a process pool, one task per feature.
270
+
271
+ Per-feature workers receive only the feature instance + its xagg /
272
+ xagg_dev slice (not the full dict). Verbose per-feature logging is
273
+ silenced; a single banner is printed when verbose is on.
274
+ """
275
+ if self.config.verbose:
276
+ print(f"--- [{self.__name__}] Carving {len(all_features)} features on {self.config.n_jobs} workers")
277
+
278
+ payloads = [(self.features(version), xaggs[version], xaggs_dev[version]) for version in all_features]
279
+ worker = partial(
280
+ _carve_feature_worker,
281
+ evaluator=self.combination_evaluator,
282
+ max_n_mod=self.max_n_mod,
283
+ min_freq=self.min_freq,
284
+ dropna=self.config.dropna,
285
+ )
286
+
287
+ with Pool(processes=self.config.n_jobs) as pool:
288
+ for updated_feature, viable in pool.imap_unordered(worker, payloads):
289
+ if viable:
290
+ _replace_feature_in_features(self.features, updated_feature)
291
+ else:
292
+ print(
293
+ f"WARNING: No robust combination for {updated_feature}. Consider "
294
+ "increasing the size of X_dev or dropping the feature (X not "
295
+ "representative of X_dev for this feature)."
296
+ )
297
+ self.features.remove(updated_feature.version)
298
+
209
299
  @abstractmethod
210
300
  def _aggregator(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series | pd.DataFrame | None]:
211
301
  """Helper that aggregates X by y into per-feature crosstabs or means
@@ -239,10 +329,11 @@ class BaseCarver(BaseDiscretizer, ABC):
239
329
 
240
330
  # printing carved distribution, for found, suitable combination
241
331
  if best_combination is not None:
332
+ dev_sample = self.combination_evaluator.samples.dev
242
333
  self._print_xagg(
243
334
  feature,
244
335
  xagg=self.combination_evaluator.samples.train.xagg,
245
- xagg_dev=self.combination_evaluator.samples.dev.xagg,
336
+ xagg_dev=dev_sample.xagg if dev_sample.has_xagg else None,
246
337
  message="Carved distribution",
247
338
  )
248
339
 
@@ -392,7 +483,7 @@ def discretize(
392
483
  samples.train.X = discretizer.fit_transform(**samples.train)
393
484
 
394
485
  # applying discretizer on X_dev if provided
395
- if samples.dev.X is not None:
486
+ if samples.dev.has_X:
396
487
  samples.dev.X = discretizer.transform(**samples.dev)
397
488
 
398
489
  return samples
@@ -44,14 +44,14 @@ def prettier_xagg(
44
44
 
45
45
  # adding custom caption/title
46
46
  if caption is not None:
47
- nicer_xagg = nicer_xagg.set_caption(caption)
47
+ nicer_xagg = nicer_xagg.set_caption(caption) # type: ignore
48
48
 
49
49
  # hiding index for dev
50
50
  if hide_index:
51
- nicer_xagg.hide(axis="index")
51
+ nicer_xagg.hide(axis="index") # type: ignore
52
52
 
53
53
  # converting to html
54
- nicer_xagg = nicer_xagg._repr_html_() # pylint: disable=W0212
54
+ nicer_xagg = nicer_xagg._repr_html_() # type: ignore
55
55
 
56
56
  return nicer_xagg
57
57
 
@@ -14,15 +14,19 @@ from AutoCarver.combinations.utils.combination_evaluator import (
14
14
  CombinationEvaluator,
15
15
  )
16
16
  from AutoCarver.combinations.utils.combinations import combination_formatter
17
+ from AutoCarver.combinations.utils.target_rate import TargetRate
17
18
 
18
19
 
19
- class BinaryCombinationEvaluator(CombinationEvaluator, ABC):
20
+ class BinaryCombinationEvaluator(CombinationEvaluator[pd.DataFrame], ABC):
20
21
  """Binary combination evaluator class."""
21
22
 
22
23
  is_y_binary = True
23
24
  _target_rate_classes: list[type[BinaryTargetRate]] = [TargetMean, OddsRatio, Woe]
25
+ # narrow inherited attribute: binary evaluators always carry a BinaryTargetRate
26
+ # (enforced by _init_target_rate).
27
+ target_rate: BinaryTargetRate
24
28
 
25
- def _init_target_rate(self, target_rate: BinaryTargetRate | None) -> BinaryTargetRate:
29
+ def _init_target_rate(self, target_rate: TargetRate[pd.DataFrame] | None) -> BinaryTargetRate:
26
30
  """Initializes target rate."""
27
31
  if target_rate is None:
28
32
  return TargetMean()
@@ -31,8 +35,11 @@ class BinaryCombinationEvaluator(CombinationEvaluator, ABC):
31
35
  return target_rate
32
36
 
33
37
  def _association_measure(
34
- self, xagg: AggregatedSample, n_obs: int | None = None, tol: float = 1e-10
35
- ) -> dict[str, float]:
38
+ self,
39
+ xagg: AggregatedSample | pd.Series | pd.DataFrame,
40
+ n_obs: int | None = None,
41
+ tol: float = 1e-10,
42
+ ) -> dict[str, float | None]:
36
43
  """Computes measures of association between feature and target by crosstab.
37
44
 
38
45
  Used for the raw (one-shot) distribution. The hot per-combination loop
@@ -1,6 +1,7 @@
1
1
  """set of target rates for binary classification"""
2
2
 
3
3
  from abc import ABC
4
+ from typing import overload
4
5
 
5
6
  import numpy as np
6
7
  import pandas as pd
@@ -8,12 +9,16 @@ import pandas as pd
8
9
  from AutoCarver.combinations.utils import TargetRate
9
10
 
10
11
 
11
- class BinaryTargetRate(TargetRate, ABC):
12
+ class BinaryTargetRate(TargetRate[pd.DataFrame], ABC):
12
13
  """Binary target rate class."""
13
14
 
14
15
  __name__ = "binary_target_rate"
15
16
 
16
- def compute(self, xagg: pd.DataFrame) -> pd.DataFrame:
17
+ @overload
18
+ def compute(self, xagg: pd.Series | pd.DataFrame) -> pd.DataFrame: ...
19
+ @overload
20
+ def compute(self, xagg: None) -> None: ...
21
+ def compute(self, xagg: pd.Series | pd.DataFrame | None) -> pd.DataFrame | None:
17
22
  """Computes the target rate.
18
23
 
19
24
  Parameters
@@ -31,8 +36,12 @@ class BinaryTargetRate(TargetRate, ABC):
31
36
  # frequency per modality
32
37
  frequency = xagg.sum(axis=1) / xagg.sum().sum()
33
38
 
34
- # computing target rate
35
- return pd.DataFrame({self.__name__: self._compute(xagg), "frequency": frequency})
39
+ # computing target rate. `_compute` expects pd.DataFrame (Generic
40
+ # XAgg=DataFrame); compute()'s wide signature is for LSP matching,
41
+ # callers always pass a crosstab here.
42
+ return pd.DataFrame(
43
+ {self.__name__: self._compute(xagg), "frequency": frequency} # type: ignore
44
+ )
36
45
  return None
37
46
 
38
47
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  from abc import ABC
4
4
  from collections.abc import Iterable, Iterator
5
+ from typing import Any
5
6
 
6
7
  import numpy as np
7
8
  import pandas as pd
@@ -11,15 +12,20 @@ from tqdm import tqdm
11
12
  from AutoCarver.combinations.continuous.continuous_target_rates import ContinuousTargetRate, TargetMean, TargetMedian
12
13
  from AutoCarver.combinations.utils.combination_evaluator import AggregatedSample, CombinationEvaluator
13
14
  from AutoCarver.combinations.utils.combinations import combination_formatter
15
+ from AutoCarver.combinations.utils.target_rate import TargetRate
16
+ from AutoCarver.combinations.utils.testing import Keys, is_viable, test_viability
14
17
 
15
18
 
16
- class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
19
+ class ContinuousCombinationEvaluator(CombinationEvaluator[pd.Series], ABC):
17
20
  """Continuous combination evaluator class."""
18
21
 
19
22
  is_y_continuous = True
20
23
  _target_rate_classes: list[type[ContinuousTargetRate]] = [TargetMean, TargetMedian]
24
+ # narrow the inherited `target_rate: TargetRate` annotation — continuous
25
+ # carvers always carry a ContinuousTargetRate (enforced by _init_target_rate).
26
+ target_rate: ContinuousTargetRate
21
27
 
22
- def _init_target_rate(self, target_rate: ContinuousTargetRate | None) -> ContinuousTargetRate:
28
+ def _init_target_rate(self, target_rate: TargetRate[pd.Series] | None) -> ContinuousTargetRate:
23
29
  """Initializes target rate."""
24
30
  if target_rate is None:
25
31
  return TargetMean()
@@ -28,7 +34,10 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
28
34
  return target_rate
29
35
 
30
36
  def _association_measure(
31
- self, xagg: AggregatedSample, n_obs: int | None = None, tol: float = 1e-10
37
+ self,
38
+ xagg: AggregatedSample | pd.Series | pd.DataFrame,
39
+ n_obs: int | None = None,
40
+ tol: float = 1e-10,
32
41
  ) -> dict[str, float | None]:
33
42
  """Computes measures of association between feature and quantitative target.
34
43
 
@@ -134,13 +143,29 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
134
143
  :meth:`_association_measure`).
135
144
  """
136
145
  raw_xagg = self.samples.train.xagg
137
- # Pre-rank y once for the whole feature
138
- R_per_mod, n_per_mod, N, tie_corr = _modality_rank_stats(raw_xagg)
146
+ # Pre-rank y once for the whole feature.
147
+ R_per_mod, n_per_mod, N, tie_corr = _modality_rank_stats(raw_xagg) # type: ignore
139
148
 
140
149
  # Map modality label -> position in R_per_mod / n_per_mod
141
150
  mod_to_pos: dict = {m: i for i, m in enumerate(raw_xagg.index)}
142
151
  n_mod = len(mod_to_pos)
143
152
 
153
+ # Cache per-modality (n, sum_y) for the viability fast path.
154
+ # Resets each time _compute_associations runs so the nan-pass refreshes
155
+ # the cache after _apply_best_combination changes samples.train.xagg.
156
+ sum_y_per_mod = _modality_sum_y(raw_xagg) # type: ignore
157
+ # Why: heterogeneous-value dict; annotate `Any` so downstream readers (line 203-204
158
+ # and _get_dev_modality_stats) can narrow to the per-key concrete type without ty
159
+ # unioning across all value types.
160
+ self._train_modality_stats: dict[str, Any] = {
161
+ "n_per_mod": n_per_mod.astype(float),
162
+ "sum_y_per_mod": sum_y_per_mod,
163
+ "mod_to_pos": mod_to_pos,
164
+ "n_mod": n_mod,
165
+ }
166
+ self._dev_modality_stats: dict[str, Any] | None = None # lazy; aligned to train's mod_to_pos
167
+ self._dev_modality_stats_id: int | None = None
168
+
144
169
  batch: list[dict] = []
145
170
  for grouped_xagg in tqdm(grouped_xaggs, desc="Computing associations", disable=not self.verbose):
146
171
  batch.append(grouped_xagg)
@@ -166,6 +191,100 @@ class ContinuousCombinationEvaluator(CombinationEvaluator, ABC):
166
191
  n_mod=n_mod,
167
192
  )
168
193
 
194
+ def _get_dev_modality_stats(self) -> dict | None:
195
+ """Lazily build per-modality ``(n, sum_y)`` for the dev sample,
196
+ aligned to ``self._train_modality_stats['mod_to_pos']`` (zeros for
197
+ modalities absent from dev). Returns ``None`` when no dev sample is set.
198
+
199
+ Cache is keyed by ``id(dev_xagg)`` so external reassignment of
200
+ ``samples.dev`` between viability iterations triggers a fresh
201
+ computation (the unit tests rely on this; production flows reassign
202
+ dev only via ``samples.set`` at the start of ``get_best_combination``).
203
+ """
204
+ if not self.samples.dev.has_xagg:
205
+ return None
206
+ dev_xagg = self.samples.dev.xagg
207
+ if self._dev_modality_stats is not None and self._dev_modality_stats_id == id(dev_xagg):
208
+ return self._dev_modality_stats
209
+ train_stats = self._train_modality_stats
210
+ mod_to_pos: dict = train_stats["mod_to_pos"]
211
+ n_mod: int = train_stats["n_mod"]
212
+
213
+ n = np.zeros(n_mod, dtype=float)
214
+ sum_y = np.zeros(n_mod, dtype=float)
215
+ for mod, vals in dev_xagg.items():
216
+ pos = mod_to_pos.get(mod)
217
+ if pos is None:
218
+ continue # dev has a modality train doesn't — skip
219
+ arr = np.asarray(vals, dtype=float)
220
+ n[pos] = arr.size
221
+ sum_y[pos] = float(arr.sum())
222
+
223
+ self._dev_modality_stats = {
224
+ "n_per_mod": n,
225
+ "sum_y_per_mod": sum_y,
226
+ "mod_to_pos": mod_to_pos,
227
+ "n_mod": n_mod,
228
+ }
229
+ self._dev_modality_stats_id = id(dev_xagg)
230
+ return self._dev_modality_stats
231
+
232
+ def _test_viability_train(self, combination: dict) -> dict:
233
+ """Fast-path viability on train; falls back to legacy when the active
234
+ target rate's ``compute_from_stats`` returns ``None`` (e.g.
235
+ ``TargetMedian`` whose default closed-form path is a no-op).
236
+ """
237
+ stats = getattr(self, "_train_modality_stats", None)
238
+ if stats is not None:
239
+ train_rates = self.target_rate.compute_from_stats(
240
+ stats=stats, index_to_groupby=combination["index_to_groupby"]
241
+ )
242
+ if train_rates is not None:
243
+ return test_viability(train_rates, self.min_freq, self.target_rate.__name__)
244
+ # Fallback: legacy grouper + apply(np.mean/median) over Python lists
245
+ return super()._test_viability_train(combination)
246
+
247
+ def _get_viable_combination(self, associations: list[dict]) -> dict | None:
248
+ """Walks associations under the fast viability path and materialises
249
+ the winning combination's grouped xagg once at the end.
250
+
251
+ The fast path skips ``combination['xagg']`` because the closed-form
252
+ viability check doesn't need it; downstream consumers (debug, tests,
253
+ and any future code that introspects the winner) still expect to see
254
+ it, so we rebuild it for the winner only — that's one ``_grouper``
255
+ call per feature instead of ~13k per feature.
256
+ """
257
+ viable = super()._get_viable_combination(associations)
258
+ if viable is not None and viable.get("xagg") is None:
259
+ # `clean_combination` pops `index_to_groupby` during historization
260
+ # earlier in the loop, so rebuild it from the still-present
261
+ # `combination` list-of-groups.
262
+ index_to_groupby = viable.get("index_to_groupby")
263
+ if index_to_groupby is None:
264
+ index_to_groupby = combination_formatter(viable["combination"])
265
+ viable["xagg"] = self._grouper(self.samples.train, index_to_groupby)
266
+ return viable
267
+
268
+ def _test_viability_dev(self, test_results: dict, combination: dict) -> dict:
269
+ """Fast-path viability on dev; falls back to legacy when the active
270
+ target rate's ``compute_from_stats`` returns ``None``.
271
+ """
272
+ if not test_results[Keys.VIABLE.value] or not self.samples.dev.has_xagg:
273
+ return {**test_results, "dev": {Keys.VIABLE.value: None}}
274
+
275
+ dev_stats = self._get_dev_modality_stats()
276
+ if dev_stats is not None:
277
+ dev_rates = self.target_rate.compute_from_stats(
278
+ stats=dev_stats, index_to_groupby=combination["index_to_groupby"]
279
+ )
280
+ if dev_rates is not None:
281
+ train_target_rate = test_results["train_rates"][self.target_rate.__name__]
282
+ dev_results = test_viability(dev_rates, self.min_freq, self.target_rate.__name__, train_target_rate)
283
+ merged = {**test_results, **dev_results}
284
+ merged[Keys.VIABLE.value] = is_viable(merged)
285
+ return merged
286
+ return super()._test_viability_dev(test_results, combination)
287
+
169
288
 
170
289
  class KruskalCombinations(ContinuousCombinationEvaluator):
171
290
  """Kruskal-Wallis' H based combination evaluation toolkit"""
@@ -186,6 +305,18 @@ _KRUSKAL_BATCH_SIZE = 1024
186
305
  # ---------------------------------------------------------------------------
187
306
 
188
307
 
308
+ def _modality_sum_y(raw_xagg: pd.Series) -> np.ndarray:
309
+ """Per-modality ``sum_y`` aligned with ``raw_xagg.index``.
310
+
311
+ Used by the viability fast path (Step 3.5) to compute group target means
312
+ in closed form (``sum_y_g / n_g``) instead of applying ``np.mean`` to
313
+ Python lists of y values per candidate.
314
+ """
315
+ return np.fromiter(
316
+ (float(np.asarray(v, dtype=float).sum()) for v in raw_xagg.values), dtype=float, count=len(raw_xagg)
317
+ )
318
+
319
+
189
320
  def _modality_rank_stats(
190
321
  raw_xagg: pd.Series,
191
322
  ) -> tuple[np.ndarray | None, np.ndarray, int, float | None]: