InsideForest 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/cluster_selector.py +3 -0
  2. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/inside_forest.py +215 -1
  3. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/labels.py +6 -0
  4. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/metadata.py +51 -5
  5. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/models.py +10 -5
  6. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/PKG-INFO +1 -1
  7. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/SOURCES.txt +3 -0
  8. {insideforest-0.3.2 → insideforest-0.3.3}/PKG-INFO +1 -1
  9. {insideforest-0.3.2 → insideforest-0.3.3}/README.md +57 -6
  10. insideforest-0.3.3/experiments/select_clusters_hyperparam.py +86 -0
  11. {insideforest-0.3.2 → insideforest-0.3.3}/setup.py +1 -1
  12. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_cluster_selector.py +16 -0
  13. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip_helpers.py +6 -0
  14. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_fit_predict.py +16 -0
  15. insideforest-0.3.3/tests/test_metadata_run_experiments.py +40 -0
  16. insideforest-0.3.3/tests/test_models.py +30 -0
  17. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/__init__.py +0 -0
  18. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/descrip.py +0 -0
  19. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/regions.py +0 -0
  20. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/trees.py +0 -0
  21. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/dependency_links.txt +0 -0
  22. {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/top_level.txt +0 -0
  23. {insideforest-0.3.2 → insideforest-0.3.3}/LICENSE +0 -0
  24. {insideforest-0.3.2 → insideforest-0.3.3}/experiments/__init__.py +0 -0
  25. {insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark.py +0 -0
  26. {insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark_get_rangos.py +0 -0
  27. {insideforest-0.3.2 → insideforest-0.3.3}/experiments/rf_param_benchmark.py +0 -0
  28. {insideforest-0.3.2 → insideforest-0.3.3}/experiments/summary_benchmark.py +0 -0
  29. {insideforest-0.3.2 → insideforest-0.3.3}/setup.cfg +0 -0
  30. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_chimera_values_selector.py +0 -0
  31. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip.py +0 -0
  32. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_eps_search_perf.py +0 -0
  33. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_params.py +0 -0
  34. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_regressor_fit_predict.py +0 -0
  35. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_iou_equivalence.py +0 -0
  36. {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_trees.py +0 -0
@@ -86,6 +86,9 @@ def select_clusters(
86
86
  ponderador = regla['ponderador']
87
87
  cluster = regla['cluster']
88
88
 
89
+ missing_cols = [col for col in variables if col not in df_datos.columns]
90
+ if missing_cols:
91
+ raise KeyError(f"Columns not found in df_datos: {missing_cols}")
89
92
  X_datos = df_datos[variables]
90
93
  condiciones = [
91
94
  (X_datos[var].to_numpy() >= linf[var]) & (X_datos[var].to_numpy() <= lsup[var])
@@ -1,14 +1,93 @@
1
- import pandas as pd
1
+ from typing import Optional, Dict, Any, List, Tuple
2
+
3
+ import numpy as np
2
4
  import joblib
5
+ try:
6
+ import pandas as pd
7
+ _HAS_PANDAS = True
8
+ except Exception:
9
+ _HAS_PANDAS = False
3
10
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
4
11
  from sklearn.exceptions import NotFittedError
5
12
  from sklearn.utils.validation import check_is_fitted
13
+ from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
14
+ from sklearn.utils.multiclass import type_of_target
6
15
 
7
16
  from .trees import Trees
8
17
  from .regions import Regions
9
18
  from .descrip import get_frontiers
10
19
 
11
20
 
21
+ # ---------- FAST helpers ----------
22
+ def _size_bucket(n: int, d: int) -> str:
23
+ prod = n * d
24
+ if prod <= 50_000:
25
+ return "small"
26
+ elif prod <= 200_000:
27
+ return "medium"
28
+ elif prod <= 1_000_000:
29
+ return "large"
30
+ else:
31
+ return "huge"
32
+
33
+
34
def _choose_k_features(n: int, d: int) -> int:
    """Pick how many features to keep for a dataset of ``n`` rows and ``d`` columns.

    The cap shrinks as the dataset grows (64, 48, 32, 24 for the small,
    medium, large and huge buckets reported by ``_size_bucket``), with a
    floor of 8 features whenever at least 8 exist.

    Parameters
    ----------
    n : int
        Number of rows.
    d : int
        Number of available features.

    Returns
    -------
    int
        Number of features to retain; never larger than ``d``.
    """
    caps = {"small": 64, "medium": 48, "large": 32}
    # Any label outside the table is treated as the "huge" bucket.
    k = min(d, caps.get(_size_bucket(n, d), 24))
    # The floor of 8 must not exceed the number of features that actually
    # exist, otherwise a selector asked for k > d would fail.
    return min(d, max(8, k))
45
+
46
+
47
def _choose_fast_params(n: int, d: int) -> Dict[str, Any]:
    """Build the "fast" preset for a dataset of ``n`` rows and ``d`` columns.

    Larger size buckets get fewer and shallower trees with bigger leaves,
    and a higher leaf percentile with a smaller low-leaf fraction.

    Returns
    -------
    dict
        Keys ``rf_params`` (random-forest keyword arguments),
        ``tree_params`` (``percentil`` and ``low_frac``), ``divide``,
        ``method`` and ``get_detail``.
    """
    # bucket -> (n_estimators, max_depth, min_samples_leaf, percentil, low_frac)
    presets = {
        "small": (80, 12, 3, 98, 0.02),
        "medium": (60, 10, 5, 99, 0.01),
        "large": (40, 9, 8, 99.5, 0.0075),
    }
    huge_preset = (30, 8, 10, 99.7, 0.005)

    n_estimators, max_depth, min_samples_leaf, percentil, low_frac = presets.get(
        _size_bucket(n, d), huge_preset
    )

    return {
        "rf_params": {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_samples_leaf": min_samples_leaf,
            "n_jobs": -1,
            "random_state": 42,
        },
        "tree_params": {"percentil": percentil, "low_frac": low_frac},
        # These three settings are the same for every bucket.
        "divide": 3,
        "method": "menu",
        "get_detail": False,
    }
77
+
78
+
79
+ def _merge_dicts(base: Dict[str, Any], override: Optional[Dict[str, Any]]) -> Dict[str, Any]:
80
+ if not override:
81
+ return base
82
+ out = dict(base)
83
+ for k, v in override.items():
84
+ if isinstance(v, dict) and isinstance(out.get(k), dict):
85
+ out[k] = {**out[k], **v}
86
+ else:
87
+ out[k] = v
88
+ return out
89
+
90
+
12
91
  class _BaseInsideForest:
13
92
  """Internal base class handling shared ``fit`` and ``predict`` logic.
14
93
 
@@ -67,7 +146,12 @@ class _BaseInsideForest:
67
146
  get_detail=False,
68
147
  leaf_percentile=95,
69
148
  low_leaf_fraction=0.05,
149
+ auto_fast=False,
150
+ auto_feature_reduce=False,
151
+ explicit_k_features: Optional[int] = None,
152
+ fast_overrides: Optional[Dict[str, Any]] = None,
70
153
  ):
154
+ self.rf_cls = rf_cls
71
155
  self.rf_params = rf_params or {}
72
156
  self.tree_params = tree_params or {}
73
157
  self.var_obj = var_obj
@@ -79,6 +163,19 @@ class _BaseInsideForest:
79
163
  self.leaf_percentile = leaf_percentile
80
164
  self.low_leaf_fraction = low_leaf_fraction
81
165
 
166
+ # FAST knobs
167
+ self.auto_fast = auto_fast
168
+ self.auto_feature_reduce = auto_feature_reduce
169
+ self.explicit_k_features = explicit_k_features
170
+ self.fast_overrides = fast_overrides or {}
171
+
172
+ # FAST bookkeeping
173
+ self._feature_mask_: Optional[np.ndarray] = None
174
+ self.feature_names_in_: Optional[List[str]] = None
175
+ self.feature_names_out_: Optional[List[str]] = None
176
+ self._size_bucket_: Optional[str] = None
177
+ self._fast_params_used_: Optional[Dict[str, Any]] = None
178
+
82
179
  # Ensure tree parameters include the percentile settings
83
180
  self.tree_params.setdefault("percentil", leaf_percentile)
84
181
  self.tree_params.setdefault("low_frac", low_leaf_fraction)
@@ -121,6 +218,10 @@ class _BaseInsideForest:
121
218
  "get_detail": self.get_detail,
122
219
  "leaf_percentile": self.leaf_percentile,
123
220
  "low_leaf_fraction": self.low_leaf_fraction,
221
+ "auto_fast": self.auto_fast,
222
+ "auto_feature_reduce": self.auto_feature_reduce,
223
+ "explicit_k_features": self.explicit_k_features,
224
+ "fast_overrides": self.fast_overrides,
124
225
  }
125
226
 
126
227
  def set_params(self, **params):
@@ -164,6 +265,10 @@ class _BaseInsideForest:
164
265
  "get_detail",
165
266
  "leaf_percentile",
166
267
  "low_leaf_fraction",
268
+ "auto_fast",
269
+ "auto_feature_reduce",
270
+ "explicit_k_features",
271
+ "fast_overrides",
167
272
  }:
168
273
  setattr(self, key, value)
169
274
  if key == "leaf_percentile":
@@ -177,6 +282,59 @@ class _BaseInsideForest:
177
282
 
178
283
  return self
179
284
 
285
def _maybe_reduce_features(self, X, y=None):
    """Optionally keep only the ``k`` highest-scoring columns of ``X``.

    When ``self.auto_feature_reduce`` is false, ``X`` is returned untouched
    and only the feature-name bookkeeping is refreshed.

    Otherwise ``k`` is ``self.explicit_k_features`` if set, else the value
    from ``_choose_k_features``, capped at the number of columns. Columns
    are scored with mutual information when ``y`` is a binary, multiclass
    or one-dimensional continuous target. In every other case (no ``y``,
    multi-output ``y``, or an unrecognised target type) the ``k`` columns
    with the largest variance are kept.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Input features.
    y : array-like, optional
        Target used to score the features.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        ``X`` restricted to the selected columns. A DataFrame is returned
        for DataFrame input, an array otherwise.

    Raises
    ------
    ValueError
        If fewer than one feature would be kept.

    Notes
    -----
    Sets ``feature_names_in_``, ``feature_names_out_`` and
    ``_feature_mask_`` on the instance.
    """
    is_df = _HAS_PANDAS and isinstance(X, pd.DataFrame)
    self.feature_names_in_ = list(X.columns) if is_df else None

    if not self.auto_feature_reduce:
        self.feature_names_out_ = list(X.columns) if is_df else None
        self._feature_mask_ = None
        return X

    X_arr = X.values if is_df else np.asarray(X)
    n, d = X_arr.shape

    if self.explicit_k_features is not None:
        k = self.explicit_k_features
    else:
        k = _choose_k_features(n, d)
    k = min(k, d)
    if k < 1:
        raise ValueError(
            f"Feature reduction needs at least one feature to keep, got k={k}"
        )

    support = None
    if y is not None:
        # Best effort: a target whose type cannot be determined falls back
        # to the variance ranking instead of aborting the fit.
        try:
            ytype = type_of_target(y)
        except Exception:
            ytype = None

        if ytype in {"binary", "multiclass"}:
            score_func = mutual_info_classif
        elif ytype == "continuous":
            score_func = mutual_info_regression
        else:
            # Includes "continuous-multioutput": mutual_info_regression
            # requires a 1-D target, so multi-output y uses variance.
            score_func = None

        if score_func is not None:
            support = SelectKBest(score_func, k=k).fit(X_arr, y).get_support()

    if support is None:
        variances = X_arr.var(axis=0)
        top_idx = np.argsort(-variances)[:k]
        support = np.zeros(d, dtype=bool)
        support[top_idx] = True

    self._feature_mask_ = support

    if not is_df:
        self.feature_names_out_ = None
        return X_arr[:, support]

    # Select by mask rather than by label. Routing the labels through a
    # NumPy array would turn mixed-type column names into strings and
    # break the subsequent lookup.
    self.feature_names_out_ = [
        name for name, keep in zip(self.feature_names_in_, support) if keep
    ]
    return X.loc[:, support]
337
+
180
338
  def fit(self, X, y=None, rf=None):
181
339
  """Fit the internal random forest and compute cluster labels.
182
340
 
@@ -220,13 +378,53 @@ class _BaseInsideForest:
220
378
  else:
221
379
  if isinstance(X, pd.DataFrame):
222
380
  X_df = X.copy()
381
+ if self.var_obj in X_df.columns:
382
+ X_df = X_df.drop(columns=[self.var_obj])
223
383
  else:
224
384
  X_df = pd.DataFrame(data=X)
225
385
 
226
386
  # Replace spaces with underscores to keep compatibility with Trees
227
387
  X_df.columns = [str(c).replace(" ", "_") for c in X_df.columns]
388
+
389
+ # 0) Feature reduction (optional)
390
+ Xr = self._maybe_reduce_features(X_df, y)
391
+ if _HAS_PANDAS and isinstance(Xr, pd.DataFrame):
392
+ X_df = Xr
393
+ else:
394
+ X_df = pd.DataFrame(Xr)
228
395
  self.feature_names_ = list(X_df.columns)
229
396
 
397
+ # 1) Fast preset (optional)
398
+ if self.auto_fast:
399
+ n, d = X_df.shape
400
+ auto = _choose_fast_params(n, d)
401
+ combined = dict(auto)
402
+
403
+ if isinstance(self.rf_params, dict):
404
+ combined["rf_params"] = {**auto["rf_params"], **self.rf_params}
405
+ if isinstance(self.tree_params, dict):
406
+ combined["tree_params"] = {**auto["tree_params"], **self.tree_params}
407
+ if hasattr(self, "divide"):
408
+ combined["divide"] = getattr(self, "divide", auto["divide"])
409
+ if hasattr(self, "method"):
410
+ combined["method"] = "menu" if y is not None else getattr(self, "method", auto["method"])
411
+ if hasattr(self, "get_detail"):
412
+ combined["get_detail"] = getattr(self, "get_detail", auto["get_detail"])
413
+
414
+ combined = _merge_dicts(combined, self.fast_overrides)
415
+
416
+ self._fast_params_used_ = combined
417
+ self._size_bucket_ = _size_bucket(n, d)
418
+
419
+ self.rf_params = combined.get("rf_params", self.rf_params)
420
+ self.tree_params = combined.get("tree_params", self.tree_params)
421
+ self.divide = combined.get("divide", self.divide)
422
+ self.method = combined.get("method", self.method)
423
+ self.get_detail = combined.get("get_detail", self.get_detail)
424
+
425
+ self.rf = self.rf_cls(**self.rf_params)
426
+ self.trees = Trees(**self.tree_params)
427
+
230
428
  # Allow passing a custom random forest estimator
231
429
  if rf is not None:
232
430
  self.rf = rf
@@ -518,6 +716,10 @@ class InsideForestClassifier(_BaseInsideForest):
518
716
  get_detail=False,
519
717
  leaf_percentile=95,
520
718
  low_leaf_fraction=0.05,
719
+ auto_fast=False,
720
+ auto_feature_reduce=False,
721
+ explicit_k_features: Optional[int] = None,
722
+ fast_overrides: Optional[Dict[str, Any]] = None,
521
723
  ):
522
724
  super().__init__(
523
725
  RandomForestClassifier,
@@ -531,6 +733,10 @@ class InsideForestClassifier(_BaseInsideForest):
531
733
  get_detail=get_detail,
532
734
  leaf_percentile=leaf_percentile,
533
735
  low_leaf_fraction=low_leaf_fraction,
736
+ auto_fast=auto_fast,
737
+ auto_feature_reduce=auto_feature_reduce,
738
+ explicit_k_features=explicit_k_features,
739
+ fast_overrides=fast_overrides,
534
740
  )
535
741
 
536
742
 
@@ -550,6 +756,10 @@ class InsideForestRegressor(_BaseInsideForest):
550
756
  get_detail=False,
551
757
  leaf_percentile=95,
552
758
  low_leaf_fraction=0.05,
759
+ auto_fast=False,
760
+ auto_feature_reduce=False,
761
+ explicit_k_features: Optional[int] = None,
762
+ fast_overrides: Optional[Dict[str, Any]] = None,
553
763
  ):
554
764
  super().__init__(
555
765
  RandomForestRegressor,
@@ -563,6 +773,10 @@ class InsideForestRegressor(_BaseInsideForest):
563
773
  get_detail=get_detail,
564
774
  leaf_percentile=leaf_percentile,
565
775
  low_leaf_fraction=low_leaf_fraction,
776
+ auto_fast=auto_fast,
777
+ auto_feature_reduce=auto_feature_reduce,
778
+ explicit_k_features=explicit_k_features,
779
+ fast_overrides=fast_overrides,
566
780
  )
567
781
 
568
782
 
@@ -129,6 +129,12 @@ class Labels:
129
129
  upper_bounds = sub_df.loc[row_index, 'lsup'].copy()
130
130
  variables = list(upper_bounds.index)
131
131
 
132
+ # Early exit when there are no variables to filter on.
133
+ # Returning an empty DataFrame prevents index errors when constructing
134
+ # boolean conditions on an empty list of variables.
135
+ if len(variables) == 0:
136
+ return df.iloc[0:0]
137
+
132
138
  conditions = [
133
139
  (df[var] <= upper_bounds[var]) & (df[var] > lower_bounds[var])
134
140
  for var in variables
@@ -297,7 +297,12 @@ def conditions_to_tokens(conds: list[str]) -> set[str]:
297
297
  # 2. GENERADOR DE EXPERIMENTOS PARA UN SOLO Df2
298
298
  # ------------------------------------------------------------------ #
299
299
 
300
- def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
300
+ def experiments_from_df2(
301
+ df2: pd.DataFrame,
302
+ meta: pd.DataFrame,
303
+ df_data: pd.DataFrame | None = None,
304
+ var_obj: str = "target",
305
+ ) -> pd.DataFrame:
301
306
  """Generate pairwise cluster comparisons for a single Df2.
302
307
 
303
308
  Parameters
@@ -307,13 +312,20 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
307
312
  ``cluster_ef_sample`` and ``cluster_n_sample`` columns.
308
313
  meta : pd.DataFrame
309
314
  Metadata indexed by ``rule_token`` providing actionability metrics.
315
+ df_data : pd.DataFrame, optional
316
+ Raw dataset used to compute cluster descriptions. If provided, the
317
+ effectiveness and number of observations for the intersection between
318
+ clusters is computed by applying the shared rules to ``df_data``.
319
+ var_obj : str, default "target"
320
+ Name of the target column inside ``df_data`` used to estimate
321
+ effectiveness.
310
322
 
311
323
  Returns
312
324
  -------
313
325
  pd.DataFrame
314
326
  Each row contains the comparison between two clusters along with the
315
- exclusive variables and a score penalizing difficult actions and
316
- rewarding overlap.
327
+ exclusive variables, intersection statistics and a score penalizing
328
+ difficult actions and rewarding overlap.
317
329
  """
318
330
  # --- action table -----------------------------
319
331
  meta_idx = meta.set_index('rule_token')
@@ -328,6 +340,28 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
328
340
  only_a = sorted(conds_a - conds_b)
329
341
  only_b = sorted(conds_b - conds_a)
330
342
 
343
+ # -------------------- intersection stats -----------------------
344
+ inter_ef = None
345
+ inter_n = None
346
+ if df_data is not None and inters:
347
+ def _apply_conditions(data, conds):
348
+ for cond in conds:
349
+ match = re.match(
350
+ r"\s*([-\d\.eE]+)\s*<=\s*([A-Za-z_][A-Za-z0-9_]*)\s*<=\s*([-\d\.eE]+)",
351
+ str(cond),
352
+ )
353
+ if match and match.group(2) in data.columns:
354
+ low = float(match.group(1))
355
+ high = float(match.group(3))
356
+ var = match.group(2)
357
+ data = data[(data[var] >= low) & (data[var] <= high)]
358
+ return data
359
+
360
+ df_inter = _apply_conditions(df_data.copy(), inters)
361
+ inter_n = int(df_inter.shape[0])
362
+ if var_obj in df_inter.columns and inter_n > 0:
363
+ inter_ef = float(df_inter[var_obj].mean())
364
+
331
365
  # Determine which cluster has lower effectiveness
332
366
  delta_ef = row_a['cluster_ef_sample'] - row_b['cluster_ef_sample']
333
367
  row_a_subset, row_b_subset = (row_a, row_b) if delta_ef < 0 else (row_b, row_a)
@@ -378,6 +412,8 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
378
412
  'n_only_a' : n_only_a,
379
413
  'n_only_b' : n_only_b,
380
414
  'intersection' : inters,
415
+ 'intersection_ef_sample': inter_ef,
416
+ 'intersection_n_sample' : inter_n,
381
417
  'only_cluster_a' : only_subset_a,
382
418
  'only_cluster_b' : only_subset_b,
383
419
  'score' : score,
@@ -390,7 +426,11 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
390
426
  # ------------------------------------------------------------------ #
391
427
  # 3. PIPELINE GENERAL PARA «n» Df2
392
428
  # ------------------------------------------------------------------ #
393
- def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
429
+ def run_experiments(
430
+ mx,
431
+ df2_dict: dict[str, pd.DataFrame],
432
+ data_dict: dict[str, pd.DataFrame] | None = None,
433
+ ) -> pd.DataFrame:
394
434
  """Generate and consolidate hypotheses for multiple datasets.
395
435
 
396
436
  Parameters
@@ -399,6 +439,10 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
399
439
  Instance used to extract metadata from ``cluster_descripcion`` fields.
400
440
  df2_dict : dict[str, pd.DataFrame]
401
441
  Mapping of dataset name to its corresponding Df2 table.
442
+ data_dict : dict[str, pd.DataFrame], optional
443
+ Optional mapping of dataset name to the raw data used to create each
444
+ Df2. When provided, intersection effectiveness and observation counts
445
+ are computed using these DataFrames.
402
446
 
403
447
  Returns
404
448
  -------
@@ -410,7 +454,8 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
410
454
 
411
455
  for name, df2 in df2_dict.items():
412
456
  df1 = mx.extract(df2)
413
- hypo = experiments_from_df2(df2, df1)
457
+ df_raw = data_dict.get(name) if data_dict else None
458
+ hypo = experiments_from_df2(df2, df1, df_raw, var_obj=getattr(mx, "var_obj", "target"))
414
459
 
415
460
  if not hypo.empty:
416
461
  hypo['dataset'] = name
@@ -427,6 +472,7 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
427
472
  'variables_a', 'variables_b', 'variables_intersection',
428
473
  'difficulty_a', 'difficulty_b', 'n_intersection',
429
474
  'n_only_a', 'n_only_b', 'intersection',
475
+ 'intersection_ef_sample', 'intersection_n_sample',
430
476
  'only_cluster_a', 'only_cluster_b', 'score']
431
477
  return pd.DataFrame(columns=cols)
432
478
 
@@ -44,7 +44,9 @@ class Models:
44
44
 
45
45
  X = df.drop(columns=[target_col]).values
46
46
  y = df.loc[:, target_col].values
47
- for k in range(1,int(len(df))):
47
+ fp = fn = 0
48
+ y_pred = None
49
+ for k in range(1, int(len(df))):
48
50
  try:
49
51
  knn = KNeighborsClassifier(n_neighbors=k)
50
52
  knn.fit(X, y)
@@ -55,17 +57,20 @@ class Models:
55
57
  break
56
58
  tn, fp, fn, tp = cm.ravel()
57
59
  if criterio_fp:
58
- if fp>min_obs:
60
+ if fp > min_obs:
59
61
  break
60
62
  else:
61
- if fn>min_obs:
63
+ if fn > min_obs:
62
64
  break
63
- if fn>0:
65
+ if y_pred is None:
66
+ return df.iloc[0:0], df
67
+ if fn > 0:
64
68
  false_negatives = (y == 1) & (y_pred == 0)
65
69
  return df[false_negatives], df[~false_negatives]
66
- if fp>0:
70
+ if fp > 0:
67
71
  false_positives = (y == 0) & (y_pred == 1)
68
72
  return df[false_positives], df[~false_positives]
73
+ return df.iloc[0:0], df
69
74
 
70
75
  def get_cvRF(self, X_train, y_train, param_grid):
71
76
  """Grid-search a RandomForest classifier.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: InsideForest
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: A comprehensive library for describing and analyzing data insights via AI
5
5
  Home-page: https://github.com/jcval94/InsideForest.git
6
6
  Author: Jose Carlos Del Valle
@@ -18,6 +18,7 @@ experiments/__init__.py
18
18
  experiments/benchmark.py
19
19
  experiments/benchmark_get_rangos.py
20
20
  experiments/rf_param_benchmark.py
21
+ experiments/select_clusters_hyperparam.py
21
22
  experiments/summary_benchmark.py
22
23
  tests/test_chimera_values_selector.py
23
24
  tests/test_cluster_selector.py
@@ -28,4 +29,6 @@ tests/test_inside_forest_fit_predict.py
28
29
  tests/test_inside_forest_params.py
29
30
  tests/test_inside_forest_regressor_fit_predict.py
30
31
  tests/test_iou_equivalence.py
32
+ tests/test_metadata_run_experiments.py
33
+ tests/test_models.py
31
34
  tests/test_trees.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: InsideForest
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: A comprehensive library for describing and analyzing data insights via AI
5
5
  Home-page: https://github.com/jcval94/InsideForest.git
6
6
  Author: Jose Carlos Del Valle
@@ -74,6 +74,21 @@ pred_labels = in_f.predict(X_rest) # cluster labels for the remaining data
74
74
  training_labels = in_f.labels_ # labels for the training subset
75
75
  ```
76
76
 
77
+ ### FAST presets and feature reduction
78
+
79
+ InsideForest can automatically pick faster training parameters and reduce
80
+ features based on dataset size:
81
+
82
+ ```python
83
+ in_f = InsideForestClassifier(auto_fast=True, auto_feature_reduce=True)
84
+ in_f.fit(X_train, y_train)
85
+ ```
86
+
87
+ Use `explicit_k_features` to fix the number of retained features and
88
+ `fast_overrides` to tweak the automatic presets. After fitting, the
89
+ attributes `_feature_mask_`, `feature_names_in_`, `feature_names_out_`,
90
+ `_size_bucket_`, and `_fast_params_used_` reveal the applied settings.
91
+
77
92
  You can control how final cluster labels are consolidated through the
78
93
  `method` parameter. Available strategies are:
79
94
 
@@ -221,12 +236,25 @@ Compares clusters A and B using the rules provided by a row from the experiments
221
236
  ## Experiments
222
237
 
223
238
  The `experiments/benchmark.py` module runs supervised clustering
224
- benchmarks on a medium sized dataset (`Digits`) and on a synthetically
225
- generated large dataset. It compares `InsideForest` with traditional
226
- baselines like KMeans and DBSCAN, reporting purity, macro F1-score and
227
- runtime for each method. It also performs a basic sensitivity analysis
228
- on key hyperparameters: `K` for KMeans and `eps`/`min_samples` for
229
- DBSCAN.
239
+ benchmarks on datasets such as `Digits`, `Iris` and `Wine`. It compares
240
+ `InsideForest` with traditional baselines like KMeans and DBSCAN,
241
+ reporting purity, macro F1-score, accuracy, information-theoretic
242
+ metrics and runtime. A basic sensitivity analysis is also provided for
243
+ key hyperparameters: `K` for KMeans and `eps`/`min_samples` for DBSCAN.
244
+
245
+ Recent results are summarized below:
246
+
247
+ | Dataset | Algorithm | Purity | Macro F1 | Accuracy | NMI | AMI | ARI | Bcubed F1 | Divergence | Time (s) |
248
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
249
+ | Digits | InsideForest | 0.783 | 0.362 | 0.261 | 0.501 | 0.339 | 0.169 | 0.218 | 0.789 | 39.570 |
250
+ | Digits | KMeans(k=10) | 0.673 | 0.620 | 0.666 | 0.672 | 0.669 | 0.531 | 0.633 | 0.711 | 0.047 |
251
+ | Digits | DBSCAN(eps=0.5,min=5) | 0.102 | 0.018 | 0.102 | 0.000 | 0.000 | 0.000 | 0.182 | 0.000 | 0.014 |
252
+ | Iris | InsideForest | 0.714 | 0.581 | 0.673 | 0.511 | 0.481 | 0.445 | 0.680 | 0.388 | 0.990 |
253
+ | Iris | KMeans(k=3) | 0.667 | 0.531 | 0.580 | 0.590 | 0.584 | 0.433 | 0.710 | 0.427 | 0.002 |
254
+ | Iris | DBSCAN(eps=0.5,min=5) | 0.680 | 0.674 | 0.680 | 0.511 | 0.505 | 0.442 | 0.651 | 0.402 | 0.002 |
255
+ | Wine | InsideForest | 0.810 | 0.511 | 0.422 | 0.398 | 0.285 | 0.248 | 0.484 | 0.495 | 3.308 |
256
+ | Wine | KMeans(k=3) | 0.966 | 0.967 | 0.966 | 0.876 | 0.875 | 0.897 | 0.937 | 0.628 | 0.004 |
257
+ | Wine | DBSCAN(eps=0.5,min=5) | 0.399 | 0.190 | 0.399 | 0.000 | 0.000 | 0.000 | 0.509 | 0.000 | 0.002 |
230
258
 
231
259
  Execute the script with:
232
260
 
@@ -254,6 +282,29 @@ os.environ["OPENAI_API_KEY"] = "sk-your-key"
254
282
  res = generate_descriptions(iris_conds, OPENAI_API_KEY=os.getenv("OPENAI_API_KEY"))
255
283
  ```
256
284
 
285
+ You can also interact with the OpenAI API directly:
286
+
287
+ ```python
288
+ from openai import OpenAI
289
+ import os
290
+
291
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
292
+ response = client.chat.completions.create(
293
+ model="gpt-4o-mini",
294
+ messages=[
295
+ {"role": "system", "content": "You are a helpful assistant."},
296
+ {
297
+ "role": "user",
298
+ "content": (
299
+ "Summarize: 4.3 <= sepal length (cm) <= 5.8 and "
300
+ "1.0 <= petal width (cm) <= 1.8"
301
+ ),
302
+ },
303
+ ],
304
+ )
305
+ print(response.choices[0].message.content)
306
+ ```
307
+
257
308
  ### `categorize_conditions`
258
309
 
259
310
  ```python
@@ -0,0 +1,86 @@
1
+ import itertools
2
+ import time
3
+ from typing import Dict, List
4
+
5
+ import pandas as pd
6
+ from sklearn.datasets import load_iris, load_wine
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
+ from InsideForest import InsideForestClassifier
11
+ from experiments.benchmark import _evaluate
12
+
13
+
14
def _prepare_data(loader):
    """Load a dataset, standardize its features and split it.

    ``loader`` is called with ``return_X_y=True``. The features are scaled
    with ``StandardScaler`` before a stratified split that puts 35% of the
    rows in the training part (``random_state=42``).

    Returns the ``train_test_split`` result for the scaled features and
    the labels.
    """
    features, labels = loader(return_X_y=True)
    scaled = StandardScaler().fit_transform(features)
    return train_test_split(
        scaled,
        labels,
        train_size=0.35,
        stratify=labels,
        random_state=42,
    )
18
+
19
+
20
def run_experiments() -> pd.DataFrame:
    """Grid-search ``select_clusters`` hyperparameters on Iris and Wine.

    For every dataset and every combination of ``divide``,
    ``leaf_percentile`` and ``low_leaf_fraction``, an
    ``InsideForestClassifier`` using ``method="select_clusters"`` is fitted
    on the training split and used to predict the test split. The elapsed
    fit-plus-predict time is passed to ``_evaluate`` together with the
    predictions.

    Returns
    -------
    pd.DataFrame
        One row per (dataset, parameter combination) holding the metrics
        from ``_evaluate(...).as_dict()`` plus the dataset name and the
        three hyperparameter values.
    """
    datasets = {
        "iris": load_iris,
        "wine": load_wine,
    }

    param_grid = {
        "divide": [3, 5, 7],
        "leaf_percentile": [85, 90, 95],
        "low_leaf_fraction": [0.01, 0.03, 0.05],
    }

    rows: List[Dict] = []

    for ds_name, loader in datasets.items():
        X_train, X_test, y_train, y_test = _prepare_data(loader)
        # Dict order is insertion order, so the tuple unpacks as
        # (divide, leaf_percentile, low_leaf_fraction).
        for divide, leaf, low_frac in itertools.product(*param_grid.values()):
            params = {
                "divide": divide,
                "leaf_percentile": leaf,
                "low_leaf_fraction": low_frac,
            }
            clf = InsideForestClassifier(
                method="select_clusters",
                get_detail=False,
                **params,
            )

            # perf_counter is monotonic and high resolution; the wall
            # clock can jump while a benchmark is running.
            start = time.perf_counter()
            clf.fit(X_train, y_train)
            preds = clf.predict(X_test)
            runtime = time.perf_counter() - start

            name = f"{ds_name}_div{divide}_leaf{leaf}_low{low_frac}"
            metrics = _evaluate(y_test, preds, runtime, name).as_dict()
            metrics.update({"dataset": ds_name, **params})
            rows.append(metrics)

    return pd.DataFrame(rows)
66
+
67
+
68
def main() -> None:
    """Run the hyperparameter sweep and print a fixed-column report."""
    results = run_experiments()
    # Identify the run first, then list the metrics to compare.
    run_columns = ["dataset", "divide", "leaf_percentile", "low_leaf_fraction"]
    metric_columns = ["purity", "macro_f1", "accuracy", "nmi", "bcubed_f1", "runtime"]
    report = results[run_columns + metric_columns]
    print(report.to_string(index=False))
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='InsideForest',
5
- version='0.3.2',
5
+ version='0.3.3',
6
6
  packages=find_packages(),
7
7
  license='MIT',
8
8
  license_files=['LICENSE'],
@@ -2,6 +2,7 @@ import os, sys
2
2
  sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
3
3
 
4
4
  import pandas as pd
5
+ import pytest
5
6
  from InsideForest.cluster_selector import select_clusters
6
7
 
7
8
 
@@ -23,3 +24,18 @@ def test_fallback_cluster_assignment():
23
24
  assert clusters[1] == 99
24
25
  assert clusters_all[1] == [99]
25
26
  assert ponderadores_all[1] == [0.0]
27
+
28
+
29
def test_missing_column_in_rule_raises_error():
    """A rule on a column absent from the data raises a KeyError mentioning it."""
    data = pd.DataFrame({'x': [0.5]})
    rule_columns = pd.MultiIndex.from_tuples(
        [('linf', 'y'), ('lsup', 'y'), ('metrics', 'ponderador')]
    )
    # Single rule bounding 'y', a column the data does not have.
    rules = pd.DataFrame([[0.0, 1.0, 1.0]], columns=rule_columns)
    rules['cluster'] = [1.0]

    with pytest.raises(KeyError) as excinfo:
        select_clusters(data, rules)
    assert 'y' in str(excinfo.value)
@@ -9,6 +9,7 @@ from InsideForest.descrip import (
9
9
  _scale_clusters,
10
10
  _compute_inflection_points,
11
11
  _merge_outputs,
12
+ _list_rules_to_text,
12
13
  )
13
14
 
14
15
 
@@ -87,3 +88,8 @@ def test_merge_outputs():
87
88
  }
88
89
  assert expected_cols.issubset(final_df.columns)
89
90
  assert "cluster_ponderador" not in final_df.columns
91
+
92
+
93
def test_list_rules_to_text_empty_rule_set_returns_placeholder():
    """An empty rule list is rendered as the em-dash placeholder."""
    rendered = _list_rules_to_text([], pd.DataFrame(), lang="en")
    assert rendered == "—"
@@ -49,6 +49,22 @@ def test_fit_accepts_df_with_target_column():
49
49
  assert model.labels_.shape[0] == len(df)
50
50
 
51
51
 
52
def test_fit_with_y_and_df_includes_target_column():
    """Fitting with explicit ``y`` drops a target column left inside ``X``."""
    frame = pd.DataFrame(
        {
            'feat1': [0, 1, 2, 3],
            'feat2': [3, 2, 1, 0],
            'target': [0, 1, 0, 1],
        }
    )
    target = frame['target'].to_numpy()

    model = InsideForestClassifier(rf_params={'n_estimators': 5, 'random_state': 0})
    model.fit(X=frame, y=target)

    # The target must not be learned as a feature ...
    assert 'target' not in model.feature_names_
    # ... so prediction works on the feature columns alone.
    preds = model.predict(frame[['feat1', 'feat2']])
    assert preds.shape == (4,)
66
+
67
+
52
68
  def test_fit_df_missing_target_raises():
53
69
  df = pd.DataFrame(data={'feat1': [0, 1], 'feat2': [1, 0]})
54
70
  model = InsideForestClassifier()
@@ -0,0 +1,40 @@
1
+ import pandas as pd
2
+ import pytest
3
+
4
+ from InsideForest.metadata import MetaExtractor, run_experiments
5
+
6
+
7
def test_run_experiments_includes_intersection_stats():
    """``run_experiments`` reports size and effectiveness of the shared-rule subset."""
    # Raw data: ten rows, the target is 1 from the fifth row onwards.
    values = list(range(1, 11))
    raw = pd.DataFrame({
        'x': values,
        'y': values,
        'target': [0] * 4 + [1] * 6,
    })

    # Two clusters sharing the rule on x and differing on y.
    descriptions = pd.DataFrame({
        'cluster': [0, 1],
        'cluster_descripcion': [
            '0 <= x <= 5 AND 0 <= y <= 5',
            '0 <= x <= 5 AND 5 <= y <= 10',
        ],
        'cluster_ef_sample': [0.2, 1.0],
        'cluster_n_sample': [5, 1],
    })

    # Minimal metadata for the two variables.
    meta_df = pd.DataFrame(
        {
            'actionability.increase_difficulty': [1, 1],
            'actionability.decrease_difficulty': [1, 1],
        },
        index=['x', 'y'],
    )
    extractor = MetaExtractor(meta_df, var_obj='target')

    result = run_experiments(extractor, {'ds': descriptions}, data_dict={'ds': raw})

    for column in ('intersection_n_sample', 'intersection_ef_sample'):
        assert column in result.columns
    first = result.iloc[0]
    assert first['intersection_n_sample'] == 5
    assert first['intersection_ef_sample'] == pytest.approx(0.2)
@@ -0,0 +1,30 @@
1
+ import pandas as pd
2
+ from InsideForest.models import Models
3
+
4
+
5
def test_get_knn_rows_success():
    """With ``min_obs=0`` some rows are flagged and the two parts partition the data."""
    data = pd.DataFrame({
        'feature': [0, 1, 2, 3, 4, 5],
        'target': [0, 0, 0, 1, 1, 1],
    })
    flagged, remainder = Models().get_knn_rows(
        data, 'target', criterio_fp=False, min_obs=0
    )
    assert not flagged.empty
    assert len(flagged) + len(remainder) == len(data)
    assert remainder.equals(data.drop(flagged.index))
13
+
14
+
15
def test_get_knn_rows_no_misclassification():
    """With ``min_obs=10`` nothing is flagged and the data comes back whole."""
    data = pd.DataFrame({
        'feature': [0, 1, 2, 3, 4, 5],
        'target': [0, 0, 0, 1, 1, 1],
    })
    flagged, remainder = Models().get_knn_rows(data, 'target', min_obs=10)
    assert flagged.empty
    assert remainder.equals(data)
22
+
23
+
24
def test_get_knn_rows_training_error():
    """Non-numeric features yield an empty flagged set and the untouched data."""
    data = pd.DataFrame({
        'feature': ['a', 'b', 'c'],
        'target': [0, 1, 0],
    })
    flagged, remainder = Models().get_knn_rows(data, 'target')
    assert flagged.empty
    assert remainder.equals(data)
File without changes
File without changes