InsideForest 0.3.2__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/cluster_selector.py +3 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/inside_forest.py +215 -1
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/labels.py +6 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/metadata.py +51 -5
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/models.py +10 -5
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/PKG-INFO +1 -1
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/SOURCES.txt +3 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/PKG-INFO +1 -1
- {insideforest-0.3.2 → insideforest-0.3.3}/README.md +57 -6
- insideforest-0.3.3/experiments/select_clusters_hyperparam.py +86 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/setup.py +1 -1
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_cluster_selector.py +16 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip_helpers.py +6 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_fit_predict.py +16 -0
- insideforest-0.3.3/tests/test_metadata_run_experiments.py +40 -0
- insideforest-0.3.3/tests/test_models.py +30 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/__init__.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/descrip.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/regions.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/trees.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/dependency_links.txt +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/top_level.txt +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/LICENSE +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/experiments/__init__.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark_get_rangos.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/experiments/rf_param_benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/experiments/summary_benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/setup.cfg +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_chimera_values_selector.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_eps_search_perf.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_params.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_regressor_fit_predict.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_iou_equivalence.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.3}/tests/test_trees.py +0 -0
|
@@ -86,6 +86,9 @@ def select_clusters(
|
|
|
86
86
|
ponderador = regla['ponderador']
|
|
87
87
|
cluster = regla['cluster']
|
|
88
88
|
|
|
89
|
+
missing_cols = [col for col in variables if col not in df_datos.columns]
|
|
90
|
+
if missing_cols:
|
|
91
|
+
raise KeyError(f"Columns not found in df_datos: {missing_cols}")
|
|
89
92
|
X_datos = df_datos[variables]
|
|
90
93
|
condiciones = [
|
|
91
94
|
(X_datos[var].to_numpy() >= linf[var]) & (X_datos[var].to_numpy() <= lsup[var])
|
|
@@ -1,14 +1,93 @@
|
|
|
1
|
-
import pandas as pd
|
|
1
|
+
from typing import Optional, Dict, Any, List, Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
2
4
|
import joblib
|
|
5
|
+
try:
|
|
6
|
+
import pandas as pd
|
|
7
|
+
_HAS_PANDAS = True
|
|
8
|
+
except Exception:
|
|
9
|
+
_HAS_PANDAS = False
|
|
3
10
|
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
|
4
11
|
from sklearn.exceptions import NotFittedError
|
|
5
12
|
from sklearn.utils.validation import check_is_fitted
|
|
13
|
+
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
|
|
14
|
+
from sklearn.utils.multiclass import type_of_target
|
|
6
15
|
|
|
7
16
|
from .trees import Trees
|
|
8
17
|
from .regions import Regions
|
|
9
18
|
from .descrip import get_frontiers
|
|
10
19
|
|
|
11
20
|
|
|
21
|
+
# ---------- FAST helpers ----------
|
|
22
|
+
def _size_bucket(n: int, d: int) -> str:
|
|
23
|
+
prod = n * d
|
|
24
|
+
if prod <= 50_000:
|
|
25
|
+
return "small"
|
|
26
|
+
elif prod <= 200_000:
|
|
27
|
+
return "medium"
|
|
28
|
+
elif prod <= 1_000_000:
|
|
29
|
+
return "large"
|
|
30
|
+
else:
|
|
31
|
+
return "huge"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _choose_k_features(n: int, d: int) -> int:
|
|
35
|
+
bucket = _size_bucket(n, d)
|
|
36
|
+
if bucket == "small":
|
|
37
|
+
k = min(d, 64)
|
|
38
|
+
elif bucket == "medium":
|
|
39
|
+
k = min(d, 48)
|
|
40
|
+
elif bucket == "large":
|
|
41
|
+
k = min(d, 32)
|
|
42
|
+
else:
|
|
43
|
+
k = min(d, 24)
|
|
44
|
+
return max(8, k)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _choose_fast_params(n: int, d: int) -> Dict[str, Any]:
|
|
48
|
+
bucket = _size_bucket(n, d)
|
|
49
|
+
if bucket == "small":
|
|
50
|
+
rf_params = dict(n_estimators=80, max_depth=12, min_samples_leaf=3, n_jobs=-1, random_state=42)
|
|
51
|
+
tree_params = dict(percentil=98, low_frac=0.02)
|
|
52
|
+
divide = 3
|
|
53
|
+
method = "menu"
|
|
54
|
+
elif bucket == "medium":
|
|
55
|
+
rf_params = dict(n_estimators=60, max_depth=10, min_samples_leaf=5, n_jobs=-1, random_state=42)
|
|
56
|
+
tree_params = dict(percentil=99, low_frac=0.01)
|
|
57
|
+
divide = 3
|
|
58
|
+
method = "menu"
|
|
59
|
+
elif bucket == "large":
|
|
60
|
+
rf_params = dict(n_estimators=40, max_depth=9, min_samples_leaf=8, n_jobs=-1, random_state=42)
|
|
61
|
+
tree_params = dict(percentil=99.5, low_frac=0.0075)
|
|
62
|
+
divide = 3
|
|
63
|
+
method = "menu"
|
|
64
|
+
else:
|
|
65
|
+
rf_params = dict(n_estimators=30, max_depth=8, min_samples_leaf=10, n_jobs=-1, random_state=42)
|
|
66
|
+
tree_params = dict(percentil=99.7, low_frac=0.005)
|
|
67
|
+
divide = 3
|
|
68
|
+
method = "menu"
|
|
69
|
+
|
|
70
|
+
return dict(
|
|
71
|
+
rf_params=rf_params,
|
|
72
|
+
tree_params=tree_params,
|
|
73
|
+
divide=divide,
|
|
74
|
+
method=method,
|
|
75
|
+
get_detail=False,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _merge_dicts(base: Dict[str, Any], override: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
|
80
|
+
if not override:
|
|
81
|
+
return base
|
|
82
|
+
out = dict(base)
|
|
83
|
+
for k, v in override.items():
|
|
84
|
+
if isinstance(v, dict) and isinstance(out.get(k), dict):
|
|
85
|
+
out[k] = {**out[k], **v}
|
|
86
|
+
else:
|
|
87
|
+
out[k] = v
|
|
88
|
+
return out
|
|
89
|
+
|
|
90
|
+
|
|
12
91
|
class _BaseInsideForest:
|
|
13
92
|
"""Internal base class handling shared ``fit`` and ``predict`` logic.
|
|
14
93
|
|
|
@@ -67,7 +146,12 @@ class _BaseInsideForest:
|
|
|
67
146
|
get_detail=False,
|
|
68
147
|
leaf_percentile=95,
|
|
69
148
|
low_leaf_fraction=0.05,
|
|
149
|
+
auto_fast=False,
|
|
150
|
+
auto_feature_reduce=False,
|
|
151
|
+
explicit_k_features: Optional[int] = None,
|
|
152
|
+
fast_overrides: Optional[Dict[str, Any]] = None,
|
|
70
153
|
):
|
|
154
|
+
self.rf_cls = rf_cls
|
|
71
155
|
self.rf_params = rf_params or {}
|
|
72
156
|
self.tree_params = tree_params or {}
|
|
73
157
|
self.var_obj = var_obj
|
|
@@ -79,6 +163,19 @@ class _BaseInsideForest:
|
|
|
79
163
|
self.leaf_percentile = leaf_percentile
|
|
80
164
|
self.low_leaf_fraction = low_leaf_fraction
|
|
81
165
|
|
|
166
|
+
# FAST knobs
|
|
167
|
+
self.auto_fast = auto_fast
|
|
168
|
+
self.auto_feature_reduce = auto_feature_reduce
|
|
169
|
+
self.explicit_k_features = explicit_k_features
|
|
170
|
+
self.fast_overrides = fast_overrides or {}
|
|
171
|
+
|
|
172
|
+
# FAST bookkeeping
|
|
173
|
+
self._feature_mask_: Optional[np.ndarray] = None
|
|
174
|
+
self.feature_names_in_: Optional[List[str]] = None
|
|
175
|
+
self.feature_names_out_: Optional[List[str]] = None
|
|
176
|
+
self._size_bucket_: Optional[str] = None
|
|
177
|
+
self._fast_params_used_: Optional[Dict[str, Any]] = None
|
|
178
|
+
|
|
82
179
|
# Ensure tree parameters include the percentile settings
|
|
83
180
|
self.tree_params.setdefault("percentil", leaf_percentile)
|
|
84
181
|
self.tree_params.setdefault("low_frac", low_leaf_fraction)
|
|
@@ -121,6 +218,10 @@ class _BaseInsideForest:
|
|
|
121
218
|
"get_detail": self.get_detail,
|
|
122
219
|
"leaf_percentile": self.leaf_percentile,
|
|
123
220
|
"low_leaf_fraction": self.low_leaf_fraction,
|
|
221
|
+
"auto_fast": self.auto_fast,
|
|
222
|
+
"auto_feature_reduce": self.auto_feature_reduce,
|
|
223
|
+
"explicit_k_features": self.explicit_k_features,
|
|
224
|
+
"fast_overrides": self.fast_overrides,
|
|
124
225
|
}
|
|
125
226
|
|
|
126
227
|
def set_params(self, **params):
|
|
@@ -164,6 +265,10 @@ class _BaseInsideForest:
|
|
|
164
265
|
"get_detail",
|
|
165
266
|
"leaf_percentile",
|
|
166
267
|
"low_leaf_fraction",
|
|
268
|
+
"auto_fast",
|
|
269
|
+
"auto_feature_reduce",
|
|
270
|
+
"explicit_k_features",
|
|
271
|
+
"fast_overrides",
|
|
167
272
|
}:
|
|
168
273
|
setattr(self, key, value)
|
|
169
274
|
if key == "leaf_percentile":
|
|
@@ -177,6 +282,59 @@ class _BaseInsideForest:
|
|
|
177
282
|
|
|
178
283
|
return self
|
|
179
284
|
|
|
285
|
+
def _maybe_reduce_features(self, X, y=None):
|
|
286
|
+
"""Optionally reduce features; preserve original column names if DataFrame."""
|
|
287
|
+
if not self.auto_feature_reduce:
|
|
288
|
+
if _HAS_PANDAS and isinstance(X, pd.DataFrame):
|
|
289
|
+
self.feature_names_in_ = list(X.columns)
|
|
290
|
+
self.feature_names_out_ = list(X.columns)
|
|
291
|
+
else:
|
|
292
|
+
self.feature_names_in_ = None
|
|
293
|
+
self.feature_names_out_ = None
|
|
294
|
+
self._feature_mask_ = None
|
|
295
|
+
return X
|
|
296
|
+
|
|
297
|
+
n, d = X.shape
|
|
298
|
+
k = (
|
|
299
|
+
self.explicit_k_features
|
|
300
|
+
if self.explicit_k_features is not None
|
|
301
|
+
else _choose_k_features(n, d)
|
|
302
|
+
)
|
|
303
|
+
k = min(k, d)
|
|
304
|
+
|
|
305
|
+
is_df = _HAS_PANDAS and isinstance(X, pd.DataFrame)
|
|
306
|
+
self.feature_names_in_ = list(X.columns) if is_df else None
|
|
307
|
+
X_arr = X.values if is_df else np.asarray(X)
|
|
308
|
+
|
|
309
|
+
support = None
|
|
310
|
+
if y is not None:
|
|
311
|
+
try:
|
|
312
|
+
ytype = type_of_target(y)
|
|
313
|
+
except Exception:
|
|
314
|
+
ytype = None
|
|
315
|
+
if ytype in {"binary", "multiclass"}:
|
|
316
|
+
sel = SelectKBest(mutual_info_classif, k=k).fit(X_arr, y)
|
|
317
|
+
support = sel.get_support()
|
|
318
|
+
elif ytype in {"continuous", "continuous-multioutput"}:
|
|
319
|
+
sel = SelectKBest(mutual_info_regression, k=k).fit(X_arr, y)
|
|
320
|
+
support = sel.get_support()
|
|
321
|
+
if support is None:
|
|
322
|
+
variances = X_arr.var(axis=0)
|
|
323
|
+
idx_sorted = np.argsort(-variances)[:k]
|
|
324
|
+
support = np.zeros(X_arr.shape[1], dtype=bool)
|
|
325
|
+
support[idx_sorted] = True
|
|
326
|
+
|
|
327
|
+
self._feature_mask_ = support
|
|
328
|
+
|
|
329
|
+
if is_df:
|
|
330
|
+
cols = np.array(self.feature_names_in_)
|
|
331
|
+
keep_cols = cols[support].tolist()
|
|
332
|
+
self.feature_names_out_ = keep_cols
|
|
333
|
+
return X[keep_cols]
|
|
334
|
+
else:
|
|
335
|
+
self.feature_names_out_ = None
|
|
336
|
+
return X_arr[:, support]
|
|
337
|
+
|
|
180
338
|
def fit(self, X, y=None, rf=None):
|
|
181
339
|
"""Fit the internal random forest and compute cluster labels.
|
|
182
340
|
|
|
@@ -220,13 +378,53 @@ class _BaseInsideForest:
|
|
|
220
378
|
else:
|
|
221
379
|
if isinstance(X, pd.DataFrame):
|
|
222
380
|
X_df = X.copy()
|
|
381
|
+
if self.var_obj in X_df.columns:
|
|
382
|
+
X_df = X_df.drop(columns=[self.var_obj])
|
|
223
383
|
else:
|
|
224
384
|
X_df = pd.DataFrame(data=X)
|
|
225
385
|
|
|
226
386
|
# Replace spaces with underscores to keep compatibility with Trees
|
|
227
387
|
X_df.columns = [str(c).replace(" ", "_") for c in X_df.columns]
|
|
388
|
+
|
|
389
|
+
# 0) Feature reduction (optional)
|
|
390
|
+
Xr = self._maybe_reduce_features(X_df, y)
|
|
391
|
+
if _HAS_PANDAS and isinstance(Xr, pd.DataFrame):
|
|
392
|
+
X_df = Xr
|
|
393
|
+
else:
|
|
394
|
+
X_df = pd.DataFrame(Xr)
|
|
228
395
|
self.feature_names_ = list(X_df.columns)
|
|
229
396
|
|
|
397
|
+
# 1) Fast preset (optional)
|
|
398
|
+
if self.auto_fast:
|
|
399
|
+
n, d = X_df.shape
|
|
400
|
+
auto = _choose_fast_params(n, d)
|
|
401
|
+
combined = dict(auto)
|
|
402
|
+
|
|
403
|
+
if isinstance(self.rf_params, dict):
|
|
404
|
+
combined["rf_params"] = {**auto["rf_params"], **self.rf_params}
|
|
405
|
+
if isinstance(self.tree_params, dict):
|
|
406
|
+
combined["tree_params"] = {**auto["tree_params"], **self.tree_params}
|
|
407
|
+
if hasattr(self, "divide"):
|
|
408
|
+
combined["divide"] = getattr(self, "divide", auto["divide"])
|
|
409
|
+
if hasattr(self, "method"):
|
|
410
|
+
combined["method"] = "menu" if y is not None else getattr(self, "method", auto["method"])
|
|
411
|
+
if hasattr(self, "get_detail"):
|
|
412
|
+
combined["get_detail"] = getattr(self, "get_detail", auto["get_detail"])
|
|
413
|
+
|
|
414
|
+
combined = _merge_dicts(combined, self.fast_overrides)
|
|
415
|
+
|
|
416
|
+
self._fast_params_used_ = combined
|
|
417
|
+
self._size_bucket_ = _size_bucket(n, d)
|
|
418
|
+
|
|
419
|
+
self.rf_params = combined.get("rf_params", self.rf_params)
|
|
420
|
+
self.tree_params = combined.get("tree_params", self.tree_params)
|
|
421
|
+
self.divide = combined.get("divide", self.divide)
|
|
422
|
+
self.method = combined.get("method", self.method)
|
|
423
|
+
self.get_detail = combined.get("get_detail", self.get_detail)
|
|
424
|
+
|
|
425
|
+
self.rf = self.rf_cls(**self.rf_params)
|
|
426
|
+
self.trees = Trees(**self.tree_params)
|
|
427
|
+
|
|
230
428
|
# Allow passing a custom random forest estimator
|
|
231
429
|
if rf is not None:
|
|
232
430
|
self.rf = rf
|
|
@@ -518,6 +716,10 @@ class InsideForestClassifier(_BaseInsideForest):
|
|
|
518
716
|
get_detail=False,
|
|
519
717
|
leaf_percentile=95,
|
|
520
718
|
low_leaf_fraction=0.05,
|
|
719
|
+
auto_fast=False,
|
|
720
|
+
auto_feature_reduce=False,
|
|
721
|
+
explicit_k_features: Optional[int] = None,
|
|
722
|
+
fast_overrides: Optional[Dict[str, Any]] = None,
|
|
521
723
|
):
|
|
522
724
|
super().__init__(
|
|
523
725
|
RandomForestClassifier,
|
|
@@ -531,6 +733,10 @@ class InsideForestClassifier(_BaseInsideForest):
|
|
|
531
733
|
get_detail=get_detail,
|
|
532
734
|
leaf_percentile=leaf_percentile,
|
|
533
735
|
low_leaf_fraction=low_leaf_fraction,
|
|
736
|
+
auto_fast=auto_fast,
|
|
737
|
+
auto_feature_reduce=auto_feature_reduce,
|
|
738
|
+
explicit_k_features=explicit_k_features,
|
|
739
|
+
fast_overrides=fast_overrides,
|
|
534
740
|
)
|
|
535
741
|
|
|
536
742
|
|
|
@@ -550,6 +756,10 @@ class InsideForestRegressor(_BaseInsideForest):
|
|
|
550
756
|
get_detail=False,
|
|
551
757
|
leaf_percentile=95,
|
|
552
758
|
low_leaf_fraction=0.05,
|
|
759
|
+
auto_fast=False,
|
|
760
|
+
auto_feature_reduce=False,
|
|
761
|
+
explicit_k_features: Optional[int] = None,
|
|
762
|
+
fast_overrides: Optional[Dict[str, Any]] = None,
|
|
553
763
|
):
|
|
554
764
|
super().__init__(
|
|
555
765
|
RandomForestRegressor,
|
|
@@ -563,6 +773,10 @@ class InsideForestRegressor(_BaseInsideForest):
|
|
|
563
773
|
get_detail=get_detail,
|
|
564
774
|
leaf_percentile=leaf_percentile,
|
|
565
775
|
low_leaf_fraction=low_leaf_fraction,
|
|
776
|
+
auto_fast=auto_fast,
|
|
777
|
+
auto_feature_reduce=auto_feature_reduce,
|
|
778
|
+
explicit_k_features=explicit_k_features,
|
|
779
|
+
fast_overrides=fast_overrides,
|
|
566
780
|
)
|
|
567
781
|
|
|
568
782
|
|
|
@@ -129,6 +129,12 @@ class Labels:
|
|
|
129
129
|
upper_bounds = sub_df.loc[row_index, 'lsup'].copy()
|
|
130
130
|
variables = list(upper_bounds.index)
|
|
131
131
|
|
|
132
|
+
# Early exit when there are no variables to filter on.
|
|
133
|
+
# Returning an empty DataFrame prevents index errors when constructing
|
|
134
|
+
# boolean conditions on an empty list of variables.
|
|
135
|
+
if len(variables) == 0:
|
|
136
|
+
return df.iloc[0:0]
|
|
137
|
+
|
|
132
138
|
conditions = [
|
|
133
139
|
(df[var] <= upper_bounds[var]) & (df[var] > lower_bounds[var])
|
|
134
140
|
for var in variables
|
|
@@ -297,7 +297,12 @@ def conditions_to_tokens(conds: list[str]) -> set[str]:
|
|
|
297
297
|
# 2. GENERADOR DE EXPERIMENTOS PARA UN SOLO Df2
|
|
298
298
|
# ------------------------------------------------------------------ #
|
|
299
299
|
|
|
300
|
-
def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
|
|
300
|
+
def experiments_from_df2(
|
|
301
|
+
df2: pd.DataFrame,
|
|
302
|
+
meta: pd.DataFrame,
|
|
303
|
+
df_data: pd.DataFrame | None = None,
|
|
304
|
+
var_obj: str = "target",
|
|
305
|
+
) -> pd.DataFrame:
|
|
301
306
|
"""Generate pairwise cluster comparisons for a single Df2.
|
|
302
307
|
|
|
303
308
|
Parameters
|
|
@@ -307,13 +312,20 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
|
|
|
307
312
|
``cluster_ef_sample`` and ``cluster_n_sample`` columns.
|
|
308
313
|
meta : pd.DataFrame
|
|
309
314
|
Metadata indexed by ``rule_token`` providing actionability metrics.
|
|
315
|
+
df_data : pd.DataFrame, optional
|
|
316
|
+
Raw dataset used to compute cluster descriptions. If provided, the
|
|
317
|
+
effectiveness and number of observations for the intersection between
|
|
318
|
+
clusters is computed by applying the shared rules to ``df_data``.
|
|
319
|
+
var_obj : str, default "target"
|
|
320
|
+
Name of the target column inside ``df_data`` used to estimate
|
|
321
|
+
effectiveness.
|
|
310
322
|
|
|
311
323
|
Returns
|
|
312
324
|
-------
|
|
313
325
|
pd.DataFrame
|
|
314
326
|
Each row contains the comparison between two clusters along with the
|
|
315
|
-
exclusive variables and a score penalizing difficult actions and
|
|
316
|
-
rewarding overlap.
|
|
327
|
+
exclusive variables, intersection statistics and a score penalizing
|
|
328
|
+
difficult actions and rewarding overlap.
|
|
317
329
|
"""
|
|
318
330
|
# --- action table -----------------------------
|
|
319
331
|
meta_idx = meta.set_index('rule_token')
|
|
@@ -328,6 +340,28 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
|
|
|
328
340
|
only_a = sorted(conds_a - conds_b)
|
|
329
341
|
only_b = sorted(conds_b - conds_a)
|
|
330
342
|
|
|
343
|
+
# -------------------- intersection stats -----------------------
|
|
344
|
+
inter_ef = None
|
|
345
|
+
inter_n = None
|
|
346
|
+
if df_data is not None and inters:
|
|
347
|
+
def _apply_conditions(data, conds):
|
|
348
|
+
for cond in conds:
|
|
349
|
+
match = re.match(
|
|
350
|
+
r"\s*([-\d\.eE]+)\s*<=\s*([A-Za-z_][A-Za-z0-9_]*)\s*<=\s*([-\d\.eE]+)",
|
|
351
|
+
str(cond),
|
|
352
|
+
)
|
|
353
|
+
if match and match.group(2) in data.columns:
|
|
354
|
+
low = float(match.group(1))
|
|
355
|
+
high = float(match.group(3))
|
|
356
|
+
var = match.group(2)
|
|
357
|
+
data = data[(data[var] >= low) & (data[var] <= high)]
|
|
358
|
+
return data
|
|
359
|
+
|
|
360
|
+
df_inter = _apply_conditions(df_data.copy(), inters)
|
|
361
|
+
inter_n = int(df_inter.shape[0])
|
|
362
|
+
if var_obj in df_inter.columns and inter_n > 0:
|
|
363
|
+
inter_ef = float(df_inter[var_obj].mean())
|
|
364
|
+
|
|
331
365
|
# Determine which cluster has lower effectiveness
|
|
332
366
|
delta_ef = row_a['cluster_ef_sample'] - row_b['cluster_ef_sample']
|
|
333
367
|
row_a_subset, row_b_subset = (row_a, row_b) if delta_ef < 0 else (row_b, row_a)
|
|
@@ -378,6 +412,8 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
|
|
|
378
412
|
'n_only_a' : n_only_a,
|
|
379
413
|
'n_only_b' : n_only_b,
|
|
380
414
|
'intersection' : inters,
|
|
415
|
+
'intersection_ef_sample': inter_ef,
|
|
416
|
+
'intersection_n_sample' : inter_n,
|
|
381
417
|
'only_cluster_a' : only_subset_a,
|
|
382
418
|
'only_cluster_b' : only_subset_b,
|
|
383
419
|
'score' : score,
|
|
@@ -390,7 +426,11 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
|
|
|
390
426
|
# ------------------------------------------------------------------ #
|
|
391
427
|
# 3. PIPELINE GENERAL PARA «n» Df2
|
|
392
428
|
# ------------------------------------------------------------------ #
|
|
393
|
-
def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
|
429
|
+
def run_experiments(
|
|
430
|
+
mx,
|
|
431
|
+
df2_dict: dict[str, pd.DataFrame],
|
|
432
|
+
data_dict: dict[str, pd.DataFrame] | None = None,
|
|
433
|
+
) -> pd.DataFrame:
|
|
394
434
|
"""Generate and consolidate hypotheses for multiple datasets.
|
|
395
435
|
|
|
396
436
|
Parameters
|
|
@@ -399,6 +439,10 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
|
|
399
439
|
Instance used to extract metadata from ``cluster_descripcion`` fields.
|
|
400
440
|
df2_dict : dict[str, pd.DataFrame]
|
|
401
441
|
Mapping of dataset name to its corresponding Df2 table.
|
|
442
|
+
data_dict : dict[str, pd.DataFrame], optional
|
|
443
|
+
Optional mapping of dataset name to the raw data used to create each
|
|
444
|
+
Df2. When provided, intersection effectiveness and observation counts
|
|
445
|
+
are computed using these DataFrames.
|
|
402
446
|
|
|
403
447
|
Returns
|
|
404
448
|
-------
|
|
@@ -410,7 +454,8 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
|
|
410
454
|
|
|
411
455
|
for name, df2 in df2_dict.items():
|
|
412
456
|
df1 = mx.extract(df2)
|
|
413
|
-
|
|
457
|
+
df_raw = data_dict.get(name) if data_dict else None
|
|
458
|
+
hypo = experiments_from_df2(df2, df1, df_raw, var_obj=getattr(mx, "var_obj", "target"))
|
|
414
459
|
|
|
415
460
|
if not hypo.empty:
|
|
416
461
|
hypo['dataset'] = name
|
|
@@ -427,6 +472,7 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
|
|
427
472
|
'variables_a', 'variables_b', 'variables_intersection',
|
|
428
473
|
'difficulty_a', 'difficulty_b', 'n_intersection',
|
|
429
474
|
'n_only_a', 'n_only_b', 'intersection',
|
|
475
|
+
'intersection_ef_sample', 'intersection_n_sample',
|
|
430
476
|
'only_cluster_a', 'only_cluster_b', 'score']
|
|
431
477
|
return pd.DataFrame(columns=cols)
|
|
432
478
|
|
|
@@ -44,7 +44,9 @@ class Models:
|
|
|
44
44
|
|
|
45
45
|
X = df.drop(columns=[target_col]).values
|
|
46
46
|
y = df.loc[:, target_col].values
|
|
47
|
-
|
|
47
|
+
fp = fn = 0
|
|
48
|
+
y_pred = None
|
|
49
|
+
for k in range(1, int(len(df))):
|
|
48
50
|
try:
|
|
49
51
|
knn = KNeighborsClassifier(n_neighbors=k)
|
|
50
52
|
knn.fit(X, y)
|
|
@@ -55,17 +57,20 @@ class Models:
|
|
|
55
57
|
break
|
|
56
58
|
tn, fp, fn, tp = cm.ravel()
|
|
57
59
|
if criterio_fp:
|
|
58
|
-
if fp>min_obs:
|
|
60
|
+
if fp > min_obs:
|
|
59
61
|
break
|
|
60
62
|
else:
|
|
61
|
-
if fn>min_obs:
|
|
63
|
+
if fn > min_obs:
|
|
62
64
|
break
|
|
63
|
-
if fn>0:
|
|
65
|
+
if y_pred is None:
|
|
66
|
+
return df.iloc[0:0], df
|
|
67
|
+
if fn > 0:
|
|
64
68
|
false_negatives = (y == 1) & (y_pred == 0)
|
|
65
69
|
return df[false_negatives], df[~false_negatives]
|
|
66
|
-
if fp>0:
|
|
70
|
+
if fp > 0:
|
|
67
71
|
false_positives = (y == 0) & (y_pred == 1)
|
|
68
72
|
return df[false_positives], df[~false_positives]
|
|
73
|
+
return df.iloc[0:0], df
|
|
69
74
|
|
|
70
75
|
def get_cvRF(self, X_train, y_train, param_grid):
|
|
71
76
|
"""Grid-search a RandomForest classifier.
|
|
@@ -18,6 +18,7 @@ experiments/__init__.py
|
|
|
18
18
|
experiments/benchmark.py
|
|
19
19
|
experiments/benchmark_get_rangos.py
|
|
20
20
|
experiments/rf_param_benchmark.py
|
|
21
|
+
experiments/select_clusters_hyperparam.py
|
|
21
22
|
experiments/summary_benchmark.py
|
|
22
23
|
tests/test_chimera_values_selector.py
|
|
23
24
|
tests/test_cluster_selector.py
|
|
@@ -28,4 +29,6 @@ tests/test_inside_forest_fit_predict.py
|
|
|
28
29
|
tests/test_inside_forest_params.py
|
|
29
30
|
tests/test_inside_forest_regressor_fit_predict.py
|
|
30
31
|
tests/test_iou_equivalence.py
|
|
32
|
+
tests/test_metadata_run_experiments.py
|
|
33
|
+
tests/test_models.py
|
|
31
34
|
tests/test_trees.py
|
|
@@ -74,6 +74,21 @@ pred_labels = in_f.predict(X_rest) # cluster labels for the remaining data
|
|
|
74
74
|
training_labels = in_f.labels_ # labels for the training subset
|
|
75
75
|
```
|
|
76
76
|
|
|
77
|
+
### FAST presets and feature reduction
|
|
78
|
+
|
|
79
|
+
InsideForest can automatically pick faster training parameters and reduce
|
|
80
|
+
features based on dataset size:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
in_f = InsideForestClassifier(auto_fast=True, auto_feature_reduce=True)
|
|
84
|
+
in_f.fit(X_train, y_train)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Use `explicit_k_features` to fix the number of retained features and
|
|
88
|
+
`fast_overrides` to tweak the automatic presets. After fitting, the
|
|
89
|
+
attributes `_feature_mask_`, `feature_names_in_`, `feature_names_out_`,
|
|
90
|
+
`_size_bucket_`, and `_fast_params_used_` reveal the applied settings.
|
|
91
|
+
|
|
77
92
|
You can control how final cluster labels are consolidated through the
|
|
78
93
|
`method` parameter. Available strategies are:
|
|
79
94
|
|
|
@@ -221,12 +236,25 @@ Compares clusters A and B using the rules provided by a row from the experiments
|
|
|
221
236
|
## Experiments
|
|
222
237
|
|
|
223
238
|
The `experiments/benchmark.py` module runs supervised clustering
|
|
224
|
-
benchmarks on
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
239
|
+
benchmarks on datasets such as `Digits`, `Iris` and `Wine`. It compares
|
|
240
|
+
`InsideForest` with traditional baselines like KMeans and DBSCAN,
|
|
241
|
+
reporting purity, macro F1-score, accuracy, information-theoretic
|
|
242
|
+
metrics and runtime. A basic sensitivity analysis is also provided for
|
|
243
|
+
key hyperparameters: `K` for KMeans and `eps`/`min_samples` for DBSCAN.
|
|
244
|
+
|
|
245
|
+
Recent results are summarized below:
|
|
246
|
+
|
|
247
|
+
| Dataset | Algorithm | Purity | Macro F1 | Accuracy | NMI | AMI | ARI | Bcubed F1 | Divergence | Time (s) |
|
|
248
|
+
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
|
249
|
+
| Digits | InsideForest | 0.783 | 0.362 | 0.261 | 0.501 | 0.339 | 0.169 | 0.218 | 0.789 | 39.570 |
|
|
250
|
+
| Digits | KMeans(k=10) | 0.673 | 0.620 | 0.666 | 0.672 | 0.669 | 0.531 | 0.633 | 0.711 | 0.047 |
|
|
251
|
+
| Digits | DBSCAN(eps=0.5,min=5) | 0.102 | 0.018 | 0.102 | 0.000 | 0.000 | 0.000 | 0.182 | 0.000 | 0.014 |
|
|
252
|
+
| Iris | InsideForest | 0.714 | 0.581 | 0.673 | 0.511 | 0.481 | 0.445 | 0.680 | 0.388 | 0.990 |
|
|
253
|
+
| Iris | KMeans(k=3) | 0.667 | 0.531 | 0.580 | 0.590 | 0.584 | 0.433 | 0.710 | 0.427 | 0.002 |
|
|
254
|
+
| Iris | DBSCAN(eps=0.5,min=5) | 0.680 | 0.674 | 0.680 | 0.511 | 0.505 | 0.442 | 0.651 | 0.402 | 0.002 |
|
|
255
|
+
| Wine | InsideForest | 0.810 | 0.511 | 0.422 | 0.398 | 0.285 | 0.248 | 0.484 | 0.495 | 3.308 |
|
|
256
|
+
| Wine | KMeans(k=3) | 0.966 | 0.967 | 0.966 | 0.876 | 0.875 | 0.897 | 0.937 | 0.628 | 0.004 |
|
|
257
|
+
| Wine | DBSCAN(eps=0.5,min=5) | 0.399 | 0.190 | 0.399 | 0.000 | 0.000 | 0.000 | 0.509 | 0.000 | 0.002 |
|
|
230
258
|
|
|
231
259
|
Execute the script with:
|
|
232
260
|
|
|
@@ -254,6 +282,29 @@ os.environ["OPENAI_API_KEY"] = "sk-your-key"
|
|
|
254
282
|
res = generate_descriptions(iris_conds, OPENAI_API_KEY=os.getenv("OPENAI_API_KEY"))
|
|
255
283
|
```
|
|
256
284
|
|
|
285
|
+
You can also interact with the OpenAI API directly:
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
from openai import OpenAI
|
|
289
|
+
import os
|
|
290
|
+
|
|
291
|
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
292
|
+
response = client.chat.completions.create(
|
|
293
|
+
model="gpt-4o-mini",
|
|
294
|
+
messages=[
|
|
295
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
296
|
+
{
|
|
297
|
+
"role": "user",
|
|
298
|
+
"content": (
|
|
299
|
+
"Summarize: 4.3 <= sepal length (cm) <= 5.8 and "
|
|
300
|
+
"1.0 <= petal width (cm) <= 1.8"
|
|
301
|
+
),
|
|
302
|
+
},
|
|
303
|
+
],
|
|
304
|
+
)
|
|
305
|
+
print(response.choices[0].message.content)
|
|
306
|
+
```
|
|
307
|
+
|
|
257
308
|
### `categorize_conditions`
|
|
258
309
|
|
|
259
310
|
```python
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import time
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.datasets import load_iris, load_wine
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
|
|
10
|
+
from InsideForest import InsideForestClassifier
|
|
11
|
+
from experiments.benchmark import _evaluate
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _prepare_data(loader):
|
|
15
|
+
X, y = loader(return_X_y=True)
|
|
16
|
+
X = StandardScaler().fit_transform(X)
|
|
17
|
+
return train_test_split(X, y, train_size=0.35, stratify=y, random_state=42)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def run_experiments() -> pd.DataFrame:
|
|
21
|
+
datasets = {
|
|
22
|
+
"iris": load_iris,
|
|
23
|
+
"wine": load_wine,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
param_grid = {
|
|
27
|
+
"divide": [3, 5, 7],
|
|
28
|
+
"leaf_percentile": [85, 90, 95],
|
|
29
|
+
"low_leaf_fraction": [0.01, 0.03, 0.05],
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
rows: List[Dict] = []
|
|
33
|
+
|
|
34
|
+
for ds_name, loader in datasets.items():
|
|
35
|
+
X_train, X_test, y_train, y_test = _prepare_data(loader)
|
|
36
|
+
for divide, leaf, low_frac in itertools.product(
|
|
37
|
+
param_grid["divide"],
|
|
38
|
+
param_grid["leaf_percentile"],
|
|
39
|
+
param_grid["low_leaf_fraction"],
|
|
40
|
+
):
|
|
41
|
+
clf = InsideForestClassifier(
|
|
42
|
+
method="select_clusters",
|
|
43
|
+
divide=divide,
|
|
44
|
+
get_detail=False,
|
|
45
|
+
leaf_percentile=leaf,
|
|
46
|
+
low_leaf_fraction=low_frac,
|
|
47
|
+
)
|
|
48
|
+
start = time.time()
|
|
49
|
+
clf.fit(X_train, y_train)
|
|
50
|
+
preds = clf.predict(X_test)
|
|
51
|
+
runtime = time.time() - start
|
|
52
|
+
|
|
53
|
+
name = f"{ds_name}_div{divide}_leaf{leaf}_low{low_frac}"
|
|
54
|
+
metrics = _evaluate(y_test, preds, runtime, name).as_dict()
|
|
55
|
+
metrics.update(
|
|
56
|
+
{
|
|
57
|
+
"dataset": ds_name,
|
|
58
|
+
"divide": divide,
|
|
59
|
+
"leaf_percentile": leaf,
|
|
60
|
+
"low_leaf_fraction": low_frac,
|
|
61
|
+
}
|
|
62
|
+
)
|
|
63
|
+
rows.append(metrics)
|
|
64
|
+
|
|
65
|
+
return pd.DataFrame(rows)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def main() -> None:
|
|
69
|
+
df = run_experiments()
|
|
70
|
+
cols = [
|
|
71
|
+
"dataset",
|
|
72
|
+
"divide",
|
|
73
|
+
"leaf_percentile",
|
|
74
|
+
"low_leaf_fraction",
|
|
75
|
+
"purity",
|
|
76
|
+
"macro_f1",
|
|
77
|
+
"accuracy",
|
|
78
|
+
"nmi",
|
|
79
|
+
"bcubed_f1",
|
|
80
|
+
"runtime",
|
|
81
|
+
]
|
|
82
|
+
print(df[cols].to_string(index=False))
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
if __name__ == "__main__":
|
|
86
|
+
main()
|
|
@@ -2,6 +2,7 @@ import os, sys
|
|
|
2
2
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
+
import pytest
|
|
5
6
|
from InsideForest.cluster_selector import select_clusters
|
|
6
7
|
|
|
7
8
|
|
|
@@ -23,3 +24,18 @@ def test_fallback_cluster_assignment():
|
|
|
23
24
|
assert clusters[1] == 99
|
|
24
25
|
assert clusters_all[1] == [99]
|
|
25
26
|
assert ponderadores_all[1] == [0.0]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_missing_column_in_rule_raises_error():
|
|
30
|
+
df_datos = pd.DataFrame({'x': [0.5]})
|
|
31
|
+
cols = pd.MultiIndex.from_tuples([
|
|
32
|
+
('linf', 'y'),
|
|
33
|
+
('lsup', 'y'),
|
|
34
|
+
('metrics', 'ponderador'),
|
|
35
|
+
])
|
|
36
|
+
df_reglas = pd.DataFrame([[0.0, 1.0, 1.0]], columns=cols)
|
|
37
|
+
df_reglas['cluster'] = [1.0]
|
|
38
|
+
|
|
39
|
+
with pytest.raises(KeyError) as excinfo:
|
|
40
|
+
select_clusters(df_datos, df_reglas)
|
|
41
|
+
assert 'y' in str(excinfo.value)
|
|
@@ -9,6 +9,7 @@ from InsideForest.descrip import (
|
|
|
9
9
|
_scale_clusters,
|
|
10
10
|
_compute_inflection_points,
|
|
11
11
|
_merge_outputs,
|
|
12
|
+
_list_rules_to_text,
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
@@ -87,3 +88,8 @@ def test_merge_outputs():
|
|
|
87
88
|
}
|
|
88
89
|
assert expected_cols.issubset(final_df.columns)
|
|
89
90
|
assert "cluster_ponderador" not in final_df.columns
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_list_rules_to_text_empty_rule_set_returns_placeholder():
    """An empty rule list is rendered as the em-dash placeholder."""
    empty_meta = pd.DataFrame()
    rendered = _list_rules_to_text([], empty_meta, lang="en")
    assert rendered == "—"
|
|
@@ -49,6 +49,22 @@ def test_fit_accepts_df_with_target_column():
|
|
|
49
49
|
assert model.labels_.shape[0] == len(df)
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
def test_fit_with_y_and_df_includes_target_column():
    """Fit with an explicit ``y`` while ``X`` still carries the target column.

    After fitting, ``target`` must not be listed among the model's feature
    names, and predicting on the two feature columns must yield one value
    per row.
    """
    frame = pd.DataFrame(
        data={
            'feat1': [0, 1, 2, 3],
            'feat2': [3, 2, 1, 0],
            'target': [0, 1, 0, 1],
        }
    )
    labels = frame['target'].to_numpy()
    rf_settings = {'n_estimators': 5, 'random_state': 0}
    classifier = InsideForestClassifier(rf_params=rf_settings)
    classifier.fit(X=frame, y=labels)
    assert 'target' not in classifier.feature_names_

    feature_frame = frame[['feat1', 'feat2']]
    predictions = classifier.predict(feature_frame)
    assert predictions.shape == (4,)
|
|
66
|
+
|
|
67
|
+
|
|
52
68
|
def test_fit_df_missing_target_raises():
|
|
53
69
|
df = pd.DataFrame(data={'feat1': [0, 1], 'feat2': [1, 0]})
|
|
54
70
|
model = InsideForestClassifier()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from InsideForest.metadata import MetaExtractor, run_experiments
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_run_experiments_includes_intersection_stats():
    """``run_experiments`` output exposes the intersection sample statistics.

    Checks that both intersection columns exist and that the first result
    row reports 5 samples with an effectiveness of roughly 0.2.
    """
    # Toy dataset with a simple binary target.
    samples = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'target': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
    })

    # Two cluster descriptions that share the same rule on x.
    shared_x_rule = '0 <= x <= 5'
    cluster_info = pd.DataFrame({
        'cluster': [0, 1],
        'cluster_descripcion': [
            shared_x_rule + ' AND 0 <= y <= 5',
            shared_x_rule + ' AND 5 <= y <= 10',
        ],
        'cluster_ef_sample': [0.2, 1.0],
        'cluster_n_sample': [5, 1],
    })

    # Minimal metadata covering the variables x and y.
    variable_meta = pd.DataFrame(
        {
            'actionability.increase_difficulty': [1, 1],
            'actionability.decrease_difficulty': [1, 1],
        },
        index=['x', 'y'],
    )
    mx = MetaExtractor(variable_meta, var_obj='target')

    result = run_experiments(mx, {'ds': cluster_info}, data_dict={'ds': samples})

    for expected_column in ('intersection_n_sample', 'intersection_ef_sample'):
        assert expected_column in result.columns

    first_row = result.iloc[0]
    assert first_row['intersection_n_sample'] == 5
    assert first_row['intersection_ef_sample'] == pytest.approx(0.2)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from InsideForest.models import Models
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_get_knn_rows_success():
    """``get_knn_rows`` with ``min_obs=0`` splits the input into two frames.

    The first frame must be non-empty, both frames together must have as
    many rows as the input, and the second frame must equal the input with
    the first frame's rows dropped.
    """
    data = pd.DataFrame(
        {
            'feature': [0, 1, 2, 3, 4, 5],
            'target': [0, 0, 0, 1, 1, 1],
        }
    )
    selected, remaining = Models().get_knn_rows(
        data, 'target', criterio_fp=False, min_obs=0
    )

    assert not selected.empty
    assert len(selected) + len(remaining) == len(data)

    expected_remaining = data.drop(selected.index)
    assert remaining.equals(expected_remaining)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_get_knn_rows_no_misclassification():
    """With ``min_obs=10`` on six rows nothing is selected.

    The first returned frame must be empty and the second must equal the
    untouched input.
    """
    data = pd.DataFrame(
        {
            'feature': [0, 1, 2, 3, 4, 5],
            'target': [0, 0, 0, 1, 1, 1],
        }
    )
    selected, remaining = Models().get_knn_rows(data, 'target', min_obs=10)

    assert selected.empty
    assert remaining.equals(data)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_get_knn_rows_training_error():
    """String-valued features yield an empty selection and the input back.

    NOTE(review): going by the test name, the non-numeric ``feature`` column
    presumably makes the internal model training fail; confirm against
    ``Models.get_knn_rows``.
    """
    data = pd.DataFrame(
        {
            'feature': ['a', 'b', 'c'],
            'target': [0, 1, 0],
        }
    )
    selected, remaining = Models().get_knn_rows(data, 'target')

    assert selected.empty
    assert remaining.equals(data)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|