InsideForest 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/cluster_selector.py +54 -55
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/descrip.py +121 -117
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/inside_forest.py +215 -1
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/labels.py +6 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/metadata.py +60 -14
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/models.py +10 -5
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/regions.py +133 -120
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/trees.py +16 -17
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/PKG-INFO +1 -1
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/SOURCES.txt +4 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/PKG-INFO +1 -1
- {insideforest-0.3.2 → insideforest-0.3.4}/README.md +61 -6
- insideforest-0.3.4/experiments/select_clusters_hyperparam.py +86 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/setup.py +1 -1
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_cluster_selector.py +16 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_descrip.py +9 -9
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_descrip_helpers.py +17 -11
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_fit_predict.py +16 -0
- insideforest-0.3.4/tests/test_metadata_run_experiments.py +40 -0
- insideforest-0.3.4/tests/test_models.py +30 -0
- insideforest-0.3.4/tests/test_numeric_utils.py +65 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/__init__.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/dependency_links.txt +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/top_level.txt +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/LICENSE +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/experiments/__init__.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/experiments/benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/experiments/benchmark_get_rangos.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/experiments/rf_param_benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/experiments/summary_benchmark.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/setup.cfg +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_chimera_values_selector.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_eps_search_perf.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_params.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_regressor_fit_predict.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_iou_equivalence.py +0 -0
- {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_trees.py +0 -0
|
@@ -86,6 +86,9 @@ def select_clusters(
|
|
|
86
86
|
ponderador = regla['ponderador']
|
|
87
87
|
cluster = regla['cluster']
|
|
88
88
|
|
|
89
|
+
missing_cols = [col for col in variables if col not in df_datos.columns]
|
|
90
|
+
if missing_cols:
|
|
91
|
+
raise KeyError(f"Columns not found in df_datos: {missing_cols}")
|
|
89
92
|
X_datos = df_datos[variables]
|
|
90
93
|
condiciones = [
|
|
91
94
|
(X_datos[var].to_numpy() >= linf[var]) & (X_datos[var].to_numpy() <= lsup[var])
|
|
@@ -126,18 +129,19 @@ def select_clusters(
|
|
|
126
129
|
|
|
127
130
|
class MenuClusterSelector:
|
|
128
131
|
"""
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
- fit(records_train, y):
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
132
|
+
Cluster selector when X = records (one menu per row).
|
|
133
|
+
Trains with y to estimate q_v(y). During prediction chooses a value per row
|
|
134
|
+
maximizing a global objective: J = w_nmi * NMI + w_v * V_measure - λ * RegK.
|
|
135
|
+
|
|
136
|
+
- fit(records_train, y): compute q_v(y) (Laplace smoothing) and set the
|
|
137
|
+
vocabulary of values.
|
|
138
|
+
- predict(records, n_clusters=None): assign one value per row without seeing y,
|
|
139
|
+
optimizing J via coordinate ascent (greedy improvements). If n_clusters=K,
|
|
140
|
+
first restrict to a catalog S of size K using coverage+quality and then optimize.
|
|
137
141
|
"""
|
|
138
142
|
|
|
139
143
|
# =========================
|
|
140
|
-
#
|
|
144
|
+
# METRICS (explicit)
|
|
141
145
|
# =========================
|
|
142
146
|
@staticmethod
|
|
143
147
|
def _safe_div(a, b):
|
|
@@ -182,7 +186,7 @@ class MenuClusterSelector:
|
|
|
182
186
|
def _v_measure_from_soft(cls, C: np.ndarray, Py: np.ndarray, Pv: np.ndarray,
|
|
183
187
|
beta: float = 1.0) -> float:
|
|
184
188
|
"""
|
|
185
|
-
V-measure =
|
|
189
|
+
V-measure = harmonic(homogeneity, completeness) over C, Py, Pv.
|
|
186
190
|
"""
|
|
187
191
|
n = C.sum()
|
|
188
192
|
if n <= 0:
|
|
@@ -212,9 +216,9 @@ class MenuClusterSelector:
|
|
|
212
216
|
@staticmethod
|
|
213
217
|
def _k_regularizer(Pv: np.ndarray, target_K: int | None, lam: float) -> float:
|
|
214
218
|
"""
|
|
215
|
-
|
|
216
|
-
-
|
|
217
|
-
-
|
|
219
|
+
Global regularizer over the number of values used:
|
|
220
|
+
- If target_K is None: Reg = lam * H(V) (penalizes high entropy ⇒ fewer effective values).
|
|
221
|
+
- If target_K is int: Reg = lam * (H(V) - log(target_K))^2 (pushes toward ~K values).
|
|
218
222
|
"""
|
|
219
223
|
if lam <= 0.0:
|
|
220
224
|
return 0.0
|
|
@@ -270,7 +274,7 @@ class MenuClusterSelector:
|
|
|
270
274
|
self.value_to_idx_ = {v: i for i, v in enumerate(self.idx_to_value_)}
|
|
271
275
|
|
|
272
276
|
def _ensure_vocab_for_predict(self, records: Sequence[Sequence[Any]]):
|
|
273
|
-
#
|
|
277
|
+
# Add values not seen during training, with uniform q_v (smoothing)
|
|
274
278
|
new_vals = []
|
|
275
279
|
for row in records:
|
|
276
280
|
for v in (row if row else [None]):
|
|
@@ -446,9 +450,9 @@ class MenuClusterSelector:
|
|
|
446
450
|
|
|
447
451
|
def predict(self, records: Sequence[Sequence[Any]], n_clusters: int | None = None) -> List[Any]:
|
|
448
452
|
"""
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
453
|
+
Assign one value per row maximizing J = w_nmi*NMI + w_v*V - lam_k*RegK,
|
|
454
|
+
using coordinate ascent (greedy improvements).
|
|
455
|
+
If n_clusters=K, first restrict to a catalog S of size K (coverage+quality).
|
|
452
456
|
"""
|
|
453
457
|
assert (
|
|
454
458
|
self.q_ is not None and self.classes_ is not None and self.Py_ is not None
|
|
@@ -479,12 +483,12 @@ def balance_lists_n_clusters(
|
|
|
479
483
|
seed: int | None = None,
|
|
480
484
|
) -> List[Any]:
|
|
481
485
|
"""
|
|
482
|
-
|
|
486
|
+
Assign **a single value per row**, optimizing two objectives with equal weight:
|
|
483
487
|
|
|
484
|
-
• |distinct - n_clusters| →
|
|
485
|
-
(
|
|
488
|
+
• |distinct - n_clusters| → approach the desired number of clusters
|
|
489
|
+
(if `n_clusters` is `None`, the minimum possible is chosen naturally).
|
|
486
490
|
|
|
487
|
-
•
|
|
491
|
+
• Absolute imbalance → Σ |c_v – ideal| / n, where ``ideal = n / k``.
|
|
488
492
|
"""
|
|
489
493
|
rng = random.Random(seed)
|
|
490
494
|
records = [row if row else [-1] for row in records]
|
|
@@ -508,7 +512,7 @@ def balance_lists_n_clusters(
|
|
|
508
512
|
return cluster_term + imbalance(cnt)
|
|
509
513
|
|
|
510
514
|
def neighbour(assign: List[Any]) -> List[Any]:
|
|
511
|
-
"""
|
|
515
|
+
"""Move one row to another valid option (random)."""
|
|
512
516
|
i = rng.randrange(n)
|
|
513
517
|
row = records[i]
|
|
514
518
|
cur = assign[i]
|
|
@@ -519,7 +523,7 @@ def balance_lists_n_clusters(
|
|
|
519
523
|
new[i] = rng.choice(alt)
|
|
520
524
|
return new
|
|
521
525
|
|
|
522
|
-
#
|
|
526
|
+
# Reasonable initialization ------------------------------------------
|
|
523
527
|
val_rows = defaultdict(list)
|
|
524
528
|
for idx, row in enumerate(records):
|
|
525
529
|
for v in row:
|
|
@@ -585,11 +589,11 @@ def max_prob_clusters(
|
|
|
585
589
|
seed: int | None = None,
|
|
586
590
|
) -> List[Any]:
|
|
587
591
|
"""
|
|
588
|
-
|
|
589
|
-
•
|
|
590
|
-
•
|
|
591
|
-
–
|
|
592
|
-
–
|
|
592
|
+
Select **one value per row** such that:
|
|
593
|
+
• If `n_clusters` is `None` → minimize number of distinct values.
|
|
594
|
+
• If `n_clusters` is an integer:
|
|
595
|
+
– attempt to return EXACTLY that number of clusters, maximizing the sum of probabilities.
|
|
596
|
+
– if impossible, use the nearest feasible value (`k_min` or `k_max`).
|
|
593
597
|
"""
|
|
594
598
|
rng = random.Random(seed)
|
|
595
599
|
n = len(records)
|
|
@@ -630,7 +634,7 @@ def max_prob_clusters(
|
|
|
630
634
|
|
|
631
635
|
S = set(list(S)[:k_target]) # asegura |S| == k_target
|
|
632
636
|
|
|
633
|
-
#
|
|
637
|
+
# Step 3: greedy assignment -----------------------------------------
|
|
634
638
|
assign: List[Any] = []
|
|
635
639
|
for row in records:
|
|
636
640
|
opts = [v for v in row if v in S]
|
|
@@ -687,29 +691,24 @@ def match_class_distribution(
|
|
|
687
691
|
*,
|
|
688
692
|
seed: int | None = None,
|
|
689
693
|
) -> List[Any]:
|
|
690
|
-
"""
|
|
691
|
-
|
|
692
|
-
Para cada registro se selecciona una etiqueta de su lista de opciones de
|
|
693
|
-
forma que, de manera codiciosa, la distribución de clases dentro de cada
|
|
694
|
-
etiqueta sea lo más parecida posible a la distribución global observada en
|
|
695
|
-
``y``.
|
|
694
|
+
"""Assign a value per row imitating the distribution of ``y``.
|
|
696
695
|
|
|
697
696
|
Parameters
|
|
698
697
|
----------
|
|
699
698
|
records : Sequence[Sequence[Any]]
|
|
700
|
-
|
|
699
|
+
List of label options per row.
|
|
701
700
|
y : Sequence[Any]
|
|
702
|
-
|
|
701
|
+
Target classes associated with each row.
|
|
703
702
|
n_clusters : int | None, optional
|
|
704
|
-
|
|
705
|
-
|
|
703
|
+
Desired number of distinct labels. Used as a soft bound,
|
|
704
|
+
prioritizing the most frequent.
|
|
706
705
|
seed : int | None, optional
|
|
707
|
-
|
|
706
|
+
Random seed for processing order.
|
|
708
707
|
|
|
709
708
|
Returns
|
|
710
709
|
-------
|
|
711
710
|
List[Any]
|
|
712
|
-
|
|
711
|
+
Label selected for each row.
|
|
713
712
|
"""
|
|
714
713
|
|
|
715
714
|
rng = np.random.default_rng(seed)
|
|
@@ -790,9 +789,9 @@ def _round_quota(pi: np.ndarray, n: int) -> np.ndarray:
|
|
|
790
789
|
|
|
791
790
|
def compress_distribution_to_K(Py: np.ndarray, K: int) -> np.ndarray:
|
|
792
791
|
"""
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
792
|
+
Compress the distribution of y (counts Py) to K masses without losing "shape"
|
|
793
|
+
by repeatedly merging the two smallest masses.
|
|
794
|
+
Returns proportions that sum to 1.
|
|
796
795
|
"""
|
|
797
796
|
masses = list(np.asarray(Py, dtype=np.float64))
|
|
798
797
|
if K >= len(masses):
|
|
@@ -814,19 +813,19 @@ def compress_distribution_to_K(Py: np.ndarray, K: int) -> np.ndarray:
|
|
|
814
813
|
|
|
815
814
|
class ChimeraValuesSelector:
|
|
816
815
|
"""
|
|
817
|
-
|
|
818
|
-
-
|
|
819
|
-
-
|
|
820
|
-
(
|
|
821
|
-
-
|
|
816
|
+
Assign ONE value per row (always from its own menu) such that:
|
|
817
|
+
- The number of DISTINCT values (K) can be fixed or auto-chosen.
|
|
818
|
+
- The frequency distribution per chosen value mimics the shape of y
|
|
819
|
+
(compress Py→K masses and translate those quotas to K actual values).
|
|
820
|
+
- The semantic quality of value v is measured with s(v) = log q_v · P(y).
|
|
822
821
|
|
|
823
|
-
|
|
822
|
+
Flow:
|
|
824
823
|
fit(records_train, y_train):
|
|
825
|
-
-
|
|
824
|
+
- Learn q_v(y) with Laplace smoothing over availability.
|
|
826
825
|
predict(records, n_labels=None, k_range=(2,12)):
|
|
827
|
-
-
|
|
828
|
-
-
|
|
829
|
-
-
|
|
826
|
+
- Build catalog S of K values (set-cover + quality).
|
|
827
|
+
- Assign target quotas ~ pi_K*n to each value of S (respecting capacities).
|
|
828
|
+
- Assign each row to its best option in S with available capacity.
|
|
830
829
|
"""
|
|
831
830
|
|
|
832
831
|
def __init__(self, smoothing: float = 1.0, seed: Optional[int] = 42):
|
|
@@ -1068,7 +1067,7 @@ class ChimeraValuesSelector:
|
|
|
1068
1067
|
best = cand
|
|
1069
1068
|
|
|
1070
1069
|
if best is None:
|
|
1071
|
-
raise RuntimeError("No
|
|
1070
|
+
raise RuntimeError("No feasible K found in the given range.")
|
|
1072
1071
|
|
|
1073
1072
|
_, cv_diff, S, quota, assign_idx = best
|
|
1074
1073
|
labels = [self.idx_to_value_[j] for j in assign_idx]
|