InsideForest 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/cluster_selector.py +54 -55
  2. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/descrip.py +121 -117
  3. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/inside_forest.py +215 -1
  4. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/labels.py +6 -0
  5. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/metadata.py +60 -14
  6. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/models.py +10 -5
  7. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/regions.py +133 -120
  8. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/trees.py +16 -17
  9. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/PKG-INFO +1 -1
  10. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/SOURCES.txt +4 -0
  11. {insideforest-0.3.2 → insideforest-0.3.4}/PKG-INFO +1 -1
  12. {insideforest-0.3.2 → insideforest-0.3.4}/README.md +61 -6
  13. insideforest-0.3.4/experiments/select_clusters_hyperparam.py +86 -0
  14. {insideforest-0.3.2 → insideforest-0.3.4}/setup.py +1 -1
  15. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_cluster_selector.py +16 -0
  16. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_descrip.py +9 -9
  17. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_descrip_helpers.py +17 -11
  18. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_fit_predict.py +16 -0
  19. insideforest-0.3.4/tests/test_metadata_run_experiments.py +40 -0
  20. insideforest-0.3.4/tests/test_models.py +30 -0
  21. insideforest-0.3.4/tests/test_numeric_utils.py +65 -0
  22. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest/__init__.py +0 -0
  23. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/dependency_links.txt +0 -0
  24. {insideforest-0.3.2 → insideforest-0.3.4}/InsideForest.egg-info/top_level.txt +0 -0
  25. {insideforest-0.3.2 → insideforest-0.3.4}/LICENSE +0 -0
  26. {insideforest-0.3.2 → insideforest-0.3.4}/experiments/__init__.py +0 -0
  27. {insideforest-0.3.2 → insideforest-0.3.4}/experiments/benchmark.py +0 -0
  28. {insideforest-0.3.2 → insideforest-0.3.4}/experiments/benchmark_get_rangos.py +0 -0
  29. {insideforest-0.3.2 → insideforest-0.3.4}/experiments/rf_param_benchmark.py +0 -0
  30. {insideforest-0.3.2 → insideforest-0.3.4}/experiments/summary_benchmark.py +0 -0
  31. {insideforest-0.3.2 → insideforest-0.3.4}/setup.cfg +0 -0
  32. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_chimera_values_selector.py +0 -0
  33. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_eps_search_perf.py +0 -0
  34. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_params.py +0 -0
  35. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_inside_forest_regressor_fit_predict.py +0 -0
  36. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_iou_equivalence.py +0 -0
  37. {insideforest-0.3.2 → insideforest-0.3.4}/tests/test_trees.py +0 -0
@@ -86,6 +86,9 @@ def select_clusters(
86
86
  ponderador = regla['ponderador']
87
87
  cluster = regla['cluster']
88
88
 
89
+ missing_cols = [col for col in variables if col not in df_datos.columns]
90
+ if missing_cols:
91
+ raise KeyError(f"Columns not found in df_datos: {missing_cols}")
89
92
  X_datos = df_datos[variables]
90
93
  condiciones = [
91
94
  (X_datos[var].to_numpy() >= linf[var]) & (X_datos[var].to_numpy() <= lsup[var])
@@ -126,18 +129,19 @@ def select_clusters(
126
129
 
127
130
  class MenuClusterSelector:
128
131
  """
129
- Selector de 'clústers' cuando X = records (solo menús por fila).
130
- Entrena con y para estimar q_v(y). En predicción elige un valor por fila
131
- maximizando un objetivo global: J = w_nmi * NMI + w_v * V_measure - λ * RegK.
132
-
133
- - fit(records_train, y): calcula q_v(y) (suavizado Laplace) y fija el vocabulario de valores.
134
- - predict(records, n_clusters=None): asigna un valor por fila SIN ver y, optimizando J
135
- por ascenso coordinado (greedy por mejoras de J). Si se da n_clusters=K, primero
136
- restringe a un catálogo S de tamaño K mediante cobertura+calidad y luego optimiza.
132
+ Cluster selector when X = records (one menu per row).
133
+ Trains with y to estimate q_v(y). During prediction chooses a value per row
134
+ maximizing a global objective: J = w_nmi * NMI + w_v * V_measure - λ * RegK.
135
+
136
+ - fit(records_train, y): compute q_v(y) (Laplace smoothing) and set the
137
+ vocabulary of values.
138
+ - predict(records, n_clusters=None): assign one value per row without seeing y,
139
+ optimizing J via coordinate ascent (greedy improvements). If n_clusters=K,
140
+ first restrict to a catalog S of size K using coverage+quality and then optimize.
137
141
  """
138
142
 
139
143
  # =========================
140
- # MÉTRICAS (explícitas)
144
+ # METRICS (explicit)
141
145
  # =========================
142
146
  @staticmethod
143
147
  def _safe_div(a, b):
@@ -182,7 +186,7 @@ class MenuClusterSelector:
182
186
  def _v_measure_from_soft(cls, C: np.ndarray, Py: np.ndarray, Pv: np.ndarray,
183
187
  beta: float = 1.0) -> float:
184
188
  """
185
- V-measure = armónica(homogeneidad, completitud) sobre C, Py, Pv.
189
+ V-measure = harmonic(homogeneity, completeness) over C, Py, Pv.
186
190
  """
187
191
  n = C.sum()
188
192
  if n <= 0:
@@ -212,9 +216,9 @@ class MenuClusterSelector:
212
216
  @staticmethod
213
217
  def _k_regularizer(Pv: np.ndarray, target_K: int | None, lam: float) -> float:
214
218
  """
215
- Regularizador global sobre el de valores usados:
216
- - Si target_K es None: Reg = lam * H(V) (castiga alta entropíamenos valores efectivos).
217
- - Si target_K es int: Reg = lam * (H(V) - log(target_K))^2 (empuja a ~K valores).
219
+ Global regularizer over the number of values used:
220
+ - If target_K is None: Reg = lam * H(V) (penalizes high entropy → fewer effective values).
221
+ - If target_K is int: Reg = lam * (H(V) - log(target_K))^2 (pushes toward ~K values).
218
222
  """
219
223
  if lam <= 0.0:
220
224
  return 0.0
@@ -270,7 +274,7 @@ class MenuClusterSelector:
270
274
  self.value_to_idx_ = {v: i for i, v in enumerate(self.idx_to_value_)}
271
275
 
272
276
  def _ensure_vocab_for_predict(self, records: Sequence[Sequence[Any]]):
273
- # Añade valores no vistos en train con q_v uniforme (suavizado)
277
+ # Add values not seen during training, with uniform q_v (smoothing)
274
278
  new_vals = []
275
279
  for row in records:
276
280
  for v in (row if row else [None]):
@@ -446,9 +450,9 @@ class MenuClusterSelector:
446
450
 
447
451
  def predict(self, records: Sequence[Sequence[Any]], n_clusters: int | None = None) -> List[Any]:
448
452
  """
449
- Asigna 1 valor por fila maximizando J = w_nmi*NMI + w_v*V - lam_k*RegK,
450
- usando ascenso coordinado (greedy por mejoras).
451
- Si n_clusters=K, primero restringe a un catálogo S de tamaño K (cobertura+calidad).
453
+ Assign one value per row maximizing J = w_nmi*NMI + w_v*V - lam_k*RegK,
454
+ using coordinate ascent (greedy improvements).
455
+ If n_clusters=K, first restrict to a catalog S of size K (coverage+quality).
452
456
  """
453
457
  assert (
454
458
  self.q_ is not None and self.classes_ is not None and self.Py_ is not None
@@ -479,12 +483,12 @@ def balance_lists_n_clusters(
479
483
  seed: int | None = None,
480
484
  ) -> List[Any]:
481
485
  """
482
- Asigna **un único valor por fila** optimizando dos objetivos con *peso idéntico*:
486
+ Assign **a single value per row** optimizing two objectives with identical weight:
483
487
 
484
- • |distinct - n_clusters| → acercarse al deseado de clusters
485
- (si `n_clusters` es `None`, se toma el mínimo posible de forma natural).
488
+ • |distinct - n_clusters| → approach the desired number of clusters
489
+ (if `n_clusters` is `None`, the minimum possible is chosen naturally).
486
490
 
487
- Desbalance absoluto Σ |c_v – ideal| / n, donde `ideal = n / k`.
491
+ • Absolute imbalance Σ |c_v – ideal| / n, where ``ideal = n / k``.
488
492
  """
489
493
  rng = random.Random(seed)
490
494
  records = [row if row else [-1] for row in records]
@@ -508,7 +512,7 @@ def balance_lists_n_clusters(
508
512
  return cluster_term + imbalance(cnt)
509
513
 
510
514
  def neighbour(assign: List[Any]) -> List[Any]:
511
- """Mueve una fila a otra opción válida (aleatorio)."""
515
+ """Move one row to another valid option (random)."""
512
516
  i = rng.randrange(n)
513
517
  row = records[i]
514
518
  cur = assign[i]
@@ -519,7 +523,7 @@ def balance_lists_n_clusters(
519
523
  new[i] = rng.choice(alt)
520
524
  return new
521
525
 
522
- # Inicialización razonable ------------------------------------------
526
+ # Reasonable initialization ------------------------------------------
523
527
  val_rows = defaultdict(list)
524
528
  for idx, row in enumerate(records):
525
529
  for v in row:
@@ -585,11 +589,11 @@ def max_prob_clusters(
585
589
  seed: int | None = None,
586
590
  ) -> List[Any]:
587
591
  """
588
- Selecciona **un valor por fila** cumpliendo:
589
- Si `n_clusters` es `None` minimiza el de valores distintos.
590
- Si `n_clusters` es un entero:
591
- intenta devolver EXACTAMENTE ese de clusters, maximizando la suma de probabilidades.
592
- si es imposible, usa el valor factible más próximo (`k_min` o `k_max`).
592
+ Select **one value per row** such that:
593
+ If `n_clusters` is `None`, minimize the number of distinct values.
594
+ If `n_clusters` is an integer:
595
+ attempt to return EXACTLY that number of clusters, maximizing the sum of probabilities.
596
+ if impossible, use the nearest feasible value (`k_min` or `k_max`).
593
597
  """
594
598
  rng = random.Random(seed)
595
599
  n = len(records)
@@ -630,7 +634,7 @@ def max_prob_clusters(
630
634
 
631
635
  S = set(list(S)[:k_target]) # asegura |S| == k_target
632
636
 
633
- # Paso 3: asignación greedy -----------------------------------------
637
+ # Step 3: greedy assignment -----------------------------------------
634
638
  assign: List[Any] = []
635
639
  for row in records:
636
640
  opts = [v for v in row if v in S]
@@ -687,29 +691,24 @@ def match_class_distribution(
687
691
  *,
688
692
  seed: int | None = None,
689
693
  ) -> List[Any]:
690
- """Asignar un valor por fila imitando la distribución de ``y``.
691
-
692
- Para cada registro se selecciona una etiqueta de su lista de opciones de
693
- forma que, de manera codiciosa, la distribución de clases dentro de cada
694
- etiqueta sea lo más parecida posible a la distribución global observada en
695
- ``y``.
694
+ """Assign a value per row imitating the distribution of ``y``.
696
695
 
697
696
  Parameters
698
697
  ----------
699
698
  records : Sequence[Sequence[Any]]
700
- Lista de opciones de etiqueta por fila.
699
+ List of label options per row.
701
700
  y : Sequence[Any]
702
- Clases objetivo asociadas a cada fila.
701
+ Target classes associated with each row.
703
702
  n_clusters : int | None, optional
704
- Número deseado de etiquetas distintas. Se usa como cota blanda
705
- priorizando las más frecuentes.
703
+ Desired number of distinct labels. Used as a soft bound,
704
+ prioritizing the most frequent.
706
705
  seed : int | None, optional
707
- Semilla para la aleatoriedad del orden de procesamiento.
706
+ Random seed for processing order.
708
707
 
709
708
  Returns
710
709
  -------
711
710
  List[Any]
712
- Etiqueta seleccionada para cada fila.
711
+ Label selected for each row.
713
712
  """
714
713
 
715
714
  rng = np.random.default_rng(seed)
@@ -790,9 +789,9 @@ def _round_quota(pi: np.ndarray, n: int) -> np.ndarray:
790
789
 
791
790
  def compress_distribution_to_K(Py: np.ndarray, K: int) -> np.ndarray:
792
791
  """
793
- Comprime la dist. de y (conteos Py) a K masas sin perder “silueta”
794
- fusionando repetidamente las dos masas más pequeñas.
795
- Devuelve proporciones (suman 1).
792
+ Compress the distribution of y (counts Py) to K masses without losing "shape"
793
+ by repeatedly merging the two smallest masses.
794
+ Returns proportions that sum to 1.
796
795
  """
797
796
  masses = list(np.asarray(Py, dtype=np.float64))
798
797
  if K >= len(masses):
@@ -814,19 +813,19 @@ def compress_distribution_to_K(Py: np.ndarray, K: int) -> np.ndarray:
814
813
 
815
814
  class ChimeraValuesSelector:
816
815
  """
817
- Asigna UN valor por fila (siempre de su propio menú) de forma que:
818
- - La cantidad de valores DISTINTOS (K) puede fijarse o autoelegirse.
819
- - La distribución de frecuencias por valor elegido IMITA la silueta de y
820
- (comprime Py→K masas y traduce esas cuotas a K valores reales).
821
- - La calidad semántica del valor v se mide con s(v) = log q_v · P(y).
816
+ Assign ONE value per row (always from its own menu) such that:
817
+ - The number of DISTINCT values (K) can be fixed or auto-chosen.
818
+ - The frequency distribution per chosen value mimics the shape of y
819
+ (compress Py→K masses and translate those quotas to K actual values).
820
+ - The semantic quality of value v is measured with s(v) = log q_v · P(y).
822
821
 
823
- Flujo:
822
+ Flow:
824
823
  fit(records_train, y_train):
825
- - Aprende q_v(y) con suavizado de Laplace sobre disponibilidad.
824
+ - Learn q_v(y) with Laplace smoothing over availability.
826
825
  predict(records, n_labels=None, k_range=(2,12)):
827
- - Construye catálogo S de K valores (set-cover + calidad).
828
- - Asigna CUOTAS objetivo ~ pi_K*n a cada valor de S (respetando capacidades).
829
- - Asigna cada fila a su mejor opción en S con capacidad disponible.
826
+ - Build catalog S of K values (set-cover + quality).
827
+ - Assign target quotas ~ pi_K*n to each value of S (respecting capacities).
828
+ - Assign each row to its best option in S with available capacity.
830
829
  """
831
830
 
832
831
  def __init__(self, smoothing: float = 1.0, seed: Optional[int] = 42):
@@ -1068,7 +1067,7 @@ class ChimeraValuesSelector:
1068
1067
  best = cand
1069
1068
 
1070
1069
  if best is None:
1071
- raise RuntimeError("No se encontró K factible en el rango dado.")
1070
+ raise RuntimeError("No feasible K found in the given range.")
1072
1071
 
1073
1072
  _, cv_diff, S, quota, assign_idx = best
1074
1073
  labels = [self.idx_to_value_[j] for j in assign_idx]