statslibx 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/io.py ADDED
@@ -0,0 +1,21 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ from pathlib import Path
4
+
5
+
6
def load_file(path: str):
    """Load a tabular data file into a pandas DataFrame.

    Supported formats, selected by file extension (case-insensitive):
    ``.csv``, ``.json``, and ``.txt``/``.tsv`` (read as tab-separated).

    Args:
        path: Path to the file on disk.

    Returns:
        pd.DataFrame with the file contents.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the extension is not one of the supported types.
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"{path} not found")

    # Normalize the extension so ".CSV", ".Json", etc. are accepted too.
    suffix = path.suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(path)

    if suffix == ".json":
        return pd.read_json(path)

    if suffix in {".txt", ".tsv"}:
        return pd.read_csv(path, sep="\t")

    raise ValueError(f"Unsupported file type: {path.suffix}")
@@ -0,0 +1,228 @@
1
+ from typing import Optional, Union, List, Dict, Any
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+
6
+
7
class Preprocessing:
    """Preprocessing helper that works on pandas *and* polars DataFrames.

    Offers inspection (nulls, uniqueness), description, transformation
    (fill, normalize, standardize), filtering, outlier detection and a
    data-quality report. Mutating methods return ``self`` for chaining.
    """

    def __init__(self, data: "Union[pd.DataFrame, pl.DataFrame]"):
        if not isinstance(data, (pd.DataFrame, pl.DataFrame)):
            raise TypeError("data must be a pandas or polars DataFrame")
        self.data = data

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _is_pandas(self) -> bool:
        """True when the wrapped frame is a pandas DataFrame."""
        return isinstance(self.data, pd.DataFrame)

    def _is_polars(self) -> bool:
        """True when the wrapped frame is a polars DataFrame."""
        return isinstance(self.data, pl.DataFrame)

    def _count_nulls(self, column: str) -> int:
        """Number of null/NaN entries in *column*."""
        if self._is_pandas():
            return int(self.data[column].isna().sum())
        return int(self.data[column].null_count())

    def _get_columns(self, columns):
        """Normalize a column selection (None, str, or list) to a list."""
        if columns is None:
            return list(self.data.columns)
        if isinstance(columns, str):
            return [columns]
        return columns

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def detect_nulls(
        self,
        columns: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """Per-column null counts and percentages.

        Returns a pandas DataFrame with columns: column, nulls,
        non_nulls, null_pct.
        """
        columns = self._get_columns(columns)
        total = self.data.shape[0]

        rows = []
        for col in columns:
            nulls = self._count_nulls(col)
            rows.append({
                "column": col,
                "nulls": nulls,
                "non_nulls": total - nulls,
                # Guard against ZeroDivisionError on an empty frame.
                "null_pct": nulls / total if total else 0.0
            })

        return pd.DataFrame(rows)

    def check_uniqueness(self) -> pd.DataFrame:
        """Number of distinct values per column, as a pandas DataFrame."""
        if self._is_pandas():
            unique = self.data.nunique()
            return pd.DataFrame({
                "column": unique.index,
                "unique_values": unique.values
            })

        unique = self.data.select(pl.all().n_unique())
        return unique.to_pandas().melt(
            var_name="column",
            value_name="unique_values"
        )

    def preview_data(self, n: int = 5):
        """First *n* rows of the wrapped frame (same backend as input)."""
        return self.data.head(n)

    # ------------------------------------------------------------------
    # Description
    # ------------------------------------------------------------------

    def describe_numeric(self):
        """Summary statistics for numeric columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include=np.number).describe()

        # The previous pl.all().filter(pl.col(pl.NUMERIC)) filtered rows
        # (and pl.NUMERIC does not exist); select numeric dtypes instead.
        # NOTE(review): pl.NUMERIC_DTYPES availability depends on the
        # installed polars version — confirm against the pinned version.
        return self.data.select(pl.col(pl.NUMERIC_DTYPES)).describe()

    def describe_categorical(self):
        """Summary statistics for string/object columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include="object").describe()

        # Select string-typed columns; pl.all().filter(...) filtered rows.
        return self.data.select(pl.col(pl.Utf8)).describe()

    # ------------------------------------------------------------------
    # Transformations
    # ------------------------------------------------------------------

    def fill_nulls(
        self,
        fill_with: Any,
        columns: Optional[Union[str, List[str]]] = None
    ):
        """Fill nulls in *columns* (default: all) with *fill_with*.

        Returns self for chaining.
        """
        columns = self._get_columns(columns)

        if self._is_pandas():
            self.data[columns] = self.data[columns].fillna(fill_with)

        else:
            self.data = self.data.with_columns([
                pl.col(col).fill_null(fill_with) for col in columns
            ])

        return self

    def normalize(self, column: str):
        """Min-max scale *column* to [0, 1] in place. Returns self."""
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.min()) / (col.max() - col.min())
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).min()) /
                 (pl.col(column).max() - pl.col(column).min()))
                .alias(column)
            )
        return self

    def standardize(self, column: str):
        """Z-score *column* (sample std) in place. Returns self."""
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.mean()) / col.std()
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).mean()) /
                 pl.col(column).std())
                .alias(column)
            )
        return self

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_rows(self, condition):
        """Keep rows matching *condition* (pandas boolean indexer or
        polars expression, depending on backend). Returns self."""
        if self._is_pandas():
            self.data = self.data.loc[condition]
        else:
            self.data = self.data.filter(condition)
        return self

    def filter_columns(self, columns: List[str]):
        """Keep only *columns*. Returns self."""
        if self._is_pandas():
            self.data = self.data[columns]
        else:
            self.data = self.data.select(columns)
        return self

    def rename_columns(self, mapping: Dict[str, str]):
        """Rename columns via an old-name -> new-name mapping. Returns self."""
        if self._is_pandas():
            self.data = self.data.rename(columns=mapping)
        else:
            self.data = self.data.rename(mapping)
        return self

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------

    def detect_outliers(
        self,
        column: str,
        method: str = "iqr"
    ) -> pd.DataFrame:
        """Return the rows whose *column* value is an outlier.

        method: 'iqr' (outside the 1.5*IQR fences) or 'zscore' (|z| > 3).
        Raises ValueError for any other method.
        """
        if self._is_pandas():
            series = self.data[column]
        else:
            series = self.data[column].to_pandas()

        # Build the boolean outlier mask according to the chosen method.
        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            mask_values = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

        elif method == "zscore":
            z = (series - series.mean()) / series.std()
            mask_values = z.abs() > 3
        else:
            raise ValueError("method must be 'iqr' or 'zscore'")

        if self._is_pandas():
            outliers = self.data[mask_values.values]
        else:
            # polars frames do not support NumPy boolean-mask indexing;
            # wrap the mask in a Series and use filter().
            outliers = self.data.filter(pl.Series(mask_values.values))

        if len(outliers) == 0:
            print(f"No outliers found in column '{column}'")

        return outliers

    # ------------------------------------------------------------------
    # Data Quality Report
    # ------------------------------------------------------------------

    def data_quality(self) -> pd.DataFrame:
        """One-row-per-column quality report: dtype, nulls, null_pct,
        unique_values and completeness_pct, as a pandas DataFrame."""
        total_rows = self.data.shape[0]
        rows = []

        for col in self.data.columns:
            nulls = self._count_nulls(col)

            if self._is_pandas():
                dtype = str(self.data[col].dtype)
                unique = self.data[col].nunique()
            else:
                dtype = str(self.data.schema[col])
                unique = self.data[col].n_unique()

            # Guard against ZeroDivisionError on an empty frame.
            null_pct = nulls / total_rows if total_rows else 0.0

            rows.append({
                "column": col,
                "dtype": dtype,
                "nulls": nulls,
                "null_pct": null_pct,
                "unique_values": unique,
                "completeness_pct": 1 - null_pct
            })

        return pd.DataFrame(rows)
228
+
@@ -0,0 +1,2 @@
1
class ProbabilityStats:
    """Placeholder for probability-related statistics; not yet implemented."""
    pass
statslibx/utils.py CHANGED
@@ -398,11 +398,35 @@ class UtilsStats:
398
398
 
399
399
  return outliers
400
400
 
401
- def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
402
- method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
401
+ def calculate_effect_size(self,
402
+ data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
403
+ group1: Union[str, pd.Series, np.ndarray] = None,
404
+ group2: Union[str, pd.Series, np.ndarray] = None,
405
+ method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
403
406
  """
404
407
  Calcula el tamaño del efecto entre dos grupos
405
408
  """
409
+
410
+ # --- Preparar arrays ---
411
+ # Caso 1: data es DataFrame y group1/group2 son nombres de columna
412
+ if isinstance(data, pd.DataFrame):
413
+ group1 = np.array(data[group1])
414
+ group2 = np.array(data[group2])
415
+ # Caso 2: data no es None, y es una serie o array, usarlo como group1
416
+ elif isinstance(data, (pd.Series, np.ndarray)) and group2 is not None:
417
+ group1 = np.array(data)
418
+ group2 = np.array(group2)
419
+ # Caso 3: group1 y group2 ya son arrays o Series
420
+ else:
421
+ group1 = np.array(group1)
422
+ group2 = np.array(group2)
423
+
424
+ # Eliminar nan automáticamente
425
+ group1 = group1[~np.isnan(group1)]
426
+ group2 = group2[~np.isnan(group2)]
427
+
428
+
429
+ # --- Calcular estadísticas ---
406
430
  mean1, mean2 = np.mean(group1), np.mean(group2)
407
431
  std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
408
432
  n1, n2 = len(group1), len(group2)
@@ -434,6 +458,7 @@ class UtilsStats:
434
458
  'pooled_std': pooled_std
435
459
  }
436
460
 
461
+
437
462
  # ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
438
463
 
439
464
  def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
@@ -528,6 +553,47 @@ class UtilsStats:
528
553
  plt.tight_layout()
529
554
 
530
555
  return fig
556
+
557
+ def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
558
+ """Implementación con plotly"""
559
+ try:
560
+ import plotly.graph_objects as go
561
+ import plotly.express as px
562
+ from plotly.subplots import make_subplots
563
+ except ImportError:
564
+ raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
565
+
566
+ if plot_type == 'all':
567
+ fig = make_subplots(
568
+ rows=2, cols=2,
569
+ subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
570
+ )
571
+
572
+ # Histograma
573
+ fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
574
+
575
+ # Box plot
576
+ fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
577
+
578
+ # Violin plot
579
+ fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
580
+
581
+ # Distribución acumulada
582
+ hist, bin_edges = np.histogram(data, bins=bins, density=True)
583
+ cdf = np.cumsum(hist * np.diff(bin_edges))
584
+ fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
585
+
586
+ else:
587
+ if plot_type == 'hist':
588
+ fig = px.histogram(data, nbins=bins, title=title)
589
+ elif plot_type == 'box':
590
+ fig = px.box(y=data, title=title)
591
+ elif plot_type == 'violin':
592
+ fig = px.violin(y=data, title=title, box=True)
593
+ else:
594
+ fig = px.histogram(data, nbins=bins, title=title)
595
+
596
+ return fig
531
597
 
532
598
  def plot_distribution(self,
533
599
  data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
@@ -536,7 +602,7 @@ class UtilsStats:
536
602
  backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
537
603
  bins: int = 30,
538
604
  figsize: Optional[Tuple[int, int]] = None,
539
- save_fig: Optional[bool] = None,
605
+ save_fig: Optional[bool] = False,
540
606
  filename: Optional[str] = None,
541
607
  **kwargs):
542
608
  """
@@ -568,7 +634,7 @@ class UtilsStats:
568
634
  """
569
635
  backend = backend or self._plot_backend
570
636
  figsize = figsize or self._default_figsize
571
- save_fig = save_fig if save_fig is not None else self._save_fig
637
+ self._save_fig = save_fig
572
638
 
573
639
  # Resolver datos
574
640
  data, source = self._resolve_data(data, column)
@@ -605,59 +671,20 @@ class UtilsStats:
605
671
  if save_fig and backend != 'plotly':
606
672
  self._save_figure(fig, filename)
607
673
 
608
- return fig
674
+ if backend == 'plotly':
675
+ return fig
609
676
 
610
677
  except Exception as e:
611
678
  print(f"Error en plot_distribution: {e}")
612
679
  raise
613
680
 
614
- def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
615
- """Implementación con plotly"""
616
- try:
617
- import plotly.graph_objects as go
618
- import plotly.express as px
619
- from plotly.subplots import make_subplots
620
- except ImportError:
621
- raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
622
-
623
- if plot_type == 'all':
624
- fig = make_subplots(
625
- rows=2, cols=2,
626
- subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
627
- )
628
-
629
- # Histograma
630
- fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
631
-
632
- # Box plot
633
- fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
634
-
635
- # Violin plot
636
- fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
637
-
638
- # Distribución acumulada
639
- hist, bin_edges = np.histogram(data, bins=bins, density=True)
640
- cdf = np.cumsum(hist * np.diff(bin_edges))
641
- fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
642
-
643
- else:
644
- if plot_type == 'hist':
645
- fig = px.histogram(data, nbins=bins, title=title)
646
- elif plot_type == 'box':
647
- fig = px.box(y=data, title=title)
648
- elif plot_type == 'violin':
649
- fig = px.violin(y=data, title=title, box=True)
650
- else:
651
- fig = px.histogram(data, nbins=bins, title=title)
652
-
653
- return fig
654
-
655
681
  def plot_correlation_matrix(self,
656
682
  data: Union[pd.DataFrame, str, Path],
657
- method: str = 'pearson',
658
- backend: Optional[Literal['seaborn', 'plotly']] = None,
683
+ method: Literal['pearson', 'kendall', 'spearman'] = 'pearson',
684
+ backend: Optional[Literal['seaborn', 'plotly']] = "seaborn",
685
+ triangular: Optional[bool] = False,
659
686
  figsize: Optional[Tuple[int, int]] = None,
660
- save_fig: Optional[bool] = None,
687
+ save_fig: Optional[bool] = False,
661
688
  filename: Optional[str] = None,
662
689
  **kwargs):
663
690
  """
@@ -674,25 +701,32 @@ class UtilsStats:
674
701
  """
675
702
  backend = backend or self._plot_backend
676
703
  figsize = figsize or self._default_figsize
677
- save_fig = save_fig if save_fig is not None else self._save_fig
704
+ self.save_fig = save_fig
678
705
  filename = filename or "matriz_correlacion"
679
-
706
+
680
707
  # Resolver datos
681
708
  data, source = self._resolve_data(data)
682
709
 
683
710
  if not isinstance(data, pd.DataFrame):
684
711
  raise ValueError("Se requiere un DataFrame para calcular matriz de correlación")
712
+ else:
713
+ data = data.select_dtypes(include=['float64', 'int64'])
685
714
 
686
715
  # Calcular matriz de correlación
687
716
  corr_matrix = data.corr(method=method)
688
717
 
689
718
  if backend == 'seaborn':
690
719
  fig, ax = plt.subplots(figsize=figsize)
691
- mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
720
+ if triangular:
721
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
692
722
 
693
- sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
694
- cmap='coolwarm', center=0, ax=ax,
695
- square=True, linewidths=0.5, **kwargs)
723
+ sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
724
+ cmap='coolwarm', center=0, ax=ax,
725
+ square=True, linewidths=0.5, **kwargs)
726
+ else:
727
+ sns.heatmap(corr_matrix, annot=True, fmt='.2f',
728
+ cmap='coolwarm', center=0, ax=ax,
729
+ square=True, linewidths=0.5, **kwargs)
696
730
  ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
697
731
  plt.tight_layout()
698
732
 
@@ -731,15 +765,15 @@ class UtilsStats:
731
765
  print(f"✓ Figura Plotly guardada: {filepath}")
732
766
  except Exception as e:
733
767
  print(f"✗ Error guardando figura Plotly: {e}")
734
-
735
- return fig
768
+ if backend == 'plotly':
769
+ return fig
736
770
 
737
771
  def plot_scatter_matrix(self,
738
772
  data: Union[pd.DataFrame, str, Path],
739
773
  columns: Optional[List[str]] = None,
740
774
  backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
741
775
  figsize: Optional[Tuple[int, int]] = None,
742
- save_fig: Optional[bool] = None,
776
+ save_fig: Optional[bool] = False,
743
777
  filename: Optional[str] = None,
744
778
  **kwargs):
745
779
  """
@@ -752,7 +786,7 @@ class UtilsStats:
752
786
  """
753
787
  backend = backend or self._plot_backend
754
788
  figsize = figsize or self._default_figsize
755
- save_fig = save_fig if save_fig is not None else self._save_fig
789
+ self.save_fig = save_fig
756
790
  filename = filename or "scatter_matrix"
757
791
 
758
792
  # Resolver datos
@@ -791,7 +825,8 @@ class UtilsStats:
791
825
  except Exception as e:
792
826
  print(f"✗ Error guardando figura Plotly: {e}")
793
827
 
794
- return fig
828
+ if backend == 'plotly':
829
+ return fig
795
830
 
796
831
  # ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
797
832
 
@@ -802,7 +837,7 @@ class UtilsStats:
802
837
  ci_method: str = 'parametric',
803
838
  bins: int = 30,
804
839
  figsize: Optional[Tuple[int, int]] = None,
805
- save_fig: Optional[bool] = None,
840
+ save_fig: Optional[bool] = False,
806
841
  filename: Optional[str] = None,
807
842
  **kwargs) -> plt.Figure:
808
843
  """
@@ -838,7 +873,7 @@ class UtilsStats:
838
873
  x_range = np.linspace(data_array.min(), data_array.max(), 300)
839
874
 
840
875
  # ======= FIGURA =======
841
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))
876
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize or (14, 6))
842
877
 
843
878
  # ============================================================
844
879
  # PANEL 1: HISTOGRAMA + KDE
@@ -903,109 +938,37 @@ class UtilsStats:
903
938
  plt.tight_layout()
904
939
 
905
940
  # Guardado opcional
906
- save_fig = save_fig if save_fig is not None else self._save_fig
941
+ self.save_fig = save_fig
907
942
  if save_fig:
908
943
  self._save_figure(fig, filename)
909
944
 
910
- return fig
911
-
912
-
913
- def plot_multiple_distributions_with_ci(self,
914
- data_dict: dict,
915
- confidence_level: float = 0.95,
916
- figsize: Optional[Tuple[int, int]] = None,
917
- save_fig: Optional[bool] = None,
918
- filename: Optional[str] = None,
919
- **kwargs) -> plt.Figure:
920
- """
921
- Grafica múltiples distribuciones con sus intervalos de confianza
922
- """
923
- n_distributions = len(data_dict)
924
- fig, axes = plt.subplots(n_distributions, 2,
925
- figsize=figsize or (14, 5 * n_distributions))
926
-
927
- if n_distributions == 1:
928
- axes = axes.reshape(1, -1)
929
-
930
- colors = plt.cm.Set3(np.linspace(0, 1, n_distributions))
931
-
932
- for idx, (name, data) in enumerate(data_dict.items()):
933
- ax1, ax2 = axes[idx]
934
-
935
- if isinstance(data, pd.Series):
936
- data_array = data.dropna().values
937
- else:
938
- data_array = np.array(data)
939
- data_array = data_array[~np.isnan(data_array)]
940
-
941
- # Calcular estadísticas
942
- ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level)
943
-
944
- # Gráfica izquierda: Distribución básica
945
- ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
946
- edgecolor='black', density=True)
947
-
948
- kde = stats.gaussian_kde(data_array)
949
- x_range = np.linspace(data_array.min(), data_array.max(), 200)
950
- ax1.plot(x_range, kde(x_range), 'k-', linewidth=2)
951
- ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2)
952
-
953
- ax1.set_title(f'{name}\nMedia: {ci_result["mean"]:.2f}')
954
- ax1.grid(True, alpha=0.3)
955
-
956
- # Gráfica derecha: Con intervalos de confianza
957
- ax2.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
958
- edgecolor='black', density=True)
959
- ax2.plot(x_range, kde(x_range), 'k-', linewidth=2)
960
-
961
- ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
962
- ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
963
- alpha=0.3, color='orange')
964
- ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
965
- ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)
966
-
967
- ax2.set_title(f'{name} con IC {confidence_level*100}%')
968
- ax2.grid(True, alpha=0.3)
969
-
970
- plt.tight_layout()
971
-
972
- # Guardar figura si está activado
973
- save_fig = save_fig if save_fig is not None else self._save_fig
974
- if save_fig:
975
- filename = filename or "multiples_distribuciones_ci"
976
- self._save_figure(fig, filename)
977
-
978
- return fig
979
945
 
980
946
  # ============= MÉTODOS UTILITARIOS ADICIONALES =============
981
947
 
982
- def get_descriptive_stats(self,
983
- data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
984
- column: Optional[str] = None) -> dict:
985
- """
986
- Obtiene estadísticas descriptivas completas
987
-
988
- Ahora acepta rutas de archivos
989
- """
990
- # Resolver datos
991
- data, source = self._resolve_data(data, column)
992
-
948
+ def get_descriptive_stats(self, data, column=None):
949
+
993
950
  if isinstance(data, pd.DataFrame):
994
951
  if column is None:
995
- raise ValueError("Debe especificar 'column' cuando data es DataFrame")
952
+ raise ValueError("Debe especificarse una columna")
996
953
  data_series = data[column]
997
- elif isinstance(data, pd.Series):
998
- data_series = data
999
954
  else:
1000
955
  data_series = pd.Series(data)
1001
-
956
+
1002
957
  data_clean = data_series.dropna()
1003
-
958
+
959
+ if len(data_clean) == 0:
960
+ return {k: np.nan for k in [
961
+ 'count','mean','median','mode','std','variance',
962
+ 'min','max','q1','q3','iqr','skewness','kurtosis','range'
963
+ ]}
964
+
965
+ mode_result = stats.mode(data_clean, keepdims=False)
966
+
1004
967
  return {
1005
968
  'count': len(data_clean),
1006
969
  'mean': np.mean(data_clean),
1007
970
  'median': np.median(data_clean),
1008
- 'mode': stats.mode(data_clean)[0][0] if len(data_clean) > 0 else np.nan,
971
+ 'mode': mode_result.mode,
1009
972
  'std': np.std(data_clean, ddof=1),
1010
973
  'variance': np.var(data_clean, ddof=1),
1011
974
  'min': np.min(data_clean),
@@ -1017,7 +980,6 @@ class UtilsStats:
1017
980
  'kurtosis': stats.kurtosis(data_clean),
1018
981
  'range': np.max(data_clean) - np.min(data_clean)
1019
982
  }
1020
-
1021
983
  def help(self):
1022
984
  """
1023
985
  Muestra ayuda completa de la clase DescriptiveStats