datanarrator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datanarrator-0.1.0/PKG-INFO +16 -0
- datanarrator-0.1.0/README.md +0 -0
- datanarrator-0.1.0/datanarrator/__init__.py +9 -0
- datanarrator-0.1.0/datanarrator/analyzer.py +182 -0
- datanarrator-0.1.0/datanarrator/narrator.py +308 -0
- datanarrator-0.1.0/datanarrator.egg-info/PKG-INFO +16 -0
- datanarrator-0.1.0/datanarrator.egg-info/SOURCES.txt +11 -0
- datanarrator-0.1.0/datanarrator.egg-info/dependency_links.txt +1 -0
- datanarrator-0.1.0/datanarrator.egg-info/requires.txt +2 -0
- datanarrator-0.1.0/datanarrator.egg-info/top_level.txt +1 -0
- datanarrator-0.1.0/pyproject.toml +32 -0
- datanarrator-0.1.0/setup.cfg +4 -0
- datanarrator-0.1.0/tests/test_narrator.py +250 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datanarrator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convierte cualquier DataFrame de pandas en un análisis en lenguaje natural
|
|
5
|
+
Author-email: Tu Nombre <tu@email.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ikernavarro4/data-narrator
|
|
8
|
+
Project-URL: Repository, https://github.com/ikernavarro4/data-narrator
|
|
9
|
+
Keywords: data science,eda,nlg,pandas,analysis
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: pandas>=1.5.0
|
|
16
|
+
Requires-Dist: numpy>=1.23.0
|
|
File without changes
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DataAnalyzer:
    """Analyze a DataFrame and extract summary statistics.

    Produces a plain-dict report (overview, per-column stats, correlations
    and data-quality alerts) that downstream narrators turn into prose.
    """

    def __init__(self, df: pd.DataFrame):
        """Validate and store the DataFrame to analyze.

        Raises:
            TypeError: if *df* is not a pandas DataFrame.
            ValueError: if *df* has no rows/columns.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("El input debe ser un DataFrame de pandas.")
        if df.empty:
            raise ValueError("El DataFrame no puede estar vacío.")
        self.df = df
        self._results = None

    def analyze(self) -> dict:
        """Run the full analysis and return a dict with every section."""
        self._results = {
            "overview": self._overview(),
            "numeric": self._analyze_numeric(),
            "categorical": self._analyze_categorical(),
            "datetime": self._analyze_datetime(),
            "correlations": self._correlations(),
            "alerts": self._alerts(),
        }
        return self._results

    def _overview(self) -> dict:
        """Dataset-level facts: shape, dtype split, nulls, duplicates, memory."""
        df = self.df
        numeric_cols = df.select_dtypes(include="number").columns.tolist()
        # Fix: pandas rejects "str" as a dtype selector (TypeError: string
        # dtypes are not allowed); plain string columns have dtype "object".
        categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
        datetime_cols = df.select_dtypes(include=["datetime"]).columns.tolist()

        # Try to detect date-like string columns.
        # Fix: ``infer_datetime_format`` was deprecated in pandas 2.0 (format
        # inference became the default behavior), so the keyword is dropped.
        for col in categorical_cols.copy():
            try:
                pd.to_datetime(df[col])
                datetime_cols.append(col)
                categorical_cols.remove(col)
            except Exception:
                # Not parseable as dates: the column stays categorical.
                pass

        total_nulls = df.isnull().sum().sum()
        total_cells = df.shape[0] * df.shape[1]

        return {
            "rows": df.shape[0],
            "cols": df.shape[1],
            "numeric_cols": numeric_cols,
            "categorical_cols": categorical_cols,
            "datetime_cols": datetime_cols,
            "total_nulls": int(total_nulls),
            "null_pct": round(total_nulls / total_cells * 100, 2),
            "duplicates": int(df.duplicated().sum()),
            "memory_kb": round(df.memory_usage(deep=True).sum() / 1024, 1),
        }

    def _analyze_numeric(self) -> list:
        """Per-numeric-column stats plus IQR-based outlier counts."""
        results = []
        numeric_cols = self.df.select_dtypes(include="number").columns
        for col in numeric_cols:
            series = self.df[col].dropna()
            if len(series) == 0:
                continue  # all-null column: nothing to summarize
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            # Tukey's rule: points beyond 1.5 * IQR from the quartiles.
            outliers = series[(series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)]
            results.append({
                "col": col,
                "mean": round(series.mean(), 2),
                "median": round(series.median(), 2),
                "std": round(series.std(), 2),
                "min": round(series.min(), 2),
                "max": round(series.max(), 2),
                "nulls": int(self.df[col].isnull().sum()),
                "null_pct": round(self.df[col].isnull().sum() / len(self.df) * 100, 1),
                "skew": round(float(series.skew()), 2),
                "outlier_count": len(outliers),
            })
        return results

    def _analyze_categorical(self) -> list:
        """Per-categorical-column stats: cardinality, mode and null share."""
        results = []
        # Fix: "object" instead of the invalid "str" dtype selector.
        cat_cols = self.df.select_dtypes(include=["object", "category"]).columns
        for col in cat_cols:
            series = self.df[col].dropna()
            value_counts = series.value_counts()
            results.append({
                "col": col,
                "unique": int(series.nunique()),
                "top_value": str(value_counts.index[0]) if len(value_counts) > 0 else None,
                "top_freq": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
                "top_pct": round(value_counts.iloc[0] / len(series) * 100, 1) if len(value_counts) > 0 else 0,
                "nulls": int(self.df[col].isnull().sum()),
                "null_pct": round(self.df[col].isnull().sum() / len(self.df) * 100, 1),
                "high_cardinality": series.nunique() > 20,
            })
        return results

    def _analyze_datetime(self) -> list:
        """Per-datetime-column range information."""
        results = []
        dt_cols = self.df.select_dtypes(include=["datetime"]).columns
        for col in dt_cols:
            series = pd.to_datetime(self.df[col], errors="coerce").dropna()
            if len(series) == 0:
                continue  # nothing valid to report
            results.append({
                "col": col,
                "min_date": str(series.min().date()),
                "max_date": str(series.max().date()),
                "range_days": (series.max() - series.min()).days,
                "nulls": int(self.df[col].isnull().sum()),
            })
        return results

    def _correlations(self) -> list:
        """Pairs of numeric columns with |Pearson r| >= 0.5, strongest first."""
        numeric = self.df.select_dtypes(include="number")
        if numeric.shape[1] < 2:
            return []
        corr_matrix = numeric.corr()
        pairs = []
        cols = corr_matrix.columns.tolist()
        for i in range(len(cols)):
            for j in range(i + 1, len(cols)):  # upper triangle: each pair once
                val = corr_matrix.iloc[i, j]
                if not np.isnan(val) and abs(val) >= 0.5:
                    pairs.append({
                        "col_a": cols[i],
                        "col_b": cols[j],
                        "correlation": round(val, 2),
                        "strength": "alta" if abs(val) >= 0.75 else "moderada",
                        "direction": "positiva" if val > 0 else "negativa",
                    })
        pairs.sort(key=lambda x: abs(x["correlation"]), reverse=True)
        return pairs

    def _alerts(self) -> list:
        """Data-quality alerts: duplicates, heavy nulls, high cardinality, constants."""
        alerts = []
        df = self.df

        # Duplicate rows
        dups = df.duplicated().sum()
        if dups > 0:
            alerts.append({
                "type": "duplicates",
                "message": f"{dups} registros duplicados detectados.",
                "suggestion": "Considera eliminarlos antes de modelar.",
            })

        # Columns with a heavy null share (>= 20%)
        for col in df.columns:
            null_pct = df[col].isnull().sum() / len(df) * 100
            if null_pct >= 20:
                alerts.append({
                    "type": "high_nulls",
                    "col": col,
                    "message": f"'{col}' tiene {null_pct:.1f}% de valores nulos.",
                    "suggestion": "Considera imputar o eliminar esta columna.",
                })

        # High cardinality (fix: "object" instead of the invalid "str" selector)
        for col in df.select_dtypes(include=["object", "category"]).columns:
            n_unique = df[col].nunique()
            if n_unique > 50:
                alerts.append({
                    "type": "high_cardinality",
                    "col": col,
                    "message": f"'{col}' tiene {n_unique} valores únicos.",
                    "suggestion": "Evita label encoding directo. Considera target encoding.",
                })

        # Constant (single-value) columns
        for col in df.columns:
            if df[col].nunique() == 1:
                alerts.append({
                    "type": "constant_column",
                    "col": col,
                    "message": f"'{col}' tiene un solo valor único.",
                    "suggestion": "Esta columna no aporta información. Considera eliminarla.",
                })

        return alerts
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from datanarrator.analyzer import DataAnalyzer
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Narrator:
    """
    Turn a pandas DataFrame into a natural-language analysis.

    Example:
        >>> from datanarrator import Narrator
        >>> import pandas as pd
        >>> df = pd.read_csv("titanic.csv")
        >>> n = Narrator(df, lang="es")
        >>> print(n.describe())
    """

    SUPPORTED_LANGS = ("es", "en")

    # Localized templates for compare(). Keeping both languages in one table
    # lets compare() run a single code path instead of duplicating the whole
    # comparison logic per language.
    _COMPARE_TEXTS = {
        "es": {
            "header": "--- Comparación de datasets ---",
            "more_rows": "El segundo dataset tiene {diff} registros más.",
            "fewer_rows": "El segundo dataset tiene {diff} registros menos.",
            "same_rows": "Ambos datasets tienen el mismo número de registros.",
            "only_first": "Columnas solo en el primero: {cols}.",
            "only_second": "Columnas solo en el segundo: {cols}.",
            "mean_up": "subió",
            "mean_down": "bajó",
            "mean_change": "'{col}': media {d} de {m1} a {m2} ({pct:+.1f}%).",
            "null_more": "más",
            "null_fewer": "menos",
            "null_change": "⚠ El segundo dataset tiene {dn}% {d} valores nulos.",
            "null_note": " → Posible degradación en calidad de datos.",
            "drift": "⚠ Posible data drift en: {cols}.",
            "drift_note": " → Dispersión cambió más del 30%. Revisa antes de producción.",
        },
        "en": {
            "header": "--- Dataset comparison ---",
            "more_rows": "The second dataset has {diff} more rows.",
            "fewer_rows": "The second dataset has {diff} fewer rows.",
            "same_rows": "Both datasets have the same number of rows.",
            "only_first": "Columns only in first: {cols}.",
            "only_second": "Columns only in second: {cols}.",
            "mean_up": "increased",
            "mean_down": "decreased",
            "mean_change": "'{col}': mean {d} from {m1} to {m2} ({pct:+.1f}%).",
            "null_more": "more",
            "null_fewer": "fewer",
            "null_change": "⚠ Second dataset has {dn}% {d} null values.",
            "null_note": " → Possible data quality degradation.",
            "drift": "⚠ Possible data drift in: {cols}.",
            "drift_note": " → Dispersion changed over 30%. Review before production.",
        },
    }

    def __init__(self, df: pd.DataFrame, lang: str = "es"):
        """Store the DataFrame, validate the language and run the analysis once.

        Raises:
            ValueError: if *lang* is not in ``SUPPORTED_LANGS``.
            TypeError / ValueError: propagated from ``DataAnalyzer`` when *df*
                is not a DataFrame or is empty.
        """
        if lang not in self.SUPPORTED_LANGS:
            raise ValueError(f"Idioma '{lang}' no soportado. Usa: {self.SUPPORTED_LANGS}")
        self.df = df
        self.lang = lang
        self._analyzer = DataAnalyzer(df)
        self._data = self._analyzer.analyze()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def describe(self) -> str:
        """Generate the full analysis in natural language."""
        sections = [
            self._section_overview(),
            self._section_numeric(),
            self._section_categorical(),
            self._section_correlations(),
            self._section_alerts(),
        ]
        # Empty sections (no data of that kind) are skipped.
        return "\n\n".join(s for s in sections if s)

    def executive_summary(self) -> str:
        """Executive summary of 2-3 sentences."""
        ov = self._data["overview"]
        parts = []

        if self.lang == "es":
            parts.append(
                f"El dataset contiene {ov['rows']:,} registros y {ov['cols']} columnas "
                f"({len(ov['numeric_cols'])} numéricas, {len(ov['categorical_cols'])} categóricas)."
            )
            if ov["null_pct"] > 0:
                parts.append(f"Presenta un {ov['null_pct']}% de valores nulos en total.")
            corrs = self._data["correlations"]
            if corrs:
                top = corrs[0]
                parts.append(
                    f"La correlación más fuerte es entre '{top['col_a']}' y '{top['col_b']}' ({top['correlation']})."
                )
        else:
            parts.append(
                f"The dataset has {ov['rows']:,} rows and {ov['cols']} columns "
                f"({len(ov['numeric_cols'])} numeric, {len(ov['categorical_cols'])} categorical)."
            )
            if ov["null_pct"] > 0:
                parts.append(f"Overall null rate is {ov['null_pct']}%.")
            corrs = self._data["correlations"]
            if corrs:
                top = corrs[0]
                parts.append(
                    f"Strongest correlation: '{top['col_a']}' and '{top['col_b']}' ({top['correlation']})."
                )

        return " ".join(parts)

    def alerts_only(self) -> str:
        """Return only the detected alerts and recommendations."""
        alerts = self._data["alerts"]
        if not alerts:
            if self.lang == "es":
                return "No se detectaron alertas en este dataset."
            return "No alerts detected in this dataset."

        lines = []
        header = "Alertas detectadas:" if self.lang == "es" else "Alerts detected:"
        lines.append(header)
        for a in alerts:
            lines.append(f" → {a['message']} {a['suggestion']}")
        return "\n".join(lines)

    def export(self, filepath: str) -> None:
        """Export the full analysis to a .txt or .md file."""
        content = self.describe()
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Análisis exportado a: {filepath}")

    def compare(self, df2: pd.DataFrame) -> str:
        """Compare this narrator's DataFrame against *df2* and narrate the deltas.

        Reports row-count difference, column membership differences, >=10%
        relative mean shifts in shared numeric columns, >=5 percentage-point
        null-rate change, and >=30% std drift on shared numeric columns.

        Raises:
            TypeError: if *df2* is not a pandas DataFrame.
            ValueError: if *df2* is empty.
        """
        # Fix: use the module-level ``pd`` import instead of __import__('pandas').
        if not isinstance(df2, pd.DataFrame):
            raise TypeError("El input debe ser un DataFrame de pandas.")
        if df2.empty:
            raise ValueError("El segundo DataFrame no puede estar vacío.")

        df1 = self.df
        t = self._COMPARE_TEXTS[self.lang]
        out = [t["header"]]

        # Row-count delta
        diff = df2.shape[0] - df1.shape[0]
        if diff > 0:
            out.append(t["more_rows"].format(diff=diff))
        elif diff < 0:
            out.append(t["fewer_rows"].format(diff=abs(diff)))
        else:
            out.append(t["same_rows"])

        # Column membership (sorted: set iteration order is arbitrary, and the
        # narration should be deterministic).
        cols1, cols2 = set(df1.columns), set(df2.columns)
        if cols1 - cols2:
            out.append(t["only_first"].format(cols=", ".join(sorted(cols1 - cols2))))
        if cols2 - cols1:
            out.append(t["only_second"].format(cols=", ".join(sorted(cols2 - cols1))))

        # Mean shifts on shared numeric columns (>= 10% relative change)
        common = [
            c for c in df1.select_dtypes(include="number").columns
            if c in df2.select_dtypes(include="number").columns
        ]
        for col in common:
            m1, m2 = df1[col].mean(), df2[col].mean()
            if m1 == 0:
                continue  # relative change is undefined for a zero baseline
            pct = round((m2 - m1) / abs(m1) * 100, 1)
            if abs(pct) >= 10:
                word = t["mean_up"] if pct > 0 else t["mean_down"]
                out.append(t["mean_change"].format(
                    col=col, d=word, m1=round(m1, 2), m2=round(m2, 2), pct=pct))

        # Null-rate change (>= 5 percentage points)
        n1 = df1.isnull().sum().sum() / (df1.shape[0] * df1.shape[1]) * 100
        n2 = df2.isnull().sum().sum() / (df2.shape[0] * df2.shape[1]) * 100
        dn = round(n2 - n1, 1)
        if abs(dn) >= 5:
            word = t["null_more"] if dn > 0 else t["null_fewer"]
            out.append(t["null_change"].format(dn=abs(dn), d=word))
            out.append(t["null_note"])

        # Dispersion drift: std changed by >= 30% on a shared numeric column
        drift = []
        for col in common:
            s1, s2 = df1[col].std(), df2[col].std()
            if s1 == 0:
                continue  # relative change undefined for zero dispersion
            if abs(s2 - s1) / s1 * 100 >= 30:
                drift.append(col)
        if drift:
            out.append(t["drift"].format(cols=", ".join(drift)))
            out.append(t["drift_note"])

        return "\n".join(out)

    # ------------------------------------------------------------------
    # Internal sections
    # ------------------------------------------------------------------

    def _section_overview(self) -> str:
        """Dataset shape, null/duplicate counts and memory usage."""
        ov = self._data["overview"]
        if self.lang == "es":
            lines = [
                "--- Resumen general ---",
                f"El dataset contiene {ov['rows']:,} registros con {ov['cols']} columnas: "
                f"{len(ov['numeric_cols'])} numéricas, {len(ov['categorical_cols'])} categóricas"
                + (f" y {len(ov['datetime_cols'])} de fecha." if ov["datetime_cols"] else "."),
            ]
            if ov["total_nulls"] > 0:
                lines.append(f"Valores nulos: {ov['total_nulls']:,} ({ov['null_pct']}% del total).")
            else:
                lines.append("No se encontraron valores nulos.")
            if ov["duplicates"] > 0:
                lines.append(f"Se detectaron {ov['duplicates']} registros duplicados.")
            lines.append(f"Memoria utilizada: {ov['memory_kb']} KB.")
        else:
            lines = [
                "--- Overview ---",
                f"The dataset has {ov['rows']:,} rows and {ov['cols']} columns: "
                f"{len(ov['numeric_cols'])} numeric, {len(ov['categorical_cols'])} categorical"
                + (f", and {len(ov['datetime_cols'])} datetime." if ov["datetime_cols"] else "."),
            ]
            if ov["total_nulls"] > 0:
                lines.append(f"Null values: {ov['total_nulls']:,} ({ov['null_pct']}% of all cells).")
            else:
                lines.append("No null values found.")
            if ov["duplicates"] > 0:
                lines.append(f"{ov['duplicates']} duplicate rows detected.")
            lines.append(f"Memory usage: {ov['memory_kb']} KB.")
        return "\n".join(lines)

    def _section_numeric(self) -> str:
        """One line per numeric column: central tendency, spread, outliers, skew."""
        cols = self._data["numeric"]
        if not cols:
            return ""
        header = "--- Columnas numéricas ---" if self.lang == "es" else "--- Numeric columns ---"
        lines = [header]
        for c in cols:
            if self.lang == "es":
                line = (
                    f"{c['col']}: media={c['mean']}, mediana={c['median']}, "
                    f"std={c['std']}, rango=[{c['min']} – {c['max']}]"
                )
                if c["nulls"] > 0:
                    line += f", nulos={c['nulls']} ({c['null_pct']}%)"
                if c["outlier_count"] > 0:
                    line += f". Se detectaron {c['outlier_count']} posibles outliers (IQR)."
                if abs(c["skew"]) > 1:
                    direction = "positivo" if c["skew"] > 0 else "negativo"
                    line += f" Distribución con sesgo {direction} ({c['skew']})."
            else:
                line = (
                    f"{c['col']}: mean={c['mean']}, median={c['median']}, "
                    f"std={c['std']}, range=[{c['min']} – {c['max']}]"
                )
                if c["nulls"] > 0:
                    line += f", nulls={c['nulls']} ({c['null_pct']}%)"
                if c["outlier_count"] > 0:
                    line += f". {c['outlier_count']} potential outliers detected (IQR)."
                if abs(c["skew"]) > 1:
                    direction = "positive" if c["skew"] > 0 else "negative"
                    line += f" {direction.capitalize()} skew ({c['skew']})."
            lines.append(f" {line}")
        return "\n".join(lines)

    def _section_categorical(self) -> str:
        """One line per categorical column: cardinality, mode and null share."""
        cols = self._data["categorical"]
        if not cols:
            return ""
        header = "--- Columnas categóricas ---" if self.lang == "es" else "--- Categorical columns ---"
        lines = [header]
        for c in cols:
            if self.lang == "es":
                line = (
                    f"{c['col']}: {c['unique']} valores únicos. "
                    f"El más frecuente es '{c['top_value']}' ({c['top_pct']}% de los registros)."
                )
                if c["nulls"] > 0:
                    line += f" Nulos: {c['null_pct']}%."
            else:
                line = (
                    f"{c['col']}: {c['unique']} unique values. "
                    f"Most frequent: '{c['top_value']}' ({c['top_pct']}% of records)."
                )
                if c["nulls"] > 0:
                    line += f" Nulls: {c['null_pct']}%."
            lines.append(f" {line}")
        return "\n".join(lines)

    def _section_correlations(self) -> str:
        """One line per correlated numeric pair (strength and direction)."""
        corrs = self._data["correlations"]
        if not corrs:
            return ""
        header = "--- Correlaciones relevantes ---" if self.lang == "es" else "--- Relevant correlations ---"
        lines = [header]
        for c in corrs:
            if self.lang == "es":
                lines.append(
                    f" {c['col_a']} ↔ {c['col_b']}: correlación {c['strength']} {c['direction']} ({c['correlation']})"
                )
            else:
                lines.append(
                    f" {c['col_a']} ↔ {c['col_b']}: {c['strength']} {c['direction']} correlation ({c['correlation']})"
                )
        return "\n".join(lines)

    def _section_alerts(self) -> str:
        """Alert messages with their suggestions, one pair per alert."""
        alerts = self._data["alerts"]
        if not alerts:
            return ""
        header = "--- Alertas y recomendaciones ---" if self.lang == "es" else "--- Alerts & recommendations ---"
        lines = [header]
        for a in alerts:
            lines.append(f" ⚠ {a['message']}")
            lines.append(f" → {a['suggestion']}")
        return "\n".join(lines)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datanarrator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convierte cualquier DataFrame de pandas en un análisis en lenguaje natural
|
|
5
|
+
Author-email: Tu Nombre <tu@email.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ikernavarro4/data-narrator
|
|
8
|
+
Project-URL: Repository, https://github.com/ikernavarro4/data-narrator
|
|
9
|
+
Keywords: data science,eda,nlg,pandas,analysis
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: pandas>=1.5.0
|
|
16
|
+
Requires-Dist: numpy>=1.23.0
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
datanarrator/__init__.py
|
|
4
|
+
datanarrator/analyzer.py
|
|
5
|
+
datanarrator/narrator.py
|
|
6
|
+
datanarrator.egg-info/PKG-INFO
|
|
7
|
+
datanarrator.egg-info/SOURCES.txt
|
|
8
|
+
datanarrator.egg-info/dependency_links.txt
|
|
9
|
+
datanarrator.egg-info/requires.txt
|
|
10
|
+
datanarrator.egg-info/top_level.txt
|
|
11
|
+
tests/test_narrator.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datanarrator
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "datanarrator"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convierte cualquier DataFrame de pandas en un análisis en lenguaje natural"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Tu Nombre", email = "tu@email.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["data science", "eda", "nlg", "pandas", "analysis"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"pandas>=1.5.0",
|
|
23
|
+
"numpy>=1.23.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/ikernavarro4/data-narrator"
|
|
28
|
+
Repository = "https://github.com/ikernavarro4/data-narrator"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["."]
|
|
32
|
+
include = ["datanarrator*"]
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import pytest
import pandas as pd
import numpy as np
from datanarrator import Narrator
from datanarrator.analyzer import DataAnalyzer


# ------------------------------------------------------------------
# Fixtures — datasets reused across all tests
# ------------------------------------------------------------------

@pytest.fixture
def df_basic():
    """Basic dataset with numeric and categorical columns (edad=200 is an outlier)."""
    return pd.DataFrame({
        "edad": [25, 30, 35, 40, 45, 200],
        "salario": [30000, 45000, 50000, 60000, 70000, 80000],
        "ciudad": ["cdmx", "cdmx", "monterrey", "guadalajara", "cdmx", "monterrey"],
        "activo": ["si", "no", "si", "si", "no", "si"],
    })


@pytest.fixture
def df_with_nulls():
    """Dataset with null values in every column (>= 20% per column)."""
    return pd.DataFrame({
        "edad": [25, None, 35, None, 45],
        "nombre": ["Ana", "Luis", None, "María", "Pedro"],
        "salario": [30000, 45000, 50000, None, 70000],
    })


@pytest.fixture
def df_numeric_only():
    """Dataset with only numeric columns (clean: no nulls, no duplicates)."""
    return pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [10, 20, 30, 40, 50],
        "c": [100, 200, 300, 400, 500],
    })


@pytest.fixture
def df_categorical_only():
    """Dataset with only categorical columns."""
    return pd.DataFrame({
        "color": ["rojo", "azul", "rojo", "verde", "azul"],
        "talla": ["s", "m", "l", "m", "s"],
    })


# ------------------------------------------------------------------
# Initialization tests
# ------------------------------------------------------------------

def test_narrator_init_valid(df_basic):
    n = Narrator(df_basic)
    assert n is not None


def test_narrator_default_lang(df_basic):
    # Spanish is the default language.
    n = Narrator(df_basic)
    assert n.lang == "es"


def test_narrator_english_lang(df_basic):
    n = Narrator(df_basic, lang="en")
    assert n.lang == "en"


def test_narrator_invalid_lang(df_basic):
    # Only "es" and "en" are supported.
    with pytest.raises(ValueError):
        Narrator(df_basic, lang="fr")


def test_narrator_invalid_input():
    with pytest.raises(TypeError):
        Narrator("no soy un dataframe")


def test_narrator_empty_dataframe():
    with pytest.raises(ValueError):
        Narrator(pd.DataFrame())


# ------------------------------------------------------------------
# describe() tests
# ------------------------------------------------------------------

def test_describe_returns_string(df_basic):
    n = Narrator(df_basic)
    result = n.describe()
    assert isinstance(result, str)


def test_describe_not_empty(df_basic):
    n = Narrator(df_basic)
    result = n.describe()
    assert len(result) > 0


def test_describe_contains_resumen(df_basic):
    # Spanish output must carry the Spanish overview header.
    n = Narrator(df_basic, lang="es")
    result = n.describe()
    assert "Resumen general" in result


def test_describe_english(df_basic):
    # English output must carry the English overview header.
    n = Narrator(df_basic, lang="en")
    result = n.describe()
    assert "Overview" in result


def test_describe_numeric_only(df_numeric_only):
    # Empty sections (no categorical columns) must not break describe().
    n = Narrator(df_numeric_only)
    result = n.describe()
    assert isinstance(result, str)
    assert len(result) > 0


def test_describe_categorical_only(df_categorical_only):
    # Empty sections (no numeric columns) must not break describe().
    n = Narrator(df_categorical_only)
    result = n.describe()
    assert isinstance(result, str)
    assert len(result) > 0


# ------------------------------------------------------------------
# executive_summary() tests
# ------------------------------------------------------------------

def test_executive_summary_returns_string(df_basic):
    n = Narrator(df_basic)
    result = n.executive_summary()
    assert isinstance(result, str)


def test_executive_summary_not_empty(df_basic):
    n = Narrator(df_basic)
    result = n.executive_summary()
    assert len(result) > 0


def test_executive_summary_english(df_basic):
    n = Narrator(df_basic, lang="en")
    result = n.executive_summary()
    assert isinstance(result, str)


# ------------------------------------------------------------------
# alerts_only() tests
# ------------------------------------------------------------------

def test_alerts_only_returns_string(df_basic):
    n = Narrator(df_basic)
    result = n.alerts_only()
    assert isinstance(result, str)


def test_alerts_with_nulls(df_with_nulls):
    n = Narrator(df_with_nulls)
    result = n.alerts_only()
    assert isinstance(result, str)


def test_no_alerts_clean_dataset(df_numeric_only):
    # A clean dataset must produce the "no alerts" message.
    n = Narrator(df_numeric_only)
    result = n.alerts_only()
    assert "No se detectaron alertas" in result


# ------------------------------------------------------------------
# export() tests
# ------------------------------------------------------------------

def test_export_creates_file(df_basic, tmp_path):
    n = Narrator(df_basic)
    filepath = tmp_path / "reporte.txt"
    n.export(str(filepath))
    assert filepath.exists()


def test_export_file_not_empty(df_basic, tmp_path):
    n = Narrator(df_basic)
    filepath = tmp_path / "reporte.md"
    n.export(str(filepath))
    assert filepath.stat().st_size > 0


# ------------------------------------------------------------------
# Analyzer tests
# ------------------------------------------------------------------

def test_analyzer_overview_rows(df_basic):
    analyzer = DataAnalyzer(df_basic)
    result = analyzer.analyze()
    assert result["overview"]["rows"] == 6


def test_analyzer_overview_cols(df_basic):
    analyzer = DataAnalyzer(df_basic)
    result = analyzer.analyze()
    assert result["overview"]["cols"] == 4


def test_analyzer_detects_nulls(df_with_nulls):
    analyzer = DataAnalyzer(df_with_nulls)
    result = analyzer.analyze()
    assert result["overview"]["total_nulls"] > 0


def test_analyzer_correlations(df_numeric_only):
    analyzer = DataAnalyzer(df_numeric_only)
    result = analyzer.analyze()
    assert isinstance(result["correlations"], list)


def test_analyzer_alerts_high_nulls(df_with_nulls):
    # Every column in df_with_nulls has >= 20% nulls → high_nulls alert expected.
    analyzer = DataAnalyzer(df_with_nulls)
    result = analyzer.analyze()
    alert_types = [a["type"] for a in result["alerts"]]
    assert "high_nulls" in alert_types


# ------------------------------------------------------------------
# compare() tests
# ------------------------------------------------------------------

def test_compare_returns_string(df_basic):
    df2 = df_basic.copy()
    df2["edad"] = df2["edad"] * 2
    n = Narrator(df_basic)
    result = n.compare(df2)
    assert isinstance(result, str)

def test_compare_invalid_input(df_basic):
    n = Narrator(df_basic)
    with pytest.raises(TypeError):
        n.compare("no soy un dataframe")

def test_compare_empty_dataframe(df_basic):
    n = Narrator(df_basic)
    with pytest.raises(ValueError):
        n.compare(pd.DataFrame())

def test_compare_english(df_basic):
    df2 = df_basic.copy()
    n = Narrator(df_basic, lang="en")
    result = n.compare(df2)
    assert "comparison" in result