viewx 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {viewx-0.2.3 → viewx-0.2.4}/PKG-INFO +2 -3
  2. {viewx-0.2.3 → viewx-0.2.4}/README.md +1 -1
  3. {viewx-0.2.3 → viewx-0.2.4}/setup.py +2 -3
  4. {viewx-0.2.3 → viewx-0.2.4}/tests/test1.py +14 -16
  5. viewx-0.2.4/tests/test8_slides_auto.py +15 -0
  6. viewx-0.2.4/tests/test9_report_auto.py +22 -0
  7. viewx-0.2.4/viewx/DataMatrix/__init__.py +30 -0
  8. viewx-0.2.4/viewx/DataMatrix/analyzers.py +325 -0
  9. viewx-0.2.4/viewx/DataMatrix/bibliometrics.py +143 -0
  10. viewx-0.2.4/viewx/DataMatrix/datamatrix_engine.py +1348 -0
  11. viewx-0.2.4/viewx/DataMatrix/explorer.py +87 -0
  12. viewx-0.2.4/viewx/DataMatrix/visualizer.py +511 -0
  13. {viewx-0.2.3 → viewx-0.2.4}/viewx/HTML/html_engine.py +408 -167
  14. viewx-0.2.4/viewx/Report/auto_builder.py +187 -0
  15. {viewx-0.2.3 → viewx-0.2.4}/viewx/Report/report_engine.py +30 -1
  16. {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/__init__.py +1 -0
  17. viewx-0.2.4/viewx/Slides/auto_builder.py +171 -0
  18. {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/slides_engine.py +125 -49
  19. {viewx-0.2.3 → viewx-0.2.4}/viewx/__init__.py +4 -5
  20. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/__init__.py +20 -14
  21. viewx-0.2.4/viewx/shared/__init__.py +20 -0
  22. viewx-0.2.4/viewx/shared/a11y.py +24 -0
  23. viewx-0.2.4/viewx/shared/explorer_runtime.py +523 -0
  24. viewx-0.2.4/viewx/shared/insights.py +121 -0
  25. viewx-0.2.4/viewx/shared/plotly_bundle.py +28 -0
  26. viewx-0.2.4/viewx/shared/runtime.py +237 -0
  27. {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/PKG-INFO +2 -3
  28. {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/SOURCES.txt +13 -2
  29. {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/requires.txt +0 -1
  30. viewx-0.2.3/tests/test2.py +0 -39
  31. viewx-0.2.3/viewx/DataMatrix/__init__.py +0 -3
  32. viewx-0.2.3/viewx/DataMatrix/bibliometrics.py +0 -52
  33. viewx-0.2.3/viewx/DataMatrix/datamatrix_engine.py +0 -184
  34. viewx-0.2.3/viewx/DataMatrix/visualizer.py +0 -75
  35. {viewx-0.2.3 → viewx-0.2.4}/setup.cfg +0 -0
  36. {viewx-0.2.3 → viewx-0.2.4}/tests/test3.py +0 -0
  37. {viewx-0.2.3 → viewx-0.2.4}/tests/test4.py +0 -0
  38. {viewx-0.2.3 → viewx-0.2.4}/tests/test5.py +0 -0
  39. {viewx-0.2.3 → viewx-0.2.4}/tests/test6.py +0 -0
  40. {viewx-0.2.3 → viewx-0.2.4}/tests/test7.py +0 -0
  41. {viewx-0.2.3 → viewx-0.2.4}/viewx/HTML/__init__.py +0 -0
  42. {viewx-0.2.3 → viewx-0.2.4}/viewx/Report/__init__.py +0 -0
  43. {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/charts.py +0 -0
  44. {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/components.py +0 -0
  45. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/course_completion.csv +0 -0
  46. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/iris.csv +0 -0
  47. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/penguins.csv +0 -0
  48. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/sp500_companies.csv +0 -0
  49. {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/titanic.csv +0 -0
  50. {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/dependency_links.txt +0 -0
  51. {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: viewx
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Librería de visualización adaptable para HTML, Dashboards y PDFs en Python
5
5
  Home-page: https://github.com/GhostAnalyst30/ViewX
6
6
  Author: Emmanuel Ascendra Perez
@@ -25,7 +25,6 @@ Requires-Dist: matplotlib>=3.8.0
25
25
  Requires-Dist: pylatex>=1.4.2
26
26
  Requires-Dist: seaborn>=0.12.2
27
27
  Requires-Dist: plotly>=6.0.0
28
- Requires-Dist: streamlit>=1.32.0
29
28
  Provides-Extra: streamlit
30
29
  Requires-Dist: streamlit>=1.32.0; extra == "streamlit"
31
30
  Provides-Extra: dash
@@ -52,7 +51,7 @@ Dynamic: requires-dist
52
51
  Dynamic: requires-python
53
52
  Dynamic: summary
54
53
 
55
- # ViewX — v2.3
54
+ # ViewX — v2.4
56
55
 
57
56
  **ViewX** es un paquete moderno de Python diseñado para generar **páginas HTML interactivas**, **dashboards dinámicos** y **visualizaciones inteligentes** que se adaptan automáticamente a los objetos agregados por el usuario.
58
57
 
@@ -1,4 +1,4 @@
1
- # ViewX — v2.3
1
+ # ViewX — v2.4
2
2
 
3
3
  **ViewX** es un paquete moderno de Python diseñado para generar **páginas HTML interactivas**, **dashboards dinámicos** y **visualizaciones inteligentes** que se adaptan automáticamente a los objetos agregados por el usuario.
4
4
 
@@ -9,7 +9,7 @@ except FileNotFoundError:
9
9
 
10
10
  setup(
11
11
  name="viewx",
12
- version="0.2.3",
12
+ version="0.2.4",
13
13
  author="Emmanuel Ascendra Perez",
14
14
  author_email="ascendraemmanuel@gmail.com",
15
15
  description="Librería de visualización adaptable para HTML, Dashboards y PDFs en Python",
@@ -47,8 +47,7 @@ setup(
47
47
  "matplotlib>=3.8.0",
48
48
  "pylatex>=1.4.2", # Para PDFs
49
49
  "seaborn>=0.12.2",
50
- "plotly>=6.0.0",
51
- "streamlit>=1.32.0"
50
+ "plotly>=6.0.0"
52
51
  ],
53
52
 
54
53
  # Dependencias opcionales
@@ -35,6 +35,7 @@ HTML.auto_generate(
35
35
  df,
36
36
  title = "Demo 1 · Auto Layout",
37
37
  filename = "demo1_auto.html",
38
+ show = False,
38
39
  )
39
40
 
40
41
  # ════════════════════════════════════════════════════════════════════════════
@@ -49,7 +50,8 @@ HTML.auto_generate(
49
50
  authors = [
50
51
  {"name": "Ana García", "email": "ana@empresa.com"},
51
52
  {"name": "Luis Torres", "email": "luis@empresa.com"},
52
- ]
53
+ ],
54
+ show = False,
53
55
  )
54
56
 
55
57
  # ════════════════════════════════════════════════════════════════════════════
@@ -63,6 +65,7 @@ HTML.auto_generate(
63
65
  filename = "demo3_kpi_focus.html",
64
66
  layout = "kpi_focus",
65
67
  authors = "Carlos Méndez",
68
+ show = False,
66
69
  )
67
70
 
68
71
  # ════════════════════════════════════════════════════════════════════════════
@@ -75,6 +78,7 @@ HTML.auto_generate(
75
78
  title = "Demo 4 · Chart Focus",
76
79
  filename = "demo4_chart_focus.html",
77
80
  layout = "chart_focus",
81
+ show = False,
78
82
  )
79
83
 
80
84
  # ════════════════════════════════════════════════════════════════════════════
@@ -87,14 +91,11 @@ HTML.auto_generate(
87
91
  title = "Demo 5 · Table First",
88
92
  filename = "demo5_table_first.html",
89
93
  layout = "table_first",
94
+ show = False,
90
95
  )
91
96
 
92
97
  # ════════════════════════════════════════════════════════════════════════════
93
98
  # DEMO 6 — Layout 100% personalizado
94
- # Diseño:
95
- # [KPI ventas] [KPI utilidad] [KPI unidades] | [Chart barras región]
96
- # [Chart línea temporal (ventas) ] | [Chart scatter ]
97
- # [Tabla completa ]
98
99
  # ════════════════════════════════════════════════════════════════════════════
99
100
  HTML.auto_generate(
100
101
  df,
@@ -104,19 +105,15 @@ HTML.auto_generate(
104
105
  filename = "demo6_custom.html",
105
106
  authors = [{"name": "Equipo BI", "email": "bi@empresa.com"}],
106
107
  layout = [
107
- # Fila 1: 3 KPIs a la izquierda + 1 chart a la derecha
108
108
  {"type": "kpi", "index": 0, "row": 1, "col": 1, "height": 2, "width": 3},
109
109
  {"type": "kpi", "index": 1, "row": 1, "col": 4, "height": 2, "width": 3},
110
110
  {"type": "kpi", "index": 2, "row": 1, "col": 7, "height": 2, "width": 3},
111
- {"type": "chart", "index": 1, "row": 1, "col": 10, "height": 7, "width": 3}, # barras región
112
-
113
- # Fila 2: línea temporal grande + scatter
114
- {"type": "chart", "index": 0, "row": 3, "col": 1, "height": 5, "width": 6}, # línea tiempo
115
- {"type": "chart", "index": 2, "row": 3, "col": 7, "height": 5, "width": 3}, # scatter
116
-
117
- # Fila 3: tabla completa
111
+ {"type": "chart", "index": 1, "row": 1, "col": 10, "height": 7, "width": 3},
112
+ {"type": "chart", "index": 0, "row": 3, "col": 1, "height": 5, "width": 6},
113
+ {"type": "chart", "index": 2, "row": 3, "col": 7, "height": 5, "width": 3},
118
114
  {"type": "table", "row": 8, "col": 1, "height": 4, "width": 12},
119
- ]
115
+ ],
116
+ show = False,
120
117
  )
121
118
 
122
119
  # ════════════════════════════════════════════════════════════════════════════
@@ -135,9 +132,10 @@ HTML.auto_generate(
135
132
  template = "corporate_blue",
136
133
  title = "Demo 7 · Parseo Automático de Strings",
137
134
  filename = "demo7_parseo.html",
135
+ show = False,
138
136
  )
139
137
 
140
- print("\n✅ Todos los dashboards generados:")
138
+ print("\nTodos los dashboards generados:")
141
139
  for i, name in enumerate([
142
140
  "demo1_auto.html",
143
141
  "demo2_cols.html",
@@ -147,4 +145,4 @@ for i, name in enumerate([
147
145
  "demo6_custom.html",
148
146
  "demo7_parseo.html",
149
147
  ], 1):
150
- print(f" {i}. {name}")
148
+ print(f" {i}. {name}")
@@ -0,0 +1,15 @@
1
+ """Auto-generated slides from a DataFrame."""
2
+
3
+ from viewx.datasets import load_iris
4
+ from viewx.Slides import Presentation
5
+
6
# Demo script: build an HTML slide deck automatically from the iris dataset.
df = load_iris()

# Keyword options forwarded unchanged to Presentation.auto_generate.
slide_options = {
    "title": "Iris Dataset Overview",
    "theme": "ocean",
    "filename": "output/test8_auto_slides.html",
    "show": False,
}

path = Presentation.auto_generate(df, **slide_options)
print(f"Presentation.auto_generate -> {path}")
@@ -0,0 +1,22 @@
1
+ """Auto-generated PDF quality report from a DataFrame."""
2
+
3
+ import shutil
4
+
5
+ from viewx.datasets import load_iris
6
+ from viewx import Report
7
+
8
# Demo script: build a PDF quality report from the iris dataset.
# The script exits early (status 0) when no ``pdflatex`` executable is on PATH.
pdflatex_path = shutil.which("pdflatex")
if not pdflatex_path:
    print("SKIP: pdflatex not found — install a LaTeX distribution to run this test.")
    raise SystemExit(0)

df = load_iris()

# Keyword options forwarded unchanged to Report.auto_generate.
report_options = {
    "title": "Iris Dataset Quality Report",
    "author": "ViewX Test",
    "filename": "test9_auto_report",
    "outdir": "output",
    "include_plots": True,
}

path = Report.auto_generate(df, **report_options)
print(f"Report.auto_generate -> {path}")
@@ -0,0 +1,30 @@
1
+ from .analyzers import (
2
+ AnalyzerEngine,
3
+ BooleanStrategy,
4
+ CategoricalStrategy,
5
+ ColumnProfile,
6
+ ColumnTypeStrategy,
7
+ DatasetReport,
8
+ DateTimeStrategy,
9
+ NumericStrategy,
10
+ )
11
+ from .bibliometrics import BibliometricsAnalyzer
12
+ from .datamatrix_engine import DataMatrix, ReportTheme
13
+ from .explorer import build_explorer_payload
14
+ from .visualizer import Visualizer
15
+
16
+ __all__ = [
17
+ "DataMatrix",
18
+ "ReportTheme",
19
+ "AnalyzerEngine",
20
+ "Visualizer",
21
+ "BibliometricsAnalyzer",
22
+ "DatasetReport",
23
+ "ColumnProfile",
24
+ "ColumnTypeStrategy",
25
+ "NumericStrategy",
26
+ "CategoricalStrategy",
27
+ "DateTimeStrategy",
28
+ "BooleanStrategy",
29
+ "build_explorer_payload",
30
+ ]
@@ -0,0 +1,325 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
@dataclass
class ColumnProfile:
    """Summary of a single column as filled in by a column strategy.

    The first eight fields are required. The statistical fields default to
    ``None`` and are only populated by strategies that compute them;
    ``top_values`` and ``alerts`` each default to a fresh empty container
    per instance.
    """

    # --- identity and typing -------------------------------------------
    name: str
    dtype: str
    inferred_type: str

    # --- counts and ratios ---------------------------------------------
    n_unique: int
    n_missing: int
    p_missing: float
    is_constant: bool
    cardinality_ratio: float

    # --- optional descriptive statistics -------------------------------
    skewness: Optional[float] = None
    kurtosis: Optional[float] = None
    mean: Optional[float] = None
    std: Optional[float] = None
    min: Optional[float] = None
    max: Optional[float] = None
    median: Optional[float] = None
    q1: Optional[float] = None
    q3: Optional[float] = None
    iqr: Optional[float] = None

    # --- frequency table, outlier count and alert messages -------------
    top_values: Dict = field(default_factory=dict)
    outliers: int = 0
    alerts: List[str] = field(default_factory=list)
34
+
35
+
36
@dataclass
class DatasetReport:
    """Dataset-level summary holding shape, quality and per-column results.

    Every field is required; the class carries data only and defines no
    behaviour of its own.
    """

    # --- shape and data-quality counters -------------------------------
    n_rows: int
    n_cols: int
    n_duplicates: int
    n_missing_total: int
    p_missing_total: float

    # --- pre-formatted, human-readable sizes ---------------------------
    memory_usage: str
    estimated_rows: str

    # --- per-column and cross-column findings --------------------------
    column_profiles: Dict[str, ColumnProfile]
    correlation_pairs: List[Tuple[str, str, float]]
    alerts: List[str]

    # --- column names grouped by inferred type -------------------------
    categorical_columns: List[str]
    numeric_columns: List[str]
    datetime_columns: List[str]
    boolean_columns: List[str]
52
+
53
+
54
class ColumnTypeStrategy(ABC):
    """Abstract interface implemented by every per-type column analyzer."""

    @abstractmethod
    def infer(self, series: pd.Series) -> str:
        """Return the type label for ``series``."""

    @abstractmethod
    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Return the profile of ``series``, named ``col``, out of ``n_total`` rows."""
62
+
63
+
64
class NumericStrategy(ColumnTypeStrategy):
    """Strategy that profiles numeric columns.

    Produces descriptive statistics, an IQR-based outlier count and
    human-readable alerts for a single numeric ``pd.Series``.
    """

    def infer(self, series: pd.Series) -> str:
        """Return the type label handled by this strategy (always ``"numeric"``)."""
        return "numeric"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the ``ColumnProfile`` of a numeric column.

        Args:
            series: Column values; missing entries are excluded from the
                statistics but counted in ``n_missing``.
            col: Column name, stored in the profile and used in alerts.
            n_total: Row count used as the denominator of the missing
                percentage and of the cardinality ratio.

        Returns:
            The populated profile. Statistics stay ``None`` when the column
            has no non-missing value.
        """
        present = series.dropna()

        # pandas reductions may hand back NumPy scalars; store built-in ints
        # so the profile matches its annotations.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())

        # Same zero guard as cardinality_ratio: a 0-row frame must not divide
        # by zero.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="numeric",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            # "<= 1" matches the other strategies, so a column with no
            # non-missing value is reported as constant (value shown as N/A).
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0,
        )

        if len(present) > 0:
            profile.mean = float(present.mean())
            profile.std = float(present.std())
            profile.min = float(present.min())
            profile.max = float(present.max())
            profile.median = float(present.median())
            profile.q1 = float(present.quantile(0.25))
            profile.q3 = float(present.quantile(0.75))
            profile.iqr = profile.q3 - profile.q1

            # Skewness and kurtosis are reported as 0.0 below three samples.
            enough_samples = len(present) > 2
            profile.skewness = float(present.skew()) if enough_samples else 0.0
            profile.kurtosis = float(present.kurtosis()) if enough_samples else 0.0

            # Outliers are values outside the 1.5 * IQR fences; a zero (or
            # NaN) IQR yields no fences and therefore no outliers.
            if profile.iqr > 0:
                lower = profile.q1 - 1.5 * profile.iqr
                upper = profile.q3 + 1.5 * profile.iqr
                profile.outliers = int(((present < lower) | (present > upper)).sum())

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")
        if profile.is_constant:
            shown = present.iloc[0] if len(present) > 0 else "N/A"
            profile.alerts.append(f"Column '{col}': constant value ({shown})")
        if profile.skewness is not None and abs(profile.skewness) > 2:
            profile.alerts.append(f"Column '{col}': high skewness ({profile.skewness:.2f})")
        if profile.outliers > 0:
            profile.alerts.append(f"Column '{col}': {profile.outliers} outliers detected")

        return profile
114
+
115
+
116
class CategoricalStrategy(ColumnTypeStrategy):
    """Strategy that profiles categorical (non-numeric, non-date) columns."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label handled by this strategy (always ``"categorical"``)."""
        return "categorical"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the ``ColumnProfile`` of a categorical column.

        Args:
            series: Column values; missing entries are excluded from the
                frequency table but counted in ``n_missing``.
            col: Column name, stored in the profile and used in alerts.
            n_total: Row count used as the denominator of the missing
                percentage and of the cardinality ratio.

        Returns:
            The profile, including the five most frequent values in
            ``top_values`` (keys converted to ``str``).
        """
        present = series.dropna()

        # pandas reductions may hand back NumPy scalars; store built-in ints.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())

        # Same zero guard as cardinality_ratio: a 0-row frame must not divide
        # by zero.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        top_values = {}
        if len(present) > 0:
            top_values = {
                str(value): int(count)
                for value, count in present.value_counts().head(5).items()
            }

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="categorical",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0,
            top_values=top_values,
        )

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")
        if profile.is_constant:
            profile.alerts.append(f"Column '{col}': constant value")
        # Require at least two rows: with zero or one row "all unique" is
        # trivially true (0 == 0, 1 == 1) and says nothing about an ID column.
        if n_total > 1 and n_unique == n_total:
            profile.alerts.append(f"Column '{col}': all values unique (possible ID)")

        return profile
151
+
152
+
153
class DateTimeStrategy(ColumnTypeStrategy):
    """Strategy that profiles datetime columns using their calendar year."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label handled by this strategy (always ``"datetime"``)."""
        return "datetime"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the ``ColumnProfile`` of a datetime column.

        Args:
            series: Column values; missing entries are excluded from the
                year statistics but counted in ``n_missing``.
            col: Column name, stored in the profile and used in alerts.
            n_total: Row count used as the denominator of the missing
                percentage and of the cardinality ratio.

        Returns:
            The profile; ``min``, ``max`` and ``mean`` hold the minimum,
            maximum and mean *year*, or stay ``None`` when the years cannot
            be extracted.
        """
        present = series.dropna()

        # pandas reductions may hand back NumPy scalars; store built-in ints.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())

        # Same zero guard as cardinality_ratio: a 0-row frame must not divide
        # by zero.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="datetime",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0,
        )

        if len(present) > 0:
            try:
                years = present.dt.year
                year_min = float(years.min())
                year_max = float(years.max())
                year_mean = float(years.mean())
            except (AttributeError, TypeError, ValueError):
                # Best effort: values without a usable ``.dt`` accessor (or
                # with non-numeric years) leave the statistics as None.
                # Unrelated errors are no longer hidden.
                pass
            else:
                profile.min = year_min
                profile.max = year_max
                profile.mean = year_mean

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")

        return profile
187
+
188
+
189
class BooleanStrategy(ColumnTypeStrategy):
    """Strategy that profiles boolean columns as True/False counts."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label handled by this strategy (always ``"boolean"``)."""
        return "boolean"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the ``ColumnProfile`` of a boolean column.

        Args:
            series: Column values; missing entries are excluded from the
                True/False counts but counted in ``n_missing``.
            col: Column name, stored in the profile and used in alerts.
            n_total: Row count used as the denominator of the missing
                percentage and of the cardinality ratio.

        Returns:
            The profile; ``mean`` is the share of truthy non-missing values
            and ``top_values`` holds the ``"True"`` / ``"False"`` counts.
        """
        present = series.dropna()
        n_present = len(present)

        # pandas reductions may hand back NumPy scalars; store built-in ints.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())

        # Same zero guard as cardinality_ratio: a 0-row frame must not divide
        # by zero.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        true_count = int(present.astype(bool).sum()) if n_present > 0 else 0

        if n_present > 0:
            true_share = float(true_count / n_present)
            top_values = {"True": true_count, "False": n_present - true_count}
        else:
            true_share = 0.0
            top_values = {}

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="boolean",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0,
            mean=true_share,
            top_values=top_values,
        )

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")

        return profile
217
+
218
+
219
class AnalyzerEngine:
    """Dispatch each DataFrame column to a type strategy and aggregate a report."""

    def __init__(self):
        """Register one strategy instance per supported column type."""
        self.strategies: Dict[str, ColumnTypeStrategy] = {
            "numeric": NumericStrategy(),
            "categorical": CategoricalStrategy(),
            "datetime": DateTimeStrategy(),
            "boolean": BooleanStrategy(),
        }

    def infer_column_type(self, series: pd.Series) -> str:
        """Return the strategy key for ``series`` based on its dtype.

        Datetime is checked first, then boolean, then numeric; the boolean
        check precedes the numeric one because pandas treats bool as numeric.
        Anything else is ``"categorical"``.
        """
        if pd.api.types.is_datetime64_any_dtype(series):
            return "datetime"
        if pd.api.types.is_bool_dtype(series):
            return "boolean"
        if pd.api.types.is_numeric_dtype(series):
            return "numeric"
        return "categorical"

    def analyze_column(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Profile one column with the strategy selected by its inferred type."""
        strategy = self.strategies[self.infer_column_type(series)]
        return strategy.analyze(series, col, n_total)

    def analyze_dataset(self, df: pd.DataFrame) -> DatasetReport:
        """Profile every column of ``df`` and assemble the dataset report.

        Args:
            df: The frame to analyse.

        Returns:
            A ``DatasetReport`` with per-column profiles, collected alerts,
            duplicate and missing counts, formatted memory/row sizes and the
            strongest numeric correlations.
        """
        n_rows = len(df)
        profiles: Dict[str, ColumnProfile] = {}
        all_alerts: List[str] = []
        columns_by_type: Dict[str, List[str]] = {
            "numeric": [],
            "categorical": [],
            "datetime": [],
            "boolean": [],
        }

        for col in df.columns:
            profile = self.analyze_column(df[col], col, n_rows)
            profiles[col] = profile
            all_alerts.extend(profile.alerts)
            bucket = columns_by_type.get(profile.inferred_type)
            if bucket is not None:
                bucket.append(col)

        n_duplicates = int(df.duplicated().sum())
        if n_duplicates > 0:
            all_alerts.append(f"Found {n_duplicates} duplicate rows")

        correlation_pairs = self._find_correlations(df, columns_by_type["numeric"])

        mem_bytes = df.memory_usage(deep=True).sum()
        if mem_bytes > 1e9:
            memory_usage = f"{mem_bytes / 1e9:.2f} GB"
        elif mem_bytes > 1e6:
            memory_usage = f"{mem_bytes / 1e6:.2f} MB"
        else:
            memory_usage = f"{mem_bytes / 1e3:.1f} KB"

        if n_rows > 1_000_000:
            estimated_rows = f"{n_rows / 1_000_000:.1f}M"
        elif n_rows > 1_000:
            estimated_rows = f"{n_rows / 1_000:.1f}K"
        else:
            estimated_rows = str(n_rows)

        # int() keeps the total a built-in int even when the per-column
        # counts are NumPy scalars.
        n_missing_total = int(sum(p.n_missing for p in profiles.values()))
        total_cells = n_rows * len(df.columns)
        p_missing_total = (n_missing_total / total_cells) * 100 if total_cells > 0 else 0

        return DatasetReport(
            n_rows=n_rows,
            n_cols=len(df.columns),
            n_duplicates=n_duplicates,
            n_missing_total=n_missing_total,
            p_missing_total=p_missing_total,
            memory_usage=memory_usage,
            estimated_rows=estimated_rows,
            column_profiles=profiles,
            correlation_pairs=correlation_pairs,
            alerts=all_alerts,
            categorical_columns=columns_by_type["categorical"],
            numeric_columns=columns_by_type["numeric"],
            datetime_columns=columns_by_type["datetime"],
            boolean_columns=columns_by_type["boolean"],
        )

    def _find_correlations(
        self, df: pd.DataFrame, numeric_cols: List[str], threshold: float = 0.3
    ) -> List[Tuple[str, str, float]]:
        """Return up to ten column pairs whose absolute correlation meets ``threshold``.

        Args:
            df: Source frame.
            numeric_cols: Names of the numeric columns to correlate.
            threshold: Minimum absolute correlation for a pair to be kept.

        Returns:
            ``(col_a, col_b, r)`` tuples with the signed coefficient, sorted
            by descending ``abs(r)``. Empty when fewer than two numeric
            columns are given.
        """
        if len(numeric_cols) < 2:
            return []

        # One signed matrix serves both the threshold test and the reported
        # value, so no per-pair corr() recomputation is needed.
        corr_matrix = df[numeric_cols].corr()

        pairs = []
        for i, col_a in enumerate(numeric_cols):
            for j in range(i + 1, len(numeric_cols)):
                r = corr_matrix.iloc[i, j]
                # NaN compares False here, so undefined correlations are skipped.
                if abs(r) >= threshold:
                    pairs.append((col_a, numeric_cols[j], float(r)))

        pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
        return pairs[:10]
@@ -0,0 +1,143 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from typing import Dict, List, Optional, Tuple
5
+
6
+ import pandas as pd
7
+
8
+
9
class BibliometricsAnalyzer:
    """Compute basic bibliometric summaries from a publications DataFrame.

    Columns are located through ``column_map``, which maps a two-letter
    field tag (``AU``, ``PY``, ``SO`` ...) to the header spellings accepted
    for that field.
    """

    def __init__(self):
        """Initialise the tag -> accepted header names mapping."""
        self.column_map = {
            "AU": ["Authors", "AU", "Author", "Autores", "AUTHOR", "authors"],
            "PY": ["Year", "PY", "Publication Year", "Año", "YEAR", "year"],
            "SO": [
                "Source title", "SO", "Journal", "Source", "Revista",
                "SOURCE", "source", "Publication Name",
            ],
            "DE": [
                "Author Keywords", "DE", "Keywords", "Palabras Clave",
                "KEYWORDS", "keywords", "Index Keywords",
            ],
            "TC": [
                "Cited by", "TC", "Times Cited", "Citas",
                "CITING", "citations", "Citations",
            ],
            "TI": ["Title", "TI", "TITLE", "Article Title", "title"],
            "DI": ["DOI", "DI", "DOI Number", "doi"],
            "AF": [
                "Affiliation", "AF", "AFFILIATION", "affiliation",
                "Author Affiliation", "Author Affiliations",
            ],
            "AB": ["Abstract", "AB", "ABSTRACT", "abstract"],
            "DT": ["Document Type", "DT", "document_type"],
        }

    def _find_column(self, df: pd.DataFrame, key: str) -> Optional[str]:
        """Return the first column of ``df`` matching field tag ``key``.

        Matching is case-insensitive, ignores surrounding whitespace and
        treats spaces and underscores as equivalent.

        Args:
            df: Frame whose column labels are searched, in column order.
            key: Field tag; must be a key of ``column_map``.

        Returns:
            The original column label, or ``None`` when nothing matches.

        Raises:
            KeyError: If ``key`` is not in ``column_map``.
        """
        candidates = self.column_map[key]
        # Hoisted out of the column loop: the accepted spellings do not change.
        accepted_upper = {c.upper() for c in candidates}
        accepted_snake = {c.upper().replace(" ", "_") for c in candidates}

        for col in df.columns:
            # str() lets non-string labels (ints, tuples) be compared instead
            # of failing on ``.strip()``.
            label_upper = str(col).strip().upper()
            if label_upper in accepted_upper:
                return col
            if label_upper.replace(" ", "_") in accepted_snake:
                return col
        return None

    def _split_multi(self, raw: str, separators: str = ";,", strip_parens: bool = True) -> List[str]:
        """Split a multi-valued cell into cleaned items.

        Args:
            raw: Cell content; non-strings and blank strings yield ``[]``.
            separators: Characters that each act as an item delimiter. An
                empty string means the whole cell is a single item.
            strip_parens: When true, drop everything from the first ``(`` of
                an item onwards (e.g. a trailing affiliation).

        Returns:
            Items with surrounding whitespace and dots removed; items of one
            character or less are discarded.
        """
        if not isinstance(raw, str) or not raw.strip():
            return []

        # Honour ``separators``: fold every delimiter into the first one and
        # split once. Whitespace is not a delimiter, so multi-word items
        # survive intact.
        if separators:
            primary = separators[0]
            normalized = raw
            for sep in separators[1:]:
                normalized = normalized.replace(sep, primary)
            pieces = normalized.split(primary)
        else:
            pieces = [raw]

        result = []
        for piece in pieces:
            item = piece.strip()
            if strip_parens:
                item = item.split("(")[0].strip()
            # Dots are trimmed after the parenthesised part is removed so a
            # dot left in front of "(...)" is trimmed as well.
            item = item.strip(".").strip()
            if len(item) > 1:
                result.append(item)
        return result

    def _split_list_field(self, raw: str) -> List[str]:
        """Split ``raw`` on ``;`` and ``,`` and return the non-empty, stripped items."""
        if not isinstance(raw, str) or not raw.strip():
            return []
        items = [x.strip() for x in raw.replace(";", ",").split(",")]
        return [x for x in items if x]

    def analyze(self, df: pd.DataFrame) -> Optional[dict]:
        """Compute the bibliometric summaries available for ``df``.

        Each section is produced only when its column is found:

        * ``annual_production`` - DataFrame ``Year`` / ``Count``.
        * ``top_authors`` - DataFrame with the 20 most frequent authors.
        * ``author_summary`` - dict of author counts.
        * ``top_sources`` - DataFrame with the 15 most frequent sources.
        * ``top_keywords`` - DataFrame with the 20 most frequent keywords.
        * ``citation_summary`` - dict of citation statistics.

        Returns:
            A dict of the sections above, or ``None`` when none applies.
        """
        # Values are DataFrames for the ranking sections and plain dicts for
        # the summaries.
        results: Dict[str, object] = {}

        py_col = self._find_column(df, "PY")
        if py_col:
            years = pd.to_numeric(df[py_col].dropna(), errors="coerce").dropna()
            if len(years) > 0:
                prod = years.value_counts().sort_index().reset_index()
                prod.columns = ["Year", "Count"]
                prod["Year"] = prod["Year"].astype(int)
                results["annual_production"] = prod

        au_col = self._find_column(df, "AU")
        if au_col:
            all_authors: List[str] = []
            for entry in df[au_col].dropna():
                all_authors.extend(self._split_list_field(str(entry)))

            if all_authors:
                au_counts = Counter(all_authors)
                # most_common() is already in descending count order, so no
                # further sort is needed.
                results["top_authors"] = pd.DataFrame(
                    au_counts.most_common(20), columns=["Author", "Count"]
                )

                total = sum(au_counts.values())
                results["author_summary"] = {
                    "total_authors": total,
                    "unique_authors": len(au_counts),
                    "avg_per_publication": round(total / max(len(df), 1), 2),
                }

        so_col = self._find_column(df, "SO")
        if so_col:
            so_counts = df[so_col].dropna().value_counts().head(15)
            if len(so_counts) > 0:
                so_df = so_counts.reset_index()
                so_df.columns = ["Source", "Count"]
                results["top_sources"] = so_df

        de_col = self._find_column(df, "DE")
        if de_col:
            all_keywords: List[str] = []
            for entry in df[de_col].dropna():
                all_keywords.extend(self._split_list_field(str(entry)))

            if all_keywords:
                kw_counts = Counter(all_keywords)
                results["top_keywords"] = pd.DataFrame(
                    kw_counts.most_common(20), columns=["Keyword", "Count"]
                )

        tc_col = self._find_column(df, "TC")
        if tc_col:
            citations = pd.to_numeric(df[tc_col], errors="coerce").dropna()
            if len(citations) > 0:
                # float()/int() keep NumPy scalars out of the summary.
                results["citation_summary"] = {
                    "total_citations": int(citations.sum()),
                    "mean_citations": round(float(citations.mean()), 2),
                    "median_citations": round(float(citations.median()), 2),
                    "max_citations": int(citations.max()),
                    "min_citations": int(citations.min()),
                }

        return results if results else None