viewx 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {viewx-0.2.3 → viewx-0.2.4}/PKG-INFO +2 -3
- {viewx-0.2.3 → viewx-0.2.4}/README.md +1 -1
- {viewx-0.2.3 → viewx-0.2.4}/setup.py +2 -3
- {viewx-0.2.3 → viewx-0.2.4}/tests/test1.py +14 -16
- viewx-0.2.4/tests/test8_slides_auto.py +15 -0
- viewx-0.2.4/tests/test9_report_auto.py +22 -0
- viewx-0.2.4/viewx/DataMatrix/__init__.py +30 -0
- viewx-0.2.4/viewx/DataMatrix/analyzers.py +325 -0
- viewx-0.2.4/viewx/DataMatrix/bibliometrics.py +143 -0
- viewx-0.2.4/viewx/DataMatrix/datamatrix_engine.py +1348 -0
- viewx-0.2.4/viewx/DataMatrix/explorer.py +87 -0
- viewx-0.2.4/viewx/DataMatrix/visualizer.py +511 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/HTML/html_engine.py +408 -167
- viewx-0.2.4/viewx/Report/auto_builder.py +187 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Report/report_engine.py +30 -1
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/__init__.py +1 -0
- viewx-0.2.4/viewx/Slides/auto_builder.py +171 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/slides_engine.py +125 -49
- {viewx-0.2.3 → viewx-0.2.4}/viewx/__init__.py +4 -5
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/__init__.py +20 -14
- viewx-0.2.4/viewx/shared/__init__.py +20 -0
- viewx-0.2.4/viewx/shared/a11y.py +24 -0
- viewx-0.2.4/viewx/shared/explorer_runtime.py +523 -0
- viewx-0.2.4/viewx/shared/insights.py +121 -0
- viewx-0.2.4/viewx/shared/plotly_bundle.py +28 -0
- viewx-0.2.4/viewx/shared/runtime.py +237 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/PKG-INFO +2 -3
- {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/SOURCES.txt +13 -2
- {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/requires.txt +0 -1
- viewx-0.2.3/tests/test2.py +0 -39
- viewx-0.2.3/viewx/DataMatrix/__init__.py +0 -3
- viewx-0.2.3/viewx/DataMatrix/bibliometrics.py +0 -52
- viewx-0.2.3/viewx/DataMatrix/datamatrix_engine.py +0 -184
- viewx-0.2.3/viewx/DataMatrix/visualizer.py +0 -75
- {viewx-0.2.3 → viewx-0.2.4}/setup.cfg +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/tests/test3.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/tests/test4.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/tests/test5.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/tests/test6.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/tests/test7.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/HTML/__init__.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Report/__init__.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/charts.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/Slides/components.py +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/course_completion.csv +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/iris.csv +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/penguins.csv +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/sp500_companies.csv +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx/datasets/titanic.csv +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/dependency_links.txt +0 -0
- {viewx-0.2.3 → viewx-0.2.4}/viewx.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: viewx
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Librería de visualización adaptable para HTML, Dashboards y PDFs en Python
|
|
5
5
|
Home-page: https://github.com/GhostAnalyst30/ViewX
|
|
6
6
|
Author: Emmanuel Ascendra Perez
|
|
@@ -25,7 +25,6 @@ Requires-Dist: matplotlib>=3.8.0
|
|
|
25
25
|
Requires-Dist: pylatex>=1.4.2
|
|
26
26
|
Requires-Dist: seaborn>=0.12.2
|
|
27
27
|
Requires-Dist: plotly>=6.0.0
|
|
28
|
-
Requires-Dist: streamlit>=1.32.0
|
|
29
28
|
Provides-Extra: streamlit
|
|
30
29
|
Requires-Dist: streamlit>=1.32.0; extra == "streamlit"
|
|
31
30
|
Provides-Extra: dash
|
|
@@ -52,7 +51,7 @@ Dynamic: requires-dist
|
|
|
52
51
|
Dynamic: requires-python
|
|
53
52
|
Dynamic: summary
|
|
54
53
|
|
|
55
|
-
# ViewX — v2.3
|
|
54
|
+
# ViewX — v2.4
|
|
56
55
|
|
|
57
56
|
**ViewX** es un paquete moderno de Python diseñado para generar **páginas HTML interactivas**, **dashboards dinámicos** y **visualizaciones inteligentes** que se adaptan automáticamente a los objetos agregados por el usuario.
|
|
58
57
|
|
|
@@ -9,7 +9,7 @@ except FileNotFoundError:
|
|
|
9
9
|
|
|
10
10
|
setup(
|
|
11
11
|
name="viewx",
|
|
12
|
-
version="0.2.
|
|
12
|
+
version="0.2.4",
|
|
13
13
|
author="Emmanuel Ascendra Perez",
|
|
14
14
|
author_email="ascendraemmanuel@gmail.com",
|
|
15
15
|
description="Librería de visualización adaptable para HTML, Dashboards y PDFs en Python",
|
|
@@ -47,8 +47,7 @@ setup(
|
|
|
47
47
|
"matplotlib>=3.8.0",
|
|
48
48
|
"pylatex>=1.4.2", # Para PDFs
|
|
49
49
|
"seaborn>=0.12.2",
|
|
50
|
-
"plotly>=6.0.0"
|
|
51
|
-
"streamlit>=1.32.0"
|
|
50
|
+
"plotly>=6.0.0"
|
|
52
51
|
],
|
|
53
52
|
|
|
54
53
|
# Dependencias opcionales
|
|
@@ -35,6 +35,7 @@ HTML.auto_generate(
|
|
|
35
35
|
df,
|
|
36
36
|
title = "Demo 1 · Auto Layout",
|
|
37
37
|
filename = "demo1_auto.html",
|
|
38
|
+
show = False,
|
|
38
39
|
)
|
|
39
40
|
|
|
40
41
|
# ════════════════════════════════════════════════════════════════════════════
|
|
@@ -49,7 +50,8 @@ HTML.auto_generate(
|
|
|
49
50
|
authors = [
|
|
50
51
|
{"name": "Ana García", "email": "ana@empresa.com"},
|
|
51
52
|
{"name": "Luis Torres", "email": "luis@empresa.com"},
|
|
52
|
-
]
|
|
53
|
+
],
|
|
54
|
+
show = False,
|
|
53
55
|
)
|
|
54
56
|
|
|
55
57
|
# ════════════════════════════════════════════════════════════════════════════
|
|
@@ -63,6 +65,7 @@ HTML.auto_generate(
|
|
|
63
65
|
filename = "demo3_kpi_focus.html",
|
|
64
66
|
layout = "kpi_focus",
|
|
65
67
|
authors = "Carlos Méndez",
|
|
68
|
+
show = False,
|
|
66
69
|
)
|
|
67
70
|
|
|
68
71
|
# ════════════════════════════════════════════════════════════════════════════
|
|
@@ -75,6 +78,7 @@ HTML.auto_generate(
|
|
|
75
78
|
title = "Demo 4 · Chart Focus",
|
|
76
79
|
filename = "demo4_chart_focus.html",
|
|
77
80
|
layout = "chart_focus",
|
|
81
|
+
show = False,
|
|
78
82
|
)
|
|
79
83
|
|
|
80
84
|
# ════════════════════════════════════════════════════════════════════════════
|
|
@@ -87,14 +91,11 @@ HTML.auto_generate(
|
|
|
87
91
|
title = "Demo 5 · Table First",
|
|
88
92
|
filename = "demo5_table_first.html",
|
|
89
93
|
layout = "table_first",
|
|
94
|
+
show = False,
|
|
90
95
|
)
|
|
91
96
|
|
|
92
97
|
# ════════════════════════════════════════════════════════════════════════════
|
|
93
98
|
# DEMO 6 — Layout 100% personalizado
|
|
94
|
-
# Diseño:
|
|
95
|
-
# [KPI ventas] [KPI utilidad] [KPI unidades] | [Chart barras región]
|
|
96
|
-
# [Chart línea temporal (ventas) ] | [Chart scatter ]
|
|
97
|
-
# [Tabla completa ]
|
|
98
99
|
# ════════════════════════════════════════════════════════════════════════════
|
|
99
100
|
HTML.auto_generate(
|
|
100
101
|
df,
|
|
@@ -104,19 +105,15 @@ HTML.auto_generate(
|
|
|
104
105
|
filename = "demo6_custom.html",
|
|
105
106
|
authors = [{"name": "Equipo BI", "email": "bi@empresa.com"}],
|
|
106
107
|
layout = [
|
|
107
|
-
# Fila 1: 3 KPIs a la izquierda + 1 chart a la derecha
|
|
108
108
|
{"type": "kpi", "index": 0, "row": 1, "col": 1, "height": 2, "width": 3},
|
|
109
109
|
{"type": "kpi", "index": 1, "row": 1, "col": 4, "height": 2, "width": 3},
|
|
110
110
|
{"type": "kpi", "index": 2, "row": 1, "col": 7, "height": 2, "width": 3},
|
|
111
|
-
{"type": "chart", "index": 1, "row": 1, "col": 10, "height": 7, "width": 3},
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
{"type": "chart", "index": 0, "row": 3, "col": 1, "height": 5, "width": 6}, # línea tiempo
|
|
115
|
-
{"type": "chart", "index": 2, "row": 3, "col": 7, "height": 5, "width": 3}, # scatter
|
|
116
|
-
|
|
117
|
-
# Fila 3: tabla completa
|
|
111
|
+
{"type": "chart", "index": 1, "row": 1, "col": 10, "height": 7, "width": 3},
|
|
112
|
+
{"type": "chart", "index": 0, "row": 3, "col": 1, "height": 5, "width": 6},
|
|
113
|
+
{"type": "chart", "index": 2, "row": 3, "col": 7, "height": 5, "width": 3},
|
|
118
114
|
{"type": "table", "row": 8, "col": 1, "height": 4, "width": 12},
|
|
119
|
-
]
|
|
115
|
+
],
|
|
116
|
+
show = False,
|
|
120
117
|
)
|
|
121
118
|
|
|
122
119
|
# ════════════════════════════════════════════════════════════════════════════
|
|
@@ -135,9 +132,10 @@ HTML.auto_generate(
|
|
|
135
132
|
template = "corporate_blue",
|
|
136
133
|
title = "Demo 7 · Parseo Automático de Strings",
|
|
137
134
|
filename = "demo7_parseo.html",
|
|
135
|
+
show = False,
|
|
138
136
|
)
|
|
139
137
|
|
|
140
|
-
print("\
|
|
138
|
+
print("\nTodos los dashboards generados:")
|
|
141
139
|
for i, name in enumerate([
|
|
142
140
|
"demo1_auto.html",
|
|
143
141
|
"demo2_cols.html",
|
|
@@ -147,4 +145,4 @@ for i, name in enumerate([
|
|
|
147
145
|
"demo6_custom.html",
|
|
148
146
|
"demo7_parseo.html",
|
|
149
147
|
], 1):
|
|
150
|
-
print(f" {i}. {name}")
|
|
148
|
+
print(f" {i}. {name}")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Auto-generated slides from a DataFrame."""
|
|
2
|
+
|
|
3
|
+
from viewx.datasets import load_iris
|
|
4
|
+
from viewx.Slides import Presentation
|
|
5
|
+
|
|
6
|
+
# Load the bundled iris sample that the slide deck is generated from.
df = load_iris()

# Keyword options for the automatic deck; show=False is passed so the call
# only produces the file named below.
slide_options = {
    "title": "Iris Dataset Overview",
    "theme": "ocean",
    "filename": "output/test8_auto_slides.html",
    "show": False,
}

path = Presentation.auto_generate(df, **slide_options)
print(f"Presentation.auto_generate -> {path}")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Auto-generated PDF quality report from a DataFrame."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
|
|
5
|
+
from viewx.datasets import load_iris
|
|
6
|
+
from viewx import Report
|
|
7
|
+
|
|
8
|
+
# The script exits with status 0 (reported as a skip) when no pdflatex
# executable is found on PATH.
pdflatex_path = shutil.which("pdflatex")
if not pdflatex_path:
    print("SKIP: pdflatex not found — install a LaTeX distribution to run this test.")
    raise SystemExit(0)

# Load the bundled iris sample that the report is generated from.
df = load_iris()

# Keyword options for the automatic report.
report_options = {
    "title": "Iris Dataset Quality Report",
    "author": "ViewX Test",
    "filename": "test9_auto_report",
    "outdir": "output",
    "include_plots": True,
}

path = Report.auto_generate(df, **report_options)
print(f"Report.auto_generate -> {path}")
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from .analyzers import (
|
|
2
|
+
AnalyzerEngine,
|
|
3
|
+
BooleanStrategy,
|
|
4
|
+
CategoricalStrategy,
|
|
5
|
+
ColumnProfile,
|
|
6
|
+
ColumnTypeStrategy,
|
|
7
|
+
DatasetReport,
|
|
8
|
+
DateTimeStrategy,
|
|
9
|
+
NumericStrategy,
|
|
10
|
+
)
|
|
11
|
+
from .bibliometrics import BibliometricsAnalyzer
|
|
12
|
+
from .datamatrix_engine import DataMatrix, ReportTheme
|
|
13
|
+
from .explorer import build_explorer_payload
|
|
14
|
+
from .visualizer import Visualizer
|
|
15
|
+
|
|
16
|
+
# Public API of the DataMatrix package, grouped by the module each name
# is imported from.
__all__ = [
    # datamatrix_engine
    "DataMatrix",
    "ReportTheme",
    # analyzers
    "AnalyzerEngine",
    "ColumnProfile",
    "DatasetReport",
    "ColumnTypeStrategy",
    "NumericStrategy",
    "CategoricalStrategy",
    "DateTimeStrategy",
    "BooleanStrategy",
    # visualizer
    "Visualizer",
    # bibliometrics
    "BibliometricsAnalyzer",
    # explorer
    "build_explorer_payload",
]
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ColumnProfile:
    """Summary of a single column produced by a column-type strategy.

    The first eight fields are required and are filled by every strategy in
    this module. The remaining fields default to "not computed" and are only
    populated by the strategies for which they make sense.
    """

    # --- identity -----------------------------------------------------
    name: str
    dtype: str           # str() of the pandas dtype
    inferred_type: str   # "numeric", "categorical", "datetime" or "boolean"

    # --- completeness and cardinality ---------------------------------
    n_unique: int
    n_missing: int
    p_missing: float          # missing values as a percentage (0-100) of all rows
    is_constant: bool
    cardinality_ratio: float  # n_unique divided by the total row count

    # --- descriptive statistics (None when not computed) ---------------
    skewness: Optional[float] = None
    kurtosis: Optional[float] = None
    mean: Optional[float] = None
    std: Optional[float] = None
    min: Optional[float] = None
    max: Optional[float] = None
    median: Optional[float] = None
    q1: Optional[float] = None
    q3: Optional[float] = None
    iqr: Optional[float] = None

    # --- frequencies, outliers and alerts ------------------------------
    top_values: Dict = field(default_factory=dict)  # value label -> count
    outliers: int = 0
    alerts: List[str] = field(default_factory=list)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class DatasetReport:
    """Dataset-level result returned by ``AnalyzerEngine.analyze_dataset``.

    All fields are required; the engine fills them in a single pass over
    the DataFrame.
    """

    # --- size and completeness -----------------------------------------
    n_rows: int
    n_cols: int
    n_duplicates: int        # number of duplicated rows
    n_missing_total: int     # missing cells summed over all columns
    p_missing_total: float   # missing cells as a percentage of all cells

    # --- human-readable size labels --------------------------------------
    memory_usage: str        # formatted with a KB / MB / GB suffix
    estimated_rows: str      # row count, abbreviated with a K / M suffix when large

    # --- per-column detail -----------------------------------------------
    column_profiles: Dict[str, "ColumnProfile"]      # keyed by column name
    correlation_pairs: List[Tuple[str, str, float]]  # (column a, column b, r)
    alerts: List[str]

    # --- column names grouped by inferred type ---------------------------
    categorical_columns: List[str]
    numeric_columns: List[str]
    datetime_columns: List[str]
    boolean_columns: List[str]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ColumnTypeStrategy(ABC):
    """Abstract interface implemented by every per-type column analyzer.

    A concrete strategy names the inferred type it handles and knows how to
    turn one column into a ``ColumnProfile``.
    """

    @abstractmethod
    def infer(self, series: pd.Series) -> str:
        """Return the inferred-type label for ``series``."""

    @abstractmethod
    def analyze(self, series: pd.Series, col: str, n_total: int) -> "ColumnProfile":
        """Profile one column.

        Args:
            series: Column values, missing entries included.
            col: Column name.
            n_total: Total number of rows in the dataset.
        """
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class NumericStrategy(ColumnTypeStrategy):
    """Profile numeric columns: moments, quartiles and IQR-based outliers."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label this strategy handles."""
        return "numeric"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the profile of a numeric column.

        Args:
            series: Column values, missing entries included.
            col: Column name, stored in the profile and used in alert text.
            n_total: Number of rows in the dataset; denominator of the
                missing percentage and of the cardinality ratio.

        Returns:
            A ``ColumnProfile`` with ``inferred_type="numeric"``. The
            descriptive statistics are filled only when the column has at
            least one non-missing value.
        """
        s = series.dropna()
        # Cast to built-in numbers so the profile never carries numpy scalars.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())
        # Same zero-row guard as cardinality_ratio: an empty dataset has 0% missing.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="numeric",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            # "<= 1" matches the other strategies and makes the 'N/A' branch
            # of the constant-value alert reachable for all-missing columns.
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0.0,
        )

        if len(s) > 0:
            profile.mean = float(s.mean())
            profile.std = float(s.std())
            profile.min = float(s.min())
            profile.max = float(s.max())
            profile.median = float(s.median())
            profile.q1 = float(s.quantile(0.25))
            profile.q3 = float(s.quantile(0.75))
            profile.iqr = profile.q3 - profile.q1
            # Skewness and kurtosis are reported as 0.0 for fewer than three values.
            enough_values = len(s) > 2
            profile.skewness = float(s.skew()) if enough_values else 0.0
            profile.kurtosis = float(s.kurtosis()) if enough_values else 0.0

            # Outliers are counted with the 1.5 * IQR fences; a zero (or NaN)
            # IQR leaves the count at 0.
            if profile.iqr > 0:
                lower = profile.q1 - 1.5 * profile.iqr
                upper = profile.q3 + 1.5 * profile.iqr
                profile.outliers = int(((s < lower) | (s > upper)).sum())

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")
        if profile.is_constant:
            profile.alerts.append(f"Column '{col}': constant value ({s.iloc[0] if len(s) > 0 else 'N/A'})")
        if profile.skewness is not None and abs(profile.skewness) > 2:
            profile.alerts.append(f"Column '{col}': high skewness ({profile.skewness:.2f})")
        if profile.outliers > 0:
            profile.alerts.append(f"Column '{col}': {profile.outliers} outliers detected")

        return profile
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class CategoricalStrategy(ColumnTypeStrategy):
    """Profile categorical columns: cardinality and most frequent values."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label this strategy handles."""
        return "categorical"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the profile of a categorical column.

        Args:
            series: Column values, missing entries included.
            col: Column name, stored in the profile and used in alert text.
            n_total: Number of rows in the dataset; denominator of the
                missing percentage and of the cardinality ratio.

        Returns:
            A ``ColumnProfile`` with ``inferred_type="categorical"`` whose
            ``top_values`` maps the (stringified) five most frequent values
            to their counts.
        """
        s = series.dropna()
        # Cast to built-in numbers so the profile never carries numpy scalars.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())
        # Same zero-row guard as cardinality_ratio: an empty dataset has 0% missing.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        top_values = {}
        if len(s) > 0:
            top_values = {
                str(value): int(count)
                for value, count in s.value_counts().head(5).items()
            }

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="categorical",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0.0,
            top_values=top_values,
        )

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")
        if profile.is_constant:
            profile.alerts.append(f"Column '{col}': constant value")
        # An empty or single-row column satisfies n_unique == n_total trivially
        # and is already reported as constant; do not also flag it as an ID.
        if not profile.is_constant and n_unique == n_total:
            profile.alerts.append(f"Column '{col}': all values unique (possible ID)")

        return profile
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class DateTimeStrategy(ColumnTypeStrategy):
    """Profile datetime columns, summarising them by calendar year."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label this strategy handles."""
        return "datetime"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the profile of a datetime column.

        Args:
            series: Column values, missing entries included.
            col: Column name, stored in the profile and used in alert text.
            n_total: Number of rows in the dataset; denominator of the
                missing percentage and of the cardinality ratio.

        Returns:
            A ``ColumnProfile`` with ``inferred_type="datetime"``. When the
            year can be extracted, ``min``, ``max`` and ``mean`` hold the
            earliest, latest and average year of the non-missing values.
        """
        s = series.dropna()
        # Cast to built-in numbers so the profile never carries numpy scalars.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())
        # Same zero-row guard as cardinality_ratio: an empty dataset has 0% missing.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="datetime",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0.0,
        )

        if len(s) > 0:
            # Best effort: a series without a usable ``.dt.year`` simply keeps
            # the year statistics unset. Only the errors that case can raise
            # are caught, so unrelated failures are no longer hidden.
            try:
                years = s.dt.year
                year_min = float(years.min())
                year_max = float(years.max())
                year_mean = float(years.mean())
            except (AttributeError, TypeError, ValueError):
                pass
            else:
                profile.min = year_min
                profile.max = year_max
                profile.mean = year_mean

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")

        return profile
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class BooleanStrategy(ColumnTypeStrategy):
    """Profile boolean columns: share of True values and True/False counts."""

    def infer(self, series: pd.Series) -> str:
        """Return the type label this strategy handles."""
        return "boolean"

    def analyze(self, series: pd.Series, col: str, n_total: int) -> ColumnProfile:
        """Build the profile of a boolean column.

        Args:
            series: Column values, missing entries included.
            col: Column name, stored in the profile and used in alert text.
            n_total: Number of rows in the dataset; denominator of the
                missing percentage and of the cardinality ratio.

        Returns:
            A ``ColumnProfile`` with ``inferred_type="boolean"`` whose
            ``mean`` is the fraction of non-missing values that are True and
            whose ``top_values`` holds the "True" / "False" counts.
        """
        s = series.dropna()
        n_valid = len(s)
        # Cast to built-in numbers so the profile never carries numpy scalars.
        n_missing = int(series.isna().sum())
        n_unique = int(series.nunique())
        # Same zero-row guard as cardinality_ratio: an empty dataset has 0% missing.
        p_missing = (n_missing / n_total) * 100 if n_total > 0 else 0.0
        true_count = int(s.astype(bool).sum()) if n_valid > 0 else 0

        profile = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            inferred_type="boolean",
            n_unique=n_unique,
            n_missing=n_missing,
            p_missing=p_missing,
            is_constant=n_unique <= 1,
            cardinality_ratio=n_unique / n_total if n_total > 0 else 0.0,
            # Always a float, including the "no valid values" fallback.
            mean=true_count / n_valid if n_valid > 0 else 0.0,
            top_values={"True": true_count, "False": n_valid - true_count} if n_valid > 0 else {},
        )

        if p_missing > 50:
            profile.alerts.append(f"Column '{col}': {p_missing:.1f}% missing values")

        return profile
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class AnalyzerEngine:
    """Infer column types, profile every column and assemble a dataset report."""

    def __init__(self):
        """Register one strategy instance per supported inferred type."""
        self.strategies: Dict[str, ColumnTypeStrategy] = {
            "numeric": NumericStrategy(),
            "categorical": CategoricalStrategy(),
            "datetime": DateTimeStrategy(),
            "boolean": BooleanStrategy(),
        }

    def infer_column_type(self, series: pd.Series) -> str:
        """Map a column's pandas dtype to one of the supported type labels.

        Returns:
            "datetime", "boolean", "numeric" or, for anything else,
            "categorical".
        """
        if pd.api.types.is_datetime64_any_dtype(series):
            return "datetime"
        # Must precede the numeric check: pandas treats bool as a numeric dtype.
        if pd.api.types.is_bool_dtype(series):
            return "boolean"
        if pd.api.types.is_numeric_dtype(series):
            return "numeric"
        return "categorical"

    def analyze_column(self, series: pd.Series, col: str, n_total: int) -> "ColumnProfile":
        """Profile one column with the strategy matching its inferred type."""
        strategy = self.strategies[self.infer_column_type(series)]
        return strategy.analyze(series, col, n_total)

    def analyze_dataset(self, df: pd.DataFrame) -> "DatasetReport":
        """Profile every column of ``df`` and collect dataset-level metrics.

        Args:
            df: The DataFrame to analyze.

        Returns:
            A ``DatasetReport`` with the per-column profiles, the column
            names grouped by inferred type, duplicate and missing-value
            counts, size labels, correlated numeric pairs and all alerts
            (column alerts first, then the duplicate-row alert if any).
        """
        n_rows = len(df)
        profiles: Dict[str, ColumnProfile] = {}
        all_alerts: List[str] = []
        columns_by_type: Dict[str, List[str]] = {
            "numeric": [],
            "categorical": [],
            "datetime": [],
            "boolean": [],
        }

        for col in df.columns:
            profile = self.analyze_column(df[col], col, n_rows)
            profiles[col] = profile
            all_alerts.extend(profile.alerts)
            # Profiles with an unknown type label are kept but not grouped.
            bucket = columns_by_type.get(profile.inferred_type)
            if bucket is not None:
                bucket.append(col)

        n_duplicates = int(df.duplicated().sum())
        if n_duplicates > 0:
            all_alerts.append(f"Found {n_duplicates} duplicate rows")

        correlation_pairs = self._find_correlations(df, columns_by_type["numeric"])

        # int(...) keeps numpy scalars out of the report.
        n_missing_total = int(sum(p.n_missing for p in profiles.values()))
        total_cells = n_rows * len(df.columns)
        p_missing_total = (n_missing_total / total_cells) * 100 if total_cells > 0 else 0

        return DatasetReport(
            n_rows=n_rows,
            n_cols=len(df.columns),
            n_duplicates=n_duplicates,
            n_missing_total=n_missing_total,
            p_missing_total=p_missing_total,
            memory_usage=self._format_memory(int(df.memory_usage(deep=True).sum())),
            estimated_rows=self._format_row_count(n_rows),
            column_profiles=profiles,
            correlation_pairs=correlation_pairs,
            alerts=all_alerts,
            categorical_columns=columns_by_type["categorical"],
            numeric_columns=columns_by_type["numeric"],
            datetime_columns=columns_by_type["datetime"],
            boolean_columns=columns_by_type["boolean"],
        )

    @staticmethod
    def _format_memory(mem_bytes: int) -> str:
        """Format a byte count with a decimal KB / MB / GB suffix."""
        if mem_bytes > 1e9:
            return f"{mem_bytes / 1e9:.2f} GB"
        if mem_bytes > 1e6:
            return f"{mem_bytes / 1e6:.2f} MB"
        return f"{mem_bytes / 1e3:.1f} KB"

    @staticmethod
    def _format_row_count(n_rows: int) -> str:
        """Abbreviate a row count with a K / M suffix above 1,000 / 1,000,000."""
        if n_rows > 1_000_000:
            return f"{n_rows / 1_000_000:.1f}M"
        if n_rows > 1_000:
            return f"{n_rows / 1_000:.1f}K"
        return str(n_rows)

    def _find_correlations(
        self, df: pd.DataFrame, numeric_cols: List[str], threshold: float = 0.3
    ) -> List[Tuple[str, str, float]]:
        """Return the most strongly correlated pairs of numeric columns.

        Args:
            df: The DataFrame holding the columns.
            numeric_cols: Names of the numeric columns to compare.
            threshold: Minimum absolute correlation for a pair to be kept.

        Returns:
            Up to ten ``(column_a, column_b, r)`` tuples with the signed
            coefficient from ``DataFrame.corr()``, ordered by decreasing
            ``abs(r)``. Pairs whose coefficient is NaN (for example when one
            column is constant) are skipped.
        """
        if len(numeric_cols) < 2:
            return []

        # One signed matrix serves both the threshold test and the reported
        # value; no per-pair corr() recomputation is needed.
        corr_matrix = df[numeric_cols].corr()
        pairs = []
        for i, col_a in enumerate(numeric_cols):
            for j in range(i + 1, len(numeric_cols)):
                r = float(corr_matrix.iloc[i, j])
                # NaN compares False against the threshold, so it is dropped here.
                if abs(r) >= threshold:
                    pairs.append((col_a, numeric_cols[j], r))

        pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
        return pairs[:10]
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Dict, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BibliometricsAnalyzer:
    """Extract basic bibliometric summaries from a publications table.

    Columns are located by matching their header against the known aliases
    in ``column_map``, so differently labelled exports can be analyzed
    without renaming.
    """

    def __init__(self):
        """Define the accepted header aliases for each bibliographic field tag."""
        self.column_map = {
            "AU": ["Authors", "AU", "Author", "Autores", "AUTHOR", "authors"],
            "PY": ["Year", "PY", "Publication Year", "Año", "YEAR", "year"],
            "SO": [
                "Source title", "SO", "Journal", "Source", "Revista",
                "SOURCE", "source", "Publication Name",
            ],
            "DE": [
                "Author Keywords", "DE", "Keywords", "Palabras Clave",
                "KEYWORDS", "keywords", "Index Keywords",
            ],
            "TC": [
                "Cited by", "TC", "Times Cited", "Citas",
                "CITING", "citations", "Citations",
            ],
            "TI": ["Title", "TI", "TITLE", "Article Title", "title"],
            "DI": ["DOI", "DI", "DOI Number", "doi"],
            "AF": [
                "Affiliation", "AF", "AFFILIATION", "affiliation",
                "Author Affiliation", "Author Affiliations",
            ],
            "AB": ["Abstract", "AB", "ABSTRACT", "abstract"],
            "DT": [
                "Document Type", "DT", "Document Type", "document_type",
            ],
        }

    @staticmethod
    def _normalize_label(label) -> str:
        """Return a header in its comparison form: trimmed, upper-case, spaces as underscores."""
        # str() lets non-string column labels (e.g. integers) be compared
        # instead of raising AttributeError.
        return str(label).strip().upper().replace(" ", "_")

    def _find_column(self, df: pd.DataFrame, key: str) -> Optional[str]:
        """Return the first column of ``df`` whose header matches an alias of ``key``.

        Matching ignores surrounding whitespace and letter case and treats
        spaces and underscores as equal.

        Args:
            df: The table to search.
            key: A field tag present in ``column_map`` (e.g. "AU", "PY").

        Returns:
            The matching column label as it appears in ``df``, or ``None``.

        Raises:
            KeyError: If ``key`` is not a tag of ``column_map``.
        """
        wanted = {self._normalize_label(alias) for alias in self.column_map[key]}
        for col in df.columns:
            if self._normalize_label(col) in wanted:
                return col
        return None

    def _split_multi(self, raw: str, separators: str = ";,", strip_parens: bool = True) -> List[str]:
        """Split a multi-valued cell into cleaned items.

        Args:
            raw: The cell text; anything that is not a non-blank string
                yields an empty list.
            separators: Characters that each act as an item delimiter.
            strip_parens: When true, drop everything from the first "(" of
                an item onwards.

        Returns:
            The items with surrounding whitespace and dots removed; items
            of one character or less are discarded.
        """
        if not isinstance(raw, str) or not raw.strip():
            return []

        if separators:
            # Rewrite every delimiter to the first one, then split once.
            # Whitespace is not a delimiter, so multi-word items stay intact.
            primary = separators[0]
            unify = str.maketrans({ch: primary for ch in separators[1:]})
            parts = raw.translate(unify).split(primary)
        else:
            parts = [raw]

        result = []
        for part in parts:
            item = part.strip().strip(".").strip()
            if strip_parens:
                item = item.split("(")[0].strip()
            if len(item) > 1:
                result.append(item)
        return result

    def _split_list_field(self, raw: str) -> List[str]:
        """Split a ";"- or ","-separated cell into its non-empty, trimmed items."""
        if not isinstance(raw, str) or not raw.strip():
            return []
        items = [x.strip() for x in raw.replace(";", ",").split(",")]
        return [x for x in items if x]

    def _count_terms(self, values: pd.Series) -> Counter:
        """Count every item of a multi-valued column, ignoring missing cells."""
        counts: Counter = Counter()
        for entry in values.dropna():
            counts.update(self._split_list_field(str(entry)))
        return counts

    @staticmethod
    def _ranking_frame(counts: Counter, label: str, limit: int) -> pd.DataFrame:
        """Return the ``limit`` most common items as a ``[label, "Count"]`` frame."""
        # most_common() is already ordered by decreasing count with ties in
        # first-seen order, so no further sorting is needed.
        return pd.DataFrame(counts.most_common(limit), columns=[label, "Count"])

    def analyze(self, df: pd.DataFrame) -> Optional[dict]:
        """Compute the bibliometric summaries available for ``df``.

        Each summary is produced only when its source column is found and
        contains usable data.

        Returns:
            ``None`` when nothing could be computed, otherwise a dict with
            any of these keys:

            * ``"annual_production"`` - DataFrame ``Year`` / ``Count``,
              ordered by year.
            * ``"top_authors"`` - DataFrame ``Author`` / ``Count`` (top 20).
            * ``"author_summary"`` - dict with ``total_authors``,
              ``unique_authors`` and ``avg_per_publication``.
            * ``"top_sources"`` - DataFrame ``Source`` / ``Count`` (top 15).
            * ``"top_keywords"`` - DataFrame ``Keyword`` / ``Count`` (top 20).
            * ``"citation_summary"`` - dict with total, mean, median, max
              and min citations.
        """
        results: Dict[str, object] = {}

        py_col = self._find_column(df, "PY")
        if py_col:
            year_numeric = pd.to_numeric(df[py_col].dropna(), errors="coerce").dropna()
            if len(year_numeric) > 0:
                # Convert to whole years before counting so that each year
                # appears in exactly one row.
                per_year = year_numeric.astype(int).value_counts().sort_index()
                results["annual_production"] = pd.DataFrame(
                    {"Year": per_year.index, "Count": per_year.values}
                )

        au_col = self._find_column(df, "AU")
        if au_col:
            au_counts = self._count_terms(df[au_col])
            if au_counts:
                results["top_authors"] = self._ranking_frame(au_counts, "Author", 20)
                total = sum(au_counts.values())
                results["author_summary"] = {
                    "total_authors": total,
                    "unique_authors": len(au_counts),
                    "avg_per_publication": round(total / max(len(df), 1), 2),
                }

        so_col = self._find_column(df, "SO")
        if so_col:
            so_counts = df[so_col].dropna().value_counts().head(15)
            if len(so_counts) > 0:
                # Built explicitly: the column layout of reset_index() on a
                # value_counts() result differs between pandas versions.
                results["top_sources"] = pd.DataFrame(
                    {"Source": so_counts.index, "Count": so_counts.values}
                )

        de_col = self._find_column(df, "DE")
        if de_col:
            kw_counts = self._count_terms(df[de_col])
            if kw_counts:
                results["top_keywords"] = self._ranking_frame(kw_counts, "Keyword", 20)

        tc_col = self._find_column(df, "TC")
        if tc_col:
            tc_series = pd.to_numeric(df[tc_col], errors="coerce").dropna()
            if len(tc_series) > 0:
                # float()/int() keep numpy scalars out of the summary.
                results["citation_summary"] = {
                    "total_citations": int(tc_series.sum()),
                    "mean_citations": round(float(tc_series.mean()), 2),
                    "median_citations": round(float(tc_series.median()), 2),
                    "max_citations": int(tc_series.max()),
                    "min_citations": int(tc_series.min()),
                }

        return results if results else None
|