dbx_elt_utils-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbx_elt_utils-0.1.0/PKG-INFO +47 -0
- dbx_elt_utils-0.1.0/README.md +37 -0
- dbx_elt_utils-0.1.0/pyproject.toml +27 -0
- dbx_elt_utils-0.1.0/src/dbx_elt_utils/__init__.py +0 -0
- dbx_elt_utils-0.1.0/src/dbx_elt_utils/clean_utils.py +16 -0
- dbx_elt_utils-0.1.0/src/dbx_elt_utils/ingest_utils.py +58 -0
- dbx_elt_utils-0.1.0/src/dbx_elt_utils/notebook_utils.py +545 -0
- dbx_elt_utils-0.1.0/src/my_elt_utils/__init__.py +0 -0
- dbx_elt_utils-0.1.0/src/my_elt_utils/clean_utils.py +16 -0
- dbx_elt_utils-0.1.0/src/my_elt_utils/ingest_utils.py +58 -0
- dbx_elt_utils-0.1.0/src/my_elt_utils/notebook_utils.py +545 -0
dbx_elt_utils-0.1.0/PKG-INFO
@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: dbx-elt-utils
+Version: 0.1.0
+Summary: Common utility library for ELT pipelines targeting Databricks (DLT).
+Author-email: DBX Analyst <developer@example.com>
+Requires-Python: >=3.9
+Provides-Extra: local
+Requires-Dist: databricks-connect>=14.0.0; extra == 'local'
+Description-Content-Type: text/markdown
+
+# DBX ELT Utilities
+
+Utility library that simplifies building and iterating on **Delta Live Tables (DLT)** flows in Databricks.
+
+## Installation
+
+Install this package with `pip`:
+
+```bash
+pip install dbx-elt-utils
+```
+
+### For local development (VS Code)
+If you want to develop locally, simulating DLT and consuming data through Databricks Connect:
+
+```bash
+pip install dbx-elt-utils[local]
+```
+
+## Usage in Pipelines / Notebooks
+
+In your notebooks, instead of configuring `sys.path` by hand, simply import the base module:
+
+```python
+from dbx_elt_utils.notebook_utils import init_notebook
+
+# This automatically detects whether you are running on Databricks or locally
+notebook = init_notebook()
+env = notebook.env
+spark = notebook.spark
+dlt = notebook.dlt
+```
+
+## Available Modules
+- `ingest_utils`: Transparent switching between batch and streaming sources via `ingesta_hibrida`.
+- `notebook_utils`: Automatic setup of the execution environment (DLT mock for local runs).
+- `clean_utils`: Pure PySpark utilities for cleaning arrays and strings.
dbx_elt_utils-0.1.0/README.md
@@ -0,0 +1,37 @@
+# DBX ELT Utilities
+
+Utility library that simplifies building and iterating on **Delta Live Tables (DLT)** flows in Databricks.
+
+## Installation
+
+Install this package with `pip`:
+
+```bash
+pip install dbx-elt-utils
+```
+
+### For local development (VS Code)
+If you want to develop locally, simulating DLT and consuming data through Databricks Connect:
+
+```bash
+pip install dbx-elt-utils[local]
+```
+
+## Usage in Pipelines / Notebooks
+
+In your notebooks, instead of configuring `sys.path` by hand, simply import the base module:
+
+```python
+from dbx_elt_utils.notebook_utils import init_notebook
+
+# This automatically detects whether you are running on Databricks or locally
+notebook = init_notebook()
+env = notebook.env
+spark = notebook.spark
+dlt = notebook.dlt
+```
+
+## Available Modules
+- `ingest_utils`: Transparent switching between batch and streaming sources via `ingesta_hibrida`.
+- `notebook_utils`: Automatic setup of the execution environment (DLT mock for local runs).
+- `clean_utils`: Pure PySpark utilities for cleaning arrays and strings.
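To make the README's snippet concrete, the sketch below shows one way the context returned by `init_notebook` can drive an environment-aware DLT notebook. It is only an illustration: the catalog, schema and table names are invented, and the `bronze{env}` naming merely mirrors the convention used by `get_checkpoint_base` in `notebook_utils`, not a documented contract.

```python
from dbx_elt_utils.notebook_utils import init_notebook

notebook = init_notebook()

# notebook.env is the environment suffix ("_dev" by default, or spark.conf "env_suffix" on Databricks)
catalogo_bronze = f"bronze{notebook.env}"   # hypothetical catalog naming convention

dlt = notebook.dlt      # real dlt module on Databricks, no-op mock locally
spark = notebook.spark  # existing session on Databricks, None in local mode


@dlt.table(name="clientes_raw")  # hypothetical table name
def clientes_raw():
    # Streaming read of a hypothetical Fivetran table in Unity Catalog
    return spark.readStream.table(f"{catalogo_bronze}.fivetran.clientes")
```

On Databricks the decorator registers a real DLT table; in local mode `notebook.dlt` is a no-op mock, so the function is simply defined and nothing runs.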
dbx_elt_utils-0.1.0/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "dbx-elt-utils"
+version = "0.1.0"
+description = "Common utility library for ELT pipelines targeting Databricks (DLT)."
+readme = "README.md"
+authors = [
+    { name = "DBX Analyst", email = "developer@example.com" }
+]
+requires-python = ">=3.9"
+dependencies = [
+    # Only the dependencies strictly required by the internal logic.
+    # IMPORTANT: pyspark, dlt and databricks-connect must NOT go here,
+    # to avoid overriding the production environments on Databricks.
+]
+
+[project.optional-dependencies]
+# Dependencies for local development or testing (e.g. pip install dbx-elt-utils[local])
+local = [
+    "databricks-connect>=14.0.0",  # Use the SDK version compatible with your environment
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/dbx_elt_utils"]
dbx_elt_utils-0.1.0/src/dbx_elt_utils/__init__.py: File without changes
dbx_elt_utils-0.1.0/src/dbx_elt_utils/clean_utils.py
@@ -0,0 +1,16 @@
+# src/common/clean_utils.py
+from pyspark.sql.functions import regexp_replace, col, trim, when
+
+
+def extraer_valor_array_string(columna):
+    """
+    Cleans columns that arrive formatted as a JSON array string.
+    Example: '["12.345.678-9"]' -> '12.345.678-9'
+    Example: '[]' -> NULL
+    """
+    # 1. Remove brackets and quotes
+    # Regex: strips any [ " ] characters
+    c = regexp_replace(columna, r'[\["\]]', "")
+
+    # 2. If the result is empty, convert it to NULL
+    return when(trim(c) == "", None).otherwise(trim(c))
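As a quick check of the behaviour described in the docstring above, here is a small self-contained sketch; it assumes a plain local PySpark session and uses invented sample values.

```python
from pyspark.sql import SparkSession
from dbx_elt_utils.clean_utils import extraer_valor_array_string

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([('["12.345.678-9"]',), ('[]',), (None,)], ["rut"])

# Expected results per row: '12.345.678-9', NULL (empty array), NULL (already null)
df.select("rut", extraer_valor_array_string("rut").alias("rut_limpio")).show(truncate=False)
```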
dbx_elt_utils-0.1.0/src/dbx_elt_utils/ingest_utils.py
@@ -0,0 +1,58 @@
+# src/common/ingest_utils.py
+from pyspark.sql.functions import current_timestamp, lit, input_file_name
+from src.common.notebook_utils import get_schema_location_base
+
+def ingesta_hibrida(spark, origen, tipo="auto_detect", formato_archivo="json", env_suffix="_dev"):
+    """
+    Universal ingestor.
+    Args:
+        origen (str): Table name (cat.schema.tbl) OR path (abfss://...)
+        tipo (str): "delta_table", "auto_loader" or "auto_detect"
+        formato_archivo (str): csv, json, parquet (Auto Loader only)
+        env_suffix (str): Environment suffix ("_dev", "_prod") used for checkpoint paths
+    """
+    # 1. Smart auto-detection
+    if tipo == "auto_detect":
+        # If it looks like a cloud path or a unix path, assume files
+        if "abfss://" in origen or "s3://" in origen or "/" in origen:
+            tipo = "auto_loader"
+        else:
+            tipo = "delta_table"
+
+    print(f"🏭 Ingesta Híbrida activada: {tipo.upper()} -> {origen}")
+
+    # 2. Read logic (always streaming for DLT)
+    reader = spark.readStream
+
+    # --- CASE A: UNITY CATALOG (Fivetran tables) ---
+    if tipo == "delta_table":
+        return (
+            reader.table(origen)
+            .withColumn("_origen_datos", lit("unity_catalog"))
+            .withColumn("_ingestado_en", current_timestamp())
+        )
+
+    # --- CASE B: AUTO LOADER (loose files) ---
+    elif tipo == "auto_loader":
+        # "Schema Evolution" settings (key for loose files whose layout changes)
+        cloud_reader = (
+            reader.format("cloudFiles")
+            .option("cloudFiles.format", formato_archivo)
+            .option("cloudFiles.inferColumnTypes", "true")
+            .option("cloudFiles.schemaEvolutionMode", "addNewColumns")  # A new column in the source file does not break the load
+            .option("cloudFiles.schemaLocation", f"{get_schema_location_base(env_suffix)}/{origen.split('/')[-1]}")
+        )
+
+        # CSV-specific settings
+        if formato_archivo == "csv":
+            cloud_reader = cloud_reader.option("header", "true").option("delimiter", ",")
+
+        return (
+            cloud_reader.load(origen)
+            .withColumn("_origen_datos", lit(f"file_{formato_archivo}"))
+            .withColumn("_nombre_archivo", input_file_name())  # Useful to know which file brought each record
+            .withColumn("_ingestado_en", current_timestamp())
+        )
+
+    else:
+        raise ValueError(f"Tipo desconocido: {tipo}")
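A hedged sketch of how `ingesta_hibrida` might be wired into two DLT tables, one per branch of the auto-detection; all source names are hypothetical. Note that the module imports `get_schema_location_base` from `src.common.notebook_utils`, so it assumes the surrounding project keeps a `src/common/` copy of these helpers reachable on `sys.path` (which the path setup in `init_notebook` is designed to provide).

```python
from dbx_elt_utils.notebook_utils import init_notebook
from dbx_elt_utils.ingest_utils import ingesta_hibrida

notebook = init_notebook()
dlt, spark, env = notebook.dlt, notebook.spark, notebook.env


# Case A: the source has no "/" in it, so auto-detection picks "delta_table"
@dlt.table(name="bronze_clientes")  # hypothetical
def bronze_clientes():
    return ingesta_hibrida(spark, f"raw{env}.fivetran.clientes", env_suffix=env)  # hypothetical UC table


# Case B: a cloud path triggers "auto_loader"; CSV additionally gets header/delimiter options
@dlt.table(name="bronze_ventas")  # hypothetical
def bronze_ventas():
    return ingesta_hibrida(
        spark,
        "abfss://landing@storageacct.dfs.core.windows.net/ventas",  # hypothetical landing path
        formato_archivo="csv",
        env_suffix=env,
    )
```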
dbx_elt_utils-0.1.0/src/dbx_elt_utils/notebook_utils.py
@@ -0,0 +1,545 @@
+import sys
+import os
+
+from pathlib import Path
+
+
+def get_checkpoint_base(env_suffix: str = "_dev") -> str:
+    """
+    Returns the base path for streaming checkpoints.
+
+    Uses Unity Catalog Volumes instead of DBFS /tmp (serverless has no access to DBFS /tmp).
+
+    Args:
+        env_suffix: Environment suffix ("_dev", "_prod", etc.)
+
+    Returns:
+        Base path for checkpoints (e.g. "dbfs:/Volumes/bronze_dev/temporary/checkpoints")
+
+    Override:
+        Set the ELT_CHECKPOINT_BASE environment variable to use a different path.
+    """
+    base = os.getenv("ELT_CHECKPOINT_BASE")
+    if base:
+        return base.rstrip("/")
+
+    # Default: shared Unity Catalog Volume
+    return f"dbfs:/Volumes/bronze{env_suffix}/temporary/checkpoints"
+
+
+def get_schema_location_base(env_suffix: str = "_dev") -> str:
+    """
+    Returns the base path for Auto Loader schema locations.
+
+    Args:
+        env_suffix: Environment suffix ("_dev", "_prod", etc.)
+
+    Returns:
+        Base path for schema locations
+    """
+    base = os.getenv("ELT_SCHEMA_BASE")
+    if base:
+        return base.rstrip("/")
+
+    # Uses the same base as the checkpoints + /schema
+    checkpoint_base = get_checkpoint_base(env_suffix)
+    return f"{checkpoint_base}/schema"
+
+
+class NotebookContext:
+    def __init__(self, env, is_local, spark=None, dlt=None):
+        self.env = env
+        self.is_local = is_local
+        self.spark = spark
+        self.dlt = dlt
+
+
+def init_notebook():
+    """
+    Initializes the notebook environment.
+
+    1. Adds the project root (the folder containing src/) to sys.path
+    2. Detects the environment (local vs. cloud) and mocks DLT if needed.
+    3. Returns a context object with {env, is_local, dlt, spark}.
+    """
+    # 1. PROJECT PATH SETUP
+    try:
+        # Try to use __file__ when imported as a module
+        current_path = Path(__file__).resolve().parent
+    except NameError:
+        # In an interactive notebook, use cwd
+        current_path = Path(os.getcwd())
+
+    project_root = current_path
+    for _ in range(5):
+        if (project_root / "src").exists():
+            break
+        project_root = project_root.parent
+
+    if str(project_root) not in sys.path:
+        sys.path.insert(0, str(project_root))
+        print(f"✅ Root agregado al path: {project_root}")
+
+    # 2. ENVIRONMENT DETECTION & DLT MOCK
+    is_local = False
+    spark_session = None
+    dlt_module = None
+    env_suffix = "_dev"  # Safe default
+
+    try:
+        # Try to import dlt (cloud, or local with dlt installed)
+        import dlt
+        dlt_module = dlt
+
+        # Try to grab an existing spark session (cloud)
+        try:
+            import IPython
+            ip = IPython.get_ipython()
+            if ip and 'spark' in ip.user_ns:
+                spark_session = ip.user_ns['spark']
+                # Read configuration if present
+                try:
+                    env_suffix = spark_session.conf.get("env_suffix", "_dev")
+                except:
+                    pass
+        except:
+            pass
+
+    except ImportError:
+        # LOCAL ENVIRONMENT WITHOUT DLT
+        is_local = True
+
+        class DltMock:
+            def table(self, *a, **k): return lambda f: f
+            def view(self, *a, **k): return lambda f: f
+            def expect(self, *a, **k): return lambda f: f
+            def expect_or_drop(self, *a, **k): return lambda f: f
+            def create_streaming_table(self, *a, **k): pass
+            def apply_changes(self, *a, **k): pass
+
+        dlt_module = DltMock()
+        print(f"⚠️ Modo Local: 'dlt' simulado. Usando env='{env_suffix}'")
+
+    return NotebookContext(env=env_suffix, is_local=is_local, dlt=dlt_module, spark=spark_session)
+
+
+def get_local_source_table(spark, source_official: str) -> str:
+    """
+    In local mode, lets Silver/Gold notebooks discover where to read their data from
+    when the upstream dependencies (Bronze) are not materialized in Unity Catalog but in temporary tables.
+
+    1. Checks whether the official table exists in UC (e.g. bronze_dev.schema.table).
+    2. If not, checks whether the local temporary table exists (e.g. bronze_dev.temporary.table_tmp_sql).
+    3. If neither exists, raises a descriptive error.
+    """
+    parts = source_official.split(".")
+    env_catalog = parts[0]
+    target_table = parts[-1]
+
+    oficial_name = source_official
+    temp_name = f"{env_catalog}.temporary.{target_table}_tmp_sql"
+
+    def table_exists(name):
+        try:
+            # Extra check for MockSparkSession (it always returns a MockDataFrame)
+            if type(spark).__name__ == "MockSparkSession":
+                # In local tests without databricks-connect, force the temporary table when it is requested
+                if "temporary" in name:
+                    return True
+                return False
+
+            # On real Databricks or Connect, check existence directly with eager execution (Spark Connect)
+            return spark.catalog.tableExists(name)
+        except Exception:
+            return False
+
+    if table_exists(oficial_name):
+        print(f"✅ Leyendo desde tabla oficial UC: {oficial_name}")
+        return oficial_name
+    elif table_exists(temp_name):
+        print(f"⚠️ Leyendo desde tabla TEMPORAL de prueba local: {temp_name}")
+        return temp_name
+    else:
+        raise ValueError(
+            f"❌ ERROR DE LINAJE LOCAL: No se encontró la tabla de origen.\n"
+            f"  - Oficial (No existe): {oficial_name}\n"
+            f"  - Temporal Local (No existe): {temp_name}\n"
+            f"Solución: Debes ejecutar primero el notebook de la capa anterior ({target_table}) para generar los datos temporales de prueba."
+        )
+
+
+def clean_local_test_table(spark, source_table: str):
+    """
+    Cleans up the local environment after a successful test.
+    If source_table is a temporary table, it drops it.
+    If source_table is an official UC table, it looks for its leftover temporary copy and drops it if present.
+    """
+    print("-" * 50)
+
+    # Always extract the parts needed to build the temporary name
+    parts = source_table.split(".")
+    env_catalog = parts[0]
+    target_table = parts[-1]
+
+    # If the table was already the temporary one, clean the '_tmp_sql' name carried by target_table directly;
+    # otherwise append the suffix to the original table
+    if target_table.endswith("_tmp_sql"):
+        temp_name = source_table
+    else:
+        temp_name = f"{env_catalog}.temporary.{target_table}_tmp_sql"
+
+    def table_exists(name):
+        try:
+            if type(spark).__name__ == "MockSparkSession":
+                # In the local mock, assume there are no leftovers unless explicitly stated,
+                # avoiding the false positive "a residual local table was detected and dropped"
+                return False
+            return spark.catalog.tableExists(name)
+        except Exception:
+            return False
+
+    if table_exists(temp_name):
+        print("🧹 Limpiando zona temporal...")
+        try:
+            spark.sql(f"DROP TABLE IF EXISTS {temp_name}")
+            if temp_name == source_table:
+                print(f"✨ Tabla origen '{temp_name}' ELIMINADA.")
+            else:
+                print(f"✨ Se detectó y eliminó una tabla local residual: '{temp_name}'.")
+        except Exception as e:
+            print(f"⚠️ Error limpiando la tabla temporal: {e}")
+    else:
+        if "temporary" not in source_table:
+            print("ℹ️ Tabla de Unity Catalog oficial detectada. No hay residuos temporales locales que limpiar.")
+
+    print(" Ciclo de prueba completado limpiamente.")
+
+
+_SPARK_CACHE = None
+
+
+def get_test_spark():
+    global _SPARK_CACHE
+    if _SPARK_CACHE is not None:
+        return _SPARK_CACHE
+    try:
+        from databricks.connect import DatabricksSession
+        from databricks.sdk.core import Config
+        config = Config(profile="DEFAULT")
+        _SPARK_CACHE = DatabricksSession.builder.sdkConfig(config).getOrCreate()
+        return _SPARK_CACHE
+    except Exception as e:
+        print(f" Error creando sesión Spark Local: {e}")
+        print(" Fallback to MockSparkSession for local testing")
+
+        class MockDataFrameReader:
+            def format(self, *a, **k): return self
+            def load(self, *a, **k): return self
+            def table(self, *a, **k): return self
+            def option(self, *a, **k): return self
+            def count(self): return 100
+            def show(self, *a, **k): pass
+            def limit(self, *a, **k): return self
+            def withColumn(self, *a, **k): return self
+            def select(self, *a, **k): return self
+            def drop(self, *a, **k): return self
+            def filter(self, *a, **k): return self
+
+        class MockDataStreamReader(MockDataFrameReader):
+            pass
+
+        class MockDataStreamWriter:
+            def format(self, *a, **k): return self
+            def trigger(self, *a, **k): return self
+            def outputMode(self, *a, **k): return self
+            def option(self, *a, **k): return self
+            def toTable(self, *a, **k): return self
+            def start(self, *a, **k): return self
+            def awaitTermination(self): pass
+
+        class MockDataFrame:
+            @property
+            def writeStream(self): return MockDataStreamWriter()
+            def createOrReplaceTempView(self, name): pass
+            def limit(self, *a, **k): return self
+            def show(self, *a, **k): pass
+            def isStreaming(self): return True
+            def withColumn(self, *a, **k): return self
+            def select(self, *a, **k): return self
+            def drop(self, *a, **k): return self
+            def filter(self, *a, **k): return self
+            def colRegex(self, *a, **k): return self
+            def alias(self, *a, **k): return self
+            def unionByName(self, *a, **k): return self
+            def distinct(self, *a, **k): return self
+            def join(self, *a, **k): return self
+            def na(self, *a, **k): return self
+            def count(self): return 100
+            @property
+            def columns(self): return ["col1", "col2"]
+
+        class MockSparkSession:
+            @property
+            def read(self): return MockDataFrameReader()
+            @property
+            def readStream(self): return MockDataStreamReader()
+            def sql(self, *a, **k): return MockDataFrame()
+            def table(self, *a, **k): return MockDataFrame()
+
+        _SPARK_CACHE = MockSparkSession()
+        return _SPARK_CACHE
+
+
+def stop_local_spark():
+    """
+    Stops the local Spark session and clears the cache.
+    Useful to free resources and keep the Python kernel from hanging
+    after running multiple cells or finishing a local test.
+    """
+    global _SPARK_CACHE
+    if _SPARK_CACHE is not None:
+        try:
+            # Try to stop every active stream, just in case
+            if hasattr(_SPARK_CACHE, "streams") and hasattr(_SPARK_CACHE.streams, "active"):
+                for stream in _SPARK_CACHE.streams.active:
+                    stream.stop()
+
+            _SPARK_CACHE.stop()
+        except Exception as e:
+            print(f"⚠️ Error deteniendo la sesión de Spark: {e}")
+        finally:
+            _SPARK_CACHE = None
+            print("🛑 Sesión local de Spark detenida correctamente. Recursos liberados.")
+    else:
+        print("ℹ️ No hay ninguna sesión local de Spark activa que detener.")
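Putting the local-mode helpers of this module together, one plausible test loop for a Silver notebook run from VS Code looks like the sketch below. The source table name is hypothetical and the sequence is a suggestion, not an API contract.

```python
from dbx_elt_utils.notebook_utils import (
    get_test_spark,
    get_local_source_table,
    clean_local_test_table,
    stop_local_spark,
)

# Databricks Connect session from the DEFAULT profile, or the built-in mock if that fails
spark = get_test_spark()

# Resolve the Bronze dependency: the official UC table if it exists, else the local *_tmp_sql copy
source = get_local_source_table(spark, "bronze_dev.fivetran.clientes")  # hypothetical table
df = spark.table(source)
df.show(5)

# ... run and validate the Silver transformations under test here ...

# Drop any leftover temporary table, then release the session
clean_local_test_table(spark, source)
stop_local_spark()
```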
dbx_elt_utils-0.1.0/src/my_elt_utils/__init__.py: File without changes
The remaining files under dbx_elt_utils-0.1.0/src/my_elt_utils/ (clean_utils.py +16, ingest_utils.py +58, notebook_utils.py +545) are verbatim copies of the corresponding files under src/dbx_elt_utils/ shown above.