gss-bi-udfs 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gss_bi_udfs/__init__.py +11 -0
- gss_bi_udfs/io.py +420 -0
- gss_bi_udfs/merges.py +191 -0
- gss_bi_udfs/transforms.py +56 -0
- gss_bi_udfs/utils.py +185 -0
- {gss_bi_udfs-0.1.0.dist-info → gss_bi_udfs-0.1.2.dist-info}/METADATA +9 -2
- gss_bi_udfs-0.1.2.dist-info/RECORD +18 -0
- {gss_bi_udfs-0.1.0.dist-info → gss_bi_udfs-0.1.2.dist-info}/WHEEL +1 -1
- gss_bi_udfs-0.1.2.dist-info/top_level.txt +4 -0
- scripts/run_tests.py +125 -0
- tests/test_io.py +128 -0
- tests/test_merges.py +65 -0
- tests/test_transforms.py +66 -0
- tests/test_utils.py +119 -0
- workspace/main.py +0 -0
- workspace/prueba.py +10 -0
- workspace/prueba_calculadora.py +20 -0
- workspace/tests/test_prueba_calculadora.py +14 -0
- gss_bi_udfs-0.1.0.dist-info/RECORD +0 -4
- gss_bi_udfs-0.1.0.dist-info/top_level.txt +0 -1
gss_bi_udfs/utils.py
ADDED
@@ -0,0 +1,185 @@

import os
from pyspark.sql.types import (IntegerType, LongType, ShortType, ByteType,
                               DecimalType, DoubleType, FloatType,
                               DateType, TimestampType, BooleanType, StringType)
from pyspark.sql.functions import lit, concat_ws, col
from pyspark.sql import DataFrame, Column


def get_env(default="dev"):
    """
    Gets the execution environment from the ENV environment variable.
    If it is not set, returns the given default value.

    Parameters:
    - default (str): default environment to use when ENV is not set.

    Returns:
    - str: name of the execution environment (e.g. 'dev', 'qa', 'prod').
    """

    return os.getenv("ENV", default)

def get_env_catalog(catalog):
    """
    Builds the catalog name adjusted to the environment.

    Parameters:
        catalog (str): Base catalog name (e.g. 'fi_comunes').

    Returns:
        str: Catalog name adjusted to the environment.
        Example: 'fi_comunes_dev' if ENV='dev'
                 'fi_comunes' if ENV='pro'
    """

    if get_env() == "pro":
        return catalog
    else:
        return f"{catalog}_{get_env()}"


def get_env_table_path(catalog, table_path):
    """
    Builds the full path of a table, including the environment suffix in the catalog.

    Parameters:
        catalog (str): Base catalog name (e.g. 'fi_comunes').
        table_path (str): Table path including schema and name (e.g. 'silver.dim_afiliado').

    Returns:
        str: Full table path adjusted to the environment.
        Example: 'fi_comunes_dev.silver.dim_afiliado' if ENV='dev'
                 'fi_comunes.silver.dim_afiliado' if ENV='pro'
    """

    # Concatenate the environment-adjusted catalog with the table path
    return f"{get_env_catalog(catalog)}.{table_path}"

def get_schema_root_location(spark, catalog, schema):
    """
    Gets the physical location (RootLocation) of a specific schema, using the environment-adjusted catalog.

    Parameters:
        catalog (str): Base catalog name (e.g. 'fi_comunes').
        schema (str): Schema name within the catalog (e.g. 'silver').

    Returns:
        str: Physical location where the schema data is stored.
        Example: 's3://bucket/path/fi_comunes_dev/silver' if ENV='dev'

    Requires:
    - The get_env_catalog function must be defined and return the environment-adjusted catalog name.
    - An active SparkSession and permissions to run `DESCRIBE SCHEMA EXTENDED`.

    Example:
        >>> get_schema_root_location("fi_comunes", "silver")
        's3://mi-bucket/datalake/fi_comunes_dev/silver'
    """
    cat = get_env_catalog(catalog)
    df = spark.sql(f"DESCRIBE SCHEMA EXTENDED {cat}.{schema}")
    return df.filter("database_description_item = 'RootLocation'") \
             .select("database_description_value") \
             .collect()[0][0]

def get_table_info(
    spark,
    *,
    full_table_name: str = None,
    catalog: str = None,
    schema: str = None,
    table: str = None
) -> dict:
    """
    Returns information about a table given either:
    - full_table_name (catalog.schema.table)
      or
    - catalog + schema + table
    """

    # -----------------------------
    # 1. Resolve inputs
    # -----------------------------
    if full_table_name:
        parts = full_table_name.split(".")
        if len(parts) != 3:
            raise ValueError(
                "full_table_name must have the format catalog.schema.table"
            )
        catalog, schema, table = parts

    elif catalog and schema and table:
        full_table_name = f"{catalog}.{schema}.{table}"

    else:
        raise ValueError(
            "Either full_table_name or catalog + schema + table must be provided"
        )

    # -----------------------------
    # 2. Environment catalog
    # -----------------------------
    catalog_env = get_env_catalog(catalog)

    # -----------------------------
    # 3. Physical path
    # -----------------------------
    root_location = get_schema_root_location(spark, catalog, schema)
    path = f"{root_location.rstrip('/')}/{table}"

    # -----------------------------
    # 4. Spark metadata (if the table exists)
    # -----------------------------
    info = {
        "catalog": catalog_env,
        "schema": schema,
        "table": table,
        "full_table_name": f"{catalog_env}.{schema}.{table}",
        "path": path,
        "exists": False,
        "provider": None,
        "table_type": None,
    }

    if spark.catalog.tableExists(info["full_table_name"]):
        info["exists"] = True

        desc = (
            spark.sql(f"DESCRIBE EXTENDED {info['full_table_name']}")
            .filter("col_name in ('Location', 'Provider', 'Type')")
            .collect()
        )

        for row in desc:
            if row.col_name == "Location":
                info["path"] = row.data_type
            elif row.col_name == "Provider":
                info["provider"] = row.data_type
            elif row.col_name == "Type":
                info["table_type"] = row.data_type

    return info


def get_default_value_by_type(dtype):
    """
    Returns a "default" value per data type, for 'default/unknown' records.
    Parameters:
    - dtype: PySpark data type (DataType).
    Returns:
    - default value for the given data type.
    """
    if isinstance(dtype, (IntegerType, LongType, ShortType, ByteType)):
        return lit(-999)
    if isinstance(dtype, (DecimalType, DoubleType, FloatType)):
        return lit(-999)
    if isinstance(dtype, (DateType, TimestampType)):
        return lit("1900-01-01").cast(dtype)
    if isinstance(dtype, BooleanType):
        return lit(False)
    if isinstance(dtype, StringType):
        return lit("N/A")
    return lit(None)
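All of these helpers key off the ENV environment variable, so the same code resolves to fi_comunes_dev objects in development and to the bare fi_comunes catalog in production. A minimal usage sketch, assuming the wheel and pyspark are installed; the catalog and table names are the placeholder examples from the docstrings above:

import os

from gss_bi_udfs.utils import get_env, get_env_catalog, get_env_table_path

# Illustrative only: ENV would normally be set by the job/cluster configuration.
os.environ["ENV"] = "dev"

print(get_env())                                                # dev
print(get_env_catalog("fi_comunes"))                            # fi_comunes_dev
print(get_env_table_path("fi_comunes", "silver.dim_afiliado"))  # fi_comunes_dev.silver.dim_afiliado

# get_schema_root_location and get_table_info additionally need an active
# SparkSession with access to the catalog, so they are not exercised here.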
{gss_bi_udfs-0.1.0.dist-info → gss_bi_udfs-0.1.2.dist-info}/METADATA
@@ -1,13 +1,20 @@
Metadata-Version: 2.4
Name: gss-bi-udfs
- Version: 0.1.0
+ Version: 0.1.2
Summary: Reusable utilities for Spark and Delta Lake in Lakehouse architectures.
Author: Geronimo Forconi
Requires-Python: >=3.8
Description-Content-Type: text/markdown
- Requires-Dist: pyspark>=3.0.0

# gss-bi-udfs
+
Module created to store UDFs common to all BI areas.


+ # to build
+
+ python3 -m build
+
+ # to publish
+
+ python3 -m twine upload dist/*
gss_bi_udfs-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@

gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
gss_bi_udfs/io.py,sha256=BZVf1BACH71MQFn-png1828E2WnNEwqTq3AxZR_UZWk,17173
gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
scripts/run_tests.py,sha256=6yG35rkURojbHmEnLKkPnHSn5bmViP7yJwng5hXj9xs,4407
tests/test_io.py,sha256=oFTJK6UZJXec53lPoArWSHKPRpNGbhI8ZVb1ZjfXW8U,4847
tests/test_merges.py,sha256=_PHYRU0DwRn5Vg05clz8jL7_d8QutWiiTACHLiNPrZo,2221
tests/test_transforms.py,sha256=4fqKyemSV-4nfMzhTamaE5mWXnglV08uvw67sWj84Og,2206
tests/test_utils.py,sha256=FUap5pqqEDvmBmBLeSBN39FoQDQSz3hpN4qCQrUniEU,4541
workspace/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
workspace/prueba.py,sha256=n-zGeMhFRrMLp9cx-vvJYTSFMZHCYm_R-xmGiQvj1Fk,223
workspace/prueba_calculadora.py,sha256=4pu4Wg-h_aALbqDsfSeJlPbyx9FpODIbHGNAse5Mz5M,512
workspace/tests/test_prueba_calculadora.py,sha256=9cHXunht_EVwBgBELv5YlazkJMWSi97A5VnB9Mvs-kU,257
gss_bi_udfs-0.1.2.dist-info/METADATA,sha256=xcsXvP6SEvUiMh2g6XG85DycKXsFqom_cpV4Xpz6rgI,393
gss_bi_udfs-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
gss_bi_udfs-0.1.2.dist-info/top_level.txt,sha256=NzlGPsUajFQDfmDbDhTCgu4LBvrm0d1shuPSJIBpLfw,36
gss_bi_udfs-0.1.2.dist-info/RECORD,,
scripts/run_tests.py
ADDED
@@ -0,0 +1,125 @@

#!/usr/bin/env python3
import argparse
import datetime as dt
import html
import os
import sys
import unittest


class _CollectingTextTestResult(unittest.TextTestResult):
    def __init__(self, stream, descriptions, verbosity):
        super().__init__(stream, descriptions, verbosity)
        self.successes = []

    def addSuccess(self, test):
        super().addSuccess(test)
        self.successes.append(test)


def _run_tests(start_dir: str, pattern: str, verbosity: int) -> unittest.TestResult:
    suite = unittest.defaultTestLoader.discover(start_dir=start_dir, pattern=pattern)
    runner = unittest.TextTestRunner(verbosity=verbosity, resultclass=_CollectingTextTestResult)
    return runner.run(suite)


def _timestamped_path(path: str) -> str:
    ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    base, ext = os.path.splitext(path)
    if ext:
        return f"{base}_{ts}{ext}"
    return f"{path}_{ts}"


def _render_html(result: unittest.TestResult, output_path: str) -> None:
    rows = []
    for test in getattr(result, "successes", []):
        rows.append(("PASS", str(test), ""))

    for test, tb in result.failures + result.errors:
        rows.append(
            (
                "FAIL" if (test, tb) in result.failures else "ERROR",
                str(test),
                tb,
            )
        )

    for test, reason in result.skipped:
        rows.append(("SKIP", str(test), reason))

    status = "PASSED" if result.wasSuccessful() else "FAILED"
    now = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    html_body = [
        "<!doctype html>",
        "<html lang='en'>",
        "<head>",
        "  <meta charset='utf-8'/>",
        "  <title>Unit Test Report</title>",
        "  <style>",
        "    body { font-family: Arial, sans-serif; margin: 24px; }",
        "    .ok { color: #0f5132; }",
        "    .bad { color: #842029; }",
        "    table { border-collapse: collapse; width: 100%; margin-top: 16px; }",
        "    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }",
        "    th { background: #f5f5f5; }",
        "    pre { white-space: pre-wrap; margin: 0; }",
        "  </style>",
        "</head>",
        "<body>",
        "  <h1>Unit Test Report</h1>",
        f"  <p><strong>Generated:</strong> {html.escape(now)}</p>",
        f"  <p><strong>Status:</strong> <span class='{'ok' if result.wasSuccessful() else 'bad'}'>{status}</span></p>",
        f"  <p><strong>Ran:</strong> {result.testsRun} tests</p>",
        f"  <p><strong>Pass:</strong> {len(getattr(result, 'successes', []))} | <strong>Failures:</strong> {len(result.failures)} | <strong>Errors:</strong> {len(result.errors)} | <strong>Skipped:</strong> {len(result.skipped)}</p>",
        "  <table>",
        "    <thead><tr><th>Type</th><th>Test</th><th>Details</th></tr></thead>",
        "    <tbody>",
    ]

    if rows:
        for row_type, test_name, details in rows:
            html_body.append(
                "      <tr>"
                f"<td>{html.escape(row_type)}</td>"
                f"<td>{html.escape(test_name)}</td>"
                f"<td><pre>{html.escape(details)}</pre></td>"
                "</tr>"
            )
    else:
        html_body.append("      <tr><td colspan='3'>No failures, errors or skipped tests.</td></tr>")

    html_body += [
        "    </tbody>",
        "  </table>",
        "</body>",
        "</html>",
    ]

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(html_body))


def main() -> int:
    parser = argparse.ArgumentParser(description="Run unittests and optionally write HTML report.")
    parser.add_argument("--start-dir", default="tests")
    parser.add_argument("--pattern", default="test*.py")
    parser.add_argument("--verbosity", type=int, default=2)
    parser.add_argument("--html", default="")
    parser.add_argument("--timestamped", action="store_true")
    args = parser.parse_args()

    result = _run_tests(args.start_dir, args.pattern, args.verbosity)

    if args.html:
        output_path = _timestamped_path(args.html) if args.timestamped else args.html
        _render_html(result, output_path)
        print(f"HTML report written to: {output_path}")

    return 0 if result.wasSuccessful() else 1


if __name__ == "__main__":
    sys.exit(main())
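Given the flags defined in main(), a typical invocation is python3 scripts/run_tests.py --html reports/unit_tests.html --timestamped, where the report path is an arbitrary choice rather than anything the script mandates. A sketch of driving the same call from Python:

import subprocess
import sys

# Run the discovery-based suite and write a timestamped HTML report.
# The report path "reports/unit_tests.html" is only an illustrative choice.
completed = subprocess.run(
    [
        sys.executable,
        "scripts/run_tests.py",
        "--start-dir", "tests",
        "--pattern", "test*.py",
        "--html", "reports/unit_tests.html",
        "--timestamped",
    ],
    check=False,  # an exit code of 1 just means some tests failed
)
print("suite passed" if completed.returncode == 0 else "suite failed")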
tests/test_io.py
ADDED
@@ -0,0 +1,128 @@

import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock, patch

from gss_bi_udfs import io


class TestIO(unittest.TestCase):
    def test_normalize_path(self):
        self.assertEqual(io._normalize_path("dbfs:/tmp/a.parquet"), "/tmp/a.parquet")
        self.assertEqual(io._normalize_path("/tmp/a.parquet"), "/tmp/a.parquet")

    def test_ls_path_local(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_path = Path(tmpdir) / "file.txt"
            file_path.write_text("x", encoding="utf-8")
            (Path(tmpdir) / "folder").mkdir()

            files = io._ls_path(tmpdir)
            names = sorted([f.name for f in files])
            self.assertEqual(names, ["file.txt", "folder"])

        self.assertEqual(io._ls_path("/path/that/does/not/exist"), [])

    @patch("gss_bi_udfs.io._ls_path")
    def test_load_latest_parquet_returns_latest_match(self, mock_ls_path):
        mock_ls_path.return_value = [
            SimpleNamespace(name="clientes_20240101.parquet", path="/tmp/1"),
            SimpleNamespace(name="clientes_20240102.parquet", path="/tmp/2"),
            SimpleNamespace(name="otra_tabla_20240103.parquet", path="/tmp/3"),
        ]
        spark = MagicMock()
        expected_df = object()
        spark.read.parquet.return_value = expected_df

        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")

        self.assertIs(out, expected_df)
        spark.read.parquet.assert_called_once_with("/tmp/2")

    @patch("gss_bi_udfs.io._ls_path")
    def test_load_latest_parquet_returns_none_without_matches(self, mock_ls_path):
        mock_ls_path.return_value = [SimpleNamespace(name="x.parquet", path="/tmp/x")]
        spark = MagicMock()

        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")

        self.assertIsNone(out)
        spark.read.parquet.assert_not_called()

    @patch("gss_bi_udfs.io.load_latest_parquet")
    def test_return_parquets_and_register_temp_views(self, mock_load_latest):
        df = MagicMock()
        mock_load_latest.return_value = df
        spark = MagicMock()
        tables_load = {
            "db1": {
                "sch1": [
                    {"table": "t1", "view": "vw_t1"},
                    {"table": "t2", "view": "vw_t2"},
                ]
            }
        }

        out = io.return_parquets_and_register_temp_views(spark, tables_load, env="dev")

        self.assertEqual(set(out.keys()), {"db1.sch1.t1", "db1.sch1.t2"})
        self.assertEqual(mock_load_latest.call_count, 2)
        self.assertEqual(df.createOrReplaceTempView.call_count, 2)

    @patch("gss_bi_udfs.io.load_latest_parquet")
    def test_parquets_register_temp_views(self, mock_load_latest):
        df = MagicMock()
        mock_load_latest.return_value = df
        spark = MagicMock()
        tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}

        io.parquets_register_temp_views(spark, tables_load, env="dev")

        df.createOrReplaceTempView.assert_called_once_with("vw_t1")

    @patch("gss_bi_udfs.io._ls_path")
    @patch("pandas.read_excel")
    def test_load_latest_excel(self, mock_read_excel, mock_ls_path):
        mock_ls_path.return_value = [
            SimpleNamespace(name="a_old", path="dbfs:/tmp/a_old", isFile=lambda: True),
            SimpleNamespace(name="b_new", path="dbfs:/tmp/b_new", isFile=lambda: True),
        ]
        mock_pdf = object()
        mock_read_excel.return_value = mock_pdf
        spark = MagicMock()
        expected_df = object()
        spark.createDataFrame.return_value = expected_df

        out = io.load_latest_excel(spark, "dom/sub/file", env="dev")

        self.assertIs(out, expected_df)
        mock_read_excel.assert_called_once_with("/tmp/b_new", header=0, engine="xlrd")
        spark.createDataFrame.assert_called_once_with(mock_pdf)

    def test_load_and_materialize_views_unknown_action(self):
        out = io.load_and_materialize_views("accion_inexistente")
        self.assertEqual(out, {})

    @patch("gss_bi_udfs.io.get_table_info")
    def test_save_table_to_delta_writes_delta(self, mock_get_table_info):
        mock_get_table_info.return_value = {
            "path": "/tmp/tbl",
            "full_table_name": "cat.sch.tbl",
        }
        df = MagicMock()
        writer = MagicMock()
        writer.format.return_value = writer
        writer.option.return_value = writer
        writer.mode.return_value = writer
        df.write = writer

        io.save_table_to_delta(df, "cat", "sch", "tbl")

        writer.format.assert_called_once_with("delta")
        writer.mode.assert_called_once_with("overwrite")
        writer.saveAsTable.assert_called_once_with("cat.sch.tbl")


if __name__ == "__main__":
    unittest.main()
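The tests above fix the call shapes of the io helpers without running them against a real cluster. A hedged sketch of the same calls on an actual SparkSession; the database, schema, table and view names are the placeholders from the tests, and matching parquet drops are assumed to exist:

from pyspark.sql import SparkSession

from gss_bi_udfs import io

spark = SparkSession.builder.getOrCreate()

# Picks the most recent parquet whose file name starts with the table prefix,
# as exercised by the tests above; returns None when nothing matches.
df = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
if df is not None:
    df.show(5)

# Registers temp views from a {database: {schema: [{"table": ..., "view": ...}]}} mapping.
tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}
io.parquets_register_temp_views(spark, tables_load, env="dev")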
tests/test_merges.py
ADDED
@@ -0,0 +1,65 @@

import unittest
from unittest.mock import MagicMock, patch

from gss_bi_udfs import merges


class TestMerges(unittest.TestCase):
    def test_merge_scd2_raises_if_business_key_is_missing(self):
        spark = MagicMock()
        df_dim_src = MagicMock()
        df_dim_src.columns = ["id", "descripcion"]

        with self.assertRaises(ValueError):
            merges.merge_scd2(
                spark=spark,
                df_dim_src=df_dim_src,
                table_name="cat.sch.dim",
                business_keys="codigo_negocio",
                surrogate_key="sk_dim",
            )

    @patch("gss_bi_udfs.merges.save_table_to_delta")
    @patch("gss_bi_udfs.merges.add_hashid")
    @patch("gss_bi_udfs.merges.get_table_info")
    def test_merge_scd2_full_load_path(self, mock_get_table_info, mock_add_hashid, mock_save_table):
        spark = MagicMock()
        spark.catalog.tableExists.return_value = False
        mock_get_table_info.return_value = {
            "catalog": "cat_dev",
            "schema": "sch",
            "table": "dim_cliente",
            "full_table_name": "cat_dev.sch.dim_cliente",
        }

        df_dim_src = MagicMock()
        df_dim_src.columns = ["codigo_negocio", "descripcion"]
        df_dim_src.withColumn.return_value = df_dim_src

        df_hashed = MagicMock()
        writer = MagicMock()
        writer.format.return_value = writer
        writer.mode.return_value = writer
        writer.option.return_value = writer
        df_hashed.write = writer
        mock_add_hashid.return_value = df_hashed

        merges.merge_scd2(
            spark=spark,
            df_dim_src=df_dim_src,
            table_name="cat.sch.dim_cliente",
            business_keys="codigo_negocio",
            surrogate_key="sk_dim_cliente",
        )

        mock_add_hashid.assert_called_once()
        add_hashid_args = mock_add_hashid.call_args.args
        self.assertEqual(add_hashid_args[1], ["codigo_negocio", "valid_from"])
        self.assertEqual(add_hashid_args[2], "sk_dim_cliente")

        self.assertEqual(mock_save_table.call_count, 1)
        writer.saveAsTable.assert_called_once_with("cat.sch.dim_cliente")


if __name__ == "__main__":
    unittest.main()
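The assertions above pin down merge_scd2's keyword interface and show that the business key plus valid_from feed the surrogate-key hash. A hedged sketch of a real call, assuming a Delta-enabled SparkSession and write access to the target catalog; the table and column names are placeholders taken from the test:

from pyspark.sql import SparkSession

from gss_bi_udfs import merges

spark = SparkSession.builder.getOrCreate()

# Source rows for the dimension; the business key column must be present.
df_dim_src = spark.createDataFrame(
    [("C001", "Cliente uno"), ("C002", "Cliente dos")],
    ["codigo_negocio", "descripcion"],
)

merges.merge_scd2(
    spark=spark,
    df_dim_src=df_dim_src,
    table_name="cat.sch.dim_cliente",  # placeholder catalog.schema.table
    business_keys="codigo_negocio",
    surrogate_key="sk_dim_cliente",
)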
tests/test_transforms.py
ADDED
@@ -0,0 +1,66 @@

import unittest
from types import SimpleNamespace
from unittest.mock import MagicMock, patch

from gss_bi_udfs import transforms


class _FakeCol:
    def __init__(self, name):
        self.name = name

    def cast(self, dtype):
        return f"{self.name}:{dtype}"


class TestTransforms(unittest.TestCase):
    def test_add_hashid_raises_when_columns_empty(self):
        with self.assertRaises(ValueError):
            transforms.add_hashid(MagicMock(), [])

    @patch("gss_bi_udfs.transforms.xxhash64")
    @patch("gss_bi_udfs.transforms.concat_ws")
    @patch("gss_bi_udfs.transforms.col")
    def test_add_hashid_builds_hash_and_reorders_columns(self, mock_col, mock_concat_ws, mock_xxhash64):
        mock_col.side_effect = lambda name: _FakeCol(name)
        mock_concat_ws.return_value = "concat_expr"
        mock_xxhash64.return_value = "hash_expr"

        df = MagicMock()
        df.columns = ["id", "name"]
        df_with_hash = MagicMock()
        df.withColumn.return_value = df_with_hash
        df_with_hash.select.return_value = "result_df"

        out = transforms.add_hashid(df, ["id", "name"], "hash_pk")

        self.assertEqual(out, "result_df")
        df.withColumn.assert_called_once_with("hash_pk", "hash_expr")
        df_with_hash.select.assert_called_once_with("hash_pk", "id", "name")

    @patch("gss_bi_udfs.transforms.get_default_value_by_type")
    def test_get_default_record_builds_single_row_with_schema_defaults(self, mock_defaults):
        mock_defaults.side_effect = lambda dtype: f"default_for_{dtype}"
        spark = MagicMock()
        expected = object()
        spark.createDataFrame.return_value = expected

        df = MagicMock()
        df.schema = SimpleNamespace(
            fields=[
                SimpleNamespace(name="id", dataType="int"),
                SimpleNamespace(name="desc", dataType="string"),
            ]
        )

        out = transforms.get_default_record(spark, df)

        self.assertIs(out, expected)
        spark.createDataFrame.assert_called_once_with(
            [{"id": "default_for_int", "desc": "default_for_string"}],
            schema=df.schema,
        )


if __name__ == "__main__":
    unittest.main()
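For reference, the same two transforms against a real DataFrame. A hedged sketch, assuming an active SparkSession, with column names taken from the tests above:

from pyspark.sql import SparkSession

from gss_bi_udfs import transforms

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])

# Adds an xxhash64-based surrogate key over the listed columns and moves it to the
# front, matching the column order asserted in the test above.
df_hashed = transforms.add_hashid(df, ["id", "name"], "hash_pk")
df_hashed.show()

# Builds a single 'default/unknown' row using get_default_value_by_type per field.
transforms.get_default_record(spark, df).show()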