gss-bi-udfs 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gss_bi_udfs/io.py +49 -4
- {gss_bi_udfs-0.1.1.dist-info → gss_bi_udfs-0.1.3.dist-info}/METADATA +1 -2
- gss_bi_udfs-0.1.3.dist-info/RECORD +18 -0
- {gss_bi_udfs-0.1.1.dist-info → gss_bi_udfs-0.1.3.dist-info}/WHEEL +1 -1
- gss_bi_udfs-0.1.3.dist-info/top_level.txt +4 -0
- scripts/run_tests.py +125 -0
- tests/test_io.py +128 -0
- tests/test_merges.py +65 -0
- tests/test_transforms.py +66 -0
- tests/test_utils.py +119 -0
- workspace/main.py +0 -0
- workspace/prueba.py +10 -0
- workspace/prueba_calculadora.py +20 -0
- workspace/tests/test_prueba_calculadora.py +14 -0
- gss_bi_udfs-0.1.1.dist-info/RECORD +0 -9
- gss_bi_udfs-0.1.1.dist-info/top_level.txt +0 -1
gss_bi_udfs/io.py
CHANGED

@@ -1,5 +1,48 @@
+from pathlib import Path
+from datetime import datetime
 from .utils import get_env, get_table_info
 
+
+class _LocalFileInfo:
+    # only for use of the library in local environments
+    def __init__(self, path: str):
+        self.path = path
+        p = Path(path)
+        self.name = p.name
+        self.size = p.stat().st_size if p.exists() else 0
+        self.modificationTime = int(p.stat().st_mtime * 1000) if p.exists() else 0
+
+    def isFile(self) -> bool:
+        return Path(self.path).is_file()
+
+    def __repr__(self) -> str:
+        return (
+            "FileInfo("
+            f"path='{self.path}', "
+            f"name='{self.name}', "
+            f"size={self.size}, "
+            f"modificationTime={self.modificationTime}"
+            ")"
+        )
+
+def _normalize_path(path):
+    if path.startswith("dbfs:"):
+        return path.replace("dbfs:", "", 1)
+    return path
+
+
+def _ls_path(base_path):
+    try:
+        # Databricks runtime provides dbutils in globals.
+        files = dbutils.fs.ls(base_path)  # type: ignore
+        return files
+    except Exception:
+        local_path = _normalize_path(base_path)
+        p = Path(local_path)
+        if not p.exists():
+            return []
+        return [_LocalFileInfo(str(child)) for child in p.iterdir()]
+
 # def load_latest_file_bronze(spark, data_base, schema, table, env=None):
 def load_latest_parquet(spark, data_base, schema, table, env=None):
     """

@@ -18,9 +61,10 @@ def load_latest_parquet(spark, data_base, schema, table, env=None):
     """
     env = env or get_env()
     base_path = f"/Volumes/bronze/{data_base}_{schema}/{env}/{table}/"
+    print("Ruta base:", base_path)
 
     try:
-        files =
+        files = _ls_path(base_path)
 
         parquet_files = [f for f in files if table in f.name]
 

@@ -178,7 +222,7 @@ def load_latest_excel(spark, source_file, env=None):
     print("Ruta base:", base_path)
 
     try:
-        files =
+        files = _ls_path(base_path)
         print("Archivos encontrados:", [f.name for f in files])
         excel_candidates = [f for f in files if f.isFile()]
 

@@ -330,12 +374,13 @@ def load_and_materialize_views(action, **kwargs):
     return results
 
 
-def save_table_to_delta(df, catalog, schema, table_name):
+def save_table_to_delta(spark, df, catalog, schema, table_name):
     """
     Saves a DataFrame in Delta format to the specified location and table,
     overwriting the existing data and the schema if necessary.
 
     Parameters:
+        spark (SparkSession): Active Spark session.
         df (DataFrame): Spark DataFrame to be saved.
         db_name (str): Name of the target catalog or database.
         schema (str): Name of the target schema, layer or environment (e.g. 'silver', 'gold').

@@ -363,7 +408,7 @@ def save_table_to_delta(df, catalog, schema, table_name):
     - If both options are used at the same time, only one takes effect (the last one specified is applied).
 
     """
-    dim_destino = get_table_info(catalog=catalog, schema=schema, table=table_name)
+    dim_destino = get_table_info(spark=spark, catalog=catalog, schema=schema, table=table_name)
     (
         df.write
         .format("delta")
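Taken together, the new helpers make io.py usable off-cluster: _ls_path first tries dbutils.fs.ls and, when dbutils is not defined outside a Databricks runtime, lists the same path with pathlib, wrapping each entry in _LocalFileInfo so callers can keep using .name, .path and .isFile(). The other visible API change is that save_table_to_delta (and the underlying get_table_info call) now receives the SparkSession explicitly. A minimal usage sketch of the 0.1.3 signatures, assuming an active session named spark and a DataFrame df, with illustrative catalog/schema/table names that are not from the package:

# Sketch only: exercises the 0.1.3 signatures shown in the diff above.
from gss_bi_udfs.io import load_latest_parquet, save_table_to_delta

# loads the newest matching parquet under the bronze volume path built inside the function
df_bronze = load_latest_parquet(spark, "ventas", "comercial", "clientes", env="dev")

# new signature: the Spark session is passed explicitly as the first argument
save_table_to_delta(spark, df, catalog="fi_comunes", schema="silver", table_name="dim_cliente")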
{gss_bi_udfs-0.1.1.dist-info → gss_bi_udfs-0.1.3.dist-info}/METADATA
CHANGED

@@ -1,11 +1,10 @@
 Metadata-Version: 2.4
 Name: gss-bi-udfs
-Version: 0.1.1
+Version: 0.1.3
 Summary: Utilidades reutilizables para Spark y Delta Lake en arquitecturas Lakehouse.
 Author: Geronimo Forconi
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: pyspark>=3.0.0
 
 # gss-bi-udfs
 
gss_bi_udfs-0.1.3.dist-info/RECORD
ADDED

@@ -0,0 +1,18 @@
+gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
+gss_bi_udfs/io.py,sha256=yneOHVcHbcSOuAy02jS3_qm7YXGyeDpXFk9e-5VJ36A,17246
+gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
+gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
+gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
+scripts/run_tests.py,sha256=6yG35rkURojbHmEnLKkPnHSn5bmViP7yJwng5hXj9xs,4407
+tests/test_io.py,sha256=oFTJK6UZJXec53lPoArWSHKPRpNGbhI8ZVb1ZjfXW8U,4847
+tests/test_merges.py,sha256=_PHYRU0DwRn5Vg05clz8jL7_d8QutWiiTACHLiNPrZo,2221
+tests/test_transforms.py,sha256=4fqKyemSV-4nfMzhTamaE5mWXnglV08uvw67sWj84Og,2206
+tests/test_utils.py,sha256=FUap5pqqEDvmBmBLeSBN39FoQDQSz3hpN4qCQrUniEU,4541
+workspace/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+workspace/prueba.py,sha256=n-zGeMhFRrMLp9cx-vvJYTSFMZHCYm_R-xmGiQvj1Fk,223
+workspace/prueba_calculadora.py,sha256=4pu4Wg-h_aALbqDsfSeJlPbyx9FpODIbHGNAse5Mz5M,512
+workspace/tests/test_prueba_calculadora.py,sha256=9cHXunht_EVwBgBELv5YlazkJMWSi97A5VnB9Mvs-kU,257
+gss_bi_udfs-0.1.3.dist-info/METADATA,sha256=u4tbdf_peTRGGTMzIMlsle_BRrQ56uTPEHs63GmfFm8,393
+gss_bi_udfs-0.1.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+gss_bi_udfs-0.1.3.dist-info/top_level.txt,sha256=NzlGPsUajFQDfmDbDhTCgu4LBvrm0d1shuPSJIBpLfw,36
+gss_bi_udfs-0.1.3.dist-info/RECORD,,
scripts/run_tests.py
ADDED

@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+import argparse
+import datetime as dt
+import html
+import os
+import sys
+import unittest
+
+
+class _CollectingTextTestResult(unittest.TextTestResult):
+    def __init__(self, stream, descriptions, verbosity):
+        super().__init__(stream, descriptions, verbosity)
+        self.successes = []
+
+    def addSuccess(self, test):
+        super().addSuccess(test)
+        self.successes.append(test)
+
+
+def _run_tests(start_dir: str, pattern: str, verbosity: int) -> unittest.TestResult:
+    suite = unittest.defaultTestLoader.discover(start_dir=start_dir, pattern=pattern)
+    runner = unittest.TextTestRunner(verbosity=verbosity, resultclass=_CollectingTextTestResult)
+    return runner.run(suite)
+
+
+def _timestamped_path(path: str) -> str:
+    ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
+    base, ext = os.path.splitext(path)
+    if ext:
+        return f"{base}_{ts}{ext}"
+    return f"{path}_{ts}"
+
+
+def _render_html(result: unittest.TestResult, output_path: str) -> None:
+    rows = []
+    for test in getattr(result, "successes", []):
+        rows.append(("PASS", str(test), ""))
+
+    for test, tb in result.failures + result.errors:
+        rows.append(
+            (
+                "FAIL" if (test, tb) in result.failures else "ERROR",
+                str(test),
+                tb,
+            )
+        )
+
+    for test, reason in result.skipped:
+        rows.append(("SKIP", str(test), reason))
+
+    status = "PASSED" if result.wasSuccessful() else "FAILED"
+    now = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    html_body = [
+        "<!doctype html>",
+        "<html lang='en'>",
+        "<head>",
+        " <meta charset='utf-8'/>",
+        " <title>Unit Test Report</title>",
+        " <style>",
+        " body { font-family: Arial, sans-serif; margin: 24px; }",
+        " .ok { color: #0f5132; }",
+        " .bad { color: #842029; }",
+        " table { border-collapse: collapse; width: 100%; margin-top: 16px; }",
+        " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }",
+        " th { background: #f5f5f5; }",
+        " pre { white-space: pre-wrap; margin: 0; }",
+        " </style>",
+        "</head>",
+        "<body>",
+        f" <h1>Unit Test Report</h1>",
+        f" <p><strong>Generated:</strong> {html.escape(now)}</p>",
+        f" <p><strong>Status:</strong> <span class='{'ok' if result.wasSuccessful() else 'bad'}'>{status}</span></p>",
+        f" <p><strong>Ran:</strong> {result.testsRun} tests</p>",
+        f" <p><strong>Pass:</strong> {len(getattr(result, 'successes', []))} | <strong>Failures:</strong> {len(result.failures)} | <strong>Errors:</strong> {len(result.errors)} | <strong>Skipped:</strong> {len(result.skipped)}</p>",
+        " <table>",
+        " <thead><tr><th>Type</th><th>Test</th><th>Details</th></tr></thead>",
+        " <tbody>",
+    ]
+
+    if rows:
+        for row_type, test_name, details in rows:
+            html_body.append(
+                " <tr>"
+                f"<td>{html.escape(row_type)}</td>"
+                f"<td>{html.escape(test_name)}</td>"
+                f"<td><pre>{html.escape(details)}</pre></td>"
+                "</tr>"
+            )
+    else:
+        html_body.append(" <tr><td colspan='3'>No failures, errors or skipped tests.</td></tr>")
+
+    html_body += [
+        " </tbody>",
+        " </table>",
+        "</body>",
+        "</html>",
+    ]
+
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(html_body))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run unittests and optionally write HTML report.")
+    parser.add_argument("--start-dir", default="tests")
+    parser.add_argument("--pattern", default="test*.py")
+    parser.add_argument("--verbosity", type=int, default=2)
+    parser.add_argument("--html", default="")
+    parser.add_argument("--timestamped", action="store_true")
+    args = parser.parse_args()
+
+    result = _run_tests(args.start_dir, args.pattern, args.verbosity)
+
+    if args.html:
+        output_path = _timestamped_path(args.html) if args.timestamped else args.html
+        _render_html(result, output_path)
+        print(f"HTML report written to: {output_path}")
+
+    return 0 if result.wasSuccessful() else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
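For orientation, a plausible way to drive this script from Python rather than the shell; the report path below is an assumption, while the flags are exactly the ones defined by the argparse parser above:

# Hypothetical invocation of scripts/run_tests.py; uses only the flags its parser defines.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "scripts/run_tests.py",
        "--start-dir", "tests",
        "--pattern", "test*.py",
        "--html", "reports/unit_report.html",  # rendered by _render_html
        "--timestamped",                       # appends _YYYYMMDD_HHMMSS before the extension
    ],
    check=False,  # the script reports failure via its exit code
)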
tests/test_io.py
ADDED

@@ -0,0 +1,128 @@
+import tempfile
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import io
+
+
+class TestIO(unittest.TestCase):
+    def test_normalize_path(self):
+        self.assertEqual(io._normalize_path("dbfs:/tmp/a.parquet"), "/tmp/a.parquet")
+        self.assertEqual(io._normalize_path("/tmp/a.parquet"), "/tmp/a.parquet")
+
+    def test_ls_path_local(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "file.txt"
+            file_path.write_text("x", encoding="utf-8")
+            (Path(tmpdir) / "folder").mkdir()
+
+            files = io._ls_path(tmpdir)
+            names = sorted([f.name for f in files])
+            self.assertEqual(names, ["file.txt", "folder"])
+
+        self.assertEqual(io._ls_path("/path/that/does/not/exist"), [])
+
+    @patch("gss_bi_udfs.io._ls_path")
+    def test_load_latest_parquet_returns_latest_match(self, mock_ls_path):
+        mock_ls_path.return_value = [
+            SimpleNamespace(name="clientes_20240101.parquet", path="/tmp/1"),
+            SimpleNamespace(name="clientes_20240102.parquet", path="/tmp/2"),
+            SimpleNamespace(name="otra_tabla_20240103.parquet", path="/tmp/3"),
+        ]
+        spark = MagicMock()
+        expected_df = object()
+        spark.read.parquet.return_value = expected_df
+
+        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+        self.assertIs(out, expected_df)
+        spark.read.parquet.assert_called_once_with("/tmp/2")
+
+    @patch("gss_bi_udfs.io._ls_path")
+    def test_load_latest_parquet_returns_none_without_matches(self, mock_ls_path):
+        mock_ls_path.return_value = [SimpleNamespace(name="x.parquet", path="/tmp/x")]
+        spark = MagicMock()
+
+        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+        self.assertIsNone(out)
+        spark.read.parquet.assert_not_called()
+
+    @patch("gss_bi_udfs.io.load_latest_parquet")
+    def test_return_parquets_and_register_temp_views(self, mock_load_latest):
+        df = MagicMock()
+        mock_load_latest.return_value = df
+        spark = MagicMock()
+        tables_load = {
+            "db1": {
+                "sch1": [
+                    {"table": "t1", "view": "vw_t1"},
+                    {"table": "t2", "view": "vw_t2"},
+                ]
+            }
+        }
+
+        out = io.return_parquets_and_register_temp_views(spark, tables_load, env="dev")
+
+        self.assertEqual(set(out.keys()), {"db1.sch1.t1", "db1.sch1.t2"})
+        self.assertEqual(mock_load_latest.call_count, 2)
+        self.assertEqual(df.createOrReplaceTempView.call_count, 2)
+
+    @patch("gss_bi_udfs.io.load_latest_parquet")
+    def test_parquets_register_temp_views(self, mock_load_latest):
+        df = MagicMock()
+        mock_load_latest.return_value = df
+        spark = MagicMock()
+        tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}
+
+        io.parquets_register_temp_views(spark, tables_load, env="dev")
+
+        df.createOrReplaceTempView.assert_called_once_with("vw_t1")
+
+    @patch("gss_bi_udfs.io._ls_path")
+    @patch("pandas.read_excel")
+    def test_load_latest_excel(self, mock_read_excel, mock_ls_path):
+        mock_ls_path.return_value = [
+            SimpleNamespace(name="a_old", path="dbfs:/tmp/a_old", isFile=lambda: True),
+            SimpleNamespace(name="b_new", path="dbfs:/tmp/b_new", isFile=lambda: True),
+        ]
+        mock_pdf = object()
+        mock_read_excel.return_value = mock_pdf
+        spark = MagicMock()
+        expected_df = object()
+        spark.createDataFrame.return_value = expected_df
+
+        out = io.load_latest_excel(spark, "dom/sub/file", env="dev")
+
+        self.assertIs(out, expected_df)
+        mock_read_excel.assert_called_once_with("/tmp/b_new", header=0, engine="xlrd")
+        spark.createDataFrame.assert_called_once_with(mock_pdf)
+
+    def test_load_and_materialize_views_unknown_action(self):
+        out = io.load_and_materialize_views("accion_inexistente")
+        self.assertEqual(out, {})
+
+    @patch("gss_bi_udfs.io.get_table_info")
+    def test_save_table_to_delta_writes_delta(self, mock_get_table_info):
+        mock_get_table_info.return_value = {
+            "path": "/tmp/tbl",
+            "full_table_name": "cat.sch.tbl",
+        }
+        df = MagicMock()
+        writer = MagicMock()
+        writer.format.return_value = writer
+        writer.option.return_value = writer
+        writer.mode.return_value = writer
+        df.write = writer
+
+        io.save_table_to_delta(df, "cat", "sch", "tbl")
+
+        writer.format.assert_called_once_with("delta")
+        writer.mode.assert_called_once_with("overwrite")
+        writer.saveAsTable.assert_called_once_with("cat.sch.tbl")
+
+
+if __name__ == "__main__":
+    unittest.main()
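Because _ls_path falls back to a plain pathlib listing and the Spark objects are replaced with MagicMock, this suite can run without a Databricks runtime (test_utils.py still imports pyspark.sql.types, so pyspark must be installed). A minimal sketch of running it programmatically, mirroring what scripts/run_tests.py does and assuming the wheel's tests/ directory sits in the working directory:

# Sketch: discover and run the bundled tests with the standard-library runner.
import unittest

suite = unittest.defaultTestLoader.discover(start_dir="tests", pattern="test*.py")
unittest.TextTestRunner(verbosity=2).run(suite)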
tests/test_merges.py
ADDED

@@ -0,0 +1,65 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import merges
+
+
+class TestMerges(unittest.TestCase):
+    def test_merge_scd2_raises_if_business_key_is_missing(self):
+        spark = MagicMock()
+        df_dim_src = MagicMock()
+        df_dim_src.columns = ["id", "descripcion"]
+
+        with self.assertRaises(ValueError):
+            merges.merge_scd2(
+                spark=spark,
+                df_dim_src=df_dim_src,
+                table_name="cat.sch.dim",
+                business_keys="codigo_negocio",
+                surrogate_key="sk_dim",
+            )
+
+    @patch("gss_bi_udfs.merges.save_table_to_delta")
+    @patch("gss_bi_udfs.merges.add_hashid")
+    @patch("gss_bi_udfs.merges.get_table_info")
+    def test_merge_scd2_full_load_path(self, mock_get_table_info, mock_add_hashid, mock_save_table):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = False
+        mock_get_table_info.return_value = {
+            "catalog": "cat_dev",
+            "schema": "sch",
+            "table": "dim_cliente",
+            "full_table_name": "cat_dev.sch.dim_cliente",
+        }
+
+        df_dim_src = MagicMock()
+        df_dim_src.columns = ["codigo_negocio", "descripcion"]
+        df_dim_src.withColumn.return_value = df_dim_src
+
+        df_hashed = MagicMock()
+        writer = MagicMock()
+        writer.format.return_value = writer
+        writer.mode.return_value = writer
+        writer.option.return_value = writer
+        df_hashed.write = writer
+        mock_add_hashid.return_value = df_hashed
+
+        merges.merge_scd2(
+            spark=spark,
+            df_dim_src=df_dim_src,
+            table_name="cat.sch.dim_cliente",
+            business_keys="codigo_negocio",
+            surrogate_key="sk_dim_cliente",
+        )
+
+        mock_add_hashid.assert_called_once()
+        add_hashid_args = mock_add_hashid.call_args.args
+        self.assertEqual(add_hashid_args[1], ["codigo_negocio", "valid_from"])
+        self.assertEqual(add_hashid_args[2], "sk_dim_cliente")
+
+        self.assertEqual(mock_save_table.call_count, 1)
+        writer.saveAsTable.assert_called_once_with("cat.sch.dim_cliente")
+
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_transforms.py
ADDED

@@ -0,0 +1,66 @@
+import unittest
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import transforms
+
+
+class _FakeCol:
+    def __init__(self, name):
+        self.name = name
+
+    def cast(self, dtype):
+        return f"{self.name}:{dtype}"
+
+
+class TestTransforms(unittest.TestCase):
+    def test_add_hashid_raises_when_columns_empty(self):
+        with self.assertRaises(ValueError):
+            transforms.add_hashid(MagicMock(), [])
+
+    @patch("gss_bi_udfs.transforms.xxhash64")
+    @patch("gss_bi_udfs.transforms.concat_ws")
+    @patch("gss_bi_udfs.transforms.col")
+    def test_add_hashid_builds_hash_and_reorders_columns(self, mock_col, mock_concat_ws, mock_xxhash64):
+        mock_col.side_effect = lambda name: _FakeCol(name)
+        mock_concat_ws.return_value = "concat_expr"
+        mock_xxhash64.return_value = "hash_expr"
+
+        df = MagicMock()
+        df.columns = ["id", "name"]
+        df_with_hash = MagicMock()
+        df.withColumn.return_value = df_with_hash
+        df_with_hash.select.return_value = "result_df"
+
+        out = transforms.add_hashid(df, ["id", "name"], "hash_pk")
+
+        self.assertEqual(out, "result_df")
+        df.withColumn.assert_called_once_with("hash_pk", "hash_expr")
+        df_with_hash.select.assert_called_once_with("hash_pk", "id", "name")
+
+    @patch("gss_bi_udfs.transforms.get_default_value_by_type")
+    def test_get_default_record_builds_single_row_with_schema_defaults(self, mock_defaults):
+        mock_defaults.side_effect = lambda dtype: f"default_for_{dtype}"
+        spark = MagicMock()
+        expected = object()
+        spark.createDataFrame.return_value = expected
+
+        df = MagicMock()
+        df.schema = SimpleNamespace(
+            fields=[
+                SimpleNamespace(name="id", dataType="int"),
+                SimpleNamespace(name="desc", dataType="string"),
+            ]
+        )
+
+        out = transforms.get_default_record(spark, df)
+
+        self.assertIs(out, expected)
+        spark.createDataFrame.assert_called_once_with(
+            [{"id": "default_for_int", "desc": "default_for_string"}],
+            schema=df.schema,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_utils.py
ADDED

@@ -0,0 +1,119 @@
+import os
+import unittest
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from pyspark.sql.types import (
+    BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FloatType,
+    IntegerType,
+    LongType,
+    StringType,
+    TimestampType,
+)
+
+from gss_bi_udfs import utils
+
+
+class TestUtils(unittest.TestCase):
+    def test_get_env_uses_default_and_env_var(self):
+        with patch.dict(os.environ, {}, clear=True):
+            self.assertEqual(utils.get_env(), "dev")
+            self.assertEqual(utils.get_env(default="qa"), "qa")
+
+        with patch.dict(os.environ, {"ENV": "prod"}, clear=True):
+            self.assertEqual(utils.get_env(), "prod")
+
+    def test_get_env_catalog(self):
+        with patch.dict(os.environ, {"ENV": "pro"}, clear=True):
+            self.assertEqual(utils.get_env_catalog("fi_comunes"), "fi_comunes")
+        with patch.dict(os.environ, {"ENV": "dev"}, clear=True):
+            self.assertEqual(utils.get_env_catalog("fi_comunes"), "fi_comunes_dev")
+
+    def test_get_env_table_path(self):
+        with patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev"):
+            self.assertEqual(
+                utils.get_env_table_path("cat", "silver.dim_cliente"),
+                "cat_dev.silver.dim_cliente",
+            )
+
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_schema_root_location(self, _mock_catalog):
+        spark = MagicMock()
+        df = MagicMock()
+        df.filter.return_value = df
+        df.select.return_value = df
+        df.collect.return_value = [["s3://bucket/root"]]
+        spark.sql.return_value = df
+
+        out = utils.get_schema_root_location(spark, "cat", "silver")
+
+        self.assertEqual(out, "s3://bucket/root")
+        spark.sql.assert_called_once_with("DESCRIBE SCHEMA EXTENDED cat_dev.silver")
+
+    def test_get_table_info_validations(self):
+        spark = MagicMock()
+        with self.assertRaises(ValueError):
+            utils.get_table_info(spark, full_table_name="solo.dos")
+        with self.assertRaises(ValueError):
+            utils.get_table_info(spark)
+
+    @patch("gss_bi_udfs.utils.get_schema_root_location", return_value="s3://bucket/root/silver")
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_table_info_when_table_does_not_exist(self, _mock_env_catalog, _mock_root):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = False
+
+        info = utils.get_table_info(spark, full_table_name="cat.silver.dim_cliente")
+
+        self.assertEqual(info["catalog"], "cat_dev")
+        self.assertEqual(info["schema"], "silver")
+        self.assertEqual(info["table"], "dim_cliente")
+        self.assertEqual(info["full_table_name"], "cat_dev.silver.dim_cliente")
+        self.assertEqual(info["path"], "s3://bucket/root/silver/dim_cliente")
+        self.assertFalse(info["exists"])
+
+    @patch("gss_bi_udfs.utils.get_schema_root_location", return_value="s3://bucket/root/silver")
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_table_info_when_table_exists(self, _mock_env_catalog, _mock_root):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = True
+        desc_df = MagicMock()
+        desc_df.filter.return_value = desc_df
+        desc_df.collect.return_value = [
+            SimpleNamespace(col_name="Location", data_type="s3://bucket/real/location"),
+            SimpleNamespace(col_name="Provider", data_type="delta"),
+            SimpleNamespace(col_name="Type", data_type="MANAGED"),
+        ]
+        spark.sql.return_value = desc_df
+
+        info = utils.get_table_info(spark, full_table_name="cat.silver.dim_cliente")
+
+        self.assertTrue(info["exists"])
+        self.assertEqual(info["path"], "s3://bucket/real/location")
+        self.assertEqual(info["provider"], "delta")
+        self.assertEqual(info["table_type"], "MANAGED")
+
+    def test_get_default_value_by_type_returns_column(self):
+        dtypes = [
+            IntegerType(),
+            LongType(),
+            DecimalType(10, 2),
+            DoubleType(),
+            FloatType(),
+            DateType(),
+            TimestampType(),
+            BooleanType(),
+            StringType(),
+        ]
+        for dtype in dtypes:
+            with self.subTest(dtype=dtype):
+                out = utils.get_default_value_by_type(dtype)
+                self.assertEqual(out.__class__.__name__, "Column")
+
+
+if __name__ == "__main__":
+    unittest.main()
workspace/main.py
ADDED

File without changes
workspace/prueba_calculadora.py
ADDED

@@ -0,0 +1,20 @@
+class Calculadora:
+    """Calculator class with basic operations"""
+
+    def sumar(self, a, b):
+        """Adds two numbers"""
+        return a + b
+
+    def restar(self, a, b):
+        """Subtracts two numbers"""
+        return a - b
+
+    def multiplicar(self, a, b):
+        """Multiplies two numbers"""
+        return a * b
+
+    def dividir(self, a, b):
+        """Divides two numbers"""
+        if b == 0:
+            raise ValueError("No se puede dividir entre cero")
+        return a / b
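The workspace/ files are scratch material rather than library code; for completeness, a hypothetical two-line use of the Calculadora sample above (the import path assumes workspace/ is on sys.path):

from prueba_calculadora import Calculadora  # assumed import path, not part of gss_bi_udfs

calc = Calculadora()
assert calc.sumar(2, 3) == 5 and calc.dividir(9, 3) == 3.0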
workspace/tests/test_prueba_calculadora.py
ADDED

@@ -0,0 +1,14 @@
+import pytest
+import sys
+sys.path.append("/workspace")
+
+from .prueba_calculadora import sumar, restar
+
+def test_sumar():
+    assert sumar(2, 3) == 5
+
+def test_restar():
+    assert restar(5, 3) == 2
+
+def test_sumar_negativos():
+    assert sumar(-1, -2) == -3
gss_bi_udfs-0.1.1.dist-info/RECORD
REMOVED

@@ -1,9 +0,0 @@
-gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
-gss_bi_udfs/io.py,sha256=yEqQvpyBod9kIv7p-_5yLtINuIwsi-piWy5rKI3BgQk,15939
-gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
-gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
-gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
-gss_bi_udfs-0.1.1.dist-info/METADATA,sha256=q241xBvvuhhJRUL1wIGB_JKCkTxXAF9HY13yYjV3Ae8,423
-gss_bi_udfs-0.1.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-gss_bi_udfs-0.1.1.dist-info/top_level.txt,sha256=jLjGHQoep6-wLbW6wFV611Zx4ak42Q9hKtH_3sUzX9o,12
-gss_bi_udfs-0.1.1.dist-info/RECORD,,
gss_bi_udfs-0.1.1.dist-info/top_level.txt
REMOVED

@@ -1 +0,0 @@
-gss_bi_udfs