gss-bi-udfs 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as published in their respective public registries.
gss_bi_udfs/utils.py ADDED
@@ -0,0 +1,185 @@
+ import os
+ from pyspark.sql.types import (IntegerType, LongType, ShortType, ByteType,
+                                DecimalType, DoubleType, FloatType,
+                                DateType, TimestampType, BooleanType, StringType)
+ from pyspark.sql.functions import lit, concat_ws, col
+ from pyspark.sql import DataFrame, Column
+
+
+ def get_env(default="dev"):
+     """
+     Returns the execution environment taken from the ENV environment variable.
+     If ENV is not set, the given default value is returned.
+
+     Parameters:
+     - default (str): default environment to use when ENV is not set.
+
+     Returns:
+     - str: name of the execution environment (e.g. 'dev', 'qa', 'prod').
+     """
+
+     return os.getenv("ENV", default)
+
+ def get_env_catalog(catalog):
+     """
+     Builds the catalog name adjusted to the current environment.
+
+     Parameters:
+         catalog (str): base catalog name (e.g. 'fi_comunes').
+
+     Returns:
+         str: catalog name adjusted to the environment.
+              Example: 'fi_comunes_dev' if ENV='dev'
+                       'fi_comunes' if ENV='pro'
+     """
+
+     if get_env() == "pro":
+         return catalog
+     else:
+         return f"{catalog}_{get_env()}"
+
+
+ def get_env_table_path(catalog, table_path):
+     """
+     Builds the full table path, adding the environment suffix to the catalog.
+
+     Parameters:
+         catalog (str): base catalog name (e.g. 'fi_comunes').
+         table_path (str): table path including schema and table name (e.g. 'silver.dim_afiliado').
+
+     Returns:
+         str: full table path adjusted to the environment.
+              Example: 'fi_comunes_dev.silver.dim_afiliado' if ENV='dev'
+                       'fi_comunes.silver.dim_afiliado' if ENV='pro'
+     """
+
+     # Concatenate the environment-adjusted catalog with the table path
+     return f"{get_env_catalog(catalog)}.{table_path}"
+
+ def get_schema_root_location(spark, catalog, schema):
+     """
+     Returns the physical location (RootLocation) of a schema, using the environment-adjusted catalog.
+
+     Parameters:
+         catalog (str): base catalog name (e.g. 'fi_comunes').
+         schema (str): schema name within the catalog (e.g. 'silver').
+
+     Returns:
+         str: physical path where the schema data is stored.
+              Example: 's3://bucket/path/fi_comunes_dev/silver' if ENV='dev'
+
+     Requires:
+     - get_env_catalog must be defined and return the environment-adjusted catalog name.
+     - An active SparkSession with permission to run `DESCRIBE SCHEMA EXTENDED`.
+
+     Example:
+         >>> get_schema_root_location(spark, "fi_comunes", "silver")
+         's3://mi-bucket/datalake/fi_comunes_dev/silver'
+     """
+     cat = get_env_catalog(catalog)
+     df = spark.sql(f"DESCRIBE SCHEMA EXTENDED {cat}.{schema}")
+     return df.filter("database_description_item = 'RootLocation'") \
+              .select("database_description_value") \
+              .collect()[0][0]
+
+ def get_table_info(
+     spark,
+     *,
+     full_table_name: str = None,
+     catalog: str = None,
+     schema: str = None,
+     table: str = None
+ ) -> dict:
+     """
+     Returns information about a table, given either:
+     - full_table_name (catalog.schema.table)
+     or
+     - catalog + schema + table
+     """
+
+     # -----------------------------
+     # 1. Resolve inputs
+     # -----------------------------
+     if full_table_name:
+         parts = full_table_name.split(".")
+         if len(parts) != 3:
+             raise ValueError(
+                 "full_table_name must have the format catalog.schema.table"
+             )
+         catalog, schema, table = parts
+
+     elif catalog and schema and table:
+         full_table_name = f"{catalog}.{schema}.{table}"
+
+     else:
+         raise ValueError(
+             "Either full_table_name or catalog + schema + table must be provided"
+         )
+
+     # -----------------------------
+     # 2. Environment catalog
+     # -----------------------------
+     catalog_env = get_env_catalog(catalog)
+
+     # -----------------------------
+     # 3. Physical path
+     # -----------------------------
+     root_location = get_schema_root_location(spark, catalog, schema)
+     path = f"{root_location.rstrip('/')}/{table}"
+
+     # -----------------------------
+     # 4. Spark metadata (if the table exists)
+     # -----------------------------
+     info = {
+         "catalog": catalog_env,
+         "schema": schema,
+         "table": table,
+         "full_table_name": f"{catalog_env}.{schema}.{table}",
+         "path": path,
+         "exists": False,
+         "provider": None,
+         "table_type": None,
+     }
+
+     if spark.catalog.tableExists(info["full_table_name"]):
+         info["exists"] = True
+
+         desc = (
+             spark.sql(f"DESCRIBE EXTENDED {info['full_table_name']}")
+             .filter("col_name in ('Location', 'Provider', 'Type')")
+             .collect()
+         )
+
+         for row in desc:
+             if row.col_name == "Location":
+                 info["path"] = row.data_type
+             elif row.col_name == "Provider":
+                 info["provider"] = row.data_type
+             elif row.col_name == "Type":
+                 info["table_type"] = row.data_type
+
+     return info
+
+
+
+ def get_default_value_by_type(dtype):
+     """
+     Returns a per-type "default" value for 'default/unknown' records.
+     Parameters:
+     - dtype: PySpark data type (DataType).
+     Returns:
+     - default value for the given data type.
+     """
+     if isinstance(dtype, (IntegerType, LongType, ShortType, ByteType)):
+         return lit(-999)
+     if isinstance(dtype, (DecimalType, DoubleType, FloatType)):
+         return lit(-999)
+     if isinstance(dtype, (DateType, TimestampType)):
+         return lit("1900-01-01").cast(dtype)
+     if isinstance(dtype, BooleanType):
+         return lit(False)
+     if isinstance(dtype, StringType):
+         return lit("N/A")
+     return lit(None)
+
+
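A minimal usage sketch (not part of the package) of the environment helpers above; the catalog and table names come from the docstring examples, and the get_table_info call assumes an active SparkSession named spark:

    import os
    from gss_bi_udfs.utils import get_env, get_env_catalog, get_env_table_path, get_table_info

    os.environ["ENV"] = "dev"
    get_env()                                                # 'dev'
    get_env_catalog("fi_comunes")                            # 'fi_comunes_dev'
    get_env_table_path("fi_comunes", "silver.dim_afiliado")  # 'fi_comunes_dev.silver.dim_afiliado'

    # Needs an active SparkSession and catalog permissions:
    # info = get_table_info(spark, full_table_name="fi_comunes.silver.dim_afiliado")
    # info["exists"], info["path"], info["provider"]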
@@ -1,13 +1,20 @@
  Metadata-Version: 2.4
  Name: gss-bi-udfs
- Version: 0.1.0
+ Version: 0.1.2
  Summary: Reusable utilities for Spark and Delta Lake in Lakehouse architectures.
  Author: Geronimo Forconi
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: pyspark>=3.0.0

  # gss-bi-udfs
+
  Module created to store UDFs shared across all BI areas.


+ # To build
+
+ python3 -m build
+
+ # To publish
+
+ python3 -m twine upload dist/*
@@ -0,0 +1,18 @@
+ gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
+ gss_bi_udfs/io.py,sha256=BZVf1BACH71MQFn-png1828E2WnNEwqTq3AxZR_UZWk,17173
+ gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
+ gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
+ gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
+ scripts/run_tests.py,sha256=6yG35rkURojbHmEnLKkPnHSn5bmViP7yJwng5hXj9xs,4407
+ tests/test_io.py,sha256=oFTJK6UZJXec53lPoArWSHKPRpNGbhI8ZVb1ZjfXW8U,4847
+ tests/test_merges.py,sha256=_PHYRU0DwRn5Vg05clz8jL7_d8QutWiiTACHLiNPrZo,2221
+ tests/test_transforms.py,sha256=4fqKyemSV-4nfMzhTamaE5mWXnglV08uvw67sWj84Og,2206
+ tests/test_utils.py,sha256=FUap5pqqEDvmBmBLeSBN39FoQDQSz3hpN4qCQrUniEU,4541
+ workspace/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ workspace/prueba.py,sha256=n-zGeMhFRrMLp9cx-vvJYTSFMZHCYm_R-xmGiQvj1Fk,223
+ workspace/prueba_calculadora.py,sha256=4pu4Wg-h_aALbqDsfSeJlPbyx9FpODIbHGNAse5Mz5M,512
+ workspace/tests/test_prueba_calculadora.py,sha256=9cHXunht_EVwBgBELv5YlazkJMWSi97A5VnB9Mvs-kU,257
+ gss_bi_udfs-0.1.2.dist-info/METADATA,sha256=xcsXvP6SEvUiMh2g6XG85DycKXsFqom_cpV4Xpz6rgI,393
+ gss_bi_udfs-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ gss_bi_udfs-0.1.2.dist-info/top_level.txt,sha256=NzlGPsUajFQDfmDbDhTCgu4LBvrm0d1shuPSJIBpLfw,36
+ gss_bi_udfs-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.10.1)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -0,0 +1,4 @@
+ gss_bi_udfs
+ scripts
+ tests
+ workspace
scripts/run_tests.py ADDED
@@ -0,0 +1,125 @@
+ #!/usr/bin/env python3
+ import argparse
+ import datetime as dt
+ import html
+ import os
+ import sys
+ import unittest
+
+
+ class _CollectingTextTestResult(unittest.TextTestResult):
+     def __init__(self, stream, descriptions, verbosity):
+         super().__init__(stream, descriptions, verbosity)
+         self.successes = []
+
+     def addSuccess(self, test):
+         super().addSuccess(test)
+         self.successes.append(test)
+
+
+ def _run_tests(start_dir: str, pattern: str, verbosity: int) -> unittest.TestResult:
+     suite = unittest.defaultTestLoader.discover(start_dir=start_dir, pattern=pattern)
+     runner = unittest.TextTestRunner(verbosity=verbosity, resultclass=_CollectingTextTestResult)
+     return runner.run(suite)
+
+
+ def _timestamped_path(path: str) -> str:
+     ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
+     base, ext = os.path.splitext(path)
+     if ext:
+         return f"{base}_{ts}{ext}"
+     return f"{path}_{ts}"
+
+
+ def _render_html(result: unittest.TestResult, output_path: str) -> None:
+     rows = []
+     for test in getattr(result, "successes", []):
+         rows.append(("PASS", str(test), ""))
+
+     for test, tb in result.failures + result.errors:
+         rows.append(
+             (
+                 "FAIL" if (test, tb) in result.failures else "ERROR",
+                 str(test),
+                 tb,
+             )
+         )
+
+     for test, reason in result.skipped:
+         rows.append(("SKIP", str(test), reason))
+
+     status = "PASSED" if result.wasSuccessful() else "FAILED"
+     now = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+     html_body = [
+         "<!doctype html>",
+         "<html lang='en'>",
+         "<head>",
+         " <meta charset='utf-8'/>",
+         " <title>Unit Test Report</title>",
+         " <style>",
+         " body { font-family: Arial, sans-serif; margin: 24px; }",
+         " .ok { color: #0f5132; }",
+         " .bad { color: #842029; }",
+         " table { border-collapse: collapse; width: 100%; margin-top: 16px; }",
+         " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }",
+         " th { background: #f5f5f5; }",
+         " pre { white-space: pre-wrap; margin: 0; }",
+         " </style>",
+         "</head>",
+         "<body>",
+         " <h1>Unit Test Report</h1>",
+         f" <p><strong>Generated:</strong> {html.escape(now)}</p>",
+         f" <p><strong>Status:</strong> <span class='{'ok' if result.wasSuccessful() else 'bad'}'>{status}</span></p>",
+         f" <p><strong>Ran:</strong> {result.testsRun} tests</p>",
+         f" <p><strong>Pass:</strong> {len(getattr(result, 'successes', []))} | <strong>Failures:</strong> {len(result.failures)} | <strong>Errors:</strong> {len(result.errors)} | <strong>Skipped:</strong> {len(result.skipped)}</p>",
+         " <table>",
+         " <thead><tr><th>Type</th><th>Test</th><th>Details</th></tr></thead>",
+         " <tbody>",
+     ]
+
+     if rows:
+         for row_type, test_name, details in rows:
+             html_body.append(
+                 " <tr>"
+                 f"<td>{html.escape(row_type)}</td>"
+                 f"<td>{html.escape(test_name)}</td>"
+                 f"<td><pre>{html.escape(details)}</pre></td>"
+                 "</tr>"
+             )
+     else:
+         html_body.append(" <tr><td colspan='3'>No failures, errors or skipped tests.</td></tr>")
+
+     html_body += [
+         " </tbody>",
+         " </table>",
+         "</body>",
+         "</html>",
+     ]
+
+     os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)  # handle bare file names with no directory part
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write("\n".join(html_body))
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(description="Run unittests and optionally write HTML report.")
+     parser.add_argument("--start-dir", default="tests")
+     parser.add_argument("--pattern", default="test*.py")
+     parser.add_argument("--verbosity", type=int, default=2)
+     parser.add_argument("--html", default="")
+     parser.add_argument("--timestamped", action="store_true")
+     args = parser.parse_args()
+
+     result = _run_tests(args.start_dir, args.pattern, args.verbosity)
+
+     if args.html:
+         output_path = _timestamped_path(args.html) if args.timestamped else args.html
+         _render_html(result, output_path)
+         print(f"HTML report written to: {output_path}")
+
+     return 0 if result.wasSuccessful() else 1
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
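A hedged sketch of driving the runner above from Python rather than the command line (it assumes the repository root is on sys.path so that scripts/run_tests.py is importable; the report path is a placeholder):

    from scripts.run_tests import _run_tests, _render_html, _timestamped_path

    # Discover and run the unittest suite, then write a timestamped HTML report.
    result = _run_tests(start_dir="tests", pattern="test*.py", verbosity=2)
    _render_html(result, _timestamped_path("reports/unit_tests.html"))
    print("passed" if result.wasSuccessful() else "failed")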
tests/test_io.py ADDED
@@ -0,0 +1,128 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from types import SimpleNamespace
+ from unittest.mock import MagicMock, patch
+
+ from gss_bi_udfs import io
+
+
+ class TestIO(unittest.TestCase):
+     def test_normalize_path(self):
+         self.assertEqual(io._normalize_path("dbfs:/tmp/a.parquet"), "/tmp/a.parquet")
+         self.assertEqual(io._normalize_path("/tmp/a.parquet"), "/tmp/a.parquet")
+
+     def test_ls_path_local(self):
+         with tempfile.TemporaryDirectory() as tmpdir:
+             file_path = Path(tmpdir) / "file.txt"
+             file_path.write_text("x", encoding="utf-8")
+             (Path(tmpdir) / "folder").mkdir()
+
+             files = io._ls_path(tmpdir)
+             names = sorted([f.name for f in files])
+             self.assertEqual(names, ["file.txt", "folder"])
+
+         self.assertEqual(io._ls_path("/path/that/does/not/exist"), [])
+
+     @patch("gss_bi_udfs.io._ls_path")
+     def test_load_latest_parquet_returns_latest_match(self, mock_ls_path):
+         mock_ls_path.return_value = [
+             SimpleNamespace(name="clientes_20240101.parquet", path="/tmp/1"),
+             SimpleNamespace(name="clientes_20240102.parquet", path="/tmp/2"),
+             SimpleNamespace(name="otra_tabla_20240103.parquet", path="/tmp/3"),
+         ]
+         spark = MagicMock()
+         expected_df = object()
+         spark.read.parquet.return_value = expected_df
+
+         out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+         self.assertIs(out, expected_df)
+         spark.read.parquet.assert_called_once_with("/tmp/2")
+
+     @patch("gss_bi_udfs.io._ls_path")
+     def test_load_latest_parquet_returns_none_without_matches(self, mock_ls_path):
+         mock_ls_path.return_value = [SimpleNamespace(name="x.parquet", path="/tmp/x")]
+         spark = MagicMock()
+
+         out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+         self.assertIsNone(out)
+         spark.read.parquet.assert_not_called()
+
+     @patch("gss_bi_udfs.io.load_latest_parquet")
+     def test_return_parquets_and_register_temp_views(self, mock_load_latest):
+         df = MagicMock()
+         mock_load_latest.return_value = df
+         spark = MagicMock()
+         tables_load = {
+             "db1": {
+                 "sch1": [
+                     {"table": "t1", "view": "vw_t1"},
+                     {"table": "t2", "view": "vw_t2"},
+                 ]
+             }
+         }
+
+         out = io.return_parquets_and_register_temp_views(spark, tables_load, env="dev")
+
+         self.assertEqual(set(out.keys()), {"db1.sch1.t1", "db1.sch1.t2"})
+         self.assertEqual(mock_load_latest.call_count, 2)
+         self.assertEqual(df.createOrReplaceTempView.call_count, 2)
+
+     @patch("gss_bi_udfs.io.load_latest_parquet")
+     def test_parquets_register_temp_views(self, mock_load_latest):
+         df = MagicMock()
+         mock_load_latest.return_value = df
+         spark = MagicMock()
+         tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}
+
+         io.parquets_register_temp_views(spark, tables_load, env="dev")
+
+         df.createOrReplaceTempView.assert_called_once_with("vw_t1")
+
+     @patch("gss_bi_udfs.io._ls_path")
+     @patch("pandas.read_excel")
+     def test_load_latest_excel(self, mock_read_excel, mock_ls_path):
+         mock_ls_path.return_value = [
+             SimpleNamespace(name="a_old", path="dbfs:/tmp/a_old", isFile=lambda: True),
+             SimpleNamespace(name="b_new", path="dbfs:/tmp/b_new", isFile=lambda: True),
+         ]
+         mock_pdf = object()
+         mock_read_excel.return_value = mock_pdf
+         spark = MagicMock()
+         expected_df = object()
+         spark.createDataFrame.return_value = expected_df
+
+         out = io.load_latest_excel(spark, "dom/sub/file", env="dev")
+
+         self.assertIs(out, expected_df)
+         mock_read_excel.assert_called_once_with("/tmp/b_new", header=0, engine="xlrd")
+         spark.createDataFrame.assert_called_once_with(mock_pdf)
+
+     def test_load_and_materialize_views_unknown_action(self):
+         out = io.load_and_materialize_views("accion_inexistente")
+         self.assertEqual(out, {})
+
+     @patch("gss_bi_udfs.io.get_table_info")
+     def test_save_table_to_delta_writes_delta(self, mock_get_table_info):
+         mock_get_table_info.return_value = {
+             "path": "/tmp/tbl",
+             "full_table_name": "cat.sch.tbl",
+         }
+         df = MagicMock()
+         writer = MagicMock()
+         writer.format.return_value = writer
+         writer.option.return_value = writer
+         writer.mode.return_value = writer
+         df.write = writer
+
+         io.save_table_to_delta(df, "cat", "sch", "tbl")
+
+         writer.format.assert_called_once_with("delta")
+         writer.mode.assert_called_once_with("overwrite")
+         writer.saveAsTable.assert_called_once_with("cat.sch.tbl")
+
+
+ if __name__ == "__main__":
+     unittest.main()
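Based on the calls exercised in the tests above, a hedged sketch of how the io helpers might be used (the database, schema and table names mirror the test fixtures; spark is an active SparkSession):

    from gss_bi_udfs import io

    # Per the tests, returns the most recent parquet whose file name matches the table.
    df = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")

    # Registers the latest parquet of each configured table as a temp view.
    tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}
    io.parquets_register_temp_views(spark, tables_load, env="dev")

    # Overwrites the target table in Delta format.
    io.save_table_to_delta(df, "cat", "sch", "tbl")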
tests/test_merges.py ADDED
@@ -0,0 +1,65 @@
+ import unittest
+ from unittest.mock import MagicMock, patch
+
+ from gss_bi_udfs import merges
+
+
+ class TestMerges(unittest.TestCase):
+     def test_merge_scd2_raises_if_business_key_is_missing(self):
+         spark = MagicMock()
+         df_dim_src = MagicMock()
+         df_dim_src.columns = ["id", "descripcion"]
+
+         with self.assertRaises(ValueError):
+             merges.merge_scd2(
+                 spark=spark,
+                 df_dim_src=df_dim_src,
+                 table_name="cat.sch.dim",
+                 business_keys="codigo_negocio",
+                 surrogate_key="sk_dim",
+             )
+
+     @patch("gss_bi_udfs.merges.save_table_to_delta")
+     @patch("gss_bi_udfs.merges.add_hashid")
+     @patch("gss_bi_udfs.merges.get_table_info")
+     def test_merge_scd2_full_load_path(self, mock_get_table_info, mock_add_hashid, mock_save_table):
+         spark = MagicMock()
+         spark.catalog.tableExists.return_value = False
+         mock_get_table_info.return_value = {
+             "catalog": "cat_dev",
+             "schema": "sch",
+             "table": "dim_cliente",
+             "full_table_name": "cat_dev.sch.dim_cliente",
+         }
+
+         df_dim_src = MagicMock()
+         df_dim_src.columns = ["codigo_negocio", "descripcion"]
+         df_dim_src.withColumn.return_value = df_dim_src
+
+         df_hashed = MagicMock()
+         writer = MagicMock()
+         writer.format.return_value = writer
+         writer.mode.return_value = writer
+         writer.option.return_value = writer
+         df_hashed.write = writer
+         mock_add_hashid.return_value = df_hashed
+
+         merges.merge_scd2(
+             spark=spark,
+             df_dim_src=df_dim_src,
+             table_name="cat.sch.dim_cliente",
+             business_keys="codigo_negocio",
+             surrogate_key="sk_dim_cliente",
+         )
+
+         mock_add_hashid.assert_called_once()
+         add_hashid_args = mock_add_hashid.call_args.args
+         self.assertEqual(add_hashid_args[1], ["codigo_negocio", "valid_from"])
+         self.assertEqual(add_hashid_args[2], "sk_dim_cliente")
+
+         self.assertEqual(mock_save_table.call_count, 1)
+         writer.saveAsTable.assert_called_once_with("cat.sch.dim_cliente")
+
+
+ if __name__ == "__main__":
+     unittest.main()
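A hedged sketch of the merge_scd2 call shape exercised above (the keyword signature is taken from the test; spark is an active SparkSession and df_dim_src the source DataFrame for the dimension):

    from gss_bi_udfs import merges

    # SCD2 merge keyed on the business key; the surrogate key is generated via add_hashid.
    merges.merge_scd2(
        spark=spark,
        df_dim_src=df_dim_src,
        table_name="cat.sch.dim_cliente",
        business_keys="codigo_negocio",
        surrogate_key="sk_dim_cliente",
    )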
tests/test_transforms.py ADDED
@@ -0,0 +1,66 @@
+ import unittest
+ from types import SimpleNamespace
+ from unittest.mock import MagicMock, patch
+
+ from gss_bi_udfs import transforms
+
+
+ class _FakeCol:
+     def __init__(self, name):
+         self.name = name
+
+     def cast(self, dtype):
+         return f"{self.name}:{dtype}"
+
+
+ class TestTransforms(unittest.TestCase):
+     def test_add_hashid_raises_when_columns_empty(self):
+         with self.assertRaises(ValueError):
+             transforms.add_hashid(MagicMock(), [])
+
+     @patch("gss_bi_udfs.transforms.xxhash64")
+     @patch("gss_bi_udfs.transforms.concat_ws")
+     @patch("gss_bi_udfs.transforms.col")
+     def test_add_hashid_builds_hash_and_reorders_columns(self, mock_col, mock_concat_ws, mock_xxhash64):
+         mock_col.side_effect = lambda name: _FakeCol(name)
+         mock_concat_ws.return_value = "concat_expr"
+         mock_xxhash64.return_value = "hash_expr"
+
+         df = MagicMock()
+         df.columns = ["id", "name"]
+         df_with_hash = MagicMock()
+         df.withColumn.return_value = df_with_hash
+         df_with_hash.select.return_value = "result_df"
+
+         out = transforms.add_hashid(df, ["id", "name"], "hash_pk")
+
+         self.assertEqual(out, "result_df")
+         df.withColumn.assert_called_once_with("hash_pk", "hash_expr")
+         df_with_hash.select.assert_called_once_with("hash_pk", "id", "name")
+
+     @patch("gss_bi_udfs.transforms.get_default_value_by_type")
+     def test_get_default_record_builds_single_row_with_schema_defaults(self, mock_defaults):
+         mock_defaults.side_effect = lambda dtype: f"default_for_{dtype}"
+         spark = MagicMock()
+         expected = object()
+         spark.createDataFrame.return_value = expected
+
+         df = MagicMock()
+         df.schema = SimpleNamespace(
+             fields=[
+                 SimpleNamespace(name="id", dataType="int"),
+                 SimpleNamespace(name="desc", dataType="string"),
+             ]
+         )
+
+         out = transforms.get_default_record(spark, df)
+
+         self.assertIs(out, expected)
+         spark.createDataFrame.assert_called_once_with(
+             [{"id": "default_for_int", "desc": "default_for_string"}],
+             schema=df.schema,
+         )
+
+
+ if __name__ == "__main__":
+     unittest.main()