gss-bi-udfs 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gss_bi_udfs/io.py CHANGED
@@ -1,5 +1,48 @@
+from pathlib import Path
+from datetime import datetime
 from .utils import get_env, get_table_info
 
+
+class _LocalFileInfo:
+    # only for library use in local (non-Databricks) environments
+    def __init__(self, path: str):
+        self.path = path
+        p = Path(path)
+        self.name = p.name
+        self.size = p.stat().st_size if p.exists() else 0
+        self.modificationTime = int(p.stat().st_mtime * 1000) if p.exists() else 0
+
+    def isFile(self) -> bool:
+        return Path(self.path).is_file()
+
+    def __repr__(self) -> str:
+        return (
+            "FileInfo("
+            f"path='{self.path}', "
+            f"name='{self.name}', "
+            f"size={self.size}, "
+            f"modificationTime={self.modificationTime}"
+            ")"
+        )
+
+def _normalize_path(path):
+    if path.startswith("dbfs:"):
+        return path.replace("dbfs:", "", 1)
+    return path
+
+
+def _ls_path(base_path):
+    try:
+        # Databricks runtime provides dbutils in globals.
+        files = dbutils.fs.ls(base_path)  # type: ignore
+        return files
+    except Exception:
+        local_path = _normalize_path(base_path)
+        p = Path(local_path)
+        if not p.exists():
+            return []
+        return [_LocalFileInfo(str(child)) for child in p.iterdir()]
+
 # def load_latest_file_bronze(spark, data_base, schema, table, env=None):
 def load_latest_parquet(spark, data_base, schema, table, env=None):
     """
@@ -18,9 +61,10 @@ def load_latest_parquet(spark, data_base, schema, table, env=None):
     """
     env = env or get_env()
     base_path = f"/Volumes/bronze/{data_base}_{schema}/{env}/{table}/"
+    print("Ruta base:", base_path)
 
     try:
-        files = dbutils.fs.ls(base_path)  # type: ignore
+        files = _ls_path(base_path)
 
         parquet_files = [f for f in files if table in f.name]
 
@@ -178,7 +222,7 @@ def load_latest_excel(spark, source_file, env=None):
     print("Ruta base:", base_path)
 
     try:
-        files = dbutils.fs.ls(base_path)  # type: ignore
+        files = _ls_path(base_path)
         print("Archivos encontrados:", [f.name for f in files])
         excel_candidates = [f for f in files if f.isFile()]
 
@@ -330,12 +374,13 @@ def load_and_materialize_views(action, **kwargs):
     return results
 
 
-def save_table_to_delta(df, catalog, schema, table_name):
+def save_table_to_delta(spark, df, catalog, schema, table_name):
     """
     Saves a DataFrame in Delta format to the specified location and table,
     overwriting the existing data and, if necessary, the schema.
 
     Parameters:
+        spark (SparkSession): Active Spark session.
         df (DataFrame): Spark DataFrame to save.
         catalog (str): Name of the target catalog or database.
         schema (str): Name of the target schema, layer, or environment (e.g. 'silver', 'gold').
@@ -363,7 +408,7 @@ def save_table_to_delta(df, catalog, schema, table_name):
     - If both options are used at the same time, only one takes effect (the last one specified is applied).
 
     """
-    dim_destino = get_table_info(catalog=catalog, schema=schema, table=table_name)
+    dim_destino = get_table_info(spark=spark, catalog=catalog, schema=schema, table=table_name)
     (
         df.write
         .format("delta")
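
Taken together, the io.py changes make the loaders usable both on Databricks (via dbutils) and locally (via pathlib), and save_table_to_delta now receives the SparkSession explicitly so get_table_info can resolve the target table. A minimal usage sketch, assuming illustrative catalog/schema/table names:

from pyspark.sql import SparkSession

from gss_bi_udfs import io

spark = SparkSession.builder.getOrCreate()

# Picks the newest parquet under /Volumes/bronze/<data_base>_<schema>/<env>/<table>/;
# outside Databricks, _ls_path silently falls back to the local filesystem.
df = io.load_latest_parquet(spark, "ventas", "comercial", "clientes", env="dev")

if df is not None:
    # New in 0.1.3: the session is passed first so get_table_info can check
    # table existence and location against the right catalog.
    io.save_table_to_delta(spark, df, catalog="fi_comunes", schema="silver", table_name="dim_cliente")
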
gss_bi_udfs-0.1.3.dist-info/METADATA CHANGED
@@ -1,11 +1,10 @@
 Metadata-Version: 2.4
 Name: gss-bi-udfs
-Version: 0.1.1
+Version: 0.1.3
 Summary: Utilidades reutilizables para Spark y Delta Lake en arquitecturas Lakehouse.
 Author: Geronimo Forconi
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: pyspark>=3.0.0
 
 # gss-bi-udfs
 
gss_bi_udfs-0.1.3.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
+gss_bi_udfs/io.py,sha256=yneOHVcHbcSOuAy02jS3_qm7YXGyeDpXFk9e-5VJ36A,17246
+gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
+gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
+gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
+scripts/run_tests.py,sha256=6yG35rkURojbHmEnLKkPnHSn5bmViP7yJwng5hXj9xs,4407
+tests/test_io.py,sha256=oFTJK6UZJXec53lPoArWSHKPRpNGbhI8ZVb1ZjfXW8U,4847
+tests/test_merges.py,sha256=_PHYRU0DwRn5Vg05clz8jL7_d8QutWiiTACHLiNPrZo,2221
+tests/test_transforms.py,sha256=4fqKyemSV-4nfMzhTamaE5mWXnglV08uvw67sWj84Og,2206
+tests/test_utils.py,sha256=FUap5pqqEDvmBmBLeSBN39FoQDQSz3hpN4qCQrUniEU,4541
+workspace/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+workspace/prueba.py,sha256=n-zGeMhFRrMLp9cx-vvJYTSFMZHCYm_R-xmGiQvj1Fk,223
+workspace/prueba_calculadora.py,sha256=4pu4Wg-h_aALbqDsfSeJlPbyx9FpODIbHGNAse5Mz5M,512
+workspace/tests/test_prueba_calculadora.py,sha256=9cHXunht_EVwBgBELv5YlazkJMWSi97A5VnB9Mvs-kU,257
+gss_bi_udfs-0.1.3.dist-info/METADATA,sha256=u4tbdf_peTRGGTMzIMlsle_BRrQ56uTPEHs63GmfFm8,393
+gss_bi_udfs-0.1.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+gss_bi_udfs-0.1.3.dist-info/top_level.txt,sha256=NzlGPsUajFQDfmDbDhTCgu4LBvrm0d1shuPSJIBpLfw,36
+gss_bi_udfs-0.1.3.dist-info/RECORD,,
gss_bi_udfs-0.1.3.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.10.1)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
gss_bi_udfs-0.1.3.dist-info/top_level.txt ADDED
@@ -0,0 +1,4 @@
+gss_bi_udfs
+scripts
+tests
+workspace
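
Worth noting from the packaging changes: 0.1.3 drops the explicit Requires-Dist: pyspark>=3.0.0 pin, so the wheel assumes a Spark-provided runtime (e.g. Databricks), and it now ships scripts, tests, and workspace as top-level packages alongside gss_bi_udfs. A minimal import sketch (the install command and ENV value are assumptions based on the metadata and tests below):

# pip install gss-bi-udfs==0.1.3   (pyspark must already be available in the runtime)
import os

os.environ["ENV"] = "dev"  # read by utils.get_env(); defaults to "dev" when unset

from gss_bi_udfs import io, merges, transforms, utils

print(utils.get_env())                       # "dev"
print(utils.get_env_catalog("fi_comunes"))   # "fi_comunes_dev" outside prod ("pro")
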
scripts/run_tests.py ADDED
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+import argparse
+import datetime as dt
+import html
+import os
+import sys
+import unittest
+
+
+class _CollectingTextTestResult(unittest.TextTestResult):
+    def __init__(self, stream, descriptions, verbosity):
+        super().__init__(stream, descriptions, verbosity)
+        self.successes = []
+
+    def addSuccess(self, test):
+        super().addSuccess(test)
+        self.successes.append(test)
+
+
+def _run_tests(start_dir: str, pattern: str, verbosity: int) -> unittest.TestResult:
+    suite = unittest.defaultTestLoader.discover(start_dir=start_dir, pattern=pattern)
+    runner = unittest.TextTestRunner(verbosity=verbosity, resultclass=_CollectingTextTestResult)
+    return runner.run(suite)
+
+
+def _timestamped_path(path: str) -> str:
+    ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
+    base, ext = os.path.splitext(path)
+    if ext:
+        return f"{base}_{ts}{ext}"
+    return f"{path}_{ts}"
+
+
+def _render_html(result: unittest.TestResult, output_path: str) -> None:
+    rows = []
+    for test in getattr(result, "successes", []):
+        rows.append(("PASS", str(test), ""))
+
+    for test, tb in result.failures + result.errors:
+        rows.append(
+            (
+                "FAIL" if (test, tb) in result.failures else "ERROR",
+                str(test),
+                tb,
+            )
+        )
+
+    for test, reason in result.skipped:
+        rows.append(("SKIP", str(test), reason))
+
+    status = "PASSED" if result.wasSuccessful() else "FAILED"
+    now = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    html_body = [
+        "<!doctype html>",
+        "<html lang='en'>",
+        "<head>",
+        " <meta charset='utf-8'/>",
+        " <title>Unit Test Report</title>",
+        " <style>",
+        " body { font-family: Arial, sans-serif; margin: 24px; }",
+        " .ok { color: #0f5132; }",
+        " .bad { color: #842029; }",
+        " table { border-collapse: collapse; width: 100%; margin-top: 16px; }",
+        " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }",
+        " th { background: #f5f5f5; }",
+        " pre { white-space: pre-wrap; margin: 0; }",
+        " </style>",
+        "</head>",
+        "<body>",
+        f" <h1>Unit Test Report</h1>",
+        f" <p><strong>Generated:</strong> {html.escape(now)}</p>",
+        f" <p><strong>Status:</strong> <span class='{'ok' if result.wasSuccessful() else 'bad'}'>{status}</span></p>",
+        f" <p><strong>Ran:</strong> {result.testsRun} tests</p>",
+        f" <p><strong>Pass:</strong> {len(getattr(result, 'successes', []))} | <strong>Failures:</strong> {len(result.failures)} | <strong>Errors:</strong> {len(result.errors)} | <strong>Skipped:</strong> {len(result.skipped)}</p>",
+        " <table>",
+        " <thead><tr><th>Type</th><th>Test</th><th>Details</th></tr></thead>",
+        " <tbody>",
+    ]
+
+    if rows:
+        for row_type, test_name, details in rows:
+            html_body.append(
+                " <tr>"
+                f"<td>{html.escape(row_type)}</td>"
+                f"<td>{html.escape(test_name)}</td>"
+                f"<td><pre>{html.escape(details)}</pre></td>"
+                "</tr>"
+            )
+    else:
+        html_body.append(" <tr><td colspan='3'>No failures, errors or skipped tests.</td></tr>")
+
+    html_body += [
+        " </tbody>",
+        " </table>",
+        "</body>",
+        "</html>",
+    ]
+
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(html_body))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run unittests and optionally write HTML report.")
+    parser.add_argument("--start-dir", default="tests")
+    parser.add_argument("--pattern", default="test*.py")
+    parser.add_argument("--verbosity", type=int, default=2)
+    parser.add_argument("--html", default="")
+    parser.add_argument("--timestamped", action="store_true")
+    args = parser.parse_args()
+
+    result = _run_tests(args.start_dir, args.pattern, args.verbosity)
+
+    if args.html:
+        output_path = _timestamped_path(args.html) if args.timestamped else args.html
+        _render_html(result, output_path)
+        print(f"HTML report written to: {output_path}")
+
+    return 0 if result.wasSuccessful() else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
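
A usage sketch for the runner above (not part of the wheel itself); the report path is illustrative and the flags mirror the argparse definitions in main():

import subprocess
import sys

# Discover tests under ./tests and write a timestamped HTML report.
completed = subprocess.run(
    [
        sys.executable,
        "scripts/run_tests.py",
        "--start-dir", "tests",
        "--pattern", "test*.py",
        "--html", "reports/unit_tests.html",  # illustrative output path
        "--timestamped",
    ]
)

# run_tests.py returns 0 only when the whole suite passes.
sys.exit(completed.returncode)
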
tests/test_io.py ADDED
@@ -0,0 +1,128 @@
+import tempfile
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import io
+
+
+class TestIO(unittest.TestCase):
+    def test_normalize_path(self):
+        self.assertEqual(io._normalize_path("dbfs:/tmp/a.parquet"), "/tmp/a.parquet")
+        self.assertEqual(io._normalize_path("/tmp/a.parquet"), "/tmp/a.parquet")
+
+    def test_ls_path_local(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "file.txt"
+            file_path.write_text("x", encoding="utf-8")
+            (Path(tmpdir) / "folder").mkdir()
+
+            files = io._ls_path(tmpdir)
+            names = sorted([f.name for f in files])
+            self.assertEqual(names, ["file.txt", "folder"])
+
+        self.assertEqual(io._ls_path("/path/that/does/not/exist"), [])
+
+    @patch("gss_bi_udfs.io._ls_path")
+    def test_load_latest_parquet_returns_latest_match(self, mock_ls_path):
+        mock_ls_path.return_value = [
+            SimpleNamespace(name="clientes_20240101.parquet", path="/tmp/1"),
+            SimpleNamespace(name="clientes_20240102.parquet", path="/tmp/2"),
+            SimpleNamespace(name="otra_tabla_20240103.parquet", path="/tmp/3"),
+        ]
+        spark = MagicMock()
+        expected_df = object()
+        spark.read.parquet.return_value = expected_df
+
+        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+        self.assertIs(out, expected_df)
+        spark.read.parquet.assert_called_once_with("/tmp/2")
+
+    @patch("gss_bi_udfs.io._ls_path")
+    def test_load_latest_parquet_returns_none_without_matches(self, mock_ls_path):
+        mock_ls_path.return_value = [SimpleNamespace(name="x.parquet", path="/tmp/x")]
+        spark = MagicMock()
+
+        out = io.load_latest_parquet(spark, "db", "sch", "clientes", env="dev")
+
+        self.assertIsNone(out)
+        spark.read.parquet.assert_not_called()
+
+    @patch("gss_bi_udfs.io.load_latest_parquet")
+    def test_return_parquets_and_register_temp_views(self, mock_load_latest):
+        df = MagicMock()
+        mock_load_latest.return_value = df
+        spark = MagicMock()
+        tables_load = {
+            "db1": {
+                "sch1": [
+                    {"table": "t1", "view": "vw_t1"},
+                    {"table": "t2", "view": "vw_t2"},
+                ]
+            }
+        }
+
+        out = io.return_parquets_and_register_temp_views(spark, tables_load, env="dev")
+
+        self.assertEqual(set(out.keys()), {"db1.sch1.t1", "db1.sch1.t2"})
+        self.assertEqual(mock_load_latest.call_count, 2)
+        self.assertEqual(df.createOrReplaceTempView.call_count, 2)
+
+    @patch("gss_bi_udfs.io.load_latest_parquet")
+    def test_parquets_register_temp_views(self, mock_load_latest):
+        df = MagicMock()
+        mock_load_latest.return_value = df
+        spark = MagicMock()
+        tables_load = {"db1": {"sch1": [{"table": "t1", "view": "vw_t1"}]}}
+
+        io.parquets_register_temp_views(spark, tables_load, env="dev")
+
+        df.createOrReplaceTempView.assert_called_once_with("vw_t1")
+
+    @patch("gss_bi_udfs.io._ls_path")
+    @patch("pandas.read_excel")
+    def test_load_latest_excel(self, mock_read_excel, mock_ls_path):
+        mock_ls_path.return_value = [
+            SimpleNamespace(name="a_old", path="dbfs:/tmp/a_old", isFile=lambda: True),
+            SimpleNamespace(name="b_new", path="dbfs:/tmp/b_new", isFile=lambda: True),
+        ]
+        mock_pdf = object()
+        mock_read_excel.return_value = mock_pdf
+        spark = MagicMock()
+        expected_df = object()
+        spark.createDataFrame.return_value = expected_df
+
+        out = io.load_latest_excel(spark, "dom/sub/file", env="dev")
+
+        self.assertIs(out, expected_df)
+        mock_read_excel.assert_called_once_with("/tmp/b_new", header=0, engine="xlrd")
+        spark.createDataFrame.assert_called_once_with(mock_pdf)
+
+    def test_load_and_materialize_views_unknown_action(self):
+        out = io.load_and_materialize_views("accion_inexistente")
+        self.assertEqual(out, {})
+
+    @patch("gss_bi_udfs.io.get_table_info")
+    def test_save_table_to_delta_writes_delta(self, mock_get_table_info):
+        mock_get_table_info.return_value = {
+            "path": "/tmp/tbl",
+            "full_table_name": "cat.sch.tbl",
+        }
+        df = MagicMock()
+        writer = MagicMock()
+        writer.format.return_value = writer
+        writer.option.return_value = writer
+        writer.mode.return_value = writer
+        df.write = writer
+
+        io.save_table_to_delta(MagicMock(), df, "cat", "sch", "tbl")
+
+        writer.format.assert_called_once_with("delta")
+        writer.mode.assert_called_once_with("overwrite")
+        writer.saveAsTable.assert_called_once_with("cat.sch.tbl")
+
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_merges.py ADDED
@@ -0,0 +1,65 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import merges
+
+
+class TestMerges(unittest.TestCase):
+    def test_merge_scd2_raises_if_business_key_is_missing(self):
+        spark = MagicMock()
+        df_dim_src = MagicMock()
+        df_dim_src.columns = ["id", "descripcion"]
+
+        with self.assertRaises(ValueError):
+            merges.merge_scd2(
+                spark=spark,
+                df_dim_src=df_dim_src,
+                table_name="cat.sch.dim",
+                business_keys="codigo_negocio",
+                surrogate_key="sk_dim",
+            )
+
+    @patch("gss_bi_udfs.merges.save_table_to_delta")
+    @patch("gss_bi_udfs.merges.add_hashid")
+    @patch("gss_bi_udfs.merges.get_table_info")
+    def test_merge_scd2_full_load_path(self, mock_get_table_info, mock_add_hashid, mock_save_table):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = False
+        mock_get_table_info.return_value = {
+            "catalog": "cat_dev",
+            "schema": "sch",
+            "table": "dim_cliente",
+            "full_table_name": "cat_dev.sch.dim_cliente",
+        }
+
+        df_dim_src = MagicMock()
+        df_dim_src.columns = ["codigo_negocio", "descripcion"]
+        df_dim_src.withColumn.return_value = df_dim_src
+
+        df_hashed = MagicMock()
+        writer = MagicMock()
+        writer.format.return_value = writer
+        writer.mode.return_value = writer
+        writer.option.return_value = writer
+        df_hashed.write = writer
+        mock_add_hashid.return_value = df_hashed
+
+        merges.merge_scd2(
+            spark=spark,
+            df_dim_src=df_dim_src,
+            table_name="cat.sch.dim_cliente",
+            business_keys="codigo_negocio",
+            surrogate_key="sk_dim_cliente",
+        )
+
+        mock_add_hashid.assert_called_once()
+        add_hashid_args = mock_add_hashid.call_args.args
+        self.assertEqual(add_hashid_args[1], ["codigo_negocio", "valid_from"])
+        self.assertEqual(add_hashid_args[2], "sk_dim_cliente")
+
+        self.assertEqual(mock_save_table.call_count, 1)
+        writer.saveAsTable.assert_called_once_with("cat.sch.dim_cliente")
+
+
+if __name__ == "__main__":
+    unittest.main()
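
For orientation, the keyword arguments exercised above suggest a call shape roughly like the following; merges.py itself is not shown in this diff, and the table and column names are illustrative:

from pyspark.sql import SparkSession

from gss_bi_udfs import merges

spark = SparkSession.builder.getOrCreate()

df_dim_src = spark.table("silver.stg_cliente")  # illustrative source DataFrame

# SCD2 merge keyed on a business key; the surrogate key appears to be hashed
# from the business key plus valid_from (per the add_hashid assertion above).
merges.merge_scd2(
    spark=spark,
    df_dim_src=df_dim_src,
    table_name="fi_comunes.silver.dim_cliente",
    business_keys="codigo_negocio",
    surrogate_key="sk_dim_cliente",
)
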
tests/test_transforms.py ADDED
@@ -0,0 +1,66 @@
+import unittest
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from gss_bi_udfs import transforms
+
+
+class _FakeCol:
+    def __init__(self, name):
+        self.name = name
+
+    def cast(self, dtype):
+        return f"{self.name}:{dtype}"
+
+
+class TestTransforms(unittest.TestCase):
+    def test_add_hashid_raises_when_columns_empty(self):
+        with self.assertRaises(ValueError):
+            transforms.add_hashid(MagicMock(), [])
+
+    @patch("gss_bi_udfs.transforms.xxhash64")
+    @patch("gss_bi_udfs.transforms.concat_ws")
+    @patch("gss_bi_udfs.transforms.col")
+    def test_add_hashid_builds_hash_and_reorders_columns(self, mock_col, mock_concat_ws, mock_xxhash64):
+        mock_col.side_effect = lambda name: _FakeCol(name)
+        mock_concat_ws.return_value = "concat_expr"
+        mock_xxhash64.return_value = "hash_expr"
+
+        df = MagicMock()
+        df.columns = ["id", "name"]
+        df_with_hash = MagicMock()
+        df.withColumn.return_value = df_with_hash
+        df_with_hash.select.return_value = "result_df"
+
+        out = transforms.add_hashid(df, ["id", "name"], "hash_pk")
+
+        self.assertEqual(out, "result_df")
+        df.withColumn.assert_called_once_with("hash_pk", "hash_expr")
+        df_with_hash.select.assert_called_once_with("hash_pk", "id", "name")
+
+    @patch("gss_bi_udfs.transforms.get_default_value_by_type")
+    def test_get_default_record_builds_single_row_with_schema_defaults(self, mock_defaults):
+        mock_defaults.side_effect = lambda dtype: f"default_for_{dtype}"
+        spark = MagicMock()
+        expected = object()
+        spark.createDataFrame.return_value = expected
+
+        df = MagicMock()
+        df.schema = SimpleNamespace(
+            fields=[
+                SimpleNamespace(name="id", dataType="int"),
+                SimpleNamespace(name="desc", dataType="string"),
+            ]
+        )
+
+        out = transforms.get_default_record(spark, df)
+
+        self.assertIs(out, expected)
+        spark.createDataFrame.assert_called_once_with(
+            [{"id": "default_for_int", "desc": "default_for_string"}],
+            schema=df.schema,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_utils.py ADDED
@@ -0,0 +1,119 @@
+import os
+import unittest
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from pyspark.sql.types import (
+    BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FloatType,
+    IntegerType,
+    LongType,
+    StringType,
+    TimestampType,
+)
+
+from gss_bi_udfs import utils
+
+
+class TestUtils(unittest.TestCase):
+    def test_get_env_uses_default_and_env_var(self):
+        with patch.dict(os.environ, {}, clear=True):
+            self.assertEqual(utils.get_env(), "dev")
+            self.assertEqual(utils.get_env(default="qa"), "qa")
+
+        with patch.dict(os.environ, {"ENV": "prod"}, clear=True):
+            self.assertEqual(utils.get_env(), "prod")
+
+    def test_get_env_catalog(self):
+        with patch.dict(os.environ, {"ENV": "pro"}, clear=True):
+            self.assertEqual(utils.get_env_catalog("fi_comunes"), "fi_comunes")
+        with patch.dict(os.environ, {"ENV": "dev"}, clear=True):
+            self.assertEqual(utils.get_env_catalog("fi_comunes"), "fi_comunes_dev")
+
+    def test_get_env_table_path(self):
+        with patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev"):
+            self.assertEqual(
+                utils.get_env_table_path("cat", "silver.dim_cliente"),
+                "cat_dev.silver.dim_cliente",
+            )
+
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_schema_root_location(self, _mock_catalog):
+        spark = MagicMock()
+        df = MagicMock()
+        df.filter.return_value = df
+        df.select.return_value = df
+        df.collect.return_value = [["s3://bucket/root"]]
+        spark.sql.return_value = df
+
+        out = utils.get_schema_root_location(spark, "cat", "silver")
+
+        self.assertEqual(out, "s3://bucket/root")
+        spark.sql.assert_called_once_with("DESCRIBE SCHEMA EXTENDED cat_dev.silver")
+
+    def test_get_table_info_validations(self):
+        spark = MagicMock()
+        with self.assertRaises(ValueError):
+            utils.get_table_info(spark, full_table_name="solo.dos")
+        with self.assertRaises(ValueError):
+            utils.get_table_info(spark)
+
+    @patch("gss_bi_udfs.utils.get_schema_root_location", return_value="s3://bucket/root/silver")
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_table_info_when_table_does_not_exist(self, _mock_env_catalog, _mock_root):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = False
+
+        info = utils.get_table_info(spark, full_table_name="cat.silver.dim_cliente")
+
+        self.assertEqual(info["catalog"], "cat_dev")
+        self.assertEqual(info["schema"], "silver")
+        self.assertEqual(info["table"], "dim_cliente")
+        self.assertEqual(info["full_table_name"], "cat_dev.silver.dim_cliente")
+        self.assertEqual(info["path"], "s3://bucket/root/silver/dim_cliente")
+        self.assertFalse(info["exists"])
+
+    @patch("gss_bi_udfs.utils.get_schema_root_location", return_value="s3://bucket/root/silver")
+    @patch("gss_bi_udfs.utils.get_env_catalog", return_value="cat_dev")
+    def test_get_table_info_when_table_exists(self, _mock_env_catalog, _mock_root):
+        spark = MagicMock()
+        spark.catalog.tableExists.return_value = True
+        desc_df = MagicMock()
+        desc_df.filter.return_value = desc_df
+        desc_df.collect.return_value = [
+            SimpleNamespace(col_name="Location", data_type="s3://bucket/real/location"),
+            SimpleNamespace(col_name="Provider", data_type="delta"),
+            SimpleNamespace(col_name="Type", data_type="MANAGED"),
+        ]
+        spark.sql.return_value = desc_df
+
+        info = utils.get_table_info(spark, full_table_name="cat.silver.dim_cliente")
+
+        self.assertTrue(info["exists"])
+        self.assertEqual(info["path"], "s3://bucket/real/location")
+        self.assertEqual(info["provider"], "delta")
+        self.assertEqual(info["table_type"], "MANAGED")
+
+    def test_get_default_value_by_type_returns_column(self):
+        dtypes = [
+            IntegerType(),
+            LongType(),
+            DecimalType(10, 2),
+            DoubleType(),
+            FloatType(),
+            DateType(),
+            TimestampType(),
+            BooleanType(),
+            StringType(),
+        ]
+        for dtype in dtypes:
+            with self.subTest(dtype=dtype):
+                out = utils.get_default_value_by_type(dtype)
+                self.assertEqual(out.__class__.__name__, "Column")
+
+
+if __name__ == "__main__":
+    unittest.main()
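
The assertions above also document the shape of the dict returned by utils.get_table_info; a brief sketch (catalog, schema, and table names are illustrative):

from pyspark.sql import SparkSession

from gss_bi_udfs import utils

spark = SparkSession.builder.getOrCreate()

info = utils.get_table_info(spark, full_table_name="fi_comunes.silver.dim_cliente")

# Keys observed in the tests: catalog, schema, table, full_table_name,
# path, exists, and (when the table exists) provider and table_type.
print(info["full_table_name"], info["exists"], info["path"])
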
workspace/main.py ADDED
(empty file)
workspace/prueba.py ADDED
@@ -0,0 +1,10 @@
+from pyspark.sql import SparkSession
+
+spark = SparkSession.getActiveSession()
+if spark is None:
+    spark = SparkSession.builder.appName("MiApp").getOrCreate()
+
+df = spark.range(1000 * 1000)
+print(df.count())
+
+spark.stop()
workspace/prueba_calculadora.py ADDED
@@ -0,0 +1,20 @@
+class Calculadora:
+    """Calculator class with basic operations"""
+
+    def sumar(self, a, b):
+        """Adds two numbers"""
+        return a + b
+
+    def restar(self, a, b):
+        """Subtracts two numbers"""
+        return a - b
+
+    def multiplicar(self, a, b):
+        """Multiplies two numbers"""
+        return a * b
+
+    def dividir(self, a, b):
+        """Divides two numbers"""
+        if b == 0:
+            raise ValueError("No se puede dividir entre cero")
+        return a / b
workspace/tests/test_prueba_calculadora.py ADDED
@@ -0,0 +1,14 @@
+import pytest
+import sys
+sys.path.append("/workspace")
+
+from prueba_calculadora import Calculadora
+
+def test_sumar():
+    assert Calculadora().sumar(2, 3) == 5
+
+def test_restar():
+    assert Calculadora().restar(5, 3) == 2
+
+def test_sumar_negativos():
+    assert Calculadora().sumar(-1, -2) == -3
gss_bi_udfs-0.1.1.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-gss_bi_udfs/__init__.py,sha256=VNj2_l7MHiRGF497XVM4KtU7p6JOX1xddkvFJLG1vUQ,152
-gss_bi_udfs/io.py,sha256=yEqQvpyBod9kIv7p-_5yLtINuIwsi-piWy5rKI3BgQk,15939
-gss_bi_udfs/merges.py,sha256=4YHfw6TWU08ZWEMKBtFlMqj_tzXzjqkuM_CJn0uRNUI,7977
-gss_bi_udfs/transforms.py,sha256=yDg7uvPFSTrGXgy5rOUKDdSrRBBZSubfi9K-6rATCWY,1876
-gss_bi_udfs/utils.py,sha256=ryyqrzhybC6mZFTUWsnnrQXReUcLkVqw6e2gIf4Id_g,5982
-gss_bi_udfs-0.1.1.dist-info/METADATA,sha256=q241xBvvuhhJRUL1wIGB_JKCkTxXAF9HY13yYjV3Ae8,423
-gss_bi_udfs-0.1.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-gss_bi_udfs-0.1.1.dist-info/top_level.txt,sha256=jLjGHQoep6-wLbW6wFV611Zx4ak42Q9hKtH_3sUzX9o,12
-gss_bi_udfs-0.1.1.dist-info/RECORD,,
gss_bi_udfs-0.1.1.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
-gss_bi_udfs