satisfactoscript 0.5.8__tar.gz → 0.5.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/PKG-INFO +1 -1
  2. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/pyproject.toml +1 -1
  3. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/__init__.py +2 -1
  4. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/core/core.py +15 -8
  5. satisfactoscript-0.5.10/src/satisfactoscript/registry.py +7 -0
  6. satisfactoscript-0.5.10/src/satisfactoscript/utils.py +29 -0
  7. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript.egg-info/PKG-INFO +1 -1
  8. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript.egg-info/SOURCES.txt +6 -1
  9. satisfactoscript-0.5.10/tests/test_core_join.py +140 -0
  10. satisfactoscript-0.5.10/tests/test_registry_import_paths.py +26 -0
  11. satisfactoscript-0.5.10/tests/test_utils_safe_columns.py +46 -0
  12. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/README.md +0 -0
  13. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/setup.cfg +0 -0
  14. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/agentic/__init__.py +0 -0
  15. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/agentic/agent.py +0 -0
  16. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/core/__init__.py +0 -0
  17. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/core/config.py +0 -0
  18. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/core/loaders.py +0 -0
  19. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/core/registry.py +0 -0
  20. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/semantic/__init__.py +0 -0
  21. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript/semantic/semantic.py +0 -0
  22. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript.egg-info/dependency_links.txt +0 -0
  23. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript.egg-info/requires.txt +0 -0
  24. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/src/satisfactoscript.egg-info/top_level.txt +0 -0
  25. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_config.py +0 -0
  26. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_core.py +0 -0
  27. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_core_connect_patch.py +0 -0
  28. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_core_env_detection.py +0 -0
  29. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_core_username.py +0 -0
  30. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_dummy.py +0 -0
  31. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_loaders.py +0 -0
  32. {satisfactoscript-0.5.8 → satisfactoscript-0.5.10}/tests/test_registry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: satisfactoscript
3
- Version: 0.5.8
3
+ Version: 0.5.10
4
4
  Summary: An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.
5
5
  Author: julhouba
6
6
  Classifier: Programming Language :: Python :: 3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "satisfactoscript"
7
- version = "0.5.8"
7
+ version = "0.5.10"
8
8
  description = "An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse."
9
9
  readme = "README.md"
10
10
  authors = [
@@ -1,5 +1,6 @@
1
1
  from .core.core import SatisfactoEngine
2
2
  from .core.registry import RuleRegistry
3
3
  from .core.config import ConfigurationManager
4
+ from .utils import safe_columns
4
5
 
5
- __all__ = ["SatisfactoEngine", "RuleRegistry", "ConfigurationManager"]
6
+ __all__ = ["SatisfactoEngine", "RuleRegistry", "ConfigurationManager", "safe_columns"]
@@ -657,17 +657,24 @@ class SatisfactoEngine:
657
657
  if "join" in schema_dict:
658
658
  for j in schema_dict["join"]:
659
659
  print(f" -> [Join] {j.get('type', 'left').upper()} JOIN {j['table_from']} -> {j['table_to']}")
660
-
660
+
661
661
  on_l = j["on_from"] if isinstance(j["on_from"], list) else [j["on_from"]]
662
662
  on_r = j["on_to"] if isinstance(j["on_to"], list) else [j["on_to"]]
663
-
664
- cond = reduce(lambda x, y: x & y, [F.col(f"l.{l}") == F.col(f"r.{r}") for l, r in zip(on_l, on_r)])
665
-
663
+ join_type = j.get("type", "left")
664
+
666
665
  df_to = dfs[j["table_to"]]
667
- cols_to_select = ["l.*"] + [F.col(f"r.{c}") for c in df_to.columns if c not in on_r]
668
-
669
- df_main = df_main.alias("l").join(df_to.alias("r"), cond, j.get("type", "left")) \
670
- .select(*cols_to_select)
666
+
667
+ if on_l == on_r:
668
+ # Same key names: PySpark list-join deduplicates automatically.
669
+ # No schema RPC (AnalyzePlan) needed.
670
+ df_main = df_main.join(df_to, on=on_l, how=join_type)
671
+ else:
672
+ # Different key names: build condition from DataFrame column references
673
+ # and drop right-side join keys post-join using unambiguous df refs.
674
+ # Avoids df_to.columns (AnalyzePlan RPC) that fails with UserContext
675
+ # on Databricks Connect v2 from local environments.
676
+ cond = reduce(lambda x, y: x & y, [df_main[l] == df_to[r] for l, r in zip(on_l, on_r)])
677
+ df_main = df_main.join(df_to, cond, join_type).drop(*[df_to[r] for r in on_r])
671
678
 
672
679
  # 3. BUSINESS RULES
673
680
  if "business_rules" in schema_dict:
@@ -0,0 +1,7 @@
1
+ """
2
+ Compatibility shim — keeps `from satisfactoscript.registry import RuleRegistry` working.
3
+ The canonical location is satisfactoscript.core.registry.
4
+ """
5
+ from satisfactoscript.core.registry import RuleRegistry
6
+
7
+ __all__ = ["RuleRegistry"]
@@ -0,0 +1,29 @@
1
+ """
2
+ Utility helpers for rule authoring — compatible with Databricks Connect v2.
3
+ """
4
+
5
+
6
+ def safe_columns(df):
7
+ """
8
+ Returns the column names of a DataFrame without triggering a failing AnalyzePlan RPC.
9
+
10
+ On native Databricks (cluster / notebook), this delegates to ``df.columns`` as normal.
11
+ On Databricks Connect v2 from local environments (PyCharm / VS Code on Windows), the
12
+ ``AnalyzePlan`` gRPC call that backs ``df.columns`` can fail with
13
+ ``Missing required field 'UserContext'``. In that case this function returns an empty
14
+ list so that optional-column branches in business rules are silently skipped rather than
15
+ crashing. Core transformations (withColumn, filter, write …) are unaffected because they
16
+ go through ``ExecutePlan``, not ``AnalyzePlan``.
17
+
18
+ Args:
19
+ df: A PySpark DataFrame.
20
+
21
+ Returns:
22
+ list[str]: Column names, or ``[]`` when schema inspection fails locally.
23
+ """
24
+ try:
25
+ return df.columns
26
+ except Exception as e:
27
+ if "UserContext" in str(e) or "INVALID_ARGUMENT" in str(e):
28
+ return []
29
+ raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: satisfactoscript
3
- Version: 0.5.8
3
+ Version: 0.5.10
4
4
  Summary: An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.
5
5
  Author: julhouba
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,6 +1,8 @@
1
1
  README.md
2
2
  pyproject.toml
3
3
  src/satisfactoscript/__init__.py
4
+ src/satisfactoscript/registry.py
5
+ src/satisfactoscript/utils.py
4
6
  src/satisfactoscript.egg-info/PKG-INFO
5
7
  src/satisfactoscript.egg-info/SOURCES.txt
6
8
  src/satisfactoscript.egg-info/dependency_links.txt
@@ -19,7 +21,10 @@ tests/test_config.py
19
21
  tests/test_core.py
20
22
  tests/test_core_connect_patch.py
21
23
  tests/test_core_env_detection.py
24
+ tests/test_core_join.py
22
25
  tests/test_core_username.py
23
26
  tests/test_dummy.py
24
27
  tests/test_loaders.py
25
- tests/test_registry.py
28
+ tests/test_registry.py
29
+ tests/test_registry_import_paths.py
30
+ tests/test_utils_safe_columns.py
@@ -0,0 +1,140 @@
1
+ """
2
+ Tests for process_schema JOIN logic.
3
+ Key invariant: df_to.columns (AnalyzePlan gRPC) must never be called during join planning.
4
+ """
5
+ import pytest
6
+ from unittest.mock import MagicMock, call, patch
7
+ from satisfactoscript.core.core import SatisfactoEngine
8
+
9
+
10
+ def _make_engine():
11
+ engine = object.__new__(SatisfactoEngine)
12
+ engine.spark = MagicMock()
13
+ engine.config = {}
14
+ return engine
15
+
16
+
17
+ def _joined_df():
18
+ """Return a MagicMock DataFrame that also handles chained .drop()."""
19
+ df = MagicMock(name="joined_df")
20
+ df.drop.return_value = df
21
+ return df
22
+
23
+
24
+ class TestProcessSchemaJoin:
25
+
26
+ def test_same_keys_uses_list_join(self):
27
+ """on_from == on_to → join(df_to, on=['id'], how='inner'). No .columns call."""
28
+ df_a = MagicMock(name="df_a")
29
+ df_b = MagicMock(name="df_b")
30
+ joined = _joined_df()
31
+ df_a.join.return_value = joined
32
+
33
+ schema = {
34
+ "join": [{
35
+ "table_from": "a",
36
+ "table_to": "b",
37
+ "on_from": "id",
38
+ "on_to": "id",
39
+ "type": "inner",
40
+ }]
41
+ }
42
+
43
+ engine = _make_engine()
44
+ result = engine.process_schema(schema, dataframes_in={"a": df_a, "b": df_b})
45
+
46
+ df_a.join.assert_called_once_with(df_b, on=["id"], how="inner")
47
+ df_b.columns.assert_not_called() # ← no AnalyzePlan gRPC
48
+ assert result == joined
49
+
50
+ def test_same_keys_list_join_default_left(self):
51
+ """Default join type is 'left'."""
52
+ df_a = MagicMock(name="df_a")
53
+ df_b = MagicMock(name="df_b")
54
+ df_a.join.return_value = _joined_df()
55
+
56
+ schema = {
57
+ "join": [{"table_from": "a", "table_to": "b", "on_from": "k", "on_to": "k"}]
58
+ }
59
+
60
+ engine = _make_engine()
61
+ engine.process_schema(schema, dataframes_in={"a": df_a, "b": df_b})
62
+
63
+ df_a.join.assert_called_once_with(df_b, on=["k"], how="left")
64
+
65
+ def test_different_keys_uses_expression_join_and_drop(self):
66
+ """on_from != on_to → expression join, then drop right-side keys. No .columns call."""
67
+ df_a = MagicMock(name="df_a")
68
+ df_b = MagicMock(name="df_b")
69
+ joined = _joined_df()
70
+ df_a.join.return_value = joined
71
+
72
+ schema = {
73
+ "join": [{
74
+ "table_from": "a",
75
+ "table_to": "b",
76
+ "on_from": "order_id",
77
+ "on_to": "id",
78
+ "type": "left",
79
+ }]
80
+ }
81
+
82
+ engine = _make_engine()
83
+ result = engine.process_schema(schema, dataframes_in={"a": df_a, "b": df_b})
84
+
85
+ # join must be called with an expression condition (not a list of strings)
86
+ join_args, join_kwargs = df_a.join.call_args
87
+ assert join_args[0] is df_b
88
+ assert join_args[2] == "left"
89
+ assert "on" not in join_kwargs # expression-based, not list-based
90
+
91
+ # drop must be called on joined df with df_b["id"] reference
92
+ joined.drop.assert_called_once_with(df_b["id"])
93
+
94
+ df_b.columns.assert_not_called() # ← no AnalyzePlan gRPC
95
+
96
+ def test_multi_key_same_names(self):
97
+ """Multi-column join with same key names uses list-join."""
98
+ df_a = MagicMock(name="df_a")
99
+ df_b = MagicMock(name="df_b")
100
+ df_a.join.return_value = _joined_df()
101
+
102
+ schema = {
103
+ "join": [{
104
+ "table_from": "a",
105
+ "table_to": "b",
106
+ "on_from": ["org", "date"],
107
+ "on_to": ["org", "date"],
108
+ "type": "left",
109
+ }]
110
+ }
111
+
112
+ engine = _make_engine()
113
+ engine.process_schema(schema, dataframes_in={"a": df_a, "b": df_b})
114
+
115
+ df_a.join.assert_called_once_with(df_b, on=["org", "date"], how="left")
116
+ df_b.columns.assert_not_called()
117
+
118
+ def test_multi_key_different_names_drops_all_right_keys(self):
119
+ """Multi-column join with different key names drops all right-side keys."""
120
+ df_a = MagicMock(name="df_a")
121
+ df_b = MagicMock(name="df_b")
122
+ joined = _joined_df()
123
+ df_a.join.return_value = joined
124
+
125
+ schema = {
126
+ "join": [{
127
+ "table_from": "a",
128
+ "table_to": "b",
129
+ "on_from": ["org_l", "date_l"],
130
+ "on_to": ["org_r", "date_r"],
131
+ "type": "inner",
132
+ }]
133
+ }
134
+
135
+ engine = _make_engine()
136
+ engine.process_schema(schema, dataframes_in={"a": df_a, "b": df_b})
137
+
138
+ # drop must receive both right-side key refs
139
+ joined.drop.assert_called_once_with(df_b["org_r"], df_b["date_r"])
140
+ df_b.columns.assert_not_called()
@@ -0,0 +1,26 @@
1
+ """
2
+ Tests for RuleRegistry import path compatibility.
3
+ All import paths must resolve to the same singleton class.
4
+ """
5
+ from satisfactoscript.core.registry import RuleRegistry as RR_core
6
+ from satisfactoscript.registry import RuleRegistry as RR_shim
7
+ from satisfactoscript import RuleRegistry as RR_top
8
+
9
+
10
+ def test_all_import_paths_resolve_to_same_class():
11
+ """All three import paths must return the exact same class (same singleton)."""
12
+ assert RR_core is RR_shim
13
+ assert RR_core is RR_top
14
+
15
+
16
+ def test_rule_registered_via_shim_is_visible_everywhere():
17
+ """A rule registered using the shim import must be retrievable from all paths."""
18
+ @RR_shim.register_rule(name="__test_shim_rule__")
19
+ def _my_rule(df):
20
+ return df
21
+
22
+ assert RR_core.get_rule("__test_shim_rule__") is _my_rule
23
+ assert RR_top.get_rule("__test_shim_rule__") is _my_rule
24
+
25
+ # Cleanup
26
+ RR_core._rules.pop("__test_shim_rule__", None)
@@ -0,0 +1,46 @@
1
+ """
2
+ Tests for satisfactoscript.utils.safe_columns.
3
+ """
4
+ import pytest
5
+ from unittest.mock import MagicMock, PropertyMock
6
+ from satisfactoscript import safe_columns
7
+
8
+
9
+ class TestSafeColumns:
10
+
11
+ def test_returns_df_columns_normally(self):
12
+ df = MagicMock()
13
+ type(df).columns = PropertyMock(return_value=["id", "name", "value"])
14
+
15
+ assert safe_columns(df) == ["id", "name", "value"]
16
+
17
+ def test_returns_empty_list_on_usercontext_grpc_error(self):
18
+ df = MagicMock()
19
+ type(df).columns = PropertyMock(
20
+ side_effect=Exception("gRPC: Missing required field 'UserContext' in the request.")
21
+ )
22
+
23
+ result = safe_columns(df)
24
+
25
+ assert result == []
26
+
27
+ def test_returns_empty_list_on_invalid_argument_grpc_error(self):
28
+ df = MagicMock()
29
+ type(df).columns = PropertyMock(
30
+ side_effect=Exception("StatusCode.INVALID_ARGUMENT details: something")
31
+ )
32
+
33
+ result = safe_columns(df)
34
+
35
+ assert result == []
36
+
37
+ def test_reraises_unrelated_exceptions(self):
38
+ df = MagicMock()
39
+ type(df).columns = PropertyMock(side_effect=RuntimeError("disk full"))
40
+
41
+ with pytest.raises(RuntimeError, match="disk full"):
42
+ safe_columns(df)
43
+
44
+ def test_importable_from_satisfactoscript_top_level(self):
45
+ from satisfactoscript import safe_columns as sc
46
+ assert callable(sc)