ygg 0.1.34__tar.gz → 0.1.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.34 → ygg-0.1.38}/PKG-INFO +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/pyproject.toml +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/PKG-INFO +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt +1 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py +48 -17
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py +2 -2
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py +25 -8
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py +43 -28
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py +36 -51
- ygg-0.1.38/src/yggdrasil/pyutils/equality.py +107 -0
- ygg-0.1.38/src/yggdrasil/version.py +1 -0
- ygg-0.1.34/src/yggdrasil/version.py +0 -1
- {ygg-0.1.34 → ygg-0.1.38}/LICENSE +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/README.md +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/setup.cfg +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/dependency_links.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/entry_points.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/requires.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/top_level.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/config.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/exceptions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/types.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/io.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path_kind.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/workspace.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/dataclass.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/databrickslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/pandaslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/polarslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/sparklib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/callable_serde.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/exceptions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/modules.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/parallel.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/python_env.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/retry.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/msal.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/session.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/cast_options.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/registry.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/libs.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_arrow.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_defaults.py +0 -0

{ygg-0.1.34 → ygg-0.1.38}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ygg"
-version = "0.1.34"
+version = "0.1.38"
 description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }

{ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt
@@ -39,6 +39,7 @@ src/yggdrasil/libs/extensions/polars_extensions.py
 src/yggdrasil/libs/extensions/spark_extensions.py
 src/yggdrasil/pyutils/__init__.py
 src/yggdrasil/pyutils/callable_serde.py
+src/yggdrasil/pyutils/equality.py
 src/yggdrasil/pyutils/exceptions.py
 src/yggdrasil/pyutils/expiring_dict.py
 src/yggdrasil/pyutils/modules.py

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py
@@ -24,6 +24,7 @@ from .execution_context import ExecutionContext
 from ..workspaces.workspace import WorkspaceService, Workspace
 from ... import retry, CallableSerde
 from ...libs.databrickslib import databricks_sdk
+from ...pyutils.equality import dicts_equal, dict_diff
 from ...pyutils.expiring_dict import ExpiringDict
 from ...pyutils.modules import PipIndexSettings
 from ...pyutils.python_env import PythonEnv
@@ -110,7 +111,7 @@ class Cluster(WorkspaceService):
 
     _details: Optional["ClusterDetails"] = dataclasses.field(default=None, repr=False)
     _details_refresh_time: float = dataclasses.field(default=0, repr=False)
-    _system_context: Optional[ExecutionContext] = None
+    _system_context: Optional[ExecutionContext] = dataclasses.field(default=None, repr=False)
 
     # host → Cluster instance
     _env_clusters: ClassVar[Dict[str, "Cluster"]] = {}
@@ -309,6 +310,11 @@ class Cluster(WorkspaceService):
             self.details = self.clusters_client().get(cluster_id=self.cluster_id)
         return self._details
 
+    def refresh(self, max_delay: float | None = None):
+        self.details = self.fresh_details(max_delay=max_delay)
+
+        return self
+
     @details.setter
     def details(self, value: "ClusterDetails"):
         """Cache cluster details and update identifiers."""
@@ -321,10 +327,10 @@ class Cluster(WorkspaceService):
     @property
     def state(self):
         """Return the current cluster state."""
-
+        self.refresh()
 
-        if
-            return
+        if self._details is not None:
+            return self._details.state
         return State.UNKNOWN
 
     @property
@@ -355,7 +361,7 @@ class Cluster(WorkspaceService):
     def wait_for_status(
         self,
         tick: float = 0.5,
-        timeout: float = 600,
+        timeout: Union[float, dt.timedelta] = 600,
         backoff: int = 2,
         max_sleep_time: float = 15
     ):
@@ -373,6 +379,9 @@ class Cluster(WorkspaceService):
         start = time.time()
         sleep_time = tick
 
+        if isinstance(timeout, dt.timedelta):
+            timeout = timeout.total_seconds()
+
         while self.is_pending:
             time.sleep(sleep_time)
 
@@ -658,8 +667,6 @@ class Cluster(WorkspaceService):
         Returns:
             The updated Cluster instance.
         """
-        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
-
         existing_details = {
             k: v
             for k, v in self.details.as_shallow_dict().items()
@@ -672,22 +679,36 @@ class Cluster(WorkspaceService):
             if k in _EDIT_ARG_NAMES
         }
 
-
+        same = dicts_equal(
+            existing_details,
+            update_details,
+            keys=_EDIT_ARG_NAMES,
+            treat_missing_as_none=True,
+            float_tol=0.0,  # set e.g. 1e-6 if you have float-y stuff
+        )
+
+        if not same:
+            diff = {
+                k: v[1]
+                for k, v in dict_diff(existing_details, update_details, keys=_EDIT_ARG_NAMES).items()
+            }
+
             logger.debug(
                 "Updating %s with %s",
-                self,
+                self, diff
             )
 
             self.wait_for_status()
-            self.details =
-
-            )(**update_details)
+            self.details = self.clusters_client().edit(**update_details)
+            self.wait_for_status()
 
             logger.info(
                 "Updated %s",
                 self
             )
 
+        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
+
         return self
 
     def list_clusters(self) -> Iterator["Cluster"]:
@@ -742,7 +763,10 @@ class Cluster(WorkspaceService):
             return None
 
         return Cluster(
-            workspace=self.workspace,
+            workspace=self.workspace,
+            cluster_id=details.cluster_id,
+            cluster_name=details.cluster_name,
+            _details=details
         )
 
         for cluster in self.list_clusters():
@@ -760,16 +784,18 @@ class Cluster(WorkspaceService):
 
     def ensure_running(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Ensure the cluster is running.
 
         Returns:
             The current Cluster instance.
         """
-        return self.start()
+        return self.start(wait_timeout=wait_timeout)
 
     def start(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Start the cluster if it is not already running.
 
@@ -780,8 +806,13 @@ class Cluster(WorkspaceService):
 
         if not self.is_running:
             logger.info("Starting %s", self)
-
-
+
+            if wait_timeout:
+                self.clusters_client().start(cluster_id=self.cluster_id)
+                self.wait_for_status(timeout=wait_timeout.total_seconds())
+                self.wait_installed_libraries(timeout=wait_timeout)
+            else:
+                self.clusters_client().start(cluster_id=self.cluster_id)
 
         return self
 
@@ -1124,7 +1155,7 @@ class Cluster(WorkspaceService):
                 "Waiting %s to install libraries timed out" % self
             )
 
-            time.sleep(
+            time.sleep(5)
             statuses = list(self.installed_library_statuses())
 
         return self
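
The cluster.py hunks above add a `refresh()` helper, let `wait_for_status` take either seconds or a `datetime.timedelta`, and give `start()`/`ensure_running()` a `wait_timeout` that also waits for library installation. A minimal usage sketch based only on those signatures; how the `Cluster` is obtained (the `workspace` object and cluster name here) is an assumption, not shown in this diff:

```python
import datetime as dt

from yggdrasil.databricks.compute.cluster import Cluster

# Assumed setup: `workspace` is a connected Workspace; the cluster name is illustrative.
cluster = Cluster(workspace=workspace, cluster_name="my-cluster")

# Start and block until RUNNING, then wait for library installs (default is 20 minutes).
cluster.ensure_running(wait_timeout=dt.timedelta(minutes=20))

# Fire-and-forget start: no status or library wait.
cluster.start(wait_timeout=None)

# timeout may now be a timedelta as well as a float number of seconds.
cluster.wait_for_status(timeout=dt.timedelta(minutes=5))

# refresh() re-reads cluster details and returns the instance, so it chains.
current_state = cluster.refresh().state
```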

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py
@@ -78,8 +78,8 @@ class ExecutionContext:
     language: Optional["Language"] = None
     context_id: Optional[str] = None
 
-    _was_connected: Optional[bool] = None
-    _remote_metadata: Optional[RemoteMetadata] = None
+    _was_connected: Optional[bool] = dc.field(default=None, repr=False)
+    _remote_metadata: Optional[RemoteMetadata] = dc.field(default=None, repr=False)
 
     _lock: threading.RLock = dc.field(default_factory=threading.RLock, init=False, repr=False)
 

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py
@@ -2,11 +2,12 @@
 
 import datetime as dt
 import logging
+import os
 from typing import (
     Callable,
     Optional,
     TypeVar,
-    List, TYPE_CHECKING,
+    List, TYPE_CHECKING, Union,
 )
 
 if TYPE_CHECKING:
@@ -25,10 +26,15 @@ ReturnType = TypeVar("ReturnType")
 logger = logging.getLogger(__name__)
 
 
+def identity(x):
+    return x
+
+
 def databricks_remote_compute(
+    _func: Optional[Callable] = None,
     cluster_id: Optional[str] = None,
     cluster_name: Optional[str] = None,
-    workspace: Optional[Workspace] = None,
+    workspace: Optional[Union[Workspace, str]] = None,
     cluster: Optional["Cluster"] = None,
     timeout: Optional[dt.timedelta] = None,
     env_keys: Optional[List[str]] = None,
@@ -38,6 +44,7 @@ def databricks_remote_compute(
     """Return a decorator that executes functions on a remote cluster.
 
     Args:
+        _func: function to decorate
         cluster_id: Optional cluster id to target.
         cluster_name: Optional cluster name to target.
         workspace: Workspace instance or host string for lookup.
@@ -51,13 +58,19 @@ def databricks_remote_compute(
         A decorator that runs functions on the resolved Databricks cluster.
     """
     if force_local or Workspace.is_in_databricks_environment():
-
-
+        return identity if _func is None else _func
+
+    if workspace is None:
+        workspace = os.getenv("DATABRICKS_HOST")
 
-
+    if workspace is None:
+        return identity if _func is None else _func
 
-    if isinstance(workspace,
-
+    if not isinstance(workspace, Workspace):
+        if isinstance(workspace, str):
+            workspace = Workspace(host=workspace).connect(clone=False)
+        else:
+            raise ValueError("Cannot initialize databricks workspace with %s" % type(workspace))
 
     if cluster is None:
         if cluster_id or cluster_name:
@@ -68,10 +81,14 @@ def databricks_remote_compute(
         else:
             cluster = workspace.clusters().replicated_current_environment(
                 workspace=workspace,
-                cluster_name=cluster_name
+                cluster_name=cluster_name,
+                single_user_name=workspace.current_user.user_name
             )
 
+    cluster.ensure_running(wait_timeout=None)
+
     return cluster.execution_decorator(
+        _func=_func,
         env_keys=env_keys,
         timeout=timeout,
         **options
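
The remote.py changes add a leading `_func` parameter, an `identity` fallback, and acceptance of a host string (or the `DATABRICKS_HOST` variable) in place of a `Workspace`. That suggests the decorator works both bare and parametrized; a hedged sketch, with the decorated bodies purely illustrative:

```python
from yggdrasil.databricks.compute.remote import databricks_remote_compute

# Bare form: the function lands in _func; inside Databricks, with force_local,
# or when no workspace/DATABRICKS_HOST can be resolved, it simply runs locally.
@databricks_remote_compute
def add(a: int, b: int) -> int:
    return a + b

# Parametrized form: workspace may be a Workspace instance or a host string.
@databricks_remote_compute(
    cluster_name="dev-cluster",
    workspace="https://adb-1234567890123456.7.azuredatabricks.net",  # illustrative host
)
def heavy_job(n: int) -> int:
    return sum(range(n))
```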

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py
@@ -198,8 +198,7 @@ class SQLEngine(WorkspaceService):
         """Short, single-line preview for logs (avoids spewing giant SQL)."""
         if not sql:
             return ""
-
-        return one_line[:limit] + ("…" if len(one_line) > limit else "")
+        return sql[:limit] + ("…" if len(sql) > limit else "")
 
     def execute(
         self,
@@ -218,7 +217,6 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
         wait_result: bool = True,
-        **kwargs,
     ) -> "StatementResult":
         """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
 
@@ -245,7 +243,6 @@ class SQLEngine(WorkspaceService):
             schema_name: Optional schema override for API engine.
             table_name: Optional table override used when `statement` is None.
             wait_result: Whether to block until completion (API engine).
-            **kwargs: Extra params forwarded to Databricks SDK execute_statement.
 
         Returns:
             StatementResult.
@@ -263,9 +260,12 @@ class SQLEngine(WorkspaceService):
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")
 
-
-
-
+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)
+
+            logger.info("Spark SQL executed: %s", self._sql_preview(statement))
 
             # Avoid Disposition dependency if SDK imports are absent
             spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
@@ -287,7 +287,6 @@ class SQLEngine(WorkspaceService):
         if not statement:
             full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
             statement = f"SELECT * FROM {full_name}"
-            logger.debug("Autogenerated statement: %s", self._sql_preview(statement))
 
         if not warehouse_id:
             warehouse_id = self._get_or_default_warehouse_id()
@@ -310,11 +309,14 @@ class SQLEngine(WorkspaceService):
             engine=self,
             statement_id=response.statement_id,
             _response=response,
-            _response_refresh_time=time.time(),
             disposition=disposition,
         )
 
-
+        logger.info(
+            "API SQL executed: %s",
+            self._sql_preview(statement)
+        )
+
         return execution.wait() if wait_result else execution
 
     def spark_table(
@@ -465,15 +467,7 @@ class SQLEngine(WorkspaceService):
             safe_chars=True,
         )
 
-
-            "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
-            location,
-            mode,
-            match_by,
-            zorder_by,
-        )
-
-        with self as connected:
+        with self.connect() as connected:
             if existing_schema is None:
                 try:
                     existing_schema = connected.get_table_schema(
@@ -482,7 +476,6 @@ class SQLEngine(WorkspaceService):
                         table_name=table_name,
                         to_arrow_schema=True,
                     )
-                    logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
                 except ValueError as exc:
                     data_tbl = convert(data, pa.Table)
                     existing_schema = data_tbl.schema
@@ -527,7 +520,20 @@ class SQLEngine(WorkspaceService):
 
             transaction_id = self._random_suffix()
 
-            data_tbl = convert(
+            data_tbl = convert(
+                data, pa.Table,
+                options=cast_options, target_field=existing_schema
+            )
+            num_rows = data_tbl.num_rows
+
+            logger.debug(
+                "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
 
             # Write in temp volume
             temp_volume_path = connected.dbfs_path(
@@ -545,7 +551,6 @@ class SQLEngine(WorkspaceService):
             statements: list[str] = []
 
             if match_by:
-                logger.info("Using MERGE INTO (match_by=%s)", match_by)
                 on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])
 
                 update_cols = [c for c in columns if c not in match_by]
@@ -588,6 +593,15 @@ FROM parquet.`{temp_volume_path}`"""
             except Exception:
                 logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
 
+            logger.info(
+                "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
+
             if zorder_by:
                 zcols = ", ".join([f"`{c}`" for c in zorder_by])
                 optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
@@ -675,7 +689,6 @@ FROM parquet.`{temp_volume_path}`"""
                 table_name=table_name,
                 to_arrow_schema=False,
             )
-            logger.debug("Fetched destination Spark schema for %s", location)
         except ValueError:
             logger.warning("Destination table missing; creating table %s via overwrite write", location)
             data = convert(data, pyspark.sql.DataFrame)
@@ -704,10 +717,8 @@ FROM parquet.`{temp_volume_path}`"""
 
         if match_by:
             cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
-            logger.info("Running Delta MERGE (cond=%s)", cond)
 
             if mode.casefold() == "overwrite":
-                logger.info("Overwrite-by-key mode: delete matching keys then append")
                 data = data.cache()
                 distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()
 
@@ -815,6 +826,7 @@ FROM parquet.`{temp_volume_path}`"""
         optimize_write: bool = True,
         auto_compact: bool = True,
         execute: bool = True,
+        wait_result: bool = True
     ) -> Union[str, "StatementResult"]:
         """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
 
@@ -832,6 +844,7 @@ FROM parquet.`{temp_volume_path}`"""
             optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
             auto_compact: Sets delta.autoOptimize.autoCompact table property.
             execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
+            wait_result: Waits execution to complete
 
         Returns:
             StatementResult if execute=True, else the DDL SQL string.
@@ -897,11 +910,13 @@ FROM parquet.`{temp_volume_path}`"""
 
         statement = "\n".join(sql)
 
-        logger.
-
+        logger.debug(
+            "Generated CREATE TABLE DDL for %s:\n%s",
+            location, statement
+        )
 
         if execute:
-            return self.execute(statement)
+            return self.execute(statement, wait_result=wait_result)
         return statement
 
     def _check_location_params(
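
With `**kwargs` removed, `execute()` keeps an explicit `wait_result` flag, and the CREATE TABLE helper now forwards one of its own. A small sketch of the non-blocking path; `engine` is assumed to be an already-connected `SQLEngine`, and the table name is illustrative:

```python
# Assumed: `engine` is a connected SQLEngine instance.
result = engine.execute(
    "SELECT * FROM main.analytics.events",
    wait_result=False,   # return the StatementResult without blocking
)

# Block later; execute() itself calls .wait() when wait_result=True.
result = result.wait()
```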

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py
@@ -44,6 +44,15 @@ if TYPE_CHECKING:
     from .engine import SQLEngine
 
 
+DONE_STATES = {
+    StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED,
+    StatementState.SUCCEEDED
+}
+
+FAILED_STATES = {
+    StatementState.FAILED, StatementState.CANCELED
+}
+
 __all__ = [
     "StatementResult"
 ]
@@ -57,7 +66,6 @@ class StatementResult:
     disposition: "Disposition"
 
     _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
-    _response_refresh_time: float = dataclasses.field(default=0, repr=False)
 
     _spark_df: Optional[SparkDataFrame] = dataclasses.field(default=None, repr=False)
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
@@ -101,8 +109,30 @@ class StatementResult:
         Returns:
             The current StatementResponse object.
         """
-        if self.
-
+        if self.is_spark_sql:
+            return StatementResponse(
+                statement_id=self.statement_id or "sparksql",
+                status=StatementStatus(
+                    state=StatementState.SUCCEEDED
+                )
+            )
+        elif not self.statement_id:
+            return StatementResponse(
+                statement_id="unknown",
+                status=StatementStatus(
+                    state=StatementState.PENDING
+                )
+            )
+
+        statement_execution = self.workspace.sdk().statement_execution
+
+        if self._response is None:
+            # Initialize
+            self._response = statement_execution.get_statement(self.statement_id)
+        elif self._response.status.state not in DONE_STATES:
+            # Refresh
+            self._response = statement_execution.get_statement(self.statement_id)
+
         return self._response
 
     @response.setter
@@ -113,27 +143,8 @@ class StatementResult:
             value: StatementResponse to cache.
         """
         self._response = value
-        self._response_refresh_time = time.time()
-
         self.statement_id = self._response.statement_id
 
-    def fresh_response(self, delay: float):
-        """Refresh the response if it is older than ``delay`` seconds.
-
-        Args:
-            delay: Minimum age in seconds before refreshing.
-
-        Returns:
-            The refreshed StatementResponse object.
-        """
-        if self.is_spark_sql:
-            return self._response
-
-        if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
-            self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
-
-        return self._response
-
     def result_data_at(self, chunk_index: int):
         """Fetch a specific result chunk by index.
 
@@ -166,17 +177,7 @@ class StatementResult:
         Returns:
             A StatementStatus object.
         """
-
-            return StatementStatus(
-                state=StatementState.SUCCEEDED
-            )
-
-        if not self.statement_id:
-            return StatementStatus(
-                state=StatementState.PENDING
-            )
-
-        return self.fresh_response(delay=1).status
+        return self.response.status
 
     @property
     def state(self):
@@ -194,8 +195,6 @@ class StatementResult:
        Returns:
            The result manifest or None for Spark SQL results.
        """
-        if self.is_spark_sql:
-            return None
         return self.response.manifest
 
     @property
@@ -214,15 +213,7 @@ class StatementResult:
        Returns:
            True if the statement is done, otherwise False.
        """
-
-            return True
-
-        if self._response is None:
-            return False
-
-        return self._response.status.state in [
-            StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED, StatementState.SUCCEEDED
-        ]
+        return self.state in DONE_STATES
 
     @property
     def failed(self):
@@ -231,13 +222,7 @@ class StatementResult:
        Returns:
            True if the statement failed or was cancelled.
        """
-
-            return True
-
-        if self._response is None:
-            return False
-
-        return self._response.status.state in [StatementState.CANCELED, StatementState.FAILED]
+        return self.state in FAILED_STATES
 
     @property
     def persisted(self):
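
statement_result.py now derives `done` and `failed` from the module-level `DONE_STATES` and `FAILED_STATES` sets, and `response` refreshes itself only while the statement is not yet in a terminal state. A short sketch of checking a result; `result` is assumed to come from `SQLEngine.execute`:

```python
from yggdrasil.databricks.sql.statement_result import DONE_STATES, FAILED_STATES

# Assumed: `result` is a StatementResult returned by SQLEngine.execute(...).
if result.done:        # equivalent to: result.state in DONE_STATES
    if result.failed:  # equivalent to: result.state in FAILED_STATES
        raise RuntimeError(f"Statement {result.statement_id} failed")
```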

ygg-0.1.38/src/yggdrasil/pyutils/equality.py (new file)
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import math
+from typing import Any, Dict, Iterable, Tuple
+
+_MISSING = object()
+
+
+__all__ = [
+    "dicts_equal",
+    "dict_diff"
+]
+
+
+def _normalize(obj: Any) -> Any:
+    """
+    Normalize nested structures so equality is stable:
+      - dict: sort keys + normalize values
+      - list/tuple: normalize items (keeps order)
+      - set: sort normalized items (orderless)
+      - float: keep as float (handled separately for tolerance)
+    """
+    if isinstance(obj, dict):
+        return {k: _normalize(obj[k]) for k in sorted(obj.keys())}
+    if isinstance(obj, (list, tuple)):
+        return [_normalize(x) for x in obj]
+    if isinstance(obj, set):
+        return sorted(_normalize(x) for x in obj)
+    return obj
+
+def _equal(a: Any, b: Any, float_tol: float = 0.0) -> bool:
+    # Float tolerance (optional)
+    if isinstance(a, float) or isinstance(b, float):
+        if a is None or b is None:
+            return a is b
+        try:
+            return math.isclose(float(a), float(b), rel_tol=float_tol, abs_tol=float_tol)
+        except Exception:
+            pass
+
+    # Deep normalize compare for dict/list/set
+    return _normalize(a) == _normalize(b)
+
+def dicts_equal(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> bool:
+    """
+    Equality check for two dicts with options:
+      - keys: only compare these keys
+      - treat_missing_as_none: missing key == None if other side is None
+      - float_tol: tolerance for float comparisons
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            return False
+
+    return True
+
+def dict_diff(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> Dict[str, Tuple[Any, Any]]:
+    """
+    Returns {key: (a_val, b_val)} for all keys that differ.
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    out: Dict[str, Tuple[Any, Any]] = {}
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            out[k] = (None if av is _MISSING else av, None if bv is _MISSING else bv)
+    return out
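
The new pyutils/equality.py module backs the `dicts_equal`/`dict_diff` check added to cluster.py above. A usage example built directly on the functions in this file; the cluster-style keys are illustrative:

```python
from yggdrasil.pyutils.equality import dicts_equal, dict_diff

existing = {"num_workers": 2, "spark_version": "14.3.x-scala2.12", "autotermination_minutes": None}
desired = {"num_workers": 4, "spark_version": "14.3.x-scala2.12"}

# A key missing on one side compares equal to None on the other (treat_missing_as_none=True).
dicts_equal(existing, desired)                          # False (num_workers differs)
dicts_equal(existing, desired, keys=["spark_version"])  # True

# dict_diff returns {key: (a_val, b_val)} for every key that differs.
dict_diff(existing, desired)                            # {"num_workers": (2, 4)}
```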

ygg-0.1.38/src/yggdrasil/version.py (new file)
@@ -0,0 +1 @@
+__version__ = "0.1.38"

ygg-0.1.34/src/yggdrasil/version.py (deleted)
@@ -1 +0,0 @@
-__version__ = "0.1.34"