ygg 0.1.34.tar.gz → 0.1.38.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {ygg-0.1.34 → ygg-0.1.38}/PKG-INFO +1 -1
  2. {ygg-0.1.34 → ygg-0.1.38}/pyproject.toml +1 -1
  3. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/PKG-INFO +1 -1
  4. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt +1 -0
  5. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py +48 -17
  6. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py +2 -2
  7. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py +25 -8
  8. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py +43 -28
  9. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py +36 -51
  10. ygg-0.1.38/src/yggdrasil/pyutils/equality.py +107 -0
  11. ygg-0.1.38/src/yggdrasil/version.py +1 -0
  12. ygg-0.1.34/src/yggdrasil/version.py +0 -1
  13. {ygg-0.1.34 → ygg-0.1.38}/LICENSE +0 -0
  14. {ygg-0.1.34 → ygg-0.1.38}/README.md +0 -0
  15. {ygg-0.1.34 → ygg-0.1.38}/setup.cfg +0 -0
  16. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/dependency_links.txt +0 -0
  17. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/entry_points.txt +0 -0
  18. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/requires.txt +0 -0
  19. {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/top_level.txt +0 -0
  20. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/__init__.py +0 -0
  21. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/__init__.py +0 -0
  22. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/__init__.py +0 -0
  23. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
  24. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/config.py +0 -0
  25. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/__init__.py +0 -0
  26. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/exceptions.py +0 -0
  27. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/types.py +0 -0
  28. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
  29. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
  30. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/io.py +0 -0
  31. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path.py +0 -0
  32. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path_kind.py +0 -0
  33. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/workspace.py +0 -0
  34. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/__init__.py +0 -0
  35. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/dataclass.py +0 -0
  36. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/__init__.py +0 -0
  37. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/databrickslib.py +0 -0
  38. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/__init__.py +0 -0
  39. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
  40. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
  41. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/pandaslib.py +0 -0
  42. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/polarslib.py +0 -0
  43. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/sparklib.py +0 -0
  44. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/__init__.py +0 -0
  45. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/callable_serde.py +0 -0
  46. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/exceptions.py +0 -0
  47. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
  48. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/modules.py +0 -0
  49. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/parallel.py +0 -0
  50. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/python_env.py +0 -0
  51. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/retry.py +0 -0
  52. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/__init__.py +0 -0
  53. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/msal.py +0 -0
  54. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/session.py +0 -0
  55. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/__init__.py +0 -0
  56. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/__init__.py +0 -0
  57. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
  58. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/cast_options.py +0 -0
  59. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
  60. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_cast.py +0 -0
  61. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
  62. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/registry.py +0 -0
  63. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_cast.py +0 -0
  64. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
  65. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
  66. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/libs.py +0 -0
  67. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_arrow.py +0 -0
  68. {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_defaults.py +0 -0

{ygg-0.1.34 → ygg-0.1.38}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ygg
- Version: 0.1.34
+ Version: 0.1.38
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
  Author: Yggdrasil contributors
  License: Apache License

{ygg-0.1.34 → ygg-0.1.38}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ygg"
- version = "0.1.34"
+ version = "0.1.38"
  description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
  readme = { file = "README.md", content-type = "text/markdown" }
  license = { file = "LICENSE" }

{ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ygg
- Version: 0.1.34
+ Version: 0.1.38
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
  Author: Yggdrasil contributors
  License: Apache License

{ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt
@@ -39,6 +39,7 @@ src/yggdrasil/libs/extensions/polars_extensions.py
  src/yggdrasil/libs/extensions/spark_extensions.py
  src/yggdrasil/pyutils/__init__.py
  src/yggdrasil/pyutils/callable_serde.py
+ src/yggdrasil/pyutils/equality.py
  src/yggdrasil/pyutils/exceptions.py
  src/yggdrasil/pyutils/expiring_dict.py
  src/yggdrasil/pyutils/modules.py

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py
@@ -24,6 +24,7 @@ from .execution_context import ExecutionContext
  from ..workspaces.workspace import WorkspaceService, Workspace
  from ... import retry, CallableSerde
  from ...libs.databrickslib import databricks_sdk
+ from ...pyutils.equality import dicts_equal, dict_diff
  from ...pyutils.expiring_dict import ExpiringDict
  from ...pyutils.modules import PipIndexSettings
  from ...pyutils.python_env import PythonEnv
@@ -110,7 +111,7 @@ class Cluster(WorkspaceService):

  _details: Optional["ClusterDetails"] = dataclasses.field(default=None, repr=False)
  _details_refresh_time: float = dataclasses.field(default=0, repr=False)
- _system_context: Optional[ExecutionContext] = None
+ _system_context: Optional[ExecutionContext] = dataclasses.field(default=None, repr=False)

  # host → Cluster instance
  _env_clusters: ClassVar[Dict[str, "Cluster"]] = {}
@@ -309,6 +310,11 @@ class Cluster(WorkspaceService):
  self.details = self.clusters_client().get(cluster_id=self.cluster_id)
  return self._details

+ def refresh(self, max_delay: float | None = None):
+ self.details = self.fresh_details(max_delay=max_delay)
+
+ return self
+
  @details.setter
  def details(self, value: "ClusterDetails"):
  """Cache cluster details and update identifiers."""
@@ -321,10 +327,10 @@ class Cluster(WorkspaceService):
  @property
  def state(self):
  """Return the current cluster state."""
- details = self.fresh_details(max_delay=10)
+ self.refresh()

- if details is not None:
- return details.state
+ if self._details is not None:
+ return self._details.state
  return State.UNKNOWN

  @property
@@ -355,7 +361,7 @@ class Cluster(WorkspaceService):
  def wait_for_status(
  self,
  tick: float = 0.5,
- timeout: float = 600,
+ timeout: Union[float, dt.timedelta] = 600,
  backoff: int = 2,
  max_sleep_time: float = 15
  ):
@@ -373,6 +379,9 @@ class Cluster(WorkspaceService):
  start = time.time()
  sleep_time = tick

+ if isinstance(timeout, dt.timedelta):
+ timeout = timeout.total_seconds()
+
  while self.is_pending:
  time.sleep(sleep_time)

@@ -658,8 +667,6 @@ class Cluster(WorkspaceService):
  Returns:
  The updated Cluster instance.
  """
- self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
-
  existing_details = {
  k: v
  for k, v in self.details.as_shallow_dict().items()
@@ -672,22 +679,36 @@ class Cluster(WorkspaceService):
  if k in _EDIT_ARG_NAMES
  }

- if update_details != existing_details:
+ same = dicts_equal(
+ existing_details,
+ update_details,
+ keys=_EDIT_ARG_NAMES,
+ treat_missing_as_none=True,
+ float_tol=0.0, # set e.g. 1e-6 if you have float-y stuff
+ )
+
+ if not same:
+ diff = {
+ k: v[1]
+ for k, v in dict_diff(existing_details, update_details, keys=_EDIT_ARG_NAMES).items()
+ }
+
  logger.debug(
  "Updating %s with %s",
- self, update_details
+ self, diff
  )

  self.wait_for_status()
- self.details = retry(tries=4, delay=0.5, max_delay=2)(
- self.clusters_client().edit_and_wait
- )(**update_details)
+ self.details = self.clusters_client().edit(**update_details)
+ self.wait_for_status()

  logger.info(
  "Updated %s",
  self
  )

+ self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
+
  return self

  def list_clusters(self) -> Iterator["Cluster"]:
@@ -742,7 +763,10 @@ class Cluster(WorkspaceService):
  return None

  return Cluster(
- workspace=self.workspace, cluster_id=details.cluster_id, _details=details
+ workspace=self.workspace,
+ cluster_id=details.cluster_id,
+ cluster_name=details.cluster_name,
+ _details=details
  )

  for cluster in self.list_clusters():
@@ -760,16 +784,18 @@ class Cluster(WorkspaceService):

  def ensure_running(
  self,
+ wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
  ) -> "Cluster":
  """Ensure the cluster is running.

  Returns:
  The current Cluster instance.
  """
- return self.start()
+ return self.start(wait_timeout=wait_timeout)

  def start(
  self,
+ wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
  ) -> "Cluster":
  """Start the cluster if it is not already running.

@@ -780,8 +806,13 @@ class Cluster(WorkspaceService):

  if not self.is_running:
  logger.info("Starting %s", self)
- self.details = self.clusters_client().start_and_wait(cluster_id=self.cluster_id)
- return self.wait_installed_libraries()
+
+ if wait_timeout:
+ self.clusters_client().start(cluster_id=self.cluster_id)
+ self.wait_for_status(timeout=wait_timeout.total_seconds())
+ self.wait_installed_libraries(timeout=wait_timeout)
+ else:
+ self.clusters_client().start(cluster_id=self.cluster_id)

  return self

@@ -1124,7 +1155,7 @@ class Cluster(WorkspaceService):
  "Waiting %s to install libraries timed out" % self
  )

- time.sleep(10)
+ time.sleep(5)
  statuses = list(self.installed_library_statuses())

  return self
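
Taken together, the cluster changes add an explicit refresh() helper, let wait_for_status() accept a datetime.timedelta, and thread an optional wait_timeout through start()/ensure_running(). A minimal usage sketch under those signatures; the host and cluster id below are placeholders, and the import paths are taken from this package's layout:

    import datetime as dt

    from yggdrasil.databricks.compute.cluster import Cluster
    from yggdrasil.databricks.workspaces.workspace import Workspace

    # Placeholder host and cluster id: substitute real values.
    workspace = Workspace(host="https://adb-1234567890123456.7.azuredatabricks.net").connect(clone=False)
    cluster = Cluster(workspace=workspace, cluster_id="0123-456789-abcdefgh")

    # start()/ensure_running() now take wait_timeout: a timedelta blocks until the
    # cluster is running and libraries are installed, None just issues the start call.
    cluster.ensure_running(wait_timeout=dt.timedelta(minutes=20))

    # refresh() re-fetches ClusterDetails and returns the cluster itself,
    # so state is read from the refreshed cache.
    print(cluster.refresh().state)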

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py
@@ -78,8 +78,8 @@ class ExecutionContext:
  language: Optional["Language"] = None
  context_id: Optional[str] = None

- _was_connected: Optional[bool] = None
- _remote_metadata: Optional[RemoteMetadata] = None
+ _was_connected: Optional[bool] = dc.field(default=None, repr=False)
+ _remote_metadata: Optional[RemoteMetadata] = dc.field(default=None, repr=False)

  _lock: threading.RLock = dc.field(default_factory=threading.RLock, init=False, repr=False)

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py
@@ -2,11 +2,12 @@

  import datetime as dt
  import logging
+ import os
  from typing import (
  Callable,
  Optional,
  TypeVar,
- List, TYPE_CHECKING,
+ List, TYPE_CHECKING, Union,
  )

  if TYPE_CHECKING:
@@ -25,10 +26,15 @@ ReturnType = TypeVar("ReturnType")
  logger = logging.getLogger(__name__)


+ def identity(x):
+ return x
+
+
  def databricks_remote_compute(
+ _func: Optional[Callable] = None,
  cluster_id: Optional[str] = None,
  cluster_name: Optional[str] = None,
- workspace: Optional[Workspace] = None,
+ workspace: Optional[Union[Workspace, str]] = None,
  cluster: Optional["Cluster"] = None,
  timeout: Optional[dt.timedelta] = None,
  env_keys: Optional[List[str]] = None,
@@ -38,6 +44,7 @@ def databricks_remote_compute(
  """Return a decorator that executes functions on a remote cluster.

  Args:
+ _func: function to decorate
  cluster_id: Optional cluster id to target.
  cluster_name: Optional cluster name to target.
  workspace: Workspace instance or host string for lookup.
@@ -51,13 +58,19 @@ def databricks_remote_compute(
  A decorator that runs functions on the resolved Databricks cluster.
  """
  if force_local or Workspace.is_in_databricks_environment():
- def identity(x):
- return x
+ return identity if _func is None else _func
+
+ if workspace is None:
+ workspace = os.getenv("DATABRICKS_HOST")

- return identity
+ if workspace is None:
+ return identity if _func is None else _func

- if isinstance(workspace, str):
- workspace = Workspace(host=workspace)
+ if not isinstance(workspace, Workspace):
+ if isinstance(workspace, str):
+ workspace = Workspace(host=workspace).connect(clone=False)
+ else:
+ raise ValueError("Cannot initialize databricks workspace with %s" % type(workspace))

  if cluster is None:
  if cluster_id or cluster_name:
@@ -68,10 +81,14 @@ def databricks_remote_compute(
  else:
  cluster = workspace.clusters().replicated_current_environment(
  workspace=workspace,
- cluster_name=cluster_name
+ cluster_name=cluster_name,
+ single_user_name=workspace.current_user.user_name
  )

+ cluster.ensure_running(wait_timeout=None)
+
  return cluster.execution_decorator(
+ _func=_func,
  env_keys=env_keys,
  timeout=timeout,
  **options
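
With the new _func parameter the decorator works both bare and parameterized, and a missing workspace now falls back to the DATABRICKS_HOST environment variable. A rough usage sketch; the host and cluster name are placeholders, and the actual remote-execution behaviour comes from Cluster.execution_decorator:

    from yggdrasil.databricks.compute.remote import databricks_remote_compute

    @databricks_remote_compute(
        workspace="https://adb-1234567890123456.7.azuredatabricks.net",  # or omit and set DATABRICKS_HOST
        cluster_name="ygg-remote",
    )
    def heavy_transform(n: int) -> int:
        # Runs on the resolved cluster; inside a Databricks environment
        # (or with force_local=True) the decorator degrades to identity
        # and the function runs locally.
        return sum(i * i for i in range(n))

    result = heavy_transform(10_000)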

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py
@@ -198,8 +198,7 @@ class SQLEngine(WorkspaceService):
  """Short, single-line preview for logs (avoids spewing giant SQL)."""
  if not sql:
  return ""
- one_line = " ".join(sql.split())
- return one_line[:limit] + ("…" if len(one_line) > limit else "")
+ return sql[:limit] + ("…" if len(sql) > limit else "")

  def execute(
  self,
@@ -218,7 +217,6 @@ class SQLEngine(WorkspaceService):
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
  wait_result: bool = True,
- **kwargs,
  ) -> "StatementResult":
  """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.

@@ -245,7 +243,6 @@ class SQLEngine(WorkspaceService):
  schema_name: Optional schema override for API engine.
  table_name: Optional table override used when `statement` is None.
  wait_result: Whether to block until completion (API engine).
- **kwargs: Extra params forwarded to Databricks SDK execute_statement.

  Returns:
  StatementResult.
@@ -263,9 +260,12 @@ class SQLEngine(WorkspaceService):
  if spark_session is None:
  raise ValueError("No spark session found to run sql query")

- t0 = time.time()
- df = spark_session.sql(statement)
- logger.info("Spark SQL executed in %.3fs: %s", time.time() - t0, self._sql_preview(statement))
+ df: SparkDataFrame = spark_session.sql(statement)
+
+ if row_limit:
+ df = df.limit(row_limit)
+
+ logger.info("Spark SQL executed: %s", self._sql_preview(statement))

  # Avoid Disposition dependency if SDK imports are absent
  spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
@@ -287,7 +287,6 @@ class SQLEngine(WorkspaceService):
  if not statement:
  full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
  statement = f"SELECT * FROM {full_name}"
- logger.debug("Autogenerated statement: %s", self._sql_preview(statement))

  if not warehouse_id:
  warehouse_id = self._get_or_default_warehouse_id()
@@ -310,11 +309,14 @@ class SQLEngine(WorkspaceService):
  engine=self,
  statement_id=response.statement_id,
  _response=response,
- _response_refresh_time=time.time(),
  disposition=disposition,
  )

- # BUGFIX: previously returned `wait_result` (a bool) on wait_result=False 🤦
+ logger.info(
+ "API SQL executed: %s",
+ self._sql_preview(statement)
+ )
+
  return execution.wait() if wait_result else execution

  def spark_table(
@@ -465,15 +467,7 @@ class SQLEngine(WorkspaceService):
  safe_chars=True,
  )

- logger.info(
- "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
- location,
- mode,
- match_by,
- zorder_by,
- )
-
- with self as connected:
+ with self.connect() as connected:
  if existing_schema is None:
  try:
  existing_schema = connected.get_table_schema(
@@ -482,7 +476,6 @@ class SQLEngine(WorkspaceService):
  table_name=table_name,
  to_arrow_schema=True,
  )
- logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
  except ValueError as exc:
  data_tbl = convert(data, pa.Table)
  existing_schema = data_tbl.schema
@@ -527,7 +520,20 @@ class SQLEngine(WorkspaceService):

  transaction_id = self._random_suffix()

- data_tbl = convert(data, pa.Table, options=cast_options, target_field=existing_schema)
+ data_tbl = convert(
+ data, pa.Table,
+ options=cast_options, target_field=existing_schema
+ )
+ num_rows = data_tbl.num_rows
+
+ logger.debug(
+ "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+ num_rows,
+ location,
+ mode,
+ match_by,
+ zorder_by,
+ )

  # Write in temp volume
  temp_volume_path = connected.dbfs_path(
@@ -545,7 +551,6 @@ class SQLEngine(WorkspaceService):
  statements: list[str] = []

  if match_by:
- logger.info("Using MERGE INTO (match_by=%s)", match_by)
  on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])

  update_cols = [c for c in columns if c not in match_by]
@@ -588,6 +593,15 @@ FROM parquet.`{temp_volume_path}`"""
  except Exception:
  logger.exception("Failed cleaning temp volume: %s", temp_volume_path)

+ logger.info(
+ "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+ num_rows,
+ location,
+ mode,
+ match_by,
+ zorder_by,
+ )
+
  if zorder_by:
  zcols = ", ".join([f"`{c}`" for c in zorder_by])
  optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
@@ -675,7 +689,6 @@ FROM parquet.`{temp_volume_path}`"""
  table_name=table_name,
  to_arrow_schema=False,
  )
- logger.debug("Fetched destination Spark schema for %s", location)
  except ValueError:
  logger.warning("Destination table missing; creating table %s via overwrite write", location)
  data = convert(data, pyspark.sql.DataFrame)
@@ -704,10 +717,8 @@ FROM parquet.`{temp_volume_path}`"""

  if match_by:
  cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
- logger.info("Running Delta MERGE (cond=%s)", cond)

  if mode.casefold() == "overwrite":
- logger.info("Overwrite-by-key mode: delete matching keys then append")
  data = data.cache()
  distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()

@@ -815,6 +826,7 @@ FROM parquet.`{temp_volume_path}`"""
  optimize_write: bool = True,
  auto_compact: bool = True,
  execute: bool = True,
+ wait_result: bool = True
  ) -> Union[str, "StatementResult"]:
  """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.

@@ -832,6 +844,7 @@ FROM parquet.`{temp_volume_path}`"""
  optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
  auto_compact: Sets delta.autoOptimize.autoCompact table property.
  execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
+ wait_result: Waits execution to complete

  Returns:
  StatementResult if execute=True, else the DDL SQL string.
@@ -897,11 +910,13 @@

  statement = "\n".join(sql)

- logger.info("Generated CREATE TABLE DDL for %s", location)
- logger.debug("DDL:\n%s", statement)
+ logger.debug(
+ "Generated CREATE TABLE DDL for %s:\n%s",
+ location, statement
+ )

  if execute:
- return self.execute(statement)
+ return self.execute(statement, wait_result=wait_result)
  return statement

  def _check_location_params(
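
For the execute() path, a small sketch of the blocking and non-blocking forms; `engine` stands in for a connected SQLEngine and the table name is a placeholder:

    # Fire-and-return: with wait_result=False, execute() hands back the
    # StatementResult without calling .wait() internally.
    pending = engine.execute(
        "SELECT * FROM main.default.events",
        wait_result=False,
    )

    # Block explicitly only when the outcome is needed.
    result = pending.wait()

    # Default behaviour is unchanged: wait_result=True blocks before returning.
    done = engine.execute("SELECT 1")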

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py
@@ -44,6 +44,15 @@ if TYPE_CHECKING:
  from .engine import SQLEngine


+ DONE_STATES = {
+ StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED,
+ StatementState.SUCCEEDED
+ }
+
+ FAILED_STATES = {
+ StatementState.FAILED, StatementState.CANCELED
+ }
+
  __all__ = [
  "StatementResult"
  ]
@@ -57,7 +66,6 @@ class StatementResult:
  disposition: "Disposition"

  _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
- _response_refresh_time: float = dataclasses.field(default=0, repr=False)

  _spark_df: Optional[SparkDataFrame] = dataclasses.field(default=None, repr=False)
  _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
@@ -101,8 +109,30 @@ class StatementResult:
  Returns:
  The current StatementResponse object.
  """
- if self._response is None and not self.is_spark_sql:
- self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
+ if self.is_spark_sql:
+ return StatementResponse(
+ statement_id=self.statement_id or "sparksql",
+ status=StatementStatus(
+ state=StatementState.SUCCEEDED
+ )
+ )
+ elif not self.statement_id:
+ return StatementResponse(
+ statement_id="unknown",
+ status=StatementStatus(
+ state=StatementState.PENDING
+ )
+ )
+
+ statement_execution = self.workspace.sdk().statement_execution
+
+ if self._response is None:
+ # Initialize
+ self._response = statement_execution.get_statement(self.statement_id)
+ elif self._response.status.state not in DONE_STATES:
+ # Refresh
+ self._response = statement_execution.get_statement(self.statement_id)
+
  return self._response

  @response.setter
@@ -113,27 +143,8 @@ class StatementResult:
  value: StatementResponse to cache.
  """
  self._response = value
- self._response_refresh_time = time.time()
-
  self.statement_id = self._response.statement_id

- def fresh_response(self, delay: float):
- """Refresh the response if it is older than ``delay`` seconds.
-
- Args:
- delay: Minimum age in seconds before refreshing.
-
- Returns:
- The refreshed StatementResponse object.
- """
- if self.is_spark_sql:
- return self._response
-
- if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
- self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
-
- return self._response
-
  def result_data_at(self, chunk_index: int):
  """Fetch a specific result chunk by index.

@@ -166,17 +177,7 @@ class StatementResult:
  Returns:
  A StatementStatus object.
  """
- if self.persisted:
- return StatementStatus(
- state=StatementState.SUCCEEDED
- )
-
- if not self.statement_id:
- return StatementStatus(
- state=StatementState.PENDING
- )
-
- return self.fresh_response(delay=1).status
+ return self.response.status

  @property
  def state(self):
@@ -194,8 +195,6 @@ class StatementResult:
  Returns:
  The result manifest or None for Spark SQL results.
  """
- if self.is_spark_sql:
- return None
  return self.response.manifest

  @property
@@ -214,15 +213,7 @@ class StatementResult:
  Returns:
  True if the statement is done, otherwise False.
  """
- if self.persisted:
- return True
-
- if self._response is None:
- return False
-
- return self._response.status.state in [
- StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED, StatementState.SUCCEEDED
- ]
+ return self.state in DONE_STATES

  @property
  def failed(self):
@@ -231,13 +222,7 @@ class StatementResult:
  Returns:
  True if the statement failed or was cancelled.
  """
- if self.persisted:
- return True
-
- if self._response is None:
- return False
-
- return self._response.status.state in [StatementState.CANCELED, StatementState.FAILED]
+ return self.state in FAILED_STATES

  @property
  def persisted(self):
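
A short polling sketch against the simplified properties; `result` stands for a StatementResult returned by SQLEngine.execute, and the sleep interval is arbitrary:

    import time

    # done / failed now reduce to membership of state in DONE_STATES / FAILED_STATES,
    # and the response property re-fetches itself while the statement is not done.
    while not result.done:
        time.sleep(1)

    if result.failed:
        raise RuntimeError(
            "Statement %s ended in state %s" % (result.statement_id, result.state)
        )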

ygg-0.1.38/src/yggdrasil/pyutils/equality.py
@@ -0,0 +1,107 @@
+ from __future__ import annotations
+
+ import math
+ from typing import Any, Dict, Iterable, Tuple
+
+ _MISSING = object()
+
+
+ __all__ = [
+ "dicts_equal",
+ "dict_diff"
+ ]
+
+
+ def _normalize(obj: Any) -> Any:
+ """
+ Normalize nested structures so equality is stable:
+ - dict: sort keys + normalize values
+ - list/tuple: normalize items (keeps order)
+ - set: sort normalized items (orderless)
+ - float: keep as float (handled separately for tolerance)
+ """
+ if isinstance(obj, dict):
+ return {k: _normalize(obj[k]) for k in sorted(obj.keys())}
+ if isinstance(obj, (list, tuple)):
+ return [_normalize(x) for x in obj]
+ if isinstance(obj, set):
+ return sorted(_normalize(x) for x in obj)
+ return obj
+
+ def _equal(a: Any, b: Any, float_tol: float = 0.0) -> bool:
+ # Float tolerance (optional)
+ if isinstance(a, float) or isinstance(b, float):
+ if a is None or b is None:
+ return a is b
+ try:
+ return math.isclose(float(a), float(b), rel_tol=float_tol, abs_tol=float_tol)
+ except Exception:
+ pass
+
+ # Deep normalize compare for dict/list/set
+ return _normalize(a) == _normalize(b)
+
+ def dicts_equal(
+ a: Dict[str, Any],
+ b: Dict[str, Any],
+ *,
+ keys: Iterable[str] | None = None,
+ treat_missing_as_none: bool = True,
+ float_tol: float = 0.0,
+ ) -> bool:
+ """
+ Equality check for two dicts with options:
+ - keys: only compare these keys
+ - treat_missing_as_none: missing key == None if other side is None
+ - float_tol: tolerance for float comparisons
+ """
+ if keys is None:
+ keys = set(a.keys()) | set(b.keys())
+
+ for k in keys:
+ av = a.get(k, _MISSING)
+ bv = b.get(k, _MISSING)
+
+ if treat_missing_as_none:
+ if av is _MISSING and bv is None:
+ continue
+ if bv is _MISSING and av is None:
+ continue
+ if av is _MISSING and bv is _MISSING:
+ continue
+
+ if not _equal(av, bv, float_tol=float_tol):
+ return False
+
+ return True
+
+ def dict_diff(
+ a: Dict[str, Any],
+ b: Dict[str, Any],
+ *,
+ keys: Iterable[str] | None = None,
+ treat_missing_as_none: bool = True,
+ float_tol: float = 0.0,
+ ) -> Dict[str, Tuple[Any, Any]]:
+ """
+ Returns {key: (a_val, b_val)} for all keys that differ.
+ """
+ if keys is None:
+ keys = set(a.keys()) | set(b.keys())
+
+ out: Dict[str, Tuple[Any, Any]] = {}
+ for k in keys:
+ av = a.get(k, _MISSING)
+ bv = b.get(k, _MISSING)
+
+ if treat_missing_as_none:
+ if av is _MISSING and bv is None:
+ continue
+ if bv is _MISSING and av is None:
+ continue
+ if av is _MISSING and bv is _MISSING:
+ continue
+
+ if not _equal(av, bv, float_tol=float_tol):
+ out[k] = (None if av is _MISSING else av, None if bv is _MISSING else bv)
+ return out
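
A quick sketch of how the new helpers behave; the dictionaries below are made-up cluster-ish settings:

    from yggdrasil.pyutils.equality import dicts_equal, dict_diff

    existing = {"num_workers": 2, "spark_version": "14.3.x-scala2.12", "autotermination_minutes": None}
    update = {"num_workers": 4, "spark_version": "14.3.x-scala2.12"}

    # A key missing on one side compares equal to None on the other when
    # treat_missing_as_none=True (the default), so only num_workers differs here.
    dicts_equal(existing, update)          # False
    dict_diff(existing, update)            # {"num_workers": (2, 4)}

    # Restrict the comparison to selected keys, as Cluster.update() does with _EDIT_ARG_NAMES.
    dicts_equal(existing, update, keys=["spark_version"])  # True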

ygg-0.1.38/src/yggdrasil/version.py
@@ -0,0 +1 @@
+ __version__ = "0.1.38"

ygg-0.1.34/src/yggdrasil/version.py
@@ -1 +0,0 @@
- __version__ = "0.1.34"