ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0

yggdrasil/databricks/sql/engine.py
@@ -1,17 +1,18 @@
+ """Databricks SQL engine utilities and helpers."""
+
  import dataclasses
- import io
  import logging
  import random
  import string
  import time
- from typing import Optional, Union, Any, Dict, List
+ from typing import Optional, Union, Any, Dict, List, Literal

  import pyarrow as pa
  import pyarrow.parquet as pq

  from .statement_result import StatementResult
  from .types import column_info_to_arrow_field
- from .. import DatabricksPathKind
+ from .. import DatabricksPathKind, DatabricksPath
  from ..workspaces import WorkspaceService
  from ...libs.databrickslib import databricks_sdk
  from ...libs.sparklib import SparkSession, SparkDataFrame, pyspark
@@ -56,11 +57,12 @@ __all__ = [


  class SqlExecutionError(RuntimeError):
- pass
+ """Raised when a SQL statement execution fails."""


  @dataclasses.dataclass
  class SQLEngine(WorkspaceService):
+ """Execute SQL statements and manage tables via Databricks."""
  warehouse_id: Optional[str] = None
  catalog_name: Optional[str] = None
  schema_name: Optional[str] = None
@@ -72,6 +74,17 @@ class SQLEngine(WorkspaceService):
  table_name: Optional[str] = None,
  safe_chars: bool = True
  ):
+ """Build a fully qualified table name for the current catalog/schema.
+
+ Args:
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Table name to qualify.
+ safe_chars: Whether to wrap identifiers in backticks.
+
+ Returns:
+ The fully qualified table name.
+ """
  catalog_name = catalog_name or self.catalog_name
  schema_name = schema_name or self.schema_name

@@ -87,6 +100,14 @@ class SQLEngine(WorkspaceService):
  self,
  full_name: str,
  ):
+ """Parse a catalog.schema.table string into components.
+
+ Args:
+ full_name: A fully qualified name or partial name.
+
+ Returns:
+ A tuple of (catalog_name, schema_name, table_name).
+ """
  parts = [
  _.strip("`") for _ in full_name.split(".")
  ]
@@ -108,6 +129,14 @@ class SQLEngine(WorkspaceService):
  self,
  cluster_size: str = "Small"
  ):
+ """Return a default SQL warehouse matching the desired size.
+
+ Args:
+ cluster_size: Desired warehouse size filter.
+
+ Returns:
+ The matched warehouse object.
+ """
  wk = self.workspace.sdk()
  existing = list(wk.warehouses.list())
  first = None
@@ -131,6 +160,14 @@ class SQLEngine(WorkspaceService):
  self,
  cluster_size = "Small"
  ):
+ """Return the configured warehouse id or a default one.
+
+ Args:
+ cluster_size: Desired warehouse size filter.
+
+ Returns:
+ The warehouse id string.
+ """
  if not self.warehouse_id:
  dft = self._default_warehouse(cluster_size=cluster_size)

@@ -139,6 +176,14 @@ class SQLEngine(WorkspaceService):

  @staticmethod
  def _random_suffix(prefix: str = "") -> str:
+ """Generate a unique suffix for temporary resources.
+
+ Args:
+ prefix: Optional prefix to prepend.
+
+ Returns:
+ A unique suffix string.
+ """
  unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
  timestamp = int(time.time() * 1000)
  return f"{prefix}{timestamp}_{unique}"
@@ -147,6 +192,7 @@ class SQLEngine(WorkspaceService):
  self,
  statement: Optional[str] = None,
  *,
+ engine: Optional[Literal["spark", "api"]] = None,
  warehouse_id: Optional[str] = None,
  byte_limit: Optional[int] = None,
  disposition: Optional["Disposition"] = None,
@@ -158,6 +204,7 @@ class SQLEngine(WorkspaceService):
  catalog_name: Optional[str] = None,
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
+ wait_result: bool = True,
  **kwargs,
  ) -> "StatementResult":
  """
@@ -167,19 +214,46 @@ class SQLEngine(WorkspaceService):
  - On SUCCEEDED: return final statement object
  - On FAILED / CANCELED: raise SqlExecutionError
  - If wait=False: return initial execution handle without polling.
+
+ Args:
+ statement: SQL statement to execute. If omitted, selects from the table.
+ engine: Execution engine ("spark" or "api").
+ warehouse_id: Optional warehouse id override.
+ byte_limit: Optional byte limit for results.
+ disposition: Result disposition mode.
+ format: Result format for Databricks SQL API.
+ on_wait_timeout: Timeout behavior for waiting.
+ parameters: Optional statement parameters.
+ row_limit: Optional row limit.
+ wait_timeout: Optional API wait timeout.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ wait_result: Whether to block until completion.
+ **kwargs: Additional API parameters.
+
+ Returns:
+ A StatementResult wrapper for the execution.
  """
- if pyspark is not None:
+ if not engine:
+ if pyspark is not None:
+ spark_session = SparkSession.getActiveSession()
+
+ if spark_session is not None:
+ engine = "spark"
+
+ if engine == "spark":
  spark_session = SparkSession.getActiveSession()

- if spark_session is not None:
- result = spark_session.sql(statement)
+ if spark_session is None:
+ raise ValueError("No spark session found to run sql query")

- return StatementResult(
- engine=self,
- statement_id="sparksql",
- disposition=Disposition.EXTERNAL_LINKS,
- _spark_df=result
- )
+ return StatementResult(
+ engine=self,
+ statement_id="sparksql",
+ disposition=Disposition.EXTERNAL_LINKS,
+ _spark_df=spark_session.sql(statement)
+ )

  if format is None:
  format = Format.ARROW_STREAM
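
For orientation, a brief usage sketch of the two keywords added to execute() above; `sql_engine` stands for an already-configured SQLEngine instance and the statements are illustrative:

# Force the SQL Statement Execution API even when an active Spark session exists,
# and skip the blocking wait on completion.
sql_engine.execute(
    "SELECT current_timestamp() AS ts",
    engine="api",
    wait_result=False,
)

# Default behaviour: prefer Spark when a session is active, otherwise fall back to
# the API, and block until the statement finishes (FAILED/CANCELED statements
# surface as SqlExecutionError, per the docstring above).
result = sql_engine.execute("SELECT 1 AS one")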
@@ -217,7 +291,7 @@ class SQLEngine(WorkspaceService):
  disposition=disposition
  )

- return execution
+ return execution.wait() if wait_result else wait_result

  def spark_table(
  self,
@@ -226,6 +300,17 @@ class SQLEngine(WorkspaceService):
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
  ):
+ """Return a DeltaTable handle for a given table name.
+
+ Args:
+ full_name: Fully qualified table name.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+
+ Returns:
+ A Spark DeltaTable handle.
+ """
  if not full_name:
  full_name = self.table_full_name(
  catalog_name=catalog_name,
@@ -258,6 +343,27 @@ class SQLEngine(WorkspaceService):
  spark_session: Optional[SparkSession] = None,
  spark_options: Optional[Dict[str, Any]] = None
  ):
+ """Insert data into a table using Spark or Arrow paths.
+
+ Args:
+ data: Arrow or Spark data to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema (Spark).
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ spark_session: Optional SparkSession override.
+ spark_options: Optional Spark write options.
+
+ Returns:
+ None for Arrow inserts, or the Spark insert result.
+ """
  # -------- existing logic you provided (kept intact) ----------
  if pyspark is not None:
  spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session
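
A hedged sketch of how the insert() parameters documented above combine for a keyed upsert; `sql_engine` and the table identifiers are placeholders:

import pyarrow as pa

changes = pa.table({"id": [1, 2], "value": ["new", "newer"]})

sql_engine.insert(
    changes,
    catalog_name="main",        # placeholder catalog/schema/table overrides
    schema_name="analytics",
    table_name="events",
    match_by=["id"],            # merge keys: upsert via MERGE instead of a plain INSERT
    zorder_by=["id"],           # optional Z-ORDER columns
    optimize_after_merge=True,  # run OPTIMIZE once the merge completes
    vacuum_hours=168,           # optional VACUUM retention window (7 days)
)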
@@ -310,8 +416,30 @@ class SQLEngine(WorkspaceService):
  zorder_by: list[str] = None,
  optimize_after_merge: bool = False,
  vacuum_hours: int | None = None, # e.g., 168 for 7 days
- existing_schema: pa.Schema | None = None
+ existing_schema: pa.Schema | None = None,
+ temp_volume_path: Optional[Union[str, DatabricksPath]] = None
  ):
+ """Insert Arrow data by staging to a temp volume and running SQL.
+
+ Args:
+ data: Arrow table/batch data to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema.
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ existing_schema: Optional pre-fetched schema.
+ temp_volume_path: Optional temp volume path override.
+
+ Returns:
+ None.
+ """
  location, catalog_name, schema_name, table_name = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -375,14 +503,14 @@ class SQLEngine(WorkspaceService):
  data = convert(data, pa.Table, options=cast_options, target_field=existing_schema)

  # Write in temp volume
- databricks_tmp_path = connected.dbfs_path(
+ temp_volume_path = connected.dbfs_path(
  kind=DatabricksPathKind.VOLUME,
- parts=[catalog_name, schema_name, "tmp", transaction_id, "data.parquet"]
- )
- databricks_tmp_folder = databricks_tmp_path.parent
+ parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
+ ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)
+
+ temp_volume_path.mkdir()

- with databricks_tmp_path.open(mode="wb") as f:
- pq.write_table(data, f, compression="snappy")
+ temp_volume_path.write_arrow_table(data)

  # get column list from arrow schema
  columns = [c for c in existing_schema.names]
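
The staging logic above relies on the new DatabricksPath helpers (workspaces/path.py and workspaces/io.py in the file list). A sketch of those calls in isolation, where `connected` stands for the connected engine from the surrounding code and the volume parts are invented:

import pyarrow as pa

data = pa.table({"id": [1, 2], "value": ["a", "b"]})

tmp_dir = connected.dbfs_path(
    kind=DatabricksPathKind.VOLUME,
    parts=["main", "analytics", "tmp", "sql", "manual_example"],
)
tmp_dir.mkdir()                    # ensure the temporary volume directory exists
tmp_dir.write_arrow_table(data)    # stage the Arrow data as parquet at this location
# ... MERGE / INSERT statements then read it back via parquet.`{tmp_dir}` ...
tmp_dir.rmdir(recursive=True)      # clean up the staging location afterwards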
@@ -412,7 +540,7 @@ class SQLEngine(WorkspaceService):

  merge_sql = f"""MERGE INTO {location} AS T
  USING (
- SELECT {cols_quoted} FROM parquet.`{databricks_tmp_folder}`
+ SELECT {cols_quoted} FROM parquet.`{temp_volume_path}`
  ) AS S
  ON {on_condition}
  {update_clause}
@@ -424,12 +552,12 @@ ON {on_condition}
  if mode.lower() in ("overwrite",):
  insert_sql = f"""INSERT OVERWRITE {location}
  SELECT {cols_quoted}
- FROM parquet.`{databricks_tmp_folder}`"""
+ FROM parquet.`{temp_volume_path}`"""
  else:
  # default: append
  insert_sql = f"""INSERT INTO {location} ({cols_quoted})
  SELECT {cols_quoted}
- FROM parquet.`{databricks_tmp_folder}`"""
+ FROM parquet.`{temp_volume_path}`"""
  statements.append(insert_sql)

  # Execute statements (use your existing execute helper)
@@ -439,7 +567,7 @@ FROM parquet.`{databricks_tmp_folder}`"""
  connected.execute(stmt.strip())
  finally:
  try:
- databricks_tmp_folder.rmdir(recursive=True)
+ temp_volume_path.rmdir(recursive=True)
  except Exception as e:
  logger.warning(e)

@@ -474,6 +602,26 @@ FROM parquet.`{databricks_tmp_folder}`"""
  vacuum_hours: int | None = None, # e.g., 168 for 7 days
  spark_options: Optional[Dict[str, Any]] = None,
  ):
+ """Insert a Spark DataFrame into a Delta table with optional merge semantics.
+
+ Args:
+ data: Spark DataFrame to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema.
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ spark_options: Optional Spark write options.
+
+ Returns:
+ None.
+ """
  location, catalog_name, schema_name, table_name = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -573,6 +721,17 @@ FROM parquet.`{databricks_tmp_folder}`"""
  table_name: Optional[str] = None,
  to_arrow_schema: bool = True
  ) -> Union[pa.Field, pa.Schema]:
+ """Fetch a table schema from Unity Catalog as Arrow types.
+
+ Args:
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ to_arrow_schema: Whether to return an Arrow schema or field.
+
+ Returns:
+ Arrow Schema or Field representing the table.
+ """
  full_name = self.table_full_name(
  catalog_name=catalog_name,
  schema_name=schema_name,
@@ -603,6 +762,17 @@ FROM parquet.`{databricks_tmp_folder}`"""
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
  ):
+ """Drop a table if it exists.
+
+ Args:
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+
+ Returns:
+ The StatementResult from executing the drop statement.
+ """
  location, _, _, _ = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -656,23 +826,22 @@ FROM parquet.`{databricks_tmp_folder}`"""
  safe_chars=True
  )

- # Create the DDL statement
- sql = [f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} ("]
-
- # Generate column definitions
- column_defs = []
-
  if pa.types.is_struct(field.type):
  children = list(field.type)
  else:
  children = [field]

- for child in children:
- column_def = self._field_to_ddl(child)
- column_defs.append(column_def)
+ # Create the DDL statement
+ column_definitions = [
+ self._field_to_ddl(child)
+ for child in children
+ ]

- sql.append(",\n ".join(column_defs))
- sql.append(")")
+ sql = [
+ f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
+ ",\n ".join(column_definitions),
+ ")"
+ ]

  # Add partition by clause if provided
  if partition_by and len(partition_by) > 0:
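
A standalone sketch of the list-based DDL assembly above; `field_to_ddl_stub` is a hypothetical stand-in for SQLEngine._field_to_ddl (whose real type mapping is outside this diff) and the table name is a placeholder:

import pyarrow as pa

def field_to_ddl_stub(child: pa.Field) -> str:
    # Hypothetical Arrow -> SQL type mapping, for illustration only.
    if pa.types.is_integer(child.type):
        sql_type = "BIGINT"
    elif pa.types.is_floating(child.type):
        sql_type = "DOUBLE"
    else:
        sql_type = "STRING"
    return f"`{child.name}` {sql_type}"

field = pa.field("row", pa.struct([
    pa.field("id", pa.int64()),
    pa.field("name", pa.string()),
]))

children = list(field.type) if pa.types.is_struct(field.type) else [field]
column_definitions = [field_to_ddl_stub(child) for child in children]
sql = [
    "CREATE TABLE IF NOT EXISTS `main`.`analytics`.`events` (",
    ",\n ".join(column_definitions),
    ")",
]
print("\n".join(sql))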
@@ -729,6 +898,18 @@ FROM parquet.`{databricks_tmp_folder}`"""
  table_name: Optional[str] = None,
  safe_chars: bool = True
  ):
+ """Resolve location/catalog/schema/table parameters to a full name.
+
+ Args:
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ safe_chars: Whether to wrap identifiers in backticks.
+
+ Returns:
+ A tuple of (location, catalog_name, schema_name, table_name).
+ """
  if location:
  c, s, t = self._catalog_schema_table_names(location)
  catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t

yggdrasil/databricks/sql/exceptions.py
@@ -0,0 +1 @@
+ """Custom exceptions for Databricks SQL helpers."""