ygg 0.1.57__py3-none-any.whl → 0.1.64__py3-none-any.whl

This diff compares the contents of two package versions as they were publicly released to their registry. It is provided for informational purposes only.
Files changed (46)
  1. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/METADATA +2 -2
  2. ygg-0.1.64.dist-info/RECORD +74 -0
  3. yggdrasil/ai/__init__.py +2 -0
  4. yggdrasil/ai/session.py +87 -0
  5. yggdrasil/ai/sql_session.py +310 -0
  6. yggdrasil/databricks/__init__.py +0 -3
  7. yggdrasil/databricks/compute/cluster.py +68 -113
  8. yggdrasil/databricks/compute/command_execution.py +674 -0
  9. yggdrasil/databricks/compute/exceptions.py +19 -0
  10. yggdrasil/databricks/compute/execution_context.py +491 -282
  11. yggdrasil/databricks/compute/remote.py +4 -14
  12. yggdrasil/databricks/exceptions.py +10 -0
  13. yggdrasil/databricks/sql/__init__.py +0 -4
  14. yggdrasil/databricks/sql/engine.py +178 -178
  15. yggdrasil/databricks/sql/exceptions.py +9 -1
  16. yggdrasil/databricks/sql/statement_result.py +108 -120
  17. yggdrasil/databricks/sql/warehouse.py +339 -92
  18. yggdrasil/databricks/workspaces/io.py +185 -40
  19. yggdrasil/databricks/workspaces/path.py +114 -100
  20. yggdrasil/databricks/workspaces/workspace.py +210 -61
  21. yggdrasil/exceptions.py +7 -0
  22. yggdrasil/libs/databrickslib.py +22 -18
  23. yggdrasil/libs/extensions/spark_extensions.py +1 -1
  24. yggdrasil/libs/pandaslib.py +15 -6
  25. yggdrasil/libs/polarslib.py +49 -13
  26. yggdrasil/pyutils/__init__.py +1 -2
  27. yggdrasil/pyutils/callable_serde.py +12 -19
  28. yggdrasil/pyutils/exceptions.py +16 -0
  29. yggdrasil/pyutils/modules.py +6 -7
  30. yggdrasil/pyutils/python_env.py +16 -21
  31. yggdrasil/pyutils/waiting_config.py +171 -0
  32. yggdrasil/requests/msal.py +9 -96
  33. yggdrasil/types/cast/arrow_cast.py +3 -0
  34. yggdrasil/types/cast/pandas_cast.py +157 -169
  35. yggdrasil/types/cast/polars_cast.py +11 -43
  36. yggdrasil/types/dummy_class.py +81 -0
  37. yggdrasil/types/file_format.py +6 -2
  38. yggdrasil/types/python_defaults.py +92 -76
  39. yggdrasil/version.py +1 -1
  40. ygg-0.1.57.dist-info/RECORD +0 -66
  41. yggdrasil/databricks/ai/loki.py +0 -53
  42. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/WHEEL +0 -0
  43. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/entry_points.txt +0 -0
  44. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/licenses/LICENSE +0 -0
  45. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/top_level.txt +0 -0
  46. /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
yggdrasil/databricks/sql/exceptions.py
@@ -2,18 +2,21 @@
 from dataclasses import dataclass
 from typing import Optional, Any
 
+from ..exceptions import DatabricksException
+
 __all__ = [
     "SqlStatementError"
 ]
 
 
 @dataclass(frozen=True)
-class SqlStatementError(RuntimeError):
+class SqlStatementError(DatabricksException):
     statement_id: str
     state: str
     message: str
     error_code: Optional[str] = None
     sql_state: Optional[str] = None
+    url: Optional[str] = None
 
     def __str__(self) -> str:
         meta = []
@@ -22,6 +25,8 @@ class SqlStatementError(RuntimeError):
             meta.append(f"code={self.error_code}")
         if self.sql_state:
             meta.append(f"state={self.sql_state}")
+        if self.url:
+            meta.append(f"url={self.url}")
 
         meta_str = f" ({', '.join(meta)})" if meta else ""
 
@@ -38,10 +43,13 @@ class SqlStatementError(RuntimeError):
         error_code = getattr(err, "error_code", None)
         sql_state = getattr(err, "sql_state", None)
 
+        url = getattr(err, "monitoring_url", None)
+
         return cls(
             statement_id=str(statement_id),
             state=str(state),
             message=str(message),
             error_code=str(error_code) if error_code is not None else None,
             sql_state=str(sql_state) if sql_state is not None else None,
+            url=str(url) if url is not None else None
         )
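For orientation, a minimal sketch (all values below are made up, not taken from the package) of what the extended exception now carries and how the metadata suffix built in __str__ renders:

from yggdrasil.databricks.sql.exceptions import SqlStatementError

err = SqlStatementError(
    statement_id="01f0-0000-0000",   # placeholder identifiers and values
    state="FAILED",
    message="Table not found",
    error_code="TABLE_OR_VIEW_NOT_FOUND",
    sql_state="42P01",
    url="https://.../sql/warehouses/abc/monitoring?queryId=01f0-0000-0000",
)

# The suffix assembled above produces something like:
#   Table not found (code=TABLE_OR_VIEW_NOT_FOUND, state=42P01, url=https://...)
print(err)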
yggdrasil/databricks/sql/statement_result.py
@@ -3,7 +3,7 @@
 import dataclasses
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait
+from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait as concurrent_wait
 from typing import Optional, Iterator, TYPE_CHECKING
 
 import pyarrow as pa
@@ -11,36 +11,28 @@ import pyarrow.ipc as pipc
 
 from .exceptions import SqlStatementError
 from .types import column_info_to_arrow_field
-from ...libs.databrickslib import databricks_sdk
-from ...libs.pandaslib import pandas
+from ...libs.databrickslib import databricks_sdk, WorkspaceClient, DatabricksDummyClass
+from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars
 from ...libs.sparklib import SparkDataFrame
+from ...pyutils.waiting_config import WaitingConfigArg, WaitingConfig
 from ...requests.session import YGGSession
 from ...types import spark_dataframe_to_arrow_table, \
     spark_schema_to_arrow_schema, arrow_table_to_spark_dataframe
 
-try:
-    from delta.tables import DeltaTable as SparkDeltaTable
-except ImportError:
-    class SparkDeltaTable:
-        @classmethod
-        def forName(cls, *args, **kwargs):
-            from delta.tables import DeltaTable
-
-            return DeltaTable.forName(*args, **kwargs)
-
-
 if databricks_sdk is not None:
     from databricks.sdk.service.sql import (
         StatementState, StatementResponse, Disposition, StatementStatus
     )
 else:
-    class StatementResponse:
-        pass
+    StatementState = DatabricksDummyClass
+    StatementResponse = DatabricksDummyClass
+    Disposition = DatabricksDummyClass
+    StatementStatus = DatabricksDummyClass
 
 
 if TYPE_CHECKING:
-    from .engine import SQLEngine
+    pass
 
 
 DONE_STATES = {
@@ -60,9 +52,10 @@ __all__ = [
 @dataclasses.dataclass
 class StatementResult:
     """Container for statement responses, data extraction, and conversions."""
-    engine: "SQLEngine"
+    workspace_client: WorkspaceClient
+    warehouse_id: str
     statement_id: str
-    disposition: "Disposition"
+    disposition: Disposition
 
     _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
 
@@ -96,6 +89,20 @@ class StatementResult:
         """Iterate over Arrow record batches."""
         return self.to_arrow_batches()
 
+    def __repr__(self):
+        return "StatementResult(url='%s')" % self.monitoring_url
+
+    def __str__(self):
+        return self.monitoring_url
+
+    @property
+    def monitoring_url(self):
+        return "%s/sql/warehouses/%s/monitoring?queryId=%s" % (
+            self.workspace_client.config.host,
+            self.warehouse_id,
+            self.statement_id
+        )
+
     @property
     def is_spark_sql(self):
         """Return True when this result was produced by Spark SQL."""
@@ -123,7 +130,7 @@ class StatementResult:
             )
         )
 
-        statement_execution = self.workspace.sdk().statement_execution
+        statement_execution = self.workspace_client.statement_execution
 
         if self._response is None:
             # Initialize
@@ -134,17 +141,7 @@ class StatementResult:
 
         return self._response
 
-    @response.setter
-    def response(self, value: "StatementResponse"):
-        """Update the cached response and refresh timestamp.
-
-        Args:
-            value: StatementResponse to cache.
-        """
-        self._response = value
-        self.statement_id = self._response.statement_id
-
-    def result_data_at(self, chunk_index: int):
+    def api_result_data_at_index(self, chunk_index: int):
         """Fetch a specific result chunk by index.
 
         Args:
@@ -153,22 +150,13 @@ class StatementResult:
         Returns:
             The SDK result chunk response.
         """
-        sdk = self.workspace.sdk()
+        sdk = self.workspace_client
 
         return sdk.statement_execution.get_statement_result_chunk_n(
             statement_id=self.statement_id,
             chunk_index=chunk_index,
         )
 
-    @property
-    def workspace(self):
-        """Expose the underlying workspace from the engine.
-
-        Returns:
-            The Workspace instance backing this statement.
-        """
-        return self.engine.workspace
-
     @property
     def status(self):
         """Return the statement status, handling persisted data.
@@ -255,7 +243,7 @@ class StatementResult:
             )
 
         result_data = self.result
-        wsdk = self.workspace.sdk()
+        wsdk = self.workspace_client
 
         seen_chunk_indexes = set()
 
@@ -304,34 +292,27 @@ class StatementResult:
 
     def wait(
         self,
-        timeout: Optional[int] = None,
-        poll_interval: Optional[float] = None
+        wait: WaitingConfigArg = True,
+        raise_error: bool = True
     ):
         """Wait for statement completion with optional timeout.
 
         Args:
-            timeout: Maximum seconds to wait.
-            poll_interval: Initial poll interval in seconds.
+            wait: Waiting config
+            raise_error: Raise error if failed
 
         Returns:
             The current StatementResult instance.
         """
-        if not self.done:
-            start = time.time()
-            poll_interval = poll_interval or 1
+        wait = WaitingConfig.check_arg(wait)
+        iteration, start = 0, time.time()
 
-            while not self.done:
-                # still running / queued / pending
-                if timeout is not None and (time.time() - start) > timeout:
-                    raise TimeoutError(
-                        f"Statement {self.statement_id} did not finish within {timeout} seconds "
-                        f"(last state={self.state})"
-                    )
-
-                poll_interval = max(10, poll_interval * 1.2)
-                time.sleep(poll_interval)
+        if not self.done:
+            wait.sleep(iteration=iteration, start=start)
+            iteration += 1
 
-            self.raise_for_status()
+        if raise_error:
+            self.raise_for_status()
 
         return self
 
@@ -346,20 +327,32 @@ class StatementResult:
                 return self._arrow_table.schema
             elif self._spark_df is not None:
                 return spark_schema_to_arrow_schema(self._spark_df.schema)
-            raise NotImplementedError("")
+            else:
+                raise NotImplementedError("")
 
         manifest = self.manifest
 
+        metadata = {
+            "source": "databricks-sql",
+            "sid": self.statement_id or ""
+        }
+
         if manifest is None:
-            return pa.schema([])
+            return pa.schema([], metadata=metadata)
 
         fields = [
             column_info_to_arrow_field(_) for _ in manifest.schema.columns
         ]
 
-        return pa.schema(fields)
+        return pa.schema(
+            fields,
+            metadata=metadata
+        )
 
-    def to_arrow_table(self, parallel_pool: Optional[int] = 4) -> pa.Table:
+    def to_arrow_table(
+        self,
+        parallel_pool: int = 4
+    ) -> pa.Table:
         """Collect the statement result into a single Arrow table.
 
         Args:
@@ -383,23 +376,27 @@ class StatementResult:
 
     def to_arrow_batches(
         self,
-        parallel_pool: Optional[int] = 4
+        parallel_pool: int = 4,
+        batch_size: Optional[int] = None
     ) -> Iterator[pa.RecordBatch]:
         """Stream the result as Arrow record batches.
 
         Args:
             parallel_pool: Maximum parallel fetch workers.
+            batch_size: Fetch batch size
 
         Yields:
             Arrow RecordBatch objects.
         """
         if self.persisted:
             if self._arrow_table is not None:
-                for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
+                for batch in self._arrow_table.to_batches(max_chunksize=batch_size):
                     yield batch
             elif self._spark_df is not None:
-                for batch in self._spark_df.toArrow().to_batches(max_chunksize=64 * 1024):
+                for batch in self._spark_df.toArrow().to_batches(max_chunksize=batch_size):
                     yield batch
+            else:
+                raise NotImplementedError("")
         else:
             _tls = threading.local()
 
@@ -417,66 +414,57 @@ class StatementResult:
                 resp.raise_for_status()
                 return resp.content
 
-            # ---- in your generator ----
-            if self.persisted:
-                if self._arrow_table is not None:
-                    for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
-                        yield batch
-                elif self._spark_df is not None:
-                    for batch in self._spark_df.toArrow().to_batches(max_chunksize=64 * 1024):
-                        yield batch
-            else:
-                max_workers = max(1, int(parallel_pool) if parallel_pool else 4)
-                max_in_flight = max_workers * 2  # keeps pipeline full without exploding memory
-
-                links_iter = enumerate(self.external_links())
-                pending = {}  # future -> idx
-                ready = {}  # idx -> bytes
-                next_idx = 0
-
-                def submit_more(ex):
-                    while len(pending) < max_in_flight:
-                        try:
-                            idx, link = next(links_iter)
-                        except StopIteration:
-                            break
-                        fut = ex.submit(_fetch_bytes, link)
-                        pending[fut] = idx
-
-                with ThreadPoolExecutor(max_workers=max_workers) as ex:
-                    submit_more(ex)
+            max_workers = max(1, int(parallel_pool) if parallel_pool else 4)
+            max_in_flight = max_workers * 2  # keeps pipeline full without exploding memory
+
+            links_iter = enumerate(self.external_links())
+            pending = {}  # future -> idx
+            ready = {}  # idx -> bytes
+            next_idx = 0
+
+            def submit_more(ex):
+                while len(pending) < max_in_flight:
+                    try:
+                        idx, link = next(links_iter)
+                    except StopIteration:
+                        break
+                    fut = ex.submit(_fetch_bytes, link)
+                    pending[fut] = idx
+
+            with ThreadPoolExecutor(max_workers=max_workers) as ex:
+                submit_more(ex)
 
-                    while pending:
-                        done, _ = wait(pending, return_when=FIRST_COMPLETED)
+                while pending:
+                    done, _ = concurrent_wait(pending, return_when=FIRST_COMPLETED)
 
-                        # collect completed downloads
-                        for fut in done:
-                            idx = pending.pop(fut)
-                            ready[idx] = fut.result()  # raises here if the GET failed
+                    # collect completed downloads
+                    for fut in done:
+                        idx = pending.pop(fut)
+                        ready[idx] = fut.result()  # raises here if the GET failed
 
-                        # yield strictly in-order
-                        while next_idx in ready:
-                            content = ready.pop(next_idx)
+                    # yield strictly in-order
+                    while next_idx in ready:
+                        content = ready.pop(next_idx)
 
-                            buf = pa.BufferReader(content)
+                        buf = pa.BufferReader(content)
 
-                            # IPC stream (your current format)
-                            reader = pipc.open_stream(buf)
+                        # IPC stream (your current format)
+                        reader = pipc.open_stream(buf)
 
-                            # if it’s IPC file instead:
-                            # reader = pipc.open_file(buf)
+                        # if it’s IPC file instead:
+                        # reader = pipc.open_file(buf)
 
-                            for batch in reader:
-                                yield batch
+                        for batch in reader:
+                            yield batch
 
-                            next_idx += 1
+                        next_idx += 1
 
-                    submit_more(ex)
+                submit_more(ex)
 
     def to_pandas(
         self,
         parallel_pool: Optional[int] = 4
-    ) -> "pandas.DataFrame":
+    ) -> PandasDataFrame:
         """Return the result as a pandas DataFrame.
 
         Args:
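The restructured streaming branch above reduces to a reusable pattern: keep a bounded number of downloads in flight, collect them as they complete, and yield strictly in index order. A self-contained sketch of that pattern follows; fetch is a placeholder for the real external-link GET, and all names here are illustrative rather than the package's:

from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor
from concurrent.futures import wait as concurrent_wait


def fetch(item):
    # Stand-in for the real HTTP GET of one external result link.
    return f"payload-{item}"


def fetch_in_order(items, max_workers=4):
    """Yield fetch(item) results strictly in input order, downloading in parallel."""
    max_in_flight = max_workers * 2   # keeps the pipeline full without exploding memory
    items_iter = enumerate(items)
    pending = {}                      # future -> index
    ready = {}                        # index -> payload
    next_idx = 0

    def submit_more(ex):
        # Top up the pool until max_in_flight downloads are pending.
        while len(pending) < max_in_flight:
            try:
                idx, item = next(items_iter)
            except StopIteration:
                break
            pending[ex.submit(fetch, item)] = idx

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        submit_more(ex)
        while pending:
            done, _ = concurrent_wait(pending, return_when=FIRST_COMPLETED)
            for fut in done:
                ready[pending.pop(fut)] = fut.result()   # raises if a fetch failed
            while next_idx in ready:                     # emit strictly in order
                yield ready.pop(next_idx)
                next_idx += 1
            submit_more(ex)


print(list(fetch_in_order(["a", "b", "c", "d", "e"])))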
@@ -489,7 +477,7 @@ class StatementResult:
 
     def to_polars(
         self,
-        parallel_pool: Optional[int] = 4
+        parallel_pool: int = 4
     ) -> "polars.DataFrame":
         """Return the result as a polars DataFrame.
 
@@ -499,9 +487,11 @@ class StatementResult:
         Returns:
             A polars DataFrame with the result rows.
         """
-        return polars.from_arrow(self.to_arrow_table(parallel_pool=parallel_pool))
+        arrow_table = self.to_arrow_table(parallel_pool=parallel_pool)
 
-    def to_spark(self):
+        return polars.from_arrow(arrow_table)
+
+    def to_spark(self) -> SparkDataFrame:
         """Return the result as a Spark DataFrame, caching it locally.
 
         Returns:
@@ -510,6 +500,4 @@ class StatementResult:
         if self._spark_df is not None:
             return self._spark_df
 
-        self._spark_df = arrow_table_to_spark_dataframe(self.to_arrow_table())
-
-        return self._spark_df
+        return arrow_table_to_spark_dataframe(self.to_arrow_table())
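Taken together, a hedged sketch of how the reworked surface reads from the caller's side; result stands for an existing StatementResult, the failure path is illustrative, and only the method and field names come from the hunks above:

# Wait without raising, then decide what to do with the outcome.
result.wait(raise_error=False)

try:
    result.raise_for_status()
except SqlStatementError as err:
    # The exception now carries the warehouse monitoring URL (see exceptions.py above).
    print(f"statement failed: {err} -> {err.url}")
else:
    df = result.to_polars()   # or result.to_pandas() / result.to_spark()
    print(df.shape)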