ygg 0.1.56__py3-none-any.whl → 0.1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/METADATA +1 -1
- ygg-0.1.60.dist-info/RECORD +74 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/WHEEL +1 -1
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +89 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +7 -2
- yggdrasil/databricks/compute/execution_context.py +465 -277
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +161 -173
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +331 -92
- yggdrasil/databricks/workspaces/io.py +92 -9
- yggdrasil/databricks/workspaces/path.py +120 -74
- yggdrasil/databricks/workspaces/workspace.py +212 -68
- yggdrasil/libs/databrickslib.py +23 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -0
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/mimetypes.py +0 -0
- yggdrasil/pyutils/python_env.py +13 -12
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/version.py +1 -1
- ygg-0.1.56.dist-info/RECORD +0 -68
- yggdrasil/databricks/ai/__init__.py +0 -1
- yggdrasil/databricks/ai/loki.py +0 -374
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/exceptions.py

@@ -2,18 +2,21 @@
 from dataclasses import dataclass
 from typing import Optional, Any
 
+from ..exceptions import DatabricksException
+
 __all__ = [
     "SqlStatementError"
 ]
 
 
 @dataclass(frozen=True)
-class SqlStatementError(RuntimeError):
+class SqlStatementError(DatabricksException):
    statement_id: str
    state: str
    message: str
    error_code: Optional[str] = None
    sql_state: Optional[str] = None
+   url: Optional[str] = None
 
    def __str__(self) -> str:
        meta = []
@@ -22,6 +25,8 @@ class SqlStatementError(RuntimeError):
             meta.append(f"code={self.error_code}")
         if self.sql_state:
             meta.append(f"state={self.sql_state}")
+        if self.url:
+            meta.append(f"url={self.url}")
 
         meta_str = f" ({', '.join(meta)})" if meta else ""
 
@@ -38,10 +43,13 @@ class SqlStatementError(RuntimeError):
         error_code = getattr(err, "error_code", None)
         sql_state = getattr(err, "sql_state", None)
 
+        url = getattr(err, "monitoring_url", None)
+
         return cls(
             statement_id=str(statement_id),
             state=str(state),
             message=str(message),
             error_code=str(error_code) if error_code is not None else None,
             sql_state=str(sql_state) if sql_state is not None else None,
+            url=str(url) if url is not None else None
         )
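
Taken together, these hunks give `SqlStatementError` an optional monitoring URL that is picked up from the result object's `monitoring_url` attribute and rendered by `__str__`. A minimal sketch of what the enriched exception carries; the engine code that actually raises it is outside these hunks, and every field value below is illustrative only:

```python
from yggdrasil.databricks.sql.exceptions import SqlStatementError

# Illustrative values -- real instances are built by the classmethod shown above.
err = SqlStatementError(
    statement_id="01f0-1111-2222-3333",
    state="FAILED",
    message="[TABLE_OR_VIEW_NOT_FOUND] The table `demo.missing` cannot be found.",
    error_code="TABLE_OR_VIEW_NOT_FOUND",
    sql_state="42P01",
    url="https://<workspace-host>/sql/warehouses/<warehouse-id>/monitoring?queryId=01f0-1111-2222-3333",
)

# Per the __str__ hunk, the "(code=..., state=...)" metadata suffix now also
# includes "url=..." when the monitoring URL is known.
print(err)
```
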
yggdrasil/databricks/sql/statement_result.py

@@ -3,7 +3,7 @@
 import dataclasses
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait
+from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait as concurrent_wait
 from typing import Optional, Iterator, TYPE_CHECKING
 
 import pyarrow as pa
@@ -11,36 +11,28 @@ import pyarrow.ipc as pipc
 
 from .exceptions import SqlStatementError
 from .types import column_info_to_arrow_field
-from ...libs.databrickslib import databricks_sdk
-from ...libs.pandaslib import
+from ...libs.databrickslib import databricks_sdk, WorkspaceClient, DatabricksDummyClass
+from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars
 from ...libs.sparklib import SparkDataFrame
+from ...pyutils.waiting_config import WaitingConfigArg, WaitingConfig
 from ...requests.session import YGGSession
 from ...types import spark_dataframe_to_arrow_table, \
     spark_schema_to_arrow_schema, arrow_table_to_spark_dataframe
 
-try:
-    from delta.tables import DeltaTable as SparkDeltaTable
-except ImportError:
-    class SparkDeltaTable:
-        @classmethod
-        def forName(cls, *args, **kwargs):
-            from delta.tables import DeltaTable
-
-            return DeltaTable.forName(*args, **kwargs)
-
-
 if databricks_sdk is not None:
     from databricks.sdk.service.sql import (
         StatementState, StatementResponse, Disposition, StatementStatus
     )
 else:
-
-
+    StatementState = DatabricksDummyClass
+    StatementResponse = DatabricksDummyClass
+    Disposition = DatabricksDummyClass
+    StatementStatus = DatabricksDummyClass
 
 
 if TYPE_CHECKING:
-
+    pass
 
 
 DONE_STATES = {
@@ -60,9 +52,10 @@ __all__ = [
 @dataclasses.dataclass
 class StatementResult:
     """Container for statement responses, data extraction, and conversions."""
-
+    workspace_client: WorkspaceClient
+    warehouse_id: str
     statement_id: str
-    disposition:
+    disposition: Disposition
 
     _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
 
@@ -96,6 +89,20 @@ class StatementResult:
         """Iterate over Arrow record batches."""
         return self.to_arrow_batches()
 
+    def __repr__(self):
+        return "StatementResult(url='%s')" % self.monitoring_url
+
+    def __str__(self):
+        return self.monitoring_url
+
+    @property
+    def monitoring_url(self):
+        return "%s/sql/warehouses/%s/monitoring?queryId=%s" % (
+            self.workspace_client.config.host,
+            self.warehouse_id,
+            self.statement_id
+        )
+
     @property
     def is_spark_sql(self):
         """Return True when this result was produced by Spark SQL."""
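
The new `monitoring_url` property (and the `__repr__`/`__str__` built on it) resolves to the warehouse query-monitoring page. A rough usage sketch, assuming an authenticated databricks-sdk `WorkspaceClient`; the warehouse and statement ids below are placeholders:

```python
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.sql import Disposition

from yggdrasil.databricks.sql.statement_result import StatementResult

w = WorkspaceClient()  # credentials resolved from the environment / config profile

result = StatementResult(
    workspace_client=w,
    warehouse_id="abc123def456",         # illustrative warehouse id
    statement_id="01f0-1111-2222-3333",  # illustrative statement id
    disposition=Disposition.EXTERNAL_LINKS,
)

print(result.monitoring_url)
# e.g. https://<workspace-host>/sql/warehouses/abc123def456/monitoring?queryId=01f0-1111-2222-3333
```

In practice a `StatementResult` would come back from the SQL engine rather than be constructed by hand; this only illustrates the URL format.
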
@@ -123,7 +130,7 @@ class StatementResult:
             )
         )
 
-        statement_execution = self.
+        statement_execution = self.workspace_client.statement_execution
 
         if self._response is None:
             # Initialize
@@ -134,17 +141,7 @@ class StatementResult:
 
         return self._response
 
-
-    def response(self, value: "StatementResponse"):
-        """Update the cached response and refresh timestamp.
-
-        Args:
-            value: StatementResponse to cache.
-        """
-        self._response = value
-        self.statement_id = self._response.statement_id
-
-    def result_data_at(self, chunk_index: int):
+    def api_result_data_at_index(self, chunk_index: int):
         """Fetch a specific result chunk by index.
 
         Args:
@@ -153,22 +150,13 @@ class StatementResult:
         Returns:
             The SDK result chunk response.
         """
-        sdk = self.
+        sdk = self.workspace_client
 
         return sdk.statement_execution.get_statement_result_chunk_n(
             statement_id=self.statement_id,
             chunk_index=chunk_index,
         )
 
-    @property
-    def workspace(self):
-        """Expose the underlying workspace from the engine.
-
-        Returns:
-            The Workspace instance backing this statement.
-        """
-        return self.engine.workspace
-
     @property
     def status(self):
         """Return the statement status, handling persisted data.
@@ -255,7 +243,7 @@ class StatementResult:
         )
 
         result_data = self.result
-        wsdk = self.
+        wsdk = self.workspace_client
 
         seen_chunk_indexes = set()
 
@@ -304,34 +292,27 @@ class StatementResult:
 
     def wait(
         self,
-
-
+        wait: WaitingConfigArg = True,
+        raise_error: bool = True
     ):
         """Wait for statement completion with optional timeout.
 
         Args:
-
-
+            wait: Waiting config
+            raise_error: Raise error if failed
 
         Returns:
             The current StatementResult instance.
         """
-
-
-        poll_interval = poll_interval or 1
+        wait = WaitingConfig.check_arg(wait)
+        iteration, start = 0, time.time()
 
-
-
-
-            raise TimeoutError(
-                f"Statement {self.statement_id} did not finish within {timeout} seconds "
-                f"(last state={self.state})"
-            )
-
-            poll_interval = max(10, poll_interval * 1.2)
-            time.sleep(poll_interval)
+        if not self.done:
+            wait.sleep(iteration=iteration, start=start)
+            iteration += 1
 
-
+        if raise_error:
+            self.raise_for_status()
 
         return self
 
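
The reworked `wait()` replaces the old `timeout`/`poll_interval` arguments with a single `WaitingConfigArg` plus `raise_error`. A hedged sketch of a call site; how the `StatementResult` is obtained (e.g. from the SQL engine) is outside these hunks, and the concrete `WaitingConfig` options live in the new `yggdrasil/pyutils/waiting_config.py`, which this diff does not show:

```python
from yggdrasil.databricks.sql.exceptions import SqlStatementError

# `result` is assumed to be a StatementResult returned by the SQL engine.
try:
    result.wait(wait=True, raise_error=True)  # True = default WaitingConfig behaviour
except SqlStatementError as exc:
    # Assuming raise_for_status() surfaces the SqlStatementError shown earlier,
    # exc.url now points at the warehouse monitoring page for the failed statement.
    print(exc)

# Or skip the raise and inspect progress manually:
result.wait(wait=True, raise_error=False)
if not result.done:
    print("still running:", result.monitoring_url)
```
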
@@ -346,20 +327,32 @@ class StatementResult:
                 return self._arrow_table.schema
             elif self._spark_df is not None:
                 return spark_schema_to_arrow_schema(self._spark_df.schema)
-
+            else:
+                raise NotImplementedError("")
 
         manifest = self.manifest
 
+        metadata = {
+            "source": "databricks-sql",
+            "sid": self.statement_id or ""
+        }
+
         if manifest is None:
-            return pa.schema([])
+            return pa.schema([], metadata=metadata)
 
         fields = [
             column_info_to_arrow_field(_) for _ in manifest.schema.columns
         ]
 
-        return pa.schema(
+        return pa.schema(
+            fields,
+            metadata=metadata
+        )
 
-    def to_arrow_table(
+    def to_arrow_table(
+        self,
+        parallel_pool: int = 4
+    ) -> pa.Table:
         """Collect the statement result into a single Arrow table.
 
         Args:
@@ -383,23 +376,27 @@ class StatementResult:
 
     def to_arrow_batches(
         self,
-        parallel_pool:
+        parallel_pool: int = 4,
+        batch_size: Optional[int] = None
     ) -> Iterator[pa.RecordBatch]:
         """Stream the result as Arrow record batches.
 
         Args:
             parallel_pool: Maximum parallel fetch workers.
+            batch_size: Fetch batch size
 
         Yields:
             Arrow RecordBatch objects.
         """
         if self.persisted:
             if self._arrow_table is not None:
-                for batch in self._arrow_table.to_batches(max_chunksize=
+                for batch in self._arrow_table.to_batches(max_chunksize=batch_size):
                     yield batch
             elif self._spark_df is not None:
-                for batch in self._spark_df.toArrow().to_batches(max_chunksize=
+                for batch in self._spark_df.toArrow().to_batches(max_chunksize=batch_size):
                     yield batch
+            else:
+                raise NotImplementedError("")
         else:
             _tls = threading.local()
 
@@ -417,66 +414,57 @@ class StatementResult:
                 resp.raise_for_status()
                 return resp.content
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                try:
-                    idx, link = next(links_iter)
-                except StopIteration:
-                    break
-                fut = ex.submit(_fetch_bytes, link)
-                pending[fut] = idx
-
-            with ThreadPoolExecutor(max_workers=max_workers) as ex:
-                submit_more(ex)
+            max_workers = max(1, int(parallel_pool) if parallel_pool else 4)
+            max_in_flight = max_workers * 2  # keeps pipeline full without exploding memory
+
+            links_iter = enumerate(self.external_links())
+            pending = {}  # future -> idx
+            ready = {}  # idx -> bytes
+            next_idx = 0
+
+            def submit_more(ex):
+                while len(pending) < max_in_flight:
+                    try:
+                        idx, link = next(links_iter)
+                    except StopIteration:
+                        break
+                    fut = ex.submit(_fetch_bytes, link)
+                    pending[fut] = idx
+
+            with ThreadPoolExecutor(max_workers=max_workers) as ex:
+                submit_more(ex)
 
-
-
+                while pending:
+                    done, _ = concurrent_wait(pending, return_when=FIRST_COMPLETED)
 
-
-
-
-
+                    # collect completed downloads
+                    for fut in done:
+                        idx = pending.pop(fut)
+                        ready[idx] = fut.result()  # raises here if the GET failed
 
-
-
-
+                    # yield strictly in-order
+                    while next_idx in ready:
+                        content = ready.pop(next_idx)
 
-
+                        buf = pa.BufferReader(content)
 
-
-
+                        # IPC stream (your current format)
+                        reader = pipc.open_stream(buf)
 
-
-
+                        # if it’s IPC file instead:
+                        # reader = pipc.open_file(buf)
 
-
-
+                        for batch in reader:
+                            yield batch
 
-
+                        next_idx += 1
 
-
+                    submit_more(ex)
 
     def to_pandas(
         self,
         parallel_pool: Optional[int] = 4
-    ) ->
+    ) -> PandasDataFrame:
         """Return the result as a pandas DataFrame.
 
         Args:
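
The rewritten streaming path downloads EXTERNAL_LINKS chunks with a bounded thread pool and re-orders them before yielding. A small consumption sketch using only the public signatures declared above (`parallel_pool`, `batch_size`); `result` is again assumed to come from the SQL engine:

```python
# Stream batches without materialising the whole table; parallel_pool bounds the
# number of concurrent chunk downloads for non-persisted results.
total_rows = 0
for batch in result.to_arrow_batches(parallel_pool=8):
    total_rows += batch.num_rows

# Or collect everything at once; to_arrow_table exposes the same parallel_pool knob.
table = result.to_arrow_table(parallel_pool=8)
```
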
@@ -489,7 +477,7 @@ class StatementResult:
 
     def to_polars(
         self,
-        parallel_pool:
+        parallel_pool: int = 4
     ) -> "polars.DataFrame":
         """Return the result as a polars DataFrame.
 
@@ -499,9 +487,11 @@ class StatementResult:
         Returns:
             A polars DataFrame with the result rows.
         """
-
+        arrow_table = self.to_arrow_table(parallel_pool=parallel_pool)
 
-
+        return polars.from_arrow(arrow_table)
+
+    def to_spark(self) -> SparkDataFrame:
         """Return the result as a Spark DataFrame, caching it locally.
 
         Returns:
@@ -510,6 +500,4 @@ class StatementResult:
         if self._spark_df is not None:
             return self._spark_df
 
-
-
-        return self._spark_df
+        return arrow_table_to_spark_dataframe(self.to_arrow_table())