ygg-0.1.31-py3-none-any.whl → ygg-0.1.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/statement_result.py

@@ -1,3 +1,5 @@
+"""Result wrapper for Databricks SQL statement execution."""
+
 import dataclasses
 import threading
 import time
@@ -49,6 +51,7 @@ __all__ = [
 
 @dataclasses.dataclass
 class StatementResult:
+    """Container for statement responses, data extraction, and conversions."""
     engine: "SQLEngine"
     statement_id: str
     disposition: "Disposition"
@@ -60,6 +63,11 @@ class StatementResult:
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
 
     def __getstate__(self):
+        """Serialize statement results, converting Spark dataframes to Arrow.
+
+        Returns:
+            A pickle-ready state dictionary.
+        """
         state = self.__dict__.copy()
 
         _spark_df = state.pop("_spark_df", None)
@@ -70,29 +78,54 @@ class StatementResult:
         return state
 
     def __setstate__(self, state):
+        """Restore statement result state, rehydrating cached data.
+
+        Args:
+            state: Serialized state dictionary.
+        """
         _spark_df = state.pop("_spark_df")
 
     def __iter__(self):
+        """Iterate over Arrow record batches."""
         return self.to_arrow_batches()
 
     @property
     def is_spark_sql(self):
+        """Return True when this result was produced by Spark SQL."""
         return self._spark_df is not None
 
     @property
     def response(self):
+        """Return the latest statement response, refreshing when needed.
+
+        Returns:
+            The current StatementResponse object.
+        """
         if self._response is None and not self.is_spark_sql:
             self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
         return self._response
 
     @response.setter
     def response(self, value: "StatementResponse"):
+        """Update the cached response and refresh timestamp.
+
+        Args:
+            value: StatementResponse to cache.
+        """
         self._response = value
         self._response_refresh_time = time.time()
 
         self.statement_id = self._response.statement_id
 
     def fresh_response(self, delay: float):
+        """Refresh the response if it is older than ``delay`` seconds.
+
+        Args:
+            delay: Minimum age in seconds before refreshing.
+
+        Returns:
+            The refreshed StatementResponse object.
+        """
         if self.is_spark_sql:
             return self._response
 
@@ -102,6 +135,14 @@ class StatementResult:
         return self._response
 
     def result_data_at(self, chunk_index: int):
+        """Fetch a specific result chunk by index.
+
+        Args:
+            chunk_index: Result chunk index to retrieve.
+
+        Returns:
+            The SDK result chunk response.
+        """
         sdk = self.workspace.sdk()
 
         return sdk.statement_execution.get_statement_result_chunk_n(
@@ -111,10 +152,20 @@ class StatementResult:
 
     @property
     def workspace(self):
+        """Expose the underlying workspace from the engine.
+
+        Returns:
+            The Workspace instance backing this statement.
+        """
         return self.engine.workspace
 
     @property
     def status(self):
+        """Return the statement status, handling persisted data.
+
+        Returns:
+            A StatementStatus object.
+        """
         if self.persisted:
             return StatementStatus(
                 state=StatementState.SUCCEEDED
@@ -129,20 +180,40 @@ class StatementResult:
 
     @property
     def state(self):
+        """Return the statement state.
+
+        Returns:
+            The StatementState enum value.
+        """
         return self.status.state
 
     @property
     def manifest(self):
+        """Return the SQL result manifest, if available.
+
+        Returns:
+            The result manifest or None for Spark SQL results.
+        """
         if self.is_spark_sql:
             return None
         return self.response.manifest
 
     @property
     def result(self):
+        """Return the raw statement result object.
+
+        Returns:
+            The statement result payload from the API.
+        """
         return self.response.result
 
     @property
     def done(self):
+        """Return True when the statement is in a terminal state.
+
+        Returns:
+            True if the statement is done, otherwise False.
+        """
         if self.persisted:
             return True
 
@@ -155,6 +226,11 @@ class StatementResult:
 
     @property
     def failed(self):
+        """Return True when the statement failed or was cancelled.
+
+        Returns:
+            True if the statement failed or was cancelled.
+        """
         if self.persisted:
             return True
 
@@ -165,14 +241,29 @@ class StatementResult:
 
     @property
     def persisted(self):
+        """Return True when data is cached locally.
+
+        Returns:
+            True when cached Arrow or Spark data is present.
+        """
         return self._spark_df is not None or self._arrow_table is not None
 
     def persist(self):
+        """Cache the statement result locally as Arrow data.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if not self.persisted:
             self._arrow_table = self.to_arrow_table()
         return self
 
     def external_links(self):
+        """Yield external result links for EXTERNAL_LINKS dispositions.
+
+        Yields:
+            External link objects in result order.
+        """
         assert self.disposition == Disposition.EXTERNAL_LINKS, "Cannot get from %s, disposition %s != %s" % (
             self, self.disposition, Disposition.EXTERNAL_LINKS
         )
@@ -222,6 +313,11 @@ class StatementResult:
         )
 
     def raise_for_status(self):
+        """Raise a ValueError if the statement failed.
+
+        Returns:
+            None.
+        """
         if self.failed:
             # grab error info if present
             err = self.status.error
@@ -244,6 +340,15 @@ class StatementResult:
         timeout: Optional[int] = None,
         poll_interval: Optional[float] = None
     ):
+        """Wait for statement completion with optional timeout.
+
+        Args:
+            timeout: Maximum seconds to wait.
+            poll_interval: Initial poll interval in seconds.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if self.done:
             return self
 
@@ -265,6 +370,11 @@ class StatementResult:
         return current
 
     def arrow_schema(self):
+        """Return the Arrow schema for the result.
+
+        Returns:
+            An Arrow Schema instance.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 return self._arrow_table.schema
@@ -277,6 +387,14 @@ class StatementResult:
         return pa.schema(fields)
 
     def to_arrow_table(self, parallel_pool: Optional[int] = 4) -> pa.Table:
+        """Collect the statement result into a single Arrow table.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            An Arrow Table containing all rows.
+        """
         if self.persisted:
             if self._arrow_table:
                 return self._arrow_table
@@ -295,6 +413,14 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> Iterator[pa.RecordBatch]:
+        """Stream the result as Arrow record batches.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Yields:
+            Arrow RecordBatch objects.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
@@ -379,15 +505,36 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> "pandas.DataFrame":
+        """Return the result as a pandas DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A pandas DataFrame with the result rows.
+        """
         return self.to_arrow_table(parallel_pool=parallel_pool).to_pandas()
 
     def to_polars(
         self,
         parallel_pool: Optional[int] = 4
    ) -> "polars.DataFrame":
+        """Return the result as a polars DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A polars DataFrame with the result rows.
+        """
         return polars.from_arrow(self.to_arrow_table(parallel_pool=parallel_pool))
 
     def to_spark(self):
+        """Return the result as a Spark DataFrame, caching it locally.
+
+        Returns:
+            A Spark DataFrame with the result rows.
+        """
         if self._spark_df:
             return self._spark_df
 
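The new docstrings describe a fetch-and-convert flow: check the statement status, then materialize the rows through Arrow. Below is a minimal consumption sketch using only methods visible in this diff; how the `StatementResult` is obtained from `SQLEngine` is outside this diff and is assumed.

```python
# Sketch only: `result` is assumed to be a StatementResult produced elsewhere;
# every method used below appears in the diff above.
def collect_to_pandas(result):
    """Collect a finished statement into a pandas DataFrame."""
    result.raise_for_status()                        # raises ValueError if the statement failed
    table = result.to_arrow_table(parallel_pool=4)   # gather all result chunks into one Arrow table
    return table.to_pandas()                         # or result.to_pandas() / to_polars() / to_spark()


def stream_row_counts(result):
    """Stream Arrow record batches without materializing the full table."""
    for batch in result.to_arrow_batches(parallel_pool=4):
        yield batch.num_rows
```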
yggdrasil/databricks/sql/types.py

@@ -1,3 +1,5 @@
+"""Type utilities for Databricks SQL metadata and Arrow."""
+
 import json
 import re
 from typing import Union
@@ -86,6 +88,14 @@ _struct_re = re.compile(r"^STRUCT\s*<\s*(.+)\s*>$", re.IGNORECASE)
 
 
 def _split_top_level_commas(s: str):
+    """Split a type string by commas, respecting nested angle brackets.
+
+    Args:
+        s: Type string to split.
+
+    Returns:
+        A list of top-level comma-separated parts.
+    """
     parts, cur, depth = [], [], 0
     for ch in s:
         if ch == '<':
@@ -103,6 +113,14 @@ def _split_top_level_commas(s: str):
 
 
 def _safe_bytes(obj):
+    """Convert an object to UTF-8 bytes, with safe handling for None.
+
+    Args:
+        obj: Value to convert.
+
+    Returns:
+        UTF-8 encoded bytes.
+    """
     if not isinstance(obj, bytes):
         if not obj:
             return b""
@@ -120,6 +138,12 @@ def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
     - looks up base types in STRING_TYPE_MAP (expects uppercase keys)
     - supports DECIMAL(p,s), ARRAY<...>, MAP<k,v>, STRUCT<...> recursively
     - raises ValueError if it cannot map the provided type string
+
+    Args:
+        type_str: SQL type string to parse.
+
+    Returns:
+        The corresponding Arrow DataType.
     """
     if not type_str:
         raise ValueError("Empty type string")
@@ -177,6 +201,14 @@ def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
 
 
 def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
+    """Convert Databricks SQL/Catalog column info into an Arrow field.
+
+    Args:
+        col: ColumnInfo from SQL or Catalog APIs.
+
+    Returns:
+        An Arrow Field for the column.
+    """
     arrow_type = parse_sql_type_to_pa(col.type_text)
 
     if isinstance(col, CatalogColumnInfo):
@@ -198,4 +230,4 @@ def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
         arrow_type,
         nullable=nullable,
         metadata=md
-    )
+    )
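The `_split_top_level_commas` docstring describes splitting only on commas that sit outside `ARRAY<...>`, `MAP<...>`, and `STRUCT<...>` brackets, which is what lets `parse_sql_type_to_pa` recurse into nested types. An independent sketch of that splitting rule (not the package's own implementation) looks like this:

```python
from typing import List


def split_top_level_commas(s: str) -> List[str]:
    """Split on commas that are not nested inside <...> brackets."""
    parts, cur, depth = [], [], 0
    for ch in s:
        if ch == "<":
            depth += 1
        elif ch == ">":
            depth -= 1
        if ch == "," and depth == 0:
            parts.append("".join(cur).strip())
            cur = []
        else:
            cur.append(ch)
    if cur:
        parts.append("".join(cur).strip())
    return parts


# split_top_level_commas("a INT, b MAP<STRING, BIGINT>")
# -> ["a INT", "b MAP<STRING, BIGINT>"]
```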
yggdrasil/databricks/workspaces/filesytem.py

@@ -1,3 +1,5 @@
+"""PyArrow filesystem wrappers for Databricks paths."""
+
 __all__ = [
     "DatabricksFileSystem",
     "DatabricksFileSystemHandler"
@@ -14,26 +16,60 @@ if TYPE_CHECKING:
 
 
 class DatabricksFileSystemHandler(FileSystemHandler):
+    """PyArrow FileSystemHandler backed by Databricks paths."""
 
     def __init__(
         self,
         workspace: "Workspace",
     ):
+        """Create a handler bound to a Workspace.
+
+        Args:
+            workspace: Workspace instance to use.
+        """
         super().__init__()
         self.workspace = workspace
 
     def __enter__(self):
+        """Enter a context manager and connect to the workspace.
+
+        Returns:
+            A connected DatabricksFileSystemHandler instance.
+        """
         return self.connect(clone=True)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager and close the workspace.
+
+        Args:
+            exc_type: Exception type, if raised.
+            exc_val: Exception value, if raised.
+            exc_tb: Exception traceback, if raised.
+        """
         self.workspace.__exit__(exc_type, exc_val, exc_tb)
 
     def _parse_path(self, obj: Any) -> "DatabricksPath":
+        """Parse a path-like object into a DatabricksPath.
+
+        Args:
+            obj: Path-like object to parse.
+
+        Returns:
+            A DatabricksPath instance.
+        """
         from .path import DatabricksPath
 
         return DatabricksPath.parse(obj, workspace=self.workspace)
 
     def connect(self, clone: bool = True):
+        """Connect the workspace and optionally return a cloned handler.
+
+        Args:
+            clone: Whether to return a cloned handler.
+
+        Returns:
+            A connected handler.
+        """
         workspace = self.connect(clone=clone)
 
         if clone:
@@ -45,9 +81,21 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         return self
 
     def close(self):
+        """Close the underlying workspace client.
+
+        Returns:
+            None.
+        """
         self.workspace.close()
 
     def copy_file(self, src, dest, *, chunk_size: int = 4 * 1024 * 1024):
+        """Copy a file between Databricks paths.
+
+        Args:
+            src: Source path.
+            dest: Destination path.
+            chunk_size: Chunk size in bytes.
+        """
         src = self._parse_path(src)
         dest = self._parse_path(dest)
 
@@ -59,24 +107,66 @@ class DatabricksFileSystemHandler(FileSystemHandler):
                 w.write(chunk)
 
     def create_dir(self, path, *args, recursive: bool = True, **kwargs):
+        """Create a directory at the given path.
+
+        Args:
+            path: Directory path to create.
+            recursive: Whether to create parents.
+
+        Returns:
+            The created DatabricksPath instance.
+        """
         return self._parse_path(path).mkdir(parents=recursive)
 
     def delete_dir(self, path):
+        """Delete a directory recursively.
+
+        Args:
+            path: Directory path to delete.
+        """
         return self._parse_path(path).rmdir(recursive=True)
 
     def delete_dir_contents(self, path, *args, accept_root_dir: bool = False, **kwargs):
+        """Delete the contents of a directory.
+
+        Args:
+            path: Directory path whose contents should be removed.
+            accept_root_dir: Whether to allow deleting root contents.
+        """
         return self._parse_path(path).rmdir(recursive=True)
 
     def delete_root_dir_contents(self):
+        """Delete the contents of the root directory."""
         return self.delete_dir_contents("/", accept_root_dir=True)
 
     def delete_file(self, path):
+        """Delete a single file.
+
+        Args:
+            path: File path to delete.
+        """
        return self._parse_path(path).rmfile()
 
     def equals(self, other: FileSystem):
+        """Return True if the filesystem handler matches another.
+
+        Args:
+            other: Another FileSystem instance.
+
+        Returns:
+            True if equal, otherwise False.
+        """
         return self == other
 
     def from_uri(self, uri):
+        """Return a handler for the workspace in the provided URI.
+
+        Args:
+            uri: URI or path to parse.
+
+        Returns:
+            A DatabricksFileSystemHandler for the URI.
+        """
         uri = self._parse_path(uri)
 
         return self.__class__(
@@ -87,6 +177,14 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         self,
         paths_or_selector: Union[FileSelector, str, "DatabricksPath", List[Union[str, "DatabricksPath"]]]
     ) -> Union[FileInfo, List[FileInfo]]:
+        """Return FileInfo objects for paths or selectors.
+
+        Args:
+            paths_or_selector: Path(s) or a FileSelector.
+
+        Returns:
+            A FileInfo or list of FileInfo objects.
+        """
         from .path import DatabricksPath
 
         if isinstance(paths_or_selector, (str, DatabricksPath)):
@@ -106,6 +204,14 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         self,
         selector: FileSelector
     ):
+        """Return FileInfo entries for a FileSelector.
+
+        Args:
+            selector: FileSelector describing the listing.
+
+        Returns:
+            A list of FileInfo entries.
+        """
         base_dir = self._parse_path(selector.base_dir)
 
         return [
@@ -117,9 +223,20 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         ]
 
     def get_type_name(self):
+        """Return the filesystem type name.
+
+        Returns:
+            The filesystem type name string.
+        """
         return "dbfs"
 
     def move(self, src, dest):
+        """Move a file by copying then deleting.
+
+        Args:
+            src: Source path.
+            dest: Destination path.
+        """
         src = self._parse_path(src)
 
         src.copy_to(dest)
@@ -127,6 +244,14 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         src.remove(recursive=True)
 
     def normalize_path(self, path):
+        """Normalize a path to a full Databricks path string.
+
+        Args:
+            path: Path to normalize.
+
+        Returns:
+            The normalized full path string.
+        """
         return self._parse_path(path).full_path()
 
     def open(
@@ -135,12 +260,43 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         mode: str = "r+",
         encoding: Optional[str] = None,
     ):
+        """Open a file path as a Databricks IO stream.
+
+        Args:
+            path: Path to open.
+            mode: File mode string.
+            encoding: Optional text encoding.
+
+        Returns:
+            A DatabricksIO instance.
+        """
         return self._parse_path(path).open(mode=mode, encoding=encoding, clone=False)
 
     def open_append_stream(self, path, compression='detect', buffer_size=None, metadata=None):
+        """Open an append stream.
+
+        Args:
+            path: Path to open.
+            compression: Optional compression hint.
+            buffer_size: Optional buffer size.
+            metadata: Optional metadata.
+
+        Returns:
+            A DatabricksIO instance.
+        """
         return self._parse_path(path).open(mode="ab")
 
     def open_input_file(self, path, mode: str = "rb", **kwargs):
+        """Open an input file as a PyArrow PythonFile.
+
+        Args:
+            path: Path to open.
+            mode: File mode string.
+            **kwargs: Additional options.
+
+        Returns:
+            A PyArrow PythonFile instance.
+        """
         buf = self._parse_path(path).open(mode=mode).connect(clone=True)
 
         return PythonFile(
@@ -149,13 +305,40 @@ class DatabricksFileSystemHandler(FileSystemHandler):
         )
 
     def open_input_stream(self, path, compression='detect', buffer_size=None):
+        """Open an input stream for reading bytes.
+
+        Args:
+            path: Path to open.
+            compression: Optional compression hint.
+            buffer_size: Optional buffer size.
+
+        Returns:
+            A DatabricksIO instance.
+        """
         return self._parse_path(path).open(mode="rb")
 
     def open_output_stream(self, path, compression='detect', buffer_size=None, metadata=None):
+        """Open an output stream for writing bytes.
+
+        Args:
+            path: Path to open.
+            compression: Optional compression hint.
+            buffer_size: Optional buffer size.
+            metadata: Optional metadata.
+
+        Returns:
+            A DatabricksIO instance.
+        """
         return self._parse_path(path).open(mode="wb")
 
 
 class DatabricksFileSystem(PyFileSystem):
+    """PyArrow filesystem wrapper for Databricks paths."""
 
     def __init__(self, handler): # real signature unknown; restored from __doc__
+        """Initialize the filesystem with a handler.
+
+        Args:
+            handler: FileSystemHandler instance.
+        """
         super().__init__(handler)