ygg 0.1.49__py3-none-any.whl → 0.1.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/METADATA +1 -1
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/RECORD +13 -13
- yggdrasil/databricks/sql/statement_result.py +12 -5
- yggdrasil/databricks/workspaces/io.py +58 -46
- yggdrasil/databricks/workspaces/path.py +95 -48
- yggdrasil/libs/pandaslib.py +6 -0
- yggdrasil/libs/polarslib.py +5 -0
- yggdrasil/types/cast/polars_cast.py +1 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/WHEEL +0 -0
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/top_level.txt +0 -0
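Most of the churn below is the wheel's RECORD file, where every row is `path,sha256=<digest>,size`: any file whose bytes change gets a new digest and size, so RECORD rewrites track the substantive changes mechanically. A minimal sketch (not part of ygg) for checking one row of an unpacked wheel, using the urlsafe, padding-free base64 encoding the wheel spec mandates:

```python
import base64
import csv
import hashlib
from pathlib import Path

def verify_record_row(wheel_root: Path, row: list) -> bool:
    path, hash_spec, size = row
    if not hash_spec:                      # RECORD lists itself with no hash
        return True
    algo, _, expected = hash_spec.partition("=")
    data = (wheel_root / path).read_bytes()
    digest = base64.urlsafe_b64encode(getattr(hashlib, algo)(data).digest())
    return digest.rstrip(b"=").decode() == expected and len(data) == int(size)

# with open(root / "ygg-0.1.51.dist-info/RECORD", newline="") as f:
#     assert all(verify_record_row(root, row) for row in csv.reader(f))
```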
{ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
-ygg-0.1.
+ygg-0.1.51.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
 yggdrasil/__init__.py,sha256=4-ghPak2S6zfMqmnlxW2GCgPb5s79znpKa2hGEGXcE4,24
-yggdrasil/version.py,sha256=
+yggdrasil/version.py,sha256=Vba463sBalMddSSVNE_881HL99Fg7msGZpAiG0JX6bg,22
 yggdrasil/databricks/__init__.py,sha256=skctY2c8W-hI81upx9F_PWRe5ishL3hrdiTuizgDjdw,152
 yggdrasil/databricks/compute/__init__.py,sha256=NvdzmaJSNYY1uJthv1hHdBuNu3bD_-Z65DWnaJt9yXg,289
 yggdrasil/databricks/compute/cluster.py,sha256=YomLfvB0oxbgl6WDgBRxI1UXsxwlEbR6gq3FUbPHscY,44199
@@ -11,21 +11,21 @@ yggdrasil/databricks/jobs/config.py,sha256=9LGeHD04hbfy0xt8_6oobC4moKJh4_DTjZiK4
 yggdrasil/databricks/sql/__init__.py,sha256=Vp_1cFaX1l-JGzCknvkbiB8CBFX2fQbBNntIeVn3lEg,231
 yggdrasil/databricks/sql/engine.py,sha256=K5WmGKpXU78JA3UdK8dLxBD_GXKidZJFe7hytuC5UHg,41029
 yggdrasil/databricks/sql/exceptions.py,sha256=uC-BoG0u0LtORKUS1X3iLID8nc-0TV5MQN3M8RXHsO4,1495
-yggdrasil/databricks/sql/statement_result.py,sha256=
+yggdrasil/databricks/sql/statement_result.py,sha256=GZyVhhrUK5opNo-8HGqsMx0Rp9fa_0zqvn8McSHPQ8U,16310
 yggdrasil/databricks/sql/types.py,sha256=5G-BM9_eOsRKEMzeDTWUsWW5g4Idvs-czVCpOCrMhdA,6412
 yggdrasil/databricks/sql/warehouse.py,sha256=1J0dyQLJb-OS1_1xU1eAVZ4CoL2-FhFeowKSvU3RzFc,9773
 yggdrasil/databricks/workspaces/__init__.py,sha256=dv2zotoFVhNFlTCdRq6gwf5bEzeZkOZszoNZMs0k59g,114
 yggdrasil/databricks/workspaces/filesytem.py,sha256=Z8JXU7_XUEbw9fpTQT1avRQKi-IAP2KemXBMPkUoY4w,9805
-yggdrasil/databricks/workspaces/io.py,sha256=
-yggdrasil/databricks/workspaces/path.py,sha256=
+yggdrasil/databricks/workspaces/io.py,sha256=PhXMVrK8ngDl6kKjnh8_jlZ2GsKtU2nLSi1nFgV4Sks,33302
+yggdrasil/databricks/workspaces/path.py,sha256=HA73r0qedm8IiE_FPrDzRLc5BBkU9_a1qF2JXdWXMQk,49290
 yggdrasil/databricks/workspaces/path_kind.py,sha256=Xc319NysH8_6E9C0Q8nCxDHYG07_SnzyUVKHe0dNdDQ,305
 yggdrasil/databricks/workspaces/workspace.py,sha256=zBlQdYNT_xKwUCYo3O4Q4g-8pfMvff3I26efyCfY_TY,24961
 yggdrasil/dataclasses/__init__.py,sha256=_RkhfF3KC1eSORby1dzvBXQ0-UGG3u6wyUQWX2jq1Pc,108
 yggdrasil/dataclasses/dataclass.py,sha256=LxrCjwvmBnb8yRI_N-c31RHHxB4XoJPixmKg9iBIuaI,1148
 yggdrasil/libs/__init__.py,sha256=zdC9OU0Xy36CLY9mg2drxN6S7isPR8aTLzJA6xVIeLE,91
 yggdrasil/libs/databrickslib.py,sha256=NHJeUViHhZc8LI5oDVfi1axRyUy_pDJLy4hjD0KZEBQ,980
-yggdrasil/libs/pandaslib.py,sha256=
-yggdrasil/libs/polarslib.py,sha256=
+yggdrasil/libs/pandaslib.py,sha256=GoUjh9dxZAFLe9hs8-6RliLD3jsH_BexYW1w-8BZzb0,618
+yggdrasil/libs/polarslib.py,sha256=hnL8x6ygsyIoiJyIUMaeoji3fRzab4lBiHcMqa29C_Q,618
 yggdrasil/libs/sparklib.py,sha256=FQ3W1iz2EIpQreorOiQuFt15rdhq2QhGEAWp8Zrbl9A,10177
 yggdrasil/libs/extensions/__init__.py,sha256=mcXW5Li3Cbprbs4Ci-b5A0Ju0wmLcfvEiFusTx6xNjU,117
 yggdrasil/libs/extensions/polars_extensions.py,sha256=RTkGi8llhPJjX7x9egix7-yXWo2X24zIAPSKXV37SSA,12397
@@ -49,14 +49,14 @@ yggdrasil/types/cast/__init__.py,sha256=Oft3pTs2bRM5hT7YqJAuOKTYYk-SACLaMOXUVdaf
 yggdrasil/types/cast/arrow_cast.py,sha256=_OMYc4t5GlgE4ztlWaCoK8Jnba09rgDbmHVP-QXhOL0,41523
 yggdrasil/types/cast/cast_options.py,sha256=nDaEvCCs7TBamhTWyDrYf3LVaBWzioIP2Q5_LXrChF4,15532
 yggdrasil/types/cast/pandas_cast.py,sha256=I3xu0sZ59ZbK3NDcQ2dslzdeKzhpFV5zR02ZEixd5hI,8713
-yggdrasil/types/cast/polars_cast.py,sha256=
+yggdrasil/types/cast/polars_cast.py,sha256=RILcbfL4o1XDMp5H-06c0BMrDal5pehOT7ACiItDB6E,28791
 yggdrasil/types/cast/polars_pandas_cast.py,sha256=CS0P7teVv15IdX5g7v40RfkH1VMg6b-HM0V_gOfacm8,5071
 yggdrasil/types/cast/registry.py,sha256=_zdFGmUBB7P-e_LIcJlOxMcxAkXoA-UXB6HqLMgTokg,21491
 yggdrasil/types/cast/spark_cast.py,sha256=_KAsl1DqmKMSfWxqhVE7gosjYdgiL1C5bDQv6eP3HtA,24926
 yggdrasil/types/cast/spark_pandas_cast.py,sha256=BuTiWrdCANZCdD_p2MAytqm74eq-rdRXd-LGojBRrfU,5023
 yggdrasil/types/cast/spark_polars_cast.py,sha256=btmZNHXn2NSt3fUuB4xg7coaE0RezIBdZD92H8NK0Jw,9073
-ygg-0.1.
-ygg-0.1.
-ygg-0.1.
-ygg-0.1.
-ygg-0.1.
+ygg-0.1.51.dist-info/METADATA,sha256=JprFwC_aHRV7jMw6YBV4-uAZcTZrEFVu7eE6_2dulG4,18528
+ygg-0.1.51.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ygg-0.1.51.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
+ygg-0.1.51.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
+ygg-0.1.51.dist-info/RECORD,,
yggdrasil/databricks/sql/statement_result.py
CHANGED

@@ -344,10 +344,17 @@ class StatementResult:
         if self.persisted:
             if self._arrow_table is not None:
                 return self._arrow_table.schema
-
+            elif self._spark_df is not None:
+                return spark_schema_to_arrow_schema(self._spark_df.schema)
+            raise NotImplementedError("")
+
+        manifest = self.manifest
+
+        if manifest is None:
+            return pa.schema([])
 
         fields = [
-            column_info_to_arrow_field(_) for _ in
+            column_info_to_arrow_field(_) for _ in manifest.schema.columns
         ]
 
         return pa.schema(fields)
@@ -362,7 +369,7 @@ class StatementResult:
         An Arrow Table containing all rows.
         """
         if self.persisted:
-            if self._arrow_table:
+            if self._arrow_table is not None:
                 return self._arrow_table
             else:
                 return self._spark_df.toArrow()
@@ -370,7 +377,6 @@ class StatementResult:
         batches = list(self.to_arrow_batches(parallel_pool=parallel_pool))
 
         if not batches:
-            # empty table with no columns
            return pa.Table.from_batches([], schema=self.arrow_schema())
 
         return pa.Table.from_batches(batches)
@@ -501,8 +507,9 @@ class StatementResult:
         Returns:
             A Spark DataFrame with the result rows.
         """
-        if self._spark_df:
+        if self._spark_df is not None:
            return self._spark_df
 
         self._spark_df = arrow_table_to_spark_dataframe(self.to_arrow_table())
+
         return self._spark_df
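The `is not None` swaps above are behavioral fixes, not style: with recent pyarrow, a `Table` is falsy when it has zero rows, so the old truthiness checks treated a cached empty result as missing. A standalone sketch of the hazard:

```python
import pyarrow as pa

# A cached but empty result: zero rows, valid schema.
empty = pa.table({"a": pa.array([], type=pa.int64())})

print(empty is not None)  # True  -> new check: cache hit
print(bool(empty))        # False -> old check: len()-based, cache miss
# With "if self._arrow_table:", a persisted zero-row result would fall
# through and be recomputed (or hit self._spark_df, which may be None).
```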
yggdrasil/databricks/workspaces/io.py
CHANGED

@@ -13,8 +13,8 @@ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
 
 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...
-from ...
+from ...libs.pandaslib import pandas, PandasDataFrame
+from ...libs.polarslib import polars, PolarsDataFrame
 from ...types.cast.registry import convert
 
 if databricks is not None:
@@ -42,7 +42,6 @@ class DatabricksIO(ABC, IO):
         path: "DatabricksPath",
         mode: str,
         encoding: Optional[str] = None,
-        compression: Optional[str] = "detect",
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ):
@@ -50,7 +49,6 @@ class DatabricksIO(ABC, IO):
 
         self.encoding = encoding
         self.mode = mode
-        self.compression = compression
 
         self.path = path
 
@@ -111,7 +109,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -120,7 +117,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -129,7 +125,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -226,7 +221,6 @@ class DatabricksIO(ABC, IO):
             path=kwargs.get("path", self.path),
             mode=kwargs.get("mode", self.mode),
             encoding=kwargs.get("encoding", self.encoding),
-            compression=kwargs.get("compression", self.compression),
             position=kwargs.get("position", self.position),
             buffer=kwargs.get("buffer", self._buffer),
         )
@@ -264,8 +258,7 @@ class DatabricksIO(ABC, IO):
             None.
         """
         self.flush()
-
-        self._buffer.close()
+        self.clear_buffer()
 
     def fileno(self):
         """Return a pseudo file descriptor based on object hash.
@@ -403,9 +396,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The read bytes or string depending on mode.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         current_position = self.position
         all_data = self.read_all_bytes(use_cache=use_cache)
 
@@ -431,9 +421,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The next line as bytes or string.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         if self.encoding:
             # Text-mode: accumulate characters
             out_chars = []
@@ -475,9 +462,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             A list of lines.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         lines = []
         total = 0
 
@@ -492,14 +476,6 @@ class DatabricksIO(ABC, IO):
 
         return lines
 
-    def appendable(self):
-        """Return True when the file is open in append mode.
-
-        Returns:
-            True if in append mode.
-        """
-        return "a" in self.mode
-
     def writable(self):
         """Return True to indicate write support.
 
@@ -561,9 +537,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The number of bytes written.
         """
-        if not self.writable():
-            raise IOError("File not open for writing")
-
         if isinstance(data, str):
             data = data.encode(self.encoding or "utf-8")
 
@@ -664,8 +637,12 @@ class DatabricksIO(ABC, IO):
             return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PandasDataFrame):
             return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
-
-
+
+        return self.write_arrow(
+            table=table,
+            file_format=file_format,
+            batch_size=batch_size
+        )
 
     # ---- Arrow ----
 
@@ -691,14 +668,16 @@ class DatabricksIO(ABC, IO):
         if isinstance(file_format, ParquetFileFormat):
             return pq.read_table(self, **kwargs)
 
-
+        elif isinstance(file_format, CsvFileFormat):
             return pcsv.read_csv(self, parse_options=file_format.parse_options)
 
-
+        else:
+            ValueError(f"Unsupported file format for Arrow table: {file_format}")
 
     def write_arrow(
         self,
         table: Union[pa.Table, pa.RecordBatch],
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -706,6 +685,7 @@ class DatabricksIO(ABC, IO):
 
         Args:
             table: Arrow table or batch to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
@@ -717,6 +697,7 @@ class DatabricksIO(ABC, IO):
 
         return self.write_arrow_table(
             table=table,
+            file_format=file_format,
             batch_size=batch_size,
             **kwargs
         )
@@ -776,12 +757,14 @@ class DatabricksIO(ABC, IO):
 
     def read_arrow_batches(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
         """Yield Arrow record batches from the file.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             **kwargs: Format-specific options.
 
@@ -790,7 +773,11 @@ class DatabricksIO(ABC, IO):
         """
         return (
             self
-            .read_arrow_table(
+            .read_arrow_table(
+                file_format=file_format,
+                batch_size=batch_size,
+                **kwargs
+            )
             .to_batches(max_chunksize=batch_size)
         )
 
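`read_arrow_batches` is now a thin wrapper: it materializes the whole table, then re-chunks it. A standalone sketch of the underlying pyarrow call, with illustrative data:

```python
import pyarrow as pa

table = pa.table({"a": list(range(10))})

# Table.to_batches(max_chunksize=...) caps rows per RecordBatch; with
# max_chunksize=None it simply yields the table's existing chunks.
batches = table.to_batches(max_chunksize=4)
print([b.num_rows for b in batches])  # [4, 4, 2]
```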
@@ -798,23 +785,36 @@ class DatabricksIO(ABC, IO):
 
     def read_pandas(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
         """Read the file into a pandas DataFrame.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             **kwargs: Format-specific options.
 
         Returns:
             A pandas DataFrame with the file contents.
         """
-
+        file_format = self.path.file_format if file_format is None else file_format
+        self.seek(0)
+
+        if isinstance(file_format, ParquetFileFormat):
+            return pandas.read_parquet(self, **kwargs)
+
+        elif isinstance(file_format, CsvFileFormat):
+            return pandas.read_csv(self, **kwargs)
+
+        else:
+            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
 
     def write_pandas(
         self,
-        df,
+        df: PandasDataFrame,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -822,13 +822,26 @@ class DatabricksIO(ABC, IO):
 
         Args:
             df: pandas DataFrame to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
         Returns:
             None.
         """
-        self.
+        file_format = self.path.file_format if file_format is None else FileFormat
+        buffer = io.BytesIO()
+
+        if isinstance(file_format, ParquetFileFormat):
+            df.to_parquet(buffer, **kwargs)
+
+        elif isinstance(file_format, CsvFileFormat):
+            df.to_csv(buffer, **kwargs)
+
+        else:
+            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+
+        self.write_all_bytes(data=buffer.getvalue())
 
     # ---- Polars ----
 
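The new `write_pandas` body serializes into an in-memory buffer and uploads the result in one call. A self-contained approximation of that pattern (the function name and the `index=False` choice are illustrative, not the package's):

```python
import io

import pandas as pd
from pyarrow.dataset import CsvFileFormat, FileFormat, ParquetFileFormat

def serialize_dataframe(df: pd.DataFrame, file_format: FileFormat) -> bytes:
    """Render df to bytes according to a pyarrow dataset FileFormat."""
    buffer = io.BytesIO()
    if isinstance(file_format, ParquetFileFormat):
        df.to_parquet(buffer)           # needs a parquet engine (pyarrow)
    elif isinstance(file_format, CsvFileFormat):
        df.to_csv(buffer, index=False)  # pandas >= 1.2 writes to binary handles
    else:
        raise ValueError(f"Unsupported file format: {file_format}")
    return buffer.getvalue()            # upload these bytes in one call

payload = serialize_dataframe(pd.DataFrame({"a": [1, 2]}), ParquetFileFormat())
print(len(payload) > 0)  # True
```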
@@ -848,22 +861,21 @@ class DatabricksIO(ABC, IO):
         Returns:
             A polars DataFrame with the file contents.
         """
-        import polars as pl
-
         file_format = self.path.file_format if file_format is None else file_format
         self.seek(0)
 
         if isinstance(file_format, ParquetFileFormat):
-            return
+            return polars.read_parquet(self, **kwargs)
 
-
-            return
+        elif isinstance(file_format, CsvFileFormat):
+            return polars.read_csv(self, **kwargs)
 
-
+        else:
+            raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
 
     def write_polars(
         self,
-        df,
+        df: PolarsDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
@@ -997,7 +1009,7 @@ class DatabricksVolumeIO(DatabricksIO):
             resp = client.download(full_path)
         except Exception as e:
             # Databricks SDK exceptions vary a bit by version; keep it pragmatic.
-            if allow_not_found and any(s in str(e).lower() for s in ("not found", "
+            if allow_not_found and any(s in str(e).lower() for s in ("not found", "not exist", "404")):
                 return b""
             raise
 
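Because Databricks SDK exception types vary across versions, the volume download path probes the error message text instead. A standalone sketch of that predicate (the function name is illustrative):

```python
def looks_like_not_found(exc: Exception) -> bool:
    # Mirrors the substring probe in DatabricksVolumeIO's download path.
    msg = str(exc).lower()
    return any(s in msg for s in ("not found", "not exist", "404"))

print(looks_like_not_found(RuntimeError("Path does not exist: /Volumes/x")))  # True
print(looks_like_not_found(RuntimeError("HTTP 404")))                         # True
print(looks_like_not_found(RuntimeError("permission denied")))                # False
```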
yggdrasil/databricks/workspaces/path.py
CHANGED

@@ -12,17 +12,18 @@ from pathlib import PurePosixPath
 from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable
 
 import pyarrow as pa
+import pyarrow.dataset as ds
 from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
 from pyarrow.fs import FileInfo, FileType, FileSystem
-import pyarrow.dataset as ds
 
 from .io import DatabricksIO
 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...
+from ...libs.pandaslib import PandasDataFrame
+from ...libs.polarslib import polars, PolarsDataFrame
+from ...types.cast.arrow_cast import cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
-from ...types.cast.polars_cast import polars_converter
-from ...types.cast.polars_pandas_cast import PolarsDataFrame
+from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
 from ...types.cast.registry import convert, register_converter
 
 if databricks is not None:
@@ -504,7 +505,7 @@ class DatabricksPath:
                 else None
             )
 
-            return self.reset_metadata(is_file=False, is_dir=True, size=
+            return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=mtime)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
 
@@ -639,22 +640,12 @@ class DatabricksPath:
         Returns:
             The DatabricksPath instance.
         """
-
-
-
-
-
-
-                self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
-        except (NotFound, ResourceDoesNotExist):
-            if not parents or self.parent == self:
-                raise
-
-            self.parent.mkdir(parents=True, exist_ok=True)
-            self.mkdir(parents=False, exist_ok=exist_ok)
-        except (AlreadyExists, ResourceAlreadyExists):
-            if not exist_ok:
-                raise
+        if self.kind == DatabricksPathKind.WORKSPACE:
+            self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
+        elif self.kind == DatabricksPathKind.VOLUME:
+            self.make_volume_dir(parents=parents, exist_ok=exist_ok)
+        elif self.kind == DatabricksPathKind.DBFS:
+            self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
 
         return self
 
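`mkdir` drops its retry-on-missing-parent logic in favor of a plain dispatch on path kind, delegating parent and exist-ok handling to the per-backend helpers; note that an unmatched kind now falls through and `self` is returned unchanged. A condensed sketch of the shape (the enum stands in for path_kind.py, and the table-driven form is this annotation's choice, not the package's):

```python
from enum import Enum, auto

class DatabricksPathKind(Enum):  # stand-in for yggdrasil's path_kind module
    WORKSPACE = auto()
    VOLUME = auto()
    DBFS = auto()

MAKERS = {
    DatabricksPathKind.WORKSPACE: "make_workspace_dir",
    DatabricksPathKind.VOLUME: "make_volume_dir",
    DatabricksPathKind.DBFS: "make_dbfs_dir",
}

def mkdir(path, parents: bool = True, exist_ok: bool = True):
    # Equivalent to the if/elif chain in the hunk above.
    getattr(path, MAKERS[path.kind])(parents=parents, exist_ok=exist_ok)
    return path
```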
@@ -770,15 +761,13 @@ class DatabricksPath:
         Returns:
             The DatabricksPath instance.
         """
-
-
-
-
-
-
-
-        finally:
-            self.reset_metadata()
+        if self.kind == DatabricksPathKind.VOLUME:
+            return self._remove_volume_file()
+        elif self.kind == DatabricksPathKind.WORKSPACE:
+            return self._remove_workspace_file()
+        elif self.kind == DatabricksPathKind.DBFS:
+            return self._remove_dbfs_file()
+
         return self
 
     def _remove_volume_file(self):
@@ -787,6 +776,9 @@ class DatabricksPath:
             sdk.files.delete(self.files_full_path())
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_workspace_file(self):
@@ -795,6 +787,9 @@ class DatabricksPath:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_dbfs_file(self):
@@ -803,6 +798,9 @@ class DatabricksPath:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def rmdir(self, recursive: bool = True):
@@ -827,7 +825,9 @@ class DatabricksPath:
             sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
-
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_dbfs_dir(self, recursive: bool = True):
@@ -836,7 +836,9 @@ class DatabricksPath:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
-
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_volume_dir(self, recursive: bool = True):
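Each delete helper now invalidates cached path metadata in a `finally` block, so stat caches are dropped whether the SDK call succeeds, is swallowed as not-found, or re-raises. The shape of that pattern in isolation (names here are illustrative):

```python
def _remove(delete_call, reset_metadata):
    # Skeleton of the _remove_* helpers: the finally clause runs on
    # success, on the swallowed not-found family, and on re-raise alike.
    try:
        delete_call()
    except FileNotFoundError:  # stand-in for NotFound/ResourceDoesNotExist/...
        pass
    finally:
        reset_metadata()
```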
@@ -1042,7 +1044,7 @@ class DatabricksPath:
         Returns:
             None.
         """
-        if self.is_file()
+        if self.is_file():
             with self.open(mode="rb") as src:
                 src.copy_to(dest=dest)
 
@@ -1067,6 +1069,13 @@ class DatabricksPath:
         else:
             raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
 
+    def write_bytes(self, data: bytes):
+        if hasattr(data, "read"):
+            data = data.read()
+
+        with self.open("wb") as f:
+            f.write_all_bytes(data=data)
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
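The new `write_bytes` accepts raw bytes or, thanks to the `hasattr(data, "read")` probe, any file-like source, which it drains before uploading. A hypothetical usage sketch (the `path` instance is assumed):

```python
import io

# given some DatabricksPath instance `path`:
path.write_bytes(b"raw payload")                # bytes go straight through
path.write_bytes(io.BytesIO(b"buffered data"))  # .read() is called first
```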
@@ -1112,9 +1121,9 @@ class DatabricksPath:
         """
         if self.is_file():
             with self.open("rb") as f:
-
-
-
+                data = f.read_arrow_table(batch_size=batch_size, **kwargs)
+                return data
+        elif self.is_dir():
             tables: list[pa.Table] = []
             for child in self.ls(recursive=True):
                 if child.is_file():
@@ -1210,6 +1219,7 @@ class DatabricksPath:
 
     def read_pandas(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         concat: bool = True,
         **kwargs
@@ -1217,6 +1227,7 @@ class DatabricksPath:
         """Read the path into a pandas DataFrame.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             concat: Whether to concatenate results for directories.
             **kwargs: Format-specific options.
@@ -1225,14 +1236,26 @@ class DatabricksPath:
             A pandas DataFrame or list of DataFrames if concat=False.
         """
         if concat:
-            return self.read_arrow_table(
+            return self.read_arrow_table(
+                file_format=file_format,
+                batch_size=batch_size,
+                concat=True,
+                **kwargs
+            ).to_pandas()
+
+        tables = self.read_arrow_table(
+            batch_size=batch_size,
+            file_format=file_format,
+            concat=False,
+            **kwargs
+        )
 
-        tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
         return [t.to_pandas() for t in tables]  # type: ignore[arg-type]
 
     def write_pandas(
         self,
-        df,
+        df: PandasDataFrame,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -1240,13 +1263,41 @@ class DatabricksPath:
 
         Args:
             df: pandas DataFrame to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
         """
-
+        with self.connect(clone=False) as connected:
+            if connected.is_dir_sink():
+                seed = int(time.time() * 1000)
+
+                def df_batches(pdf, bs: int):
+                    for start in range(0, len(pdf), batch_size):
+                        yield pdf.iloc[start:start + batch_size]
+
+                for i, batch in enumerate(df_batches(df, batch_size)):
+                    part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
+
+                    with part_path.open(mode="wb", clone=False) as f:
+                        f.write_pandas(
+                            batch,
+                            file_format=file_format,
+                            batch_size=batch_size,
+                            **kwargs
+                        )
+            else:
+                with connected.open(mode="wb", clone=False) as f:
+                    f.write_pandas(
+                        df,
+                        file_format=file_format,
+                        batch_size=batch_size,
+                        **kwargs
+                    )
+
+        return self
 
     def read_polars(
         self,
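For directory sinks, `write_pandas` now shards the frame into part files named `{seed}-{index}-{suffix}.parquet`. The row-slicing generator at its core, as a standalone sketch (note it relies on a non-None batch size):

```python
import pandas as pd

def df_batches(pdf: pd.DataFrame, batch_size: int):
    # iloc slicing is positional and end-exclusive, so batches never overlap.
    for start in range(0, len(pdf), batch_size):
        yield pdf.iloc[start:start + batch_size]

df = pd.DataFrame({"a": range(10)})
print([len(b) for b in df_batches(df, 4)])  # [4, 4, 2]
```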
@@ -1268,8 +1319,6 @@ class DatabricksPath:
         Returns:
             A polars DataFrame or list of DataFrames if concat=False.
         """
-        import polars as pl
-
         if self.is_file():
             with self.open("rb") as f:
                 return f.read_polars(batch_size=batch_size, **kwargs)
@@ -1282,10 +1331,10 @@ class DatabricksPath:
                     dfs.append(f.read_polars(batch_size=batch_size, **kwargs))
 
            if not dfs:
-                return
+                return polars.DataFrame()
 
            if concat:
-                return
+                return polars.concat(dfs, how=how, rechunk=rechunk)
            return dfs  # type: ignore[return-value]
 
        raise FileNotFoundError(f"Path does not exist: {self}")
@@ -1316,12 +1365,10 @@ class DatabricksPath:
         Notes:
             - If `df` is a LazyFrame, we collect it first (optionally streaming).
         """
-
-
-        if isinstance(df, pl.LazyFrame):
+        if isinstance(df, polars.LazyFrame):
             df = df.collect()
 
-        if not isinstance(df,
+        if not isinstance(df, polars.DataFrame):
             raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
 
         with self.connect() as connected:
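The directory read path now has concrete return values: an empty `polars.DataFrame()` instead of a bare `return` (which yielded `None`), and `polars.concat` for the merged case. Illustrative:

```python
import polars as pl

dfs = [pl.DataFrame({"a": [1, 2]}), pl.DataFrame({"a": [3]})]

merged = pl.concat(dfs, how="vertical", rechunk=True)
print(merged.height)          # 3

print(pl.DataFrame().height)  # 0: an empty frame, but never None
```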
yggdrasil/libs/pandaslib.py
CHANGED
@@ -3,9 +3,14 @@
 try:
     import pandas  # type: ignore
     pandas = pandas
+
+    PandasDataFrame = pandas.DataFrame
 except ImportError:
     pandas = None
 
+    class PandasDataFrame:
+        pass
+
 
 def require_pandas():
     """Ensure pandas is available before using pandas helpers.
@@ -23,4 +28,5 @@ def require_pandas():
 __all__ = [
     "pandas",
     "require_pandas",
+    "PandasDataFrame"
 ]
yggdrasil/libs/polarslib.py
CHANGED
@@ -4,13 +4,18 @@ try:
     import polars  # type: ignore
 
     polars = polars
+
+    PolarsDataFrame = polars.DataFrame
 except ImportError:
     polars = None
 
+    class PolarsDataFrame:
+        pass
 
 __all__ = [
     "polars",
     "require_polars",
+    "PolarsDataFrame"
 ]
 
 
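Both shim modules follow the same optional-dependency pattern: on successful import they alias the real `DataFrame` class, and on `ImportError` they install an empty placeholder. The payoff is that `isinstance` checks elsewhere (for example the `write_table` dispatch in io.py) stay valid without the dependency, since nothing is ever an instance of the placeholder. A condensed sketch:

```python
try:
    import pandas
    PandasDataFrame = pandas.DataFrame
except ImportError:
    pandas = None

    class PandasDataFrame:  # placeholder: no instance of this ever exists
        pass

def is_pandas_frame(obj) -> bool:
    # True only when pandas is installed and obj really is a DataFrame;
    # False (rather than a NameError) when pandas is missing.
    return isinstance(obj, PandasDataFrame)

print(is_pandas_frame([1, 2, 3]))  # False either way
```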
yggdrasil/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.49"
+__version__ = "0.1.51"
{ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/WHEEL
File without changes

{ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/entry_points.txt
File without changes

{ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/licenses/LICENSE
File without changes

{ygg-0.1.49.dist-info → ygg-0.1.51.dist-info}/top_level.txt
File without changes