ygg-0.1.49-py3-none-any.whl → ygg-0.1.51-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ygg
-Version: 0.1.49
+Version: 0.1.51
 Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
 Author: Yggdrasil contributors
 License: Apache License

@@ -1,6 +1,6 @@
-ygg-0.1.49.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ygg-0.1.51.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
 yggdrasil/__init__.py,sha256=4-ghPak2S6zfMqmnlxW2GCgPb5s79znpKa2hGEGXcE4,24
-yggdrasil/version.py,sha256=pnii9XXudF0U50FobVvNgNzGy9lA9q_DntGQAvyqaFA,22
+yggdrasil/version.py,sha256=Vba463sBalMddSSVNE_881HL99Fg7msGZpAiG0JX6bg,22
 yggdrasil/databricks/__init__.py,sha256=skctY2c8W-hI81upx9F_PWRe5ishL3hrdiTuizgDjdw,152
 yggdrasil/databricks/compute/__init__.py,sha256=NvdzmaJSNYY1uJthv1hHdBuNu3bD_-Z65DWnaJt9yXg,289
 yggdrasil/databricks/compute/cluster.py,sha256=YomLfvB0oxbgl6WDgBRxI1UXsxwlEbR6gq3FUbPHscY,44199
@@ -11,21 +11,21 @@ yggdrasil/databricks/jobs/config.py,sha256=9LGeHD04hbfy0xt8_6oobC4moKJh4_DTjZiK4
 yggdrasil/databricks/sql/__init__.py,sha256=Vp_1cFaX1l-JGzCknvkbiB8CBFX2fQbBNntIeVn3lEg,231
 yggdrasil/databricks/sql/engine.py,sha256=K5WmGKpXU78JA3UdK8dLxBD_GXKidZJFe7hytuC5UHg,41029
 yggdrasil/databricks/sql/exceptions.py,sha256=uC-BoG0u0LtORKUS1X3iLID8nc-0TV5MQN3M8RXHsO4,1495
-yggdrasil/databricks/sql/statement_result.py,sha256=kMBvpwyRv3_JUZSvxMS0c9Vqlh6LtCRJvXsDpu9RIAs,16137
+yggdrasil/databricks/sql/statement_result.py,sha256=GZyVhhrUK5opNo-8HGqsMx0Rp9fa_0zqvn8McSHPQ8U,16310
 yggdrasil/databricks/sql/types.py,sha256=5G-BM9_eOsRKEMzeDTWUsWW5g4Idvs-czVCpOCrMhdA,6412
 yggdrasil/databricks/sql/warehouse.py,sha256=1J0dyQLJb-OS1_1xU1eAVZ4CoL2-FhFeowKSvU3RzFc,9773
 yggdrasil/databricks/workspaces/__init__.py,sha256=dv2zotoFVhNFlTCdRq6gwf5bEzeZkOZszoNZMs0k59g,114
 yggdrasil/databricks/workspaces/filesytem.py,sha256=Z8JXU7_XUEbw9fpTQT1avRQKi-IAP2KemXBMPkUoY4w,9805
-yggdrasil/databricks/workspaces/io.py,sha256=CDq9NsYFjlSJ1QbKFlfWvZLQPVoWyZ4b3XR_lxNPcZE,32776
-yggdrasil/databricks/workspaces/path.py,sha256=BxDwxE7q1-NLKEZQT4xLM3LeCeQKO3wUy7R-Ce-cSMk,47875
+yggdrasil/databricks/workspaces/io.py,sha256=PhXMVrK8ngDl6kKjnh8_jlZ2GsKtU2nLSi1nFgV4Sks,33302
+yggdrasil/databricks/workspaces/path.py,sha256=HA73r0qedm8IiE_FPrDzRLc5BBkU9_a1qF2JXdWXMQk,49290
 yggdrasil/databricks/workspaces/path_kind.py,sha256=Xc319NysH8_6E9C0Q8nCxDHYG07_SnzyUVKHe0dNdDQ,305
 yggdrasil/databricks/workspaces/workspace.py,sha256=zBlQdYNT_xKwUCYo3O4Q4g-8pfMvff3I26efyCfY_TY,24961
 yggdrasil/dataclasses/__init__.py,sha256=_RkhfF3KC1eSORby1dzvBXQ0-UGG3u6wyUQWX2jq1Pc,108
 yggdrasil/dataclasses/dataclass.py,sha256=LxrCjwvmBnb8yRI_N-c31RHHxB4XoJPixmKg9iBIuaI,1148
 yggdrasil/libs/__init__.py,sha256=zdC9OU0Xy36CLY9mg2drxN6S7isPR8aTLzJA6xVIeLE,91
 yggdrasil/libs/databrickslib.py,sha256=NHJeUViHhZc8LI5oDVfi1axRyUy_pDJLy4hjD0KZEBQ,980
-yggdrasil/libs/pandaslib.py,sha256=Edm3SXgvr8qe2wsojuRvD1ewNB-Sff0RWoTqaddVruI,509
-yggdrasil/libs/polarslib.py,sha256=7EWP5iS8F9cW79M6d8Yg5ysjnOY3w4_k7TW-5DCRACw,511
+yggdrasil/libs/pandaslib.py,sha256=GoUjh9dxZAFLe9hs8-6RliLD3jsH_BexYW1w-8BZzb0,618
+yggdrasil/libs/polarslib.py,sha256=hnL8x6ygsyIoiJyIUMaeoji3fRzab4lBiHcMqa29C_Q,618
 yggdrasil/libs/sparklib.py,sha256=FQ3W1iz2EIpQreorOiQuFt15rdhq2QhGEAWp8Zrbl9A,10177
 yggdrasil/libs/extensions/__init__.py,sha256=mcXW5Li3Cbprbs4Ci-b5A0Ju0wmLcfvEiFusTx6xNjU,117
 yggdrasil/libs/extensions/polars_extensions.py,sha256=RTkGi8llhPJjX7x9egix7-yXWo2X24zIAPSKXV37SSA,12397
@@ -49,14 +49,14 @@ yggdrasil/types/cast/__init__.py,sha256=Oft3pTs2bRM5hT7YqJAuOKTYYk-SACLaMOXUVdaf
 yggdrasil/types/cast/arrow_cast.py,sha256=_OMYc4t5GlgE4ztlWaCoK8Jnba09rgDbmHVP-QXhOL0,41523
 yggdrasil/types/cast/cast_options.py,sha256=nDaEvCCs7TBamhTWyDrYf3LVaBWzioIP2Q5_LXrChF4,15532
 yggdrasil/types/cast/pandas_cast.py,sha256=I3xu0sZ59ZbK3NDcQ2dslzdeKzhpFV5zR02ZEixd5hI,8713
-yggdrasil/types/cast/polars_cast.py,sha256=K2nnQ7bexArneYEhUPgV_6er4JNq6N5RmbMUhw-2_Xw,28766
+yggdrasil/types/cast/polars_cast.py,sha256=RILcbfL4o1XDMp5H-06c0BMrDal5pehOT7ACiItDB6E,28791
 yggdrasil/types/cast/polars_pandas_cast.py,sha256=CS0P7teVv15IdX5g7v40RfkH1VMg6b-HM0V_gOfacm8,5071
 yggdrasil/types/cast/registry.py,sha256=_zdFGmUBB7P-e_LIcJlOxMcxAkXoA-UXB6HqLMgTokg,21491
 yggdrasil/types/cast/spark_cast.py,sha256=_KAsl1DqmKMSfWxqhVE7gosjYdgiL1C5bDQv6eP3HtA,24926
 yggdrasil/types/cast/spark_pandas_cast.py,sha256=BuTiWrdCANZCdD_p2MAytqm74eq-rdRXd-LGojBRrfU,5023
 yggdrasil/types/cast/spark_polars_cast.py,sha256=btmZNHXn2NSt3fUuB4xg7coaE0RezIBdZD92H8NK0Jw,9073
-ygg-0.1.49.dist-info/METADATA,sha256=CHTqeVyiYa1868ZDwISDHKyXYxPeUH0mHhvHLYYoDbg,18528
-ygg-0.1.49.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ygg-0.1.49.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
-ygg-0.1.49.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
-ygg-0.1.49.dist-info/RECORD,,
+ygg-0.1.51.dist-info/METADATA,sha256=JprFwC_aHRV7jMw6YBV4-uAZcTZrEFVu7eE6_2dulG4,18528
+ygg-0.1.51.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ygg-0.1.51.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
+ygg-0.1.51.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
+ygg-0.1.51.dist-info/RECORD,,

yggdrasil/databricks/sql/statement_result.py CHANGED
@@ -344,10 +344,17 @@ class StatementResult:
         if self.persisted:
             if self._arrow_table is not None:
                 return self._arrow_table.schema
-            return spark_schema_to_arrow_schema(self._spark_df.schema)
+            elif self._spark_df is not None:
+                return spark_schema_to_arrow_schema(self._spark_df.schema)
+            raise NotImplementedError("")
+
+        manifest = self.manifest
+
+        if manifest is None:
+            return pa.schema([])
 
         fields = [
-            column_info_to_arrow_field(_) for _ in self.manifest.schema.columns
+            column_info_to_arrow_field(_) for _ in manifest.schema.columns
         ]
 
         return pa.schema(fields)
@@ -362,7 +369,7 @@ class StatementResult:
             An Arrow Table containing all rows.
         """
         if self.persisted:
-            if self._arrow_table:
+            if self._arrow_table is not None:
                 return self._arrow_table
             else:
                 return self._spark_df.toArrow()
@@ -370,7 +377,6 @@ class StatementResult:
         batches = list(self.to_arrow_batches(parallel_pool=parallel_pool))
 
         if not batches:
-            # empty table with no columns
            return pa.Table.from_batches([], schema=self.arrow_schema())
 
        return pa.Table.from_batches(batches)
@@ -501,8 +507,9 @@ class StatementResult:
         Returns:
             A Spark DataFrame with the result rows.
         """
-        if self._spark_df:
+        if self._spark_df is not None:
             return self._spark_df
 
         self._spark_df = arrow_table_to_spark_dataframe(self.to_arrow_table())
+
         return self._spark_df
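
The statement_result.py hunks above replace truthiness checks with explicit `is not None` tests and give `arrow_schema()` a manifest-based fallback. A minimal sketch of the resulting behaviour, assuming `result` is a `StatementResult` instance obtained from the yggdrasil SQL engine (how it is constructed is not part of this diff):

```python
import pyarrow as pa

# Hypothetical usage; `result` is assumed to already exist.
schema = result.arrow_schema()   # persisted: schema of the cached Arrow table or Spark frame;
                                 # otherwise built from manifest columns, or pa.schema([]) with no manifest
assert isinstance(schema, pa.Schema)

table = result.to_arrow_table()  # with no batches, returns an empty table using arrow_schema()
```
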
yggdrasil/databricks/workspaces/io.py CHANGED
@@ -13,8 +13,8 @@ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
 
 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...types.cast.pandas_cast import PandasDataFrame
-from ...types.cast.polars_pandas_cast import PolarsDataFrame
+from ...libs.pandaslib import pandas, PandasDataFrame
+from ...libs.polarslib import polars, PolarsDataFrame
 from ...types.cast.registry import convert
 
 if databricks is not None:
@@ -42,7 +42,6 @@ class DatabricksIO(ABC, IO):
         path: "DatabricksPath",
         mode: str,
         encoding: Optional[str] = None,
-        compression: Optional[str] = "detect",
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ):
@@ -50,7 +49,6 @@ class DatabricksIO(ABC, IO):
 
         self.encoding = encoding
         self.mode = mode
-        self.compression = compression
 
         self.path = path
 
@@ -111,7 +109,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -120,7 +117,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -129,7 +125,6 @@ class DatabricksIO(ABC, IO):
             path=path,
             mode=mode,
             encoding=encoding,
-            compression=compression,
             position=position,
             buffer=buffer,
         )
@@ -226,7 +221,6 @@ class DatabricksIO(ABC, IO):
             path=kwargs.get("path", self.path),
             mode=kwargs.get("mode", self.mode),
             encoding=kwargs.get("encoding", self.encoding),
-            compression=kwargs.get("compression", self.compression),
             position=kwargs.get("position", self.position),
             buffer=kwargs.get("buffer", self._buffer),
         )
@@ -264,8 +258,7 @@ class DatabricksIO(ABC, IO):
             None.
         """
         self.flush()
-        if self._buffer is not None:
-            self._buffer.close()
+        self.clear_buffer()
 
     def fileno(self):
         """Return a pseudo file descriptor based on object hash.
@@ -403,9 +396,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The read bytes or string depending on mode.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         current_position = self.position
         all_data = self.read_all_bytes(use_cache=use_cache)
 
@@ -431,9 +421,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The next line as bytes or string.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         if self.encoding:
             # Text-mode: accumulate characters
             out_chars = []
@@ -475,9 +462,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             A list of lines.
         """
-        if not self.readable():
-            raise IOError("File not open for reading")
-
         lines = []
         total = 0
 
@@ -492,14 +476,6 @@ class DatabricksIO(ABC, IO):
 
         return lines
 
-    def appendable(self):
-        """Return True when the file is open in append mode.
-
-        Returns:
-            True if in append mode.
-        """
-        return "a" in self.mode
-
     def writable(self):
         """Return True to indicate write support.
 
@@ -561,9 +537,6 @@ class DatabricksIO(ABC, IO):
         Returns:
             The number of bytes written.
         """
-        if not self.writable():
-            raise IOError("File not open for writing")
-
         if isinstance(data, str):
             data = data.encode(self.encoding or "utf-8")
 
@@ -664,8 +637,12 @@ class DatabricksIO(ABC, IO):
             return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PandasDataFrame):
             return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
-        else:
-            raise ValueError(f"Cannot write {type(table)} to {self.path}")
+
+        return self.write_arrow(
+            table=table,
+            file_format=file_format,
+            batch_size=batch_size
+        )
 
     # ---- Arrow ----
 
@@ -691,14 +668,16 @@ class DatabricksIO(ABC, IO):
         if isinstance(file_format, ParquetFileFormat):
             return pq.read_table(self, **kwargs)
 
-        if isinstance(file_format, CsvFileFormat):
+        elif isinstance(file_format, CsvFileFormat):
             return pcsv.read_csv(self, parse_options=file_format.parse_options)
 
-        raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
+        else:
+            ValueError(f"Unsupported file format for Arrow table: {file_format}")
 
     def write_arrow(
         self,
         table: Union[pa.Table, pa.RecordBatch],
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -706,6 +685,7 @@ class DatabricksIO(ABC, IO):
 
         Args:
             table: Arrow table or batch to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
@@ -717,6 +697,7 @@ class DatabricksIO(ABC, IO):
 
         return self.write_arrow_table(
             table=table,
+            file_format=file_format,
             batch_size=batch_size,
             **kwargs
         )
@@ -776,12 +757,14 @@ class DatabricksIO(ABC, IO):
 
     def read_arrow_batches(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
         """Yield Arrow record batches from the file.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             **kwargs: Format-specific options.
 
@@ -790,7 +773,11 @@ class DatabricksIO(ABC, IO):
         """
         return (
             self
-            .read_arrow_table(batch_size=batch_size, **kwargs)
+            .read_arrow_table(
+                file_format=file_format,
+                batch_size=batch_size,
+                **kwargs
+            )
             .to_batches(max_chunksize=batch_size)
         )
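
`read_arrow_batches` and `write_arrow` (hunks above) both gain an optional `file_format` argument that is forwarded to `read_arrow_table`/`write_arrow_table`. A hedged sketch, assuming `reader` and `writer` are `DatabricksIO` handles opened in `"rb"` and `"wb"` mode respectively:

```python
import pyarrow as pa

tbl = pa.table({"x": [1, 2, 3]})

# write_arrow forwards table, file_format and batch_size to write_arrow_table.
writer.write_arrow(tbl)

# read_arrow_batches reads the whole table, then re-chunks it via to_batches().
for batch in reader.read_arrow_batches(batch_size=1_000):
    print(batch.num_rows)
```
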
@@ -798,23 +785,36 @@ class DatabricksIO(ABC, IO):
 
     def read_pandas(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
         """Read the file into a pandas DataFrame.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             **kwargs: Format-specific options.
 
         Returns:
             A pandas DataFrame with the file contents.
         """
-        return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()
+        file_format = self.path.file_format if file_format is None else file_format
+        self.seek(0)
+
+        if isinstance(file_format, ParquetFileFormat):
+            return pandas.read_parquet(self, **kwargs)
+
+        elif isinstance(file_format, CsvFileFormat):
+            return pandas.read_csv(self, **kwargs)
+
+        else:
+            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
 
     def write_pandas(
         self,
-        df,
+        df: PandasDataFrame,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -822,13 +822,26 @@ class DatabricksIO(ABC, IO):
 
         Args:
             df: pandas DataFrame to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
         Returns:
             None.
         """
-        self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
+        file_format = self.path.file_format if file_format is None else FileFormat
+        buffer = io.BytesIO()
+
+        if isinstance(file_format, ParquetFileFormat):
+            df.to_parquet(buffer, **kwargs)
+
+        elif isinstance(file_format, CsvFileFormat):
+            df.to_csv(buffer, **kwargs)
+
+        else:
+            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+
+        self.write_all_bytes(data=buffer.getvalue())
 
     # ---- Polars ----
 
@@ -848,22 +861,21 @@ class DatabricksIO(ABC, IO):
         Returns:
             A polars DataFrame with the file contents.
         """
-        import polars as pl
-
         file_format = self.path.file_format if file_format is None else file_format
         self.seek(0)
 
         if isinstance(file_format, ParquetFileFormat):
-            return pl.read_parquet(self, **kwargs)
+            return polars.read_parquet(self, **kwargs)
 
-        if isinstance(file_format, CsvFileFormat):
-            return pl.read_csv(self, **kwargs)
+        elif isinstance(file_format, CsvFileFormat):
+            return polars.read_csv(self, **kwargs)
 
-        raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
+        else:
+            raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
 
     def write_polars(
         self,
-        df,
+        df: PolarsDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
@@ -997,7 +1009,7 @@ class DatabricksVolumeIO(DatabricksIO):
             resp = client.download(full_path)
         except Exception as e:
             # Databricks SDK exceptions vary a bit by version; keep it pragmatic.
-            if allow_not_found and any(s in str(e).lower() for s in ("not found", "does not exist", "404")):
+            if allow_not_found and any(s in str(e).lower() for s in ("not found", "not exist", "404")):
                 return b""
             raise
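
The remaining io.py changes route pandas reads through `pandas.read_parquet`/`pandas.read_csv` and pandas writes through `DataFrame.to_parquet`/`to_csv` into an in-memory buffer flushed with `write_all_bytes`; when no override is given the format defaults to `self.path.file_format`. A hedged sketch, assuming `path` is a `DatabricksPath` pointing at a CSV file and `pdf` is a pandas DataFrame (names are illustrative only):

```python
from pyarrow.dataset import CsvFileFormat

# Read through the DatabricksIO handle, overriding the inferred format.
with path.open(mode="rb") as f:
    df = f.read_pandas(file_format=CsvFileFormat())

# Write back; with no override the format comes from the path itself.
with path.open(mode="wb") as f:
    f.write_pandas(pdf)
```
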
yggdrasil/databricks/workspaces/path.py CHANGED
@@ -12,17 +12,18 @@ from pathlib import PurePosixPath
 from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable
 
 import pyarrow as pa
+import pyarrow.dataset as ds
 from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
 from pyarrow.fs import FileInfo, FileType, FileSystem
-import pyarrow.dataset as ds
 
 from .io import DatabricksIO
 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...types import cast_arrow_tabular, cast_polars_dataframe
+from ...libs.pandaslib import PandasDataFrame
+from ...libs.polarslib import polars, PolarsDataFrame
+from ...types.cast.arrow_cast import cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
-from ...types.cast.polars_cast import polars_converter
-from ...types.cast.polars_pandas_cast import PolarsDataFrame
+from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
 from ...types.cast.registry import convert, register_converter
 
 if databricks is not None:
@@ -504,7 +505,7 @@ class DatabricksPath:
                 else None
             )
 
-            return self.reset_metadata(is_file=False, is_dir=True, size=info, mtime=mtime)
+            return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=mtime)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
 
@@ -639,22 +640,12 @@ class DatabricksPath:
         Returns:
             The DatabricksPath instance.
         """
-        try:
-            if self.kind == DatabricksPathKind.WORKSPACE:
-                self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
-            elif self.kind == DatabricksPathKind.VOLUME:
-                self.make_volume_dir(parents=parents, exist_ok=exist_ok)
-            elif self.kind == DatabricksPathKind.DBFS:
-                self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
-        except (NotFound, ResourceDoesNotExist):
-            if not parents or self.parent == self:
-                raise
-
-            self.parent.mkdir(parents=True, exist_ok=True)
-            self.mkdir(parents=False, exist_ok=exist_ok)
-        except (AlreadyExists, ResourceAlreadyExists):
-            if not exist_ok:
-                raise
+        if self.kind == DatabricksPathKind.WORKSPACE:
+            self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
+        elif self.kind == DatabricksPathKind.VOLUME:
+            self.make_volume_dir(parents=parents, exist_ok=exist_ok)
+        elif self.kind == DatabricksPathKind.DBFS:
+            self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
 
         return self
 
@@ -770,15 +761,13 @@ class DatabricksPath:
         Returns:
             The DatabricksPath instance.
         """
-        try:
-            if self.kind == DatabricksPathKind.VOLUME:
-                return self._remove_volume_file()
-            elif self.kind == DatabricksPathKind.WORKSPACE:
-                return self._remove_workspace_file()
-            elif self.kind == DatabricksPathKind.DBFS:
-                return self._remove_dbfs_file()
-        finally:
-            self.reset_metadata()
+        if self.kind == DatabricksPathKind.VOLUME:
+            return self._remove_volume_file()
+        elif self.kind == DatabricksPathKind.WORKSPACE:
+            return self._remove_workspace_file()
+        elif self.kind == DatabricksPathKind.DBFS:
+            return self._remove_dbfs_file()
+
         return self
 
     def _remove_volume_file(self):
@@ -787,6 +776,9 @@ class DatabricksPath:
             sdk.files.delete(self.files_full_path())
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_workspace_file(self):
@@ -795,6 +787,9 @@ class DatabricksPath:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_dbfs_file(self):
@@ -803,6 +798,9 @@ class DatabricksPath:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
+        finally:
+            self.reset_metadata()
+
         return self
 
     def rmdir(self, recursive: bool = True):
@@ -827,7 +825,9 @@ class DatabricksPath:
             sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
-        self.reset_metadata()
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_dbfs_dir(self, recursive: bool = True):
@@ -836,7 +836,9 @@ class DatabricksPath:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
             pass
-        self.reset_metadata()
+        finally:
+            self.reset_metadata()
+
         return self
 
     def _remove_volume_dir(self, recursive: bool = True):
@@ -1042,7 +1044,7 @@ class DatabricksPath:
         Returns:
             None.
         """
-        if self.is_file() and dest.is_file():
+        if self.is_file():
             with self.open(mode="rb") as src:
                 src.copy_to(dest=dest)
 
@@ -1067,6 +1069,13 @@ class DatabricksPath:
         else:
             raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
 
+    def write_bytes(self, data: bytes):
+        if hasattr(data, "read"):
+            data = data.read()
+
+        with self.open("wb") as f:
+            f.write_all_bytes(data=data)
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
@@ -1112,9 +1121,9 @@ class DatabricksPath:
         """
         if self.is_file():
             with self.open("rb") as f:
-                return f.read_arrow_table(batch_size=batch_size, **kwargs)
-
-        if self.is_dir():
+                data = f.read_arrow_table(batch_size=batch_size, **kwargs)
+            return data
+        elif self.is_dir():
             tables: list[pa.Table] = []
             for child in self.ls(recursive=True):
                 if child.is_file():
@@ -1210,6 +1219,7 @@ class DatabricksPath:
 
     def read_pandas(
         self,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         concat: bool = True,
         **kwargs
@@ -1217,6 +1227,7 @@ class DatabricksPath:
         """Read the path into a pandas DataFrame.
 
         Args:
+            file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             concat: Whether to concatenate results for directories.
             **kwargs: Format-specific options.
@@ -1225,14 +1236,26 @@ class DatabricksPath:
             A pandas DataFrame or list of DataFrames if concat=False.
         """
         if concat:
-            return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()
+            return self.read_arrow_table(
+                file_format=file_format,
+                batch_size=batch_size,
+                concat=True,
+                **kwargs
+            ).to_pandas()
+
+        tables = self.read_arrow_table(
+            batch_size=batch_size,
+            file_format=file_format,
+            concat=False,
+            **kwargs
+        )
 
-        tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
         return [t.to_pandas() for t in tables]  # type: ignore[arg-type]
 
     def write_pandas(
         self,
-        df,
+        df: PandasDataFrame,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -1240,13 +1263,41 @@ class DatabricksPath:
 
         Args:
             df: pandas DataFrame to write.
+            file_format: Optional file format override.
             batch_size: Optional batch size for writes.
             **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
         """
-        return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
+        with self.connect(clone=False) as connected:
+            if connected.is_dir_sink():
+                seed = int(time.time() * 1000)
+
+                def df_batches(pdf, bs: int):
+                    for start in range(0, len(pdf), batch_size):
+                        yield pdf.iloc[start:start + batch_size]
+
+                for i, batch in enumerate(df_batches(df, batch_size)):
+                    part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
+
+                    with part_path.open(mode="wb", clone=False) as f:
+                        f.write_pandas(
+                            batch,
+                            file_format=file_format,
+                            batch_size=batch_size,
+                            **kwargs
+                        )
+            else:
+                with connected.open(mode="wb", clone=False) as f:
+                    f.write_pandas(
+                        df,
+                        file_format=file_format,
+                        batch_size=batch_size,
+                        **kwargs
+                    )
+
+        return self
 
     def read_polars(
         self,
@@ -1268,8 +1319,6 @@ class DatabricksPath:
         Returns:
             A polars DataFrame or list of DataFrames if concat=False.
         """
-        import polars as pl
-
         if self.is_file():
             with self.open("rb") as f:
                 return f.read_polars(batch_size=batch_size, **kwargs)
@@ -1282,10 +1331,10 @@ class DatabricksPath:
                         dfs.append(f.read_polars(batch_size=batch_size, **kwargs))
 
             if not dfs:
-                return pl.DataFrame()
+                return polars.DataFrame()
 
             if concat:
-                return pl.concat(dfs, how=how, rechunk=rechunk)
+                return polars.concat(dfs, how=how, rechunk=rechunk)
             return dfs  # type: ignore[return-value]
 
         raise FileNotFoundError(f"Path does not exist: {self}")
@@ -1316,12 +1365,10 @@ class DatabricksPath:
         Notes:
             - If `df` is a LazyFrame, we collect it first (optionally streaming).
         """
-        import polars as pl
-
-        if isinstance(df, pl.LazyFrame):
+        if isinstance(df, polars.LazyFrame):
             df = df.collect()
 
-        if not isinstance(df, pl.DataFrame):
+        if not isinstance(df, polars.DataFrame):
             raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
 
         with self.connect() as connected:
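
On the DatabricksPath side, the diff adds a `write_bytes` helper, threads `file_format` through `read_pandas`/`write_pandas`, and makes `write_pandas` write one Parquet part file per batch when the destination behaves as a directory sink. A hedged sketch, assuming `path` and `table_path` are `DatabricksPath` instances (the latter pointing at a single Parquet file) and `pdf` is a pandas DataFrame:

```python
# write_bytes accepts raw bytes or a file-like object exposing .read().
path.write_bytes(b"raw payload")

# read_pandas/write_pandas take the same optional file_format override as the
# DatabricksIO methods; without it the format is inferred from the path.
table_path.write_pandas(pdf)
df = table_path.read_pandas(concat=True)
```
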
yggdrasil/libs/pandaslib.py CHANGED
@@ -3,9 +3,14 @@
 try:
     import pandas # type: ignore
     pandas = pandas
+
+    PandasDataFrame = pandas.DataFrame
 except ImportError:
     pandas = None
 
+    class PandasDataFrame:
+        pass
+
 
 def require_pandas():
     """Ensure pandas is available before using pandas helpers.
@@ -23,4 +28,5 @@ def require_pandas():
 __all__ = [
     "pandas",
     "require_pandas",
+    "PandasDataFrame"
 ]
yggdrasil/libs/polarslib.py CHANGED
@@ -4,13 +4,18 @@ try:
     import polars # type: ignore
 
     polars = polars
+
+    PolarsDataFrame = polars.DataFrame
 except ImportError:
     polars = None
 
+    class PolarsDataFrame:
+        pass
 
 __all__ = [
     "polars",
     "require_polars",
+    "PolarsDataFrame"
 ]
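
pandaslib.py and polarslib.py now export `PandasDataFrame` and `PolarsDataFrame`: aliases for the real DataFrame classes when the optional dependency is installed, and empty placeholder classes otherwise, so the `isinstance` checks in io.py and path.py degrade gracefully instead of failing at import time. A small sketch of the intended behaviour:

```python
from yggdrasil.libs.pandaslib import pandas, PandasDataFrame
from yggdrasil.libs.polarslib import polars, PolarsDataFrame

# With the dependency installed the alias is the real class; without it,
# isinstance(obj, PandasDataFrame) is simply False rather than an ImportError.
if pandas is not None:
    assert PandasDataFrame is pandas.DataFrame
if polars is not None:
    assert PolarsDataFrame is polars.DataFrame
```
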
yggdrasil/types/cast/polars_cast.py CHANGED
@@ -15,6 +15,7 @@ from ..python_defaults import default_arrow_scalar
 from ...libs.polarslib import polars
 
 __all__ = [
+    "polars_converter",
     "cast_polars_array",
     "cast_polars_dataframe",
     "arrow_type_to_polars_type",

yggdrasil/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.49"
+__version__ = "0.1.51"