deltacat-2.0.0b7-py3-none-any.whl → deltacat-2.0.0b10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 import pytest

-from deltacat import Dataset
+from deltacat.storage.rivulet.dataset import Dataset
 from deltacat.storage.rivulet.fs.file_provider import FileProvider
 from deltacat.storage.rivulet.fs.file_store import FileStore
 from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
@@ -1,29 +1,39 @@
 import shutil
 import tempfile
+
 import deltacat as dc
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat import Namespace, DeltaCatUrl, DatasetType
+from deltacat.storage import Metafile
+
+from deltacat.io import (
+    METAFILE_TYPE_COLUMN_NAME,
+    METAFILE_DATA_COLUMN_NAME,
+)


 class TestDeltaCAT:
     @classmethod
-    def setup_class(cls):
+    def setup_method(cls):
         cls.temp_dir_1 = tempfile.mkdtemp()
         cls.temp_dir_2 = tempfile.mkdtemp()
         # Initialize DeltaCAT with two local catalogs.
-        dc.put("test_catalog_1", root=cls.temp_dir_1)
-        dc.put("test_catalog_2", root=cls.temp_dir_2)
+        dc.init()
+        dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
+        dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)

     @classmethod
-    def teardown_class(cls):
+    def teardown_method(cls):
         shutil.rmtree(cls.temp_dir_1)
         shutil.rmtree(cls.temp_dir_2)

     def test_cross_catalog_namespace_copy(self):
         # Given two empty DeltaCAT catalogs.
         # When a namespace is copied across catalogs.
-        namespace_src = dc.put("test_catalog_1/test_namespace")
+        namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         namespace_dst = dc.copy(
-            "test_catalog_1/test_namespace",
-            "test_catalog_2",
+            DeltaCatUrl("dc://test_catalog_1/test_namespace"),
+            DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
         # method to be equivalent and equal to the source namespace.
@@ -33,7 +43,38 @@ class TestDeltaCAT:
         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
         # (due to different metafile IDs).
-        actual_namespace_src = dc.get("test_catalog_1/test_namespace")
-        actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
+        actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
+        actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
         assert not actual_namespace_src == actual_namespace_dst
+
+    def test_catalog_listing_shallow_local_metafiles(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        assert any(
+            namespace_src.equivalent_to(other)
+            for other in dc.list(DeltaCatUrl("dc://test_catalog_1"))
+        )
+
+    def test_catalog_listing_shallow_ray_dataset(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        dataset = dc.list(
+            DeltaCatUrl("dc://test_catalog_1"),
+            dataset_type=DatasetType.RAY_DATASET,
+        )
+        actual_namespace = Metafile.deserialize(
+            serialized=dataset.take(1)[0][METAFILE_DATA_COLUMN_NAME],
+            meta_format=METAFILE_FORMAT_MSGPACK,
+        )
+        assert actual_namespace.equivalent_to(namespace_src)
+        namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
+        assert namespace_type == "Namespace"
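The rewritten tests above also serve as a reference for the new URL-addressed catalog API (dc.init, DeltaCatUrl, and URL-based put/get/copy/list). A minimal sketch of the same calls outside of pytest, reusing only functions shown in the hunk; the catalog and namespace names are illustrative:

import tempfile

import deltacat as dc
from deltacat import DeltaCatUrl

# Register a local catalog rooted at a temporary directory.
dc.init()
dc.put(DeltaCatUrl("dc://demo_catalog"), root=tempfile.mkdtemp())

# Create a namespace, fetch it back, and list the catalog's children.
namespace = dc.put(DeltaCatUrl("dc://demo_catalog/demo_namespace"))
fetched = dc.get(DeltaCatUrl("dc://demo_catalog/demo_namespace"))
assert fetched.equivalent_to(namespace)
for metafile in dc.list(DeltaCatUrl("dc://demo_catalog")):
    print(metafile)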
deltacat/types/media.py CHANGED
@@ -1,30 +1,48 @@
 from enum import Enum
-from typing import Dict, Set
+from typing import Set


 class ContentType(str, Enum):
-    # See also:
-    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
-    # https://www.iana.org/assignments/media-types/media-types.xhtml
+    """
+    Enumeration used to resolve the entity-body Media Type (formerly known as
+    MIME type) in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
+
+    https://www.iana.org/assignments/media-types/media-types.xhtml
+    """

     # IANA registered types
+    AVRO = "application/avro"
+    BINARY = "application/octet-stream"
     CSV = "text/csv"
+    HDF = "application/x-hdf"
+    HTML = "text/html"
     JSON = "application/json"
+    TEXT = "text/plain"
+    WEBDATASET = "application/x-web-dataset"
+    XML = "text/xml"

     # unregistered types
-    TSV = "text/tsv"
-    PSV = "text/psv"
-    PARQUET = "application/parquet"
-    ORC = "application/orc"
     FEATHER = "application/feather"
-    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
     ION = "application/x-amzn-ion"
+    ORC = "application/orc"
+    PARQUET = "application/parquet"
+    PSV = "text/psv"
+    TSV = "text/tsv"
+    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"


 class ContentEncoding(str, Enum):
-    # See also:
-    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
-    # http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """
+    Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
+    to indicate that additional encodings have been applied to the entity-body
+    Media Type in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
+
+    http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """

     # IANA registered encodings
     GZIP = "gzip"
@@ -37,27 +55,6 @@ class ContentEncoding(str, Enum):
     SNAPPY = "snappy"


-class TableType(str, Enum):
-    PYARROW = "pyarrow"
-    PANDAS = "pandas"
-    NUMPY = "numpy"
-    PYARROW_PARQUET = "pyarrow_parquet"
-
-
-class DistributedDatasetType(str, Enum):
-    DAFT = "daft"
-    RAY_DATASET = "ray_dataset"
-
-
-class SchemaType(str, Enum):
-    ARROW = "arrow"
-
-
-class StorageType(str, Enum):
-    LOCAL = "local"
-    DISTRIBUTED = "distributed"
-
-
 DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
@@ -73,6 +70,7 @@ TABULAR_CONTENT_TYPES: Set[str] = {
     ContentType.PARQUET.value,
     ContentType.ORC.value,
     ContentType.FEATHER.value,
+    ContentType.AVRO.value,
 }

 EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
@@ -83,13 +81,113 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.JSON.value,
 }

-CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
-    ContentType.UNESCAPED_TSV.value: "unescaped_tsv",
-    ContentType.TSV.value: "csv",
-    ContentType.CSV.value: "csv",
-    ContentType.PSV.value: "csv",
-    ContentType.PARQUET.value: "parquet",
-    ContentType.FEATHER.value: "feather",
-    ContentType.ORC.value: "orc",
-    ContentType.JSON.value: "json",
-}
+
+class DatasetType(str, Enum):
+    """
+    Enumeration used to identify the in-memory local or distributed dataset
+    to be used for file IO, queries, and data transformation. Typically used
+    together with :class:`deltacat.types.media.DatastoreType` to resolve the
+    compute layer that will be responsible for reading, transforming, and
+    writing data to a given datastore.
+    """
+
+    # local
+    NUMPY = "numpy"  # numpy.ndarray
+    PANDAS = "pandas"  # pandas.DataFrame
+    POLARS = "polars"  # polars.DataFrame
+    PYARROW = "pyarrow"  # pyarrow.Table
+    PYARROW_PARQUET = "pyarrow_parquet"  # pyarrow.parquet.ParquetFile
+
+    # distributed
+    DAFT = "daft"  # daft.DataFrame
+    RAY_DATASET = "ray_dataset"  # ray.data.Dataset
+
+    @staticmethod
+    def distributed():
+        return {
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        }
+
+    @staticmethod
+    def local():
+        return {
+            DatasetType.NUMPY,
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.PYARROW,
+            DatasetType.PYARROW_PARQUET,
+        }
+
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+TableType = Enum(
+    "TableType",
+    {d.name: d.value for d in DatasetType.local()},
+)
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+DistributedDatasetType = Enum(
+    "DistributedDatasetType",
+    {d.name: d.value for d in DatasetType.distributed()},
+)
+
+
+# deprecated by DatasetType.local() and DatasetType.distributed()
+# kept for backwards compatibility
+class StorageType(str, Enum):
+    LOCAL = "local"
+    DISTRIBUTED = "distributed"
+
+
+class DatastoreType(str, Enum):
+    """
+    Enumeration used to identify the type of reader required to connect to and
+    correctly interpret data stored at a given path. Typically used together
+    with :class:`deltacat.types.media.DatasetType` to resolve a reader or
+    writer for that data store. Note that, although some overlap exists between
+    enum values here and in :class:`deltacat.types.media.ContentType`, each
+    enum serves a different purpose. The purpose of
+    :class:`deltacat.types.media.ContentType` is to resolve the MIME type for
+    specific types of files, and may be used together with multi-content-type
+    datastore types to describe the specific file types read/written to that
+    datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
+    """
+
+    # DeltaCAT Catalog Datasets
+    DELTACAT = "dc"
+    DELTACAT_NAMESPACE = "namespace"
+    DELTACAT_TABLE = "table"
+    DELTACAT_TABLE_VERSION = "tableversion"
+    DELTACAT_STREAM = "stream"
+    DELTACAT_PARTITION = "partition"
+    DELTACAT_DELTA = "delta"
+
+    # External Datasets
+    AUDIO = "audio"
+    AVRO = "avro"
+    BIGQUERY = "bigquery"
+    BINARY = "binary"
+    CSV = "csv"
+    CLICKHOUSE = "clickhouse"
+    DATABRICKS_TABLES = "databricks"
+    DELTA_LAKE = "deltalake"
+    DELTA_SHARING = "deltasharing"
+    FEATHER = "feather"
+    HDF = "hdf"
+    HTML = "html"
+    HUDI = "hudi"
+    ICEBERG = "iceberg"
+    IMAGES = "images"
+    JSON = "json"
+    LANCE = "lance"
+    MONGO = "mongodb"
+    NUMPY = "numpy"
+    ORC = "orc"
+    PARQUET = "parquet"
+    TEXT = "text"
+    TFRECORDS = "tfrecords"
+    VIDEOS = "videos"
+    WARC = "warc"
+    WEBDATASET = "webdataset"
+    XML = "xml"
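The reworked media.py above removes the old TableType, DistributedDatasetType, SchemaType, and StorageType definitions, introduces DatasetType and DatastoreType, and rebuilds TableType, DistributedDatasetType, and StorageType for backwards compatibility. A minimal sketch of how the new and legacy names relate, assuming only the exports shown in the hunk:

from deltacat.types.media import (
    DatasetType,
    DatastoreType,
    TableType,  # deprecated alias, rebuilt from DatasetType.local()
)

# DatasetType distinguishes local from distributed in-memory datasets.
assert DatasetType.POLARS in DatasetType.local()
assert DatasetType.RAY_DATASET in DatasetType.distributed()

# Value-based comparisons against the deprecated enum keep working.
assert TableType.PYARROW.value == DatasetType.PYARROW.value == "pyarrow"

# DatastoreType names where the data lives, not how it is held in memory.
assert DatastoreType.ICEBERG.value == "iceberg"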
deltacat/types/tables.py CHANGED
@@ -3,9 +3,10 @@ from typing import Callable, Dict, Type, Union

 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pyarrow.parquet as papq
-from ray.data.dataset import Dataset
+from ray.data.dataset import Dataset as RayDataset
 from ray.data.read_api import (
     from_arrow,
     from_arrow_refs,
@@ -18,11 +19,12 @@ import deltacat.storage as dcs
 from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
+from deltacat.utils import polars as pl_utils
 from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils

-TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+TABLE_TYPE_TO_S3_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
@@ -34,8 +36,9 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
+    pl.DataFrame: pl_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
-    Dataset: ds_utils.dataset_to_file,
+    RayDataset: ds_utils.dataset_to_file,
 }

 TABLE_CLASS_TO_SLICER_FUNC: Dict[
@@ -43,8 +46,9 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
+    pl.DataFrame: pl_utils.slice_table,
     np.ndarray: np_utils.slice_ndarray,
-    Dataset: ds_utils.slice_dataset,
+    RayDataset: ds_utils.slice_dataset,
 }

 TABLE_CLASS_TO_SIZE_FUNC: Dict[
@@ -53,13 +57,27 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     pa.Table: pa_utils.table_size,
     papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
+    pl.DataFrame: pl_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
-    Dataset: ds_utils.dataset_size,
+    RayDataset: ds_utils.dataset_size,
+}
+
+TABLE_CLASS_TO_PYARROW_FUNC: Dict[
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, **kwargs: table,
+    papq.ParquetFile: lambda table, **kwargs: table.read(**kwargs),
+    pd.DataFrame: lambda table, **kwargs: pa.Table.from_pandas(table, **kwargs),
+    pl.DataFrame: lambda table, **kwargs: pl.DataFrame.to_arrow(table, **kwargs),
+    np.ndarray: lambda table, **kwargs: pa.Table.from_arrays(
+        [pa.array(table[:, i]) for i in range(table.shape[1])]
+    ),
 }

 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
     papq.ParquetFile: TableType.PYARROW_PARQUET.value,
+    pl.DataFrame: TableType.POLARS.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
@@ -78,7 +96,6 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PANDAS.value: from_pandas_refs,
 }

-
 DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
 }
@@ -106,7 +123,18 @@ class TableWriteMode(str, Enum):


 def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
-    return len(table) if not isinstance(table, Dataset) else table.count()
+    return len(table) if not isinstance(table, RayDataset) else table.count()
+
+
+def get_table_size(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
+    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+    if table_size_func is None:
+        msg = (
+            f"No size function found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SIZE_FUNC.keys}"
+        )
+        raise ValueError(msg)
+    return table_size_func(table)


 def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
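deltacat/types/tables.py gains Polars support, a TABLE_CLASS_TO_PYARROW_FUNC conversion map, and a get_table_size helper that dispatches on the concrete table class. A minimal usage sketch, assuming the names added above are importable from deltacat.types.tables:

import pandas as pd
import pyarrow as pa

from deltacat.types.tables import TABLE_CLASS_TO_PYARROW_FUNC, get_table_size

df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Dispatch on the concrete table class to get an Arrow view of the data.
to_arrow = TABLE_CLASS_TO_PYARROW_FUNC[type(df)]
arrow_table = to_arrow(df)
assert isinstance(arrow_table, pa.Table)

# get_table_size raises ValueError for unregistered table types; here it
# routes to the pandas size function registered in TABLE_CLASS_TO_SIZE_FUNC.
print(get_table_size(df))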
deltacat/utils/daft.py CHANGED
@@ -2,8 +2,8 @@ import logging
 from typing import Optional, List, Any, Dict, Callable
 import daft
 import ray
-from daft.recordbatch import read_parquet_into_pyarrow
 from daft import TimeUnit, DataFrame
+from daft.recordbatch import read_parquet_into_pyarrow
 from daft.io import IOConfig, S3Config
 import pyarrow as pa

@@ -51,7 +51,7 @@ def s3_files_to_dataframe(
     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"

     if not ray.is_initialized():
-        ray.init(address="auto", ignore_reinit_error=True, **ray_init_options)
+        ray.init(ignore_reinit_error=True, **ray_init_options)

     daft.context.set_runner_ray(noop_if_initialized=True)
@@ -2,12 +2,12 @@ from __future__ import annotations

 import re
 from typing import Optional, Tuple, Union, List
+from datetime import timedelta

 import sys
 import urllib
 import pathlib

-import pyarrow
 import pyarrow as pa
 from pyarrow.fs import (
     _resolve_filesystem_and_path,
@@ -17,6 +17,7 @@ from pyarrow.fs import (
     FileSystem,
     FSSpecHandler,
     PyFileSystem,
+    GcsFileSystem,
 )

 _LOCAL_SCHEME = "local"
@@ -24,8 +25,8 @@ _LOCAL_SCHEME = "local"

 def resolve_paths_and_filesystem(
     paths: Union[str, List[str]],
-    filesystem: pyarrow.fs.FileSystem = None,
-) -> Tuple[List[str], pyarrow.fs.FileSystem]:
+    filesystem: FileSystem = None,
+) -> Tuple[List[str], FileSystem]:
     """
     Resolves and normalizes all provided paths, infers a filesystem from the
     paths or validates the provided filesystem against the paths and ensures
@@ -113,19 +114,26 @@ def resolve_paths_and_filesystem(
            else:
                raise
        if filesystem is None:
-            filesystem = resolved_filesystem
+            if isinstance(resolved_filesystem, GcsFileSystem):
+                # Configure a retry time limit for GcsFileSystem so that it
+                # doesn't hang forever trying to get file info (e.g., when
+                # trying to get a public file w/o anonymous=True).
+                filesystem = GcsFileSystem(
+                    retry_time_limit=timedelta(seconds=60),
+                )
+            else:
+                filesystem = resolved_filesystem
        elif need_unwrap_path_protocol:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)
-
    return resolved_paths, filesystem


 def resolve_path_and_filesystem(
     path: str,
-    filesystem: Optional[pyarrow.fs.FileSystem] = None,
-) -> Tuple[str, pyarrow.fs.FileSystem]:
+    filesystem: Optional[FileSystem] = None,
+) -> Tuple[str, FileSystem]:
     """
     Resolves and normalizes the provided path, infers a filesystem from the
     path or validates the provided filesystem against the path.
@@ -148,7 +156,7 @@ def resolve_path_and_filesystem(

 def list_directory(
     path: str,
-    filesystem: pyarrow.fs.FileSystem,
+    filesystem: FileSystem,
     exclude_prefixes: Optional[List[str]] = None,
     ignore_missing_path: bool = False,
     recursive: bool = False,
@@ -199,7 +207,7 @@ def list_directory(

 def get_file_info(
     path: str,
-    filesystem: pyarrow.fs.FileSystem,
+    filesystem: FileSystem,
     ignore_missing_path: bool = False,
 ) -> FileInfo:
     """Get the file info for the provided path."""
@@ -227,6 +235,9 @@ def _handle_read_os_error(
        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
        r"body\.(.*))$"
    )
+    gcp_error_pattern = (
+        r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
+    )
    if re.match(aws_error_pattern, str(error)):
        # Specially handle AWS error when reading files, to give a clearer error
        # message to avoid confusing users. The real issue is most likely that the AWS
@@ -243,9 +254,28 @@ def _handle_read_os_error(
                "You can also run AWS CLI command to get more detailed error message "
                "(e.g., aws s3 ls <file-name>). "
                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
                "for more information."
            )
        )
+    elif re.match(gcp_error_pattern, str(error)):
+        # Special handling for GCP errors (e.g., handling the special case of
+        # requiring the filesystem to be instantiated with anonymous access to
+        # read public files).
+        if isinstance(paths, str):
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read GCP GS file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "If this is a public file, please instantiate a filesystem with "
+                "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
+                "to read it. See https://google.aip.dev/auth/4110 and "
+                "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html"  # noqa
+                "for more information."
+            )
+        )
+
    else:
        raise error

@@ -0,0 +1,128 @@
+import logging
+from typing import Optional, List, Dict, Callable, Union
+
+import polars as pl
+
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+
+from deltacat import logs
+
+from deltacat.types.media import ContentType
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def write_json(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_ndjson(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ndjson(f, **write_kwargs)
+
+
+def write_csv(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_csv(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_csv(f, **write_kwargs)
+
+
+def write_avro(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_avro(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
+
+
+def write_parquet(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_parquet(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
+
+
+CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+    # TODO (pdames): add support for other delimited text content types as
+    # pyarrow adds support for custom delimiters, escaping, and None value
+    # representations to pyarrow.csv.WriteOptions.
+    ContentType.AVRO.value: write_avro,
+    ContentType.CSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
+}
+
+
+def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+    """
+    Iteratively create 0-copy table slices.
+    """
+    if max_len is None:
+        return [table]
+    tables = []
+    offset = 0
+    records_remaining = len(table)
+    while records_remaining > 0:
+        records_this_entry = min(max_len, records_remaining)
+        tables.append(table.slice(offset, records_this_entry))
+        records_remaining -= records_this_entry
+        offset += records_this_entry
+    return tables
+
+
+def dataframe_size(table: pl.DataFrame) -> int:
+    return table.estimated_size()
+
+
+def dataframe_to_file(
+    table: pl.DataFrame,
+    base_path: str,
+    file_system: Optional[AbstractFileSystem],
+    block_path_provider: Union[Callable, FilenameProvider],
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Writes the given Pyarrow Table to a file.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    if not writer:
+        raise NotImplementedError(
+            f"Pyarrow writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys}"
+        )
+    path = block_path_provider(base_path)
+    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+    writer(table, path, filesystem=file_system, **kwargs)
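The new polars utility module above mirrors the existing pandas and pyarrow helpers. A short usage sketch of its slicing and writing functions, assuming the module is importable as deltacat.utils.polars; the output path is illustrative:

import polars as pl

from deltacat.utils.polars import slice_table, write_parquet

df = pl.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]})

# Split into zero-copy slices of at most two rows each.
chunks = slice_table(df, max_len=2)
assert [len(c) for c in chunks] == [2, 2, 1]

# Write one slice to a local Parquet file; passing an fsspec filesystem
# instead routes the write through filesystem.open().
write_parquet(chunks[0], "/tmp/deltacat_polars_example.parquet")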