deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 import pytest
 
-from deltacat import Dataset
+from deltacat.storage.rivulet.dataset import Dataset
 from deltacat.storage.rivulet.fs.file_provider import FileProvider
 from deltacat.storage.rivulet.fs.file_store import FileStore
 from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
deltacat/tests/test_deltacat_api.py
CHANGED
@@ -1,29 +1,39 @@
 import shutil
 import tempfile
+
 import deltacat as dc
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat import Namespace, DeltaCatUrl, DatasetType
+from deltacat.storage import Metafile
+
+from deltacat.io import (
+    METAFILE_TYPE_COLUMN_NAME,
+    METAFILE_DATA_COLUMN_NAME,
+)
 
 
 class TestDeltaCAT:
     @classmethod
-    def
+    def setup_method(cls):
         cls.temp_dir_1 = tempfile.mkdtemp()
         cls.temp_dir_2 = tempfile.mkdtemp()
         # Initialize DeltaCAT with two local catalogs.
-        dc.
-        dc.put("
+        dc.init()
+        dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
+        dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)
 
     @classmethod
-    def
+    def teardown_method(cls):
         shutil.rmtree(cls.temp_dir_1)
         shutil.rmtree(cls.temp_dir_2)
 
     def test_cross_catalog_namespace_copy(self):
         # Given two empty DeltaCAT catalogs.
         # When a namespace is copied across catalogs.
-        namespace_src = dc.put("test_catalog_1/test_namespace")
+        namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         namespace_dst = dc.copy(
-            "test_catalog_1/test_namespace",
-            "test_catalog_2",
+            DeltaCatUrl("dc://test_catalog_1/test_namespace"),
+            DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
         # method to be equivalent and equal to the source namespace.
@@ -33,7 +43,38 @@ class TestDeltaCAT:
         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
         # (due to different metafile IDs).
-        actual_namespace_src = dc.get("test_catalog_1/test_namespace")
-        actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
+        actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
+        actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
         assert not actual_namespace_src == actual_namespace_dst
+
+    def test_catalog_listing_shallow_local_metafiles(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        assert any(
+            namespace_src.equivalent_to(other)
+            for other in dc.list(DeltaCatUrl("dc://test_catalog_1"))
+        )
+
+    def test_catalog_listing_shallow_ray_dataset(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        dataset = dc.list(
+            DeltaCatUrl("dc://test_catalog_1"),
+            dataset_type=DatasetType.RAY_DATASET,
+        )
+        actual_namespace = Metafile.deserialize(
+            serialized=dataset.take(1)[0][METAFILE_DATA_COLUMN_NAME],
+            meta_format=METAFILE_FORMAT_MSGPACK,
+        )
+        assert actual_namespace.equivalent_to(namespace_src)
+        namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
+        assert namespace_type == "Namespace"
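The hunk above shows the main API shift in this release: `dc.put`, `dc.get`, `dc.copy`, and `dc.list` now take `DeltaCatUrl` objects (`dc://...`) instead of bare path strings. A minimal sketch of the new calling convention, grounded only in what the test exercises; the catalog and namespace names are illustrative:

```python
import tempfile

import deltacat as dc
from deltacat import DeltaCatUrl

# Register a local catalog rooted at a temporary directory, then address
# catalog objects by dc:// URL rather than by "catalog/namespace" strings.
dc.init()
dc.put(DeltaCatUrl("dc://my_catalog"), root=tempfile.mkdtemp())
namespace = dc.put(DeltaCatUrl("dc://my_catalog/my_namespace"))

# Fetch it back; equivalent_to() compares content while ignoring metafile IDs.
fetched = dc.get(DeltaCatUrl("dc://my_catalog/my_namespace"))
assert fetched.equivalent_to(namespace)
```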
deltacat/types/media.py
CHANGED
@@ -1,30 +1,48 @@
 from enum import Enum
-from typing import
+from typing import Set
 
 
 class ContentType(str, Enum):
-
-
-
+    """
+    Enumeration used to resolve the entity-body Media Type (formerly known as
+    MIME type) in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
+
+    https://www.iana.org/assignments/media-types/media-types.xhtml
+    """
 
     # IANA registered types
+    AVRO = "application/avro"
+    BINARY = "application/octet-stream"
     CSV = "text/csv"
+    HDF = "application/x-hdf"
+    HTML = "text/html"
     JSON = "application/json"
+    TEXT = "text/plain"
+    WEBDATASET = "application/x-web-dataset"
+    XML = "text/xml"
 
     # unregistered types
-    TSV = "text/tsv"
-    PSV = "text/psv"
-    PARQUET = "application/parquet"
-    ORC = "application/orc"
     FEATHER = "application/feather"
-    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
     ION = "application/x-amzn-ion"
+    ORC = "application/orc"
+    PARQUET = "application/parquet"
+    PSV = "text/psv"
+    TSV = "text/tsv"
+    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
 
 
 class ContentEncoding(str, Enum):
-
-
-
+    """
+    Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
+    to indicate that additional encodings have been applied to the entity-body
+    Media Type in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
+
+    http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """
 
     # IANA registered encodings
     GZIP = "gzip"
@@ -37,27 +55,6 @@ class ContentEncoding(str, Enum):
     SNAPPY = "snappy"
 
 
-class TableType(str, Enum):
-    PYARROW = "pyarrow"
-    PANDAS = "pandas"
-    NUMPY = "numpy"
-    PYARROW_PARQUET = "pyarrow_parquet"
-
-
-class DistributedDatasetType(str, Enum):
-    DAFT = "daft"
-    RAY_DATASET = "ray_dataset"
-
-
-class SchemaType(str, Enum):
-    ARROW = "arrow"
-
-
-class StorageType(str, Enum):
-    LOCAL = "local"
-    DISTRIBUTED = "distributed"
-
-
 DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
@@ -73,6 +70,7 @@ TABULAR_CONTENT_TYPES: Set[str] = {
     ContentType.PARQUET.value,
     ContentType.ORC.value,
     ContentType.FEATHER.value,
+    ContentType.AVRO.value,
 }
 
 EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
@@ -83,13 +81,113 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.JSON.value,
 }
 
-
-
-
-
-
-
-
-
-
-
+
+class DatasetType(str, Enum):
+    """
+    Enumeration used to identify the in-memory local or distributed dataset
+    to be used for file IO, queries, and data transformation. Typically used
+    together with :class:`deltacat.types.media.DatastoreType` to resolve the
+    compute layer that will be responsible for reading, transforming, and
+    writing data to a given datastore.
+    """
+
+    # local
+    NUMPY = "numpy"  # numpy.ndarray
+    PANDAS = "pandas"  # pandas.DataFrame
+    POLARS = "polars"  # polars.DataFrame
+    PYARROW = "pyarrow"  # pyarrow.Table
+    PYARROW_PARQUET = "pyarrow_parquet"  # pyarrow.parquet.ParquetFile
+
+    # distributed
+    DAFT = "daft"  # daft.DataFrame
+    RAY_DATASET = "ray_dataset"  # ray.data.Dataset
+
+    @staticmethod
+    def distributed():
+        return {
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        }
+
+    @staticmethod
+    def local():
+        return {
+            DatasetType.NUMPY,
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.PYARROW,
+            DatasetType.PYARROW_PARQUET,
+        }
+
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+TableType = Enum(
+    "TableType",
+    {d.name: d.value for d in DatasetType.local()},
+)
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+DistributedDatasetType = Enum(
+    "DistributedDatasetType",
+    {d.name: d.value for d in DatasetType.distributed()},
+)
+
+
+# deprecated by DatasetType.local() and DatasetType.distributed()
+# kept for backwards compatibility
+class StorageType(str, Enum):
+    LOCAL = "local"
+    DISTRIBUTED = "distributed"
+
+
+class DatastoreType(str, Enum):
+    """
+    Enumeration used to identify the type of reader required to connect to and
+    correctly interpret data stored at a given path. Typically used together
+    with :class:`deltacat.types.media.DatasetType` to resolve a reader or
+    writer for that data store. Note that, although some overlap exists between
+    enum values here and in :class:`deltacat.types.media.ContentType`, each
+    enum serve a different purpose. The purpose of
+    :class:`deltacat.types.media.ContentType` is to resolve the MIME type for
+    specific types of files, and may be used together with multi-content-type
+    datastore types to describe the specific file types read/written to that
+    datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
+    """
+
+    # DeltaCAT Catalog Datasets
+    DELTACAT = "dc"
+    DELTACAT_NAMESPACE = "namespace"
+    DELTACAT_TABLE = "table"
+    DELTACAT_TABLE_VERSION = "tableversion"
+    DELTACAT_STREAM = "stream"
+    DELTACAT_PARTITION = "partition"
+    DELTACAT_DELTA = "delta"
+
+    # External Datasets
+    AUDIO = "audio"
+    AVRO = "avro"
+    BIGQUERY = "bigquery"
+    BINARY = "binary"
+    CSV = "csv"
+    CLICKHOUSE = "clickhouse"
+    DATABRICKS_TABLES = "databricks"
+    DELTA_LAKE = "deltalake"
+    DELTA_SHARING = "deltasharing"
+    FEATHER = "feather"
+    HDF = "hdf"
+    HTML = "html"
+    HUDI = "hudi"
+    ICEBERG = "iceberg"
+    IMAGES = "images"
+    JSON = "json"
+    LANCE = "lance"
+    MONGO = "mongodb"
+    NUMPY = "numpy"
+    ORC = "orc"
+    PARQUET = "parquet"
+    TEXT = "text"
+    TFRECORDS = "tfrecords"
+    VIDEOS = "videos"
+    WARC = "warc"
+    WEBDATASET = "webdataset"
+    XML = "xml"
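Since `TableType` and `DistributedDatasetType` are now generated from the new `DatasetType` rather than declared directly, here is a minimal sketch of how the legacy enums line up with it, grounded only in the definitions shown in the hunk above:

```python
from deltacat.types.media import (
    DatasetType,
    DistributedDatasetType,
    TableType,
)

# DatasetType partitions its members into local and distributed sets.
assert DatasetType.POLARS in DatasetType.local()
assert DatasetType.RAY_DATASET in DatasetType.distributed()

# The legacy enums are rebuilt from those sets, so value-based lookups written
# against deltacat <= 2.0.0b9 keep resolving to the same strings.
assert TableType.PYARROW.value == DatasetType.PYARROW.value == "pyarrow"
assert DistributedDatasetType.DAFT.value == DatasetType.DAFT.value == "daft"
```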
deltacat/types/tables.py
CHANGED
@@ -3,9 +3,10 @@ from typing import Callable, Dict, Type, Union
 
 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pyarrow.parquet as papq
-from ray.data.dataset import Dataset
+from ray.data.dataset import Dataset as RayDataset
 from ray.data.read_api import (
     from_arrow,
     from_arrow_refs,
@@ -18,11 +19,12 @@ import deltacat.storage as dcs
 from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
+from deltacat.utils import polars as pl_utils
 from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
-
+TABLE_TYPE_TO_S3_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
@@ -34,8 +36,9 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
+    pl.DataFrame: pl_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
-
+    RayDataset: ds_utils.dataset_to_file,
 }
 
 TABLE_CLASS_TO_SLICER_FUNC: Dict[
@@ -43,8 +46,9 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
+    pl.DataFrame: pl_utils.slice_table,
     np.ndarray: np_utils.slice_ndarray,
-
+    RayDataset: ds_utils.slice_dataset,
 }
 
 TABLE_CLASS_TO_SIZE_FUNC: Dict[
@@ -53,13 +57,27 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     pa.Table: pa_utils.table_size,
     papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
+    pl.DataFrame: pl_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
-
+    RayDataset: ds_utils.dataset_size,
+}
+
+TABLE_CLASS_TO_PYARROW_FUNC: Dict[
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, **kwargs: table,
+    papq.ParquetFile: lambda table, **kwargs: table.read(**kwargs),
+    pd.DataFrame: lambda table, **kwargs: pa.Table.from_pandas(table, **kwargs),
+    pl.DataFrame: lambda table, **kwargs: pl.DataFrame.to_arrow(table, **kwargs),
+    np.ndarray: lambda table, **kwargs: pa.Table.from_arrays(
+        [pa.array(table[:, i]) for i in range(table.shape[1])]
+    ),
 }
 
 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
     papq.ParquetFile: TableType.PYARROW_PARQUET.value,
+    pl.DataFrame: TableType.POLARS.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
@@ -78,7 +96,6 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PANDAS.value: from_pandas_refs,
 }
 
-
 DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
 }
@@ -106,7 +123,18 @@ class TableWriteMode(str, Enum):
 
 
 def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
-    return len(table) if not isinstance(table,
+    return len(table) if not isinstance(table, RayDataset) else table.count()
+
+
+def get_table_size(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
+    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+    if table_size_func is None:
+        msg = (
+            f"No size function found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SIZE_FUNC.keys}"
+        )
+        raise ValueError(msg)
+    return table_size_func(table)
 
 
 def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
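A short sketch of the new dispatch helpers, assuming only the mappings shown above: `get_table_size` routes through `TABLE_CLASS_TO_SIZE_FUNC`, and `TABLE_CLASS_TO_PYARROW_FUNC` normalizes a supported local table to a `pyarrow.Table`. The sample DataFrame is illustrative:

```python
import pandas as pd
import pyarrow as pa

from deltacat.types.tables import (
    TABLE_CLASS_TO_PYARROW_FUNC,
    get_table_length,
    get_table_size,
)

df = pd.DataFrame({"id": [1, 2, 3]})

# Both helpers dispatch on the concrete table class.
assert get_table_length(df) == 3
print(get_table_size(df))  # in-memory size in bytes, via pd_utils.dataframe_size

# Normalize a local table to Arrow before handing it to Arrow-only code paths.
to_arrow = TABLE_CLASS_TO_PYARROW_FUNC[type(df)]
assert isinstance(to_arrow(df), pa.Table)
```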
deltacat/utils/daft.py
CHANGED
@@ -2,8 +2,8 @@ import logging
 from typing import Optional, List, Any, Dict, Callable
 import daft
 import ray
-from daft.recordbatch import read_parquet_into_pyarrow
 from daft import TimeUnit, DataFrame
+from daft.recordbatch import read_parquet_into_pyarrow
 from daft.io import IOConfig, S3Config
 import pyarrow as pa
 
@@ -51,7 +51,7 @@ def s3_files_to_dataframe(
     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
 
     if not ray.is_initialized():
-        ray.init(
+        ray.init(ignore_reinit_error=True, **ray_init_options)
 
     daft.context.set_runner_ray(noop_if_initialized=True)
 
deltacat/utils/filesystem.py
CHANGED
@@ -2,12 +2,12 @@ from __future__ import annotations
 
 import re
 from typing import Optional, Tuple, Union, List
+from datetime import timedelta
 
 import sys
 import urllib
 import pathlib
 
-import pyarrow
 import pyarrow as pa
 from pyarrow.fs import (
     _resolve_filesystem_and_path,
@@ -17,6 +17,7 @@ from pyarrow.fs import (
     FileSystem,
     FSSpecHandler,
     PyFileSystem,
+    GcsFileSystem,
 )
 
 _LOCAL_SCHEME = "local"
@@ -24,8 +25,8 @@ _LOCAL_SCHEME = "local"
 
 def resolve_paths_and_filesystem(
     paths: Union[str, List[str]],
-    filesystem:
-) -> Tuple[List[str],
+    filesystem: FileSystem = None,
+) -> Tuple[List[str], FileSystem]:
     """
     Resolves and normalizes all provided paths, infers a filesystem from the
     paths or validates the provided filesystem against the paths and ensures
@@ -113,19 +114,26 @@ def resolve_paths_and_filesystem(
             else:
                 raise
         if filesystem is None:
-
+            if isinstance(resolved_filesystem, GcsFileSystem):
+                # Configure a retry time limit for GcsFileSystem so that it
+                # doesn't hang forever trying to get file info (e.g., when
+                # trying to get a public file w/o anonymous=True).
+                filesystem = GcsFileSystem(
+                    retry_time_limit=timedelta(seconds=60),
+                )
+            else:
+                filesystem = resolved_filesystem
         elif need_unwrap_path_protocol:
             resolved_path = _unwrap_protocol(resolved_path)
         resolved_path = filesystem.normalize_path(resolved_path)
         resolved_paths.append(resolved_path)
-
     return resolved_paths, filesystem
 
 
 def resolve_path_and_filesystem(
     path: str,
-    filesystem: Optional[
-) -> Tuple[str,
+    filesystem: Optional[FileSystem] = None,
+) -> Tuple[str, FileSystem]:
     """
     Resolves and normalizes the provided path, infers a filesystem from the
     path or validates the provided filesystem against the path.
@@ -148,7 +156,7 @@ def resolve_path_and_filesystem(
 
 def list_directory(
     path: str,
-    filesystem:
+    filesystem: FileSystem,
     exclude_prefixes: Optional[List[str]] = None,
     ignore_missing_path: bool = False,
     recursive: bool = False,
@@ -199,7 +207,7 @@ def list_directory(
 
 def get_file_info(
     path: str,
-    filesystem:
+    filesystem: FileSystem,
     ignore_missing_path: bool = False,
 ) -> FileInfo:
     """Get the file info for the provided path."""
@@ -227,6 +235,9 @@ def _handle_read_os_error(
         r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
         r"body\.(.*))$"
     )
+    gcp_error_pattern = (
+        r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
+    )
     if re.match(aws_error_pattern, str(error)):
         # Specially handle AWS error when reading files, to give a clearer error
         # message to avoid confusing users. The real issue is most likely that the AWS
@@ -243,9 +254,28 @@ def _handle_read_os_error(
                 "You can also run AWS CLI command to get more detailed error message "
                 "(e.g., aws s3 ls <file-name>). "
                 "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
                 "for more information."
             )
        )
+    elif re.match(gcp_error_pattern, str(error)):
+        # Special handling for GCP errors (e.g., handling the special case of
+        # requiring the filesystem to be instantiated with anonymous access to
+        # read public files).
+        if isinstance(paths, str):
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read GCP GS file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "If this is a public file, please instantiate a filesystem with "
+                "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
+                "to read it. See https://google.aip.dev/auth/4110 and "
+                "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html"  # noqa
+                "for more information."
+            )
+        )
+
     else:
         raise error
 
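The new GCS handling above only tunes the retry time limit when a `GcsFileSystem` is inferred; for public objects the added error message asks the caller to pass an anonymous filesystem explicitly. A minimal sketch of that pattern, assuming a hypothetical public bucket and object name:

```python
from datetime import timedelta

from pyarrow.fs import GcsFileSystem

from deltacat.utils.filesystem import resolve_path_and_filesystem

# Anonymous access for a public object; the bucket and object are placeholders.
fs = GcsFileSystem(anonymous=True, retry_time_limit=timedelta(seconds=60))
path, fs = resolve_path_and_filesystem(
    "gs://some-public-bucket/some-object.parquet",
    filesystem=fs,
)
with fs.open_input_file(path) as f:
    header = f.read(4)  # e.g., b"PAR1" for a Parquet file
```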
deltacat/utils/polars.py
ADDED
@@ -0,0 +1,128 @@
+import logging
+from typing import Optional, List, Dict, Callable, Union
+
+import polars as pl
+
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+
+from deltacat import logs
+
+from deltacat.types.media import ContentType
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def write_json(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_ndjson(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ndjson(f, **write_kwargs)
+
+
+def write_csv(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_csv(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_csv(f, **write_kwargs)
+
+
+def write_avro(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_avro(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
+
+
+def write_parquet(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_parquet(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
+
+
+CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+    # TODO (pdames): add support for other delimited text content types as
+    # pyarrow adds support for custom delimiters, escaping, and None value
+    # representations to pyarrow.csv.WriteOptions.
+    ContentType.AVRO.value: write_avro,
+    ContentType.CSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
+}
+
+
+def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+    """
+    Iteratively create 0-copy table slices.
+    """
+    if max_len is None:
+        return [table]
+    tables = []
+    offset = 0
+    records_remaining = len(table)
+    while records_remaining > 0:
+        records_this_entry = min(max_len, records_remaining)
+        tables.append(table.slice(offset, records_this_entry))
+        records_remaining -= records_this_entry
+        offset += records_this_entry
+    return tables
+
+
+def dataframe_size(table: pl.DataFrame) -> int:
+    return table.estimated_size()
+
+
+def dataframe_to_file(
+    table: pl.DataFrame,
+    base_path: str,
+    file_system: Optional[AbstractFileSystem],
+    block_path_provider: Union[Callable, FilenameProvider],
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Writes the given Pyarrow Table to a file.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    if not writer:
+        raise NotImplementedError(
+            f"Pyarrow writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys}"
+        )
+    path = block_path_provider(base_path)
+    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+    writer(table, path, filesystem=file_system, **kwargs)