deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/tests/test_deltacat_api.py
CHANGED
@@ -1,29 +1,39 @@
 import shutil
 import tempfile
+
 import deltacat as dc
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat import Namespace, DeltaCatUrl, DatasetType
+from deltacat.storage import Metafile
+
+from deltacat.io import (
+    METAFILE_TYPE_COLUMN_NAME,
+    METAFILE_DATA_COLUMN_NAME,
+)


 class TestDeltaCAT:
     @classmethod
-    def
+    def setup_method(cls):
         cls.temp_dir_1 = tempfile.mkdtemp()
         cls.temp_dir_2 = tempfile.mkdtemp()
         # Initialize DeltaCAT with two local catalogs.
-        dc.
-        dc.put("
+        dc.init()
+        dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
+        dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)

     @classmethod
-    def
+    def teardown_method(cls):
         shutil.rmtree(cls.temp_dir_1)
         shutil.rmtree(cls.temp_dir_2)

     def test_cross_catalog_namespace_copy(self):
         # Given two empty DeltaCAT catalogs.
         # When a namespace is copied across catalogs.
-        namespace_src = dc.put("test_catalog_1/test_namespace")
+        namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         namespace_dst = dc.copy(
-            "test_catalog_1/test_namespace",
-            "test_catalog_2",
+            DeltaCatUrl("dc://test_catalog_1/test_namespace"),
+            DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
         # method to be equivalent and equal to the source namespace.
@@ -33,7 +43,38 @@ class TestDeltaCAT:
         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
         # (due to different metafile IDs).
-        actual_namespace_src = dc.get("test_catalog_1/test_namespace")
-        actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
+        actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
+        actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
         assert not actual_namespace_src == actual_namespace_dst
+
+    def test_catalog_listing_shallow_local_metafiles(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        assert any(
+            namespace_src.equivalent_to(other)
+            for other in dc.list(DeltaCatUrl("dc://test_catalog_1"))
+        )
+
+    def test_catalog_listing_shallow_ray_dataset(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        dataset = dc.list(
+            DeltaCatUrl("dc://test_catalog_1"),
+            dataset_type=DatasetType.RAY_DATASET,
+        )
+        actual_namespace = Metafile.deserialize(
+            serialized=dataset.take(1)[0][METAFILE_DATA_COLUMN_NAME],
+            meta_format=METAFILE_FORMAT_MSGPACK,
+        )
+        assert actual_namespace.equivalent_to(namespace_src)
+        namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
+        assert namespace_type == "Namespace"
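The updated test above exercises the URL-addressed catalog API that appears in 2.0.0b11: plain path strings such as "test_catalog_1/test_namespace" are replaced by explicit DeltaCatUrl objects with a dc:// scheme. The sketch below strings those same calls together outside of pytest. It is assembled only from the calls visible in this diff; the catalog and namespace names are placeholders, and it is not official API documentation.

```python
import tempfile

import deltacat as dc
from deltacat import DeltaCatUrl, DatasetType

# Register two local catalogs rooted at temp directories (names are illustrative).
dc.init()
dc.put(DeltaCatUrl("dc://catalog_a"), root=tempfile.mkdtemp())
dc.put(DeltaCatUrl("dc://catalog_b"), root=tempfile.mkdtemp())

# Create a namespace in one catalog, then copy it to the other by URL.
namespace = dc.put(DeltaCatUrl("dc://catalog_a/analytics"))
dc.copy(
    DeltaCatUrl("dc://catalog_a/analytics"),
    DeltaCatUrl("dc://catalog_b/analytics"),
)

# Shallow-list catalog children as metafiles, or as a Ray dataset of
# serialized metafiles keyed by the deltacat.io column-name constants.
children = dc.list(DeltaCatUrl("dc://catalog_a"))
ray_ds = dc.list(DeltaCatUrl("dc://catalog_a"), dataset_type=DatasetType.RAY_DATASET)
assert any(namespace.equivalent_to(child) for child in children)
```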
deltacat/types/media.py
CHANGED
@@ -1,30 +1,48 @@
 from enum import Enum
-from typing import
+from typing import Set


 class ContentType(str, Enum):
-
-
-
+    """
+    Enumeration used to resolve the entity-body Media Type (formerly known as
+    MIME type) in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
+
+    https://www.iana.org/assignments/media-types/media-types.xhtml
+    """

     # IANA registered types
+    AVRO = "application/avro"
+    BINARY = "application/octet-stream"
     CSV = "text/csv"
+    HDF = "application/x-hdf"
+    HTML = "text/html"
     JSON = "application/json"
+    TEXT = "text/plain"
+    WEBDATASET = "application/x-web-dataset"
+    XML = "text/xml"

     # unregistered types
-    TSV = "text/tsv"
-    PSV = "text/psv"
-    PARQUET = "application/parquet"
-    ORC = "application/orc"
     FEATHER = "application/feather"
-    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
     ION = "application/x-amzn-ion"
+    ORC = "application/orc"
+    PARQUET = "application/parquet"
+    PSV = "text/psv"
+    TSV = "text/tsv"
+    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"


 class ContentEncoding(str, Enum):
-
-
-
+    """
+    Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
+    to indicate that additional encodings have been applied to the entity-body
+    Media Type in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
+
+    http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """

     # IANA registered encodings
     GZIP = "gzip"
@@ -37,27 +55,6 @@ class ContentEncoding(str, Enum):
     SNAPPY = "snappy"


-class TableType(str, Enum):
-    PYARROW = "pyarrow"
-    PANDAS = "pandas"
-    NUMPY = "numpy"
-    PYARROW_PARQUET = "pyarrow_parquet"
-
-
-class DistributedDatasetType(str, Enum):
-    DAFT = "daft"
-    RAY_DATASET = "ray_dataset"
-
-
-class SchemaType(str, Enum):
-    ARROW = "arrow"
-
-
-class StorageType(str, Enum):
-    LOCAL = "local"
-    DISTRIBUTED = "distributed"
-
-
 DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
@@ -73,6 +70,7 @@ TABULAR_CONTENT_TYPES: Set[str] = {
     ContentType.PARQUET.value,
     ContentType.ORC.value,
     ContentType.FEATHER.value,
+    ContentType.AVRO.value,
 }

 EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
@@ -83,13 +81,113 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.JSON.value,
 }

-
-
-
-
-
-
-
-
-
-
+
+class DatasetType(str, Enum):
+    """
+    Enumeration used to identify the in-memory local or distributed dataset
+    to be used for file IO, queries, and data transformation. Typically used
+    together with :class:`deltacat.types.media.DatastoreType` to resolve the
+    compute layer that will be responsible for reading, transforming, and
+    writing data to a given datastore.
+    """
+
+    # local
+    NUMPY = "numpy"  # numpy.ndarray
+    PANDAS = "pandas"  # pandas.DataFrame
+    POLARS = "polars"  # polars.DataFrame
+    PYARROW = "pyarrow"  # pyarrow.Table
+    PYARROW_PARQUET = "pyarrow_parquet"  # pyarrow.parquet.ParquetFile
+
+    # distributed
+    DAFT = "daft"  # daft.DataFrame
+    RAY_DATASET = "ray_dataset"  # ray.data.Dataset
+
+    @staticmethod
+    def distributed():
+        return {
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        }
+
+    @staticmethod
+    def local():
+        return {
+            DatasetType.NUMPY,
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.PYARROW,
+            DatasetType.PYARROW_PARQUET,
+        }
+
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+TableType = Enum(
+    "TableType",
+    {d.name: d.value for d in DatasetType.local()},
+)
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+DistributedDatasetType = Enum(
+    "DistributedDatasetType",
+    {d.name: d.value for d in DatasetType.distributed()},
+)
+
+
+# deprecated by DatasetType.local() and DatasetType.distributed()
+# kept for backwards compatibility
+class StorageType(str, Enum):
+    LOCAL = "local"
+    DISTRIBUTED = "distributed"
+
+
+class DatastoreType(str, Enum):
+    """
+    Enumeration used to identify the type of reader required to connect to and
+    correctly interpret data stored at a given path. Typically used together
+    with :class:`deltacat.types.media.DatasetType` to resolve a reader or
+    writer for that data store. Note that, although some overlap exists between
+    enum values here and in :class:`deltacat.types.media.ContentType`, each
+    enum serve a different purpose. The purpose of
+    :class:`deltacat.types.media.ContentType` is to resolve a file's MIME type,
+    and may be used together with datastores that support storing different
+    file types to describe the specific file type read/written from/to that
+    datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
+    """
+
+    # DeltaCAT Catalog Datasets
+    DELTACAT = "dc"
+    DELTACAT_NAMESPACE = "namespace"
+    DELTACAT_TABLE = "table"
+    DELTACAT_TABLE_VERSION = "tableversion"
+    DELTACAT_STREAM = "stream"
+    DELTACAT_PARTITION = "partition"
+    DELTACAT_DELTA = "delta"
+
+    # External Datasets
+    AUDIO = "audio"
+    AVRO = "avro"
+    BIGQUERY = "bigquery"
+    BINARY = "binary"
+    CSV = "csv"
+    CLICKHOUSE = "clickhouse"
+    DATABRICKS_TABLES = "databricks"
+    DELTA_LAKE = "deltalake"
+    DELTA_SHARING = "deltasharing"
+    FEATHER = "feather"
+    HDF = "hdf"
+    HTML = "html"
+    HUDI = "hudi"
+    ICEBERG = "iceberg"
+    IMAGES = "images"
+    JSON = "json"
+    LANCE = "lance"
+    MONGO = "mongodb"
+    NUMPY = "numpy"
+    ORC = "orc"
+    PARQUET = "parquet"
+    TEXT = "text"
+    TFRECORDS = "tfrecords"
+    VIDEOS = "videos"
+    WARC = "warc"
+    WEBDATASET = "webdataset"
+    XML = "xml"
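The deprecated TableType and DistributedDatasetType aliases above are rebuilt with the standard-library functional Enum API instead of being declared as classes, so their members always mirror DatasetType.local() and DatasetType.distributed(). Below is a minimal, self-contained sketch of that pattern using an abbreviated stand-in enum rather than the full one from media.py.

```python
from enum import Enum


class DatasetType(str, Enum):
    # Abbreviated stand-in for the DatasetType enum shown in the diff above.
    PANDAS = "pandas"
    PYARROW = "pyarrow"
    DAFT = "daft"
    RAY_DATASET = "ray_dataset"

    @staticmethod
    def local():
        return {DatasetType.PANDAS, DatasetType.PYARROW}

    @staticmethod
    def distributed():
        return {DatasetType.DAFT, DatasetType.RAY_DATASET}


# Functional Enum API: member names and values are copied from DatasetType,
# so the deprecated aliases cannot drift from the canonical enum.
TableType = Enum("TableType", {d.name: d.value for d in DatasetType.local()})
DistributedDatasetType = Enum(
    "DistributedDatasetType", {d.name: d.value for d in DatasetType.distributed()}
)

assert TableType.PANDAS.value == DatasetType.PANDAS.value == "pandas"
assert DistributedDatasetType.DAFT.value == "daft"
```

One behavioral nuance, at least as shown in this diff: the functional definitions pass no type=str mixin, so the rebuilt members are plain Enum values rather than str subclasses, and callers that previously compared them directly to strings should compare .value instead.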
deltacat/types/tables.py
CHANGED
@@ -3,9 +3,10 @@ from typing import Callable, Dict, Type, Union

 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pyarrow.parquet as papq
-from ray.data.dataset import Dataset
+from ray.data.dataset import Dataset as RayDataset
 from ray.data.read_api import (
     from_arrow,
     from_arrow_refs,
@@ -18,11 +19,12 @@ import deltacat.storage as dcs
 from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
+from deltacat.utils import polars as pl_utils
 from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils

-
+TABLE_TYPE_TO_S3_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
@@ -34,8 +36,9 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
+    pl.DataFrame: pl_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
-
+    RayDataset: ds_utils.dataset_to_file,
 }

 TABLE_CLASS_TO_SLICER_FUNC: Dict[
@@ -43,8 +46,9 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
+    pl.DataFrame: pl_utils.slice_table,
     np.ndarray: np_utils.slice_ndarray,
-
+    RayDataset: ds_utils.slice_dataset,
 }

 TABLE_CLASS_TO_SIZE_FUNC: Dict[
@@ -53,13 +57,27 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     pa.Table: pa_utils.table_size,
     papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
+    pl.DataFrame: pl_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
-
+    RayDataset: ds_utils.dataset_size,
+}
+
+TABLE_CLASS_TO_PYARROW_FUNC: Dict[
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, **kwargs: table,
+    papq.ParquetFile: lambda table, **kwargs: table.read(**kwargs),
+    pd.DataFrame: lambda table, **kwargs: pa.Table.from_pandas(table, **kwargs),
+    pl.DataFrame: lambda table, **kwargs: pl.DataFrame.to_arrow(table, **kwargs),
+    np.ndarray: lambda table, **kwargs: pa.Table.from_arrays(
+        [pa.array(table[:, i]) for i in range(table.shape[1])]
+    ),
 }

 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
     papq.ParquetFile: TableType.PYARROW_PARQUET.value,
+    pl.DataFrame: TableType.POLARS.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
@@ -78,7 +96,6 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PANDAS.value: from_pandas_refs,
 }

-
 DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
 }
@@ -106,7 +123,18 @@ class TableWriteMode(str, Enum):


 def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
-    return len(table) if not isinstance(table,
+    return len(table) if not isinstance(table, RayDataset) else table.count()
+
+
+def get_table_size(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
+    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+    if table_size_func is None:
+        msg = (
+            f"No size function found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SIZE_FUNC.keys}"
+        )
+        raise ValueError(msg)
+    return table_size_func(table)


 def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable: