deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,172 @@
|
|
1
|
+
from typing import List, Optional, Union, Dict, Any
|
2
|
+
|
3
|
+
from ray.data import Dataset as RayDataset
|
4
|
+
from ray.data import read_datasource
|
5
|
+
|
6
|
+
from deltacat.io.datasource.deltacat_datasource import DeltaCatDatasource
|
7
|
+
from deltacat.io.dataset.deltacat_dataset import DeltaCatDataset
|
8
|
+
from deltacat.utils.common import ReadKwargsProvider
|
9
|
+
from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlReader
|
10
|
+
from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
|
11
|
+
|
12
|
+
|
13
|
+
class EmptyReadKwargsProvider(ReadKwargsProvider):
    """Read kwargs provider that never contributes any keyword arguments.

    An instance of this class is the default ``read_kwargs_provider`` of
    :func:`read_deltacat`.
    """

    def _get_kwargs(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Return a new empty dict, ignoring both arguments.

        Args:
            datasource_type: Unused.
            kwargs: Unused; it is neither read nor modified.

        Returns:
            A freshly created empty dictionary.
        """
        no_kwargs: Dict[str, Any] = {}
        return no_kwargs
|
20
|
+
|
21
|
+
|
22
|
+
def read_deltacat(
    urls: Union[DeltaCatUrl, List[DeltaCatUrl]],
    *,
    deltacat_read_type: DeltacatReadType = DeltacatReadType.DATA,
    timestamp_as_of: Optional[int] = None,
    merge_on_read: Optional[bool] = False,
    read_kwargs_provider: Optional[ReadKwargsProvider] = EmptyReadKwargsProvider(),
) -> DeltaCatDataset:
    """Reads the given DeltaCAT URLs into a Ray Dataset. DeltaCAT URLs can
    either reference objects registered in a DeltaCAT catalog, or unregistered
    external objects that are readable into a Ray Dataset.

    Unless only metadata is being read (i.e., `deltacat_read_type` is
    `METADATA` or `METADATA_RECURSIVE`), all reads of registered DeltaCAT
    catalog object data must resolve to a single table version.

    When reading unregistered external objects, the keyword arguments that
    `read_kwargs_provider` resolves for the URL's datastore type are passed
    into the reader resolved for the given DeltaCAT URL.

    When multiple URLs are given, each URL is read in turn and the resulting
    datasets are unioned into a single dataset.

    NOTE(review): the examples below pass plain URL strings, as in the
    original documentation, but this function calls `DeltaCatUrl` methods on
    every item of `urls`. Confirm whether strings must first be wrapped in
    `DeltaCatUrl`.

    Examples:
        >>> # Read the latest active DeltaCAT table version:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table")
        >>> # If `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table")
        >>> # If `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table")

        >>> # Read metadata from all partitions and deltas of the latest active
        >>> # DeltaCAT table version:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)
        >>> # Since "default" always resolves to the latest active table version.
        >>> # This is equivalent to:
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read only the latest active table version's top-level metadata:
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default", deltacat_read_type=DeltacatReadType.METADATA)

        >>> # Read only top-level metadata from a DeltaCAT table:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table", deltacat_read_type=DeltacatReadType.METADATA)

        >>> # Read top-level table metadata from all table versions:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/*", deltacat_read_type=DeltacatReadType.METADATA)

        >>> # Read metadata from all partitions and deltas of all table versions:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/*", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read metadata from all tables and table versions of the namespace:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/*", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read metadata from the latest active table version for each
        >>> # table in the namespace:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read metadata from the latest active table version for each
        >>> # table in the catalog's default namespace:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read metadata from all table versions for each table in each
        >>> # catalog namespace:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/*", deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE)

        >>> # Read the Iceberg stream of the latest active DeltaCAT table version:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table/default/iceberg")

        >>> # Read an external unregistered Iceberg table `my_db.my_table`:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("iceberg://my_db.my_table")

        >>> # Read an external unregistered audio file from /my/audio.mp4:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("audio+file:///my/audio.mp4")

        >>> # Read an external unregistered audio file from s3://my/audio.mp4:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("audio+s3://my/audio.mp4")

    Args:
        urls: The DeltaCAT URL, or list of DeltaCAT URLs, to read.
        deltacat_read_type: If METADATA, reads only DeltaCAT metadata for the
            given URL and skips both recursive metadata expansion and reads
            of the underlying data files. If METADATA_RECURSIVE then recursively
            expands child metadata but does not read underlying data files. If
            DATA then recursively expands child metadata to discover and read
            all underlying data files.
        timestamp_as_of: Reads a historic snapshot of the given paths as-of the
            given millisecond-precision epoch timestamp (only used when reading
            registered DeltaCAT catalog objects).
        merge_on_read: If True, merges all unmaterialized inserts, updates,
            and deletes in the registered DeltaCAT table version being read. Only
            applicable if `deltacat_read_type` is DATA.
        read_kwargs_provider: Resolves
            :class:`~deltacat.types.media.DatasourceType` string keys to
            kwarg dictionaries to pass to the resolved
            :class:`~ray.data.Datasource` implementation for each distinct
            DeltaCAT URL type. If None, external unregistered URLs are read
            with no additional keyword arguments.

    Returns:
        DeltaCatDataset holding the records read from the specified URLs.
    """
    # TODO(pdames): The below implementation serializes reads of each URL and
    #  then unions their respective datasets together. While this was an easy
    #  starting point to implement, a more efficient implementation should push
    #  all URLs down into `DeltacatDatasource` to parallelize all reads
    #  (i.e., by returning the `ReadTask` for all datasources in
    #  `get_read_tasks()` and estimating the corresponding memory size across
    #  all datasources in `estimate_inmemory_data_size()`).

    # The signature accepts a single URL, so wrap it instead of iterating it.
    if isinstance(urls, DeltaCatUrl):
        urls = [urls]

    dataset: Optional[RayDataset] = None
    for url in urls:
        if not url.is_deltacat_catalog_url():
            # this URL points to an external unregistered Ray Datasource
            # TODO(pdames): Honor metadata only reads of external datasources
            #  by registering only file paths & metadata in delta manifests.
            reader = DeltaCatUrlReader(url)
            # The provider is declared Optional, so don't call None.
            external_read_kwargs = (
                read_kwargs_provider(url.datastore_type, {})
                if read_kwargs_provider is not None
                else {}
            )
            next_ds = reader.read(external_read_kwargs)
        else:
            # this URL points to a registered DeltaCAT object
            next_ds = read_datasource(
                DeltaCatDatasource(
                    url=url,
                    deltacat_read_type=deltacat_read_type,
                    timestamp_as_of=timestamp_as_of,
                    merge_on_read=merge_on_read,
                    read_kwargs_provider=read_kwargs_provider,
                )
            )
        # union the last dataset read into the result set
        if dataset is None:
            dataset = next_ds
        else:
            # `union` returns a new dataset rather than mutating in place, so
            # the result must be re-bound or every URL after the first is lost.
            dataset = dataset.union(next_ds)
    # NOTE(review): if `urls` is empty then `dataset` is still None here; how
    # `DeltaCatDataset.from_dataset` treats None is not visible from this
    # module, so that behavior is left unchanged.
    return DeltaCatDataset.from_dataset(dataset)
|
deltacat/storage/__init__.py
CHANGED
@@ -78,6 +78,7 @@ from deltacat.storage.model.transform import (
|
|
78
78
|
)
|
79
79
|
from deltacat.storage.model.types import (
|
80
80
|
CommitState,
|
81
|
+
Dataset,
|
81
82
|
DeltaType,
|
82
83
|
DistributedDataset,
|
83
84
|
LifecycleState,
|
@@ -102,6 +103,7 @@ __all__ = [
|
|
102
103
|
"BucketTransform",
|
103
104
|
"BucketTransformParameters",
|
104
105
|
"CommitState",
|
106
|
+
"Dataset",
|
105
107
|
"DayTransform",
|
106
108
|
"Delta",
|
107
109
|
"DeltaLocator",
|
@@ -0,0 +1,47 @@
|
|
1
|
+
from deltacat.storage.model.expression.expression import (
|
2
|
+
Expression,
|
3
|
+
UnaryExpression,
|
4
|
+
BinaryExpression,
|
5
|
+
BooleanExpression,
|
6
|
+
Reference,
|
7
|
+
Literal,
|
8
|
+
Equal,
|
9
|
+
NotEqual,
|
10
|
+
GreaterThan,
|
11
|
+
LessThan,
|
12
|
+
GreaterThanEqual,
|
13
|
+
LessThanEqual,
|
14
|
+
And,
|
15
|
+
Or,
|
16
|
+
Not,
|
17
|
+
In,
|
18
|
+
Between,
|
19
|
+
Like,
|
20
|
+
IsNull,
|
21
|
+
)
|
22
|
+
|
23
|
+
from deltacat.storage.model.expression.visitor import ExpressionVisitor, DisplayVisitor
|
24
|
+
|
25
|
+
__all__ = [
|
26
|
+
"Expression",
|
27
|
+
"UnaryExpression",
|
28
|
+
"BinaryExpression",
|
29
|
+
"BooleanExpression",
|
30
|
+
"Reference",
|
31
|
+
"Literal",
|
32
|
+
"Equal",
|
33
|
+
"NotEqual",
|
34
|
+
"GreaterThan",
|
35
|
+
"LessThan",
|
36
|
+
"GreaterThanEqual",
|
37
|
+
"LessThanEqual",
|
38
|
+
"And",
|
39
|
+
"Or",
|
40
|
+
"Not",
|
41
|
+
"In",
|
42
|
+
"Between",
|
43
|
+
"Like",
|
44
|
+
"IsNull",
|
45
|
+
"ExpressionVisitor",
|
46
|
+
"DisplayVisitor",
|
47
|
+
]
|