deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,207 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from collections import OrderedDict
|
4
|
+
from typing import Dict, Any, Optional, List, Iterable
|
5
|
+
|
6
|
+
from ray.data import Datasink
|
7
|
+
from ray.data._internal.execution.interfaces import TaskContext
|
8
|
+
from ray.data.block import Block, BlockAccessor
|
9
|
+
from ray.data.datasource import WriteResult
|
10
|
+
|
11
|
+
from ray.data.datasource.filename_provider import (
|
12
|
+
FilenameProvider,
|
13
|
+
)
|
14
|
+
|
15
|
+
from deltacat import logs
|
16
|
+
|
17
|
+
from deltacat.constants import METAFILE_FORMAT_MSGPACK
|
18
|
+
from deltacat.storage import Metafile
|
19
|
+
from deltacat.io.datasource.deltacat_datasource import (
|
20
|
+
METAFILE_DATA_COLUMN_NAME,
|
21
|
+
METAFILE_TYPE_COLUMN_NAME,
|
22
|
+
)
|
23
|
+
from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlWriter
|
24
|
+
|
25
|
+
# Module-level logger for this file, set up through DeltaCAT's logging helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
|
+
|
27
|
+
|
28
|
+
class CapturingBlockWritePathProvider(FilenameProvider):
    """Delegating filename provider that records the arguments behind each path.

    Filenames are produced by the wrapped provider and joined onto
    ``base_path``. The keyword arguments used to produce each write path are
    saved in ``write_path_kwargs``, an ordered dictionary keyed by write path.
    """

    def __init__(
        self,
        block_write_path_provider: FilenameProvider,
        base_path: Optional[str] = None,
    ):
        """Wrap ``block_write_path_provider``.

        Args:
            block_write_path_provider: Provider that filename generation is
                delegated to.
            base_path: Prefix joined (with ``/``) in front of every generated
                filename. May be omitted here, but must be set before
                ``get_filename_for_block`` is called.
        """
        self.base_path = base_path
        self.block_write_path_provider = block_write_path_provider
        # Maps write path -> kwargs used to generate it, in insertion order.
        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

    def get_filename_for_block(
        self,
        block: Any,
        task_index: int,
        block_index: int,
    ) -> str:
        """Return the write path for ``block`` and record how it was produced.

        Args:
            block: Block the filename is generated for.
            task_index: Index of the write task producing the block.
            block_index: Index of the block within the write task.

        Returns:
            ``base_path`` joined with the delegate's filename.

        Raises:
            ValueError: If no base path was configured.
        """
        if self.base_path is None:
            raise ValueError(
                "Base path must be provided to CapturingBlockWritePathProvider",
            )
        # Forward task_index as well: the FilenameProvider interface requires
        # it, so omitting it makes the delegate call fail with a TypeError.
        return self._get_write_path_for_block(
            base_path=self.base_path,
            block=block,
            task_index=task_index,
            block_index=block_index,
        )

    def _get_write_path_for_block(
        self,
        base_path: str,
        *args,
        **kwargs,
    ) -> str:
        """Delegate filename generation, then capture the arguments used.

        All positional and keyword arguments other than ``base_path`` are
        passed through unchanged to the wrapped provider.
        """
        filename = self.block_write_path_provider.get_filename_for_block(
            *args,
            **kwargs,
        )
        write_path = f"{base_path}/{filename}"
        # NOTE: the captured kwargs include the block object itself, so each
        # block stays referenced for as long as this provider is alive.
        kwargs["base_path"] = base_path
        self.write_path_kwargs[write_path] = kwargs
        return write_path
|
71
|
+
|
72
|
+
|
73
|
+
class DeltaCatWriteResult:
    """Mutable record of write-summary fields, all initially unset.

    Nothing in this class populates the fields; callers assign them after
    construction. The field names match those assigned by the disabled legacy
    ``write`` implementation kept as a string literal at the end of this
    module.
    """

    def __init__(self):
        """Create a result with every field set to ``None``."""
        # NOTE(review): concrete field types are not established anywhere in
        # this module, so each one is annotated as Optional[Any].
        self.metadata: Optional[Any] = None
        self.path: Optional[Any] = None
        self.dataset_uuid: Optional[Any] = None
        self.block_write_path_provider: Optional[Any] = None
        self.content_type: Optional[Any] = None
        self.content_encoding: Optional[Any] = None
        self.filesystem: Optional[Any] = None
|
82
|
+
|
83
|
+
|
84
|
+
class DeltaCatDatasink(Datasink[List[Metafile]]):
    """Ray Data datasink that writes serialized DeltaCAT metafiles to a URL.

    Every input block must contain the columns named by
    ``METAFILE_DATA_COLUMN_NAME`` and ``METAFILE_TYPE_COLUMN_NAME``. Each value
    of the data column is deserialized from msgpack into a ``Metafile`` and
    written through a ``DeltaCatUrlWriter`` built from the destination URL.
    """

    def __init__(
        self,
        url: DeltaCatUrl,
        *,
        metadata_only: bool = False,
        copy_on_write: Optional[bool] = False,
    ):
        """Configure the sink.

        Args:
            url: Destination DeltaCAT URL handed to every ``DeltaCatUrlWriter``.
            metadata_only: Stored on the instance; not consulted by any method
                of this class.
            copy_on_write: Stored on the instance; not consulted by any method
                of this class.
        """
        self._url = url
        self._metadata_only = metadata_only
        self._copy_on_write = copy_on_write

    def on_write_start(self) -> None:
        """No-op hook; nothing needs preparing before the write starts."""
        pass

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> List[Metafile]:
        """Deserialize and write every metafile found in ``blocks``.

        Args:
            blocks: Blocks whose metafile data column holds msgpack-serialized
                metafiles.
            ctx: Ray task context (unused).

        Returns:
            The metafiles written by this task, in the order they were written.

        Raises:
            NotImplementedError: If a block lacks the metafile data or metafile
                type column.
        """
        # Collect what was written so the declared List[Metafile] result is
        # actually returned to Ray instead of an implicit None.
        written_metafiles: List[Metafile] = []
        for block in blocks:
            pa_table = BlockAccessor.for_block(block).to_arrow()
            column_names = pa_table.column_names
            if (
                METAFILE_DATA_COLUMN_NAME not in column_names
                or METAFILE_TYPE_COLUMN_NAME not in column_names
            ):
                raise NotImplementedError(
                    f"Expected {METAFILE_DATA_COLUMN_NAME} and "
                    f"{METAFILE_TYPE_COLUMN_NAME} columns in the input block, "
                    f"but found {pa_table.column_names}."
                )
            for pa_scalar in pa_table[METAFILE_DATA_COLUMN_NAME]:
                metafile_msgpack_bytes = pa_scalar.as_py()
                metafile = Metafile.deserialize(
                    serialized=metafile_msgpack_bytes,
                    meta_format=METAFILE_FORMAT_MSGPACK,
                )
                # TODO(pdames): Add `metafile` to writer as a kwarg instead
                #  of constructing a new URL with the metafile as input.
                writer_url = DeltaCatUrlWriter(self._url, metafile=metafile)
                # TODO(pdames): Run writes in order from catalog -> delta
                #  by truncating the URL down to just dc://{catalog-name}
                #  and rebuilding all path elements from there.
                writer_url.write(metafile)
                written_metafiles.append(metafile)
        return written_metafiles

    def on_write_complete(
        self,
        write_result: WriteResult[List[Metafile]],
    ):
        """No-op hook; the aggregated write result is not used."""
        pass
|
135
|
+
|
136
|
+
|
137
|
+
"""
|
138
|
+
def write(
|
139
|
+
self,
|
140
|
+
blocks: Iterable[Block],
|
141
|
+
ctx: TaskContext,
|
142
|
+
) -> List[ObjectRef[DeltacatWriteResult]]:
|
143
|
+
paths, filesystem = resolve_paths_and_filesystem(
|
144
|
+
self.path,
|
145
|
+
self.filesystem,
|
146
|
+
)
|
147
|
+
assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
|
148
|
+
path = paths[0]
|
149
|
+
write_results = super().write(blocks)
|
150
|
+
# append a summary of this write operation in the last write result
|
151
|
+
metadata = [BlockAccessor.for_block(_).get_metadata() for _ in blocks]
|
152
|
+
rwr = DeltacatWriteResult()
|
153
|
+
rwr.metadata = metadata
|
154
|
+
rwr.path = path
|
155
|
+
rwr.dataset_uuid = self.dataset_uuid
|
156
|
+
rwr.block_write_path_provider = self.filename_provider
|
157
|
+
rwr.content_type = ContentType.PARQUET.value
|
158
|
+
rwr.content_encoding = ContentEncoding.IDENTITY.value
|
159
|
+
rwr.filesystem = filesystem
|
160
|
+
rwr_obj_ref = ray.put(rwr)
|
161
|
+
write_results.append(rwr_obj_ref)
|
162
|
+
return write_results
|
163
|
+
|
164
|
+
def on_write_complete(self, write_results: List[Any], **kwargs) -> None:
|
165
|
+
# TODO (pdames): time latency of this operation - overall s3 write times
|
166
|
+
# are 2-3x pure read_parquet_fast() times
|
167
|
+
# restore the write operation summary from the last write result
|
168
|
+
result: DeltacatWriteResult = write_results[len(write_results) - 1]
|
169
|
+
write_path_args = result.block_write_path_provider.write_path_kwargs
|
170
|
+
blocks_written = len(write_path_args)
|
171
|
+
expected_blocks_written = len(result.metadata)
|
172
|
+
# TODO(pdames): Corner cases where mismatch is expected? Empty blocks?
|
173
|
+
# Blocks filtered/split/merged to more/less write paths?
|
174
|
+
assert blocks_written == expected_blocks_written, (
|
175
|
+
f"Dataset write result validation failed. Found "
|
176
|
+
f"{blocks_written}/{expected_blocks_written} Dataset blocks "
|
177
|
+
f"written. Refusing to commit DeltaCAT Manifest."
|
178
|
+
)
|
179
|
+
manifest_entries = ManifestEntryList()
|
180
|
+
for block_idx, path in enumerate(write_path_args.keys()):
|
181
|
+
file_info = result.filesystem.get_file_info(path)
|
182
|
+
if file_info.type == pyarrow.fs.FileType.File:
|
183
|
+
content_length = file_info.size
|
184
|
+
else:
|
185
|
+
raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
|
186
|
+
num_rows = result.metadata[block_idx].num_rows
|
187
|
+
source_content_length = result.metadata[block_idx].size_bytes
|
188
|
+
manifest_entry_meta = ManifestMeta.of(
|
189
|
+
int(num_rows) if num_rows is not None else None,
|
190
|
+
int(content_length) if content_length is not None else None,
|
191
|
+
result.content_type,
|
192
|
+
result.content_encoding,
|
193
|
+
int(source_content_length) if source_content_length else None,
|
194
|
+
)
|
195
|
+
parsed_url = parse_s3_url(path)
|
196
|
+
manifest_entry = ManifestEntry.of(
|
197
|
+
parsed_url.url,
|
198
|
+
manifest_entry_meta,
|
199
|
+
)
|
200
|
+
manifest_entries.append(manifest_entry)
|
201
|
+
manifest = Manifest.of(manifest_entries)
|
202
|
+
manifest_path = f"{result.path}/manifest"
|
203
|
+
logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
|
204
|
+
with result.filesystem.open_output_stream(manifest_path) as f:
|
205
|
+
f.write(json.dumps(manifest).encode("utf-8"))
|
206
|
+
logger.debug(f"Manifest committed to: {manifest_path}")
|
207
|
+
"""
|
File without changes
|