deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/utils/arguments.py
CHANGED
@@ -1,5 +1,6 @@
 import inspect
-
+import functools
+from typing import Any, Callable, Dict, List
 
 
 def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
@@ -42,3 +43,34 @@ def sanitize_kwargs_by_supported_kwargs(
             new_kwargs[key] = kwargs[key]
 
     return new_kwargs
+
+
+def alias(aliases: Dict[str, str]) -> Callable:
+    """
+    This decorator allows for aliases to be used for function arguments.
+    :param aliases: A dictionary of aliases to use for the function arguments.
+    :return: A decorator that can be used to decorate a function.
+
+    For example:
+    >>> @alias({'long_parameter_name': 'param'})
+    >>> def example_fn(long_parameter_name='foo', **kwargs):
+    ...     print(long_parameter_name)
+    >>> example_fn(long_parameter_name="bar")
+    >>> bar
+    >>> example_fn(param="baz")
+    >>> baz
+    >>> example_fn()
+    >>> foo
+    """
+
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(**kwargs: Any) -> Any:
+            for name, alias in aliases.items():
+                if name not in kwargs and alias in kwargs:
+                    kwargs[name] = kwargs[alias]
+            return func(**kwargs)
+
+        return wrapper
+
+    return decorator
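For reference, a minimal usage sketch of the `alias` decorator added above (assuming the 2.0.0.post1 wheel is installed; `describe` is a hypothetical function used only for illustration, not part of the package):

from deltacat.utils.arguments import alias

# The decorated function should accept **kwargs, because the alias key ("ns" here)
# is forwarded to the wrapped function unchanged alongside the canonical name.
@alias({"namespace": "ns"})
def describe(namespace: str = "default", **kwargs) -> str:
    return namespace

print(describe(ns="analytics"))  # "ns" is copied to "namespace" -> prints "analytics"
print(describe())                # prints "default"

Note that the generated wrapper accepts keyword arguments only, as in the decorator's own docstring example.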
deltacat/utils/daft.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Optional, List, Any, Dict, Callable, Iterator
+from typing import Optional, List, Any, Dict, Callable, Iterator, Union
 
 from daft.daft import (
     StorageConfig,
@@ -34,9 +34,10 @@ from daft.io.scan import (
     make_partition_field,
 )
 import pyarrow as pa
+import pyarrow.fs as pafs
+from fsspec import AbstractFileSystem
 
 from deltacat import logs
-from deltacat.catalog.model.table_definition import TableDefinition
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.schema import coerce_pyarrow_table_to_schema
 from deltacat.types.media import ContentType, ContentEncoding
@@ -44,14 +45,16 @@ from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
     DAFT_MAX_S3_CONNECTIONS_PER_FILE,
     AWS_REGION,
-    DEFAULT_FILE_READ_TIMEOUT_MS,
 )
+from deltacat.constants import DEFAULT_FILE_READ_TIMEOUT_MS
 from deltacat.utils.performance import timed_invocation
 
 from deltacat.types.partial_download import (
     PartialFileDownloadParams,
 )
-from deltacat.storage import (
+
+# Import directly from storage model modules to avoid circular import
+from deltacat.storage.model.transform import (
     Transform,
     IdentityTransform,
     HourTransform,
@@ -61,9 +64,10 @@ from deltacat.storage import (
     BucketTransform,
     BucketingStrategy,
     TruncateTransform,
-
-    Schema,
+    TruncateStrategy,
 )
+from deltacat.storage.model.partition import PartitionKey
+from deltacat.storage.model.schema import Schema
 from deltacat.storage.model.interop import ModelMapper
 from deltacat.storage.model.expression import (
     Expression,
@@ -225,150 +229,13 @@ class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
         return Not.of(IsNull.of(self.visit(expr)))
 
 
-def s3_files_to_dataframe(
-    uris: List[str],
-    content_type: str,
-    content_encoding: str,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
-    s3_client_kwargs: Optional[Any] = None,
-    ray_init_options: Optional[Dict[str, Any]] = None,
-) -> DataFrame:
-
-    if ray_init_options is None:
-        ray_init_options = {}
-
-    assert (
-        content_type == ContentType.PARQUET.value
-    ), f"daft native reader currently only supports parquet, got {content_type}"
-
-    assert (
-        content_encoding == ContentEncoding.IDENTITY.value
-    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
-
-    if not ray.is_initialized():
-        ray.init(ignore_reinit_error=True, **ray_init_options)
-
-    daft.context.set_runner_ray(noop_if_initialized=True)
-
-    if s3_client_kwargs is None:
-        s3_client_kwargs = {}
-
-    kwargs = {}
-    if read_func_kwargs_provider is not None:
-        kwargs = read_func_kwargs_provider(content_type, kwargs)
-
-    # TODO(raghumdani): pass in coerce_int96_timestamp arg
-    # https://github.com/Eventual-Inc/Daft/issues/1894
-
-    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-
-    logger.debug(
-        f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
-    )
-
-    df, latency = timed_invocation(daft.read_parquet, path=uris, io_config=io_config)
-
-    logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
-
-    columns_to_read = include_columns or column_names
-
-    logger.debug(f"Taking columns {columns_to_read} from the daft df.")
-
-    if columns_to_read:
-        return df.select(*columns_to_read)
-    else:
-        return df
-
-
-def daft_s3_file_to_table(
-    s3_url: str,
-    content_type: str,
-    content_encoding: str,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **s3_client_kwargs,
-):
-    assert (
-        content_type == ContentType.PARQUET.value
-    ), f"daft native reader currently only supports parquet, got {content_type}"
-
-    assert (
-        content_encoding == ContentEncoding.IDENTITY.value
-    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
-
-    kwargs = {}
-    if pa_read_func_kwargs_provider is not None:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
-    coerce_int96_timestamp_unit = TimeUnit.from_str(
-        kwargs.get("coerce_int96_timestamp_unit", "ms")
-    )
-    file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
-
-    row_groups = None
-    if (
-        partial_file_download_params is not None
-        and partial_file_download_params.row_groups_to_download is not None
-    ):
-        row_groups = partial_file_download_params.row_groups_to_download
-
-    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-
-    logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
-
-    pa_table, latency = timed_invocation(
-        read_parquet_into_pyarrow,
-        path=s3_url,
-        columns=include_columns or column_names,
-        row_groups=row_groups,
-        io_config=io_config,
-        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
-        multithreaded_io=False,
-        file_timeout_ms=file_timeout_ms,
-    )
-
-    logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
-
-    if kwargs.get("schema") is not None:
-        input_schema = kwargs["schema"]
-        if include_columns is not None:
-            input_schema = pa.schema(
-                [input_schema.field(col) for col in include_columns],
-                metadata=input_schema.metadata,
-            )
-        elif column_names is not None:
-            input_schema = pa.schema(
-                [input_schema.field(col) for col in column_names],
-                metadata=input_schema.metadata,
-            )
-        return coerce_pyarrow_table_to_schema(pa_table, input_schema)
-    else:
-        return pa_table
-
-
-def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
-    return IOConfig(
-        s3=S3Config(
-            key_id=s3_client_kwargs.get("aws_access_key_id"),
-            access_key=s3_client_kwargs.get("aws_secret_access_key"),
-            session_token=s3_client_kwargs.get("aws_session_token"),
-            region_name=AWS_REGION,
-            retry_mode="adaptive",
-            num_tries=BOTO_MAX_RETRIES,
-            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
-            connect_timeout_ms=5_000,  # Timeout to connect to server
-            read_timeout_ms=10_000,  # Timeout for first byte from server
-        )
-    )
-
-
 class DeltaCatScanOperator(ScanOperator):
-    def __init__(self, table
+    def __init__(self, table, storage_config: StorageConfig) -> None:
+        # Import inside method to avoid circular import
+        from deltacat.catalog.model.table_definition import TableDefinition
+
+        if not isinstance(table, TableDefinition):
+            raise TypeError("table must be a TableDefinition instance")
         super().__init__()
         self.table = table
         self._schema = self._infer_schema()
@@ -463,6 +330,158 @@ class DeltaCatScanOperator(ScanOperator):
         return partition_fields
 
 
+def read_csv(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    content_type: Optional[str] = None,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a CSV file into a Daft DataFrame.
+
+    Args:
+        path: Path to the CSV file
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        content_type: Optional content type (PARQUET, JSON, CSV, etc.)
+        **read_kwargs: Additional arguments passed to daft.read_csv
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading CSV file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+
+    # If content_type is provided, add appropriate reader kwargs
+    if content_type is not None:
+        content_kwargs = content_type_to_reader_kwargs(content_type)
+        read_kwargs.update(content_kwargs)
+        logger.debug(f"Added content type kwargs for {content_type}: {content_kwargs}")
+
+    # Files should now be written with proper extensions, so we can read them directly
+    logger.debug(f"Reading CSV with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_csv, path, **read_kwargs)
+
+    logger.debug(f"Time to read CSV {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+def read_json(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a JSON file into a Daft DataFrame.
+
+    Args:
+        path: Path to the JSON file (supports line-delimited JSON)
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        **read_kwargs: Additional arguments passed to daft.read_json
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading JSON file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+
+    # Files should now be written with proper extensions, so we can read them directly
+    logger.debug(f"Reading JSON with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_json, path, **read_kwargs)
+
+    logger.debug(f"Time to read JSON {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+def read_parquet(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a Parquet file into a Daft DataFrame.
+
+    Args:
+        path: Path to the Parquet file
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        **read_kwargs: Additional arguments passed to daft.read_parquet
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading Parquet file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+    logger.debug(f"Reading Parquet with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_parquet, path=path, **read_kwargs)
+    logger.debug(f"Time to read Parquet {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+# Map content types to their respective Daft read functions
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.JSON.value: read_json,
+}
+
+
+def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+    """
+    Returns reader kwargs for the given content type when reading with Daft.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "delimiter": "\t",
+            "has_headers": False,
+            "double_quote": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "delimiter": "\t",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "delimiter": ",",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "delimiter": "|",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.JSON.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type for Daft reader: {content_type}")
+
+
 class DaftFieldMapper(ModelMapper[DaftField, PaField]):
     @staticmethod
     def map(
@@ -569,7 +588,12 @@ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
                     f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
                 )
         elif isinstance(obj, TruncateTransform):
-
+            if obj.parameters.truncate_strategy == TruncateStrategy.ICEBERG:
+                return DaftTransform.iceberg_truncate(obj.parameters.width)
+            else:
+                raise ValueError(
+                    f"Unsupported Truncate Strategy: {obj.parameters.truncate_strategy}"
+                )
 
         raise ValueError(f"Unsupported Transform: {obj}")
 
@@ -693,3 +717,240 @@ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
             name=partition_field_name,
             dtype=result_type,
         )
+
+
+def files_to_dataframe(
+    uris: List[str],
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+    ray_init_options: Optional[Dict[str, Any]] = None,
+    **kwargs,
+) -> DataFrame:
+    """
+    Read multiple files into a Daft DataFrame using any filesystem.
+
+    This function supports reading PARQUET, CSV, JSON, TSV, and PSV files.
+
+    Args:
+        uris: List of file URIs to read
+        content_type: The content type (PARQUET, CSV, JSON, TSV, UNESCAPED_TSV, PSV)
+        content_encoding: The content encoding (currently only IDENTITY is supported)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        read_func_kwargs_provider: Optional kwargs provider for customization
+        ray_options_provider: Optional Ray options provider
+        ray_init_options: Optional Ray initialization options
+        **kwargs: Additional kwargs, including optional 'io_config' for filesystem configuration
+
+    Returns:
+        DataFrame: The Daft DataFrame
+
+    Raises:
+        AssertionError: If content_type is not supported or content_encoding is not IDENTITY
+
+    Examples:
+        # Read local parquet files (filesystem auto-inferred)
+        df = files_to_dataframe(
+            uris=["file1.parquet", "file2.parquet"],
+            content_type=ContentType.PARQUET.value,
+            content_encoding=ContentEncoding.IDENTITY.value
+        )
+
+        # Read CSV files
+        df = files_to_dataframe(
+            uris=["file1.csv", "file2.csv"],
+            content_type=ContentType.CSV.value,
+            content_encoding=ContentEncoding.IDENTITY.value
+        )
+
+        # Read S3 files with custom IOConfig
+        from daft.io import IOConfig, S3Config
+        s3_config = IOConfig(s3=S3Config(...))
+        df = files_to_dataframe(
+            uris=["s3://bucket/file1.parquet", "s3://bucket/file2.parquet"],
+            content_type=ContentType.PARQUET.value,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            io_config=s3_config
+        )
+    """
+    if ray_init_options is None:
+        ray_init_options = {}
+
+    if content_type not in CONTENT_TYPE_TO_READ_FN.keys():
+        raise NotImplementedError(
+            f"Daft native reader supports {CONTENT_TYPE_TO_READ_FN.keys()}, got {content_type}."
+            f"Try using the Ray Dataset reader instead."
+        )
+
+    # Handle content encoding - for now, we only support identity and gzip
+    if content_encoding not in [
+        ContentEncoding.IDENTITY.value,
+        ContentEncoding.GZIP.value,
+    ]:
+        raise NotImplementedError(
+            f"Daft native reader currently supports identity and gzip encoding, got {content_encoding}"
+        )
+
+    if not ray.is_initialized():
+        ray.init(**ray_init_options)
+
+    daft.context.set_runner_ray(noop_if_initialized=True)
+
+    read_kwargs = {}
+    if read_func_kwargs_provider is not None:
+        read_kwargs = read_func_kwargs_provider(content_type, read_kwargs)
+
+    # Add content-type-specific reader kwargs
+    content_type_kwargs = content_type_to_reader_kwargs(content_type)
+    read_kwargs.update(content_type_kwargs)
+
+    # Extract io_config from kwargs if provided, otherwise use None
+    io_config = kwargs.pop("io_config", None)
+
+    # Merge any remaining kwargs into read_kwargs (including file_path_column for native Daft support)
+    read_kwargs.update(kwargs)
+
+    logger.debug(f"Preparing to read {len(uris)} files into daft dataframe")
+    logger.debug(f"Content type: {content_type}")
+    logger.debug(f"Final read_kwargs: {read_kwargs}")
+
+    # Get the appropriate Daft reader function based on content type
+    daft_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not daft_read_func:
+        raise NotImplementedError(
+            f"Daft reader for content type '{content_type}' not implemented. "
+            f"Known content types: {list(CONTENT_TYPE_TO_READ_FN.keys())}"
+        )
+
+    # Handle schema for all supported formats
+    table_version_schema = kwargs.get("table_version_schema")
+    if table_version_schema is not None:
+        # Convert PyArrow schema to Daft schema using the official API
+        daft_schema = daft.Schema.from_pyarrow_schema(table_version_schema)
+        # Convert DaftSchema to dictionary format required by Daft readers
+        schema_dict = {field.name: field.dtype for field in daft_schema}
+        # Remove table_version_schema from kwargs since Daft readers don't recognize it
+        read_kwargs.pop("table_version_schema", None)
+        # Use explicit schema with infer_schema=False for correctness and performance
+        read_kwargs.update({"infer_schema": False, "schema": schema_dict})
+    else:
+        # Remove table_version_schema parameter if present but None
+        read_kwargs.pop("table_version_schema", None)
+
+    logger.debug(f"Reading {len(uris)} files with Daft using {daft_read_func}.")
+
+    # Call the appropriate Daft reader function
+    if io_config is not None and content_type == ContentType.PARQUET.value:
+        # Only parquet reader supports io_config parameter
+        df, latency = timed_invocation(
+            daft_read_func, path=uris, io_config=io_config, **read_kwargs
+        )
+    else:
+        df, latency = timed_invocation(daft_read_func, path=uris, **read_kwargs)
+
+    logger.debug(f"Daft read {len(uris)} files in {latency}s.")
+
+    # Apply column selection after reading
+    columns_to_read = include_columns or column_names
+    file_path_column = read_kwargs.get("file_path_column")
+    if file_path_column and columns_to_read and file_path_column not in columns_to_read:
+        # Add file_path_column to selection if it was specified
+        columns_to_read.append(file_path_column)
+
+    if columns_to_read:
+        logger.debug(f"Selecting columns {columns_to_read} with Daft.")
+        return df.select(*columns_to_read)
+    else:
+        return df
+
+
+def daft_file_to_pyarrow_table(
+    path: str,
+    content_type: str,
+    content_encoding: str,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **kwargs,
+) -> pa.Table:
+    assert (
+        content_type == ContentType.PARQUET.value
+    ), f"daft native reader currently only supports parquet, got {content_type}"
+
+    assert (
+        content_encoding == ContentEncoding.IDENTITY.value
+    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+    kwargs = {}
+    if pa_read_func_kwargs_provider is not None:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    coerce_int96_timestamp_unit = TimeUnit.from_str(
+        kwargs.get("coerce_int96_timestamp_unit", "ms")
+    )
+    file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
+
+    row_groups = None
+    if (
+        partial_file_download_params is not None
+        and partial_file_download_params.row_groups_to_download is not None
+    ):
+        row_groups = partial_file_download_params.row_groups_to_download
+
+    # Extract io_config from kwargs if provided
+    io_config = kwargs.pop("io_config", None)
+    if not io_config and path.startswith("s3://"):
+        io_config = _get_s3_io_config(kwargs)
+
+    logger.debug(f"Preparing to read object from {path} into daft table")
+
+    pa_table, latency = timed_invocation(
+        read_parquet_into_pyarrow,
+        path=path,
+        columns=include_columns or column_names,
+        row_groups=row_groups,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+        multithreaded_io=False,
+        file_timeout_ms=file_timeout_ms,
+    )
+
+    logger.debug(f"Time to read object from {path} into daft table: {latency}s")
+
+    if kwargs.get("schema") is not None:
+        input_schema = kwargs["schema"]
+        if include_columns is not None:
+            input_schema = pa.schema(
+                [input_schema.field(col) for col in include_columns],
+                metadata=input_schema.metadata,
+            )
+        elif column_names is not None:
+            input_schema = pa.schema(
+                [input_schema.field(col) for col in column_names],
+                metadata=input_schema.metadata,
+            )
+        return coerce_pyarrow_table_to_schema(pa_table, input_schema)
+    else:
+        return pa_table
+
+
+def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
+    return IOConfig(
+        s3=S3Config(
+            key_id=s3_client_kwargs.get("aws_access_key_id"),
+            access_key=s3_client_kwargs.get("aws_secret_access_key"),
+            session_token=s3_client_kwargs.get("aws_session_token"),
+            region_name=AWS_REGION,
+            retry_mode="adaptive",
+            num_tries=BOTO_MAX_RETRIES,
+            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
+            connect_timeout_ms=5_000,  # Timeout to connect to server
+            read_timeout_ms=10_000,  # Timeout for first byte from server
+        )
+    )