deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/utils/filesystem.py ADDED
@@ -0,0 +1,320 @@
+from __future__ import annotations
+
+import re
+from typing import Optional, Tuple, Union, List
+
+import sys
+import urllib
+import pathlib
+
+import pyarrow
+import pyarrow as pa
+from pyarrow.fs import (
+    _resolve_filesystem_and_path,
+    FileSelector,
+    FileInfo,
+    FileType,
+    FileSystem,
+    FSSpecHandler,
+    PyFileSystem,
+)
+
+_LOCAL_SCHEME = "local"
+
+
+def resolve_paths_and_filesystem(
+    paths: Union[str, List[str]],
+    filesystem: pyarrow.fs.FileSystem = None,
+) -> Tuple[List[str], pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes all provided paths, infers a filesystem from the
+    paths or validates the provided filesystem against the paths and ensures
+    that all paths use the same filesystem.
+
+    Args:
+        paths: A single file/directory path or a list of file/directory paths.
+            A list of paths can contain both files and directories.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    if isinstance(paths, str):
+        paths = [paths]
+    if isinstance(paths, pathlib.Path):
+        paths = [str(paths)]
+    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
+        raise ValueError(
+            "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got "
+            f"`{paths}`."
+        )
+    elif len(paths) == 0:
+        raise ValueError("Must provide at least one path.")
+
+    need_unwrap_path_protocol = True
+    if filesystem and not isinstance(filesystem, FileSystem):
+        err_msg = (
+            f"The filesystem passed must either conform to "
+            f"pyarrow.fs.FileSystem, or "
+            f"fsspec.spec.AbstractFileSystem. The provided "
+            f"filesystem was: {filesystem}"
+        )
+        try:
+            import fsspec
+            from fsspec.implementations.http import HTTPFileSystem
+        except ModuleNotFoundError:
+            # If filesystem is not a pyarrow filesystem and fsspec isn't
+            # installed, then filesystem is neither a pyarrow filesystem nor
+            # an fsspec filesystem, so we raise a TypeError.
+            raise TypeError(err_msg) from None
+        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
+            raise TypeError(err_msg) from None
+        if isinstance(filesystem, HTTPFileSystem):
+            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
+            # should not be unwrapped/removed, because HTTPFileSystem expects full file
+            # paths including protocol/scheme. This is different behavior compared to
+            # file systems implementation in pyarrow.fs.FileSystem.
+            need_unwrap_path_protocol = False
+
+        filesystem = PyFileSystem(FSSpecHandler(filesystem))
+
+    resolved_paths = []
+    for path in paths:
+        path = _resolve_custom_scheme(path)
+        try:
+            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                path, filesystem
+            )
+        except pa.lib.ArrowInvalid as e:
+            if "Cannot parse URI" in str(e):
+                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                    _encode_url(path), filesystem
+                )
+                resolved_path = _decode_url(resolved_path)
+            elif "Unrecognized filesystem type in URI" in str(e):
+                scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme
+                if scheme in ["http", "https"]:
+                    # If scheme of path is HTTP and filesystem is not resolved,
+                    # try to use fsspec HTTPFileSystem. This expects fsspec is
+                    # installed.
+                    try:
+                        from fsspec.implementations.http import HTTPFileSystem
+                    except ModuleNotFoundError:
+                        raise ImportError(
+                            "Please install fsspec to read files from HTTP."
+                        ) from None
+
+                    resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
+                    resolved_path = path
+                    need_unwrap_path_protocol = False
+                else:
+                    raise
+            else:
+                raise
+        if filesystem is None:
+            filesystem = resolved_filesystem
+        elif need_unwrap_path_protocol:
+            resolved_path = _unwrap_protocol(resolved_path)
+        resolved_path = filesystem.normalize_path(resolved_path)
+        resolved_paths.append(resolved_path)
+
+    return resolved_paths, filesystem
+
+
+def resolve_path_and_filesystem(
+    path: str,
+    filesystem: Optional[pyarrow.fs.FileSystem] = None,
+) -> Tuple[str, pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes the provided path, infers a filesystem from the
+    path or validates the provided filesystem against the path.
+
+    Args:
+        path: A single file/directory path.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    paths, filesystem = resolve_paths_and_filesystem(
+        paths=path,
+        filesystem=filesystem,
+    )
+    assert len(paths) == 1, len(paths)
+    return paths[0], filesystem
+
+
+def list_directory(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    exclude_prefixes: Optional[List[str]] = None,
+    ignore_missing_path: bool = False,
+    recursive: bool = False,
+) -> List[Tuple[str, int]]:
+    """
+    Expand the provided directory path to a list of file paths.
+
+    Args:
+        path: The directory path to expand.
+        filesystem: The filesystem implementation that should be used for
+            reading these files.
+        exclude_prefixes: The file relative path prefixes that should be
+            excluded from the returned file set. Default excluded prefixes are
+            "." and "_".
+        recursive: Whether to expand subdirectories or not.
+
+    Returns:
+        An iterator of (file_path, file_size) tuples.
+    """
+    if exclude_prefixes is None:
+        exclude_prefixes = [".", "_"]
+
+    selector = FileSelector(
+        base_dir=path,
+        recursive=recursive,
+        allow_not_found=ignore_missing_path,
+    )
+    try:
+        files = filesystem.get_file_info(selector)
+    except OSError as e:
+        if isinstance(e, FileNotFoundError):
+            files = []
+        else:
+            _handle_read_os_error(e, path)
+    base_path = selector.base_dir
+    out = []
+    for file_ in files:
+        file_path = file_.path
+        if not file_path.startswith(base_path):
+            continue
+        relative = file_path[len(base_path) :]
+        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
+            continue
+        out.append((file_path, file_.size))
+    # We sort the paths to guarantee a stable order.
+    return sorted(out)
+
+
+def get_file_info(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    ignore_missing_path: bool = False,
+) -> FileInfo:
+    """Get the file info for the provided path."""
+    try:
+        file_info = filesystem.get_file_info(path)
+    except OSError as e:
+        _handle_read_os_error(e, path)
+    if file_info.type == FileType.NotFound and not ignore_missing_path:
+        raise FileNotFoundError(path)
+
+    return file_info
+
+
+def _handle_read_os_error(
+    error: OSError,
+    paths: Union[str, List[str]],
+) -> str:
+    # NOTE: this is not comprehensive yet, and should be extended as more errors arise.
+    # NOTE: The latter patterns are raised in Arrow 10+, while the former is raised in
+    # Arrow < 10.
+    aws_error_pattern = (
+        r"^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|"
+        r"(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: "
+        r"No response body\.(.*))|"
+        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
+        r"body\.(.*))$"
+    )
+    if re.match(aws_error_pattern, str(error)):
+        # Specially handle AWS error when reading files, to give a clearer error
+        # message to avoid confusing users. The real issue is most likely that the AWS
+        # S3 file credentials have not been properly configured yet.
+        if isinstance(paths, str):
+            # Quote to highlight single file path in error message for better
+            # readability. List of file paths will be shown up as ['foo', 'boo'],
+            # so only quote single file path here.
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read AWS S3 file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "You can also run AWS CLI command to get more detailed error message "
+                "(e.g., aws s3 ls <file-name>). "
+                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "for more information."
+            )
+        )
+    else:
+        raise error
+
+
+def _is_local_windows_path(path: str) -> bool:
+    """Determines if path is a Windows file-system location."""
+    if sys.platform != "win32":
+        return False
+
+    if len(path) >= 1 and path[0] == "\\":
+        return True
+    if (
+        len(path) >= 3
+        and path[1] == ":"
+        and (path[2] == "/" or path[2] == "\\")
+        and path[0].isalpha()
+    ):
+        return True
+    return False
+
+
+def _unwrap_protocol(path):
+    """
+    Slice off any protocol prefixes on path.
+    """
+    if sys.platform == "win32" and _is_local_windows_path(path):
+        # Represent as posix path such that downstream functions properly handle it.
+        # This is executed when 'file://' is NOT included in the path.
+        return pathlib.Path(path).as_posix()
+
+    parsed = urllib.parse.urlparse(path, allow_fragments=False)  # support '#' in path
+    query = "?" + parsed.query if parsed.query else ""  # support '?' in path
+    netloc = parsed.netloc
+    if parsed.scheme == "s3" and "@" in parsed.netloc:
+        # If the path contains an @, it is assumed to be an anonymous
+        # credentialed path, and we need to strip off the credentials.
+        netloc = parsed.netloc.split("@")[-1]
+
+    parsed_path = parsed.path
+    # urlparse prepends the path with a '/'. This does not work on Windows
+    # so if this is the case strip the leading slash.
+    if (
+        sys.platform == "win32"
+        and not netloc
+        and len(parsed_path) >= 3
+        and parsed_path[0] == "/"  # The problematic leading slash
+        and parsed_path[1].isalpha()  # Ensure it is a drive letter.
+        and parsed_path[2:4] in (":", ":/")
+    ):
+        parsed_path = parsed_path[1:]
+
+    return netloc + parsed_path + query
+
+
+def _encode_url(path):
+    return urllib.parse.quote(path, safe="/:")
+
+
+def _decode_url(path):
+    return urllib.parse.unquote(path)
+
+
+def _resolve_custom_scheme(path: str) -> str:
+    """Returns the resolved path if the given path follows a Ray-specific custom
+    scheme. Othewise, returns the path unchanged.
+
+    The supported custom schemes are: "local", "example".
+    """
+    parsed_uri = urllib.parse.urlparse(path)
+    if parsed_uri.scheme == _LOCAL_SCHEME:
+        path = parsed_uri.netloc + parsed_uri.path
+    return path
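For orientation, a minimal usage sketch of the path-resolution helpers added above. The local directory path is an illustrative assumption, not part of the diff; any URI scheme pyarrow can resolve (for example s3://) should work the same way:

# Sketch only: assumes a readable local directory at /tmp/deltacat-data.
from deltacat.utils.filesystem import list_directory, resolve_path_and_filesystem

path, filesystem = resolve_path_and_filesystem("/tmp/deltacat-data")
for file_path, file_size in list_directory(path, filesystem, recursive=True):
    print(file_path, file_size)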
deltacat/utils/metafile_locator.py ADDED
@@ -0,0 +1,73 @@
+import posixpath
+import pyarrow.fs
+
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+
+"""
+Helper functions to work with deltacat metadata paths.
+TODO: Replace with direct calls to Deltacat storage interface.
+"""
+
+
+def _find_first_child_with_rev(
+    parent_path: str, filesystem: pyarrow.fs.FileSystem
+) -> str:
+    """
+    Walks the filesystem to find the first child directory with a `rev/` folder.
+
+    This is a temporary solution to locate the first Namespace and Table directories.
+    The Deltacat Storage interface will provide a more robust way to locate these directories.
+
+    param: parent_path: The parent directory to search for a child with a `rev/` folder.
+    param: filesystem: The filesystem to search for the child directory.
+    returns: The name of the first child directory with a `rev/` folder.
+    """
+    children = filesystem.get_file_info(
+        pyarrow.fs.FileSelector(parent_path, allow_not_found=True)
+    )
+    for child in children:
+        if child.type == pyarrow.fs.FileType.Directory:
+            rev_path = posixpath.join(child.path, "rev")
+            if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
+                return child.base_name
+    raise ValueError(f"No directory with 'rev/' found under {parent_path}")
+
+
+def _find_table_path(root_path: str, filesystem: pyarrow.fs.FileSystem):
+    """
+    Finds a path with structure: root/namespace_id/table_id
+    Uses _find_first_child_with_rev to determine the namespace and table ids.
+
+    param: root_path: The root directory to search for the namespace and table directories.
+    param: filesystem: The filesystem to search for the namespace and table directories.
+    returns: The path to the table directory.
+    raises: ValueError if the namespace or table directories are not found.
+    """
+    try:
+        # Find Namespace (first directory under root with rev/)
+        namespace_id = _find_first_child_with_rev(root_path, filesystem)
+        namespace_path = posixpath.join(root_path, namespace_id)
+
+        # Find Table (first directory under namespace with rev/)
+        table_id = _find_first_child_with_rev(namespace_path, filesystem)
+        return posixpath.join(namespace_path, table_id)
+
+    except ValueError as e:
+        raise ValueError(f"Failed to locate Namespace or Table: {e}") from e
+
+
+def _find_partition_path(root_path: str, locator: PartitionLocator) -> str:
+    """
+    Finds the path to the partition directory for the specified locator.
+
+    param: root_uri: The root URI of the dataset.
+    param: locator: The DeltaLocator for the delta.
+    returns: The path to the delta directory.
+    """
+    root_path, filesystem = resolve_path_and_filesystem(root_path)
+    return posixpath.join(
+        _find_table_path(root_path, filesystem),
+        locator.table_version,
+        locator.stream_id,
+    )
deltacat/utils/pyarrow.py CHANGED
@@ -1,7 +1,6 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

-import copy
 import bz2
 import gzip
 import io
@@ -47,19 +46,6 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
-OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
-
-"""
-By default, round decimal values using half_to_even round mode when
-rescaling a decimal to the given scale and precision in the schema would cause
-data loss. Setting any non null value of this argument will result
-in an error instead.
-"""
-RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
-# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
-DECIMAL256_DEFAULT_SCALE = 38
-DECIMAL256_MAX_PRECISION = 76
-MAX_INT_BYTES = 2147483646


 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -78,164 +64,45 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema


-def _extract_arrow_schema_from_read_csv_kwargs(kwargs):
-    schema = None
-    if (
-        "convert_options" in kwargs
-        and kwargs["convert_options"].column_types is not None
-    ):
-        schema = kwargs["convert_options"].column_types
-        if not isinstance(schema, pa.Schema):
-            schema = pa.schema(schema)
-        if kwargs["convert_options"].include_columns:
-            schema = _filter_schema_for_columns(
-                schema, kwargs["convert_options"].include_columns
-            )
-    elif (
-        kwargs.get("read_options") is not None
-        and kwargs["read_options"].column_names
-    ):
-        schema = _filter_schema_for_columns(
-            schema, kwargs["read_options"].column_names
-        )
-    else:
-        logger.debug(
-            "Schema not specified in the kwargs."
-            " Hence, schema could not be inferred from the empty CSV."
-        )
-
-    return schema
-
-
-def _new_schema_with_replaced_fields(
-    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
-) -> pa.Schema:
-    if schema is None:
-        return None
-
-    new_schema_fields = []
-    for field in schema:
-        new_field = field_to_replace(field)
-        if new_field is not None:
-            new_schema_fields.append(new_field)
-        else:
-            new_schema_fields.append(field)
-
-    return pa.schema(new_schema_fields, metadata=schema.metadata)
-
-
-def _read_csv_rounding_decimal_columns_to_fit_scale(
-    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
-) -> pa.Table:
-    # Note: We read decimals as strings first because CSV
-    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
-    new_schema = _new_schema_with_replaced_fields(
-        schema,
-        lambda fld: (
-            pa.field(fld.name, pa.string(), metadata=fld.metadata)
-            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-            else None
-        ),
-    )
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"],
-        reader_kwargs,
-    )
-    # Creating a shallow copy for efficiency
-    new_convert_options = copy.copy(new_kwargs["convert_options"])
-    new_convert_options.column_types = new_schema
-    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
-    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
-
-    for column_index, field in enumerate(schema):
-        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
-            column_array = arrow_table[field.name]
-            # We always cast to decimal256 to accomodate fixed scale of 38
-            cast_to_type = pa.decimal256(
-                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
-            )
-            casted_decimal_array = pc.cast(column_array, cast_to_type)
-            # Note that scale can be negative
-            rounded_column_array = pc.round(
-                casted_decimal_array, ndigits=field.type.scale
-            )
-            final_decimal_array = pc.cast(rounded_column_array, field.type)
-            arrow_table = arrow_table.set_column(
-                column_index,
-                field,
-                final_decimal_array,
-            )
-            logger.debug(
-                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
-                f" {field.type.precision} precision"
-            )
-
-    return arrow_table
-
-
-def pyarrow_read_csv_default(*args, **kwargs):
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
-    )
-
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
     try:
+        new_kwargs = sanitize_kwargs_by_supported_kwargs(
+            ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+        )
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
[16 deleted lines (old 184-199) were not captured in this rendering]
-            "Rescaling Decimal" in error_str
-            and "value would cause data loss" in error_str
+        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            schema = None
+            if (
+                "convert_options" in kwargs
+                and kwargs["convert_options"].column_types is not None
+            ):
+                schema = kwargs["convert_options"].column_types
+                if not isinstance(schema, pa.Schema):
+                    schema = pa.schema(schema)
+                if kwargs["convert_options"].include_columns:
+                    schema = _filter_schema_for_columns(
+                        schema, kwargs["convert_options"].include_columns
+                    )
+            elif (
+                kwargs.get("read_options") is not None
+                and kwargs["read_options"].column_names
             ):
-
-
-            if isinstance(args[0], io.IOBase) and args[0].seekable():
-                logger.debug(f"Seeking to the beginning of the file {args[0]}")
-                args[0].seek(0)
-            return _read_csv_rounding_decimal_columns_to_fit_scale(
-                schema=schema, reader_args=args, reader_kwargs=kwargs
+                schema = _filter_schema_for_columns(
+                    schema, kwargs["read_options"].column_names
                )
+
             else:
                 logger.debug(
-                "Schema
-                "Hence,
+                    "Schema not specified in the kwargs."
+                    " Hence, schema could not be inferred from the empty CSV."
                 )

+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
         raise e


-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
-
-    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
-    # Below ensures decimal256 is casted properly.
-    schema_includes_decimal256 = (
-        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
-        if schema is not None
-        else None
-    )
-    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
-        # falling back to expensive method of reading CSV
-        return _read_csv_rounding_decimal_columns_to_fit_scale(
-            schema, reader_args=args, reader_kwargs=kwargs
-        )
-    else:
-        return pyarrow_read_csv_default(*args, **kwargs)
-
-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -544,15 +411,6 @@ def s3_file_to_table(
     if pa_read_func_kwargs_provider is not None:
         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
-
     if (
         content_type == ContentType.PARQUET.value
         and content_encoding == ContentEncoding.IDENTITY.value
@@ -582,8 +440,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )

[2 deleted lines (old 585-586) were not captured in this rendering]
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)

     filesystem = io
     if s3_url.startswith("s3://"):
@@ -617,18 +475,7 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
-    kwargs = {}
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -641,10 +488,15 @@ def s3_file_to_parquet(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}

+    kwargs = {}
+
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system

+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")

     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
@@ -931,6 +783,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
+    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -942,7 +795,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()

     if max_str_len is not None:
-        max_elems_per_chunk =
+        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk: