deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/storage/model/delta.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
+
import json
|
4
5
|
import posixpath
|
5
6
|
from typing import Any, Dict, List, Optional
|
6
7
|
|
@@ -48,8 +49,7 @@ class Delta(Metafile):
|
|
48
49
|
) -> Delta:
|
49
50
|
"""
|
50
51
|
Creates a Delta metadata model with the given Delta Locator, Delta Type,
|
51
|
-
manifest metadata, properties, manifest, and previous delta stream
|
52
|
-
position.
|
52
|
+
manifest metadata, properties, manifest, and previous delta stream position.
|
53
53
|
"""
|
54
54
|
delta = Delta()
|
55
55
|
delta.locator = locator
|
@@ -275,6 +275,13 @@ class Delta(Metafile):
|
|
275
275
|
return delta_locator.partition_values
|
276
276
|
return None
|
277
277
|
|
278
|
+
@property
|
279
|
+
def partition_values_json(self) -> Optional[str]:
|
280
|
+
partition_values = (
|
281
|
+
self.partition_values if self.partition_values is not None else None
|
282
|
+
)
|
283
|
+
return json.dumps(partition_values)
|
284
|
+
|
278
285
|
@property
|
279
286
|
def stream_position(self) -> Optional[int]:
|
280
287
|
delta_locator = self.locator
|
@@ -282,6 +289,13 @@ class Delta(Metafile):
|
|
282
289
|
return delta_locator.stream_position
|
283
290
|
return None
|
284
291
|
|
292
|
+
def url(self, catalog_name: Optional[str] = None) -> str:
|
293
|
+
return (
|
294
|
+
f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{self.partition_values_json}/{self.stream_position}/"
|
295
|
+
if catalog_name
|
296
|
+
else f"table://{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{self.partition_values_json}/{self.stream_position}/"
|
297
|
+
)
|
298
|
+
|
285
299
|
def to_serializable(self) -> Delta:
|
286
300
|
serializable = self
|
287
301
|
if serializable.table_locator:
|
@@ -378,7 +392,17 @@ class DeltaLocator(Locator, dict):
|
|
378
392
|
partition_values,
|
379
393
|
partition_id,
|
380
394
|
)
|
381
|
-
if
|
395
|
+
if any(
|
396
|
+
[
|
397
|
+
partition_id,
|
398
|
+
partition_values,
|
399
|
+
stream_id,
|
400
|
+
stream_format,
|
401
|
+
table_name,
|
402
|
+
table_version,
|
403
|
+
namespace,
|
404
|
+
]
|
405
|
+
)
|
382
406
|
else None
|
383
407
|
)
|
384
408
|
return DeltaLocator.of(
|
@@ -90,29 +90,23 @@ class Locator:
|
|
90
90
|
def canonical_string(self, separator: str = DEFAULT_NAME_SEPARATOR) -> str:
|
91
91
|
"""
|
92
92
|
Returns a unique string for the given locator that can be used
|
93
|
-
for equality checks
|
94
|
-
the same canonical string).
|
93
|
+
for equality checks between objects with the same parent.
|
95
94
|
"""
|
96
|
-
|
97
|
-
parent_hexdigest = self.parent.hexdigest() if self.parent else None
|
98
|
-
if parent_hexdigest:
|
99
|
-
parts.append(parent_hexdigest)
|
100
|
-
parts.extend(self.name.parts())
|
101
|
-
return separator.join([str(part) for part in parts])
|
95
|
+
return separator.join([str(part) for part in self.name.parts()])
|
102
96
|
|
103
97
|
def digest(self) -> bytes:
|
104
98
|
"""
|
105
99
|
Return a digest of the given locator that can be used for
|
106
|
-
equality checks
|
107
|
-
|
100
|
+
equality checks between objects with the same parent and uniform
|
101
|
+
random hash distribution.
|
108
102
|
"""
|
109
103
|
return sha1_digest(self.canonical_string().encode("utf-8"))
|
110
104
|
|
111
105
|
def hexdigest(self) -> str:
|
112
106
|
"""
|
113
107
|
Returns a hexdigest of the given locator suitable
|
114
|
-
|
115
|
-
|
108
|
+
equality checks between objects with the same parent and
|
109
|
+
inclusion in URLs.
|
116
110
|
"""
|
117
111
|
return sha1_hexdigest(self.canonical_string().encode("utf-8"))
|
118
112
|
|
@@ -4,14 +4,26 @@ import logging
|
|
4
4
|
import itertools
|
5
5
|
|
6
6
|
from enum import Enum
|
7
|
-
from typing import Optional, List, Dict, Any
|
7
|
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
8
8
|
from uuid import uuid4
|
9
9
|
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from deltacat.storage.model.schema import FieldLocator
|
12
|
+
|
10
13
|
from deltacat import logs
|
11
14
|
|
12
|
-
from deltacat.
|
15
|
+
from deltacat.types.media import (
|
16
|
+
ContentType,
|
17
|
+
ContentEncoding,
|
18
|
+
EXT_TO_CONTENT_TYPE,
|
19
|
+
EXT_TO_CONTENT_ENCODING,
|
20
|
+
)
|
13
21
|
|
14
22
|
import json
|
23
|
+
import pyarrow as pa
|
24
|
+
import posixpath
|
25
|
+
|
26
|
+
from deltacat.utils.filesystem import get_file_info
|
15
27
|
|
16
28
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
17
29
|
|
@@ -64,7 +76,7 @@ class EntryParams(dict):
|
|
64
76
|
|
65
77
|
@staticmethod
|
66
78
|
def of(
|
67
|
-
equality_field_locators: Optional[List[FieldLocator]] = None,
|
79
|
+
equality_field_locators: Optional[List["FieldLocator"]] = None,
|
68
80
|
) -> EntryParams:
|
69
81
|
params = EntryParams()
|
70
82
|
if equality_field_locators is not None:
|
@@ -72,7 +84,7 @@ class EntryParams(dict):
|
|
72
84
|
return params
|
73
85
|
|
74
86
|
@property
|
75
|
-
def equality_field_locators(self) -> Optional[List[FieldLocator]]:
|
87
|
+
def equality_field_locators(self) -> Optional[List["FieldLocator"]]:
|
76
88
|
return self.get("equality_field_locators")
|
77
89
|
|
78
90
|
|
@@ -118,11 +130,35 @@ class Manifest(dict):
|
|
118
130
|
content_encoding = None
|
119
131
|
credentials = None
|
120
132
|
content_type_params = None
|
133
|
+
schema_id = None
|
134
|
+
sort_scheme_id = None
|
121
135
|
if entries:
|
122
136
|
content_type = entries[0].meta.content_type
|
123
137
|
content_encoding = entries[0].meta.content_encoding
|
124
138
|
credentials = entries[0].meta.credentials
|
125
139
|
content_type_params = entries[0].meta.content_type_parameters
|
140
|
+
|
141
|
+
# Keep the latest schema ID
|
142
|
+
# Schema IDs are >= 0, and schema evolution always increments the last schema ID
|
143
|
+
entry_schema_ids = [
|
144
|
+
entry.meta.schema_id if entry.meta.schema_id is not None else -1
|
145
|
+
for entry in entries
|
146
|
+
]
|
147
|
+
max_schema_id = max(entry_schema_ids) if entry_schema_ids else -1
|
148
|
+
schema_id = max_schema_id if max_schema_id >= 0 else None
|
149
|
+
|
150
|
+
# Handle sort_scheme_id: set to None if entries have multiple different sort_scheme_ids
|
151
|
+
entry_sort_scheme_ids = set(
|
152
|
+
entry.meta.sort_scheme_id
|
153
|
+
for entry in entries
|
154
|
+
if entry.meta.sort_scheme_id is not None
|
155
|
+
)
|
156
|
+
sort_scheme_id = (
|
157
|
+
list(entry_sort_scheme_ids)[0]
|
158
|
+
if len(entry_sort_scheme_ids) == 1
|
159
|
+
else None
|
160
|
+
)
|
161
|
+
|
126
162
|
for entry in entries:
|
127
163
|
meta = entry.meta
|
128
164
|
if meta.content_type != content_type:
|
@@ -130,7 +166,7 @@ class Manifest(dict):
|
|
130
166
|
if meta.content_encoding != content_encoding:
|
131
167
|
content_encoding = None
|
132
168
|
entry_content_type = meta.content_type
|
133
|
-
if entry_content_type != content_type:
|
169
|
+
if content_type and entry_content_type != content_type:
|
134
170
|
msg = (
|
135
171
|
f"Expected all manifest entries to have content "
|
136
172
|
f"type '{content_type}' but found "
|
@@ -138,7 +174,7 @@ class Manifest(dict):
|
|
138
174
|
)
|
139
175
|
raise ValueError(msg)
|
140
176
|
entry_content_encoding = meta.get("content_encoding", None)
|
141
|
-
if entry_content_encoding != content_encoding:
|
177
|
+
if content_encoding and entry_content_encoding != content_encoding:
|
142
178
|
msg = (
|
143
179
|
f"Expected all manifest entries to have content "
|
144
180
|
f"encoding '{content_encoding}' but found "
|
@@ -190,6 +226,8 @@ class Manifest(dict):
|
|
190
226
|
content_type_parameters=content_type_params,
|
191
227
|
entry_type=entry_type,
|
192
228
|
entry_params=entry_params,
|
229
|
+
schema_id=schema_id,
|
230
|
+
sort_scheme_id=sort_scheme_id,
|
193
231
|
)
|
194
232
|
manifest = Manifest._build_manifest(meta, entries, author, uuid)
|
195
233
|
return manifest
|
@@ -256,6 +294,8 @@ class ManifestMeta(dict):
|
|
256
294
|
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
257
295
|
entry_type: Optional[EntryType] = None,
|
258
296
|
entry_params: Optional[EntryParams] = None,
|
297
|
+
schema_id: Optional[int] = None,
|
298
|
+
sort_scheme_id: Optional[str] = None,
|
259
299
|
) -> ManifestMeta:
|
260
300
|
manifest_meta = ManifestMeta()
|
261
301
|
if record_count is not None:
|
@@ -278,6 +318,10 @@ class ManifestMeta(dict):
|
|
278
318
|
)
|
279
319
|
if entry_params is not None:
|
280
320
|
manifest_meta["entry_params"] = entry_params
|
321
|
+
if schema_id is not None:
|
322
|
+
manifest_meta["schema_id"] = schema_id
|
323
|
+
if sort_scheme_id is not None:
|
324
|
+
manifest_meta["sort_scheme_id"] = sort_scheme_id
|
281
325
|
return manifest_meta
|
282
326
|
|
283
327
|
@staticmethod
|
@@ -295,6 +339,8 @@ class ManifestMeta(dict):
|
|
295
339
|
content_type_parameters=obj.get("content_type_parameters"),
|
296
340
|
entry_type=obj.get("entry_type"),
|
297
341
|
entry_params=obj.get("entry_params"),
|
342
|
+
schema_id=obj.get("schema_id"),
|
343
|
+
sort_scheme_id=obj.get("sort_scheme_id"),
|
298
344
|
)
|
299
345
|
|
300
346
|
@property
|
@@ -343,6 +389,14 @@ class ManifestMeta(dict):
|
|
343
389
|
self["entry_params"] = val = EntryParams(val)
|
344
390
|
return val
|
345
391
|
|
392
|
+
@property
|
393
|
+
def schema_id(self) -> Optional[int]:
|
394
|
+
return self.get("schema_id")
|
395
|
+
|
396
|
+
@property
|
397
|
+
def sort_scheme_id(self) -> Optional[str]:
|
398
|
+
return self.get("sort_scheme_id")
|
399
|
+
|
346
400
|
|
347
401
|
class ManifestEntry(dict):
|
348
402
|
@staticmethod
|
@@ -375,6 +429,10 @@ class ManifestEntry(dict):
|
|
375
429
|
url: str,
|
376
430
|
record_count: int,
|
377
431
|
source_content_length: Optional[int] = None,
|
432
|
+
credentials: Optional[Dict[str, str]] = None,
|
433
|
+
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
434
|
+
entry_type: Optional[EntryType] = None,
|
435
|
+
entry_params: Optional[EntryParams] = None,
|
378
436
|
**s3_client_kwargs,
|
379
437
|
) -> ManifestEntry:
|
380
438
|
from deltacat.aws import s3u as s3_utils
|
@@ -387,6 +445,10 @@ class ManifestEntry(dict):
|
|
387
445
|
content_type=s3_obj["ContentType"],
|
388
446
|
content_encoding=s3_obj["ContentEncoding"],
|
389
447
|
source_content_length=source_content_length,
|
448
|
+
credentials=credentials,
|
449
|
+
content_type_parameters=content_type_parameters,
|
450
|
+
entry_type=entry_type,
|
451
|
+
entry_params=entry_params,
|
390
452
|
)
|
391
453
|
manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
|
392
454
|
return manifest_entry
|
@@ -401,6 +463,116 @@ class ManifestEntry(dict):
|
|
401
463
|
uuid=obj.get("id"),
|
402
464
|
)
|
403
465
|
|
466
|
+
@staticmethod
|
467
|
+
def from_path(
|
468
|
+
path: str,
|
469
|
+
filesystem: pa.fs.FileSystem,
|
470
|
+
record_count: int,
|
471
|
+
source_content_length: Optional[int] = None,
|
472
|
+
content_type: Optional[str] = None,
|
473
|
+
content_encoding: Optional[str] = None,
|
474
|
+
credentials: Optional[Dict[str, str]] = None,
|
475
|
+
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
476
|
+
entry_type: Optional[EntryType] = None,
|
477
|
+
entry_params: Optional[EntryParams] = None,
|
478
|
+
schema_id: Optional[int] = None,
|
479
|
+
sort_scheme_id: Optional[str] = None,
|
480
|
+
) -> ManifestEntry:
|
481
|
+
"""
|
482
|
+
Creates a manifest entry from a path using a pyarrow filesystem.
|
483
|
+
|
484
|
+
Args:
|
485
|
+
path: Path to the file
|
486
|
+
filesystem: PyArrow filesystem to use for accessing the file
|
487
|
+
record_count: Number of records in the file
|
488
|
+
source_content_length: Optional original content length in-memory
|
489
|
+
before writing to disk.
|
490
|
+
content_type: Optional content type override. If not provided, will
|
491
|
+
be derived from file extension.
|
492
|
+
content_encoding: Optional content encoding override. If not
|
493
|
+
provided, will be derived from file extension.
|
494
|
+
credentials: Optional credentials required to read this manifest entry.
|
495
|
+
content_type_parameters: Optional content type parameters.
|
496
|
+
entry_type: Optional entry type of this manifest entry. Defaults to DATA.
|
497
|
+
entry_params: Optional entry type parameters.
|
498
|
+
schema_id: Schema ID used to write this manifest entry.
|
499
|
+
sort_scheme_id: Sort scheme ID used to write this manifest entry.
|
500
|
+
|
501
|
+
Returns:
|
502
|
+
A ManifestEntry instance
|
503
|
+
"""
|
504
|
+
file_info = get_file_info(path, filesystem)
|
505
|
+
if file_info.type != pa.fs.FileType.File:
|
506
|
+
raise FileNotFoundError(f"Path does not point to a file: {path}")
|
507
|
+
|
508
|
+
# Extract extensions from right to left
|
509
|
+
# First split will get potential encoding extension
|
510
|
+
base_path, ext1 = posixpath.splitext(path)
|
511
|
+
|
512
|
+
# Initialize with defaults for no extensions
|
513
|
+
derived_content_type = ContentType.BINARY
|
514
|
+
derived_content_encoding = ContentEncoding.IDENTITY
|
515
|
+
|
516
|
+
# Only proceed with extension checks if we found at least one extension
|
517
|
+
if ext1:
|
518
|
+
# Check if the first extension is a known encoding
|
519
|
+
derived_content_encoding = EXT_TO_CONTENT_ENCODING.get(
|
520
|
+
ext1,
|
521
|
+
ContentEncoding.IDENTITY,
|
522
|
+
)
|
523
|
+
|
524
|
+
# Get second extension only if first was an encoding
|
525
|
+
if derived_content_encoding != ContentEncoding.IDENTITY:
|
526
|
+
# Second split will get potential content type extension
|
527
|
+
_, ext2 = posixpath.splitext(base_path)
|
528
|
+
if ext2:
|
529
|
+
derived_content_type = EXT_TO_CONTENT_TYPE.get(
|
530
|
+
ext2,
|
531
|
+
ContentType.BINARY,
|
532
|
+
)
|
533
|
+
else:
|
534
|
+
# First extension wasn't an encoding, check if it's a
|
535
|
+
# content type
|
536
|
+
derived_content_type = EXT_TO_CONTENT_TYPE.get(
|
537
|
+
ext1,
|
538
|
+
ContentType.BINARY,
|
539
|
+
)
|
540
|
+
|
541
|
+
if (
|
542
|
+
derived_content_type == ContentType.BINARY
|
543
|
+
and derived_content_encoding != ContentEncoding.IDENTITY
|
544
|
+
):
|
545
|
+
logger.debug(
|
546
|
+
f"Found encoding {derived_content_encoding.value} but no "
|
547
|
+
f"content type for {path}, assuming binary"
|
548
|
+
)
|
549
|
+
|
550
|
+
# Use provided values if available, otherwise use derived values
|
551
|
+
final_content_type = (
|
552
|
+
content_type if content_type is not None else derived_content_type.value
|
553
|
+
)
|
554
|
+
final_content_encoding = (
|
555
|
+
content_encoding
|
556
|
+
if content_encoding is not None
|
557
|
+
else derived_content_encoding.value
|
558
|
+
)
|
559
|
+
|
560
|
+
manifest_entry_meta = ManifestMeta.of(
|
561
|
+
record_count=record_count,
|
562
|
+
content_length=file_info.size,
|
563
|
+
content_type=final_content_type,
|
564
|
+
content_encoding=final_content_encoding,
|
565
|
+
source_content_length=source_content_length,
|
566
|
+
credentials=credentials,
|
567
|
+
content_type_parameters=content_type_parameters,
|
568
|
+
entry_type=entry_type,
|
569
|
+
entry_params=entry_params,
|
570
|
+
schema_id=schema_id,
|
571
|
+
sort_scheme_id=sort_scheme_id,
|
572
|
+
)
|
573
|
+
manifest_entry = ManifestEntry.of(path, manifest_entry_meta)
|
574
|
+
return manifest_entry
|
575
|
+
|
404
576
|
@property
|
405
577
|
def uri(self) -> Optional[str]:
|
406
578
|
return self.get("uri")
|
@@ -465,3 +637,7 @@ class ManifestEntryList(List[ManifestEntry]):
|
|
465
637
|
if val is not None and not isinstance(val, ManifestEntry):
|
466
638
|
self[item] = val = ManifestEntry(val)
|
467
639
|
return val
|
640
|
+
|
641
|
+
def __iter__(self):
|
642
|
+
for i in range(len(self)):
|
643
|
+
yield self[i] # This triggers __getitem__ conversion
|