deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,50 @@
|
|
1
|
-
|
1
|
+
import logging
|
2
|
+
from typing import Optional, Any, Set
|
2
3
|
|
3
4
|
from pyiceberg.catalog import Catalog
|
4
|
-
from
|
5
|
+
from pyiceberg.table import Table
|
6
|
+
import deltacat.logs as logs
|
7
|
+
|
8
|
+
from deltacat.storage.model.scan.push_down import Pushdown, PartitionFilter
|
5
9
|
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
6
10
|
from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
|
7
11
|
from deltacat.storage.util.scan_planner import ScanPlanner
|
8
12
|
from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
|
13
|
+
from deltacat.experimental.storage.iceberg.visitor import IcebergExpressionVisitor
|
14
|
+
|
15
|
+
# Initialize DeltaCAT logger
|
16
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
9
17
|
|
10
18
|
|
11
19
|
class IcebergScanPlanner(ScanPlanner):
|
12
20
|
def __init__(self, catalog: Catalog):
|
13
21
|
self.catalog = catalog
|
22
|
+
self.expression_visitor = IcebergExpressionVisitor()
|
23
|
+
|
24
|
+
@classmethod
|
25
|
+
def _collect_filter_fields(cls, expr: Any) -> Set[str]:
|
26
|
+
"""
|
27
|
+
Collects all field names referenced in the filter expression.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
expr: The expression to analyze
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
Set of field names referenced in the expression
|
34
|
+
"""
|
35
|
+
fields = set()
|
36
|
+
if hasattr(expr, "field"):
|
37
|
+
fields.add(expr.field)
|
38
|
+
if hasattr(expr, "left"):
|
39
|
+
fields.update(cls._collect_filter_fields(expr.left))
|
40
|
+
if hasattr(expr, "right"):
|
41
|
+
fields.update(cls._collect_filter_fields(expr.right))
|
42
|
+
if hasattr(expr, "expr"):
|
43
|
+
fields.update(cls._collect_filter_fields(expr.expr))
|
44
|
+
if hasattr(expr, "values"):
|
45
|
+
for value in expr.values:
|
46
|
+
fields.update(cls._collect_filter_fields(value))
|
47
|
+
return fields
|
14
48
|
|
15
49
|
def create_scan_plan(
|
16
50
|
self,
|
@@ -21,8 +55,75 @@ class IcebergScanPlanner(ScanPlanner):
|
|
21
55
|
iceberg_table = _try_load_iceberg_table(
|
22
56
|
self.catalog, namespace=namespace, table_name=table_name
|
23
57
|
)
|
58
|
+
|
59
|
+
# TODO: implement row, column predicate pushdown to Iceberg
|
60
|
+
|
61
|
+
# Get the partition spec
|
62
|
+
partition_spec = iceberg_table.spec()
|
63
|
+
|
64
|
+
# Check if the table is partitioned
|
65
|
+
is_partitioned = len(partition_spec.fields) > 0
|
66
|
+
|
67
|
+
scan = iceberg_table.scan()
|
68
|
+
if is_partitioned:
|
69
|
+
if pushdown and pushdown.partition_filter:
|
70
|
+
filter_fields = self._collect_filter_fields(pushdown.partition_filter)
|
71
|
+
logger.info(
|
72
|
+
f"Pushdown partition filter is enabled, converting to Iceberg. Fields discovered in filter: {', '.join(sorted(filter_fields))}"
|
73
|
+
)
|
74
|
+
# Handle partition filter if present, DeltaCAT only supports partition-level filters right now
|
75
|
+
iceberg_expression = self._convert_partition_filter(
|
76
|
+
iceberg_table, pushdown.partition_filter
|
77
|
+
)
|
78
|
+
scan = scan.filter(iceberg_expression)
|
79
|
+
|
24
80
|
file_scan_tasks = []
|
25
|
-
|
26
|
-
for scan_task in iceberg_table.scan().plan_files():
|
81
|
+
for scan_task in scan.plan_files():
|
27
82
|
file_scan_tasks.append(FileScanTask([DataFile(scan_task.file.file_path)]))
|
28
83
|
return ScanPlan(file_scan_tasks)
|
84
|
+
|
85
|
+
@classmethod
|
86
|
+
def _validate_partition_references(
|
87
|
+
cls, expr: Any, partition_cols: Set[str]
|
88
|
+
) -> None:
|
89
|
+
"""
|
90
|
+
Validates that the expression only references partition columns.
|
91
|
+
|
92
|
+
Args:
|
93
|
+
expr: The expression to validate
|
94
|
+
partition_cols: Set of valid partition column names
|
95
|
+
|
96
|
+
Raises:
|
97
|
+
ValueError: If the expression references a non-partition column
|
98
|
+
"""
|
99
|
+
if hasattr(expr, "field"): # Reference type expression
|
100
|
+
if expr.field not in partition_cols:
|
101
|
+
raise ValueError(
|
102
|
+
f"Filter references non-partition column: {expr.field}. "
|
103
|
+
f"Partition columns are: {partition_cols}"
|
104
|
+
)
|
105
|
+
# Recursively validate nested expressions
|
106
|
+
if hasattr(expr, "left"):
|
107
|
+
cls._validate_partition_references(expr.left, partition_cols)
|
108
|
+
if hasattr(expr, "right"):
|
109
|
+
cls._validate_partition_references(expr.right, partition_cols)
|
110
|
+
if hasattr(expr, "expr"):
|
111
|
+
cls._validate_partition_references(expr.expr, partition_cols)
|
112
|
+
if hasattr(expr, "values"):
|
113
|
+
for value in expr.values:
|
114
|
+
cls._validate_partition_references(value, partition_cols)
|
115
|
+
|
116
|
+
def _convert_partition_filter(
|
117
|
+
self, table: Table, partition_filter: PartitionFilter
|
118
|
+
):
|
119
|
+
"""
|
120
|
+
Convert DeltaCAT partition filter to PyIceberg expression,
|
121
|
+
validating that only partition columns are referenced.
|
122
|
+
"""
|
123
|
+
partition_cols = set(field.name for field in table.spec().fields)
|
124
|
+
|
125
|
+
# Validate before converting
|
126
|
+
self._validate_partition_references(partition_filter, partition_cols)
|
127
|
+
|
128
|
+
# Convert to PyIceberg expression
|
129
|
+
return self.expression_visitor.visit(partition_filter)
|
@@ -41,7 +41,7 @@ from deltacat.experimental.storage.iceberg.model import (
|
|
41
41
|
NamespaceMapper,
|
42
42
|
TableMapper,
|
43
43
|
)
|
44
|
-
from deltacat.types.media import ContentType, StorageType,
|
44
|
+
from deltacat.types.media import ContentType, StorageType, DatasetType
|
45
45
|
from deltacat.utils.common import ReadKwargsProvider
|
46
46
|
|
47
47
|
from pyiceberg.catalog import Catalog
|
@@ -281,7 +281,7 @@ def get_latest_delta(
|
|
281
281
|
|
282
282
|
def download_delta(
|
283
283
|
delta_like: Union[Delta, DeltaLocator],
|
284
|
-
table_type:
|
284
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
285
285
|
storage_type: StorageType = StorageType.DISTRIBUTED,
|
286
286
|
max_parallelism: Optional[int] = None,
|
287
287
|
columns: Optional[List[str]] = None,
|
@@ -303,7 +303,7 @@ def download_delta(
|
|
303
303
|
def download_delta_manifest_entry(
|
304
304
|
delta_like: Union[Delta, DeltaLocator],
|
305
305
|
entry_index: int,
|
306
|
-
table_type:
|
306
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
307
307
|
columns: Optional[List[str]] = None,
|
308
308
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
309
309
|
*args,
|
@@ -603,6 +603,8 @@ def stage_delta(
|
|
603
603
|
properties: Optional[DeltaProperties] = None,
|
604
604
|
s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
605
605
|
content_type: ContentType = ContentType.PARQUET,
|
606
|
+
schema: Optional[Schema] = None,
|
607
|
+
sort_scheme_id: Optional[str] = None,
|
606
608
|
*args,
|
607
609
|
**kwargs,
|
608
610
|
) -> Delta:
|
@@ -66,6 +66,7 @@ from deltacat.storage import (
|
|
66
66
|
TableVersionLocator,
|
67
67
|
Transform,
|
68
68
|
TransformName,
|
69
|
+
TruncateStrategy,
|
69
70
|
TruncateTransform,
|
70
71
|
TruncateTransformParameters,
|
71
72
|
UnknownTransform,
|
@@ -227,7 +228,10 @@ class TransformMapper(ModelMapper[IcebergTransform, Transform]):
|
|
227
228
|
)
|
228
229
|
if isinstance(obj, IcebergTruncateTransform):
|
229
230
|
return TruncateTransform.of(
|
230
|
-
TruncateTransformParameters.of(
|
231
|
+
TruncateTransformParameters.of(
|
232
|
+
width=obj.width,
|
233
|
+
truncate_strategy=TruncateStrategy.ICEBERG,
|
234
|
+
),
|
231
235
|
)
|
232
236
|
return UnknownTransform.of()
|
233
237
|
|
@@ -323,7 +327,7 @@ class PartitionSchemeMapper(ModelMapper[PartitionSpec, PartitionScheme]):
|
|
323
327
|
elif not schema:
|
324
328
|
err_msg = "Schema is required for Partition Spec conversion."
|
325
329
|
raise ValueError(err_msg)
|
326
|
-
keys = [PartitionKeyMapper.map(field, schema) for field in obj.fields]
|
330
|
+
keys = [PartitionKeyMapper.map(field, schema) for field in obj.fields] or None
|
327
331
|
return PartitionScheme.of(
|
328
332
|
keys=keys,
|
329
333
|
name=name,
|
@@ -425,7 +429,7 @@ class SortSchemeMapper(ModelMapper[IcebergSortOrder, SortScheme]):
|
|
425
429
|
elif not schema:
|
426
430
|
err_msg = "Schema is required for Sort Order conversion."
|
427
431
|
raise ValueError(err_msg)
|
428
|
-
keys = [SortKeyMapper.map(field, schema) for field in obj.fields]
|
432
|
+
keys = [SortKeyMapper.map(field, schema) for field in obj.fields] or None
|
429
433
|
return SortScheme.of(
|
430
434
|
keys=keys,
|
431
435
|
name=name,
|
@@ -0,0 +1,119 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import pyarrow
|
5
|
+
from deltacat.storage.model.scan.push_down import PartitionFilter
|
6
|
+
|
7
|
+
import deltacat.logs as logs
|
8
|
+
from deltacat.storage.model.expression import Reference, Literal
|
9
|
+
from deltacat.storage.model.expression.visitor import ExpressionVisitor
|
10
|
+
from pyiceberg.expressions import (
|
11
|
+
And,
|
12
|
+
Or,
|
13
|
+
Not,
|
14
|
+
EqualTo,
|
15
|
+
NotEqualTo,
|
16
|
+
GreaterThan,
|
17
|
+
GreaterThanOrEqual,
|
18
|
+
LessThan,
|
19
|
+
LessThanOrEqual,
|
20
|
+
IsNull,
|
21
|
+
In,
|
22
|
+
)
|
23
|
+
|
24
|
+
# Initialize DeltaCAT logger
|
25
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
|
+
|
27
|
+
|
28
|
+
class IcebergExpressionVisitor(ExpressionVisitor[None, Any]):
|
29
|
+
"""
|
30
|
+
Visitor that translates DeltaCAT expressions to PyIceberg expressions.
|
31
|
+
"""
|
32
|
+
|
33
|
+
def visit(self, expr, context=None):
|
34
|
+
# Handle PartitionFilter by extracting and visiting the inner expression
|
35
|
+
if isinstance(expr, PartitionFilter):
|
36
|
+
return self.visit(expr.expr, context)
|
37
|
+
# Handle all other expressions using the parent's visit method
|
38
|
+
return super().visit(expr, context)
|
39
|
+
|
40
|
+
def visit_reference(self, expr: Reference, context=None) -> str:
|
41
|
+
return expr.field
|
42
|
+
|
43
|
+
def visit_literal(self, expr: Literal, context=None) -> Any:
|
44
|
+
# Convert PyArrow scalar to Python native type
|
45
|
+
return (
|
46
|
+
expr.value.as_py() if isinstance(expr.value, pyarrow.Scalar) else expr.value
|
47
|
+
)
|
48
|
+
|
49
|
+
def visit_and(self, expr, context=None):
|
50
|
+
left = self.visit(expr.left, context)
|
51
|
+
right = self.visit(expr.right, context)
|
52
|
+
return And(left, right)
|
53
|
+
|
54
|
+
def visit_or(self, expr, context=None):
|
55
|
+
left = self.visit(expr.left, context)
|
56
|
+
right = self.visit(expr.right, context)
|
57
|
+
return Or(left, right)
|
58
|
+
|
59
|
+
def visit_not(self, expr, context=None):
|
60
|
+
operand = self.visit(expr.operand, context)
|
61
|
+
return Not(operand)
|
62
|
+
|
63
|
+
def visit_equal(self, expr, context=None):
|
64
|
+
left = self.visit(expr.left, context)
|
65
|
+
right = self.visit(expr.right, context)
|
66
|
+
return EqualTo(left, right)
|
67
|
+
|
68
|
+
def visit_not_equal(self, expr, context=None):
|
69
|
+
left = self.visit(expr.left, context)
|
70
|
+
right = self.visit(expr.right, context)
|
71
|
+
return NotEqualTo(left, right)
|
72
|
+
|
73
|
+
def visit_greater_than(self, expr, context=None):
|
74
|
+
left = self.visit(expr.left, context)
|
75
|
+
right = self.visit(expr.right, context)
|
76
|
+
return GreaterThan(left, right)
|
77
|
+
|
78
|
+
def visit_greater_than_equal(self, expr, context=None):
|
79
|
+
left = self.visit(expr.left, context)
|
80
|
+
right = self.visit(expr.right, context)
|
81
|
+
return GreaterThanOrEqual(left, right)
|
82
|
+
|
83
|
+
def visit_less_than(self, expr, context=None):
|
84
|
+
left = self.visit(expr.left, context)
|
85
|
+
right = self.visit(expr.right, context)
|
86
|
+
return LessThan(left, right)
|
87
|
+
|
88
|
+
def visit_less_than_equal(self, expr, context=None):
|
89
|
+
left = self.visit(expr.left, context)
|
90
|
+
right = self.visit(expr.right, context)
|
91
|
+
return LessThanOrEqual(left, right)
|
92
|
+
|
93
|
+
def visit_is_null(self, expr, context=None):
|
94
|
+
operand = self.visit(expr.operand, context)
|
95
|
+
return IsNull(operand)
|
96
|
+
|
97
|
+
def visit_in(self, expr, context=None):
|
98
|
+
value = self.visit(expr.value, context)
|
99
|
+
values = [self.visit(v, context) for v in expr.values]
|
100
|
+
return In(value, values)
|
101
|
+
|
102
|
+
def visit_between(self, expr, context=None):
|
103
|
+
value = self.visit(expr.value, context)
|
104
|
+
lower = self.visit(expr.lower, context)
|
105
|
+
upper = self.visit(expr.upper, context)
|
106
|
+
return And(GreaterThanOrEqual(value, lower), LessThanOrEqual(value, upper))
|
107
|
+
|
108
|
+
# PyIceberg does not have a direct equivalent of LIKE
|
109
|
+
def visit_like(self, expr, context=None):
|
110
|
+
value = self.visit(expr.value, context)
|
111
|
+
pattern = self.visit(expr.pattern, context)
|
112
|
+
logger.warning(
|
113
|
+
f"LIKE operation is not supported in PyIceberg. Ignoring LIKE filter: {value} LIKE '{pattern}'. "
|
114
|
+
"This may result in more data being returned than expected."
|
115
|
+
)
|
116
|
+
# Return None or a default expression that won't filter anything
|
117
|
+
return (
|
118
|
+
None # or return NotEqualTo(value, None) # matches everything except NULL
|
119
|
+
)
|
@@ -52,7 +52,6 @@ from deltacat.storage import (
|
|
52
52
|
TableVersion,
|
53
53
|
TableVersionLocator,
|
54
54
|
Transaction,
|
55
|
-
TransactionType,
|
56
55
|
TransactionOperation,
|
57
56
|
TransactionOperationType,
|
58
57
|
)
|
@@ -299,7 +298,6 @@ class Dataset:
|
|
299
298
|
partition_values=DEFAULT_PARTITION_VALUES,
|
300
299
|
partition_id=self._partition_id,
|
301
300
|
),
|
302
|
-
schema=None,
|
303
301
|
content_types=None,
|
304
302
|
),
|
305
303
|
]
|
@@ -312,7 +310,6 @@ class Dataset:
|
|
312
310
|
]
|
313
311
|
|
314
312
|
transaction = Transaction.of(
|
315
|
-
txn_type=TransactionType.APPEND,
|
316
313
|
txn_operations=TransactionOperationList.of(txn_operations),
|
317
314
|
)
|
318
315
|
|
@@ -10,7 +10,6 @@ from deltacat.storage import (
|
|
10
10
|
Delta,
|
11
11
|
DeltaType,
|
12
12
|
Transaction,
|
13
|
-
TransactionType,
|
14
13
|
TransactionOperation,
|
15
14
|
TransactionOperationType,
|
16
15
|
)
|
@@ -169,7 +168,6 @@ class DeltacatManifestIO(ManifestIO):
|
|
169
168
|
delta["level"] = level
|
170
169
|
|
171
170
|
tx_results = Transaction.of(
|
172
|
-
txn_type=TransactionType.APPEND,
|
173
171
|
txn_operations=TransactionOperationList.of(
|
174
172
|
[
|
175
173
|
TransactionOperation.of(
|
@@ -5,6 +5,7 @@ from typing import Generator, Optional
|
|
5
5
|
import pyarrow
|
6
6
|
import pyarrow.fs
|
7
7
|
|
8
|
+
from deltacat.constants import REV_DIR_NAME
|
8
9
|
from deltacat.storage import Delta
|
9
10
|
from deltacat.storage.model.partition import PartitionLocator
|
10
11
|
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
@@ -83,7 +84,7 @@ class DatasetMetastore:
|
|
83
84
|
param: filesystem: The filesystem to search for the revisions.
|
84
85
|
returns: The latest revision as a RivuletDelta.
|
85
86
|
"""
|
86
|
-
rev_directory = posixpath.join(delta_dir,
|
87
|
+
rev_directory = posixpath.join(delta_dir, REV_DIR_NAME)
|
87
88
|
revisions = filesystem.get_file_info(
|
88
89
|
pyarrow.fs.FileSelector(rev_directory, allow_not_found=True)
|
89
90
|
)
|
@@ -128,7 +129,7 @@ class DatasetMetastore:
|
|
128
129
|
return
|
129
130
|
|
130
131
|
# Locate "rev" directory inside the partition
|
131
|
-
rev_directory = posixpath.join(partition_path,
|
132
|
+
rev_directory = posixpath.join(partition_path, REV_DIR_NAME)
|
132
133
|
rev_info = filesystem.get_file_info(rev_directory)
|
133
134
|
|
134
135
|
if rev_info.type != pyarrow.fs.FileType.Directory:
|
@@ -350,7 +350,6 @@ class DeltaCatDatasource(Datasource):
|
|
350
350
|
]
|
351
351
|
elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST:
|
352
352
|
# do a shallow read of the top-level DeltaCAT metadata
|
353
|
-
print(f"listers: {self._reader.listers}")
|
354
353
|
listers = copy.deepcopy(self._reader.listers)
|
355
354
|
listers = [listers[0]]
|
356
355
|
read_tasks = self._list_all_metafiles_read_tasks(
|
deltacat/storage/__init__.py
CHANGED
@@ -20,6 +20,9 @@ from deltacat.storage.model.metafile import (
|
|
20
20
|
from deltacat.storage.model.transaction import (
|
21
21
|
TransactionOperation,
|
22
22
|
Transaction,
|
23
|
+
read_transaction,
|
24
|
+
transactions,
|
25
|
+
transaction,
|
23
26
|
)
|
24
27
|
from deltacat.storage.model.namespace import (
|
25
28
|
Namespace,
|
@@ -31,6 +34,7 @@ from deltacat.storage.model.partition import (
|
|
31
34
|
PartitionLocator,
|
32
35
|
PartitionLocatorAlias,
|
33
36
|
PartitionKey,
|
37
|
+
PartitionKeyList,
|
34
38
|
PartitionScheme,
|
35
39
|
PartitionSchemeList,
|
36
40
|
PartitionValues,
|
@@ -43,6 +47,9 @@ from deltacat.storage.model.schema import (
|
|
43
47
|
NestedFieldName,
|
44
48
|
Schema,
|
45
49
|
SchemaList,
|
50
|
+
SchemaUpdate,
|
51
|
+
SchemaUpdateOperation,
|
52
|
+
SchemaUpdateOperations,
|
46
53
|
)
|
47
54
|
from deltacat.storage.model.stream import (
|
48
55
|
Stream,
|
@@ -75,6 +82,7 @@ from deltacat.storage.model.transform import (
|
|
75
82
|
MonthTransform,
|
76
83
|
YearTransform,
|
77
84
|
TruncateTransform,
|
85
|
+
TruncateStrategy,
|
78
86
|
)
|
79
87
|
from deltacat.storage.model.types import (
|
80
88
|
CommitState,
|
@@ -88,11 +96,12 @@ from deltacat.storage.model.types import (
|
|
88
96
|
SchemaConsistencyType,
|
89
97
|
StreamFormat,
|
90
98
|
SortOrder,
|
91
|
-
TransactionType,
|
92
99
|
TransactionOperationType,
|
100
|
+
TransactionStatus,
|
93
101
|
)
|
94
102
|
from deltacat.storage.model.sort_key import (
|
95
103
|
SortKey,
|
104
|
+
SortKeyList,
|
96
105
|
SortScheme,
|
97
106
|
SortSchemeList,
|
98
107
|
)
|
@@ -138,6 +147,7 @@ __all__ = [
|
|
138
147
|
"NullOrder",
|
139
148
|
"Partition",
|
140
149
|
"PartitionKey",
|
150
|
+
"PartitionKeyList",
|
141
151
|
"PartitionLocator",
|
142
152
|
"PartitionLocatorAlias",
|
143
153
|
"PartitionScheme",
|
@@ -145,8 +155,12 @@ __all__ = [
|
|
145
155
|
"PartitionValues",
|
146
156
|
"Schema",
|
147
157
|
"SchemaList",
|
158
|
+
"SchemaUpdate",
|
159
|
+
"SchemaUpdateOperation",
|
160
|
+
"SchemaUpdateOperations",
|
148
161
|
"SchemaConsistencyType",
|
149
162
|
"SortKey",
|
163
|
+
"SortKeyList",
|
150
164
|
"SortOrder",
|
151
165
|
"SortScheme",
|
152
166
|
"SortSchemeList",
|
@@ -163,13 +177,17 @@ __all__ = [
|
|
163
177
|
"Transaction",
|
164
178
|
"TransactionOperation",
|
165
179
|
"TransactionOperationType",
|
166
|
-
"TransactionType",
|
180
|
+
"TransactionStatus",
|
167
181
|
"Transform",
|
168
182
|
"TransformName",
|
169
183
|
"TransformParameters",
|
170
184
|
"TruncateTransform",
|
171
185
|
"TruncateTransformParameters",
|
186
|
+
"TruncateStrategy",
|
172
187
|
"UnknownTransform",
|
173
188
|
"VoidTransform",
|
174
189
|
"YearTransform",
|
190
|
+
"read_transaction",
|
191
|
+
"transactions",
|
192
|
+
"transaction",
|
175
193
|
]
|