deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,248 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Dict, Generic, TypeVar, Callable, Optional
|
3
|
+
from functools import singledispatchmethod
|
4
|
+
import re
|
5
|
+
|
6
|
+
from deltacat.storage.model.expression import (
|
7
|
+
Expression,
|
8
|
+
Reference,
|
9
|
+
Literal,
|
10
|
+
BinaryExpression,
|
11
|
+
UnaryExpression,
|
12
|
+
In,
|
13
|
+
Between,
|
14
|
+
Like,
|
15
|
+
)
|
16
|
+
|
17
|
+
|
18
|
+
C = TypeVar("C") # Context type
|
19
|
+
R = TypeVar("R") # Return type
|
20
|
+
|
21
|
+
|
22
|
+
class ExpressionVisitor(ABC, Generic[C, R]):
    """
    Visitor pattern for deltacat expressions.

    This base class provides two ways to implement visitors:
    1. Using a procedure dictionary (_PROCEDURES) - for simple, declarative visitors
    2. Using specialized visit_xyz methods with snake_case naming - for more control

    Subclasses need only implement visit_reference and visit_literal, plus either:
    - Define _PROCEDURES dictionary with functions for handling different expression types
    - Implement specific visit_xyz methods (using snake_case) for individual expressions

    Handler resolution order for every non-leaf expression:
    1. A specialized ``visit_<snake_case_type_name>(expr, context)`` method, if
       one exists. It receives the unvisited expression and is responsible for
       visiting whatever children it needs.
    2. A ``_PROCEDURES`` entry keyed by the expression's class name, applied to
       the already-visited children through the matching ``_apply_*`` hook.
    3. For binary/unary expressions only, the generic
       ``visit_binary_expression`` / ``visit_unary_expression`` fallback.
    """

    # Default procedure dictionary for subclasses to override
    _PROCEDURES: Dict[str, Callable] = {}

    # Zero-width match before every uppercase letter except a leading one.
    # Compiled once here instead of on every _to_snake_case call.
    _SNAKE_CASE_BOUNDARY = re.compile(r"(?<!^)(?=[A-Z])")

    def __init__(self):
        """
        Initialize visitor and validate required methods.

        Raises:
            NotImplementedError: If visit_reference or visit_literal is not a
                callable attribute of the instance.
        """
        # Pre-check for required methods
        if not callable(getattr(self, "visit_reference", None)):
            raise NotImplementedError("Subclasses must implement visit_reference")
        if not callable(getattr(self, "visit_literal", None)):
            raise NotImplementedError("Subclasses must implement visit_literal")
        self._setup_default_procedure_handlers()

    def _to_snake_case(self, name: str) -> str:
        """Convert PascalCase or camelCase to snake_case."""
        return self._SNAKE_CASE_BOUNDARY.sub("_", name).lower()

    def _setup_default_procedure_handlers(self):
        """
        Set up default procedure application methods if not overridden.

        Each ``_apply_*`` hook receives the procedure looked up in
        ``_PROCEDURES`` followed by the visited operands; the defaults simply
        call the procedure with those operands. A hook is only installed when
        the instance does not already expose a callable of that name.
        """
        defaults = {
            "_apply_binary": lambda proc, left, right: proc(left, right),
            "_apply_unary": lambda proc, operand: proc(operand),
            "_apply_in": lambda proc, value, values: proc(value, values),
            "_apply_between": lambda proc, value, lower, upper: proc(
                value, lower, upper
            ),
            "_apply_like": lambda proc, value, pattern: proc(value, pattern),
        }
        for hook_name, default_hook in defaults.items():
            if not callable(getattr(self, hook_name, None)):
                setattr(self, hook_name, default_hook)

    def _find_specialized_method(
        self, expr_type: str, fallback_name: str
    ) -> Optional[Callable]:
        """
        Look up the ``visit_<snake_case>`` handler for an expression class name.

        Args:
            expr_type: Class name of the expression being visited.
            fallback_name: Name of the generic fallback handler for this
                expression family. It is never returned as a specialized
                handler because it takes pre-visited operands, i.e. a
                different signature than ``(expr, context)``.

        Returns:
            The bound handler, or None when no usable specialized handler exists.
        """
        method_name = f"visit_{self._to_snake_case(expr_type)}"
        if method_name == fallback_name:
            return None
        method = getattr(self, method_name, None)
        return method if callable(method) else None

    @singledispatchmethod
    def visit(self, expr: Expression, context: Optional[C] = None) -> R:
        """
        Generic visit method that dispatches to specific methods based on expression type.

        Args:
            expr: The expression to visit
            context: Optional context to pass through the visitor

        Returns:
            Result of visiting the expression

        Raises:
            NotImplementedError: If no visit method is registered for the
                type of ``expr``.
        """
        expr_type = type(expr).__name__
        raise NotImplementedError(f"No visit method for type {expr_type}")

    @visit.register
    def _visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
        """Visit a Reference expression."""
        return self.visit_reference(expr, context)

    @visit.register
    def _visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
        """Visit a Literal expression."""
        return self.visit_literal(expr, context)

    @visit.register
    def _visit_binary(self, expr: BinaryExpression, context: Optional[C] = None) -> R:
        """
        Visit a binary expression using method specialization or procedures.

        Raises:
            NotImplementedError: If no specialized method, procedure, or
                fallback handles the expression type.
        """
        expr_type = type(expr).__name__

        # Check for a specialized handler before touching the children: it is
        # handed the raw expression, so visiting the operands up front would
        # only produce results that get thrown away (and visit each subtree
        # twice whenever the handler recurses itself).
        specialized = self._find_specialized_method(
            expr_type, "visit_binary_expression"
        )
        if specialized is not None:
            return specialized(expr, context)

        left_result = self.visit(expr.left, context)
        right_result = self.visit(expr.right, context)

        if expr_type in self._PROCEDURES:
            return self._apply_binary(
                self._PROCEDURES[expr_type], left_result, right_result
            )

        try:
            return self.visit_binary_expression(
                expr, left_result, right_result, context
            )
        except NotImplementedError:
            raise NotImplementedError(f"No handler for {expr_type}")

    @visit.register
    def _visit_unary(self, expr: UnaryExpression, context: Optional[C] = None) -> R:
        """
        Visit a unary expression using method specialization or procedures.

        Raises:
            NotImplementedError: If no specialized method, procedure, or
                fallback handles the expression type.
        """
        expr_type = type(expr).__name__

        # Same ordering as _visit_binary: specialized handler first, operand
        # visited only when a procedure or the generic fallback needs it.
        specialized = self._find_specialized_method(
            expr_type, "visit_unary_expression"
        )
        if specialized is not None:
            return specialized(expr, context)

        operand_result = self.visit(expr.operand, context)

        if expr_type in self._PROCEDURES:
            return self._apply_unary(self._PROCEDURES[expr_type], operand_result)

        try:
            return self.visit_unary_expression(expr, operand_result, context)
        except NotImplementedError:
            raise NotImplementedError(f"No handler for {expr_type}")

    @visit.register
    def _visit_in(self, expr: In, context: Optional[C] = None) -> R:
        """
        Visit an In expression.

        Raises:
            NotImplementedError: If neither visit_in nor an "In" procedure exists.
        """
        if hasattr(self, "visit_in"):
            return self.visit_in(expr, context)

        if "In" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            values_results = [self.visit(v, context) for v in expr.values]
            return self._apply_in(self._PROCEDURES["In"], value_result, values_results)

        raise NotImplementedError("No handler for In expression")

    @visit.register
    def _visit_between(self, expr: Between, context: Optional[C] = None) -> R:
        """
        Visit a Between expression.

        Raises:
            NotImplementedError: If neither visit_between nor a "Between"
                procedure exists.
        """
        if hasattr(self, "visit_between"):
            return self.visit_between(expr, context)

        if "Between" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            lower_result = self.visit(expr.lower, context)
            upper_result = self.visit(expr.upper, context)
            return self._apply_between(
                self._PROCEDURES["Between"], value_result, lower_result, upper_result
            )

        raise NotImplementedError("No handler for Between expression")

    @visit.register
    def _visit_like(self, expr: Like, context: Optional[C] = None) -> R:
        """
        Visit a Like expression.

        Raises:
            NotImplementedError: If neither visit_like nor a "Like" procedure exists.
        """
        if hasattr(self, "visit_like"):
            return self.visit_like(expr, context)

        if "Like" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            pattern_result = self.visit(expr.pattern, context)
            return self._apply_like(
                self._PROCEDURES["Like"], value_result, pattern_result
            )

        raise NotImplementedError("No handler for Like expression")

    @abstractmethod
    def visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
        """Visit a Reference expression."""
        pass

    @abstractmethod
    def visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
        """Visit a Literal expression."""
        pass

    def visit_binary_expression(
        self, expr: BinaryExpression, left: R, right: R, context: Optional[C] = None
    ) -> R:
        """
        Default fallback handler for binary expressions.

        Args:
            expr: The binary expression being visited.
            left: Result of visiting the left operand.
            right: Result of visiting the right operand.
            context: Optional context passed through the visitor.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"No handler for {type(expr).__name__}")

    def visit_unary_expression(
        self, expr: UnaryExpression, operand: R, context: Optional[C] = None
    ) -> R:
        """
        Default fallback handler for unary expressions.

        Args:
            expr: The unary expression being visited.
            operand: Result of visiting the operand.
            context: Optional context passed through the visitor.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"No handler for {type(expr).__name__}")
|
214
|
+
|
215
|
+
|
216
|
+
def _infix(symbol: str, parenthesize: bool = False) -> Callable:
    """Build a two-operand formatter that renders ``left <symbol> right``."""

    def render(left, right):
        text = f"{left} {symbol} {right}"
        return f"({text})" if parenthesize else text

    return render


def _render_not(operand):
    """Render logical negation of an already-formatted operand."""
    return f"NOT ({operand})"


def _render_is_null(operand):
    """Render a null check of an already-formatted operand."""
    return f"({operand}) IS NULL"


def _render_in(value, values):
    """Render membership of ``value`` in a list of already-formatted strings."""
    candidates = ", ".join(values)
    return f"{value} IN ({candidates})"


def _render_between(value, lower, upper):
    """Render a range check with the given lower and upper bounds."""
    return f"{value} BETWEEN {lower} AND {upper}"


def _render_like(value, pattern):
    """Render a pattern match of ``value`` against ``pattern``."""
    return f"{value} LIKE {pattern}"


class DisplayVisitor(ExpressionVisitor[Expression, str]):
    """
    Visitor that renders an expression tree as an infix string.

    For example: "a = b AND c > d" instead of "(AND (= a b) (> c d))".
    Rendering is driven entirely by the _PROCEDURES table below; only the two
    leaf handlers are implemented as methods.
    """

    # Expression class name -> formatter applied to the formatted operands.
    _PROCEDURES = {
        # Comparisons are emitted bare
        "Equal": _infix("="),
        "NotEqual": _infix("<>"),
        "GreaterThan": _infix(">"),
        "LessThan": _infix("<"),
        "GreaterThanEqual": _infix(">="),
        "LessThanEqual": _infix("<="),
        # Logical connectives are wrapped in parentheses
        "And": _infix("AND", parenthesize=True),
        "Or": _infix("OR", parenthesize=True),
        # Unary operations
        "Not": _render_not,
        "IsNull": _render_is_null,
        # Special operations
        "In": _render_in,
        "Between": _render_between,
        "Like": _render_like,
    }

    def visit_reference(self, expr: Reference, context=None) -> str:
        """Return the referenced field as stored on the expression."""
        return expr.field

    def visit_literal(self, expr: Literal, context=None) -> str:
        """Return ``str()`` of the literal's value."""
        # NOTE(review): the original docstring calls this the value's PyArrow
        # representation - presumably expr.value is a PyArrow scalar; confirm.
        return str(expr.value)
|
@@ -90,29 +90,23 @@ class Locator:
|
|
90
90
|
def canonical_string(self, separator: str = DEFAULT_NAME_SEPARATOR) -> str:
|
91
91
|
"""
|
92
92
|
Returns a unique string for the given locator that can be used
|
93
|
-
for equality checks
|
94
|
-
the same canonical string).
|
93
|
+
for equality checks between objects with the same parent.
|
95
94
|
"""
|
96
|
-
|
97
|
-
parent_hexdigest = self.parent.hexdigest() if self.parent else None
|
98
|
-
if parent_hexdigest:
|
99
|
-
parts.append(parent_hexdigest)
|
100
|
-
parts.extend(self.name.parts())
|
101
|
-
return separator.join([str(part) for part in parts])
|
95
|
+
return separator.join([str(part) for part in self.name.parts()])
|
102
96
|
|
103
97
|
def digest(self) -> bytes:
|
104
98
|
"""
|
105
99
|
Return a digest of the given locator that can be used for
|
106
|
-
equality checks
|
107
|
-
|
100
|
+
equality checks between objects with the same parent and uniform
|
101
|
+
random hash distribution.
|
108
102
|
"""
|
109
103
|
return sha1_digest(self.canonical_string().encode("utf-8"))
|
110
104
|
|
111
105
|
def hexdigest(self) -> str:
|
112
106
|
"""
|
113
107
|
Returns a hexdigest of the given locator suitable
|
114
|
-
|
115
|
-
|
108
|
+
equality checks between objects with the same parent and
|
109
|
+
inclusion in URLs.
|
116
110
|
"""
|
117
111
|
return sha1_hexdigest(self.canonical_string().encode("utf-8"))
|
118
112
|
|
@@ -4,12 +4,26 @@ import logging
|
|
4
4
|
import itertools
|
5
5
|
|
6
6
|
from enum import Enum
|
7
|
-
from typing import Optional, List, Dict, Any
|
7
|
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
8
8
|
from uuid import uuid4
|
9
9
|
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from deltacat.storage.model.schema import FieldLocator
|
12
|
+
|
10
13
|
from deltacat import logs
|
11
14
|
|
12
|
-
from deltacat.
|
15
|
+
from deltacat.types.media import (
|
16
|
+
ContentType,
|
17
|
+
ContentEncoding,
|
18
|
+
EXT_TO_CONTENT_TYPE,
|
19
|
+
EXT_TO_CONTENT_ENCODING,
|
20
|
+
)
|
21
|
+
|
22
|
+
import json
|
23
|
+
import pyarrow as pa
|
24
|
+
import posixpath
|
25
|
+
|
26
|
+
from deltacat.utils.filesystem import get_file_info
|
13
27
|
|
14
28
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
15
29
|
|
@@ -62,7 +76,7 @@ class EntryParams(dict):
|
|
62
76
|
|
63
77
|
@staticmethod
|
64
78
|
def of(
|
65
|
-
equality_field_locators: Optional[List[FieldLocator]] = None,
|
79
|
+
equality_field_locators: Optional[List["FieldLocator"]] = None,
|
66
80
|
) -> EntryParams:
|
67
81
|
params = EntryParams()
|
68
82
|
if equality_field_locators is not None:
|
@@ -70,7 +84,7 @@ class EntryParams(dict):
|
|
70
84
|
return params
|
71
85
|
|
72
86
|
@property
|
73
|
-
def equality_field_locators(self) -> Optional[List[FieldLocator]]:
|
87
|
+
def equality_field_locators(self) -> Optional[List["FieldLocator"]]:
|
74
88
|
return self.get("equality_field_locators")
|
75
89
|
|
76
90
|
|
@@ -116,11 +130,35 @@ class Manifest(dict):
|
|
116
130
|
content_encoding = None
|
117
131
|
credentials = None
|
118
132
|
content_type_params = None
|
133
|
+
schema_id = None
|
134
|
+
sort_scheme_id = None
|
119
135
|
if entries:
|
120
136
|
content_type = entries[0].meta.content_type
|
121
137
|
content_encoding = entries[0].meta.content_encoding
|
122
138
|
credentials = entries[0].meta.credentials
|
123
139
|
content_type_params = entries[0].meta.content_type_parameters
|
140
|
+
|
141
|
+
# Keep the latest schema ID
|
142
|
+
# Schema IDs are >= 0, and schema evolution always increments the last schema ID
|
143
|
+
entry_schema_ids = [
|
144
|
+
entry.meta.schema_id if entry.meta.schema_id is not None else -1
|
145
|
+
for entry in entries
|
146
|
+
]
|
147
|
+
max_schema_id = max(entry_schema_ids) if entry_schema_ids else -1
|
148
|
+
schema_id = max_schema_id if max_schema_id >= 0 else None
|
149
|
+
|
150
|
+
# Handle sort_scheme_id: set to None if entries have multiple different sort_scheme_ids
|
151
|
+
entry_sort_scheme_ids = set(
|
152
|
+
entry.meta.sort_scheme_id
|
153
|
+
for entry in entries
|
154
|
+
if entry.meta.sort_scheme_id is not None
|
155
|
+
)
|
156
|
+
sort_scheme_id = (
|
157
|
+
list(entry_sort_scheme_ids)[0]
|
158
|
+
if len(entry_sort_scheme_ids) == 1
|
159
|
+
else None
|
160
|
+
)
|
161
|
+
|
124
162
|
for entry in entries:
|
125
163
|
meta = entry.meta
|
126
164
|
if meta.content_type != content_type:
|
@@ -128,7 +166,7 @@ class Manifest(dict):
|
|
128
166
|
if meta.content_encoding != content_encoding:
|
129
167
|
content_encoding = None
|
130
168
|
entry_content_type = meta.content_type
|
131
|
-
if entry_content_type != content_type:
|
169
|
+
if content_type and entry_content_type != content_type:
|
132
170
|
msg = (
|
133
171
|
f"Expected all manifest entries to have content "
|
134
172
|
f"type '{content_type}' but found "
|
@@ -136,7 +174,7 @@ class Manifest(dict):
|
|
136
174
|
)
|
137
175
|
raise ValueError(msg)
|
138
176
|
entry_content_encoding = meta.get("content_encoding", None)
|
139
|
-
if entry_content_encoding != content_encoding:
|
177
|
+
if content_encoding and entry_content_encoding != content_encoding:
|
140
178
|
msg = (
|
141
179
|
f"Expected all manifest entries to have content "
|
142
180
|
f"encoding '{content_encoding}' but found "
|
@@ -188,10 +226,26 @@ class Manifest(dict):
|
|
188
226
|
content_type_parameters=content_type_params,
|
189
227
|
entry_type=entry_type,
|
190
228
|
entry_params=entry_params,
|
229
|
+
schema_id=schema_id,
|
230
|
+
sort_scheme_id=sort_scheme_id,
|
191
231
|
)
|
192
232
|
manifest = Manifest._build_manifest(meta, entries, author, uuid)
|
193
233
|
return manifest
|
194
234
|
|
235
|
+
@staticmethod
def from_json(json_string: str) -> Manifest:
    """
    Builds a Manifest from its JSON string representation.

    Args:
        json_string: JSON document with optional "entries", "author"
            and "id" keys.

    Returns:
        A Manifest created by ``Manifest.of`` from the parsed entries,
        author and ID. A missing "entries" key is treated as an empty
        entry list.
    """
    payload = json.loads(json_string)
    raw_entries = payload.get("entries", [])
    entries = ManifestEntryList.of(
        [ManifestEntry.from_dict(raw_entry) for raw_entry in raw_entries]
    )
    author = ManifestAuthor.from_dict(payload.get("author"))
    return Manifest.of(
        entries=entries,
        author=author,
        uuid=payload.get("id"),
    )
|
248
|
+
|
195
249
|
@staticmethod
|
196
250
|
def merge_manifests(
|
197
251
|
manifests: List[Manifest], author: Optional[ManifestAuthor] = None
|
@@ -240,6 +294,8 @@ class ManifestMeta(dict):
|
|
240
294
|
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
241
295
|
entry_type: Optional[EntryType] = None,
|
242
296
|
entry_params: Optional[EntryParams] = None,
|
297
|
+
schema_id: Optional[int] = None,
|
298
|
+
sort_scheme_id: Optional[str] = None,
|
243
299
|
) -> ManifestMeta:
|
244
300
|
manifest_meta = ManifestMeta()
|
245
301
|
if record_count is not None:
|
@@ -262,8 +318,31 @@ class ManifestMeta(dict):
|
|
262
318
|
)
|
263
319
|
if entry_params is not None:
|
264
320
|
manifest_meta["entry_params"] = entry_params
|
321
|
+
if schema_id is not None:
|
322
|
+
manifest_meta["schema_id"] = schema_id
|
323
|
+
if sort_scheme_id is not None:
|
324
|
+
manifest_meta["sort_scheme_id"] = sort_scheme_id
|
265
325
|
return manifest_meta
|
266
326
|
|
327
|
+
@staticmethod
def from_dict(obj: dict) -> Optional[ManifestMeta]:
    """
    Builds a ManifestMeta from a plain dictionary.

    Args:
        obj: Dictionary holding manifest metadata, or None.

    Returns:
        None when ``obj`` is None, otherwise the result of
        ``ManifestMeta.of`` called with each known metadata key read
        from ``obj`` (absent keys are passed as None).
    """
    if obj is None:
        return None

    # Each key name doubles as the matching ManifestMeta.of keyword.
    field_names = (
        "record_count",
        "content_length",
        "content_type",
        "content_encoding",
        "source_content_length",
        "credentials",
        "content_type_parameters",
        "entry_type",
        "entry_params",
        "schema_id",
        "sort_scheme_id",
    )
    return ManifestMeta.of(**{name: obj.get(name) for name in field_names})
|
345
|
+
|
267
346
|
@property
|
268
347
|
def record_count(self) -> Optional[int]:
|
269
348
|
return self.get("record_count")
|
@@ -310,6 +389,14 @@ class ManifestMeta(dict):
|
|
310
389
|
self["entry_params"] = val = EntryParams(val)
|
311
390
|
return val
|
312
391
|
|
392
|
+
@property
def schema_id(self) -> Optional[int]:
    """Value stored under the "schema_id" key, or None if absent."""
    stored_schema_id = self.get("schema_id")
    return stored_schema_id
|
395
|
+
|
396
|
+
@property
def sort_scheme_id(self) -> Optional[str]:
    """Value stored under the "sort_scheme_id" key, or None if absent."""
    stored_sort_scheme_id = self.get("sort_scheme_id")
    return stored_sort_scheme_id
|
399
|
+
|
313
400
|
|
314
401
|
class ManifestEntry(dict):
|
315
402
|
@staticmethod
|
@@ -342,6 +429,10 @@ class ManifestEntry(dict):
|
|
342
429
|
url: str,
|
343
430
|
record_count: int,
|
344
431
|
source_content_length: Optional[int] = None,
|
432
|
+
credentials: Optional[Dict[str, str]] = None,
|
433
|
+
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
434
|
+
entry_type: Optional[EntryType] = None,
|
435
|
+
entry_params: Optional[EntryParams] = None,
|
345
436
|
**s3_client_kwargs,
|
346
437
|
) -> ManifestEntry:
|
347
438
|
from deltacat.aws import s3u as s3_utils
|
@@ -354,10 +445,134 @@ class ManifestEntry(dict):
|
|
354
445
|
content_type=s3_obj["ContentType"],
|
355
446
|
content_encoding=s3_obj["ContentEncoding"],
|
356
447
|
source_content_length=source_content_length,
|
448
|
+
credentials=credentials,
|
449
|
+
content_type_parameters=content_type_parameters,
|
450
|
+
entry_type=entry_type,
|
451
|
+
entry_params=entry_params,
|
357
452
|
)
|
358
453
|
manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
|
359
454
|
return manifest_entry
|
360
455
|
|
456
|
+
@staticmethod
def from_dict(obj: dict) -> ManifestEntry:
    """
    Builds a ManifestEntry from a plain dictionary.

    Args:
        obj: Dictionary with optional "url", "uri", "meta", "mandatory"
            and "id" keys.

    Returns:
        The result of ``ManifestEntry.of``. "meta" is converted with
        ``ManifestMeta.from_dict`` and "mandatory" falls back to True
        when the key is absent.
    """
    entry_meta = ManifestMeta.from_dict(obj.get("meta"))
    is_mandatory = obj.get("mandatory", True)
    return ManifestEntry.of(
        url=obj.get("url"),
        uri=obj.get("uri"),
        meta=entry_meta,
        mandatory=is_mandatory,
        uuid=obj.get("id"),
    )
|
465
|
+
|
466
|
+
@staticmethod
def from_path(
    path: str,
    filesystem: pa.fs.FileSystem,
    record_count: int,
    source_content_length: Optional[int] = None,
    content_type: Optional[str] = None,
    content_encoding: Optional[str] = None,
    credentials: Optional[Dict[str, str]] = None,
    content_type_parameters: Optional[List[Dict[str, str]]] = None,
    entry_type: Optional[EntryType] = None,
    entry_params: Optional[EntryParams] = None,
    schema_id: Optional[int] = None,
    sort_scheme_id: Optional[str] = None,
) -> ManifestEntry:
    """
    Creates a manifest entry for a file reachable through a pyarrow
    filesystem.

    The content encoding and content type are inferred from up to two
    trailing file extensions (e.g. ``<name>.<type-ext>.<encoding-ext>``)
    unless explicit overrides are given. Unknown or missing extensions
    fall back to binary content with identity encoding.

    Args:
        path: Path to the file.
        filesystem: PyArrow filesystem used to look up the file.
        record_count: Number of records in the file.
        source_content_length: Optional original in-memory content
            length before writing to disk.
        content_type: Optional content type override. If None, the
            type is derived from the file extension.
        content_encoding: Optional content encoding override. If None,
            the encoding is derived from the file extension.
        credentials: Optional credentials required to read this
            manifest entry.
        content_type_parameters: Optional content type parameters.
        entry_type: Optional entry type of this manifest entry.
        entry_params: Optional entry type parameters.
        schema_id: Schema ID used to write this manifest entry.
        sort_scheme_id: Sort scheme ID used to write this manifest
            entry.

    Returns:
        A ManifestEntry whose URL is ``path`` and whose metadata
        carries the file size reported by the filesystem.

    Raises:
        FileNotFoundError: If ``path`` does not resolve to a regular
            file.
    """
    info = get_file_info(path, filesystem)
    if info.type != pa.fs.FileType.File:
        raise FileNotFoundError(f"Path does not point to a file: {path}")

    # Peel extensions from right to left: the last one may name an
    # encoding, in which case the one before it may name the type.
    stem, last_ext = posixpath.splitext(path)

    # Defaults for a path without any recognized extension.
    inferred_type = ContentType.BINARY
    inferred_encoding = ContentEncoding.IDENTITY

    if last_ext:
        inferred_encoding = EXT_TO_CONTENT_ENCODING.get(
            last_ext,
            ContentEncoding.IDENTITY,
        )
        # When the last extension is an encoding, the content type
        # comes from the preceding extension; otherwise the last
        # extension itself is the content type candidate.
        type_ext = (
            posixpath.splitext(stem)[1]
            if inferred_encoding != ContentEncoding.IDENTITY
            else last_ext
        )
        if type_ext:
            inferred_type = EXT_TO_CONTENT_TYPE.get(
                type_ext,
                ContentType.BINARY,
            )

    if (
        inferred_type == ContentType.BINARY
        and inferred_encoding != ContentEncoding.IDENTITY
    ):
        logger.debug(
            f"Found encoding {inferred_encoding.value} but no "
            f"content type for {path}, assuming binary"
        )

    # Explicit overrides win over values inferred from the extension.
    if content_type is None:
        resolved_content_type = inferred_type.value
    else:
        resolved_content_type = content_type
    if content_encoding is None:
        resolved_content_encoding = inferred_encoding.value
    else:
        resolved_content_encoding = content_encoding

    entry_meta = ManifestMeta.of(
        record_count=record_count,
        content_length=info.size,
        content_type=resolved_content_type,
        content_encoding=resolved_content_encoding,
        source_content_length=source_content_length,
        credentials=credentials,
        content_type_parameters=content_type_parameters,
        entry_type=entry_type,
        entry_params=entry_params,
        schema_id=schema_id,
        sort_scheme_id=sort_scheme_id,
    )
    return ManifestEntry.of(path, entry_meta)
|
575
|
+
|
361
576
|
@property
|
362
577
|
def uri(self) -> Optional[str]:
|
363
578
|
return self.get("uri")
|
@@ -392,6 +607,12 @@ class ManifestAuthor(dict):
|
|
392
607
|
manifest_author["version"] = version
|
393
608
|
return manifest_author
|
394
609
|
|
610
|
+
@staticmethod
def from_dict(obj: dict) -> Optional[ManifestAuthor]:
    """
    Builds a ManifestAuthor from a plain dictionary.

    Args:
        obj: Dictionary with optional "name" and "version" keys, or
            None.

    Returns:
        None when ``obj`` is None, otherwise the result of
        ``ManifestAuthor.of`` called with the "name" and "version"
        values.
    """
    if obj is None:
        return None
    author_name, author_version = obj.get("name"), obj.get("version")
    return ManifestAuthor.of(author_name, author_version)
|
615
|
+
|
395
616
|
@property
|
396
617
|
def name(self) -> Optional[str]:
|
397
618
|
return self.get("name")
|
@@ -416,3 +637,7 @@ class ManifestEntryList(List[ManifestEntry]):
|
|
416
637
|
if val is not None and not isinstance(val, ManifestEntry):
|
417
638
|
self[item] = val = ManifestEntry(val)
|
418
639
|
return val
|
640
|
+
|
641
|
+
def __iter__(self):
    """
    Yields every element through index access.

    Going through ``self[i]`` rather than the inherited list iterator
    makes iteration pass through ``__getitem__``, so any conversion
    done there also applies to iterated items.
    """
    yield from (self[position] for position in range(len(self)))
|