deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import pyarrow
|
5
|
+
from deltacat.storage.model.scan.push_down import PartitionFilter
|
6
|
+
|
7
|
+
import deltacat.logs as logs
|
8
|
+
from deltacat.storage.model.expression import Reference, Literal
|
9
|
+
from deltacat.storage.model.expression.visitor import ExpressionVisitor
|
10
|
+
from pyiceberg.expressions import (
|
11
|
+
And,
|
12
|
+
Or,
|
13
|
+
Not,
|
14
|
+
EqualTo,
|
15
|
+
NotEqualTo,
|
16
|
+
GreaterThan,
|
17
|
+
GreaterThanOrEqual,
|
18
|
+
LessThan,
|
19
|
+
LessThanOrEqual,
|
20
|
+
IsNull,
|
21
|
+
In,
|
22
|
+
)
|
23
|
+
|
24
|
+
# Initialize DeltaCAT logger
|
25
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
|
+
|
27
|
+
|
28
|
+
class IcebergExpressionVisitor(ExpressionVisitor[None, Any]):
    """
    Translates a DeltaCAT expression tree into the equivalent PyIceberg expression.

    Every ``visit_*`` method first visits the operands of the DeltaCAT node
    (in left-to-right order) and then wraps the translated operands in the
    matching PyIceberg expression class.
    """

    def visit(self, expr, context=None):
        """
        Entry point for translating ``expr``.

        A ``PartitionFilter`` is unwrapped and its inner ``expr`` attribute is
        visited instead; anything else is dispatched by the parent class.
        """
        if not isinstance(expr, PartitionFilter):
            # Regular expression node: defer to the base visitor's dispatch.
            return super().visit(expr, context)
        # PartitionFilter is only a wrapper; translate the expression it holds.
        return self.visit(expr.expr, context)

    def visit_reference(self, expr: Reference, context=None) -> str:
        """Return the field name held by the reference."""
        return expr.field

    def visit_literal(self, expr: Literal, context=None) -> Any:
        """Return the literal's value, unboxing PyArrow scalars to Python objects."""
        raw = expr.value
        if isinstance(raw, pyarrow.Scalar):
            # Convert PyArrow scalar to Python native type
            return raw.as_py()
        return raw

    def visit_and(self, expr, context=None):
        """Translate a conjunction into a PyIceberg ``And``."""
        return And(self.visit(expr.left, context), self.visit(expr.right, context))

    def visit_or(self, expr, context=None):
        """Translate a disjunction into a PyIceberg ``Or``."""
        return Or(self.visit(expr.left, context), self.visit(expr.right, context))

    def visit_not(self, expr, context=None):
        """Translate a negation into a PyIceberg ``Not``."""
        return Not(self.visit(expr.operand, context))

    def visit_equal(self, expr, context=None):
        """Translate ``left == right`` into a PyIceberg ``EqualTo``."""
        return EqualTo(self.visit(expr.left, context), self.visit(expr.right, context))

    def visit_not_equal(self, expr, context=None):
        """Translate ``left != right`` into a PyIceberg ``NotEqualTo``."""
        return NotEqualTo(
            self.visit(expr.left, context), self.visit(expr.right, context)
        )

    def visit_greater_than(self, expr, context=None):
        """Translate ``left > right`` into a PyIceberg ``GreaterThan``."""
        return GreaterThan(
            self.visit(expr.left, context), self.visit(expr.right, context)
        )

    def visit_greater_than_equal(self, expr, context=None):
        """Translate ``left >= right`` into a PyIceberg ``GreaterThanOrEqual``."""
        return GreaterThanOrEqual(
            self.visit(expr.left, context), self.visit(expr.right, context)
        )

    def visit_less_than(self, expr, context=None):
        """Translate ``left < right`` into a PyIceberg ``LessThan``."""
        return LessThan(self.visit(expr.left, context), self.visit(expr.right, context))

    def visit_less_than_equal(self, expr, context=None):
        """Translate ``left <= right`` into a PyIceberg ``LessThanOrEqual``."""
        return LessThanOrEqual(
            self.visit(expr.left, context), self.visit(expr.right, context)
        )

    def visit_is_null(self, expr, context=None):
        """Translate a null check into a PyIceberg ``IsNull``."""
        return IsNull(self.visit(expr.operand, context))

    def visit_in(self, expr, context=None):
        """Translate a membership test into a PyIceberg ``In``."""
        subject = self.visit(expr.value, context)
        candidates = [self.visit(item, context) for item in expr.values]
        return In(subject, candidates)

    def visit_between(self, expr, context=None):
        """
        Translate an inclusive range check.

        The result is the conjunction ``value >= lower AND value <= upper``
        built from ``GreaterThanOrEqual`` and ``LessThanOrEqual``.
        """
        subject = self.visit(expr.value, context)
        low = self.visit(expr.lower, context)
        high = self.visit(expr.upper, context)
        lower_bound = GreaterThanOrEqual(subject, low)
        upper_bound = LessThanOrEqual(subject, high)
        return And(lower_bound, upper_bound)

    # PyIceberg does not have a direct equivalent of LIKE
    def visit_like(self, expr, context=None):
        """
        Log a warning that the LIKE filter is being ignored and return ``None``.

        NOTE(review): when a LIKE node is nested inside AND/OR/NOT, the parent
        ``visit_*`` method passes this ``None`` straight into the PyIceberg
        constructor as an operand - confirm that callers only use LIKE at the
        top level or handle a ``None`` operand.
        """
        value = self.visit(expr.value, context)
        pattern = self.visit(expr.pattern, context)
        logger.warning(
            f"LIKE operation is not supported in PyIceberg. Ignoring LIKE filter: {value} LIKE '{pattern}'. "
            "This may result in more data being returned than expected."
        )
        # No PyIceberg expression is produced for LIKE; a non-filtering
        # expression (e.g. NotEqualTo(value, None)) was considered as an
        # alternative by the original author.
        return None
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
2
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Field
|
3
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
4
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Datatype
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"Schema",
|
8
|
+
"Field",
|
9
|
+
"Dataset",
|
10
|
+
"Datatype",
|
11
|
+
]
|
@@ -2,10 +2,13 @@ from abc import ABC, abstractmethod
|
|
2
2
|
from typing import Iterator, List, Any
|
3
3
|
import pyarrow as pa
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
6
|
-
from deltacat.storage.rivulet import Schema
|
7
|
-
from deltacat.storage.rivulet.serializer import
|
8
|
-
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema
|
7
|
+
from deltacat.experimental.storage.rivulet.serializer import (
|
8
|
+
DataSerializer,
|
9
|
+
MEMTABLE_DATA,
|
10
|
+
)
|
11
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
9
12
|
|
10
13
|
|
11
14
|
class ArrowSerializer(DataSerializer, ABC):
|
@@ -24,19 +24,23 @@ from deltacat.storage.model.shard import Shard, ShardingStrategy
|
|
24
24
|
from deltacat.storage.model.stream import Stream, StreamLocator
|
25
25
|
from deltacat.storage.model.transaction import TransactionOperationList
|
26
26
|
from deltacat.storage.model.types import CommitState, StreamFormat
|
27
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
28
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
29
|
-
from deltacat.storage.rivulet.reader.dataset_metastore import
|
30
|
-
|
27
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
28
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
29
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
30
|
+
DatasetMetastore,
|
31
|
+
)
|
32
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
31
33
|
from deltacat.utils.export import export_dataset
|
32
34
|
from .schema.schema import Datatype
|
33
35
|
|
34
|
-
from deltacat.storage.rivulet.reader.data_scan import DataScan
|
35
|
-
from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
|
36
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
36
|
+
from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
|
37
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
|
38
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
39
|
+
QueryExpression,
|
40
|
+
)
|
37
41
|
|
38
|
-
from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
|
39
|
-
from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
|
42
|
+
from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
|
43
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
40
44
|
MemtableDatasetWriter,
|
41
45
|
)
|
42
46
|
|
@@ -48,7 +52,6 @@ from deltacat.storage import (
|
|
48
52
|
TableVersion,
|
49
53
|
TableVersionLocator,
|
50
54
|
Transaction,
|
51
|
-
TransactionType,
|
52
55
|
TransactionOperation,
|
53
56
|
TransactionOperationType,
|
54
57
|
)
|
@@ -295,7 +298,6 @@ class Dataset:
|
|
295
298
|
partition_values=DEFAULT_PARTITION_VALUES,
|
296
299
|
partition_id=self._partition_id,
|
297
300
|
),
|
298
|
-
schema=None,
|
299
301
|
content_types=None,
|
300
302
|
),
|
301
303
|
]
|
@@ -308,7 +310,6 @@ class Dataset:
|
|
308
310
|
]
|
309
311
|
|
310
312
|
transaction = Transaction.of(
|
311
|
-
txn_type=TransactionType.APPEND,
|
312
313
|
txn_operations=TransactionOperationList.of(txn_operations),
|
313
314
|
)
|
314
315
|
|
@@ -2,13 +2,16 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import List, Callable, Any
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.
|
6
|
-
from deltacat.storage.rivulet
|
7
|
-
from deltacat.storage.rivulet import
|
8
|
-
from deltacat.storage.rivulet.reader.
|
9
|
-
|
10
|
-
|
11
|
-
from deltacat.storage.rivulet.reader.
|
5
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema
|
7
|
+
from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
9
|
+
DatasetMetastore,
|
10
|
+
)
|
11
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
|
12
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
13
|
+
QueryExpression,
|
14
|
+
)
|
12
15
|
|
13
16
|
|
14
17
|
class DatasetExecutor:
|
@@ -22,12 +25,10 @@ class DatasetExecutor:
|
|
22
25
|
|
23
26
|
def __init__(
|
24
27
|
self,
|
25
|
-
field_groups: List[FieldGroup],
|
26
28
|
schema: Schema,
|
27
29
|
metastore: DatasetMetastore,
|
28
30
|
):
|
29
31
|
self.effective_schema: Schema = schema.__deepcopy__()
|
30
|
-
self.field_groups = field_groups
|
31
32
|
self.output: MvpTable | None = None
|
32
33
|
self._metastore = metastore
|
33
34
|
|
@@ -64,18 +65,9 @@ class DatasetExecutor:
|
|
64
65
|
|
65
66
|
TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
|
66
67
|
"""
|
67
|
-
|
68
|
-
return self._read_as_mvp_table(schema, self.field_groups[0])
|
69
|
-
else:
|
70
|
-
ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
|
71
|
-
ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
|
72
|
-
merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
|
73
|
-
for i in range(2, len(self.field_groups)):
|
74
|
-
ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
|
75
|
-
merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
|
76
|
-
return merged
|
68
|
+
return self._read_as_mvp_table(schema)
|
77
69
|
|
78
|
-
def _read_as_mvp_table(self, schema: Schema
|
70
|
+
def _read_as_mvp_table(self, schema: Schema):
|
79
71
|
data = list(
|
80
72
|
DataScan(
|
81
73
|
schema, QueryExpression(), DatasetReader(self._metastore)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# TODO later on this will be moved to a dedicated package
|
2
|
+
from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
|
3
|
+
from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
|
4
|
+
FileReaderRegistrar,
|
5
|
+
)
|
6
|
+
|
7
|
+
FileReaderRegistrar.register_reader("feather", FeatherFileReader)
|
@@ -5,15 +5,17 @@ from typing import Optional
|
|
5
5
|
import pyarrow.ipc
|
6
6
|
from pyarrow import RecordBatch, RecordBatchFileReader
|
7
7
|
|
8
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
9
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
10
|
-
from deltacat.storage.rivulet.reader.data_reader import (
|
8
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
9
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
10
|
+
from deltacat.experimental.storage.rivulet.reader.data_reader import (
|
11
11
|
RowAndKey,
|
12
12
|
FileReader,
|
13
13
|
FILE_FORMAT,
|
14
14
|
)
|
15
|
-
from deltacat.storage.rivulet.reader.pyarrow_data_reader import
|
16
|
-
|
15
|
+
from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
|
16
|
+
RecordBatchRowIndex,
|
17
|
+
)
|
18
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
17
19
|
|
18
20
|
|
19
21
|
class FeatherFileReader(FileReader[RecordBatchRowIndex]):
|
@@ -3,10 +3,10 @@ from typing import List
|
|
3
3
|
import pyarrow as pa
|
4
4
|
from pyarrow import feather
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
7
|
-
from deltacat.storage.rivulet import Schema
|
8
|
-
from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
|
9
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema
|
8
|
+
from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
|
9
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
10
10
|
|
11
11
|
|
12
12
|
class FeatherDataSerializer(ArrowSerializer):
|
@@ -3,9 +3,9 @@ import time
|
|
3
3
|
from typing import List, Generator
|
4
4
|
|
5
5
|
from deltacat.storage.model.partition import PartitionLocator
|
6
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
7
|
-
from deltacat.storage.rivulet.fs.input_file import InputFile
|
8
|
-
from deltacat.storage.rivulet.fs.output_file import OutputFile
|
6
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
|
8
|
+
from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
|
9
9
|
from deltacat.utils.metafile_locator import _find_partition_path
|
10
10
|
|
11
11
|
|
@@ -4,8 +4,8 @@ from pyarrow.fs import FileSystem, FileType, FileSelector
|
|
4
4
|
# TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
|
5
5
|
from ray.data.datasource.path_util import _resolve_paths_and_filesystem
|
6
6
|
|
7
|
-
from deltacat.storage.rivulet.fs.input_file import FSInputFile
|
8
|
-
from deltacat.storage.rivulet.fs.output_file import FSOutputFile
|
7
|
+
from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile
|
8
|
+
from deltacat.experimental.storage.rivulet.fs.output_file import FSOutputFile
|
9
9
|
|
10
10
|
|
11
11
|
class FileStore:
|
@@ -5,7 +5,7 @@ from typing import Protocol
|
|
5
5
|
|
6
6
|
from pyarrow.fs import FileSystem, FileType
|
7
7
|
|
8
|
-
from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
|
8
|
+
from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile, InputFile
|
9
9
|
|
10
10
|
|
11
11
|
class OutputStream(Protocol): # pragma: no cover
|
@@ -1,9 +1,9 @@
|
|
1
1
|
from dataclasses import dataclass
|
2
2
|
from typing import List, Callable, Any, Protocol
|
3
3
|
|
4
|
-
from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
|
5
|
-
from deltacat.storage.rivulet
|
6
|
-
from deltacat.storage.rivulet import
|
4
|
+
from deltacat.experimental.storage.rivulet.dataset_executor import DatasetExecutor
|
5
|
+
from deltacat.experimental.storage.rivulet import Schema
|
6
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
|
7
7
|
|
8
8
|
|
9
9
|
class DatasetOperation(Protocol):
|
@@ -99,7 +99,7 @@ class LogicalPlan:
|
|
99
99
|
self.operations.append(CollectOperation())
|
100
100
|
return self
|
101
101
|
|
102
|
-
def execute(self, executor: DatasetExecutor) ->
|
102
|
+
def execute(self, executor: DatasetExecutor) -> MvpTable:
|
103
103
|
for operation in self.operations:
|
104
104
|
operation.visit(executor)
|
105
105
|
return executor.output
|
File without changes
|
@@ -10,7 +10,6 @@ from deltacat.storage import (
|
|
10
10
|
Delta,
|
11
11
|
DeltaType,
|
12
12
|
Transaction,
|
13
|
-
TransactionType,
|
14
13
|
TransactionOperation,
|
15
14
|
TransactionOperationType,
|
16
15
|
)
|
@@ -19,7 +18,7 @@ from deltacat.storage.model.partition import PartitionLocator
|
|
19
18
|
from deltacat.storage.model.transaction import TransactionOperationList
|
20
19
|
|
21
20
|
from deltacat.storage.model.types import StreamFormat
|
22
|
-
from deltacat.storage.rivulet import Schema
|
21
|
+
from deltacat.experimental.storage.rivulet import Schema
|
23
22
|
|
24
23
|
StreamPosition = int
|
25
24
|
"""The stream position for creating a consistent ordering of manifests."""
|
@@ -169,7 +168,6 @@ class DeltacatManifestIO(ManifestIO):
|
|
169
168
|
delta["level"] = level
|
170
169
|
|
171
170
|
tx_results = Transaction.of(
|
172
|
-
txn_type=TransactionType.APPEND,
|
173
171
|
txn_operations=TransactionOperationList.of(
|
174
172
|
[
|
175
173
|
TransactionOperation.of(
|
@@ -4,9 +4,9 @@ import json
|
|
4
4
|
from itertools import zip_longest
|
5
5
|
from typing import List
|
6
6
|
|
7
|
-
from deltacat.storage.rivulet.fs.input_file import InputFile
|
8
|
-
from deltacat.storage.rivulet.fs.output_file import OutputFile
|
9
|
-
from deltacat.storage.rivulet.metastore.sst import (
|
7
|
+
from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
|
8
|
+
from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
|
9
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import (
|
10
10
|
SSTWriter,
|
11
11
|
SSTableRow,
|
12
12
|
SSTReader,
|
@@ -1,8 +1,8 @@
|
|
1
1
|
from dataclasses import dataclass
|
2
2
|
from typing import Protocol, Any, List
|
3
3
|
|
4
|
-
from deltacat.storage.rivulet.fs.input_file import InputFile
|
5
|
-
from deltacat.storage.rivulet.fs.output_file import OutputFile
|
4
|
+
from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
|
5
|
+
from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
|
6
6
|
|
7
7
|
|
8
8
|
@dataclass(frozen=True)
|
@@ -8,9 +8,9 @@ from typing import Any, Dict, Set, List, FrozenSet, Iterable, TypeVar, NamedTupl
|
|
8
8
|
|
9
9
|
from intervaltree import Interval, IntervalTree
|
10
10
|
|
11
|
-
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
12
|
-
from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
13
|
-
from deltacat.storage.rivulet import Schema
|
11
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
12
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
13
|
+
from deltacat.experimental.storage.rivulet import Schema
|
14
14
|
|
15
15
|
T = TypeVar("T")
|
16
16
|
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# TODO later on this will be moved to a dedicated package
|
2
|
+
from deltacat.experimental.storage.rivulet.parquet.file_reader import ParquetFileReader
|
3
|
+
from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
|
4
|
+
FileReaderRegistrar,
|
5
|
+
)
|
6
|
+
|
7
|
+
FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
|
File without changes
|
@@ -4,15 +4,17 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from pyarrow import RecordBatch
|
6
6
|
|
7
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
8
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
9
|
-
from deltacat.storage.rivulet.reader.data_reader import (
|
7
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
8
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
9
|
+
from deltacat.experimental.storage.rivulet.reader.data_reader import (
|
10
10
|
RowAndKey,
|
11
11
|
FileReader,
|
12
12
|
FILE_FORMAT,
|
13
13
|
)
|
14
|
-
from deltacat.storage.rivulet.reader.pyarrow_data_reader import
|
15
|
-
|
14
|
+
from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
|
15
|
+
RecordBatchRowIndex,
|
16
|
+
)
|
17
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
16
18
|
import pyarrow.parquet as pq
|
17
19
|
import pyarrow as pa
|
18
20
|
|
@@ -3,11 +3,11 @@ from typing import List, Any
|
|
3
3
|
import pyarrow as pa
|
4
4
|
from pyarrow.parquet import FileMetaData
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
7
|
-
from deltacat.storage.rivulet import Schema
|
8
|
-
from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema
|
8
|
+
from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
|
9
9
|
|
10
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
10
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
11
11
|
|
12
12
|
|
13
13
|
class ParquetDataSerializer(ArrowSerializer):
|
File without changes
|
@@ -15,19 +15,30 @@ from typing import (
|
|
15
15
|
AbstractSet,
|
16
16
|
)
|
17
17
|
|
18
|
-
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
19
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
20
|
-
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
18
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
19
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
20
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
21
21
|
OrderedBlockGroups,
|
22
22
|
BlockGroup,
|
23
23
|
Block,
|
24
24
|
)
|
25
|
-
from deltacat.storage.rivulet.reader.data_reader import
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
from deltacat.storage.rivulet.reader.
|
30
|
-
|
25
|
+
from deltacat.experimental.storage.rivulet.reader.data_reader import (
|
26
|
+
RowAndKey,
|
27
|
+
FileReader,
|
28
|
+
)
|
29
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
30
|
+
DatasetMetastore,
|
31
|
+
)
|
32
|
+
from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
|
33
|
+
ArrowDataReader,
|
34
|
+
)
|
35
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
36
|
+
QueryExpression,
|
37
|
+
)
|
38
|
+
from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
|
39
|
+
FileReaderRegistrar,
|
40
|
+
)
|
41
|
+
from deltacat.experimental.storage.rivulet import Schema
|
31
42
|
from deltacat import logs
|
32
43
|
|
33
44
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
@@ -13,9 +13,9 @@ from typing import (
|
|
13
13
|
Optional,
|
14
14
|
)
|
15
15
|
|
16
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
17
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
18
|
-
from deltacat.storage.rivulet.schema.schema import Schema
|
16
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
17
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
|
18
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
19
19
|
|
20
20
|
FILE_FORMAT = TypeVar("FILE_FORMAT")
|
21
21
|
MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")
|
@@ -3,9 +3,11 @@ from typing import Generator, Dict, Optional
|
|
3
3
|
import pyarrow as pa
|
4
4
|
|
5
5
|
from deltacat.storage.model.shard import Shard
|
6
|
-
from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
|
7
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
8
|
-
|
6
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
|
7
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
8
|
+
QueryExpression,
|
9
|
+
)
|
10
|
+
from deltacat.experimental.storage.rivulet import Schema
|
9
11
|
|
10
12
|
|
11
13
|
class DataScan:
|
@@ -5,18 +5,19 @@ from typing import Generator, Optional
|
|
5
5
|
import pyarrow
|
6
6
|
import pyarrow.fs
|
7
7
|
|
8
|
+
from deltacat.constants import REV_DIR_NAME
|
8
9
|
from deltacat.storage import Delta
|
9
10
|
from deltacat.storage.model.partition import PartitionLocator
|
10
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
11
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
11
12
|
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
12
|
-
from deltacat.storage.rivulet.metastore.json_sst import JsonSstReader
|
13
|
-
from deltacat.storage.rivulet.metastore.delta import (
|
13
|
+
from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstReader
|
14
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import (
|
14
15
|
ManifestIO,
|
15
16
|
DeltaContext,
|
16
17
|
RivuletDelta,
|
17
18
|
DeltacatManifestIO,
|
18
19
|
)
|
19
|
-
from deltacat.storage.rivulet.metastore.sst import SSTReader, SSTable
|
20
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTReader, SSTable
|
20
21
|
from deltacat.utils.metafile_locator import _find_table_path
|
21
22
|
from deltacat import logs
|
22
23
|
|
@@ -83,7 +84,7 @@ class DatasetMetastore:
|
|
83
84
|
param: filesystem: The filesystem to search for the revisions.
|
84
85
|
returns: The latest revision as a RivuletDelta.
|
85
86
|
"""
|
86
|
-
rev_directory = posixpath.join(delta_dir,
|
87
|
+
rev_directory = posixpath.join(delta_dir, REV_DIR_NAME)
|
87
88
|
revisions = filesystem.get_file_info(
|
88
89
|
pyarrow.fs.FileSelector(rev_directory, allow_not_found=True)
|
89
90
|
)
|
@@ -128,7 +129,7 @@ class DatasetMetastore:
|
|
128
129
|
return
|
129
130
|
|
130
131
|
# Locate "rev" directory inside the partition
|
131
|
-
rev_directory = posixpath.join(partition_path,
|
132
|
+
rev_directory = posixpath.join(partition_path, REV_DIR_NAME)
|
132
133
|
rev_info = filesystem.get_file_info(rev_directory)
|
133
134
|
|
134
135
|
if rev_info.type != pyarrow.fs.FileType.Directory:
|
@@ -2,18 +2,20 @@ import logging
|
|
2
2
|
from typing import Generator, Optional, Set, Type, TypeVar, Any
|
3
3
|
|
4
4
|
from deltacat.storage.model.shard import Shard
|
5
|
-
from deltacat.storage.rivulet.metastore.sst import SSTableRow, SSTable
|
6
|
-
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow, SSTable
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
7
7
|
BlockIntervalTree,
|
8
8
|
OrderedBlockGroups,
|
9
9
|
)
|
10
|
-
from deltacat.storage.rivulet.reader.block_scanner import BlockScanner
|
11
|
-
from deltacat.storage.rivulet.reader.dataset_metastore import (
|
10
|
+
from deltacat.experimental.storage.rivulet.reader.block_scanner import BlockScanner
|
11
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
12
12
|
DatasetMetastore,
|
13
13
|
ManifestAccessor,
|
14
14
|
)
|
15
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
16
|
-
|
15
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
16
|
+
QueryExpression,
|
17
|
+
)
|
18
|
+
from deltacat.experimental.storage.rivulet import Schema
|
17
19
|
|
18
20
|
# The type of data returned to reader
|
19
21
|
T = TypeVar("T")
|
@@ -4,7 +4,10 @@ from typing import Generator, Dict, Type, NamedTuple, List
|
|
4
4
|
|
5
5
|
from pyarrow import RecordBatch
|
6
6
|
|
7
|
-
from deltacat.storage.rivulet.reader.data_reader import
|
7
|
+
from deltacat.experimental.storage.rivulet.reader.data_reader import (
|
8
|
+
DataReader,
|
9
|
+
MEMORY_FORMAT,
|
10
|
+
)
|
8
11
|
import pyarrow as pa
|
9
12
|
|
10
13
|
|