deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py
@@ -0,0 +1,263 @@
+"""
+Spark SQL utilities for Iceberg table operations.
+
+This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+"""
+
+import os
+import apache_beam as beam
+from apache_beam import Row
+
+
+class SparkSQLIcebergRead(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        catalog_uri: str = "http://localhost:8181",
+        warehouse: str = "warehouse/",
+    ):
+        """
+        Initialize the Spark SQL reader.
+
+        Args:
+            table_name: Name of the Iceberg table
+            catalog_uri: URI of the Iceberg REST catalog
+            warehouse: Warehouse path
+        """
+        self.table_name = table_name
+        self.catalog_uri = catalog_uri
+        self.warehouse = warehouse
+        self.spark = None
+
+    def setup(self):
+        """Set up Spark session (called once per worker)."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            # Get Spark version for dependency resolution
+            try:
+                spark_version = ".".join(
+                    importlib.metadata.version("pyspark").split(".")[:2]
+                )
+            except Exception:
+                spark_version = "3.5"  # Default fallback
+
+            scala_version = "2.12"
+            iceberg_version = "1.6.0"
+
+            print(f"🔧 Setting up Spark session for reading {self.table_name}")
+            print(f"  - Spark version: {spark_version}")
+            print(f"  - Iceberg version: {iceberg_version}")
+
+            # Set Spark packages for Iceberg runtime
+            os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                f"pyspark-shell"
+            )
+
+            # Create Spark session with Iceberg REST catalog configuration
+            self.spark = (
+                SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                .config("spark.sql.session.timeZone", "UTC")
+                .config(
+                    "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                # Configure REST catalog
+                .config(
+                    "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                )
+                .config("spark.sql.catalog.rest.type", "rest")
+                .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                # Set REST as default catalog
+                .config("spark.sql.defaultCatalog", "rest")
+                # Local mode configuration (within Beam workers)
+                .config("spark.master", "local[1]")  # Single thread per worker
+                .config("spark.sql.adaptive.enabled", "true")
+                # Networking binding
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                .getOrCreate()
+            )
+
+            print(f"✅ Spark session created successfully")
+
+        except Exception as e:
+            print(f"❌ Failed to set up Spark session: {e}")
+            raise
+
+    def teardown(self):
+        """Clean up Spark session (called once per worker)."""
+        if self.spark:
+            try:
+                self.spark.stop()
+                print("✅ Spark session stopped")
+            except Exception as e:
+                print(f"⚠️ Error stopping Spark session: {e}")
+
+    def process(self, element):
+        """
+        Process element (read from Iceberg table using Spark SQL).
+
+        Args:
+            element: Input element (not used, just triggers the read)
+
+        Yields:
+            Records from the Iceberg table
+        """
+        try:
+            if not self.spark:
+                raise RuntimeError("Spark session not initialized")
+
+            print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+            # Read from Iceberg table using Spark SQL
+            df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+            # Collect all records
+            records = df.collect()
+
+            print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+            # Convert Spark rows to Beam Row objects and yield
+            for row in records:
+                row_dict = row.asDict()
+                # Convert to Beam Row for consistency with write mode
+                beam_row = Row(**row_dict)
+                yield beam_row
+
+        except Exception as e:
+            print(f"❌ Failed to read from table {self.table_name}: {e}")
+            raise
+
+
+class SparkSQLIcebergRewrite(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+    This uses Spark's rewrite_data_files procedure to materialize positional deletes
+    by rewriting data files. The result is a "clean" table without positional deletes.
+    """
+
+    def __init__(self, catalog_uri, warehouse_path, table_name):
+        self.catalog_uri = catalog_uri
+        self.warehouse_path = warehouse_path
+        self.table_name = table_name
+
+    def setup(self):
+        """Initialize Spark session for rewrite operations."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+            # Detect Spark version for appropriate Iceberg runtime
+            spark_version = importlib.metadata.version("pyspark")
+            major_minor = ".".join(spark_version.split(".")[:2])
+            print(f"  - Spark version: {major_minor}")
+            print(f"  - Iceberg version: 1.6.0")
+
+            # Configure Spark with Iceberg
+            self.spark = (
+                SparkSession.builder.appName("IcebergRewrite")
+                .config(
+                    "spark.jars.packages",
+                    f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                .config(
+                    "spark.sql.catalog.spark_catalog",
+                    "org.apache.iceberg.spark.SparkSessionCatalog",
+                )
+                .config("spark.sql.catalog.spark_catalog.type", "rest")
+                .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                .config(
+                    "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                )
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .getOrCreate()
+            )
+
+            print("✅ Spark session created successfully")
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+    def process(self, element):
+        """Rewrite table data files to materialize positional deletes."""
+        try:
+            print(
+                f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+            )
+
+            # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+            # This forces rewrite even when there's only 1 positional delete file
+            rewrite_sql = f"""
+                CALL spark_catalog.system.rewrite_data_files(
+                    table => '{self.table_name}',
+                    options => map('delete-file-threshold', '1')
+                )
+            """
+
+            print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+            print(f"  SQL: {rewrite_sql.strip()}")
+            print(
+                f"  Rationale: Forces rewrite even with single positional delete file"
+            )
+
+            result = self.spark.sql(rewrite_sql)
+
+            # Collect results to see what was rewritten
+            rewrite_result = result.collect()[0]
+            print(f"📊 Rewrite result: {rewrite_result}")
+
+            # Check if we actually rewrote anything
+            if rewrite_result.rewritten_data_files_count > 0:
+                print(
+                    f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                )
+                print(
+                    f"  - Added {rewrite_result.added_data_files_count} new data files"
+                )
+                print(f"  - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                print(f"  - Positional deletes have been materialized!")
+            else:
+                print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                print(f"  - This may indicate no positional deletes exist")
+                print(f"  - Or the table may already be in optimal state")
+
+            yield f"Rewrite completed for {self.table_name}"
+
+        except Exception as e:
+            print(f"❌ Error during rewrite: {e}")
+            import traceback
+
+            traceback.print_exc()
+            yield f"Rewrite failed for {self.table_name}: {e}"
+
+    def teardown(self):
+        """Clean up Spark session."""
+        if hasattr(self, "spark"):
+            print("✅ Spark session stopped")
+            self.spark.stop()
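A minimal pipeline sketch (not part of the package; the table name, catalog URI, and warehouse values are illustrative assumptions) showing how the two DoFns above could be chained in a Beam pipeline, with the rewrite's yielded result used as the trigger element for the read so the read runs after the rewrite:

```python
# Sketch only: assumes an Iceberg REST catalog is reachable at localhost:8181
# and that the new utils/spark.py module above is importable from the package.
import apache_beam as beam

from deltacat.examples.experimental.iceberg.converter.beam.utils.spark import (
    SparkSQLIcebergRead,
    SparkSQLIcebergRewrite,
)

with beam.Pipeline() as p:
    (
        p
        | "Seed" >> beam.Create([None])  # single element triggers the DoFns
        | "Rewrite data files" >> beam.ParDo(
            SparkSQLIcebergRewrite(
                catalog_uri="http://localhost:8181",
                warehouse_path="warehouse/",
                table_name="default.my_table",  # hypothetical table
            )
        )
        | "Read via Spark SQL" >> beam.ParDo(
            SparkSQLIcebergRead(table_name="default.my_table")
        )
        | "Print rows" >> beam.Map(print)
    )
```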
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py
@@ -1,14 +1,16 @@
 import os
 import logging
 
+import uuid
 import daft
+from pyiceberg.catalog import CatalogType
+
 import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.
-
-)
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -19,7 +21,7 @@ from pyiceberg.schema import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import BucketTransform
 
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
 )
@@ -30,6 +32,24 @@ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
 
 
 def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+    """
+    This is an e2e example that
+    1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
+    2. writes data into the DeltaCAT Table
+    3. reads data from the DeltaCAT Table using Daft
+
+    To run the script:
+    1. prepare an AWS Account
+        1. prepare a S3 location where the data will be written to, which will be used in Step 3.
+        2. prepare an IAM Role that has access to the S3 location and Glue
+    2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
+    3. run below command to execute the example
+    ```
+    make venv && source venv/bin/activate
+    python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
+    ```
+
+    """
     # create any runtime environment required to run the example
     runtime_env = create_ray_runtime_environment()
 
@@ -38,6 +58,7 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
     # Only the `iceberg` data catalog is provided so it will become the default.
     # If initializing multiple catalogs, use the `default_catalog_name` param
     # to specify which catalog should be the default.
+
     dc.init(
         catalogs={
            # the name of the DeltaCAT catalog is "iceberg"
@@ -49,11 +70,13 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
                 name="example-iceberg-catalog",
                 # for additional properties see:
                 # https://py.iceberg.apache.org/configuration/
-
-
-
-
-
+                config=IcebergCatalogConfig(
+                    type=CatalogType.GLUE,
+                    properties={
+                        "warehouse": warehouse,
+                        "region_name": "us-east-1",
+                    },
+                ),
             )
         },
         # pass the runtime environment into ray.init()
@@ -89,10 +112,10 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
         }
     )
 
-    # write to a table named `test_namespace.test_table_bucketed
+    # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
     # we don't need to specify which catalog to create this table in since
     # only the "iceberg" catalog is available
-    table_name = "test_table_bucketed"
+    table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
     namespace = "test_namespace"
     print(f"Creating Glue Table: {namespace}.{table_name}")
     dc.write_to_table(
@@ -106,9 +129,40 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
     )
 
     print(f"Getting Glue Table: {namespace}.{table_name}")
-    table_definition = dc.get_table(table_name, namespace)
+    table_definition = dc.get_table(name=table_name, namespace=namespace)
     print(f"Retrieved Glue Table: {table_definition}")
 
+    # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
+    daft_dataframe = dc.read_table(table=table_name, namespace=namespace)
+
+    daft_dataframe.where(df["bid"] > 200.0).show()
+    # Expected result:
+    # ╭────────┬─────────┬─────────╮
+    # │ symbol ┆ bid     ┆ ask     │
+    # │ ---    ┆ ---     ┆ ---     │
+    # │ Utf8   ┆ Float64 ┆ Float64 │
+    # ╞════════╪═════════╪═════════╡
+    # │ meta   ┆ 392.03  ┆ 392.09  │
+    # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+    # │ msft   ┆ 403.25  ┆ 403.27  │
+    # ╰────────┴─────────┴─────────╯
+
+    daft_dataframe.select("symbol").show()
+    # Expected result:
+    # ╭────────╮
+    # │ symbol │
+    # │ ---    │
+    # │ Utf8   │
+    # ╞════════╡
+    # │ meta   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ amzn   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ goog   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ msft   │
+    # ╰────────╯
+
 
 if __name__ == "__main__":
     example_script_args = [
@@ -121,15 +175,6 @@ if __name__ == "__main__":
                 "type": str,
             },
         ),
-        (
-            [
-                "--STAGE",
-            ],
-            {
-                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
-                "type": str,
-            },
-        ),
     ]
 
     # store any CLI args in the runtime environment
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py
@@ -4,9 +4,7 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
 from pyiceberg.table.sorting import SortField, SortOrder
 
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/examples/hello_world.py CHANGED
@@ -1,12 +1,10 @@
 import ray
 import deltacat
 import daft
-import pyiceberg
 
 
 def print_package_version_info():
     print(f"DeltaCAT Version: {deltacat.__version__}")
-    print(f"PyIceberg Version: {pyiceberg.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")
 
@@ -24,4 +22,8 @@ def run():
 
 
 if __name__ == "__main__":
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example
     run()
deltacat/examples/indexer/indexer.py
@@ -0,0 +1,163 @@
+import argparse
+
+from datetime import datetime
+
+import ray
+
+import deltacat
+import daft
+import pyarrow as pa
+import pandas as pd
+import polars as pl
+import numpy as np
+
+from deltacat import DeltaCatUrl
+
+
+def print_package_version_info() -> None:
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+    print(f"NumPy Version: {np.__version__}")
+    print(f"PyArrow Version: {pa.__version__}")
+    print(f"Polars Version: {pl.__version__}")
+    print(f"Pandas Version: {pd.__version__}")
+
+
+def json_path_to_regex(path: str):
+    if not path:
+        raise ValueError("Path cannot be empty")
+    parts = path.split("/")
+    leaf_key = parts.pop()
+    regex = r""
+    for part in parts:
+        if part.strip():  # discard leading and/or redundant separators
+            regex += rf'"{part}"\s*:\s*[{{\[].*?'
+    regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+    return regex
+
+
+def run(
+    source: str,
+    dest: str,
+) -> None:
+    # print package version info
+    print_package_version_info()
+
+    # run a synchronous copy from the source to the destination
+    deltacat.copy(
+        DeltaCatUrl(source),
+        DeltaCatUrl(dest),
+        # reader arguments to pass to the default reader (polars)
+        # for the given text-based datasource, it accepts the same
+        # arguments as polars.read_csv except for `source`, `n_threads`
+        # `new_columns`, `separator`, `has_header`, `quote_char`, and
+        # `infer_schema`.
+        reader_args={
+            "low_memory": True,  # try to use less memory (++stability, --perf)
+            "batch_size": 1024,  # text line count read into a buffer at once
+            "use_pyarrow": True,  # use the native pyarrow reader
+        },
+        # writer arguments to pass to the default writer (polars)
+        # for the given parquet-based datasink, it generally accepts the same
+        # arguments as polars.DataFrame.write_{dest-type} except for `file`
+        writer_args={
+            "compression": "lz4",  # faster compression & decompression
+            # "compression": "zstd",  # better compression ratio
+            # "compression": "snappy",  # compatible w/ older Parquet readers
+        },
+        # Transforms to run against the default polars dataframe read.
+        # By default, each transform takes a polars dataframe `df` as input
+        # and produces a polars dataframe as output. All transforms listed
+        # are run in order (i.e., the dataframe output from transform[0]
+        # is the dataframe input to transform[1]).
+        #
+        # See:
+        # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+        # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+        transforms=[
+            lambda df, src: df.rename(
+                {"text": "utf8_body"},
+            ),
+            lambda df, src: df.with_columns(
+                pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                pl.lit(src.url_path).alias("source_file_path"),
+            ),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    """
+    Example 1: Run this script locally using Ray:
+    $ python indexer.py \
+    $   --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+    $   --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+
+    Example 2: Submit this script as a local Ray job using a local job client:
+    >>> from deltacat import local_job_client
+    >>> client = local_job_client()
+    >>> # read the source file as line-delimited text
+    >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+    >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+    >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+    >>> try:
+    >>>     job_run_result = client.run_job(
+    >>>         # Entrypoint shell command to run the indexer job
+    >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+    >>>         # Path to the local directory that contains the indexer.py file
+    >>>         runtime_env={"working_dir": "./deltacat/examples/indexer.py"},
+    >>>     )
+    >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+    >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+    >>>     print(job_run_result.job_logs)
+    >>> except RuntimeError as e:
+    >>>     print(f"Job Run Failed: {e}")
+    >>> except TimeoutError as e:
+    >>>     print(f"Job Run Timed Out: {e}")
+
+    Example 3: Submit this script as a remote Ray job using a remote job client:
+    >>> from deltacat import job_client
+    >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+    >>> # automatically launches the cluster if it doesn't exist or has died
+    >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+    >>> client = job_client()
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    >>>
+    >>> # OR use an explicit cluster launcher config file path
+    >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+    ]
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example using the parsed arguments
+    run(**vars(args))
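The `transforms` list above defines the per-file processing contract: each entry receives the polars DataFrame produced by the reader plus the source URL object, and returns a new DataFrame that feeds the next transform. A small standalone sketch of that contract (not part of the package; `FakeSrc` is a stand-in for the `DeltaCatUrl` object that `deltacat.copy` passes in), exercised on a toy frame:

```python
# Sketch only: exercises one transform from the indexer example outside of
# deltacat.copy, using a fake source object and a two-row DataFrame.
from datetime import datetime

import polars as pl


class FakeSrc:
    # Stand-in (assumption) for the source DeltaCatUrl passed to each transform.
    url_path = "s3://example-bucket/example-input.txt"


def add_metadata(df: pl.DataFrame, src) -> pl.DataFrame:
    # Same shape as the second transform above: derive a hash, a processing
    # timestamp, and the originating file path as new columns.
    return df.with_columns(
        pl.col("utf8_body").hash().alias("utf8_body_hash"),
        pl.lit(datetime.utcnow()).alias("processing_time"),
        pl.lit(src.url_path).alias("source_file_path"),
    )


df = pl.DataFrame({"utf8_body": ["hello", "world"]})
print(add_metadata(df, FakeSrc()))
```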