deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/app.py
@@ -0,0 +1,226 @@
+from typing import Optional
+import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam import Row
+import os
+import pyarrow.fs as pafs
+from deltacat.experimental.converter_agent.beam.managed import (
+    write as deltacat_beam_managed_write,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    generate_random_suffix,
+    verify_duplicate_resolution,
+    wait_for_deltacat_jobs,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.spark import (
+    SparkSQLIcebergRead,
+    SparkSQLIcebergRewrite,
+)
+
+# Monkey-patch beam.managed.Write and beam.managed.Read
+beam.managed.Write = deltacat_beam_managed_write
+
+
+def run(
+    beam_options: Optional[PipelineOptions] = None,
+    mode: str = "write",  # 'write' to write data, 'read' to read data
+    rest_catalog_uri: str = "http://localhost:8181",  # REST catalog server URI
+    warehouse_path: Optional[str] = None,  # Optional custom warehouse path
+    table_name: Optional[str] = None,  # Table name with namespace
+    deltacat_converter_interval: float = 5.0,  # Converter monitoring interval
+    ray_inactivity_timeout: int = 20,  # Ray cluster shutdown timeout
+    max_converter_parallelism: int = 1,  # Maximum converter task parallelism
+    filesystem: Optional[pafs.FileSystem] = None,  # Optional PyArrow filesystem
+) -> None:
+    """
+    Run the pipeline in either 'write' or 'read' mode using Iceberg REST Catalog.
+
+    Prerequisites:
+    - Start the Iceberg REST catalog server:
+      docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0
+    - For read mode: Install PySpark:
+      pip install pyspark
+
+    Args:
+        beam_options: Apache Beam pipeline options
+        mode: 'write' to write data, 'read' to read data
+        rest_catalog_uri: URI of the REST catalog server (default: http://localhost:8181)
+        warehouse_path: Custom warehouse path (default: temporary directory)
+        table_name: Name of the Iceberg table (default: None - generates a random table name)
+        deltacat_converter_interval: Interval for DeltaCat optimizer monitoring
+        ray_inactivity_timeout: Timeout for shutting down Ray cluster
+        max_converter_parallelism: Maximum number of concurrent converter tasks
+        filesystem: PyArrow filesystem instance (default: LocalFileSystem)
+
+    Pipeline Operations:
+    - 'write': Write sample data to the Iceberg table with merge-on-read functionality.
+      Uses job-based table monitoring for better scalability and resource management.
+    - 'read': Read deduplicated data from the Iceberg table using Spark SQL.
+      Uses Spark SQL instead of Beam's native Iceberg I/O to properly handle positional deletes.
+    """
+    # Use custom warehouse path or create a temporary one
+    if warehouse_path is None:
+        warehouse_path = os.path.join("/tmp", "iceberg_rest_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+    # Use provided filesystem or create a LocalFileSystem by default
+    if filesystem is None:
+        filesystem = pafs.LocalFileSystem()
+
+    # Generate unique table name if using default to avoid conflicts
+    if not table_name:
+        random_suffix = generate_random_suffix()
+        table_name = f"default.demo_table_{random_suffix}"
+        print(f"Generated unique table name: {table_name}")
+
+    # Define catalog configuration for REST catalog (simplified, table creation handled separately)
+    catalog_config = {
+        "catalog_properties": {
+            "warehouse": warehouse_path,
+            "catalog-impl": "org.apache.iceberg.rest.RESTCatalog",
+            "uri": rest_catalog_uri,
+        },
+        "deltacat_converter_properties": {
+            "deltacat_converter_interval": deltacat_converter_interval,
+            "merge_keys": ["id"],  # Configure merge keys for duplicate detection
+            "ray_inactivity_timeout": ray_inactivity_timeout,
+            "filesystem": filesystem,  # Pass filesystem to DeltaCAT converter
+            "max_converter_parallelism": max_converter_parallelism,
+        },
+    }
+
+    # Ensure table name includes namespace
+    if "." not in table_name:
+        full_table_name = f"default.{table_name}"
+    else:
+        full_table_name = table_name
+
+    print(f"Using Iceberg REST Catalog")
+    print(f"  REST Server: {rest_catalog_uri}")
+    print(f"  Warehouse: {warehouse_path}")
+    print(f"  Mode: {mode}")
+    print(f"  Table: {full_table_name}")
+    print(f"  Filesystem: {type(filesystem).__name__}")
+
+    # Remind user about prerequisites
+    if mode == "write":
+        print("Prerequisites:")
+        print("  Make sure the Iceberg REST catalog server is running:")
+        print(
+            "  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print()
+        with beam.Pipeline(options=beam_options) as p:
+            # Step 1: Write initial data to create the table
+            initial_data = p | "Create initial data" >> beam.Create(
+                [
+                    Row(id=1, name="Alice", value=100, version=1),
+                    Row(id=2, name="Bob", value=200, version=1),
+                    Row(id=3, name="Charlie", value=300, version=1),
+                    Row(id=4, name="David", value=400, version=1),
+                    Row(id=5, name="Eve", value=500, version=1),
+                    Row(id=6, name="Frank", value=600, version=1),
+                    Row(id=7, name="Grace", value=700, version=1),
+                    Row(id=8, name="Henry", value=800, version=1),
+                    Row(
+                        id=2, name="Robert", value=201, version=2
+                    ),  # Update Bob's record
+                    Row(
+                        id=3, name="Charles", value=301, version=2
+                    ),  # Update Charlie's record
+                    Row(id=9, name="Ivy", value=900, version=1),  # Add a new record
+                ]
+            )
+
+            initial_data | "Write initial data to Iceberg" >> beam.managed.Write(
+                beam.managed.ICEBERG,
+                config={
+                    "table": full_table_name,  # Use fully qualified table name for REST catalog
+                    "write_mode": "append",
+                    **catalog_config,
+                },
+            )
+
+        # Wait for the DeltaCAT converter job to complete and shutdown
+        wait_for_deltacat_jobs(
+            full_table_name, warehouse_path, ray_inactivity_timeout * 2
+        )
+
+        print(f"\nData writing completed with DeltaCAT optimization enabled.")
+        print(f"  - Table monitoring interval: {deltacat_converter_interval} seconds")
+        print(f"  - Ray cluster shutdown timeout: {ray_inactivity_timeout} seconds")
+        print(f"  - Automatic duplicate detection and resolution")
+        print(f"  - Position delete creation for duplicate resolution")
+        print(f"  - Job-based table monitoring with Ray")
+        print(f"  - Filesystem: {type(filesystem).__name__}")
+        print(
+            f"Read the table with: `python main.py --mode read --table-name {full_table_name}`"
+        )
+
+    elif mode == "read":
+        with beam.Pipeline(options=beam_options) as p:
+            # Read from the Iceberg table using Spark SQL
+            # Note: We use Spark SQL instead of beam.managed.Read because Beam's native Iceberg I/O
+            # cannot handle positional delete files created by DeltaCAT converter sessions.
+
+            print(f"Reading from Iceberg table '{full_table_name}' using Spark SQL")
+
+            # Create a trigger element to start the read
+            trigger = p | "Create read trigger" >> beam.Create([None])
+
+            # Read from Iceberg table using Spark SQL
+            elements = trigger | "Read with Spark SQL" >> beam.ParDo(
+                SparkSQLIcebergRead(
+                    table_name=full_table_name,
+                    catalog_uri=rest_catalog_uri,
+                    warehouse=warehouse_path,
+                )
+            )
+
+            # Display the data read (after positional deletes are applied)
+            elements | "Print deduplicated data" >> beam.Map(
+                lambda row: print(f"Record: {row}")
+            )
+
+            # Count records for summary
+            def count_and_display(elements_list):
+                print(f"\nRead Summary:")
+                print(f"  - Total records: {len(elements_list)}")
+                return elements_list
+
+            # Collect all elements for counting
+            elements | "Count records" >> beam.combiners.ToList() | "Display summary" >> beam.Map(
+                count_and_display
+            )
+
+        # Verify that the data was correctly merged by ID
+        verify_duplicate_resolution(full_table_name, warehouse_path)
+
+    elif mode == "rewrite":
+        with beam.Pipeline(options=beam_options) as p:
+            # Rewrite table data files to materialize positional deletes
+            print(f"Rewriting Iceberg table to materialize positional deletes")
+            print(f"  - Table: {full_table_name}")
+            print(f"  - Purpose: Remove positional deletes to enable Beam writes")
+            print(f"  - Method: Spark rewrite_data_files procedure")
+
+            # Create a trigger element to start the rewrite
+            trigger = p | "Create rewrite trigger" >> beam.Create(
+                [f"rewrite_{full_table_name}"]
+            )
+
+            # Use Spark SQL to rewrite the table
+            rewrite_results = trigger | "Rewrite table with Spark SQL" >> beam.ParDo(
+                SparkSQLIcebergRewrite(
+                    catalog_uri=rest_catalog_uri,
+                    warehouse_path=warehouse_path,
+                    table_name=full_table_name,
+                )
+            )
+
+            # Log the results
+            rewrite_results | "Log rewrite results" >> beam.Map(
+                lambda result: print(f"Rewrite result: {result}")
+            )
+    else:
+        raise ValueError(f"Unknown mode: {mode}. Use 'write', 'read', or 'rewrite'.")
deltacat/examples/experimental/iceberg/converter/beam/main.py
@@ -0,0 +1,133 @@
+import pyarrow.fs as pafs
+import argparse
+import logging
+
+from apache_beam.options.pipeline_options import PipelineOptions
+
+from deltacat.examples.experimental.iceberg.converter.beam import app
+
+
+if __name__ == "__main__":
+    logging.getLogger().setLevel(logging.INFO)
+    parser = argparse.ArgumentParser(
+        description="DeltaCat Beam Iceberg Converter Example using REST Catalog",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Start REST catalog server first (Iceberg 1.6.0):
+  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0
+
+  # Install PySpark for read and rewrite modes:
+  pip install pyspark
+
+  # Write sample data with DeltaCAT data file converter (automatic merge by key):
+  python main.py --mode write --table-name "deltacat.hello_world"
+
+  # Read data back (uses Spark SQL to read positional deletes):
+  python main.py --mode read --table-name "deltacat.hello_world"
+
+  # Rewrite table to materialize positional deletes:
+  python main.py --mode rewrite --table-name "deltacat.hello_world"
+
+  # Use custom REST catalog server:
+  python main.py --mode write --rest-uri http://localhost:9000 --table-name "deltacat.hello_world"
+
+  # Use custom warehouse path:
+  python main.py --mode write --warehouse-path /tmp/my_warehouse --table-name "deltacat.hello_world"
+        """,
+    )
+
+    parser.add_argument(
+        "--mode",
+        default="write",
+        choices=["write", "read", "rewrite"],
+        help="Pipeline mode: 'write' to write data, 'read' to read data, 'rewrite' to materialize positional deletes (default: write). "
+        " Note: Beam writes may fail on tables processed by external tools.",
+    )
+
+    parser.add_argument(
+        "--rest-uri",
+        default="http://localhost:8181",
+        help="REST catalog server URI (default: http://localhost:8181).",
+    )
+
+    parser.add_argument(
+        "--warehouse-path",
+        default=None,
+        help="Custom warehouse path (default: temporary directory).",
+    )
+
+    parser.add_argument(
+        "--table-name",
+        default=None,
+        help="Table name to use (default: autogenerated table name).",
+    )
+
+    parser.add_argument(
+        "--deltacat-converter-interval",
+        type=float,
+        default=5.0,
+        help="DeltaCat converter monitoring interval in seconds (default: 5.0).",
+    )
+
+    parser.add_argument(
+        "--ray-inactivity-timeout",
+        type=int,
+        default=20,
+        help="Ray cluster shutdown timeout after inactivity in seconds (default: 20).",
+    )
+
+    parser.add_argument(
+        "--max-converter-parallelism",
+        type=int,
+        default=1,
+        help="Maximum converter task parallelism - number of concurrent converter tasks (default: 1).",
+    )
+
+    args, beam_args = parser.parse_known_args()
+
+    beam_options = PipelineOptions(
+        beam_args,
+        save_main_session=True,
+    )
+
+    print("DeltaCAT Beam Iceberg Upsert Example")
+    print("=" * 50)
+    print(f"Mode: {args.mode}")
+    print(f"REST Catalog URI: {args.rest_uri}")
+    print(f"Warehouse Path: {args.warehouse_path or 'temporary directory'}")
+    print(f"Table Name: {args.table_name}")
+    print(f"Converter Interval: {args.deltacat_converter_interval}s")
+    print(f"Ray Inactivity Timeout: {args.ray_inactivity_timeout}s")
+    print(f"Max Converter Parallelism: {args.max_converter_parallelism}")
+    print()
+
+    # Remind user about prerequisites
+    if args.mode == "write":
+        print("Prerequisites:")
+        print("  Make sure the Iceberg REST catalog server is running:")
+        print(
+            "  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print()
+    elif args.mode in ["read", "rewrite"]:
+        print("Prerequisites:")
+        print("  Make sure the Iceberg REST catalog server is running:")
+        print(
+            "  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print("  PySpark is required for this mode:")
+        print("  pip install pyspark")
+        print()
+
+    app.run(
+        beam_options=beam_options,
+        mode=args.mode,
+        rest_catalog_uri=args.rest_uri,
+        warehouse_path=args.warehouse_path,
+        table_name=args.table_name,
+        deltacat_converter_interval=args.deltacat_converter_interval,
+        ray_inactivity_timeout=args.ray_inactivity_timeout,
+        filesystem=pafs.LocalFileSystem(),
+        max_converter_parallelism=args.max_converter_parallelism,
+    )
deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Quick test script to demonstrate the REST catalog workflow.
+This script shows the complete write → read cycle with DeltaCAT monitoring and conversion.
+"""
+
+import subprocess
+import sys
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    generate_random_suffix,
+    check_rest_catalog,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    verify_duplicate_resolution,
+)
+
+
+def run_example(mode, table_name, input_text="Workflow Test"):
+    """Run the example in the specified mode."""
+    print(f"\nRunning example in {mode} mode with table: {table_name}")
+    cmd = [
+        sys.executable,
+        "main.py",
+        "--mode",
+        mode,
+        "--input-text",
+        input_text,
+        "--table-name",
+        table_name,
+    ]
+
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=120
+        )  # Increased timeout for converter
+        if result.returncode == 0:
+            print(f"✅ {mode.capitalize()} operation completed successfully")
+            if mode == "read":
+                # Show sample data from the output
+                lines = result.stdout.split("\n")
+                data_lines = [line for line in lines if "BeamSchema" in line]
+                if data_lines:
+                    print(f"Found {len(data_lines)} records in table")
+                    print("Sample records:")
+                    for line in data_lines[:5]:  # Show first 5 records
+                        print(f"  {line}")
+                    if len(data_lines) > 5:
+                        print(f"  ... and {len(data_lines) - 5} more records")
+            return True
+        else:
+            print(f"❌ {mode.capitalize()} operation failed:")
+            print(result.stderr)
+            return False
+    except subprocess.TimeoutExpired:
+        print(f"⏰ {mode.capitalize()} operation timed out")
+        return False
+    except Exception as e:
+        print(f"❌ Error running {mode} operation: {e}")
+        return False
+
+
+def main():
+    """Main workflow test."""
+    print("DeltaCAT Beam Iceberg REST Catalog Workflow Test")
+    print("=" * 60)
+
+    # Generate unique table name to avoid conflicts
+    random_suffix = generate_random_suffix()
+    table_name = f"default.demo_table_{random_suffix}"
+    print(f"Generated unique table name: {table_name}")
+
+    # Step 1: Check prerequisites
+    if not check_rest_catalog():
+        sys.exit(1)
+
+    # Step 2: Write data (creates table with duplicates and triggers converter)
+    print(f"\nPhase 1: Writing data and triggering DeltaCAT converter")
+    if not run_example("write", table_name, "Workflow Demo User"):
+        print("❌ Write test failed")
+        sys.exit(1)
+
+    # Step 3: Verify upsert merge worked as expected
+    print(f"\nPhase 2: Direct verification of duplicate resolution")
+    verification_success = verify_duplicate_resolution(table_name)
+
+    # Step 4: Read data back to show final state
+    print(f"\nPhase 3: Reading final table state")
+    if not run_example("read", table_name):
+        print("❌ Read test failed")
+        sys.exit(1)
+
+    # Final summary
+    print("\nWorkflow test completed!")
+
+    if verification_success:
+        print("\n✅ SUCCESS:")
+        print("  ✅ Table creation and writes")
+        print("  ✅ DeltaCAT monitoring merged duplicates")
+        print("  ✅ Read operations correctly read merged data")
+    else:
+        print("\n⚠️ PARTIAL SUCCESS:")
+        print("  ✅ Table creation and writes")
+        print("  ❌ Converter may still be processing or failed")
+        print("  Check logs for converter execution details")
+
+    print("\nWhat happened:")
+    print("  1. Beam wrote data creating duplicates (IDs 2,3)")
+    print("  2. DeltaCAT monitoring merged duplicates")
+    print("  3. Table now contains merged data")
+
+
+if __name__ == "__main__":
+    main()
deltacat/examples/experimental/iceberg/converter/beam/utils/common.py
@@ -0,0 +1,174 @@
+"""
+Common utility functions for the Iceberg converter example.
+"""
+
+import random
+import string
+import time
+from pyiceberg.catalog import load_catalog
+import requests
+from deltacat import local_job_client
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.experimental.converter_agent.table_monitor import _generate_job_name
+
+
+def generate_random_suffix(length=8):
+    """Generate a random string of specified length using letters and digits."""
+    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))
+
+
+def check_rest_catalog():
+    """Check if REST catalog is running."""
+    try:
+        response = requests.get("http://localhost:8181/v1/config", timeout=5)
+        if response.status_code == 200:
+            print("✅ REST catalog is running")
+            return True
+    except requests.exceptions.RequestException:
+        pass
+
+    print("❌ REST catalog is not running")
+    print(
+        "Start it with: docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+    )
+    return False
+
+
+def wait_for_deltacat_jobs(
+    table_name, warehouse_path="/tmp/iceberg_rest_warehouse", timeout=120
+):
+    """
+    Wait for DeltaCAT converter jobs to complete by checking job status.
+
+    Args:
+        table_name: Full table name (e.g., "default.demo_table_abc123")
+        warehouse_path: Warehouse path used for job tracking
+        timeout: Maximum seconds to wait for job completion
+
+    Returns:
+        True if all jobs completed, False if timeout
+    """
+    print(f"\n⏳ Monitoring DeltaCAT converter jobs for table: {table_name}")
+
+    # Parse table name to get namespace and table name
+    if "." in table_name:
+        namespace, actual_table_name = table_name.split(".", 1)
+    else:
+        namespace = DEFAULT_NAMESPACE
+        actual_table_name = table_name
+
+    # Create job key matching the format used in managed.py
+    job_name = _generate_job_name(
+        warehouse_path=warehouse_path, namespace=namespace, table_name=actual_table_name
+    )
+
+    start_time = time.time()
+
+    try:
+        # Get the job client
+        client = local_job_client(ray_init_args={"local_mode": True})
+
+        while time.time() - start_time < timeout:
+            job_details_list = client.list_jobs()
+            print(f"Job details list: {job_details_list}")
+            job_submission_ids = [
+                job_details.submission_id for job_details in job_details_list
+            ]
+
+            # Check if we have any tracked jobs for this table
+            print(f"Looking for submission ID: {job_name} in {job_submission_ids}")
+            if job_name in job_submission_ids:
+                # Check job status with Ray
+                try:
+                    job_status = client.get_job_status(job_name)
+                    print(f"Job {job_name} status: {job_status}")
+                    # Check if job is still running
+                    if job_status and str(job_status) in ["PENDING", "RUNNING"]:
+                        time.sleep(2)  # Short polling interval
+                        continue
+                    else:
+                        print(f"✅ Job {job_name} completed with status: {job_status}")
+                        return True
+
+                except Exception as e:
+                    print(f"⚠️ Could not check job status for {job_name}: {e}")
+                    # If we can't check status, assume job is done
+                    return True
+            time.sleep(1)
+        print(f"⏰ Timeout waiting for DeltaCAT job completion after {timeout} seconds")
+        return False
+
+    except Exception as e:
+        print(f"❌ Error monitoring DeltaCAT jobs: {e}")
+        # Fall back to short sleep if monitoring fails
+        print(f"Falling back to {timeout}-second wait...")
+        time.sleep(timeout)
+        return True
+
+
+def verify_duplicate_resolution(
+    table_name, warehouse_path="/tmp/iceberg_rest_warehouse"
+):
+    """
+    Verify that the DeltaCAT converter successfully resolved duplicates.
+    """
+    try:
+        print(f"\nVerifying duplicate resolution for table: {table_name}")
+
+        # Create PyIceberg catalog to check results
+        verification_catalog = load_catalog(
+            "workflow_verification_catalog",
+            **{
+                "type": "rest",
+                "warehouse": warehouse_path,
+                "uri": "http://localhost:8181",
+            },
+        )
+
+        # Load the table and scan its contents
+        table_identifier = table_name
+        tbl = verification_catalog.load_table(table_identifier)
+        scan_result = tbl.scan().to_arrow().to_pydict()
+
+        # Check the results
+        result_ids = sorted(scan_result["id"])
+        unique_ids = sorted(set(result_ids))
+
+        print(f"Final verification results:")
+        print(f"  - Total records: {len(result_ids)}")
+        print(f"  - Unique IDs: {len(unique_ids)}")
+        print(f"  - IDs found: {result_ids}")
+
+        # Check if duplicates were resolved
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+        if result_ids == expected_unique_ids:
+            # Verify that the latest versions were preserved
+            names_by_id = {}
+            versions_by_id = {}
+            for i, id_val in enumerate(scan_result["id"]):
+                names_by_id[id_val] = scan_result["name"][i]
+                versions_by_id[id_val] = scan_result["version"][i]
+
+            if (
+                names_by_id.get(2) == "Robert"
+                and names_by_id.get(3) == "Charles"
+                and versions_by_id.get(2) == 2
+                and versions_by_id.get(3) == 2
+            ):
+                print(f"✅ Duplicate resolution SUCCESSFUL!")
+                print(f"  - All 9 IDs are unique")
+                print(f"  - Latest versions preserved (Bob→Robert, Charlie→Charles)")
+                print(f"  - Version numbers correct (v2 for updated records)")
+                return True
+            else:
+                print(f"❌ Latest versions not preserved correctly")
+        else:
+            print(f"❌ Duplicates still present or unexpected record count")
+            print(f"  - Expected: {expected_unique_ids}")
+            print(f"  - Got: {result_ids}")
+
+        return False
+
+    except Exception as e:
+        print(f"❌ Error during verification: {e}")
+        return False