deltacat-2.0.0b11-py3-none-any.whl → deltacat-2.0.0b12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/app.py
@@ -0,0 +1,226 @@
+from typing import Optional
+import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam import Row
+import os
+import pyarrow.fs as pafs
+from deltacat.experimental.converter_agent.beam.managed import (
+    write as deltacat_beam_managed_write,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    generate_random_suffix,
+    verify_duplicate_resolution,
+    wait_for_deltacat_jobs,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.spark import (
+    SparkSQLIcebergRead,
+    SparkSQLIcebergRewrite,
+)
+
+# Monkey-patch beam.managed.Write and beam.managed.Read
+beam.managed.Write = deltacat_beam_managed_write
+
+
+def run(
+    beam_options: Optional[PipelineOptions] = None,
+    mode: str = "write",  # 'write' to write data, 'read' to read data
+    rest_catalog_uri: str = "http://localhost:8181",  # REST catalog server URI
+    warehouse_path: Optional[str] = None,  # Optional custom warehouse path
+    table_name: Optional[str] = None,  # Table name with namespace
+    deltacat_converter_interval: float = 5.0,  # Converter monitoring interval
+    ray_inactivity_timeout: int = 20,  # Ray cluster shutdown timeout
+    max_converter_parallelism: int = 1,  # Maximum converter task parallelism
+    filesystem: Optional[pafs.FileSystem] = None,  # Optional PyArrow filesystem
+) -> None:
+    """
+    Run the pipeline in either 'write' or 'read' mode using Iceberg REST Catalog.
+
+    Prerequisites:
+    - Start the Iceberg REST catalog server:
+      docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0
+    - For read mode: Install PySpark:
+      pip install pyspark
+
+    Args:
+        beam_options: Apache Beam pipeline options
+        mode: 'write' to write data, 'read' to read data
+        rest_catalog_uri: URI of the REST catalog server (default: http://localhost:8181)
+        warehouse_path: Custom warehouse path (default: temporary directory)
+        table_name: Name of the Iceberg table (default: None - generates a random table name)
+        deltacat_converter_interval: Interval for DeltaCat optimizer monitoring
+        ray_inactivity_timeout: Timeout for shutting down Ray cluster
+        max_converter_parallelism: Maximum number of concurrent converter tasks
+        filesystem: PyArrow filesystem instance (default: LocalFileSystem)
+
+    Pipeline Operations:
+    - 'write': Write sample data to the Iceberg table with merge-on-read functionality.
+      Uses job-based table monitoring for better scalability and resource management.
+    - 'read': Read deduplicated data from the Iceberg table using Spark SQL.
+      Uses Spark SQL instead of Beam's native Iceberg I/O to properly handle positional deletes.
+    """
+    # Use custom warehouse path or create a temporary one
+    if warehouse_path is None:
+        warehouse_path = os.path.join("/tmp", "iceberg_rest_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+    # Use provided filesystem or create a LocalFileSystem by default
+    if filesystem is None:
+        filesystem = pafs.LocalFileSystem()
+
+    # Generate unique table name if using default to avoid conflicts
+    if not table_name:
+        random_suffix = generate_random_suffix()
+        table_name = f"default.demo_table_{random_suffix}"
+        print(f"📋 Generated unique table name: {table_name}")
+
+    # Define catalog configuration for REST catalog (simplified, table creation handled separately)
+    catalog_config = {
+        "catalog_properties": {
+            "warehouse": warehouse_path,
+            "catalog-impl": "org.apache.iceberg.rest.RESTCatalog",
+            "uri": rest_catalog_uri,
+        },
+        "deltacat_converter_properties": {
+            "deltacat_converter_interval": deltacat_converter_interval,
+            "merge_keys": ["id"],  # Configure merge keys for duplicate detection
+            "ray_inactivity_timeout": ray_inactivity_timeout,
+            "filesystem": filesystem,  # Pass filesystem to DeltaCAT converter
+            "max_converter_parallelism": max_converter_parallelism,
+        },
+    }
+
+    # Ensure table name includes namespace
+    if "." not in table_name:
+        full_table_name = f"default.{table_name}"
+    else:
+        full_table_name = table_name
+
+    print(f"🔧 Using Iceberg REST Catalog")
+    print(f"   REST Server: {rest_catalog_uri}")
+    print(f"   Warehouse: {warehouse_path}")
+    print(f"   Mode: {mode}")
+    print(f"   Table: {full_table_name}")
+    print(f"   Filesystem: {type(filesystem).__name__}")
+
+    # Remind user about prerequisites
+    if mode == "write":
+        print("📋 Prerequisites:")
+        print("   Make sure the Iceberg REST catalog server is running:")
+        print(
+            "   docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print()
+        with beam.Pipeline(options=beam_options) as p:
+            # Step 1: Write initial data to create the table
+            initial_data = p | "Create initial data" >> beam.Create(
+                [
+                    Row(id=1, name="Alice", value=100, version=1),
+                    Row(id=2, name="Bob", value=200, version=1),
+                    Row(id=3, name="Charlie", value=300, version=1),
+                    Row(id=4, name="David", value=400, version=1),
+                    Row(id=5, name="Eve", value=500, version=1),
+                    Row(id=6, name="Frank", value=600, version=1),
+                    Row(id=7, name="Grace", value=700, version=1),
+                    Row(id=8, name="Henry", value=800, version=1),
+                    Row(
+                        id=2, name="Robert", value=201, version=2
+                    ),  # Update Bob's record
+                    Row(
+                        id=3, name="Charles", value=301, version=2
+                    ),  # Update Charlie's record
+                    Row(id=9, name="Ivy", value=900, version=1),  # Add a new record
+                ]
+            )
+
+            initial_data | "Write initial data to Iceberg" >> beam.managed.Write(
+                beam.managed.ICEBERG,
+                config={
+                    "table": full_table_name,  # Use fully qualified table name for REST catalog
+                    "write_mode": "append",
+                    **catalog_config,
+                },
+            )
+
+        # Wait for the DeltaCAT converter job to complete and shutdown
+        wait_for_deltacat_jobs(
+            full_table_name, warehouse_path, ray_inactivity_timeout * 2
+        )
+
+        print(f"\n📝 Data writing completed with DeltaCAT optimization enabled.")
+        print(f"   - Table monitoring interval: {deltacat_converter_interval} seconds")
+        print(f"   - Ray cluster shutdown timeout: {ray_inactivity_timeout} seconds")
+        print(f"   - Automatic duplicate detection and resolution")
+        print(f"   - Position delete creation for duplicate resolution")
+        print(f"   - Job-based table monitoring with Ray")
+        print(f"   - Filesystem: {type(filesystem).__name__}")
+        print(
+            f"🔍 Read the table with: `python main.py --mode read --table-name {full_table_name}`"
+        )
+
+    elif mode == "read":
+        with beam.Pipeline(options=beam_options) as p:
+            # Read from the Iceberg table using Spark SQL
+            # Note: We use Spark SQL instead of beam.managed.Read because Beam's native Iceberg I/O
+            # cannot handle positional delete files created by DeltaCAT converter sessions.
+
+            print(f"📖 Reading from Iceberg table '{full_table_name}' using Spark SQL")
+
+            # Create a trigger element to start the read
+            trigger = p | "Create read trigger" >> beam.Create([None])
+
+            # Read from Iceberg table using Spark SQL
+            elements = trigger | "Read with Spark SQL" >> beam.ParDo(
+                SparkSQLIcebergRead(
+                    table_name=full_table_name,
+                    catalog_uri=rest_catalog_uri,
+                    warehouse=warehouse_path,
+                )
+            )
+
+            # Display the data read (after positional deletes are applied)
+            elements | "Print deduplicated data" >> beam.Map(
+                lambda row: print(f"📋 Record: {row}")
+            )
+
+            # Count records for summary
+            def count_and_display(elements_list):
+                print(f"\n📊 Read Summary:")
+                print(f"   - Total records: {len(elements_list)}")
+                return elements_list
+
+            # Collect all elements for counting
+            elements | "Count records" >> beam.combiners.ToList() | "Display summary" >> beam.Map(
+                count_and_display
+            )
+
+        # Verify that the data was correctly merged by ID
+        verify_duplicate_resolution(full_table_name, warehouse_path)
+
+    elif mode == "rewrite":
+        with beam.Pipeline(options=beam_options) as p:
+            # Rewrite table data files to materialize positional deletes
+            print(f"🔄 Rewriting Iceberg table to materialize positional deletes")
+            print(f"   - Table: {full_table_name}")
+            print(f"   - Purpose: Remove positional deletes to enable Beam writes")
+            print(f"   - Method: Spark rewrite_data_files procedure")
+
+            # Create a trigger element to start the rewrite
+            trigger = p | "Create rewrite trigger" >> beam.Create(
+                [f"rewrite_{full_table_name}"]
+            )
+
+            # Use Spark SQL to rewrite the table
+            rewrite_results = trigger | "Rewrite table with Spark SQL" >> beam.ParDo(
+                SparkSQLIcebergRewrite(
+                    catalog_uri=rest_catalog_uri,
+                    warehouse_path=warehouse_path,
+                    table_name=full_table_name,
+                )
+            )
+
+            # Log the results
+            rewrite_results | "Log rewrite results" >> beam.Map(
+                lambda result: print(f"📋 Rewrite result: {result}")
+            )
+    else:
+        raise ValueError(f"Unknown mode: {mode}. Use 'write', 'read', or 'rewrite'.")
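
For orientation, the run() entry point in app.py above can also be called directly from Python instead of going through main.py. A minimal sketch, assuming a local Iceberg REST catalog is already running at http://localhost:8181; the table name default.quickstart_demo is only a placeholder:

    from apache_beam.options.pipeline_options import PipelineOptions
    from deltacat.examples.experimental.iceberg.converter.beam import app

    # Write the sample rows (including the duplicate IDs 2 and 3) and let the
    # DeltaCAT converter job merge them; other parameters keep the defaults shown above.
    app.run(
        beam_options=PipelineOptions(save_main_session=True),
        mode="write",
        table_name="default.quickstart_demo",
    )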
deltacat/examples/experimental/iceberg/converter/beam/main.py
@@ -0,0 +1,133 @@
+import pyarrow.fs as pafs
+import argparse
+import logging
+
+from apache_beam.options.pipeline_options import PipelineOptions
+
+from deltacat.examples.experimental.iceberg.converter.beam import app
+
+
+if __name__ == "__main__":
+    logging.getLogger().setLevel(logging.INFO)
+    parser = argparse.ArgumentParser(
+        description="DeltaCat Beam Iceberg Converter Example using REST Catalog",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Start REST catalog server first (Iceberg 1.6.0):
+  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0
+
+  # Install PySpark for read and rewrite modes:
+  pip install pyspark
+
+  # Write sample data with DeltaCAT data file converter (automatic merge by key):
+  python main.py --mode write --table-name "deltacat.hello_world"
+
+  # Read data back (uses Spark SQL to read positional deletes):
+  python main.py --mode read --table-name "deltacat.hello_world"
+
+  # Rewrite table to materialize positional deletes:
+  python main.py --mode rewrite --table-name "deltacat.hello_world"
+
+  # Use custom REST catalog server:
+  python main.py --mode write --rest-uri http://localhost:9000 --table-name "deltacat.hello_world"
+
+  # Use custom warehouse path:
+  python main.py --mode write --warehouse-path /tmp/my_warehouse --table-name "deltacat.hello_world"
+        """,
+    )
+
+    parser.add_argument(
+        "--mode",
+        default="write",
+        choices=["write", "read", "rewrite"],
+        help="Pipeline mode: 'write' to write data, 'read' to read data, 'rewrite' to materialize positional deletes (default: write). "
+        " Note: Beam writes may fail on tables processed by external tools.",
+    )
+
+    parser.add_argument(
+        "--rest-uri",
+        default="http://localhost:8181",
+        help="REST catalog server URI (default: http://localhost:8181).",
+    )
+
+    parser.add_argument(
+        "--warehouse-path",
+        default=None,
+        help="Custom warehouse path (default: temporary directory).",
+    )
+
+    parser.add_argument(
+        "--table-name",
+        default=None,
+        help="Table name to use (default: autogenerated table name).",
+    )
+
+    parser.add_argument(
+        "--deltacat-converter-interval",
+        type=float,
+        default=5.0,
+        help="DeltaCat converter monitoring interval in seconds (default: 5.0).",
+    )
+
+    parser.add_argument(
+        "--ray-inactivity-timeout",
+        type=int,
+        default=20,
+        help="Ray cluster shutdown timeout after inactivity in seconds (default: 20).",
+    )
+
+    parser.add_argument(
+        "--max-converter-parallelism",
+        type=int,
+        default=1,
+        help="Maximum converter task parallelism - number of concurrent converter tasks (default: 1).",
+    )
+
+    args, beam_args = parser.parse_known_args()
+
+    beam_options = PipelineOptions(
+        beam_args,
+        save_main_session=True,
+    )
+
+    print("DeltaCAT Beam Iceberg Upsert Example")
+    print("=" * 50)
+    print(f"Mode: {args.mode}")
+    print(f"REST Catalog URI: {args.rest_uri}")
+    print(f"Warehouse Path: {args.warehouse_path or 'temporary directory'}")
+    print(f"Table Name: {args.table_name}")
+    print(f"Converter Interval: {args.deltacat_converter_interval}s")
+    print(f"Ray Inactivity Timeout: {args.ray_inactivity_timeout}s")
+    print(f"Max Converter Parallelism: {args.max_converter_parallelism}")
+    print()
+
+    # Remind user about prerequisites
+    if args.mode == "write":
+        print("Prerequisites:")
+        print("  Make sure the Iceberg REST catalog server is running:")
+        print(
+            "  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print()
+    elif args.mode in ["read", "rewrite"]:
+        print("Prerequisites:")
+        print("  Make sure the Iceberg REST catalog server is running:")
+        print(
+            "  docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+        )
+        print("  PySpark is required for this mode:")
+        print("  pip install pyspark")
+        print()
+
+    app.run(
+        beam_options=beam_options,
+        mode=args.mode,
+        rest_catalog_uri=args.rest_uri,
+        warehouse_path=args.warehouse_path,
+        table_name=args.table_name,
+        deltacat_converter_interval=args.deltacat_converter_interval,
+        ray_inactivity_timeout=args.ray_inactivity_timeout,
+        filesystem=pafs.LocalFileSystem(),
+        max_converter_parallelism=args.max_converter_parallelism,
+    )
deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Quick test script to demonstrate the REST catalog workflow.
+This script shows the complete write → read cycle with DeltaCAT monitoring and conversion.
+"""
+
+import subprocess
+import sys
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    generate_random_suffix,
+    check_rest_catalog,
+)
+from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
+    verify_duplicate_resolution,
+)
+
+
+def run_example(mode, table_name, input_text="Workflow Test"):
+    """Run the example in the specified mode."""
+    print(f"\n🚀 Running example in {mode} mode with table: {table_name}")
+    cmd = [
+        sys.executable,
+        "main.py",
+        "--mode",
+        mode,
+        "--input-text",
+        input_text,
+        "--table-name",
+        table_name,
+    ]
+
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=120
+        )  # Increased timeout for converter
+        if result.returncode == 0:
+            print(f"✅ {mode.capitalize()} operation completed successfully")
+            if mode == "read":
+                # Show sample data from the output
+                lines = result.stdout.split("\n")
+                data_lines = [line for line in lines if "BeamSchema" in line]
+                if data_lines:
+                    print(f"📊 Found {len(data_lines)} records in table")
+                    print("Sample records:")
+                    for line in data_lines[:5]:  # Show first 5 records
+                        print(f"  {line}")
+                    if len(data_lines) > 5:
+                        print(f"  ... and {len(data_lines) - 5} more records")
+            return True
+        else:
+            print(f"❌ {mode.capitalize()} operation failed:")
+            print(result.stderr)
+            return False
+    except subprocess.TimeoutExpired:
+        print(f"⏰ {mode.capitalize()} operation timed out")
+        return False
+    except Exception as e:
+        print(f"❌ Error running {mode} operation: {e}")
+        return False
+
+
+def main():
+    """Main workflow test."""
+    print("🧪 DeltaCAT Beam Iceberg REST Catalog Workflow Test")
+    print("=" * 60)
+
+    # Generate unique table name to avoid conflicts
+    random_suffix = generate_random_suffix()
+    table_name = f"default.demo_table_{random_suffix}"
+    print(f"📋 Generated unique table name: {table_name}")
+
+    # Step 1: Check prerequisites
+    if not check_rest_catalog():
+        sys.exit(1)
+
+    # Step 2: Write data (creates table with duplicates and triggers converter)
+    print(f"\n📋 Phase 1: Writing data and triggering DeltaCAT converter")
+    if not run_example("write", table_name, "Workflow Demo User"):
+        print("❌ Write test failed")
+        sys.exit(1)
+
+    # Step 3: Verify upsert merge worked as expected
+    print(f"\n📋 Phase 2: Direct verification of duplicate resolution")
+    verification_success = verify_duplicate_resolution(table_name)
+
+    # Step 4: Read data back to show final state
+    print(f"\n📋 Phase 3: Reading final table state")
+    if not run_example("read", table_name):
+        print("❌ Read test failed")
+        sys.exit(1)
+
+    # Final summary
+    print("\n🎉 Workflow test completed!")
+
+    if verification_success:
+        print("\n✅ SUCCESS:")
+        print("  ✅ Table creation and writes")
+        print("  ✅ DeltaCAT monitoring merged duplicates")
+        print("  ✅ Read operations correctly read merged data")
+    else:
+        print("\n⚠️ PARTIAL SUCCESS:")
+        print("  ✅ Table creation and writes")
+        print("  ❓ Converter may still be processing or failed")
+        print("  📝 Check logs for converter execution details")
+
+    print("\n📚 What happened:")
+    print("  1. Beam wrote data creating duplicates (IDs 2,3)")
+    print("  2. DeltaCAT monitoring merged duplicates")
+    print("  3. Table now contains merged data")
+
+
+if __name__ == "__main__":
+    main()
deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py
@@ -0,0 +1,3 @@
+"""
+Utility modules for the Iceberg converter example.
+"""
deltacat/examples/experimental/iceberg/converter/beam/utils/common.py
@@ -0,0 +1,174 @@
+"""
+Common utility functions for the Iceberg converter example.
+"""
+
+import random
+import string
+import time
+from pyiceberg.catalog import load_catalog
+import requests
+from deltacat import local_job_client
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.experimental.converter_agent.table_monitor import _generate_job_name
+
+
+def generate_random_suffix(length=8):
+    """Generate a random string of specified length using letters and digits."""
+    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))
+
+
+def check_rest_catalog():
+    """Check if REST catalog is running."""
+    try:
+        response = requests.get("http://localhost:8181/v1/config", timeout=5)
+        if response.status_code == 200:
+            print("✅ REST catalog is running")
+            return True
+    except requests.exceptions.RequestException:
+        pass
+
+    print("❌ REST catalog is not running")
+    print(
+        "📋 Start it with: docker run -d -p 8181:8181 --name iceberg-rest-catalog tabulario/iceberg-rest:1.6.0"
+    )
+    return False
+
+
+def wait_for_deltacat_jobs(
+    table_name, warehouse_path="/tmp/iceberg_rest_warehouse", timeout=120
+):
+    """
+    Wait for DeltaCAT converter jobs to complete by checking job status.
+
+    Args:
+        table_name: Full table name (e.g., "default.demo_table_abc123")
+        warehouse_path: Warehouse path used for job tracking
+        timeout: Maximum seconds to wait for job completion
+
+    Returns:
+        True if all jobs completed, False if timeout
+    """
+    print(f"\n⏳ Monitoring DeltaCAT converter jobs for table: {table_name}")
+
+    # Parse table name to get namespace and table name
+    if "." in table_name:
+        namespace, actual_table_name = table_name.split(".", 1)
+    else:
+        namespace = DEFAULT_NAMESPACE
+        actual_table_name = table_name
+
+    # Create job key matching the format used in managed.py
+    job_name = _generate_job_name(
+        warehouse_path=warehouse_path, namespace=namespace, table_name=actual_table_name
+    )
+
+    start_time = time.time()
+
+    try:
+        # Get the job client
+        client = local_job_client(ray_init_args={"local_mode": True})
+
+        while time.time() - start_time < timeout:
+            job_details_list = client.list_jobs()
+            print(f"🔍 Job details list: {job_details_list}")
+            job_submission_ids = [
+                job_details.submission_id for job_details in job_details_list
+            ]
+
+            # Check if we have any tracked jobs for this table
+            print(f"🔍 Looking for submission ID: {job_name} in {job_submission_ids}")
+            if job_name in job_submission_ids:
+                # Check job status with Ray
+                try:
+                    job_status = client.get_job_status(job_name)
+                    print(f"📊 Job {job_name} status: {job_status}")
+                    # Check if job is still running
+                    if job_status and str(job_status) in ["PENDING", "RUNNING"]:
+                        time.sleep(2)  # Short polling interval
+                        continue
+                    else:
+                        print(f"✅ Job {job_name} completed with status: {job_status}")
+                        return True
+
+                except Exception as e:
+                    print(f"⚠️ Could not check job status for {job_name}: {e}")
+                    # If we can't check status, assume job is done
+                    return True
+            time.sleep(1)
+        print(f"⏰ Timeout waiting for DeltaCAT job completion after {timeout} seconds")
+        return False
+
+    except Exception as e:
+        print(f"❌ Error monitoring DeltaCAT jobs: {e}")
+        # Fall back to short sleep if monitoring fails
+        print(f"🔄 Falling back to {timeout}-second wait...")
+        time.sleep(timeout)
+        return True
+
+
+def verify_duplicate_resolution(
+    table_name, warehouse_path="/tmp/iceberg_rest_warehouse"
+):
+    """
+    Verify that the DeltaCAT converter successfully resolved duplicates.
+    """
+    try:
+        print(f"\n🔍 Verifying duplicate resolution for table: {table_name}")
+
+        # Create PyIceberg catalog to check results
+        verification_catalog = load_catalog(
+            "workflow_verification_catalog",
+            **{
+                "type": "rest",
+                "warehouse": warehouse_path,
+                "uri": "http://localhost:8181",
+            },
+        )
+
+        # Load the table and scan its contents
+        table_identifier = table_name
+        tbl = verification_catalog.load_table(table_identifier)
+        scan_result = tbl.scan().to_arrow().to_pydict()
+
+        # Check the results
+        result_ids = sorted(scan_result["id"])
+        unique_ids = sorted(set(result_ids))
+
+        print(f"📊 Final verification results:")
+        print(f"   - Total records: {len(result_ids)}")
+        print(f"   - Unique IDs: {len(unique_ids)}")
+        print(f"   - IDs found: {result_ids}")
+
+        # Check if duplicates were resolved
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+        if result_ids == expected_unique_ids:
+            # Verify that the latest versions were preserved
+            names_by_id = {}
+            versions_by_id = {}
+            for i, id_val in enumerate(scan_result["id"]):
+                names_by_id[id_val] = scan_result["name"][i]
+                versions_by_id[id_val] = scan_result["version"][i]
+
+            if (
+                names_by_id.get(2) == "Robert"
+                and names_by_id.get(3) == "Charles"
+                and versions_by_id.get(2) == 2
+                and versions_by_id.get(3) == 2
+            ):
+                print(f"✅ Duplicate resolution SUCCESSFUL!")
+                print(f"   - All 9 IDs are unique")
+                print(f"   - Latest versions preserved (Bob→Robert, Charlie→Charles)")
+                print(f"   - Version numbers correct (v2 for updated records)")
+                return True
+            else:
+                print(f"❌ Latest versions not preserved correctly")
+        else:
+            print(f"❌ Duplicates still present or unexpected record count")
+            print(f"   - Expected: {expected_unique_ids}")
+            print(f"   - Got: {result_ids}")
+
+        return False
+
+    except Exception as e:
+        print(f"❌ Error during verification: {e}")
+        return False
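
The helpers in common.py above can also be used on their own after a write run, for example to confirm that the converter collapsed the duplicate keys. A minimal sketch, assuming the same local REST catalog on port 8181; the table name is a placeholder taken from the docstring example:

    from deltacat.examples.experimental.iceberg.converter.beam.utils.common import (
        check_rest_catalog,
        verify_duplicate_resolution,
    )

    # Only query the table if the REST catalog endpoint responds.
    if check_rest_catalog():
        # Returns True when exactly IDs 1-9 remain and the version-2 records won.
        verify_duplicate_resolution("default.demo_table_abc123")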