deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/experimental/converter_agent/beam/managed.py (new file)
@@ -0,0 +1,173 @@
+"""
+DeltaCAT Job-based Managed I/O for Apache Beam
+
+This module provides a job-based implementation of the DeltaCAT table monitor
+that uses Ray jobs for better scalability and resource management instead of
+threading.
+
+Key Features:
+- Uses DeltaCAT jobs for table monitoring
+- Unique job IDs prevent duplicate monitoring jobs
+- Supports both local and remote Ray clusters
+- Backward compatible with existing managed.py interface
+"""
+
+import logging
+from typing import Dict, Any
+
+import apache_beam as beam
+from pyiceberg.catalog import CatalogType
+
+from deltacat.experimental.converter_agent.table_monitor import submit_table_monitor_job
+from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+import deltacat.logs as logs
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+# Store original functions before monkey-patching
+_original_write = beam.managed.Write
+
+
+# Create a dictionary of Java catalog impl to CatalogType
+JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE = {
+    "org.apache.iceberg.rest.restcatalog": CatalogType.REST,
+    "org.apache.iceberg.hive.hivecatalog": CatalogType.HIVE,
+    "org.apache.iceberg.aws.glue.gluecatalog": CatalogType.GLUE,
+    "org.apache.iceberg.jdbc.jdbccatalog": CatalogType.SQL,
+}
+
+
+def _extract_catalog_config_from_beam(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract catalog configuration from Beam config."""
+    catalog_properties = config.get("catalog_properties", {})
+
+    # Extract catalog implementation class
+    catalog_impl = catalog_properties.get("catalog-impl")
+
+    # Extract catalog type
+    catalog_type = catalog_properties.get("type")
+
+    # Extract other relevant properties
+    warehouse = catalog_properties.get("warehouse", "")
+    uri = catalog_properties.get("uri", "")
+
+    return {
+        "catalog_impl": catalog_impl,
+        "type": catalog_type,
+        "warehouse": warehouse,
+        "uri": uri,
+        "catalog_properties": catalog_properties,
+    }
+
+
+def write(*args, **kwargs):
+    """Wrapper over beam.managed.Write that automatically creates a DeltaCAT table monitor & converter job."""
+    logger.debug(f"Starting DeltaCAT write operation")
+    logger.debug(f"args: {args}")
+    logger.debug(f"kwargs keys: {list(kwargs.keys()) if kwargs else 'None'}")
+
+    # Extract and pop deltacat-specific config keys
+    config = kwargs.get("config", {}).copy() if kwargs.get("config") else {}
+
+    # Extract DeltaCAT converter properties from parent config or individual keys (for backward compatibility)
+    deltacat_converter_properties = config.pop("deltacat_converter_properties", {})
+
+    # Support both new nested structure and old flat structure for backward compatibility
+    deltacat_converter_interval = deltacat_converter_properties.get(
+        "deltacat_converter_interval", 3.0
+    )
+
+    merge_keys = deltacat_converter_properties.get("merge_keys")
+
+    # Extract filesystem parameter (optional) - can be in converter properties or top-level config
+    filesystem = deltacat_converter_properties.get("filesystem", None)
+
+    # Extract cluster configuration file path (for remote jobs)
+    cluster_cfg_file_path = deltacat_converter_properties.get(
+        "cluster_cfg_file_path", None
+    )
+
+    # Extract max converter parallelism
+    max_converter_parallelism = deltacat_converter_properties.get(
+        "max_converter_parallelism",
+        DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
+    )
+
+    # Extract ray inactivity timeout
+    ray_inactivity_timeout = deltacat_converter_properties.get(
+        "ray_inactivity_timeout", 10
+    )
+
+    # Extract table identifier and warehouse path
+    table_identifier = config.get("table")
+    if not table_identifier:
+        raise ValueError("Table is required")
+
+    if table_identifier and "." in table_identifier:
+        namespace, table_name = table_identifier.split(".", 1)
+    else:
+        namespace = "default"
+        table_name = table_identifier
+
+    warehouse_path = config.get("catalog_properties", {}).get("warehouse", "")
+
+    # Extract catalog configuration for monitoring
+    beam_catalog_config = _extract_catalog_config_from_beam(config)
+
+    # Derive CatalogType from "catalog_impl" or "type" property
+    catalog_impl = beam_catalog_config.get("catalog_impl")
+    if catalog_impl:
+        catalog_type = JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE.get(catalog_impl.lower())
+        if not catalog_type:
+            raise ValueError(f"Unsupported catalog implementation: {catalog_impl}")
+    else:
+        catalog_type_str = beam_catalog_config.get("type")
+        if catalog_type_str:
+            catalog_type = CatalogType(catalog_type_str.lower())
+        else:
+            raise ValueError(
+                f"No catalog implementation or type found in config: {beam_catalog_config}"
+            )
+
+    # Update kwargs with the modified config
+    if "config" in kwargs:
+        kwargs["config"] = config
+
+    logger.debug(f"Preparing to submit table monitor job...")
+    logger.debug(f"table_name: {table_name}")
+    logger.debug(f"deltacat_converter_interval: {deltacat_converter_interval}s")
+    logger.debug(f"merge_keys: {merge_keys}")
+    logger.debug(f"warehouse_path: {warehouse_path}")
+    logger.debug(
+        f"filesystem: {type(filesystem).__name__ if filesystem else 'None (auto-resolve)'}"
+    )
+    logger.debug(f"cluster_cfg_file_path: {cluster_cfg_file_path or 'None (local)'}")
+    logger.debug(f"max_converter_parallelism: {max_converter_parallelism}")
+    logger.debug(f"ray_inactivity_timeout: {ray_inactivity_timeout}s")
+    logger.debug(
+        f"using deltacat_converter_properties: {len(deltacat_converter_properties) > 0}"
+    )
+    logger.debug(f"catalog_type: {catalog_type}")
+
+    # Submit monitoring job
+    try:
+        submit_table_monitor_job(
+            warehouse_path=warehouse_path,
+            catalog_type=catalog_type,
+            catalog_uri=beam_catalog_config.get("uri"),
+            namespace=namespace,
+            table_name=table_name,
+            merge_keys=merge_keys,
+            monitor_interval=deltacat_converter_interval,
+            filesystem=filesystem,
+            cluster_cfg_file_path=cluster_cfg_file_path,
+            max_converter_parallelism=max_converter_parallelism,
+            ray_inactivity_timeout=ray_inactivity_timeout,
+        )
+    except Exception as e:
+        # Don't fail the write operation, just log the error
+        logger.error(f"Failed to submit table monitor job: {e}")
+        logger.error(f"Exception traceback:", exc_info=True)
+    logger.info(f"Delegating to beam.managed.Write")
+    return _original_write(*args, **kwargs)
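
For context, the hunk above wraps beam.managed.Write so that an Iceberg write through Beam's managed I/O also submits a DeltaCAT table monitor job before delegating to the original transform. A minimal usage sketch follows; the pipeline, table name, catalog URI, and warehouse path are illustrative, while the config keys mirror the ones the wrapper parses above:

    import apache_beam as beam
    from deltacat.experimental.converter_agent.beam import managed

    with beam.Pipeline() as p:
        (
            p
            # beam.Row values give the PCollection the schema the Iceberg sink needs
            | beam.Create([beam.Row(id=1, name="a"), beam.Row(id=1, name="b")])
            | managed.write(  # drop-in replacement for beam.managed.Write
                "iceberg",
                config={
                    "table": "default.demo_table",  # illustrative table identifier
                    "catalog_properties": {
                        "type": "rest",  # or "catalog-impl" for a Java catalog class
                        "uri": "http://localhost:8181",  # illustrative catalog URI
                        "warehouse": "/tmp/iceberg-warehouse",  # illustrative path
                    },
                    # DeltaCAT-specific keys popped off by the wrapper before delegation
                    "deltacat_converter_properties": {
                        "merge_keys": ["id"],
                        "deltacat_converter_interval": 3.0,
                        "max_converter_parallelism": 2,
                        "ray_inactivity_timeout": 60,
                    },
                },
            )
        )

Per the except block above, a failure to submit the monitor job is logged but does not fail the Beam write itself.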
deltacat/experimental/converter_agent/table_monitor.py (new file)
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""
+DeltaCAT Table Monitor Job. Automatically runs data converter sessions in response to table updates.
+"""
+
+import argparse
+import hashlib
+import json
+import logging
+import os
+import time
+from typing import List, Optional
+
+import pyarrow.fs as pafs
+import ray
+import deltacat
+
+from pyiceberg.catalog import load_catalog, CatalogType
+from pyiceberg.exceptions import NoSuchTableError
+
+from deltacat import job_client, local_job_client
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from deltacat.compute.jobs.client import DeltaCatJobClient
+from deltacat.utils.filesystem import (
+    resolve_path_and_filesystem,
+    FilesystemType,
+)
+import deltacat.logs as logs
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def monitor_table(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: List[str],
+    filesystem_type: FilesystemType = FilesystemType.LOCAL,
+    monitor_interval: float = 5.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Monitor an Iceberg table for changes and run converter sessions when needed."""
+
+    logger.info(
+        f"Starting table monitor. Namespace: '{namespace}', Table: '{table_name}', "
+        f"Warehouse: '{warehouse_path}', Catalog type: '{catalog_type}', "
+        f"Catalog URI: '{catalog_uri or 'None'}', Merge keys: '{merge_keys}', "
+        f"Filesystem type: '{filesystem_type}', Monitor interval: '{monitor_interval}s', "
+        f"Max converter parallelism: '{max_converter_parallelism}', "
+        f"Ray inactivity timeout: '{ray_inactivity_timeout}s'"
+    )
+
+    # Create PyIceberg catalog
+    catalog = load_catalog(
+        "monitor_catalog",
+        type=catalog_type,
+        warehouse=warehouse_path,
+        uri=catalog_uri or None,
+    )
+
+    # Set up filesystem
+    filesystem = FilesystemType.to_filesystem(filesystem_type)
+    if filesystem_type == FilesystemType.UNKNOWN:
+        normalized_warehouse_path, filesystem = resolve_path_and_filesystem(
+            warehouse_path
+        )
+        warehouse_path = normalized_warehouse_path
+
+    logger.info(f"Resolved filesystem: {type(filesystem).__name__}")
+    logger.info(f"Normalized warehouse path: {warehouse_path}")
+
+    # Parse table identifier
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+    table_identifier = f"{namespace}.{table_name}"
+    logger.info(f" - Parsed table - namespace: '{namespace}', table: '{table_name}'")
+
+    last_snapshot_id = None
+    start_time = time.time()
+    last_write_time = start_time  # Track last time we saw table activity
+
+    while True:
+        # Sleep before starting the first iteration and all subsequent iterations
+        logger.debug(f"Sleeping for {monitor_interval}s before next check...")
+        time.sleep(monitor_interval)
+
+        logger.info(f"Checking table {table_identifier} for updates...")
+
+        # Try to load the table
+        try:
+            tbl = catalog.load_table(table_identifier)
+            current_snapshot_id = tbl.metadata.current_snapshot_id
+            if last_snapshot_id != current_snapshot_id:
+                logger.info(
+                    f"New table version detected - snapshot ID: {current_snapshot_id}"
+                )
+                logger.info(f"Table has {len(tbl.metadata.snapshots)} snapshots")
+                logger.info(f"Table format version: {tbl.metadata.format_version}")
+
+                # Update last activity time when we detect table changes
+                last_write_time = time.time()
+
+                # Always run deduplication when there are snapshots (duplicates can exist within a single snapshot)
+                logger.info(
+                    f"Table has data - triggering converter session to resolve any duplicates..."
+                )
+
+                # Run converter session
+                try:
+                    converter_params = ConverterSessionParams.of(
+                        {
+                            "catalog": catalog,
+                            "iceberg_namespace": namespace,
+                            "iceberg_table_name": table_name,
+                            "iceberg_warehouse_bucket_name": warehouse_path,
+                            "merge_keys": merge_keys,
+                            "enforce_primary_key_uniqueness": True,
+                            "task_max_parallelism": max_converter_parallelism,
+                            "filesystem": filesystem,
+                            "location_provider_prefix_override": None,
+                        }
+                    )
+
+                    logger.debug(f"Converter Session Parameters: {converter_params}")
+
+                    logger.info(f"Starting converter session...")
+                    updated_metadata = converter_session(params=converter_params)
+                    logger.info(f"Converter session completed successfully")
+                    current_snapshot_id = updated_metadata.current_snapshot_id
+                    logger.info(
+                        f"Current snapshot ID updated to: {current_snapshot_id}"
+                    )
+                except Exception as e:
+                    logger.error(f"Converter session failed: {e}")
+                    logger.error(f"Exception traceback:", exc_info=True)
+                last_snapshot_id = current_snapshot_id
+            else:
+                logger.debug(
+                    f"No table changes detected (snapshot ID: {current_snapshot_id})"
+                )
+        except NoSuchTableError:
+            logger.info(f"Table {table_identifier} does not exist yet - waiting...")
+        except Exception as e:
+            logger.error(f"Error in table monitor: {e}")
+
+        # Check for Ray inactivity timeout
+        current_time = time.time()
+        inactivity_duration = current_time - last_write_time
+
+        if inactivity_duration >= ray_inactivity_timeout:
+            logger.info(
+                f"Ray inactivity timeout reached ({inactivity_duration:.1f}s >= {ray_inactivity_timeout}s)"
+            )
+            logger.info(
+                f"No table activity detected for {inactivity_duration:.1f} seconds, shutting down Ray..."
+            )
+
+            try:
+                if ray.is_initialized():
+                    ray.shutdown()
+                    logger.info("Ray shutdown successfully due to inactivity")
+                else:
+                    logger.info("Ray was not initialized, nothing to shut down")
+            except Exception as e:
+                logger.error(f"Error shutting down Ray: {e}")

+            logger.info(f"Table monitor stopping due to inactivity timeout")
+            break
+
+    logger.info(f"Table monitor completed")
+
+
+def _generate_job_name(warehouse_path: str, namespace: str, table_name: str) -> str:
+    """
+    Generate a unique job name based on warehouse path, namespace, and table name.
+
+    Args:
+        warehouse_path: Warehouse path
+        namespace: Table namespace
+        table_name: Table name
+
+    Returns:
+        Job name string.
+    """
+    # Create a sha1 digest of the warehouse path, namespace, and table name
+    digest = hashlib.sha1(
+        f"{warehouse_path}-{namespace}-{table_name}".encode()
+    ).hexdigest()
+    job_name = f"deltacat-monitor-{digest}"
+
+    return job_name
+
+
+def _cleanup_terminated_jobs_for_submission_id(
+    client: DeltaCatJobClient, submission_id: str
+) -> bool:
+    """Clean up any terminated jobs with the given submission ID."""
+    logger.debug(
+        f"Searching for terminated jobs to cleanup with submission ID: {submission_id}"
+    )
+    try:
+        all_jobs = client.list_jobs()
+        logger.debug(f"All jobs: {all_jobs}")
+        for job in all_jobs:
+            if job.submission_id == submission_id and job.status.is_terminal():
+                logger.info(
+                    f"Cleaning up terminated job: {submission_id} (status: {job.status})"
+                )
+                client.delete_job(submission_id)
+                return True
+    except Exception as e:
+        logger.warning(f"Cleanup failed for job '{submission_id}': {e}")
+    return False
+
+
+def submit_table_monitor_job(
+    warehouse_path: str,
+    catalog_type: CatalogType,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: list,
+    monitor_interval: float,
+    max_converter_parallelism: int,
+    filesystem: pafs.FileSystem = None,
+    cluster_cfg_file_path: Optional[str] = None,
+    ray_inactivity_timeout: int = 10,
+) -> str:
+    """
+    Submit a table monitor job to Ray cluster.
+
+    Args:
+        warehouse_path: Warehouse path
+        catalog_type: Catalog type
+        catalog_uri: Catalog URI
+        namespace: Table namespace
+        table_name: Table name to monitor
+        merge_keys: List of merge key column names
+        monitor_interval: Seconds between monitoring checks
+        max_converter_parallelism: Maximum number of concurrent converter tasks
+        filesystem: PyArrow filesystem instance
+        cluster_cfg_file_path: Path to cluster config file (None for local)
+        ray_inactivity_timeout: Seconds to wait before shutting down Ray cluster
+    Returns:
+        Job ID of the submitted job
+    """
+
+    # Parse table identifier to extract namespace and table name
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+
+    # Generate unique job ID based on the warehouse and table path
+    job_name = _generate_job_name(
+        warehouse_path=warehouse_path, namespace=namespace, table_name=table_name
+    )
+
+    # Resolve the appropriate local or remote job client
+    if cluster_cfg_file_path:
+        # Submit to remote cluster
+        logger.info(
+            f"Preparing to submit job to remote cluster: {cluster_cfg_file_path}"
+        )
+        # Set the cluster name to the job ID to prevent starting multiple Ray clusters monitoring the same table.
+        client = job_client(cluster_cfg_file_path, cluster_name_override=job_name)
+    else:
+        # Submit to local cluster using DeltaCAT local job client
+        ray_init_args = {
+            "local_mode": True,
+            "resources": {"convert_task": max_converter_parallelism},
+        }
+        logger.info(
+            f"Preparing to submit job locally with ray init args: {ray_init_args}"
+        )
+        client = local_job_client(ray_init_args=ray_init_args)
+
+    # Add filesystem type - determine from filesystem instance
+    filesystem_type = FilesystemType.from_filesystem(filesystem)
+
+    # Build CLI arguments for table_monitor job
+    table_monitor_script_dir = os.path.dirname(os.path.abspath(__file__))
+    table_monitor_script_path = os.path.join(
+        table_monitor_script_dir, "table_monitor.py"
+    )
+
+    logger.debug(f"Table monitor script path: {table_monitor_script_path}")
+    logger.debug(
+        f"Table monitor script exists: {os.path.exists(table_monitor_script_path)}"
+    )
+
+    cmd_args = [
+        f"python {table_monitor_script_path}",
+        f"--catalog-type '{catalog_type.value}'",
+        f"--warehouse-path '{warehouse_path}'",
+        f"--catalog-uri '{catalog_uri}'",
+        f"--namespace '{namespace}'",
+        f"--table-name '{table_name}'",
+        f"--merge-keys '{json.dumps(merge_keys)}'",
+        f"--monitor-interval {monitor_interval}",
+        f"--max-converter-parallelism {max_converter_parallelism}",
+        f"--ray-inactivity-timeout {ray_inactivity_timeout}",
+        f"--filesystem-type '{filesystem_type}'",
+    ]
+
+    # Join all arguments
+    entrypoint = " ".join(cmd_args)
+    logger.debug(
+        f"Submitting table monitor job '{job_name}' with entrypoint: {entrypoint}"
+    )
+
+    # Clean up any terminated jobs with the same submission ID to allow reuse
+    _cleanup_terminated_jobs_for_submission_id(client, job_name)
+
+    # Submit the job with the correct working directory
+    # Working directory should be the converter_agent directory where table_monitor.py is located
+    job_submission_id = client.submit_job(
+        submission_id=job_name,
+        entrypoint=entrypoint,
+        runtime_env={"working_dir": table_monitor_script_dir},
+    )
+
+    logger.info(f"Table monitor job submitted successfully: {job_submission_id}")
+
+    return job_submission_id
+
+
+def run(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: str,
+    filesystem_type: str = "local",
+    monitor_interval: float = 1.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Run table monitor with the given parameters."""
+
+    # Parse merge keys
+    merge_keys_list = json.loads(merge_keys)
+
+    # Run the monitor
+    monitor_table(
+        catalog_type=catalog_type,
+        warehouse_path=warehouse_path,
+        catalog_uri=catalog_uri,
+        namespace=namespace,
+        table_name=table_name,
+        merge_keys=merge_keys_list,
+        filesystem_type=filesystem_type,
+        monitor_interval=monitor_interval,
+        max_converter_parallelism=max_converter_parallelism,
+        ray_inactivity_timeout=ray_inactivity_timeout,
+    )
+
+
+if __name__ == "__main__":
+    """
+    DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions
+
+    Example usage:
+    $ python table_monitor.py \
+    $   --catalog-type 'rest' \
+    $   --warehouse-path '/tmp/iceberg-warehouse' \
+    $   --catalog-uri 'http://localhost:8181' \
+    $   --namespace 'default' \
+    $   --table-name 'demo_table' \
+    $   --merge-keys '["id"]' \
+    $   --monitor-interval 1.0 \
+    $   --max-converter-parallelism 2 \
+    $   --ray-inactivity-timeout 300.0
+    """
+
+    script_args = [
+        (
+            ["--catalog-type"],
+            {
+                "help": "Catalog type name (rest, hive, sql)",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--warehouse-path"],
+            {
+                "help": "Warehouse path",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--catalog-uri"],
+            {
+                "help": "Catalog URI",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--namespace"],
+            {
+                "help": "Table namespace",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--table-name"],
+            {
+                "help": "Table name to monitor",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--merge-keys"],
+            {
+                "help": "Comma-separated merge key column names",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--filesystem-type"],
+            {
+                "help": "Filesystem type",
+                "type": str,
+                "default": "local",
+            },
+        ),
+        (
+            ["--monitor-interval"],
+            {
+                "help": "Seconds between monitoring checks",
+                "type": float,
+                "default": 5.0,
+            },
+        ),
+        (
+            ["--max-converter-parallelism"],
+            {
+                "help": "Maximum number of concurrent converter tasks",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            ["--ray-inactivity-timeout"],
+            {
+                "help": "Ray inactivity timeout in seconds (Ray will shutdown if no activity)",
+                "type": int,
+                "default": 300,
+            },
+        ),
+    ]
+
+    # Parse CLI input arguments
+    parser = argparse.ArgumentParser(
+        description="DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions"
+    )
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"[TABLE MONITOR] Command Line Arguments: {args}")
+
+    # Initialize DeltaCAT
+    deltacat.init()
+
+    # Run the table monitor using the parsed arguments
+    run(**vars(args))
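
Beyond the CLI entrypoint above, the monitor can also be scheduled directly from Python via submit_table_monitor_job, bypassing the Beam wrapper. A minimal sketch using keyword arguments taken from the signature above; the warehouse path, catalog URI, and table name are illustrative values, not defaults:

    from pyiceberg.catalog import CatalogType
    from deltacat.experimental.converter_agent.table_monitor import (
        submit_table_monitor_job,
    )

    # Submits a Ray job that polls the table and runs converter sessions on new snapshots.
    job_id = submit_table_monitor_job(
        warehouse_path="/tmp/iceberg-warehouse",  # illustrative
        catalog_type=CatalogType.REST,
        catalog_uri="http://localhost:8181",  # illustrative
        namespace="default",
        table_name="demo_table",  # illustrative
        merge_keys=["id"],
        monitor_interval=5.0,
        max_converter_parallelism=1,
        cluster_cfg_file_path=None,  # None -> local Ray job client
        ray_inactivity_timeout=300,
    )
    # The submission id is derived from a sha1 of "<warehouse>-<namespace>-<table>",
    # so resubmitting for the same table reuses (and first cleans up) the same job name.
    print(job_id)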