deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/deltacat/experimental/converter_agent/beam/managed.py
@@ -0,0 +1,173 @@
+"""
+DeltaCAT Job-based Managed I/O for Apache Beam
+
+This module provides a job-based implementation of the DeltaCAT table monitor
+that uses Ray jobs for better scalability and resource management instead of
+threading.
+
+Key Features:
+- Uses DeltaCAT jobs for table monitoring
+- Unique job IDs prevent duplicate monitoring jobs
+- Supports both local and remote Ray clusters
+- Backward compatible with existing managed.py interface
+"""
+
+import logging
+from typing import Dict, Any
+
+import apache_beam as beam
+from pyiceberg.catalog import CatalogType
+
+from deltacat.experimental.converter_agent.table_monitor import submit_table_monitor_job
+from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+import deltacat.logs as logs
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+# Store original functions before monkey-patching
+_original_write = beam.managed.Write
+
+
+# Create a dictionary of Java catalog impl to CatalogType
+JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE = {
+    "org.apache.iceberg.rest.restcatalog": CatalogType.REST,
+    "org.apache.iceberg.hive.hivecatalog": CatalogType.HIVE,
+    "org.apache.iceberg.aws.glue.gluecatalog": CatalogType.GLUE,
+    "org.apache.iceberg.jdbc.jdbccatalog": CatalogType.SQL,
+}
+
+
+def _extract_catalog_config_from_beam(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract catalog configuration from Beam config."""
+    catalog_properties = config.get("catalog_properties", {})
+
+    # Extract catalog implementation class
+    catalog_impl = catalog_properties.get("catalog-impl")
+
+    # Extract catalog type
+    catalog_type = catalog_properties.get("type")
+
+    # Extract other relevant properties
+    warehouse = catalog_properties.get("warehouse", "")
+    uri = catalog_properties.get("uri", "")
+
+    return {
+        "catalog_impl": catalog_impl,
+        "type": catalog_type,
+        "warehouse": warehouse,
+        "uri": uri,
+        "catalog_properties": catalog_properties,
+    }
+
+
+def write(*args, **kwargs):
+    """Wrapper over beam.managed.Write that automatically creates a DeltaCAT table monitor & converter job."""
+    logger.debug(f"Starting DeltaCAT write operation")
+    logger.debug(f"args: {args}")
+    logger.debug(f"kwargs keys: {list(kwargs.keys()) if kwargs else 'None'}")
+
+    # Extract and pop deltacat-specific config keys
+    config = kwargs.get("config", {}).copy() if kwargs.get("config") else {}
+
+    # Extract DeltaCAT converter properties from parent config or individual keys (for backward compatibility)
+    deltacat_converter_properties = config.pop("deltacat_converter_properties", {})
+
+    # Support both new nested structure and old flat structure for backward compatibility
+    deltacat_converter_interval = deltacat_converter_properties.get(
+        "deltacat_converter_interval", 3.0
+    )
+
+    merge_keys = deltacat_converter_properties.get("merge_keys")
+
+    # Extract filesystem parameter (optional) - can be in converter properties or top-level config
+    filesystem = deltacat_converter_properties.get("filesystem", None)
+
+    # Extract cluster configuration file path (for remote jobs)
+    cluster_cfg_file_path = deltacat_converter_properties.get(
+        "cluster_cfg_file_path", None
+    )
+
+    # Extract max converter parallelism
+    max_converter_parallelism = deltacat_converter_properties.get(
+        "max_converter_parallelism",
+        DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
+    )
+
+    # Extract ray inactivity timeout
+    ray_inactivity_timeout = deltacat_converter_properties.get(
+        "ray_inactivity_timeout", 10
+    )
+
+    # Extract table identifier and warehouse path
+    table_identifier = config.get("table")
+    if not table_identifier:
+        raise ValueError("Table is required")
+
+    if table_identifier and "." in table_identifier:
+        namespace, table_name = table_identifier.split(".", 1)
+    else:
+        namespace = "default"
+        table_name = table_identifier
+
+    warehouse_path = config.get("catalog_properties", {}).get("warehouse", "")
+
+    # Extract catalog configuration for monitoring
+    beam_catalog_config = _extract_catalog_config_from_beam(config)
+
+    # Derive CatalogType from "catalog_impl" or "type" property
+    catalog_impl = beam_catalog_config.get("catalog_impl")
+    if catalog_impl:
+        catalog_type = JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE.get(catalog_impl.lower())
+        if not catalog_type:
+            raise ValueError(f"Unsupported catalog implementation: {catalog_impl}")
+    else:
+        catalog_type_str = beam_catalog_config.get("type")
+        if catalog_type_str:
+            catalog_type = CatalogType(catalog_type_str.lower())
+        else:
+            raise ValueError(
+                f"No catalog implementation or type found in config: {beam_catalog_config}"
+            )
+
+    # Update kwargs with the modified config
+    if "config" in kwargs:
+        kwargs["config"] = config
+
+    logger.debug(f"Preparing to submit table monitor job...")
+    logger.debug(f"table_name: {table_name}")
+    logger.debug(f"deltacat_converter_interval: {deltacat_converter_interval}s")
+    logger.debug(f"merge_keys: {merge_keys}")
+    logger.debug(f"warehouse_path: {warehouse_path}")
+    logger.debug(
+        f"filesystem: {type(filesystem).__name__ if filesystem else 'None (auto-resolve)'}"
+    )
+    logger.debug(f"cluster_cfg_file_path: {cluster_cfg_file_path or 'None (local)'}")
+    logger.debug(f"max_converter_parallelism: {max_converter_parallelism}")
+    logger.debug(f"ray_inactivity_timeout: {ray_inactivity_timeout}s")
+    logger.debug(
+        f"using deltacat_converter_properties: {len(deltacat_converter_properties) > 0}"
+    )
+    logger.debug(f"catalog_type: {catalog_type}")
+
+    # Submit monitoring job
+    try:
+        submit_table_monitor_job(
+            warehouse_path=warehouse_path,
+            catalog_type=catalog_type,
+            catalog_uri=beam_catalog_config.get("uri"),
+            namespace=namespace,
+            table_name=table_name,
+            merge_keys=merge_keys,
+            monitor_interval=deltacat_converter_interval,
+            filesystem=filesystem,
+            cluster_cfg_file_path=cluster_cfg_file_path,
+            max_converter_parallelism=max_converter_parallelism,
+            ray_inactivity_timeout=ray_inactivity_timeout,
+        )
+    except Exception as e:
+        # Don't fail the write operation, just log the error
+        logger.error(f"Failed to submit table monitor job: {e}")
+        logger.error(f"Exception traceback:", exc_info=True)
+    logger.info(f"Delegating to beam.managed.Write")
+    return _original_write(*args, **kwargs)
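
For context, a minimal usage sketch of the write() wrapper above. This is not part of the diff: the pipeline, element, table name, warehouse path, and catalog URI are illustrative assumptions, and the standard beam.managed.ICEBERG sink identifier is assumed; only the config keys mirror what write() actually parses.

# Hypothetical example (not included in the package): drive the write() wrapper
# from a Beam pipeline so a DeltaCAT table monitor job is submitted alongside
# the managed Iceberg write.
import apache_beam as beam
from deltacat.experimental.converter_agent.beam import managed

config = {
    # Parsed by write() into namespace and table_name (split on the first ".").
    "table": "default.demo_table",
    # Mapped to a pyiceberg CatalogType via the "type" (or "catalog-impl") key.
    "catalog_properties": {
        "type": "rest",
        "uri": "http://localhost:8181",
        "warehouse": "/tmp/iceberg-warehouse",
    },
    # Popped from the config before delegating to beam.managed.Write.
    "deltacat_converter_properties": {
        "merge_keys": ["id"],
        "deltacat_converter_interval": 3.0,
        "max_converter_parallelism": 2,
        "ray_inactivity_timeout": 10,
    },
}

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([beam.Row(id=1, value="a")])
        | managed.write(beam.managed.ICEBERG, config=config)
    )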
--- /dev/null
+++ b/deltacat/experimental/converter_agent/table_monitor.py
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""
+DeltaCAT Table Monitor Job. Automatically runs data converter sessions in response to table updates.
+"""
+
+import argparse
+import hashlib
+import json
+import logging
+import os
+import time
+from typing import List, Optional
+
+import pyarrow.fs as pafs
+import ray
+import deltacat
+
+from pyiceberg.catalog import load_catalog, CatalogType
+from pyiceberg.exceptions import NoSuchTableError
+
+from deltacat import job_client, local_job_client
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from deltacat.compute.jobs.client import DeltaCatJobClient
+from deltacat.utils.filesystem import (
+    resolve_path_and_filesystem,
+    FilesystemType,
+)
+import deltacat.logs as logs
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def monitor_table(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: List[str],
+    filesystem_type: FilesystemType = FilesystemType.LOCAL,
+    monitor_interval: float = 5.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Monitor an Iceberg table for changes and run converter sessions when needed."""
+
+    logger.info(
+        f"Starting table monitor. Namespace: '{namespace}', Table: '{table_name}', "
+        f"Warehouse: '{warehouse_path}', Catalog type: '{catalog_type}', "
+        f"Catalog URI: '{catalog_uri or 'None'}', Merge keys: '{merge_keys}', "
+        f"Filesystem type: '{filesystem_type}', Monitor interval: '{monitor_interval}s', "
+        f"Max converter parallelism: '{max_converter_parallelism}', "
+        f"Ray inactivity timeout: '{ray_inactivity_timeout}s'"
+    )
+
+    # Create PyIceberg catalog
+    catalog = load_catalog(
+        "monitor_catalog",
+        type=catalog_type,
+        warehouse=warehouse_path,
+        uri=catalog_uri or None,
+    )
+
+    # Set up filesystem
+    filesystem = FilesystemType.to_filesystem(filesystem_type)
+    if filesystem_type == FilesystemType.UNKNOWN:
+        normalized_warehouse_path, filesystem = resolve_path_and_filesystem(
+            warehouse_path
+        )
+        warehouse_path = normalized_warehouse_path
+
+    logger.info(f"Resolved filesystem: {type(filesystem).__name__}")
+    logger.info(f"Normalized warehouse path: {warehouse_path}")
+
+    # Parse table identifier
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+    table_identifier = f"{namespace}.{table_name}"
+    logger.info(f" - Parsed table - namespace: '{namespace}', table: '{table_name}'")
+
+    last_snapshot_id = None
+    start_time = time.time()
+    last_write_time = start_time  # Track last time we saw table activity
+
+    while True:
+        # Sleep before starting the first iteration and all subsequent iterations
+        logger.debug(f"Sleeping for {monitor_interval}s before next check...")
+        time.sleep(monitor_interval)
+
+        logger.info(f"Checking table {table_identifier} for updates...")
+
+        # Try to load the table
+        try:
+            tbl = catalog.load_table(table_identifier)
+            current_snapshot_id = tbl.metadata.current_snapshot_id
+            if last_snapshot_id != current_snapshot_id:
+                logger.info(
+                    f"New table version detected - snapshot ID: {current_snapshot_id}"
+                )
+                logger.info(f"Table has {len(tbl.metadata.snapshots)} snapshots")
+                logger.info(f"Table format version: {tbl.metadata.format_version}")
+
+                # Update last activity time when we detect table changes
+                last_write_time = time.time()
+
+                # Always run deduplication when there are snapshots (duplicates can exist within a single snapshot)
+                logger.info(
+                    f"Table has data - triggering converter session to resolve any duplicates..."
+                )
+
+                # Run converter session
+                try:
+                    converter_params = ConverterSessionParams.of(
+                        {
+                            "catalog": catalog,
+                            "iceberg_namespace": namespace,
+                            "iceberg_table_name": table_name,
+                            "iceberg_warehouse_bucket_name": warehouse_path,
+                            "merge_keys": merge_keys,
+                            "enforce_primary_key_uniqueness": True,
+                            "task_max_parallelism": max_converter_parallelism,
+                            "filesystem": filesystem,
+                            "location_provider_prefix_override": None,
+                        }
+                    )
+
+                    logger.debug(f"Converter Session Parameters: {converter_params}")
+
+                    logger.info(f"Starting converter session...")
+                    updated_metadata = converter_session(params=converter_params)
+                    logger.info(f"Converter session completed successfully")
+                    current_snapshot_id = updated_metadata.current_snapshot_id
+                    logger.info(
+                        f"Current snapshot ID updated to: {current_snapshot_id}"
+                    )
+                except Exception as e:
+                    logger.error(f"Converter session failed: {e}")
+                    logger.error(f"Exception traceback:", exc_info=True)
+                last_snapshot_id = current_snapshot_id
+            else:
+                logger.debug(
+                    f"No table changes detected (snapshot ID: {current_snapshot_id})"
+                )
+        except NoSuchTableError:
+            logger.info(f"Table {table_identifier} does not exist yet - waiting...")
+        except Exception as e:
+            logger.error(f"Error in table monitor: {e}")
+
+        # Check for Ray inactivity timeout
+        current_time = time.time()
+        inactivity_duration = current_time - last_write_time
+
+        if inactivity_duration >= ray_inactivity_timeout:
+            logger.info(
+                f"Ray inactivity timeout reached ({inactivity_duration:.1f}s >= {ray_inactivity_timeout}s)"
+            )
+            logger.info(
+                f"No table activity detected for {inactivity_duration:.1f} seconds, shutting down Ray..."
+            )
+
+            try:
+                if ray.is_initialized():
+                    ray.shutdown()
+                    logger.info("Ray shutdown successfully due to inactivity")
+                else:
+                    logger.info("Ray was not initialized, nothing to shut down")
+            except Exception as e:
+                logger.error(f"Error shutting down Ray: {e}")
+
+            logger.info(f"Table monitor stopping due to inactivity timeout")
+            break
+
+    logger.info(f"Table monitor completed")
+
+
+def _generate_job_name(warehouse_path: str, namespace: str, table_name: str) -> str:
+    """
+    Generate a unique job name based on warehouse path, namespace, and table name.
+
+    Args:
+        warehouse_path: Warehouse path
+        namespace: Table namespace
+        table_name: Table name
+
+    Returns:
+        Job name string.
+    """
+    # Create a sha1 digest of the warehouse path, namespace, and table name
+    digest = hashlib.sha1(
+        f"{warehouse_path}-{namespace}-{table_name}".encode()
+    ).hexdigest()
+    job_name = f"deltacat-monitor-{digest}"
+
+    return job_name
+
+
+def _cleanup_terminated_jobs_for_submission_id(
+    client: DeltaCatJobClient, submission_id: str
+) -> bool:
+    """Clean up any terminated jobs with the given submission ID."""
+    logger.debug(
+        f"Searching for terminated jobs to cleanup with submission ID: {submission_id}"
+    )
+    try:
+        all_jobs = client.list_jobs()
+        logger.debug(f"All jobs: {all_jobs}")
+        for job in all_jobs:
+            if job.submission_id == submission_id and job.status.is_terminal():
+                logger.info(
+                    f"Cleaning up terminated job: {submission_id} (status: {job.status})"
+                )
+                client.delete_job(submission_id)
+                return True
+    except Exception as e:
+        logger.warning(f"Cleanup failed for job '{submission_id}': {e}")
+    return False
+
+
+def submit_table_monitor_job(
+    warehouse_path: str,
+    catalog_type: CatalogType,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: list,
+    monitor_interval: float,
+    max_converter_parallelism: int,
+    filesystem: pafs.FileSystem = None,
+    cluster_cfg_file_path: Optional[str] = None,
+    ray_inactivity_timeout: int = 10,
+) -> str:
+    """
+    Submit a table monitor job to Ray cluster.
+
+    Args:
+        warehouse_path: Warehouse path
+        catalog_type: Catalog type
+        catalog_uri: Catalog URI
+        namespace: Table namespace
+        table_name: Table name to monitor
+        merge_keys: List of merge key column names
+        monitor_interval: Seconds between monitoring checks
+        max_converter_parallelism: Maximum number of concurrent converter tasks
+        filesystem: PyArrow filesystem instance
+        cluster_cfg_file_path: Path to cluster config file (None for local)
+        ray_inactivity_timeout: Seconds to wait before shutting down Ray cluster
+    Returns:
+        Job ID of the submitted job
+    """
+
+    # Parse table identifier to extract namespace and table name
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+
+    # Generate unique job ID based on the warehouse and table path
+    job_name = _generate_job_name(
+        warehouse_path=warehouse_path, namespace=namespace, table_name=table_name
+    )
+
+    # Resolve the appropriate local or remote job client
+    if cluster_cfg_file_path:
+        # Submit to remote cluster
+        logger.info(
+            f"Preparing to submit job to remote cluster: {cluster_cfg_file_path}"
+        )
+        # Set the cluster name to the job ID to prevent starting multiple Ray clusters monitoring the same table.
+        client = job_client(cluster_cfg_file_path, cluster_name_override=job_name)
+    else:
+        # Submit to local cluster using DeltaCAT local job client
+        ray_init_args = {
+            "local_mode": True,
+            "resources": {"convert_task": max_converter_parallelism},
+        }
+        logger.info(
+            f"Preparing to submit job locally with ray init args: {ray_init_args}"
+        )
+        client = local_job_client(ray_init_args=ray_init_args)
+
+    # Add filesystem type - determine from filesystem instance
+    filesystem_type = FilesystemType.from_filesystem(filesystem)
+
+    # Build CLI arguments for table_monitor job
+    table_monitor_script_dir = os.path.dirname(os.path.abspath(__file__))
+    table_monitor_script_path = os.path.join(
+        table_monitor_script_dir, "table_monitor.py"
+    )
+
+    logger.debug(f"Table monitor script path: {table_monitor_script_path}")
+    logger.debug(
+        f"Table monitor script exists: {os.path.exists(table_monitor_script_path)}"
+    )
+
+    cmd_args = [
+        f"python {table_monitor_script_path}",
+        f"--catalog-type '{catalog_type.value}'",
+        f"--warehouse-path '{warehouse_path}'",
+        f"--catalog-uri '{catalog_uri}'",
+        f"--namespace '{namespace}'",
+        f"--table-name '{table_name}'",
+        f"--merge-keys '{json.dumps(merge_keys)}'",
+        f"--monitor-interval {monitor_interval}",
+        f"--max-converter-parallelism {max_converter_parallelism}",
+        f"--ray-inactivity-timeout {ray_inactivity_timeout}",
+        f"--filesystem-type '{filesystem_type}'",
+    ]
+
+    # Join all arguments
+    entrypoint = " ".join(cmd_args)
+    logger.debug(
+        f"Submitting table monitor job '{job_name}' with entrypoint: {entrypoint}"
+    )
+
+    # Clean up any terminated jobs with the same submission ID to allow reuse
+    _cleanup_terminated_jobs_for_submission_id(client, job_name)
+
+    # Submit the job with the correct working directory
+    # Working directory should be the converter_agent directory where table_monitor.py is located
+    job_submission_id = client.submit_job(
+        submission_id=job_name,
+        entrypoint=entrypoint,
+        runtime_env={"working_dir": table_monitor_script_dir},
+    )
+
+    logger.info(f"Table monitor job submitted successfully: {job_submission_id}")
+
+    return job_submission_id
+
+
+def run(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: str,
+    filesystem_type: str = "local",
+    monitor_interval: float = 1.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Run table monitor with the given parameters."""
+
+    # Parse merge keys
+    merge_keys_list = json.loads(merge_keys)
+
+    # Run the monitor
+    monitor_table(
+        catalog_type=catalog_type,
+        warehouse_path=warehouse_path,
+        catalog_uri=catalog_uri,
+        namespace=namespace,
+        table_name=table_name,
+        merge_keys=merge_keys_list,
+        filesystem_type=filesystem_type,
+        monitor_interval=monitor_interval,
+        max_converter_parallelism=max_converter_parallelism,
+        ray_inactivity_timeout=ray_inactivity_timeout,
+    )
+
+
+if __name__ == "__main__":
+    """
+    DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions
+
+    Example usage:
+    $ python table_monitor.py \
+    $   --catalog-type 'rest' \
+    $   --warehouse-path '/tmp/iceberg-warehouse' \
+    $   --catalog-uri 'http://localhost:8181' \
+    $   --namespace 'default' \
+    $   --table-name 'demo_table' \
+    $   --merge-keys '["id"]' \
+    $   --monitor-interval 1.0 \
+    $   --max-converter-parallelism 2 \
+    $   --ray-inactivity-timeout 300.0
+    """
+
+    script_args = [
+        (
+            ["--catalog-type"],
+            {
+                "help": "Catalog type name (rest, hive, sql)",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--warehouse-path"],
+            {
+                "help": "Warehouse path",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--catalog-uri"],
+            {
+                "help": "Catalog URI",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--namespace"],
+            {
+                "help": "Table namespace",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--table-name"],
+            {
+                "help": "Table name to monitor",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--merge-keys"],
+            {
+                "help": "Comma-separated merge key column names",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--filesystem-type"],
+            {
+                "help": "Filesystem type",
+                "type": str,
+                "default": "local",
+            },
+        ),
+        (
+            ["--monitor-interval"],
+            {
+                "help": "Seconds between monitoring checks",
+                "type": float,
+                "default": 5.0,
+            },
+        ),
+        (
+            ["--max-converter-parallelism"],
+            {
+                "help": "Maximum number of concurrent converter tasks",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            ["--ray-inactivity-timeout"],
+            {
+                "help": "Ray inactivity timeout in seconds (Ray will shutdown if no activity)",
+                "type": int,
+                "default": 300,
+            },
+        ),
+    ]
+
+    # Parse CLI input arguments
+    parser = argparse.ArgumentParser(
+        description="DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions"
+    )
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"[TABLE MONITOR] Command Line Arguments: {args}")
+
+    # Initialize DeltaCAT
+    deltacat.init()
+
+    # Run the table monitor using the parsed arguments
+    run(**vars(args))
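
The monitor can also be launched without the Beam wrapper by calling submit_table_monitor_job() directly. A brief sketch, not taken from the diff: the warehouse path, catalog URI, table, and merge key below are illustrative placeholders, while the keyword arguments mirror the signature defined above.

# Hypothetical example (not included in the package): submit a monitor job for a
# REST-catalog Iceberg table to a local Ray cluster. All literal values are
# placeholders.
from pyiceberg.catalog import CatalogType
from deltacat.experimental.converter_agent.table_monitor import (
    submit_table_monitor_job,
)

job_id = submit_table_monitor_job(
    warehouse_path="/tmp/iceberg-warehouse",
    catalog_type=CatalogType.REST,
    catalog_uri="http://localhost:8181",
    namespace="default",
    table_name="demo_table",
    merge_keys=["id"],
    monitor_interval=5.0,
    max_converter_parallelism=2,
    cluster_cfg_file_path=None,  # None submits to a local Ray cluster
    ray_inactivity_timeout=300,
)
print(f"Submitted table monitor job: {job_id}")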