deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/experimental/converter_agent/table_monitor.py
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""
+DeltaCAT Table Monitor Job. Automatically runs data converter sessions in response to table updates.
+"""
+
+import argparse
+import hashlib
+import json
+import logging
+import os
+import time
+from typing import List, Optional
+
+import pyarrow.fs as pafs
+import ray
+import deltacat
+
+from pyiceberg.catalog import load_catalog, CatalogType
+from pyiceberg.exceptions import NoSuchTableError
+
+from deltacat import job_client, local_job_client
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from deltacat.compute.jobs.client import DeltaCatJobClient
+from deltacat.utils.filesystem import (
+    resolve_path_and_filesystem,
+    FilesystemType,
+)
+import deltacat.logs as logs
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def monitor_table(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: List[str],
+    filesystem_type: FilesystemType = FilesystemType.LOCAL,
+    monitor_interval: float = 5.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Monitor an Iceberg table for changes and run converter sessions when needed."""
+
+    logger.info(
+        f"Starting table monitor. Namespace: '{namespace}', Table: '{table_name}', "
+        f"Warehouse: '{warehouse_path}', Catalog type: '{catalog_type}', "
+        f"Catalog URI: '{catalog_uri or 'None'}', Merge keys: '{merge_keys}', "
+        f"Filesystem type: '{filesystem_type}', Monitor interval: '{monitor_interval}s', "
+        f"Max converter parallelism: '{max_converter_parallelism}', "
+        f"Ray inactivity timeout: '{ray_inactivity_timeout}s'"
+    )
+
+    # Create PyIceberg catalog
+    catalog = load_catalog(
+        "monitor_catalog",
+        type=catalog_type,
+        warehouse=warehouse_path,
+        uri=catalog_uri or None,
+    )
+
+    # Set up filesystem
+    filesystem = FilesystemType.to_filesystem(filesystem_type)
+    if filesystem_type == FilesystemType.UNKNOWN:
+        normalized_warehouse_path, filesystem = resolve_path_and_filesystem(
+            warehouse_path
+        )
+        warehouse_path = normalized_warehouse_path
+
+    logger.info(f"Resolved filesystem: {type(filesystem).__name__}")
+    logger.info(f"Normalized warehouse path: {warehouse_path}")
+
+    # Parse table identifier
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+    table_identifier = f"{namespace}.{table_name}"
+    logger.info(f"Parsed table - namespace: '{namespace}', table: '{table_name}'")
+
+    last_snapshot_id = None
+    start_time = time.time()
+    last_write_time = start_time  # Track last time we saw table activity
+
+    while True:
+        # Sleep before starting the first iteration and all subsequent iterations
+        logger.debug(f"Sleeping for {monitor_interval}s before next check...")
+        time.sleep(monitor_interval)
+
+        logger.info(f"Checking table {table_identifier} for updates...")
+
+        # Try to load the table
+        try:
+            tbl = catalog.load_table(table_identifier)
+            current_snapshot_id = tbl.metadata.current_snapshot_id
+            if last_snapshot_id != current_snapshot_id:
+                logger.info(
+                    f"New table version detected - snapshot ID: {current_snapshot_id}"
+                )
+                logger.info(f"Table has {len(tbl.metadata.snapshots)} snapshots")
+                logger.info(f"Table format version: {tbl.metadata.format_version}")
+
+                # Update last activity time when we detect table changes
+                last_write_time = time.time()
+
+                # Always run deduplication when there are snapshots (duplicates can exist within a single snapshot)
+                logger.info(
+                    "Table has data - triggering converter session to resolve any duplicates..."
+                )
+
+                # Run converter session
+                try:
+                    converter_params = ConverterSessionParams.of(
+                        {
+                            "catalog": catalog,
+                            "iceberg_namespace": namespace,
+                            "iceberg_table_name": table_name,
+                            "iceberg_warehouse_bucket_name": warehouse_path,
+                            "merge_keys": merge_keys,
+                            "enforce_primary_key_uniqueness": True,
+                            "task_max_parallelism": max_converter_parallelism,
+                            "filesystem": filesystem,
+                            "location_provider_prefix_override": None,
+                        }
+                    )
+
+                    logger.debug(f"Converter Session Parameters: {converter_params}")
+
+                    logger.info("Starting converter session...")
+                    updated_metadata = converter_session(params=converter_params)
+                    logger.info("Converter session completed successfully")
+                    current_snapshot_id = updated_metadata.current_snapshot_id
+                    logger.info(
+                        f"Current snapshot ID updated to: {current_snapshot_id}"
+                    )
+                except Exception as e:
+                    logger.error(f"Converter session failed: {e}")
+                    logger.error("Exception traceback:", exc_info=True)
+                last_snapshot_id = current_snapshot_id
+            else:
+                logger.debug(
+                    f"No table changes detected (snapshot ID: {current_snapshot_id})"
+                )
+        except NoSuchTableError:
+            logger.info(f"Table {table_identifier} does not exist yet - waiting...")
+        except Exception as e:
+            logger.error(f"Error in table monitor: {e}")
+
+        # Check for Ray inactivity timeout
+        current_time = time.time()
+        inactivity_duration = current_time - last_write_time
+
+        if inactivity_duration >= ray_inactivity_timeout:
+            logger.info(
+                f"Ray inactivity timeout reached ({inactivity_duration:.1f}s >= {ray_inactivity_timeout}s)"
+            )
+            logger.info(
+                f"No table activity detected for {inactivity_duration:.1f} seconds, shutting down Ray..."
+            )
+
+            try:
+                if ray.is_initialized():
+                    ray.shutdown()
+                    logger.info("Ray shutdown successfully due to inactivity")
+                else:
+                    logger.info("Ray was not initialized, nothing to shut down")
+            except Exception as e:
+                logger.error(f"Error shutting down Ray: {e}")
+
+            logger.info("Table monitor stopping due to inactivity timeout")
+            break
+
+    logger.info("Table monitor completed")
+
+
+def _generate_job_name(warehouse_path: str, namespace: str, table_name: str) -> str:
+    """
+    Generate a unique job name based on warehouse path, namespace, and table name.
+
+    Args:
+        warehouse_path: Warehouse path
+        namespace: Table namespace
+        table_name: Table name
+
+    Returns:
+        Job name string.
+    """
+    # Create a sha1 digest of the warehouse path, namespace, and table name
+    digest = hashlib.sha1(
+        f"{warehouse_path}-{namespace}-{table_name}".encode()
+    ).hexdigest()
+    job_name = f"deltacat-monitor-{digest}"
+
+    return job_name
+
+
+def _cleanup_terminated_jobs_for_submission_id(
+    client: DeltaCatJobClient, submission_id: str
+) -> bool:
+    """Clean up any terminated jobs with the given submission ID."""
+    logger.debug(
+        f"Searching for terminated jobs to cleanup with submission ID: {submission_id}"
+    )
+    try:
+        all_jobs = client.list_jobs()
+        logger.debug(f"All jobs: {all_jobs}")
+        for job in all_jobs:
+            if job.submission_id == submission_id and job.status.is_terminal():
+                logger.info(
+                    f"Cleaning up terminated job: {submission_id} (status: {job.status})"
+                )
+                client.delete_job(submission_id)
+                return True
+    except Exception as e:
+        logger.warning(f"Cleanup failed for job '{submission_id}': {e}")
+    return False
+
+
+def submit_table_monitor_job(
+    warehouse_path: str,
+    catalog_type: CatalogType,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: List[str],
+    monitor_interval: float,
+    max_converter_parallelism: int,
+    filesystem: Optional[pafs.FileSystem] = None,
+    cluster_cfg_file_path: Optional[str] = None,
+    ray_inactivity_timeout: int = 10,
+) -> str:
+    """
+    Submit a table monitor job to a Ray cluster.
+
+    Args:
+        warehouse_path: Warehouse path
+        catalog_type: Catalog type
+        catalog_uri: Catalog URI
+        namespace: Table namespace
+        table_name: Table name to monitor
+        merge_keys: List of merge key column names
+        monitor_interval: Seconds between monitoring checks
+        max_converter_parallelism: Maximum number of concurrent converter tasks
+        filesystem: PyArrow filesystem instance
+        cluster_cfg_file_path: Path to cluster config file (None for local)
+        ray_inactivity_timeout: Seconds to wait before shutting down Ray cluster
+    Returns:
+        Job ID of the submitted job
+    """
+
+    # Default the namespace if none was provided
+    if not namespace:
+        namespace = DEFAULT_NAMESPACE
+
+    # Generate a unique job name based on the warehouse and table path
+    job_name = _generate_job_name(
+        warehouse_path=warehouse_path, namespace=namespace, table_name=table_name
+    )
+
+    # Resolve the appropriate local or remote job client
+    if cluster_cfg_file_path:
+        # Submit to remote cluster
+        logger.info(
+            f"Preparing to submit job to remote cluster: {cluster_cfg_file_path}"
+        )
+        # Set the cluster name to the job name to prevent starting multiple Ray clusters monitoring the same table.
+        client = job_client(cluster_cfg_file_path, cluster_name_override=job_name)
+    else:
+        # Submit to local cluster using DeltaCAT local job client
+        ray_init_args = {
+            "local_mode": True,
+            "resources": {"convert_task": max_converter_parallelism},
+        }
+        logger.info(
+            f"Preparing to submit job locally with ray init args: {ray_init_args}"
+        )
+        client = local_job_client(ray_init_args=ray_init_args)
+
+    # Determine the filesystem type from the filesystem instance
+    filesystem_type = FilesystemType.from_filesystem(filesystem)
+
+    # Build CLI arguments for the table_monitor job
+    table_monitor_script_dir = os.path.dirname(os.path.abspath(__file__))
+    table_monitor_script_path = os.path.join(
+        table_monitor_script_dir, "table_monitor.py"
+    )
+
+    logger.debug(f"Table monitor script path: {table_monitor_script_path}")
+    logger.debug(
+        f"Table monitor script exists: {os.path.exists(table_monitor_script_path)}"
+    )
+
+    cmd_args = [
+        f"python {table_monitor_script_path}",
+        f"--catalog-type '{catalog_type.value}'",
+        f"--warehouse-path '{warehouse_path}'",
+        f"--catalog-uri '{catalog_uri}'",
+        f"--namespace '{namespace}'",
+        f"--table-name '{table_name}'",
+        f"--merge-keys '{json.dumps(merge_keys)}'",
+        f"--monitor-interval {monitor_interval}",
+        f"--max-converter-parallelism {max_converter_parallelism}",
+        f"--ray-inactivity-timeout {ray_inactivity_timeout}",
+        f"--filesystem-type '{filesystem_type}'",
+    ]
+
+    # Join all arguments
+    entrypoint = " ".join(cmd_args)
+    logger.debug(
+        f"Submitting table monitor job '{job_name}' with entrypoint: {entrypoint}"
+    )
+
+    # Clean up any terminated jobs with the same submission ID to allow reuse
+    _cleanup_terminated_jobs_for_submission_id(client, job_name)
+
+    # Submit the job with the correct working directory
+    # Working directory should be the converter_agent directory where table_monitor.py is located
+    job_submission_id = client.submit_job(
+        submission_id=job_name,
+        entrypoint=entrypoint,
+        runtime_env={"working_dir": table_monitor_script_dir},
+    )
+
+    logger.info(f"Table monitor job submitted successfully: {job_submission_id}")
+
+    return job_submission_id
+
+
+def run(
+    catalog_type: str,
+    warehouse_path: str,
+    catalog_uri: Optional[str],
+    namespace: str,
+    table_name: str,
+    merge_keys: str,
+    filesystem_type: str = "local",
+    monitor_interval: float = 1.0,
+    max_converter_parallelism: int = 1,
+    ray_inactivity_timeout: int = 10,
+) -> None:
+    """Run the table monitor with the given parameters."""
+
+    # Parse merge keys from the JSON CLI argument
+    merge_keys_list = json.loads(merge_keys)
+
+    # Run the monitor
+    monitor_table(
+        catalog_type=catalog_type,
+        warehouse_path=warehouse_path,
+        catalog_uri=catalog_uri,
+        namespace=namespace,
+        table_name=table_name,
+        merge_keys=merge_keys_list,
+        filesystem_type=filesystem_type,
+        monitor_interval=monitor_interval,
+        max_converter_parallelism=max_converter_parallelism,
+        ray_inactivity_timeout=ray_inactivity_timeout,
+    )
+
+
+if __name__ == "__main__":
+    """
+    DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions
+
+    Example usage:
+    $ python table_monitor.py \
+        --catalog-type 'rest' \
+        --warehouse-path '/tmp/iceberg-warehouse' \
+        --catalog-uri 'http://localhost:8181' \
+        --namespace 'default' \
+        --table-name 'demo_table' \
+        --merge-keys '["id"]' \
+        --monitor-interval 1.0 \
+        --max-converter-parallelism 2 \
+        --ray-inactivity-timeout 300
+    """
+
+    script_args = [
+        (
+            ["--catalog-type"],
+            {
+                "help": "Catalog type name (rest, hive, sql)",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--warehouse-path"],
+            {
+                "help": "Warehouse path",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--catalog-uri"],
+            {
+                "help": "Catalog URI",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--namespace"],
+            {
+                "help": "Table namespace",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--table-name"],
+            {
+                "help": "Table name to monitor",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--merge-keys"],
+            {
+                "help": "JSON list of merge key column names",
+                "type": str,
+                "required": True,
+            },
+        ),
+        (
+            ["--filesystem-type"],
+            {
+                "help": "Filesystem type",
+                "type": str,
+                "default": "local",
+            },
+        ),
+        (
+            ["--monitor-interval"],
+            {
+                "help": "Seconds between monitoring checks",
+                "type": float,
+                "default": 5.0,
+            },
+        ),
+        (
+            ["--max-converter-parallelism"],
+            {
+                "help": "Maximum number of concurrent converter tasks",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            ["--ray-inactivity-timeout"],
+            {
+                "help": "Ray inactivity timeout in seconds (Ray will shut down if no activity)",
+                "type": int,
+                "default": 300,
+            },
+        ),
+    ]
+
+    # Parse CLI input arguments
+    parser = argparse.ArgumentParser(
+        description="DeltaCAT Table Monitor - Monitor Iceberg tables and run converter sessions"
+    )
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"[TABLE MONITOR] Command Line Arguments: {args}")
+
+    # Initialize DeltaCAT
+    deltacat.init()
+
+    # Run the table monitor using the parsed arguments
+    run(**vars(args))
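The monitor above is normally launched through `submit_table_monitor_job`. A minimal sketch of a local submission follows; the warehouse path, catalog backend, URI, and merge keys are illustrative assumptions, not values shipped in the package:

```python
# Hypothetical invocation of submit_table_monitor_job (defined above) against a
# local SQL catalog; all paths, URIs, and key names below are illustrative.
from pyiceberg.catalog import CatalogType

from deltacat.experimental.converter_agent.table_monitor import (
    submit_table_monitor_job,
)

job_id = submit_table_monitor_job(
    warehouse_path="/tmp/iceberg-warehouse",  # assumed local warehouse
    catalog_type=CatalogType.SQL,             # assumed catalog backend
    catalog_uri="sqlite:////tmp/catalog.db",  # assumed catalog URI
    namespace="default",
    table_name="demo_table",
    merge_keys=["id"],                        # assumed merge key column
    monitor_interval=5.0,
    max_converter_parallelism=1,
    cluster_cfg_file_path=None,               # None submits to a local cluster
)
print(f"submitted table monitor job: {job_id}")
```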
deltacat/experimental/storage/iceberg/iceberg_scan_planner.py
@@ -0,0 +1,129 @@
+import logging
+from typing import Optional, Any, Set
+
+from pyiceberg.catalog import Catalog
+from pyiceberg.table import Table
+import deltacat.logs as logs
+
+from deltacat.storage.model.scan.push_down import Pushdown, PartitionFilter
+from deltacat.storage.model.scan.scan_plan import ScanPlan
+from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
+from deltacat.storage.util.scan_planner import ScanPlanner
+from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
+from deltacat.experimental.storage.iceberg.visitor import IcebergExpressionVisitor
+
+# Initialize DeltaCAT logger
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class IcebergScanPlanner(ScanPlanner):
+    def __init__(self, catalog: Catalog):
+        self.catalog = catalog
+        self.expression_visitor = IcebergExpressionVisitor()
+
+    @classmethod
+    def _collect_filter_fields(cls, expr: Any) -> Set[str]:
+        """
+        Collects all field names referenced in the filter expression.
+
+        Args:
+            expr: The expression to analyze
+
+        Returns:
+            Set of field names referenced in the expression
+        """
+        fields = set()
+        if hasattr(expr, "field"):
+            fields.add(expr.field)
+        if hasattr(expr, "left"):
+            fields.update(cls._collect_filter_fields(expr.left))
+        if hasattr(expr, "right"):
+            fields.update(cls._collect_filter_fields(expr.right))
+        if hasattr(expr, "expr"):
+            fields.update(cls._collect_filter_fields(expr.expr))
+        if hasattr(expr, "values"):
+            for value in expr.values:
+                fields.update(cls._collect_filter_fields(value))
+        return fields
+
+    def create_scan_plan(
+        self,
+        table_name: str,
+        namespace: Optional[str] = None,
+        pushdown: Optional[Pushdown] = None,
+    ) -> ScanPlan:
+        iceberg_table = _try_load_iceberg_table(
+            self.catalog, namespace=namespace, table_name=table_name
+        )
+
+        # TODO: implement row and column predicate pushdown to Iceberg
+
+        # Get the partition spec
+        partition_spec = iceberg_table.spec()
+
+        # Check if the table is partitioned
+        is_partitioned = len(partition_spec.fields) > 0
+
+        scan = iceberg_table.scan()
+        if is_partitioned:
+            if pushdown and pushdown.partition_filter:
+                filter_fields = self._collect_filter_fields(pushdown.partition_filter)
+                logger.info(
+                    f"Pushdown partition filter is enabled, converting to Iceberg. Fields discovered in filter: {', '.join(sorted(filter_fields))}"
+                )
+                # Handle the partition filter if present; DeltaCAT only supports partition-level filters right now
+                iceberg_expression = self._convert_partition_filter(
+                    iceberg_table, pushdown.partition_filter
+                )
+                scan = scan.filter(iceberg_expression)
+
+        file_scan_tasks = []
+        for scan_task in scan.plan_files():
+            file_scan_tasks.append(FileScanTask([DataFile(scan_task.file.file_path)]))
+        return ScanPlan(file_scan_tasks)
+
+    @classmethod
+    def _validate_partition_references(
+        cls, expr: Any, partition_cols: Set[str]
+    ) -> None:
+        """
+        Validates that the expression only references partition columns.
+
+        Args:
+            expr: The expression to validate
+            partition_cols: Set of valid partition column names
+
+        Raises:
+            ValueError: If the expression references a non-partition column
+        """
+        if hasattr(expr, "field"):  # Reference type expression
+            if expr.field not in partition_cols:
+                raise ValueError(
+                    f"Filter references non-partition column: {expr.field}. "
+                    f"Partition columns are: {partition_cols}"
+                )
+        # Recursively validate nested expressions
+        if hasattr(expr, "left"):
+            cls._validate_partition_references(expr.left, partition_cols)
+        if hasattr(expr, "right"):
+            cls._validate_partition_references(expr.right, partition_cols)
+        if hasattr(expr, "expr"):
+            cls._validate_partition_references(expr.expr, partition_cols)
+        if hasattr(expr, "values"):
+            for value in expr.values:
+                cls._validate_partition_references(value, partition_cols)
+
+    def _convert_partition_filter(
+        self, table: Table, partition_filter: PartitionFilter
+    ):
+        """
+        Convert a DeltaCAT partition filter to a PyIceberg expression,
+        validating that only partition columns are referenced.
+        """
+        partition_cols = set(field.name for field in table.spec().fields)
+
+        # Validate before converting
+        self._validate_partition_references(partition_filter, partition_cols)
+
+        # Convert to PyIceberg expression
+        return self.expression_visitor.visit(partition_filter)
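A short sketch of how the new scan planner is meant to be driven, assuming an already-configured PyIceberg catalog; the catalog name, URI, warehouse path, and table identifiers below are placeholders:

```python
# Hypothetical usage of IcebergScanPlanner (defined above); the catalog
# configuration is a placeholder and must point at a real Iceberg catalog.
from pyiceberg.catalog import load_catalog

from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
    IcebergScanPlanner,
)

catalog = load_catalog(
    "demo_catalog",
    type="sql",
    uri="sqlite:////tmp/catalog.db",     # assumed catalog URI
    warehouse="/tmp/iceberg-warehouse",  # assumed warehouse path
)
planner = IcebergScanPlanner(catalog)

# Plan a full-table scan; passing a Pushdown with a partition_filter instead
# would have create_scan_plan translate it into a PyIceberg scan filter.
scan_plan = planner.create_scan_plan(table_name="demo_table", namespace="default")
```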
deltacat/experimental/storage/iceberg/impl.py
@@ -32,7 +32,7 @@ from deltacat.storage import (
     NamespaceProperties,
 )
 from deltacat.storage.model.manifest import Manifest
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
@@ -41,7 +41,7 @@ from deltacat.storage.iceberg.model import (
     NamespaceMapper,
     TableMapper,
 )
-from deltacat.types.media import ContentType, StorageType, TableType
+from deltacat.types.media import ContentType, StorageType, DatasetType
 from deltacat.utils.common import ReadKwargsProvider
 
 from pyiceberg.catalog import Catalog
@@ -281,7 +281,7 @@ def get_latest_delta(
 
 def download_delta(
     delta_like: Union[Delta, DeltaLocator],
-    table_type: TableType = TableType.PYARROW,
+    table_type: DatasetType = DatasetType.PYARROW,
     storage_type: StorageType = StorageType.DISTRIBUTED,
     max_parallelism: Optional[int] = None,
     columns: Optional[List[str]] = None,
@@ -303,7 +303,7 @@
 def download_delta_manifest_entry(
     delta_like: Union[Delta, DeltaLocator],
     entry_index: int,
-    table_type: TableType = TableType.PYARROW,
+    table_type: DatasetType = DatasetType.PYARROW,
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     *args,
@@ -603,6 +603,8 @@ def stage_delta(
     properties: Optional[DeltaProperties] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
+    schema: Optional[Schema] = None,
+    sort_scheme_id: Optional[str] = None,
     *args,
     **kwargs,
 ) -> Delta:
deltacat/experimental/storage/iceberg/model.py
@@ -66,6 +66,7 @@ from deltacat.storage import (
     TableVersionLocator,
     Transform,
     TransformName,
+    TruncateStrategy,
     TruncateTransform,
     TruncateTransformParameters,
     UnknownTransform,
@@ -227,7 +228,10 @@ class TransformMapper(ModelMapper[IcebergTransform, Transform]):
         )
         if isinstance(obj, IcebergTruncateTransform):
             return TruncateTransform.of(
-                TruncateTransformParameters.of(width=obj.width),
+                TruncateTransformParameters.of(
+                    width=obj.width,
+                    truncate_strategy=TruncateStrategy.ICEBERG,
+                ),
             )
         return UnknownTransform.of()
 
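The hunk above threads the new `TruncateStrategy` through the Iceberg truncate-transform mapping. A small sketch of the object the mapper now produces, using only names imported in this diff (the width is illustrative):

```python
# Sketch: the mapping above turns an Iceberg truncate transform into a
# DeltaCAT TruncateTransform tagged with the Iceberg truncate strategy.
from deltacat.storage import (
    TruncateStrategy,
    TruncateTransform,
    TruncateTransformParameters,
)

transform = TruncateTransform.of(
    TruncateTransformParameters.of(
        width=10,  # illustrative width
        truncate_strategy=TruncateStrategy.ICEBERG,
    ),
)
```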
@@ -323,7 +327,7 @@ class PartitionSchemeMapper(ModelMapper[PartitionSpec, PartitionScheme]):
         elif not schema:
             err_msg = "Schema is required for Partition Spec conversion."
             raise ValueError(err_msg)
-        keys = [PartitionKeyMapper.map(field, schema) for field in obj.fields]
+        keys = [PartitionKeyMapper.map(field, schema) for field in obj.fields] or None
         return PartitionScheme.of(
             keys=keys,
             name=name,
@@ -425,7 +429,7 @@ class SortSchemeMapper(ModelMapper[IcebergSortOrder, SortScheme]):
         elif not schema:
             err_msg = "Schema is required for Sort Order conversion."
             raise ValueError(err_msg)
-        keys = [SortKeyMapper.map(field, schema) for field in obj.fields]
+        keys = [SortKeyMapper.map(field, schema) for field in obj.fields] or None
        return SortScheme.of(
             keys=keys,
             name=name,
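Both `or None` changes above normalize an empty mapped key list to `None` rather than `[]`, so `PartitionScheme.of` and `SortScheme.of` receive "no keys" instead of an empty key list. The idiom, for reference:

```python
# "or" returns its right operand when the left is falsy, so an empty list of
# mapped keys becomes None while any non-empty list passes through unchanged.
assert ([] or None) is None
assert (["bucket_key"] or None) == ["bucket_key"]
```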