deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,205 @@
1
+ import time
2
+ import os
3
+ import posixpath
4
+ import pyarrow.fs
5
+ from pyarrow.fs import FileSelector, FileType
6
+ from itertools import chain
7
+ from deltacat.storage.model.transaction import Transaction
8
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
+ from deltacat.constants import (
10
+ TXN_DIR_NAME,
11
+ RUNNING_TXN_DIR_NAME,
12
+ FAILED_TXN_DIR_NAME,
13
+ TXN_PART_SEPARATOR,
14
+ )
15
+ from deltacat.storage.model.types import TransactionState
16
+ import logging
17
+ from deltacat import logs
18
+
19
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
+
21
+
22
def brute_force_search_matching_metafiles(
    dirty_files_names, filesystem: pyarrow.fs.FileSystem, catalog_root
):
    """
    Delete catalog files whose names reference a dirty transaction.

    The transaction ID of each dirty file is taken to be the second
    ``TXN_PART_SEPARATOR``-delimited part of its name. The directory tree
    under ``catalog_root`` is then walked recursively (skipping any directory
    named ``TXN_DIR_NAME``) and every file whose base name contains one of
    those IDs is deleted. Listing and deletion failures are logged and do not
    abort the walk. Finally, a move is issued for each dirty file inside the
    failed transaction log directory.

    Args:
        dirty_files_names: Iterable of transaction log file names (base names,
            not full paths) to clean up after.
        filesystem: PyArrow filesystem used for every list, delete, and move.
        catalog_root: Catalog root directory, as a path that ``filesystem``
            understands.
    """
    txn_dir_name = TXN_DIR_NAME

    # Collect the transaction IDs embedded in the dirty file names. Empty IDs
    # are dropped because "" is a substring of every file name, so keeping one
    # would delete every file found under the catalog root.
    transaction_ids = set()
    for dirty_file in dirty_files_names:
        parts = dirty_file.split(TXN_PART_SEPARATOR)
        if len(parts) < 2 or not parts[1]:
            continue
        transaction_ids.add(parts[1])

    def recursive_search(path):
        try:
            entries = filesystem.get_file_info(FileSelector(path, recursive=False))
        except Exception as e:
            logger.error(f"Error listing directory '{path}': {e}")
            return

        for entry in entries:
            base_name = posixpath.basename(entry.path)
            if entry.type == FileType.File:
                # Delete at most once per file, even if several transaction
                # IDs appear in its name.
                if any(txn_id in base_name for txn_id in transaction_ids):
                    try:
                        filesystem.delete_file(entry.path)
                        logger.debug(f"Deleted file: {entry.path}")
                    except Exception as e:
                        logger.error(f"Error deleting file '{entry.path}': {e}")
            elif entry.type == FileType.Directory:
                # Never descend into the transaction log directory itself.
                if base_name == txn_dir_name:
                    logger.debug(f"Skipping directory: {entry.path}")
                    continue
                recursive_search(entry.path)

    # With no transaction IDs there is nothing to match, so skip the walk.
    if transaction_ids:
        recursive_search(catalog_root)

    # renaming to successful completion
    # NOTE(review): the source and destination paths are currently identical
    # (the rename that would mark the log as cleaned is commented out upstream),
    # so this move does not change the file name. Confirm the intended target
    # name before relying on this step.
    failed_txn_log_dir = posixpath.join(
        catalog_root, TXN_DIR_NAME, FAILED_TXN_DIR_NAME
    )
    for dirty_file in dirty_files_names:
        old_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        new_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        try:
            filesystem.move(old_log_path, new_log_path)
            logger.debug(f"Renamed file from {old_log_path} to {new_log_path}")
        except Exception as e:
            logger.error(f"Error renaming file '{old_log_path}': {e}")
78
+
79
+
80
def janitor_delete_timed_out_transaction(catalog_root: str) -> None:
    """
    Move expired running transaction logs to the failed transactions
    directory and delete the catalog files that reference them.

    Every file in the running transactions directory is inspected. The last
    ``TXN_PART_SEPARATOR``-delimited part of its name is parsed as a number
    and compared with ``time.time_ns()``; when it is not greater than the
    current time the log is copied into the failed transactions directory and
    the original is deleted. Per-file errors are logged and do not stop the
    scan. The names of all moved logs are then handed to
    ``brute_force_search_matching_metafiles``.

    Args:
        catalog_root: Catalog root directory; it is resolved to a normalized
            path and filesystem with ``resolve_path_and_filesystem``.
    """
    catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)

    # Make sure the destination exists before copying logs into it; otherwise
    # the very first timed-out transaction in a catalog cannot be moved.
    filesystem.create_dir(failed_txn_log_dir, recursive=True)

    dirty_files = []

    running_txn_file_selector = FileSelector(running_txn_log_dir, recursive=False)
    running_txn_info_list = filesystem.get_file_info(running_txn_file_selector)

    for running_txn_info in running_txn_info_list:
        # Only regular files can be transaction logs that are copied below.
        if running_txn_info.type != FileType.File:
            continue
        try:
            filename = posixpath.basename(running_txn_info.path)
            end_time = float(filename.split(TXN_PART_SEPARATOR)[-1])
            if end_time <= time.time_ns():
                src_path = running_txn_info.path
                dest_path = posixpath.join(failed_txn_log_dir, filename)

                # Move the file using copy and delete
                with filesystem.open_input_file(src_path) as src_file:
                    contents = src_file.read()

                with filesystem.open_output_stream(dest_path) as dest_file:
                    dest_file.write(contents)
                filesystem.delete_file(src_path)

                dirty_files.append(filename)

        except Exception as e:
            logger.error(
                f"Error cleaning failed transaction '{running_txn_info.path}': {e}"
            )

    # Pass catalog_root to the brute force search so it searches from the right place
    brute_force_search_matching_metafiles(
        dirty_files, filesystem, catalog_root_normalized
    )
127
+
128
+
129
def janitor_remove_files_in_failed(
    catalog_root: str, filesystem: pyarrow.fs.FileSystem = None
) -> None:
    """
    Cleans up metafiles and locator files associated with failed transactions.

    Each entry in the failed transactions directory is read as a
    ``Transaction``. When its state is ``TransactionState.FAILED``, every path
    listed in the ``metafile_write_paths`` and ``locator_write_paths`` of its
    operations is deleted (relative to the catalog root), the log with the
    transaction's ID in the running transactions directory is deleted, and the
    failed log is renamed to the transaction ID. Transactions in any other
    state, including ``PURGED``, are left untouched. Errors are logged per
    transaction and never propagate to the caller.

    Args:
        catalog_root: Catalog root directory.
        filesystem: Optional PyArrow filesystem to use; when omitted it is
            resolved from ``catalog_root``.
    """
    if filesystem is None:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)
    else:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
            catalog_root, filesystem
        )

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    filesystem.create_dir(failed_txn_log_dir, recursive=True)

    failed_txn_file_selector = FileSelector(failed_txn_log_dir, recursive=False)
    failed_txn_info_list = filesystem.get_file_info(failed_txn_file_selector)

    for failed_txn_info in failed_txn_info_list:
        try:
            txn = Transaction.read(failed_txn_info.path, filesystem)
            failed_txn_basename = posixpath.basename(failed_txn_info.path)

            # Query the state once; a failure here skips this transaction via
            # the outer handler.
            try:
                state = txn.state(catalog_root_normalized)
            except Exception:
                logger.error("Could not check attribute")
                raise

            if state != TransactionState.FAILED:
                continue

            txnid = txn.id

            operations = txn["operations"]
            known_write_paths = chain.from_iterable(
                (op["metafile_write_paths"] + op["locator_write_paths"])
                for op in operations
            )

            for write_path in known_write_paths:
                full_path = posixpath.join(catalog_root_normalized, write_path)
                try:
                    filesystem.delete_file(full_path)
                except Exception as e:
                    logger.error(f"Failed to delete file '{full_path}': {e}")

            new_filename = f"{txnid}"

            new_failed_txn_log_file_path = posixpath.join(
                failed_txn_log_dir, new_filename
            )
            running_txn_log_path = posixpath.join(running_txn_log_dir, new_filename)

            # Go through the resolved filesystem rather than the os module:
            # os has no delete() function, and os-level calls would not work
            # on non-local filesystems.
            try:
                filesystem.delete_file(running_txn_log_path)
            except Exception as e:
                logger.error(
                    f"Failed to delete running transaction log "
                    f"'{running_txn_log_path}': {e}"
                )

            # Skip the rename when the log already carries the target name;
            # moving a file onto itself is not safe on every filesystem.
            if failed_txn_basename != new_filename:
                filesystem.move(failed_txn_info.path, new_failed_txn_log_file_path)
            logger.debug(f"Cleaned up failed transaction: {failed_txn_basename}")

        except Exception as e:
            logger.error(
                f"Could not read transaction '{failed_txn_info.path}', skipping: {e}"
            )
201
+
202
+
203
def janitor_job(catalog_root_dir: str) -> None:
    """
    Run both janitor passes, in order, against one catalog root.

    Args:
        catalog_root_dir: Catalog root directory handed to each pass.
    """
    cleanup_passes = (
        janitor_delete_timed_out_transaction,
        janitor_remove_files_in_failed,
    )
    for cleanup_pass in cleanup_passes:
        cleanup_pass(catalog_root_dir)
@@ -21,11 +21,16 @@ def _run_cmd(cmd: str) -> None:
21
21
  assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code}"
22
22
 
23
23
 
24
- def _ray_up(cluster_cfg: str, restart_only: bool = False) -> None:
24
+ def _ray_up(
25
+ cluster_cfg: str, cluster_name_override: str = None, restart_only: bool = False
26
+ ) -> None:
25
27
  restart_flag = "--no-restart" if not restart_only else "--restart-only"
28
+ cluster_name_option = (
29
+ f"-n '{cluster_name_override}'" if cluster_name_override else ""
30
+ )
26
31
  print(f"Starting Ray cluster from '{cluster_cfg}'")
27
32
  _run_cmd(
28
- f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} --disable-usage-stats"
33
+ f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} {cluster_name_option} --disable-usage-stats"
29
34
  )
30
35
  print(f"Started Ray cluster from '{cluster_cfg}'")
31
36
 
@@ -70,6 +75,7 @@ def _get_head_node_ip(cluster_cfg: str) -> str:
70
75
  check=True,
71
76
  )
72
77
  # the head node IP should be the last line printed to stdout
78
+ # TODO(pdames): add IPv6 support
73
79
  head_node_ip = proc.stdout.splitlines()[-1]
74
80
  if not re.match(
75
81
  r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
@@ -122,6 +128,7 @@ class DeltaCatJobClient(JobSubmissionClient):
122
128
  head_node_ip: str = None,
123
129
  dashboard_wait_time_seconds: int = 30,
124
130
  port: Union[int, str] = "8265",
131
+ cluster_name_override: str = None,
125
132
  ):
126
133
  job_submission_client_url = None
127
134
  try:
@@ -129,10 +136,12 @@ class DeltaCatJobClient(JobSubmissionClient):
129
136
  if cluster_cfg_file_path:
130
137
  if launch_cluster:
131
138
  if not _ray_cluster_running(cluster_cfg_file_path) or restart_ray:
132
- _ray_up(cluster_cfg_file_path)
139
+ _ray_up(cluster_cfg_file_path, cluster_name_override)
133
140
  elif restart_ray:
134
141
  if _ray_cluster_running(cluster_cfg_file_path):
135
- _ray_up(cluster_cfg_file_path, restart_ray)
142
+ _ray_up(
143
+ cluster_cfg_file_path, restart_ray, cluster_name_override
144
+ )
136
145
  else:
137
146
  raise RuntimeError(
138
147
  f"Cannot Restart Ray: Ray Cluster for "
@@ -322,12 +331,12 @@ class DeltaCatJobClient(JobSubmissionClient):
322
331
 
323
332
  def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
324
333
  """
325
- Create a Ray Job Client that can be used to submit jobs to a local Ray
334
+ Create a DeltaCAT Job Client that can be used to submit jobs to a local Ray
326
335
  cluster. Initializes Ray if it's not already running.
327
336
 
328
337
  Args:
329
- *args: Positional arguments to pass to `ray.init()`.
330
- **kwargs: Keyword arguments to pass to `ray.init()`.
338
+ *args: Positional arguments to pass to `deltacat.init()`.
339
+ **kwargs: Keyword arguments to pass to `deltacat.init()`.
331
340
  Returns:
332
341
  DeltaCatJobClient: A client instance that can be used to submit and
333
342
  manage local Ray jobs.
@@ -335,10 +344,11 @@ def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
335
344
  Raises:
336
345
  RuntimeError: If a local Ray Job Server cannot be found.
337
346
  """
338
- if not dc.is_initialized():
339
- context = dc.init(*args, **kwargs)
340
- else:
341
- context = dc.init(ray_init_args={"ignore_reinit_error": True})
347
+ # force reinitialization to ensure that we can get the Ray context
348
+ kwargs["force"] = True
349
+ context = dc.init(*args, **kwargs)
350
+ if context is None:
351
+ raise RuntimeError("Failed to retrieve Ray context.")
342
352
  if context.dashboard_url:
343
353
  head_node_ip, port = context.dashboard_url.split(":")
344
354
  else:
@@ -365,9 +375,11 @@ def job_client(
365
375
  head_node_ip: str = None,
366
376
  dashboard_wait_time_seconds: int = 15,
367
377
  port: Union[str, int] = "8265",
378
+ cluster_name_override: str = None,
368
379
  ) -> DeltaCatJobClient:
369
380
  """
370
- Create a DeltaCAT Job Client that can be used to submit jobs to a remote Ray cluster.
381
+ Create a DeltaCAT Job Client that can be used to submit jobs to a remote
382
+ Ray cluster.
371
383
 
372
384
  Args:
373
385
  cluster_cfg_file_path: Path to the Ray Cluster Launcher
@@ -401,4 +413,5 @@ def job_client(
401
413
  head_node_ip=head_node_ip,
402
414
  dashboard_wait_time_seconds=dashboard_wait_time_seconds,
403
415
  port=port,
416
+ cluster_name_override=cluster_name_override,
404
417
  )
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Optional, Dict, Any
2
+ from typing import Optional, Dict, Any, List
3
3
  from deltacat import logs
4
4
  from deltacat.storage import (
5
5
  Delta,
@@ -61,6 +61,7 @@ def _estimate_resources_required_to_process_delta_using_previous_inflation(
61
61
  def _estimate_resources_required_to_process_delta_using_type_params(
62
62
  delta: Delta,
63
63
  operation_type: OperationType,
64
+ all_column_names: List[str],
64
65
  estimate_resources_params: EstimateResourcesParams,
65
66
  deltacat_storage: unimplemented_deltacat_storage,
66
67
  deltacat_storage_kwargs: Dict[str, Any],
@@ -93,11 +94,30 @@ def _estimate_resources_required_to_process_delta_using_type_params(
93
94
  on_disk_size_bytes=delta.meta.content_length,
94
95
  ),
95
96
  )
96
-
97
+ file_reader_kwargs_provider = kwargs.get(
98
+ "file_reader_kwargs_provider"
99
+ ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
100
+
101
+ """
102
+ NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
103
+ 1. Nested within deltacat_storage_kwargs during resource estimation
104
+ 2. As a top-level attribute of CompactPartitionsParams during compaction
105
+
106
+ This creates an inconsistent parameter path between resource estimation and compaction flows.
107
+ As a long-term solution, this should be unified to use a single consistent path (either always
108
+ nested in deltacat_storage_kwargs or always as a top-level parameter).
109
+
110
+ For now, this implementation handles the resource estimation case by:
111
+ 1. First checking for file_reader_kwargs_provider as a direct kwarg
112
+ 2. Falling back to deltacat_storage_kwargs if not found
113
+ This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
114
+ """
97
115
  appended = append_content_type_params(
98
116
  delta=delta,
117
+ all_column_names=all_column_names,
99
118
  deltacat_storage=deltacat_storage,
100
119
  deltacat_storage_kwargs=deltacat_storage_kwargs,
120
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
101
121
  )
102
122
 
103
123
  if not appended:
@@ -152,6 +172,10 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
152
172
  operation_type == OperationType.PYARROW_DOWNLOAD
153
173
  ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
154
174
 
175
+ if not estimate_resources_params.max_files_to_sample:
176
+ # we cannot calculate if we cannot sample
177
+ return None
178
+
155
179
  if not delta.manifest:
156
180
  delta.manifest = deltacat_storage.get_delta_manifest(
157
181
  delta.locator,
@@ -168,10 +192,6 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
168
192
  ),
169
193
  )
170
194
 
171
- if not estimate_resources_params.max_files_to_sample:
172
- # we cannot calculate if we cannot sample
173
- return None
174
-
175
195
  sampled_in_memory_size = 0.0
176
196
  sampled_on_disk_size = 0.0
177
197
  sampled_num_rows = 0
@@ -234,6 +254,10 @@ RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
234
254
  _estimate_resources_required_to_process_delta_using_file_sampling,
235
255
  _estimate_resources_required_to_process_delta_using_previous_inflation,
236
256
  ],
257
+ ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION: [
258
+ _estimate_resources_required_to_process_delta_using_file_sampling,
259
+ _estimate_resources_required_to_process_delta_using_previous_inflation,
260
+ ],
237
261
  }
238
262
 
239
263
 
@@ -267,10 +291,18 @@ def estimate_resources_required_to_process_delta(
267
291
  estimate_resources_params.resource_estimation_method
268
292
  )
269
293
 
294
+ all_column_names = deltacat_storage.get_table_version_column_names(
295
+ delta.locator.namespace,
296
+ delta.locator.table_name,
297
+ delta.locator.table_version,
298
+ **deltacat_storage_kwargs,
299
+ )
300
+
270
301
  for func in functions:
271
302
  resources = func(
272
303
  delta=delta,
273
304
  operation_type=operation_type,
305
+ all_column_names=all_column_names,
274
306
  estimate_resources_params=estimate_resources_params,
275
307
  deltacat_storage=deltacat_storage,
276
308
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -23,6 +23,14 @@ class ResourceEstimationMethod(str, Enum):
23
23
  """
24
24
  DEFAULT_V2 = "DEFAULT_V2"
25
25
 
26
+ """
27
+ This approach combines file sampling estimation and inflation based methods
28
+ and runs them in the order specified below:
29
+ 1. FILE_SAMPLING
30
+ 2. PREVIOUS_INFLATION
31
+ """
32
+ FILE_SAMPLING_WITH_PREVIOUS_INFLATION = "FILE_SAMPLING_WITH_PREVIOUS_INFLATION"
33
+
26
34
  """
27
35
  This approach strictly uses previous inflation and average record size to arrive
28
36
  at a resource estimate. It requires users to pass in previous inflation and average
deltacat/constants.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import botocore.exceptions
3
4
 
5
+ from daft.exceptions import DaftTransientError
4
6
  from deltacat.utils.common import env_string, env_bool
5
- import os
7
+ from deltacat.utils.common import env_integer
6
8
 
7
9
  # Environment variables
8
10
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -40,7 +42,7 @@ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
40
42
  )
41
43
  DELTACAT_ROOT = env_string(
42
44
  "DELTACAT_ROOT",
43
- os.path.join(os.getcwd(), ".deltacat"),
45
+ "",
44
46
  )
45
47
 
46
48
  # CLI Args
@@ -92,7 +94,10 @@ REVISION_DIR_NAME: str = "rev"
92
94
  TXN_DIR_NAME: str = "txn"
93
95
  RUNNING_TXN_DIR_NAME: str = "running"
94
96
  FAILED_TXN_DIR_NAME: str = "failed"
97
+ PAUSED_TXN_DIR_NAME: str = "paused"
95
98
  SUCCESS_TXN_DIR_NAME: str = "success"
99
+ DATA_FILE_DIR_NAME: str = "data"
100
+ REV_DIR_NAME: str = "rev"
96
101
  TXN_PART_SEPARATOR = "_"
97
102
 
98
103
  # Storage interface defaults
@@ -104,3 +109,41 @@ DEFAULT_TABLE_VERSION = "1"
104
109
  DEFAULT_STREAM_ID = "stream"
105
110
  DEFAULT_PARTITION_ID = "partition"
106
111
  DEFAULT_PARTITION_VALUES = ["default"]
112
+
113
+ # Transaction Status constants
114
+ SUCCESSFULLY_CLEANED = "cleaned"
115
+ CURRENTLY_CLEANING = "cleaning"
116
+ TIMEOUT_TXN = "timedout"
117
+
118
+ # operation timeout constants
119
+ OPERATION_TIMEOUTS = {
120
+ "create": 5,
121
+ "update": 3,
122
+ "delete": 4,
123
+ "read_siblings": 2,
124
+ "read_children": 2,
125
+ "read_latest": 3,
126
+ "read_exists": 1,
127
+ }
128
+ # Upload/Download Retry Defaults
129
+ UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
130
+ "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
131
+ )
132
+ UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
133
+ "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
134
+ )
135
+ DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
136
+ "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
137
+ )
138
+ DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
139
+ "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
140
+ ) # 5 mins
141
+ RETRYABLE_TRANSIENT_ERRORS = (
142
+ OSError,
143
+ botocore.exceptions.ConnectionError,
144
+ botocore.exceptions.HTTPClientError,
145
+ botocore.exceptions.NoCredentialsError,
146
+ botocore.exceptions.ConnectTimeoutError,
147
+ botocore.exceptions.ReadTimeoutError,
148
+ DaftTransientError,
149
+ )