deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- # from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
1
+ from deltacat.constants import DEFAULT_NAMESPACE
2
2
  from deltacat.utils.ray_utils.concurrency import (
3
3
  invoke_parallel,
4
4
  task_resource_options_provider,
@@ -13,14 +13,12 @@ from deltacat import logs
13
13
  from deltacat.compute.converter.model.converter_session_params import (
14
14
  ConverterSessionParams,
15
15
  )
16
-
17
-
16
+ from typing import Dict, List, Any, Callable
18
17
  from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
19
18
  from deltacat.compute.converter.steps.convert import convert
20
19
  from deltacat.compute.converter.model.convert_input import ConvertInput
21
20
  from deltacat.compute.converter.pyiceberg.overrides import (
22
21
  fetch_all_bucket_files,
23
- parquet_files_dict_to_iceberg_data_files,
24
22
  )
25
23
  from deltacat.compute.converter.utils.converter_session_utils import (
26
24
  construct_iceberg_table_prefix,
@@ -33,48 +31,112 @@ from deltacat.compute.converter.pyiceberg.catalog import load_table
33
31
  from deltacat.compute.converter.utils.converter_session_utils import (
34
32
  group_all_files_to_each_bucket,
35
33
  )
34
+ from deltacat.compute.converter.model.convert_result import ConvertResult
35
+ from deltacat.compute.converter.utils.converter_session_utils import (
36
+ _get_snapshot_action_description,
37
+ _determine_snapshot_type,
38
+ SnapshotType,
39
+ )
40
+
41
+ from pyiceberg.manifest import DataFile
42
+ from pyiceberg.table.metadata import TableMetadata
36
43
 
37
44
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
38
45
 
39
46
 
40
- def converter_session(params: ConverterSessionParams, **kwargs):
47
+ def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
41
48
  """
42
- Convert equality delete to position delete.
43
- Compute and memory heavy work from downloading equality delete table and compute position deletes
44
- will be executed on Ray remote tasks.
49
+ Convert equality deletes to position deletes with option to enforce primary key uniqueness.
50
+
51
+ This function processes Iceberg table files to convert equality delete files to position delete files.
52
+ It can optionally enforce primary key uniqueness by keeping only the latest version of each
53
+ primary key across all data files.
54
+
55
+ **Memory Requirements:**
56
+ - Minimum 512MB of free memory is required to run the converter
57
+
58
+ **Process Overview:**
59
+ 1. Fetches all bucket files (data files, equality deletes, position deletes)
60
+ 2. Groups files by bucket for parallel processing
61
+ 3. Converts equality deletes to position deletes using Ray parallel tasks
62
+ 4. Enforces primary key uniqueness if enabled
63
+ 5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
64
+
65
+
66
+ Args:
67
+ params: ConverterSessionParams containing all configuration parameters
68
+ - catalog: Iceberg catalog instance
69
+ - iceberg_table_name: Name of the target Iceberg table
70
+ - enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
71
+ - iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
72
+ - iceberg_namespace: Iceberg namespace
73
+ - merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
74
+ - compact_previous_position_delete_files: Whether to compact existing position delete files
75
+ - task_max_parallelism: Maximum number of parallel Ray tasks
76
+ - s3_client_kwargs: Additional S3 client configuration
77
+ - s3_file_system: S3 file system instance
78
+ - location_provider_prefix_override: Optional prefix override for file locations
79
+ - position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
80
+ **kwargs: Additional keyword arguments (currently unused)
81
+
82
+ Raises:
83
+ Exception: If snapshot commitment fails or other critical errors occur
84
+
45
85
  """
46
86
 
47
87
  catalog = params.catalog
48
88
  table_name = params.iceberg_table_name
49
- iceberg_table = load_table(catalog, table_name)
89
+ if "." not in table_name:
90
+ iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
91
+ table_name = params.iceberg_table_name
92
+ table_identifier = f"{iceberg_namespace}.{table_name}"
93
+ else:
94
+ table_identifier = table_name
95
+ identifier_parts = table_identifier.split(".")
96
+ iceberg_namespace = identifier_parts[0]
97
+ table_name = identifier_parts[1]
98
+ iceberg_table = load_table(catalog, table_identifier)
50
99
  enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
100
+ iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
101
+ merge_keys = params.merge_keys
102
+ compact_previous_position_delete_files = (
103
+ params.compact_previous_position_delete_files
104
+ )
105
+ task_max_parallelism = params.task_max_parallelism
106
+ s3_client_kwargs = params.s3_client_kwargs
107
+ s3_file_system = params.filesystem
108
+ location_provider_prefix_override = params.location_provider_prefix_override
109
+ position_delete_for_multiple_data_files = (
110
+ params.position_delete_for_multiple_data_files
111
+ )
112
+
51
113
  data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
52
114
  iceberg_table
53
115
  )
116
+
54
117
  convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
55
118
  data_file_dict=data_file_dict,
56
119
  equality_delete_dict=equality_delete_dict,
57
120
  pos_delete_dict=pos_delete_dict,
58
121
  )
59
- iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
60
- iceberg_namespace = params.iceberg_namespace
61
- iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
62
- iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
63
- table_name=table_name,
64
- iceberg_namespace=iceberg_namespace,
65
- )
66
- merge_keys = params.merge_keys
122
+
123
+ if not location_provider_prefix_override:
124
+ iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
125
+ iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
126
+ table_name=table_name,
127
+ iceberg_namespace=iceberg_namespace,
128
+ )
129
+ else:
130
+ iceberg_table_warehouse_prefix = location_provider_prefix_override
131
+
67
132
  # Using table identifier fields as merge keys if merge keys not provided
68
133
  if not merge_keys:
69
134
  identifier_fields_set = iceberg_table.schema().identifier_field_names()
70
135
  identifier_fields = list(identifier_fields_set)
71
136
  else:
72
137
  identifier_fields = merge_keys
73
- if len(identifier_fields) > 1:
74
- raise NotImplementedError(
75
- f"Multiple identifier fields lookup not supported yet."
76
- )
77
- convert_options_provider = functools.partial(
138
+
139
+ convert_options_provider: Callable = functools.partial(
78
140
  task_resource_options_provider,
79
141
  resource_amount_provider=convert_resource_options_provider,
80
142
  )
@@ -86,58 +148,151 @@ def converter_session(params: ConverterSessionParams, **kwargs):
86
148
  # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
87
149
  max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
88
150
 
89
- compact_small_files = params.compact_small_files
90
- position_delete_for_multiple_data_files = (
91
- params.position_delete_for_multiple_data_files
92
- )
93
- task_max_parallelism = params.task_max_parallelism
94
-
95
- def convert_input_provider(index, item):
151
+ def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
152
+ task_opts = convert_options_provider(index, item)
96
153
  return {
97
154
  "convert_input": ConvertInput.of(
98
- files_for_each_bucket=item,
155
+ convert_input_files=item,
99
156
  convert_task_index=index,
100
157
  iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
101
158
  identifier_fields=identifier_fields,
102
- compact_small_files=compact_small_files,
159
+ compact_previous_position_delete_files=compact_previous_position_delete_files,
160
+ table_io=iceberg_table.io,
161
+ table_metadata=iceberg_table.metadata,
103
162
  enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
104
163
  position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
105
164
  max_parallel_data_file_download=max_parallel_data_file_download,
165
+ s3_client_kwargs=s3_client_kwargs,
166
+ filesystem=s3_file_system,
167
+ task_memory=task_opts["memory"],
106
168
  )
107
169
  }
108
170
 
171
+ logger.info(f"Getting remote convert tasks...")
109
172
  # Ray remote task: convert
110
- # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
111
173
  # TODO: Add split mechanism to split large buckets
112
174
  convert_tasks_pending = invoke_parallel(
113
- items=convert_input_files_for_all_buckets.items(),
175
+ items=convert_input_files_for_all_buckets,
114
176
  ray_task=convert,
115
177
  max_parallelism=task_max_parallelism,
116
178
  options_provider=convert_options_provider,
117
179
  kwargs_provider=convert_input_provider,
118
180
  )
119
- to_be_deleted_files_list = []
120
- to_be_added_files_dict_list = []
121
- convert_results = ray.get(convert_tasks_pending)
181
+
182
+ to_be_deleted_files_list: List[List[DataFile]] = []
183
+ logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
184
+
185
+ convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
186
+ logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
187
+
188
+ total_position_delete_record_count = sum(
189
+ convert_result.position_delete_record_count
190
+ for convert_result in convert_results
191
+ )
192
+ total_input_data_file_record_count = sum(
193
+ convert_result.input_data_files_record_count
194
+ for convert_result in convert_results
195
+ )
196
+ total_data_file_hash_columns_in_memory_sizes = sum(
197
+ convert_result.input_data_files_hash_columns_in_memory_sizes
198
+ for convert_result in convert_results
199
+ )
200
+ total_position_delete_file_in_memory_sizes = sum(
201
+ convert_result.position_delete_in_memory_sizes
202
+ for convert_result in convert_results
203
+ )
204
+ total_position_delete_on_disk_sizes = sum(
205
+ convert_result.position_delete_on_disk_sizes
206
+ for convert_result in convert_results
207
+ )
208
+ total_input_data_files_on_disk_size = sum(
209
+ convert_result.input_data_files_on_disk_size
210
+ for convert_result in convert_results
211
+ )
212
+
213
+ # Calculate memory usage statistics
214
+ max_peak_memory_usage = max(
215
+ convert_result.peak_memory_usage_bytes for convert_result in convert_results
216
+ )
217
+ avg_memory_usage_percentage = sum(
218
+ convert_result.memory_usage_percentage for convert_result in convert_results
219
+ ) / len(convert_results)
220
+ max_memory_usage_percentage = max(
221
+ convert_result.memory_usage_percentage for convert_result in convert_results
222
+ )
223
+
224
+ logger.info(
225
+ f"Aggregated stats for {table_identifier}: "
226
+ f"total position delete record count: {total_position_delete_record_count}, "
227
+ f"total input data file record count: {total_input_data_file_record_count}, "
228
+ f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
229
+ f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
230
+ f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
231
+ f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
232
+ f"max peak memory usage: {max_peak_memory_usage} bytes, "
233
+ f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
234
+ f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
235
+ )
236
+
237
+ to_be_added_files_list: List[DataFile] = []
122
238
  for convert_result in convert_results:
123
- to_be_deleted_files_list.extend(convert_result[0].values())
124
- to_be_added_files_dict_list.append(convert_result[1])
239
+ to_be_added_files = convert_result.to_be_added_files
240
+ to_be_deleted_files = convert_result.to_be_deleted_files
241
+
242
+ to_be_deleted_files_list.extend(to_be_deleted_files.values())
243
+ to_be_added_files_list.extend(to_be_added_files)
125
244
 
126
- new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
127
- io=iceberg_table.io,
128
- table_metadata=iceberg_table.metadata,
129
- files_dict_list=to_be_added_files_dict_list,
245
+ logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
246
+ logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
247
+
248
+ # Determine snapshot type and commit
249
+ snapshot_type = _determine_snapshot_type(
250
+ to_be_deleted_files_list, to_be_added_files_list
130
251
  )
131
252
 
132
- if not to_be_deleted_files_list:
133
- commit_append_snapshot(
134
- iceberg_table=iceberg_table,
135
- new_position_delete_files=new_position_delete_files,
253
+ if snapshot_type == SnapshotType.NONE:
254
+ logger.info(
255
+ _get_snapshot_action_description(
256
+ snapshot_type, to_be_deleted_files_list, to_be_added_files_list
257
+ )
136
258
  )
137
- else:
138
- commit_replace_snapshot(
139
- iceberg_table=iceberg_table,
140
- # equality_delete_files + data file that all rows are deleted
141
- to_be_deleted_files_list=to_be_deleted_files_list,
142
- new_position_delete_files=new_position_delete_files,
259
+ return
260
+
261
+ logger.info(
262
+ f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
263
+ )
264
+
265
+ try:
266
+ if snapshot_type == SnapshotType.APPEND:
267
+ logger.info(f"Committing append snapshot for {table_identifier}.")
268
+ updated_table_metadata = commit_append_snapshot(
269
+ iceberg_table=iceberg_table,
270
+ new_position_delete_files=to_be_added_files_list,
271
+ )
272
+ elif snapshot_type == SnapshotType.REPLACE:
273
+ logger.info(f"Committing replace snapshot for {table_identifier}.")
274
+ updated_table_metadata = commit_replace_snapshot(
275
+ iceberg_table=iceberg_table,
276
+ to_be_deleted_files=to_be_deleted_files_list,
277
+ new_position_delete_files=to_be_added_files_list,
278
+ )
279
+ elif snapshot_type == SnapshotType.DELETE:
280
+ logger.info(f"Committing delete snapshot for {table_identifier}.")
281
+ updated_table_metadata = commit_replace_snapshot(
282
+ iceberg_table=iceberg_table,
283
+ to_be_deleted_files=to_be_deleted_files_list,
284
+ new_position_delete_files=[], # No new files to add
285
+ )
286
+ else:
287
+ logger.warning(f"Unexpected snapshot type: {snapshot_type}")
288
+ return
289
+
290
+ logger.info(
291
+ f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
143
292
  )
293
+
294
+ # Return the updated table metadata with the new snapshot
295
+ return updated_table_metadata
296
+ except Exception as e:
297
+ logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
298
+ raise
@@ -1,20 +1,25 @@
1
1
  from __future__ import annotations
2
- from typing import Dict, List
2
+ from typing import Dict, List, Any, Optional
3
3
  from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
4
+ from fsspec import AbstractFileSystem
4
5
 
5
6
 
6
7
  class ConvertInput(Dict):
7
8
  @staticmethod
8
9
  def of(
9
- convert_input_files,
10
- convert_task_index,
11
- iceberg_table_warehouse_prefix,
12
- identifier_fields,
13
- compact_small_files,
14
- enforce_primary_key_uniqueness,
15
- position_delete_for_multiple_data_files,
16
- max_parallel_data_file_download,
17
- s3_file_system,
10
+ convert_input_files: ConvertInputFiles,
11
+ convert_task_index: int,
12
+ iceberg_table_warehouse_prefix: str,
13
+ identifier_fields: List[str],
14
+ table_io: Any,
15
+ table_metadata: Any,
16
+ compact_previous_position_delete_files: bool,
17
+ enforce_primary_key_uniqueness: bool,
18
+ position_delete_for_multiple_data_files: bool,
19
+ max_parallel_data_file_download: int,
20
+ filesystem: Optional[AbstractFileSystem],
21
+ s3_client_kwargs: Optional[Dict[str, Any]],
22
+ task_memory: float,
18
23
  ) -> ConvertInput:
19
24
 
20
25
  result = ConvertInput()
@@ -22,13 +27,19 @@ class ConvertInput(Dict):
22
27
  result["convert_task_index"] = convert_task_index
23
28
  result["identifier_fields"] = identifier_fields
24
29
  result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
25
- result["compact_small_files"] = compact_small_files
30
+ result["table_io"] = table_io
31
+ result["table_metadata"] = table_metadata
32
+ result[
33
+ "compact_previous_position_delete_files"
34
+ ] = compact_previous_position_delete_files
26
35
  result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
27
36
  result[
28
37
  "position_delete_for_multiple_data_files"
29
38
  ] = position_delete_for_multiple_data_files
30
39
  result["max_parallel_data_file_download"] = max_parallel_data_file_download
31
- result["s3_file_system"] = s3_file_system
40
+ result["filesystem"] = filesystem
41
+ result["s3_client_kwargs"] = s3_client_kwargs
42
+ result["task_memory"] = task_memory
32
43
 
33
44
  return result
34
45
 
@@ -49,8 +60,16 @@ class ConvertInput(Dict):
49
60
  return self["iceberg_table_warehouse_prefix"]
50
61
 
51
62
  @property
52
- def compact_small_files(self) -> bool:
53
- return self["compact_small_files"]
63
+ def table_io(self) -> Any:
64
+ return self["table_io"]
65
+
66
+ @property
67
+ def table_metadata(self) -> Any:
68
+ return self["table_metadata"]
69
+
70
+ @property
71
+ def compact_previous_position_delete_files(self) -> bool:
72
+ return self["compact_previous_position_delete_files"]
54
73
 
55
74
  @property
56
75
  def enforce_primary_key_uniqueness(self) -> bool:
@@ -65,5 +84,13 @@ class ConvertInput(Dict):
65
84
  return self["max_parallel_data_file_download"]
66
85
 
67
86
  @property
68
- def s3_file_system(self):
69
- return self["s3_file_system"]
87
+ def filesystem(self) -> Optional[AbstractFileSystem]:
88
+ return self["filesystem"]
89
+
90
+ @property
91
+ def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
92
+ return self["s3_client_kwargs"]
93
+
94
+ @property
95
+ def task_memory(self) -> float:
96
+ return self["task_memory"]
@@ -1,15 +1,21 @@
1
1
  from __future__ import annotations
2
- from typing import Dict
2
+ from typing import Dict, List, Any, Optional, Tuple
3
+ from pyiceberg.manifest import DataFile
4
+
5
+ # Type aliases to simplify nested types
6
+ DataFileWithSequence = Tuple[int, DataFile] # (sequence_number, data_file)
7
+ DataFileList = List[DataFileWithSequence] # List of data files with sequence numbers
8
+ DataFileListGroup = List[DataFileList] # Group of data file lists
3
9
 
4
10
 
5
11
  class ConvertInputFiles(Dict):
6
12
  @staticmethod
7
13
  def of(
8
- partition_value,
9
- all_data_files_for_dedupe=None,
10
- applicable_data_files=None,
11
- applicable_equality_delete_files=None,
12
- existing_position_delete_files=None,
14
+ partition_value: Any,
15
+ all_data_files_for_dedupe: Optional[DataFileList] = None,
16
+ applicable_data_files: Optional[DataFileListGroup] = None,
17
+ applicable_equality_delete_files: Optional[DataFileListGroup] = None,
18
+ existing_position_delete_files: Optional[DataFileList] = None,
13
19
  ) -> ConvertInputFiles:
14
20
 
15
21
  result = ConvertInputFiles()
@@ -21,41 +27,52 @@ class ConvertInputFiles(Dict):
21
27
  return result
22
28
 
23
29
  @property
24
- def partition_value(self):
30
+ def partition_value(self) -> Any:
25
31
  return self["partition_value"]
26
32
 
27
33
  @property
28
- def all_data_files_for_dedupe(self):
34
+ def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
29
35
  return self["all_data_files_for_dedupe"]
30
36
 
31
37
  @property
32
- def applicable_data_files(self):
38
+ def applicable_data_files(self) -> Optional[DataFileListGroup]:
33
39
  return self["applicable_data_files"]
34
40
 
35
41
  @property
36
- def applicable_equality_delete_files(self):
42
+ def applicable_equality_delete_files(
43
+ self,
44
+ ) -> Optional[DataFileListGroup]:
37
45
  return self["applicable_equality_delete_files"]
38
46
 
39
47
  @property
40
- def existing_position_delete_files(self):
48
+ def existing_position_delete_files(self) -> Optional[DataFileList]:
41
49
  return self["existing_position_delete_files"]
42
50
 
43
51
  @partition_value.setter
44
- def partition_value(self, partition_value):
52
+ def partition_value(self, partition_value: Any) -> None:
45
53
  self["partition_value"] = partition_value
46
54
 
47
55
  @all_data_files_for_dedupe.setter
48
- def all_data_files_for_dedupe(self, all_data_files_for_dedupe):
56
+ def all_data_files_for_dedupe(
57
+ self, all_data_files_for_dedupe: Optional[DataFileList]
58
+ ) -> None:
49
59
  self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
50
60
 
51
61
  @applicable_data_files.setter
52
- def applicable_data_files(self, applicable_data_files):
62
+ def applicable_data_files(
63
+ self, applicable_data_files: Optional[DataFileListGroup]
64
+ ) -> None:
53
65
  self["applicable_data_files"] = applicable_data_files
54
66
 
55
67
  @applicable_equality_delete_files.setter
56
- def applicable_equality_delete_files(self, applicable_equality_delete_files):
68
+ def applicable_equality_delete_files(
69
+ self,
70
+ applicable_equality_delete_files: Optional[DataFileListGroup],
71
+ ) -> None:
57
72
  self["applicable_equality_delete_files"] = applicable_equality_delete_files
58
73
 
59
74
  @existing_position_delete_files.setter
60
- def existing_position_delete_files(self, existing_position_delete_files):
75
+ def existing_position_delete_files(
76
+ self, existing_position_delete_files: Optional[DataFileList]
77
+ ) -> None:
61
78
  self["existing_position_delete_files"] = existing_position_delete_files
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, List, Any
3
+ from pyiceberg.manifest import DataFile
4
+
5
+
6
class ConvertResult(Dict):
    """Dictionary-backed record describing the outcome of one convert task.

    Every field is stored as a plain dictionary entry whose key equals the
    field name; the read-only properties below simply look that entry up.
    """

    @staticmethod
    def of(
        convert_task_index: int,
        to_be_added_files: List[DataFile],
        to_be_deleted_files: Dict[Any, List[DataFile]],
        position_delete_record_count: int,
        input_data_files_record_count: int,
        input_data_files_hash_columns_in_memory_sizes: int,
        position_delete_in_memory_sizes: int,
        position_delete_on_disk_sizes: int,
        input_data_files_on_disk_size: int,
        peak_memory_usage_bytes: int,
        memory_usage_percentage: float,
    ) -> ConvertResult:
        """Create a ConvertResult that stores each argument under its own name.

        The entries are inserted in the same order as the parameters are
        declared, so iterating the result yields the keys in that order.
        """
        entries = {
            "convert_task_index": convert_task_index,
            "to_be_added_files": to_be_added_files,
            "to_be_deleted_files": to_be_deleted_files,
            "position_delete_record_count": position_delete_record_count,
            "input_data_files_record_count": input_data_files_record_count,
            "input_data_files_hash_columns_in_memory_sizes": (
                input_data_files_hash_columns_in_memory_sizes
            ),
            "position_delete_in_memory_sizes": position_delete_in_memory_sizes,
            "position_delete_on_disk_sizes": position_delete_on_disk_sizes,
            "input_data_files_on_disk_size": input_data_files_on_disk_size,
            "peak_memory_usage_bytes": peak_memory_usage_bytes,
            "memory_usage_percentage": memory_usage_percentage,
        }
        record = ConvertResult()
        record.update(entries)
        return record

    @property
    def convert_task_index(self) -> int:
        """Entry stored under ``convert_task_index``."""
        return self["convert_task_index"]

    @property
    def to_be_added_files(self) -> List[DataFile]:
        """Entry stored under ``to_be_added_files``."""
        return self["to_be_added_files"]

    @property
    def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
        """Entry stored under ``to_be_deleted_files``."""
        return self["to_be_deleted_files"]

    @property
    def position_delete_record_count(self) -> int:
        """Entry stored under ``position_delete_record_count``."""
        return self["position_delete_record_count"]

    @property
    def input_data_files_record_count(self) -> int:
        """Entry stored under ``input_data_files_record_count``."""
        return self["input_data_files_record_count"]

    @property
    def input_data_files_hash_columns_in_memory_sizes(self) -> int:
        """Entry stored under ``input_data_files_hash_columns_in_memory_sizes``."""
        return self["input_data_files_hash_columns_in_memory_sizes"]

    @property
    def position_delete_in_memory_sizes(self) -> int:
        """Entry stored under ``position_delete_in_memory_sizes``."""
        return self["position_delete_in_memory_sizes"]

    @property
    def position_delete_on_disk_sizes(self) -> int:
        """Entry stored under ``position_delete_on_disk_sizes``."""
        return self["position_delete_on_disk_sizes"]

    @property
    def input_data_files_on_disk_size(self) -> int:
        """Entry stored under ``input_data_files_on_disk_size``."""
        return self["input_data_files_on_disk_size"]

    @property
    def peak_memory_usage_bytes(self) -> int:
        """Entry stored under ``peak_memory_usage_bytes``."""
        return self["peak_memory_usage_bytes"]

    @property
    def memory_usage_percentage(self) -> float:
        """Entry stored under ``memory_usage_percentage``."""
        return self["memory_usage_percentage"]