deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,9 @@
1
+ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
2
+
3
+ # Safe limit considering ONLY the CPU limit, typically 32 for an 8x-large worker
4
+ DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
5
+
6
+
7
+ # Unique identifier delimiter to ensure different primary keys don't end up with the same hash when concatenated.
8
+ # e.g.: pk columns (a, b) with values (1, 21) and (12, 1) would both become "121" when concatenated without a delimiter.
9
+ IDENTIFIER_FIELD_DELIMITER = "c303282d"
@@ -0,0 +1,298 @@
1
+ from deltacat.constants import DEFAULT_NAMESPACE
2
+ from deltacat.utils.ray_utils.concurrency import (
3
+ invoke_parallel,
4
+ task_resource_options_provider,
5
+ )
6
+ import ray
7
+ import functools
8
+ from deltacat.compute.converter.utils.convert_task_options import (
9
+ convert_resource_options_provider,
10
+ )
11
+ import logging
12
+ from deltacat import logs
13
+ from deltacat.compute.converter.model.converter_session_params import (
14
+ ConverterSessionParams,
15
+ )
16
+ from typing import Dict, List, Any, Callable
17
+ from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
18
+ from deltacat.compute.converter.steps.convert import convert
19
+ from deltacat.compute.converter.model.convert_input import ConvertInput
20
+ from deltacat.compute.converter.pyiceberg.overrides import (
21
+ fetch_all_bucket_files,
22
+ )
23
+ from deltacat.compute.converter.utils.converter_session_utils import (
24
+ construct_iceberg_table_prefix,
25
+ )
26
+ from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
27
+ commit_replace_snapshot,
28
+ commit_append_snapshot,
29
+ )
30
+ from deltacat.compute.converter.pyiceberg.catalog import load_table
31
+ from deltacat.compute.converter.utils.converter_session_utils import (
32
+ group_all_files_to_each_bucket,
33
+ )
34
+ from deltacat.compute.converter.model.convert_result import ConvertResult
35
+ from deltacat.compute.converter.utils.converter_session_utils import (
36
+ _get_snapshot_action_description,
37
+ _determine_snapshot_type,
38
+ SnapshotType,
39
+ )
40
+
41
+ from pyiceberg.manifest import DataFile
42
+ from pyiceberg.table.metadata import TableMetadata
43
+
44
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
45
+
46
+
47
+ def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
48
+ """
49
+ Convert equality deletes to position deletes with option to enforce primary key uniqueness.
50
+
51
+ This function processes Iceberg table files to convert equality delete files to position delete files.
52
+ It can optionally enforce primary key uniqueness by keeping only the latest version of each
53
+ primary key across all data files.
54
+
55
+ **Memory Requirements:**
56
+ - Minimum 512MB of free memory is required to run the converter
57
+
58
+ **Process Overview:**
59
+ 1. Fetches all bucket files (data files, equality deletes, position deletes)
60
+ 2. Groups files by bucket for parallel processing
61
+ 3. Converts equality deletes to position deletes using Ray parallel tasks
62
+ 4. Enforces primary key uniqueness if enabled
63
+ 5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
64
+
65
+
66
+ Args:
67
+ params: ConverterSessionParams containing all configuration parameters
68
+ - catalog: Iceberg catalog instance
69
+ - iceberg_table_name: Name of the target Iceberg table
70
+ - enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
71
+ - iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
72
+ - iceberg_namespace: Iceberg namespace
73
+ - merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
74
+ - compact_previous_position_delete_files: Whether to compact existing position delete files
75
+ - task_max_parallelism: Maximum number of parallel Ray tasks
76
+ - s3_client_kwargs: Additional S3 client configuration
77
+ - s3_file_system: S3 file system instance
78
+ - location_provider_prefix_override: Optional prefix override for file locations
79
+ - position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
80
+ **kwargs: Additional keyword arguments (currently unused)
81
+
82
+ Raises:
83
+ Exception: If snapshot commitment fails or other critical errors occur
84
+
85
+ """
86
+
87
+ catalog = params.catalog
88
+ table_name = params.iceberg_table_name
89
+ if "." not in table_name:
90
+ iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
91
+ table_name = params.iceberg_table_name
92
+ table_identifier = f"{iceberg_namespace}.{table_name}"
93
+ else:
94
+ table_identifier = table_name
95
+ identifier_parts = table_identifier.split(".")
96
+ iceberg_namespace = identifier_parts[0]
97
+ table_name = identifier_parts[1]
98
+ iceberg_table = load_table(catalog, table_identifier)
99
+ enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
100
+ iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
101
+ merge_keys = params.merge_keys
102
+ compact_previous_position_delete_files = (
103
+ params.compact_previous_position_delete_files
104
+ )
105
+ task_max_parallelism = params.task_max_parallelism
106
+ s3_client_kwargs = params.s3_client_kwargs
107
+ s3_file_system = params.filesystem
108
+ location_provider_prefix_override = params.location_provider_prefix_override
109
+ position_delete_for_multiple_data_files = (
110
+ params.position_delete_for_multiple_data_files
111
+ )
112
+
113
+ data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
114
+ iceberg_table
115
+ )
116
+
117
+ convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
118
+ data_file_dict=data_file_dict,
119
+ equality_delete_dict=equality_delete_dict,
120
+ pos_delete_dict=pos_delete_dict,
121
+ )
122
+
123
+ if not location_provider_prefix_override:
124
+ iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
125
+ iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
126
+ table_name=table_name,
127
+ iceberg_namespace=iceberg_namespace,
128
+ )
129
+ else:
130
+ iceberg_table_warehouse_prefix = location_provider_prefix_override
131
+
132
+ # Using table identifier fields as merge keys if merge keys not provided
133
+ if not merge_keys:
134
+ identifier_fields_set = iceberg_table.schema().identifier_field_names()
135
+ identifier_fields = list(identifier_fields_set)
136
+ else:
137
+ identifier_fields = merge_keys
138
+
139
+ convert_options_provider: Callable = functools.partial(
140
+ task_resource_options_provider,
141
+ resource_amount_provider=convert_resource_options_provider,
142
+ )
143
+
144
+ # TODO (zyiqin): max_parallel_data_file_download should be determined by memory requirement for each bucket.
145
+ # Specifically, for the case when the memory requirement of one bucket's files exceeds one worker node's memory limit, WITHOUT rebasing with a larger hash bucket count,
146
+ # 1. We can control parallel files to download by adjusting max_parallel_data_file_download.
147
+ # 2. Implement two-layer converter tasks, with convert tasks to spin up child convert tasks.
148
+ # Note that approach 2 will ideally require a shared object store to avoid downloading the equality delete files once per child task.
149
+ max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
150
+
151
+ def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
152
+ task_opts = convert_options_provider(index, item)
153
+ return {
154
+ "convert_input": ConvertInput.of(
155
+ convert_input_files=item,
156
+ convert_task_index=index,
157
+ iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
158
+ identifier_fields=identifier_fields,
159
+ compact_previous_position_delete_files=compact_previous_position_delete_files,
160
+ table_io=iceberg_table.io,
161
+ table_metadata=iceberg_table.metadata,
162
+ enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
163
+ position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
164
+ max_parallel_data_file_download=max_parallel_data_file_download,
165
+ s3_client_kwargs=s3_client_kwargs,
166
+ filesystem=s3_file_system,
167
+ task_memory=task_opts["memory"],
168
+ )
169
+ }
170
+
171
+ logger.info(f"Getting remote convert tasks...")
172
+ # Ray remote task: convert
173
+ # TODO: Add split mechanism to split large buckets
174
+ convert_tasks_pending = invoke_parallel(
175
+ items=convert_input_files_for_all_buckets,
176
+ ray_task=convert,
177
+ max_parallelism=task_max_parallelism,
178
+ options_provider=convert_options_provider,
179
+ kwargs_provider=convert_input_provider,
180
+ )
181
+
182
+ to_be_deleted_files_list: List[List[DataFile]] = []
183
+ logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
184
+
185
+ convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
186
+ logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
187
+
188
+ total_position_delete_record_count = sum(
189
+ convert_result.position_delete_record_count
190
+ for convert_result in convert_results
191
+ )
192
+ total_input_data_file_record_count = sum(
193
+ convert_result.input_data_files_record_count
194
+ for convert_result in convert_results
195
+ )
196
+ total_data_file_hash_columns_in_memory_sizes = sum(
197
+ convert_result.input_data_files_hash_columns_in_memory_sizes
198
+ for convert_result in convert_results
199
+ )
200
+ total_position_delete_file_in_memory_sizes = sum(
201
+ convert_result.position_delete_in_memory_sizes
202
+ for convert_result in convert_results
203
+ )
204
+ total_position_delete_on_disk_sizes = sum(
205
+ convert_result.position_delete_on_disk_sizes
206
+ for convert_result in convert_results
207
+ )
208
+ total_input_data_files_on_disk_size = sum(
209
+ convert_result.input_data_files_on_disk_size
210
+ for convert_result in convert_results
211
+ )
212
+
213
+ # Calculate memory usage statistics
214
+ max_peak_memory_usage = max(
215
+ convert_result.peak_memory_usage_bytes for convert_result in convert_results
216
+ )
217
+ avg_memory_usage_percentage = sum(
218
+ convert_result.memory_usage_percentage for convert_result in convert_results
219
+ ) / len(convert_results)
220
+ max_memory_usage_percentage = max(
221
+ convert_result.memory_usage_percentage for convert_result in convert_results
222
+ )
223
+
224
+ logger.info(
225
+ f"Aggregated stats for {table_identifier}: "
226
+ f"total position delete record count: {total_position_delete_record_count}, "
227
+ f"total input data file record count: {total_input_data_file_record_count}, "
228
+ f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
229
+ f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
230
+ f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
231
+ f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
232
+ f"max peak memory usage: {max_peak_memory_usage} bytes, "
233
+ f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
234
+ f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
235
+ )
236
+
237
+ to_be_added_files_list: List[DataFile] = []
238
+ for convert_result in convert_results:
239
+ to_be_added_files = convert_result.to_be_added_files
240
+ to_be_deleted_files = convert_result.to_be_deleted_files
241
+
242
+ to_be_deleted_files_list.extend(to_be_deleted_files.values())
243
+ to_be_added_files_list.extend(to_be_added_files)
244
+
245
+ logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
246
+ logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
247
+
248
+ # Determine snapshot type and commit
249
+ snapshot_type = _determine_snapshot_type(
250
+ to_be_deleted_files_list, to_be_added_files_list
251
+ )
252
+
253
+ if snapshot_type == SnapshotType.NONE:
254
+ logger.info(
255
+ _get_snapshot_action_description(
256
+ snapshot_type, to_be_deleted_files_list, to_be_added_files_list
257
+ )
258
+ )
259
+ return
260
+
261
+ logger.info(
262
+ f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
263
+ )
264
+
265
+ try:
266
+ if snapshot_type == SnapshotType.APPEND:
267
+ logger.info(f"Committing append snapshot for {table_identifier}.")
268
+ updated_table_metadata = commit_append_snapshot(
269
+ iceberg_table=iceberg_table,
270
+ new_position_delete_files=to_be_added_files_list,
271
+ )
272
+ elif snapshot_type == SnapshotType.REPLACE:
273
+ logger.info(f"Committing replace snapshot for {table_identifier}.")
274
+ updated_table_metadata = commit_replace_snapshot(
275
+ iceberg_table=iceberg_table,
276
+ to_be_deleted_files=to_be_deleted_files_list,
277
+ new_position_delete_files=to_be_added_files_list,
278
+ )
279
+ elif snapshot_type == SnapshotType.DELETE:
280
+ logger.info(f"Committing delete snapshot for {table_identifier}.")
281
+ updated_table_metadata = commit_replace_snapshot(
282
+ iceberg_table=iceberg_table,
283
+ to_be_deleted_files=to_be_deleted_files_list,
284
+ new_position_delete_files=[], # No new files to add
285
+ )
286
+ else:
287
+ logger.warning(f"Unexpected snapshot type: {snapshot_type}")
288
+ return
289
+
290
+ logger.info(
291
+ f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
292
+ )
293
+
294
+ # Return the updated table metadata with the new snapshot
295
+ return updated_table_metadata
296
+ except Exception as e:
297
+ logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
298
+ raise
@@ -0,0 +1,96 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, List, Any, Optional
3
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
4
+ from fsspec import AbstractFileSystem
5
+
6
+
7
class ConvertInput(Dict):
    """Dict-backed bundle of the arguments passed to one convert task.

    Each value lives under a string key that is identical to the name of the
    read-only property exposing it. Instances are expected to be created
    through :meth:`of`; reading a property whose key was never stored raises
    ``KeyError``.
    """

    @staticmethod
    def of(
        convert_input_files: ConvertInputFiles,
        convert_task_index: int,
        iceberg_table_warehouse_prefix: str,
        identifier_fields: List[str],
        table_io: Any,
        table_metadata: Any,
        compact_previous_position_delete_files: bool,
        enforce_primary_key_uniqueness: bool,
        position_delete_for_multiple_data_files: bool,
        max_parallel_data_file_download: int,
        filesystem: Optional[AbstractFileSystem],
        s3_client_kwargs: Optional[Dict[str, Any]],
        task_memory: float,
    ) -> ConvertInput:
        """Build a ``ConvertInput`` that stores every argument unchanged.

        Each argument is saved under the key matching its parameter name and
        can be read back through the property of the same name.

        Returns:
            A new ``ConvertInput`` containing exactly the thirteen keys below.
        """
        result = ConvertInput()
        # Insertion order is kept the same as the historical key order.
        result.update(
            {
                "convert_input_files": convert_input_files,
                "convert_task_index": convert_task_index,
                "identifier_fields": identifier_fields,
                "iceberg_table_warehouse_prefix": iceberg_table_warehouse_prefix,
                "table_io": table_io,
                "table_metadata": table_metadata,
                "compact_previous_position_delete_files": compact_previous_position_delete_files,
                "enforce_primary_key_uniqueness": enforce_primary_key_uniqueness,
                "position_delete_for_multiple_data_files": position_delete_for_multiple_data_files,
                "max_parallel_data_file_download": max_parallel_data_file_download,
                "filesystem": filesystem,
                "s3_client_kwargs": s3_client_kwargs,
                "task_memory": task_memory,
            }
        )
        return result

    # Read-only accessors, listed in the same order as the ``of`` signature.

    @property
    def convert_input_files(self) -> ConvertInputFiles:
        """Value stored under ``"convert_input_files"``."""
        return self["convert_input_files"]

    @property
    def convert_task_index(self) -> int:
        """Value stored under ``"convert_task_index"``."""
        return self["convert_task_index"]

    @property
    def iceberg_table_warehouse_prefix(self) -> str:
        """Value stored under ``"iceberg_table_warehouse_prefix"``."""
        return self["iceberg_table_warehouse_prefix"]

    @property
    def identifier_fields(self) -> List[str]:
        """Value stored under ``"identifier_fields"``."""
        return self["identifier_fields"]

    @property
    def table_io(self) -> Any:
        """Value stored under ``"table_io"``."""
        return self["table_io"]

    @property
    def table_metadata(self) -> Any:
        """Value stored under ``"table_metadata"``."""
        return self["table_metadata"]

    @property
    def compact_previous_position_delete_files(self) -> bool:
        """Value stored under ``"compact_previous_position_delete_files"``."""
        return self["compact_previous_position_delete_files"]

    @property
    def enforce_primary_key_uniqueness(self) -> bool:
        """Value stored under ``"enforce_primary_key_uniqueness"``."""
        return self["enforce_primary_key_uniqueness"]

    @property
    def position_delete_for_multiple_data_files(self) -> bool:
        """Value stored under ``"position_delete_for_multiple_data_files"``."""
        return self["position_delete_for_multiple_data_files"]

    @property
    def max_parallel_data_file_download(self) -> int:
        """Value stored under ``"max_parallel_data_file_download"``."""
        return self["max_parallel_data_file_download"]

    @property
    def filesystem(self) -> Optional[AbstractFileSystem]:
        """Value stored under ``"filesystem"``; may be ``None``."""
        return self["filesystem"]

    @property
    def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
        """Value stored under ``"s3_client_kwargs"``; may be ``None``."""
        return self["s3_client_kwargs"]

    @property
    def task_memory(self) -> float:
        """Value stored under ``"task_memory"``."""
        return self["task_memory"]
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, List, Any, Optional, Tuple
3
+ from pyiceberg.manifest import DataFile
4
+
5
+ # Type aliases to simplify nested types
6
+ DataFileWithSequence = Tuple[int, DataFile] # (sequence_number, data_file)
7
+ DataFileList = List[DataFileWithSequence] # List of data files with sequence numbers
8
+ DataFileListGroup = List[DataFileList] # Group of data file lists
9
+
10
+
11
class ConvertInputFiles(Dict):
    """Dict-backed grouping of the file lists given to a convert task.

    Each value is stored under a string key equal to the name of the
    property that reads and writes it. Reading a property whose key has not
    been stored raises ``KeyError``.
    """

    @staticmethod
    def of(
        partition_value: Any,
        all_data_files_for_dedupe: Optional[DataFileList] = None,
        applicable_data_files: Optional[DataFileListGroup] = None,
        applicable_equality_delete_files: Optional[DataFileListGroup] = None,
        existing_position_delete_files: Optional[DataFileList] = None,
    ) -> ConvertInputFiles:
        """Build a ``ConvertInputFiles`` storing every argument unchanged.

        All five keys are always present in the result; the optional file
        lists are stored as ``None`` when not supplied.
        """
        result = ConvertInputFiles()
        # Keyword order here fixes the key insertion order of the dict.
        result.update(
            partition_value=partition_value,
            all_data_files_for_dedupe=all_data_files_for_dedupe,
            applicable_data_files=applicable_data_files,
            applicable_equality_delete_files=applicable_equality_delete_files,
            existing_position_delete_files=existing_position_delete_files,
        )
        return result

    # Each getter is followed directly by its setter.

    @property
    def partition_value(self) -> Any:
        """Value stored under ``"partition_value"``."""
        return self["partition_value"]

    @partition_value.setter
    def partition_value(self, partition_value: Any) -> None:
        self["partition_value"] = partition_value

    @property
    def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
        """Value stored under ``"all_data_files_for_dedupe"``."""
        return self["all_data_files_for_dedupe"]

    @all_data_files_for_dedupe.setter
    def all_data_files_for_dedupe(
        self, all_data_files_for_dedupe: Optional[DataFileList]
    ) -> None:
        self["all_data_files_for_dedupe"] = all_data_files_for_dedupe

    @property
    def applicable_data_files(self) -> Optional[DataFileListGroup]:
        """Value stored under ``"applicable_data_files"``."""
        return self["applicable_data_files"]

    @applicable_data_files.setter
    def applicable_data_files(
        self, applicable_data_files: Optional[DataFileListGroup]
    ) -> None:
        self["applicable_data_files"] = applicable_data_files

    @property
    def applicable_equality_delete_files(self) -> Optional[DataFileListGroup]:
        """Value stored under ``"applicable_equality_delete_files"``."""
        return self["applicable_equality_delete_files"]

    @applicable_equality_delete_files.setter
    def applicable_equality_delete_files(
        self, applicable_equality_delete_files: Optional[DataFileListGroup]
    ) -> None:
        self["applicable_equality_delete_files"] = applicable_equality_delete_files

    @property
    def existing_position_delete_files(self) -> Optional[DataFileList]:
        """Value stored under ``"existing_position_delete_files"``."""
        return self["existing_position_delete_files"]

    @existing_position_delete_files.setter
    def existing_position_delete_files(
        self, existing_position_delete_files: Optional[DataFileList]
    ) -> None:
        self["existing_position_delete_files"] = existing_position_delete_files
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, List, Any
3
+ from pyiceberg.manifest import DataFile
4
+
5
+
6
class ConvertResult(Dict):
    """Dict-backed record of the files and statistics reported by a convert task.

    Each value is stored under a string key equal to the name of the
    read-only property exposing it. Reading a property whose key has not
    been stored raises ``KeyError``.
    """

    @staticmethod
    def of(
        convert_task_index: int,
        to_be_added_files: List[DataFile],
        to_be_deleted_files: Dict[Any, List[DataFile]],
        position_delete_record_count: int,
        input_data_files_record_count: int,
        input_data_files_hash_columns_in_memory_sizes: int,
        position_delete_in_memory_sizes: int,
        position_delete_on_disk_sizes: int,
        input_data_files_on_disk_size: int,
        peak_memory_usage_bytes: int,
        memory_usage_percentage: float,
    ) -> ConvertResult:
        """Build a ``ConvertResult`` storing every argument unchanged.

        Each argument is saved under the key matching its parameter name.

        Returns:
            A new ``ConvertResult`` containing exactly the eleven keys below.
        """
        result = ConvertResult()
        # Keyword order here fixes the key insertion order of the dict.
        result.update(
            convert_task_index=convert_task_index,
            to_be_added_files=to_be_added_files,
            to_be_deleted_files=to_be_deleted_files,
            position_delete_record_count=position_delete_record_count,
            input_data_files_record_count=input_data_files_record_count,
            input_data_files_hash_columns_in_memory_sizes=input_data_files_hash_columns_in_memory_sizes,
            position_delete_in_memory_sizes=position_delete_in_memory_sizes,
            position_delete_on_disk_sizes=position_delete_on_disk_sizes,
            input_data_files_on_disk_size=input_data_files_on_disk_size,
            peak_memory_usage_bytes=peak_memory_usage_bytes,
            memory_usage_percentage=memory_usage_percentage,
        )
        return result

    # Read-only accessors, listed in the same order as the ``of`` signature.

    @property
    def convert_task_index(self) -> int:
        """Value stored under ``"convert_task_index"``."""
        return self["convert_task_index"]

    @property
    def to_be_added_files(self) -> List[DataFile]:
        """Value stored under ``"to_be_added_files"``."""
        return self["to_be_added_files"]

    @property
    def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
        """Value stored under ``"to_be_deleted_files"``."""
        return self["to_be_deleted_files"]

    @property
    def position_delete_record_count(self) -> int:
        """Value stored under ``"position_delete_record_count"``."""
        return self["position_delete_record_count"]

    @property
    def input_data_files_record_count(self) -> int:
        """Value stored under ``"input_data_files_record_count"``."""
        return self["input_data_files_record_count"]

    @property
    def input_data_files_hash_columns_in_memory_sizes(self) -> int:
        """Value stored under ``"input_data_files_hash_columns_in_memory_sizes"``."""
        return self["input_data_files_hash_columns_in_memory_sizes"]

    @property
    def position_delete_in_memory_sizes(self) -> int:
        """Value stored under ``"position_delete_in_memory_sizes"``."""
        return self["position_delete_in_memory_sizes"]

    @property
    def position_delete_on_disk_sizes(self) -> int:
        """Value stored under ``"position_delete_on_disk_sizes"``."""
        return self["position_delete_on_disk_sizes"]

    @property
    def input_data_files_on_disk_size(self) -> int:
        """Value stored under ``"input_data_files_on_disk_size"``."""
        return self["input_data_files_on_disk_size"]

    @property
    def peak_memory_usage_bytes(self) -> int:
        """Value stored under ``"peak_memory_usage_bytes"``."""
        return self["peak_memory_usage_bytes"]

    @property
    def memory_usage_percentage(self) -> float:
        """Value stored under ``"memory_usage_percentage"``."""
        return self["memory_usage_percentage"]