deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
14
14
  ExecutionCompactionResult,
15
15
  )
16
16
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
- from deltacat.compute.compactor.utils import round_completion_file as rcf
18
17
  from deltacat.compute.compactor import DeltaAnnotated
19
18
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
20
19
  DeleteStrategy,
@@ -27,9 +26,9 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
30
- Manifest,
31
- Partition,
29
+ PartitionLocator,
32
30
  )
31
+ from deltacat.storage.model.manifest import Manifest
33
32
  from deltacat.compute.compactor.model.compact_partition_params import (
34
33
  CompactPartitionParams,
35
34
  )
@@ -37,13 +36,14 @@ from deltacat.utils.resources import (
37
36
  get_current_process_peak_memory_usage_in_bytes,
38
37
  )
39
38
  from deltacat.compute.compactor_v2.private.compaction_utils import (
39
+ _get_rci_source_partition_locator,
40
40
  _fetch_compaction_metadata,
41
41
  _build_uniform_deltas,
42
42
  _group_uniform_deltas,
43
43
  _stage_new_partition,
44
44
  _run_hash_and_merge,
45
45
  _process_merge_results,
46
- _write_new_round_completion_file,
46
+ _create_round_completion_info,
47
47
  _commit_compaction_result,
48
48
  )
49
49
  from deltacat.utils.metrics import metrics
@@ -65,7 +65,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
65
65
 
66
66
  @metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
67
67
  @categorize_errors
68
- def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
68
+ def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
69
69
  assert (
70
70
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
71
71
  ), "hash_bucket_count is a required arg for compactor v2"
@@ -85,7 +85,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
85
85
  **kwargs,
86
86
  )
87
87
  _commit_compaction_result(params, execute_compaction_result)
88
- return execute_compaction_result.round_completion_file_s3_url
89
88
 
90
89
 
91
90
  def _execute_compaction(
@@ -100,12 +99,12 @@ def _execute_compaction(
100
99
  previous_compacted_delta_manifest,
101
100
  round_completion_info,
102
101
  ) = fetch_compaction_metadata_result
103
- rcf_source_partition_locator: rcf.PartitionLocator = (
104
- params.rebase_source_partition_locator or params.source_partition_locator
102
+ rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
103
+ params
105
104
  )
106
105
 
107
- base_audit_url: str = rcf_source_partition_locator.path(
108
- f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
106
+ base_audit_url: str = rci_source_partition_locator.path(
107
+ f"{params.compaction_artifact_path}/compaction-audit"
109
108
  )
110
109
  audit_url: str = f"{base_audit_url}.json"
111
110
  logger.info(f"Compaction audit will be written to {audit_url}")
@@ -140,9 +139,9 @@ def _execute_compaction(
140
139
  )
141
140
  if not input_deltas:
142
141
  logger.info("No input deltas found to compact.")
143
- return ExecutionCompactionResult(None, None, None, False)
142
+ return ExecutionCompactionResult(None, None, False)
144
143
  build_uniform_deltas_result: tuple[
145
- List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition
144
+ List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
146
145
  ] = _build_uniform_deltas(
147
146
  params, compaction_audit, input_deltas, delta_discovery_start
148
147
  )
@@ -203,13 +202,13 @@ def _execute_compaction(
203
202
 
204
203
  compaction_audit.save_round_completion_stats(mat_results)
205
204
 
206
- compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
205
+ compaction_result: ExecutionCompactionResult = _create_round_completion_info(
207
206
  params,
208
207
  compaction_audit,
209
208
  compacted_partition,
210
209
  audit_url,
211
210
  hb_id_to_entry_indices_range,
212
- rcf_source_partition_locator,
211
+ rci_source_partition_locator,
213
212
  new_compacted_delta_locator,
214
213
  pyarrow_write_result,
215
214
  round_completion_info,
@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
49
49
  ] = [
50
50
  (is_delete, list(delete_delta_group))
51
51
  for (is_delete, _), delete_delta_group in itertools.groupby(
52
- input_deltas, lambda d: (d.type is DeltaType.DELETE, d.delete_parameters)
52
+ input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
53
53
  )
54
54
  ]
55
55
  for (
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
89
89
  consecutive_delete_tables: List[pa.Table] = []
90
90
  for delete_delta in delete_delta_sequence:
91
91
  assert (
92
- delete_delta.delete_parameters is not None
92
+ delete_delta.meta.entry_params is not None
93
93
  ), "Delete type deltas are required to have delete parameters defined"
94
94
  delete_columns: Optional[
95
95
  List[str]
96
- ] = delete_delta.delete_parameters.equality_column_names
96
+ ] = delete_delta.meta.entry_params.equality_field_locators
97
97
  assert len(delete_columns) > 0, "At least 1 delete column is required"
98
98
  # delete columns should exist in underlying table
99
99
  delete_dataset = params.deltacat_storage.download_delta(
@@ -13,7 +13,6 @@ from typing import Optional
13
13
  class ExecutionCompactionResult:
14
14
  new_compacted_partition: Optional[Partition]
15
15
  new_round_completion_info: Optional[RoundCompletionInfo]
16
- round_completion_file_s3_url: Optional[str]
17
16
  is_inplace_compacted: bool
18
17
 
19
18
  def __iter__(self):
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
4
4
  from deltacat.utils.metrics import MetricsConfig
5
5
  from deltacat.utils.common import ReadKwargsProvider
6
6
  from deltacat.io.object_store import IObjectStore
7
- from deltacat.storage import interface as unimplemented_deltacat_storage
7
+ from deltacat.storage import metastore
8
8
  from deltacat.compute.compactor import DeltaAnnotated
9
9
 
10
10
 
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
15
15
  primary_keys: List[str],
16
16
  num_hash_buckets: int,
17
17
  num_hash_groups: int,
18
+ all_column_names: List[str],
18
19
  hb_task_index: Optional[int] = 0,
19
20
  enable_profiler: Optional[bool] = False,
20
21
  metrics_config: Optional[MetricsConfig] = None,
21
22
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
22
23
  object_store: Optional[IObjectStore] = None,
23
- deltacat_storage=unimplemented_deltacat_storage,
24
+ deltacat_storage=metastore,
24
25
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
25
26
  memory_logs_enabled: Optional[bool] = None,
26
27
  ) -> HashBucketInput:
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
31
32
  result["hb_task_index"] = hb_task_index
32
33
  result["num_hash_buckets"] = num_hash_buckets
33
34
  result["num_hash_groups"] = num_hash_groups
35
+ result["all_column_names"] = all_column_names
34
36
  result["enable_profiler"] = enable_profiler
35
37
  result["metrics_config"] = metrics_config
36
38
  result["read_kwargs_provider"] = read_kwargs_provider
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
61
63
  def num_hash_groups(self) -> int:
62
64
  return self["num_hash_groups"]
63
65
 
66
+ @property
67
+ def all_column_names(self) -> List[str]:
68
+ return self["all_column_names"]
69
+
64
70
  @property
65
71
  def enable_profiler(self) -> Optional[bool]:
66
72
  return self.get("enable_profiler")
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
78
84
  return self.get("object_store")
79
85
 
80
86
  @property
81
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
87
+ def deltacat_storage(self) -> metastore:
82
88
  return self.get("deltacat_storage")
83
89
 
84
90
  @property
@@ -16,7 +16,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
16
16
  hash_group_index_to_hash_bucket_indices,
17
17
  )
18
18
 
19
- from deltacat.storage import interface as unimplemented_deltacat_storage
19
+ from deltacat.storage import metastore
20
20
 
21
21
  from deltacat.io.object_store import IObjectStore
22
22
 
@@ -87,11 +87,13 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
87
87
  def __init__(
88
88
  self,
89
89
  uniform_deltas: List[DeltaAnnotated],
90
+ all_column_names: List[str],
90
91
  read_kwargs_provider: Optional[ReadKwargsProvider],
91
- deltacat_storage=unimplemented_deltacat_storage,
92
+ deltacat_storage=metastore,
92
93
  deltacat_storage_kwargs: Optional[dict] = None,
93
94
  ):
94
95
  self._deltas = uniform_deltas
96
+ self._all_column_names = all_column_names
95
97
  self._read_kwargs_provider = read_kwargs_provider
96
98
  self._deltacat_storage = deltacat_storage
97
99
  self._deltacat_storage_kwargs = deltacat_storage_kwargs
@@ -110,6 +112,7 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
110
112
  total_size_bytes,
111
113
  ) = read_delta_file_envelopes(
112
114
  annotated_delta,
115
+ self._all_column_names,
113
116
  self._read_kwargs_provider,
114
117
  self._deltacat_storage,
115
118
  self._deltacat_storage_kwargs,
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Dict, List, Optional, Any
3
+ from typing import Dict, List, Optional, Any, Set
4
4
 
5
5
  from deltacat.compute.compactor_v2.model.merge_file_group import (
6
6
  MergeFileGroupsProvider,
@@ -12,9 +12,10 @@ from deltacat.utils.metrics import MetricsConfig
12
12
  from deltacat.utils.common import ReadKwargsProvider
13
13
  from deltacat.io.object_store import IObjectStore
14
14
  from deltacat.storage import (
15
+ Manifest,
15
16
  Partition,
16
17
  SortKey,
17
- interface as unimplemented_deltacat_storage,
18
+ metastore,
18
19
  )
19
20
  from deltacat.compute.compactor_v2.constants import (
20
21
  DROP_DUPLICATES,
@@ -32,23 +33,26 @@ class MergeInput(Dict):
32
33
  write_to_partition: Partition,
33
34
  compacted_file_content_type: ContentType,
34
35
  primary_keys: List[str],
36
+ all_column_names: List[str],
35
37
  drop_duplicates: Optional[bool] = DROP_DUPLICATES,
36
38
  sort_keys: Optional[List[SortKey]] = None,
37
39
  merge_task_index: Optional[int] = 0,
38
40
  max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
39
41
  enable_profiler: Optional[bool] = False,
40
42
  metrics_config: Optional[MetricsConfig] = None,
41
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
43
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
42
44
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
43
45
  round_completion_info: Optional[RoundCompletionInfo] = None,
44
46
  object_store: Optional[IObjectStore] = None,
45
47
  delete_strategy: Optional[DeleteStrategy] = None,
46
- delete_file_envelopes: Optional[List] = None,
47
- deltacat_storage=unimplemented_deltacat_storage,
48
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
49
+ deltacat_storage=metastore,
48
50
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
49
51
  memory_logs_enabled: Optional[bool] = None,
50
52
  disable_copy_by_reference: Optional[bool] = None,
51
53
  hash_bucket_count: Optional[int] = None,
54
+ original_fields: Optional[Set[str]] = None,
55
+ compacted_manifest: Optional[Manifest] = None,
52
56
  ) -> MergeInput:
53
57
 
54
58
  result = MergeInput()
@@ -56,13 +60,14 @@ class MergeInput(Dict):
56
60
  result["write_to_partition"] = write_to_partition
57
61
  result["compacted_file_content_type"] = compacted_file_content_type
58
62
  result["primary_keys"] = primary_keys
63
+ result["all_column_names"] = all_column_names
59
64
  result["drop_duplicates"] = drop_duplicates
60
65
  result["sort_keys"] = sort_keys
61
66
  result["merge_task_index"] = merge_task_index
62
67
  result["max_records_per_output_file"] = max_records_per_output_file
63
68
  result["enable_profiler"] = enable_profiler
64
69
  result["metrics_config"] = metrics_config
65
- result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
70
+ result["table_writer_kwargs"] = table_writer_kwargs or {}
66
71
  result["read_kwargs_provider"] = read_kwargs_provider
67
72
  result["round_completion_info"] = round_completion_info
68
73
  result["object_store"] = object_store
@@ -73,6 +78,8 @@ class MergeInput(Dict):
73
78
  result["memory_logs_enabled"] = memory_logs_enabled
74
79
  result["disable_copy_by_reference"] = disable_copy_by_reference
75
80
  result["hash_bucket_count"] = hash_bucket_count
81
+ result["original_fields"] = original_fields
82
+ result["compacted_manifest"] = compacted_manifest
76
83
  return result
77
84
 
78
85
  @property
@@ -91,6 +98,10 @@ class MergeInput(Dict):
91
98
  def primary_keys(self) -> List[str]:
92
99
  return self["primary_keys"]
93
100
 
101
+ @property
102
+ def all_column_names(self) -> List[str]:
103
+ return self["all_column_names"]
104
+
94
105
  @property
95
106
  def drop_duplicates(self) -> int:
96
107
  return self["drop_duplicates"]
@@ -116,8 +127,8 @@ class MergeInput(Dict):
116
127
  return self.get("metrics_config")
117
128
 
118
129
  @property
119
- def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
120
- return self.get("s3_table_writer_kwargs")
130
+ def table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
131
+ return self.get("table_writer_kwargs")
121
132
 
122
133
  @property
123
134
  def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
@@ -132,7 +143,7 @@ class MergeInput(Dict):
132
143
  return self.get("object_store")
133
144
 
134
145
  @property
135
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
146
+ def deltacat_storage(self) -> metastore:
136
147
  return self["deltacat_storage"]
137
148
 
138
149
  @property
@@ -160,3 +171,11 @@ class MergeInput(Dict):
160
171
  @property
161
172
  def hash_bucket_count(self) -> int:
162
173
  return self["hash_bucket_count"]
174
+
175
+ @property
176
+ def original_fields(self) -> Optional[Set[str]]:
177
+ return self.get("original_fields")
178
+
179
+ @property
180
+ def compacted_manifest(self) -> Optional[Manifest]:
181
+ return self.get("compacted_manifest")