deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  import json
2
-
2
+ import tempfile
3
3
  import unittest
4
+ import uuid
4
5
 
5
6
 
6
7
  class TestCompactPartitionParams(unittest.TestCase):
@@ -8,9 +9,14 @@ class TestCompactPartitionParams(unittest.TestCase):
8
9
  def setUpClass(cls):
9
10
  from deltacat.types.media import ContentType
10
11
  from deltacat.utils.metrics import MetricsConfig, MetricsTarget
12
+ from deltacat.catalog import CatalogProperties
13
+
14
+ # Create a temporary catalog for testing
15
+ tmpdir = tempfile.mkdtemp()
16
+ cls.test_catalog = CatalogProperties(root=tmpdir)
11
17
 
12
18
  cls.VALID_COMPACT_PARTITION_PARAMS = {
13
- "compaction_artifact_s3_bucket": "foobar",
19
+ "catalog": cls.test_catalog,
14
20
  "compacted_file_content_type": ContentType.PARQUET,
15
21
  "deltacat_storage": "foobar",
16
22
  "destination_partition_locator": {
@@ -23,15 +29,16 @@ class TestCompactPartitionParams(unittest.TestCase):
23
29
  "tableVersion": "1",
24
30
  },
25
31
  "streamId": "foobar",
26
- "storageType": "fooType",
32
+ "format": "fooType",
27
33
  },
28
34
  "partitionValues": [],
29
- "partitionId": None,
35
+ "partitionId": str(uuid.uuid4()),
30
36
  },
31
37
  "hash_bucket_count": 200,
32
38
  "last_stream_position_to_compact": 168000000000,
33
39
  "list_deltas_kwargs": {"equivalent_table_types": []},
34
40
  "primary_keys": {"id"},
41
+ "all_column_names": ["id", "foo", "bar", "baz"],
35
42
  "properties": {
36
43
  "parent_stream_position": "1688000000000",
37
44
  },
@@ -47,12 +54,12 @@ class TestCompactPartitionParams(unittest.TestCase):
47
54
  "table_version": "1",
48
55
  },
49
56
  "streamId": "foobar",
50
- "storageType": "fooType",
57
+ "format": "fooType",
51
58
  },
52
59
  "partitionValues": [],
53
60
  "partitionId": "79612ea39ac5493eae925abe60767d42",
54
61
  },
55
- "s3_table_writer_kwargs": {
62
+ "table_writer_kwargs": {
56
63
  "version": "1.0",
57
64
  "flavor": "foobar",
58
65
  "coerce_timestamps": "ms",
@@ -67,7 +74,7 @@ class TestCompactPartitionParams(unittest.TestCase):
67
74
  "tableVersion": "2",
68
75
  },
69
76
  "streamId": "foobar",
70
- "storageType": "fooType",
77
+ "format": "fooType",
71
78
  },
72
79
  "partitionValues": [],
73
80
  "partitionId": "79612ea39ac5493eae925abe60767d42",
@@ -103,10 +110,8 @@ class TestCompactPartitionParams(unittest.TestCase):
103
110
  json.loads(serialized_params)["compacted_file_content_type"]
104
111
  == params.compacted_file_content_type
105
112
  )
106
- assert (
107
- json.loads(serialized_params)["compaction_artifact_s3_bucket"]
108
- == params.compaction_artifact_s3_bucket
109
- )
113
+ catalog_json = json.loads(serialized_params)["catalog"]
114
+ assert catalog_json["_root"] == params.catalog.root
110
115
  assert (
111
116
  json.loads(serialized_params)["hash_bucket_count"]
112
117
  == params.hash_bucket_count
@@ -1,43 +1,40 @@
1
- import ray
1
+ import tempfile
2
2
  import os
3
- from moto import mock_s3
3
+ from typing import Any, Callable, Dict, List, Optional, Set
4
4
  import pytest
5
- import boto3
6
- from boto3.resources.base import ServiceResource
7
5
  import pyarrow as pa
6
+ import ray
7
+
8
8
  from deltacat.io.file_object_store import FileObjectStore
9
9
  from pytest_benchmark.fixture import BenchmarkFixture
10
- import tempfile
11
10
 
12
11
  from deltacat.tests.compute.test_util_constant import (
13
- TEST_S3_RCF_BUCKET_NAME,
14
12
  DEFAULT_NUM_WORKERS,
15
13
  DEFAULT_WORKER_INSTANCE_CPUS,
16
14
  )
17
15
  from deltacat.tests.compute.test_util_common import (
18
- get_rcf,
16
+ get_rci_from_partition,
17
+ read_audit_file,
18
+ PartitionKey,
19
+ get_compacted_delta_locator_from_partition,
19
20
  )
20
- from deltacat.tests.test_utils.utils import read_s3_contents
21
- from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
21
  from deltacat.tests.compute.test_util_common import (
23
- get_compacted_delta_locator_from_rcf,
22
+ create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
24
23
  )
24
+
25
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
25
26
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
26
27
  CompactionSessionAuditInfo,
27
28
  )
28
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
29
- create_src_w_deltas_destination_rebase_w_deltas_strategy,
30
- )
31
29
  from deltacat.tests.compute.compact_partition_rebase_test_cases import (
32
30
  REBASE_TEST_CASES,
33
31
  )
34
- from typing import Any, Callable, Dict, List, Optional, Set
35
- from deltacat.types.media import StorageType
32
+ from deltacat.types.media import StorageType, ContentType
36
33
  from deltacat.storage import (
37
34
  DeltaLocator,
38
35
  Partition,
36
+ metastore,
39
37
  )
40
- from deltacat.types.media import ContentType
41
38
  from deltacat.compute.compactor.model.compact_partition_params import (
42
39
  CompactPartitionParams,
43
40
  )
@@ -48,11 +45,6 @@ from deltacat.utils.placement import (
48
45
  PlacementGroupManager,
49
46
  )
50
47
 
51
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
52
- "db_file_path",
53
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
54
- )
55
-
56
48
 
57
49
  """
58
50
  MODULE scoped fixtures
@@ -66,54 +58,11 @@ def setup_ray_cluster():
66
58
  ray.shutdown()
67
59
 
68
60
 
69
- @pytest.fixture(autouse=True, scope="module")
70
- def mock_aws_credential():
71
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
72
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
73
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
74
- os.environ["AWS_SESSION_TOKEN"] = "testing"
75
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
76
- yield
77
-
78
-
79
- @pytest.fixture(autouse=True, scope="module")
80
- def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
81
- # make sure the database file is deleted after all the compactor package tests are completed
82
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
83
- os.remove(DATABASE_FILE_PATH_VALUE)
84
-
85
-
86
- @pytest.fixture(scope="module")
87
- def s3_resource(mock_aws_credential):
88
- with mock_s3():
89
- yield boto3.resource("s3")
90
-
91
-
92
- @pytest.fixture(autouse=True, scope="module")
93
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
94
- s3_resource.create_bucket(
95
- ACL="authenticated-read",
96
- Bucket=TEST_S3_RCF_BUCKET_NAME,
97
- )
98
- yield
99
-
100
-
101
61
  """
102
62
  FUNCTION scoped fixtures
103
63
  """
104
64
 
105
65
 
106
- @pytest.fixture(scope="function")
107
- def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
108
- # see deltacat/tests/local_deltacat_storage/README.md for documentation
109
- kwargs_for_local_deltacat_storage: Dict[str, Any] = {
110
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
111
- }
112
- yield kwargs_for_local_deltacat_storage
113
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
114
- os.remove(DATABASE_FILE_PATH_VALUE)
115
-
116
-
117
66
  @pytest.fixture(autouse=True, scope="function")
118
67
  def enable_bucketing_spec_validation(monkeypatch):
119
68
  """
@@ -199,14 +148,13 @@ def enable_bucketing_spec_validation(monkeypatch):
199
148
  ],
200
149
  ids=[test_name for test_name in REBASE_TEST_CASES],
201
150
  )
202
- def test_compact_partition_rebase_same_source_and_destination(
151
+ def test_compact_partition_rebase_same_source_and_destination_main(
203
152
  mocker,
204
- s3_resource: ServiceResource,
205
- local_deltacat_storage_kwargs: Dict[str, Any],
153
+ main_deltacat_storage_kwargs: Dict[str, Any],
206
154
  test_name: str,
207
155
  primary_keys: Set[str],
208
156
  sort_keys: List[Optional[Any]],
209
- partition_keys_param: Optional[List[Any]],
157
+ partition_keys_param: Optional[List[PartitionKey]],
210
158
  partition_values_param: List[Optional[str]],
211
159
  input_deltas_param: List[pa.Array],
212
160
  input_deltas_delta_type: str,
@@ -225,21 +173,20 @@ def test_compact_partition_rebase_same_source_and_destination(
225
173
  compact_partition_func: Callable,
226
174
  benchmark: BenchmarkFixture,
227
175
  ):
228
- import deltacat.tests.local_deltacat_storage as ds
229
-
230
- ds_mock_kwargs = local_deltacat_storage_kwargs
176
+ ds_mock_kwargs = main_deltacat_storage_kwargs
231
177
  """
232
178
  This test tests the scenario where source partition locator == destination partition locator,
233
179
  but rebase source partition locator is different.
234
180
  This scenario could occur when hash bucket count changes.
181
+
182
+ This version uses the main metastore implementation instead of local storage.
235
183
  """
236
184
  partition_keys = partition_keys_param
237
185
  (
238
186
  source_table_stream,
239
187
  _,
240
188
  rebased_table_stream,
241
- ) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
242
- primary_keys,
189
+ ) = create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
243
190
  sort_keys,
244
191
  partition_keys,
245
192
  input_deltas_param,
@@ -247,14 +194,31 @@ def test_compact_partition_rebase_same_source_and_destination(
247
194
  partition_values_param,
248
195
  ds_mock_kwargs,
249
196
  )
250
- source_partition: Partition = ds.get_partition(
197
+
198
+ # Convert partition values for partition lookup (same as in the helper function)
199
+ converted_partition_values_for_lookup = partition_values_param
200
+ if partition_values_param and partition_keys:
201
+ converted_partition_values_for_lookup = []
202
+ for i, (value, pk) in enumerate(zip(partition_values_param, partition_keys)):
203
+ if pk.key_type.value == "int": # Use .value to get string representation
204
+ converted_partition_values_for_lookup.append(int(value))
205
+ else:
206
+ converted_partition_values_for_lookup.append(value)
207
+
208
+ source_partition: Partition = metastore.get_partition(
251
209
  source_table_stream.locator,
252
- partition_values_param,
210
+ converted_partition_values_for_lookup,
253
211
  **ds_mock_kwargs,
254
212
  )
255
- rebased_partition: Partition = ds.get_partition(
213
+ rebased_partition: Partition = metastore.get_partition(
256
214
  rebased_table_stream.locator,
257
- partition_values_param,
215
+ converted_partition_values_for_lookup,
216
+ **ds_mock_kwargs,
217
+ )
218
+ all_column_names = metastore.get_table_version_column_names(
219
+ rebased_table_stream.locator.table_locator.namespace,
220
+ rebased_table_stream.locator.table_locator.table_name,
221
+ rebased_table_stream.locator.table_version_locator.table_version,
258
222
  **ds_mock_kwargs,
259
223
  )
260
224
  num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
@@ -269,10 +233,10 @@ def test_compact_partition_rebase_same_source_and_destination(
269
233
  with tempfile.TemporaryDirectory() as test_dir:
270
234
  compact_partition_params = CompactPartitionParams.of(
271
235
  {
272
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
236
+ "catalog": ds_mock_kwargs.get("inner"),
273
237
  "compacted_file_content_type": ContentType.PARQUET,
274
238
  "dd_max_parallelism_ratio": 1.0,
275
- "deltacat_storage": ds,
239
+ "deltacat_storage": metastore,
276
240
  "deltacat_storage_kwargs": ds_mock_kwargs,
277
241
  "destination_partition_locator": rebased_partition.locator,
278
242
  "hash_bucket_count": hash_bucket_count_param,
@@ -284,11 +248,11 @@ def test_compact_partition_rebase_same_source_and_destination(
284
248
  "object_store": FileObjectStore(test_dir),
285
249
  "pg_config": pgm,
286
250
  "primary_keys": primary_keys,
251
+ "all_column_names": all_column_names,
287
252
  "read_kwargs_provider": read_kwargs_provider_param,
288
253
  "rebase_source_partition_locator": source_partition.locator,
289
254
  "rebase_source_partition_high_watermark": rebased_partition.stream_position,
290
255
  "records_per_compacted_file": records_per_compacted_file_param,
291
- "s3_client_kwargs": {},
292
256
  "source_partition_locator": rebased_partition.locator,
293
257
  "sort_keys": sort_keys if sort_keys else None,
294
258
  "drop_duplicates": drop_duplicates_param,
@@ -305,16 +269,14 @@ def test_compact_partition_rebase_same_source_and_destination(
305
269
  object_store_put_many_spy = mocker.spy(FileObjectStore, "put_many")
306
270
 
307
271
  # execute
308
- rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
272
+ benchmark(compact_partition_func, compact_partition_params)
309
273
 
310
- round_completion_info: RoundCompletionInfo = get_rcf(
311
- s3_resource, rcf_file_s3_uri
312
- )
313
- audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
314
- round_completion_info.compaction_audit_url
274
+ # Get RoundCompletionInfo from the compacted partition
275
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
276
+ rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
315
277
  )
316
278
 
317
- # assert if RCF covers all files
279
+ # assert if RCI covers all files
318
280
  if compactor_version != CompactorVersion.V1.value:
319
281
  previous_end = None
320
282
  for start, end in round_completion_info.hb_index_to_entry_range.values():
@@ -325,8 +287,12 @@ def test_compact_partition_rebase_same_source_and_destination(
325
287
  == round_completion_info.compacted_pyarrow_write_result.files
326
288
  )
327
289
 
328
- compaction_audit_obj: Dict[str, Any] = read_s3_contents(
329
- s3_resource, audit_bucket, audit_key
290
+ # Get catalog root for audit file resolution
291
+ catalog = ds_mock_kwargs.get("inner")
292
+ catalog_root = catalog.root
293
+
294
+ compaction_audit_obj: Dict[str, Any] = read_audit_file(
295
+ round_completion_info.compaction_audit_url, catalog_root
330
296
  )
331
297
  compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
332
298
  **compaction_audit_obj
@@ -336,13 +302,17 @@ def test_compact_partition_rebase_same_source_and_destination(
336
302
  assert (
337
303
  execute_compaction_result_spy.call_args.args[-1] is False
338
304
  ), "Table version erroneously marked as in-place compacted!"
339
- compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
340
- s3_resource, rcf_file_s3_uri
305
+ compacted_delta_locator: DeltaLocator = (
306
+ get_compacted_delta_locator_from_partition(
307
+ rebased_partition.locator,
308
+ metastore,
309
+ catalog=ds_mock_kwargs.get("inner"),
310
+ )
341
311
  )
342
312
  assert (
343
313
  compacted_delta_locator.stream_position == last_stream_position_to_compact
344
314
  ), "Compacted delta locator must be equal to last stream position"
345
- tables = ds.download_delta(
315
+ tables = metastore.download_delta(
346
316
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
347
317
  )
348
318
  actual_rebase_compacted_table = pa.concat_tables(tables)
@@ -351,7 +321,7 @@ def test_compact_partition_rebase_same_source_and_destination(
351
321
  if primary_keys:
352
322
  sorting_cols.extend([(val, "ascending") for val in primary_keys])
353
323
  if sort_keys:
354
- sorting_cols.extend(sort_keys)
324
+ sorting_cols.extend([pa_key for key in sort_keys for pa_key in key.arrow])
355
325
 
356
326
  rebase_expected_compact_partition_result = (
357
327
  rebase_expected_compact_partition_result.combine_chunks().sort_by(