deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,266 @@
1
+ import ray
2
+ from deltacat.types.media import ContentType
3
+ import pyarrow as pa
4
+
5
+ import pytest
6
+ import tempfile
7
+ from deltacat.storage import metastore
8
+ from deltacat.tests.test_utils.pyarrow import (
9
+ stage_partition_from_file_paths,
10
+ commit_delta_to_staged_partition,
11
+ create_table_from_csv_file_paths,
12
+ )
13
+ from deltacat.storage.model.schema import Schema
14
+ from deltacat.utils.pyarrow import (
15
+ ReadKwargsProviderPyArrowCsvPureUtf8,
16
+ ReadKwargsProviderPyArrowSchemaOverride,
17
+ )
18
+
19
+
20
class TestContentTypeParamsMain:
    """Exercise ``_download_parquet_metadata_for_manifest_entry`` against main storage.

    Each test commits a CSV fixture as a delta into a throw-away catalog rooted
    in a temporary directory, runs the Ray remote function for manifest entry 0
    and inspects the returned dictionary.
    """

    TEST_NAMESPACE = "test_content_type_params_main"
    TEST_ENTRY_INDEX = 0
    DEDUPE_BASE_COMPACTED_TABLE_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_base_compacted_table_string_pk.csv"
    DEDUPE_NO_DUPLICATION_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_table_no_duplication_string_pk.csv"

    @pytest.fixture(scope="module", autouse=True)
    def setup_ray_cluster(self):
        """Start Ray in local mode before the tests and shut it down afterwards."""
        ray.init(local_mode=True, ignore_reinit_error=True)
        yield
        ray.shutdown()

    @pytest.fixture(scope="function")
    def main_deltacat_storage_kwargs(self):
        """Yield storage kwargs whose catalog is rooted in a fresh temp directory.

        The directory is removed (ignoring errors) once the test finishes.
        """
        scratch_root = tempfile.mkdtemp()
        from deltacat.catalog import CatalogProperties

        # Tests may add keys to this dict, so hand out the dict object itself.
        kwargs = {"catalog": CatalogProperties(root=scratch_root)}
        yield kwargs

        import shutil

        shutil.rmtree(scratch_root, ignore_errors=True)

    def _commit_csv_as_delta(self, csv_path, storage_kwargs):
        """Load ``csv_path``, stage a partition for it and commit it as a delta.

        Returns the delta produced by ``commit_delta_to_staged_partition``.
        """
        source_table = create_table_from_csv_file_paths([csv_path])
        table_schema = Schema.of(source_table.schema)
        staged_partition = stage_partition_from_file_paths(
            self.TEST_NAMESPACE,
            [csv_path],
            table_schema,
            **storage_kwargs,
        )
        return commit_delta_to_staged_partition(
            staged_partition,
            source_table,
            **storage_kwargs,
        )

    def test__download_parquet_metadata_for_manifest_entry_sanity(
        self, main_deltacat_storage_kwargs
    ):
        """Metadata for the base compacted fixture reports 8 rows in one row group."""
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )
        from deltacat.types.partial_download import PartialParquetParameters

        delta = self._commit_csv_as_delta(
            self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        result = ray.get(
            _download_parquet_metadata_for_manifest_entry.remote(
                delta,
                entry_index,
                ["pk", "value"],
                metastore,
                main_deltacat_storage_kwargs,
            )
        )
        params = result["partial_parquet_params"]

        # Shape of the returned payload.
        assert isinstance(result, dict)
        assert "entry_index" in result
        assert "partial_parquet_params" in result
        assert result["entry_index"] == entry_index
        assert isinstance(params, PartialParquetParameters)

        # Partial-download parameters derived from the file.
        assert params.row_groups_to_download == [0]
        assert params.num_row_groups == 1
        assert params.num_rows == 8
        assert isinstance(params.in_memory_size_bytes, float)
        assert params.in_memory_size_bytes > 0

        # Underlying Parquet file metadata.
        file_metadata = params.pq_metadata
        assert file_metadata.num_columns == 2
        assert file_metadata.num_rows == 8
        assert file_metadata.num_row_groups == 1
        assert file_metadata.format_version == "2.6"

        # The committed manifest entry is recorded as Parquet content.
        committed_entry = delta.manifest.entries[self.TEST_ENTRY_INDEX]
        assert committed_entry.meta.content_type == ContentType.PARQUET.value

    @pytest.mark.parametrize(
        "read_kwargs_provider,expected_values",
        [
            (
                ReadKwargsProviderPyArrowCsvPureUtf8(),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": [pa.string(), pa.string()],
                },
            ),
            (
                ReadKwargsProviderPyArrowSchemaOverride(
                    schema=pa.schema(
                        [
                            ("id", pa.string()),
                            ("value", pa.int64()),
                        ]
                    )
                ),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": [pa.string(), pa.int64()],
                },
            ),
            (
                ReadKwargsProviderPyArrowSchemaOverride(
                    schema=None,
                    pq_coerce_int96_timestamp_unit="ms",
                    parquet_reader_type="daft",
                ),
                {
                    "num_rows": 6,
                    "num_columns": 2,
                    "num_row_groups": 1,
                    "format_version": "2.6",
                    "column_types": None,  # Will use default type inference
                },
            ),
        ],
    )
    def test__download_parquet_metadata_for_manifest_entry_with_read_kwargs_provider(
        self, read_kwargs_provider, expected_values, main_deltacat_storage_kwargs
    ):
        """Metadata matches ``expected_values`` for each read-kwargs provider.

        NOTE: ``expected_values["column_types"]`` is supplied by the parameter
        sets but is not asserted by this test.
        """
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )

        delta = self._commit_csv_as_delta(
            self.DEDUPE_NO_DUPLICATION_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        result = ray.get(
            _download_parquet_metadata_for_manifest_entry.remote(
                delta,
                entry_index,
                ["pk", "value"],
                metastore,
                main_deltacat_storage_kwargs,
                read_kwargs_provider,
            )
        )
        params = result["partial_parquet_params"]

        # Shape of the returned payload.
        assert isinstance(result, dict)
        assert "entry_index" in result
        assert "partial_parquet_params" in result
        assert result["entry_index"] == self.TEST_ENTRY_INDEX

        # Partial-download parameters derived from the file.
        assert params.row_groups_to_download == [0]
        assert params.num_row_groups == expected_values["num_row_groups"]
        assert params.num_rows == expected_values["num_rows"]
        assert isinstance(params.in_memory_size_bytes, float)
        assert params.in_memory_size_bytes > 0

        # Underlying Parquet file metadata.
        file_metadata = params.pq_metadata
        assert file_metadata.num_columns == expected_values["num_columns"]
        assert file_metadata.num_rows == expected_values["num_rows"]
        assert file_metadata.num_row_groups == expected_values["num_row_groups"]
        assert file_metadata.format_version == expected_values["format_version"]

        # The committed manifest entry is recorded as Parquet content.
        committed_entry = delta.manifest.entries[self.TEST_ENTRY_INDEX]
        assert committed_entry.meta.content_type == ContentType.PARQUET.value

    def test_download_parquet_metadata_for_manifest_entry_file_reader_kwargs_present_top_level_and_deltacat_storage_kwarg(
        self, main_deltacat_storage_kwargs, caplog
    ):
        """Metadata is still returned when the provider is passed twice.

        The provider is given both as the explicit argument and under the
        ``file_reader_kwargs_provider`` key of the storage kwargs.
        """
        from deltacat.compute.compactor_v2.utils.content_type_params import (
            _download_parquet_metadata_for_manifest_entry,
        )

        explicit_provider = ReadKwargsProviderPyArrowCsvPureUtf8()

        # Inject a second provider through the storage kwargs before staging.
        main_deltacat_storage_kwargs[
            "file_reader_kwargs_provider"
        ] = ReadKwargsProviderPyArrowCsvPureUtf8()

        delta = self._commit_csv_as_delta(
            self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK,
            main_deltacat_storage_kwargs,
        )
        entry_index = 0
        result = ray.get(
            _download_parquet_metadata_for_manifest_entry.remote(
                delta,
                entry_index,
                ["pk", "value"],
                metastore,
                main_deltacat_storage_kwargs,
                explicit_provider,
            )
        )

        # Shape of the returned payload.
        assert isinstance(result, dict)
        assert "entry_index" in result
        assert "partial_parquet_params" in result
        assert result["entry_index"] == entry_index

        # A warning about the duplicated provider may or may not be captured
        # here (the call runs as a Ray remote task), so it is only checked
        # when at least one record was captured. Successful metadata retrieval
        # above is the primary check.
        captured = caplog.records
        print(f"Captured {len(captured)} log records")
        if captured:
            assert any(
                "file_reader_kwargs_provider" in entry.message for entry in captured
            )
@@ -0,0 +1,45 @@
1
+ import pyarrow as pa
2
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
3
+ group_by_pk_hash_bucket,
4
+ )
5
+
6
+
7
class TestGroupByPkHashBucket:
    """Tests for ``group_by_pk_hash_bucket`` with three hash buckets keyed on ``pk``."""

    def test_sanity(self):
        """Every input row ends up in exactly one of the three buckets."""
        values = pa.array(list(range(10)))
        keys = pa.array(list("abcdefghij"))
        batch = pa.RecordBatch.from_arrays([values, keys], names=["record", "pk"])
        source = pa.Table.from_batches([batch])

        buckets = group_by_pk_hash_bucket(source, 3, ["pk"])

        assert len(buckets) == 3
        # Buckets that received no rows are None and contribute nothing.
        row_total = sum(len(bucket[1]) for bucket in buckets if bucket is not None)

        assert row_total == len(source)

    def test_when_record_batches_exceed_int_max_size(self):
        """Two ~1.08 GB batches stay separate in the bucket that holds them."""
        # 12 characters * 90_000_000 repetitions per value.
        huge_value = pa.array(["12bytestring" * 90_000_000])
        batch = pa.RecordBatch.from_arrays([huge_value], names=["pk"])
        source = pa.Table.from_batches([batch, batch])

        buckets = group_by_pk_hash_bucket(source, 3, ["pk"])

        assert len(buckets) == 3
        # Merging the two batches would exceed 2GB, so both must be kept
        # as distinct record batches.
        populated = buckets[2]
        assert len(populated.to_batches()) == 2

    def test_when_record_batches_less_than_int_max_size(self):
        """Two small batches are combined into a single record batch."""
        small_value = pa.array(["12bytestring" * 90_000])
        batch = pa.RecordBatch.from_arrays([small_value], names=["pk"])
        source = pa.Table.from_batches([batch, batch])

        buckets = group_by_pk_hash_bucket(source, 3, ["pk"])

        assert len(buckets) == 3
        # The combined size stays below 2GB, so the batches are expected
        # to be merged into one.
        populated = buckets[1]
        assert len(populated.to_batches()) == 1
@@ -1,6 +1,36 @@
1
1
  import unittest
2
2
  import ray
3
- from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
3
+ from deltacat.compute.compactor_v2.utils.task_options import (
4
+ _get_task_options,
5
+ _get_merge_task_options,
6
+ logger,
7
+ )
8
+ from deltacat.compute.resource_estimation.model import (
9
+ EstimateResourcesParams,
10
+ ResourceEstimationMethod,
11
+ )
12
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
13
+ from deltacat.compute.compactor import (
14
+ PyArrowWriteResult,
15
+ RoundCompletionInfo,
16
+ )
17
+ from deltacat.types.media import (
18
+ ContentType,
19
+ ContentEncoding,
20
+ )
21
+ from deltacat.storage import (
22
+ DeltaLocator,
23
+ Manifest,
24
+ ManifestMeta,
25
+ ManifestEntry,
26
+ ManifestEntryList,
27
+ )
28
+ from unittest.mock import MagicMock
29
+ from typing import Optional
30
+
31
+ from deltacat.compute.compactor_v2.constants import (
32
+ AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
33
+ )
4
34
 
5
35
 
6
36
  @ray.remote
@@ -14,11 +44,93 @@ def throwing_func():
14
44
 
15
45
 
16
46
  class TestTaskOptions(unittest.TestCase):
47
+ TEST_INDEX = 0
48
+ TEST_HB_GROUP_IDX = 0
49
+ TEST_STREAM_POSITION = 1_000_000
50
+ TEST_NUM_HASH_GROUPS = 1
51
+
17
52
  @classmethod
18
53
  def setUpClass(cls):
19
54
  ray.init(local_mode=True, ignore_reinit_error=True)
20
55
  super().setUpClass()
21
56
 
57
@classmethod
def tearDownClass(cls) -> None:
    """Shut Ray down after all tests of this class have run."""
    ray.shutdown()
    # Chain to the base class hook, mirroring setUpClass, which calls
    # super().setUpClass().
    super().tearDownClass()
60
+
61
def _make_estimate_resource_params(
    self,
    resource_estimation_method: Optional[
        ResourceEstimationMethod
    ] = ResourceEstimationMethod.DEFAULT,
    previous_inflation: Optional[int] = 7,
    average_record_size_bytes: Optional[int] = 1000,
) -> EstimateResourcesParams:
    """
    Build an ``EstimateResourcesParams`` instance for use in tests.

    This is a plain instance method (there is no ``@classmethod`` decorator
    and it is invoked through ``self``), so the receiver is named ``self``
    like the other helpers of this class.

    Args:
        resource_estimation_method: Estimation method forwarded to
            ``EstimateResourcesParams.of``.
        previous_inflation: Inflation value forwarded unchanged.
        average_record_size_bytes: Average record size forwarded unchanged.

    Returns:
        The object produced by ``EstimateResourcesParams.of``.
    """
    return EstimateResourcesParams.of(
        resource_estimation_method=resource_estimation_method,
        previous_inflation=previous_inflation,
        average_record_size_bytes=average_record_size_bytes,
    )
74
+
75
def _make_manifest(
    self,
    source_content_length: Optional[int] = 1000,
    content_type: Optional[ContentType] = ContentType.PARQUET,
    content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
    uri: Optional[str] = "test",
    url: Optional[str] = "test",
    author: Optional[str] = "foo",
    entry_uuid: Optional[str] = "foo",
    manifest_uuid: Optional[str] = "bar",
) -> Manifest:
    """
    Build a ``Manifest`` holding exactly one mandatory entry.

    Args:
        source_content_length: Forwarded to ``ManifestMeta.of``.
        content_type: Content type recorded in the entry's meta.
        content_encoding: Content encoding recorded in the entry's meta.
        uri: URI of the single entry.
        url: URL of the single entry.
        author: Author passed to ``Manifest.of``.
        entry_uuid: UUID of the single entry.
        manifest_uuid: UUID of the manifest itself.

    Returns:
        The manifest produced by ``Manifest.of``.
    """
    # The two leading positional arguments are both fixed at 10
    # (presumably record count and content length -- TODO confirm against
    # ManifestMeta.of).
    entry_meta = ManifestMeta.of(
        10,
        10,
        content_type=content_type,
        content_encoding=content_encoding,
        source_content_length=source_content_length,
    )
    single_entry = ManifestEntry.of(
        uri=uri,
        url=url,
        meta=entry_meta,
        mandatory=True,
        uuid=entry_uuid,
    )
    entries = ManifestEntryList.of([single_entry])
    return Manifest.of(entries=entries, author=author, uuid=manifest_uuid)
105
+
106
def make_round_completion_info(
    self,
    high_watermark: Optional[int] = 1_000_000,
    compacted_delta_locator: Optional[DeltaLocator] = None,
    records_written: Optional[int] = 10,
    bytes_written: Optional[int] = 10,
    files_written: Optional[int] = 10,
    rows_dropped: Optional[int] = 10,
    sort_keys_bit_width: Optional[int] = 0,
    hash_bucket_count: Optional[int] = 1,
    hb_index_to_entry_range: Optional[dict] = None,
) -> RoundCompletionInfo:
    """
    Build a ``RoundCompletionInfo`` for use in tests.

    Args:
        high_watermark: High watermark forwarded to ``RoundCompletionInfo.of``.
        compacted_delta_locator: Locator to use; when ``None`` a
            ``MagicMock(spec=DeltaLocator)`` is substituted.
        records_written: First argument of ``PyArrowWriteResult.of``.
        bytes_written: Second argument of ``PyArrowWriteResult.of``.
        files_written: Third argument of ``PyArrowWriteResult.of``.
        rows_dropped: Fourth argument of ``PyArrowWriteResult.of``.
        sort_keys_bit_width: Forwarded unchanged.
        hash_bucket_count: Forwarded unchanged.
        hb_index_to_entry_range: Mapping forwarded unchanged; when ``None``
            it defaults to ``{"0": (0, 1)}``.

    Returns:
        The object produced by ``RoundCompletionInfo.of``.
    """
    if compacted_delta_locator is None:
        compacted_delta_locator = MagicMock(spec=DeltaLocator)

    # Only substitute the default when the argument was omitted. A plain
    # truthiness check would also silently replace an explicitly passed
    # empty dict.
    if hb_index_to_entry_range is None:
        hb_index_to_entry_range = {"0": (0, 1)}

    write_result = PyArrowWriteResult.of(
        records_written, bytes_written, files_written, rows_dropped
    )
    return RoundCompletionInfo.of(
        compacted_delta_locator=compacted_delta_locator,
        high_watermark=high_watermark,
        compacted_pyarrow_write_result=write_result,
        sort_keys_bit_width=sort_keys_bit_width,
        hb_index_to_entry_range=hb_index_to_entry_range,
        hash_bucket_count=hash_bucket_count,
    )
133
+
22
134
  def test_get_task_options_sanity(self):
23
135
  opts = _get_task_options(0.01, 0.01)
24
136
  result_ref = valid_func.options(**opts).remote()
@@ -31,3 +143,160 @@ class TestTaskOptions(unittest.TestCase):
31
143
  result_ref = throwing_func.options(**opts).remote()
32
144
 
33
145
  self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
146
+
147
def test_get_merge_task_options_memory_logs_enabled_sanity(self):
    """
    Merge task options with memory logging and a populated previous round.

    Verifies that the expected option keys are present and that the first
    two captured log records carry the previous-round message and the
    memory params with an inflation and average record size of 1.0.
    """
    test_index = 0
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=self._make_estimate_resource_params(),
        round_completion_info=self.make_round_completion_info(),
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    expected_previous_inflation = 1.0
    expected_average_record_size = 1.0
    with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
        # At least one log of level DEBUG must be emitted
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # NOTE(review): this only proves that every expected key exists in
        # the result (a missing key raises KeyError); the values in
        # expected_task_opts are never compared against the actual ones.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        first_record, second_record = cm.records[0], cm.records[1]
        log_message_round_completion_info = first_record.getMessage()
        log_message_debug_memory_params = second_record.getMessage()
        self.assertIn(
            f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
            log_message_round_completion_info,
        )
        expected_fragments = (
            f"[Merge task {test_index}]: Params used for calculating merge memory",
            f"'previous_inflation': {expected_previous_inflation}",
            f"'average_record_size': {expected_average_record_size}",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, log_message_debug_memory_params)
200
+
201
def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
    self,
):
    """
    Merge task options when the previous round wrote nothing.

    The round completion info is built with zero bytes, records, files and
    dropped rows; the logged memory params are then expected to show
    ``PYARROW_INFLATION_MULTIPLIER`` and the default average record size.
    """
    test_index = 0
    empty_round = self.make_round_completion_info(
        bytes_written=0, records_written=0, files_written=0, rows_dropped=0
    )
    # The estimate params are created first and the manifest last, as the
    # keyword order below would otherwise imply a different call order.
    estimate_params = self._make_estimate_resource_params()
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=estimate_params,
        round_completion_info=empty_round,
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
    expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
    with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
        # At least one log of level DEBUG must be emitted
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # NOTE(review): this only proves that every expected key exists in
        # the result (a missing key raises KeyError); the values in
        # expected_task_opts are never compared against the actual ones.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        first_record, second_record = cm.records[0], cm.records[1]
        log_message_round_completion_info = first_record.getMessage()
        log_message_debug_memory_params = second_record.getMessage()
        self.assertIn(
            f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
            log_message_round_completion_info,
        )
        expected_fragments = (
            f"[Merge task {test_index}]: Params used for calculating merge memory",
            f"'previous_inflation': {expected_previous_inflation}",
            f"'average_record_size': {expected_average_record_size}",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, log_message_debug_memory_params)
258
+
259
def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
    self,
):
    """
    Merge task options when no round completion info is supplied.

    With ``round_completion_info=None`` the first captured log record is
    expected to be the memory params message, and it must not mention
    ``'average_record_size'``.
    """
    test_index = 0
    merge_kwargs = dict(
        index=test_index,
        hb_group_idx=0,
        data_size=1,
        pk_size_bytes=1,
        num_rows=1,
        num_hash_groups=1,
        total_memory_buffer_percentage=1,
        incremental_index_array_size=1,
        debug_memory_params={"merge_task_index": test_index},
        ray_custom_resources={},
        estimate_resources_params=self._make_estimate_resource_params(),
        round_completion_info=None,
        compacted_delta_manifest=self._make_manifest(),
        memory_logs_enabled=True,
    )
    expected_task_opts = {
        "max_retries": 3,
        "memory": 1680.64,
        "num_cpus": 0.01,
        "scheduling_strategy": "SPREAD",
    }
    with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
        # At least one log of level DEBUG must be emitted
        actual_merge_tasks_opts = _get_merge_task_options(**merge_kwargs)
        # NOTE(review): this only proves that every expected key exists in
        # the result (a missing key raises KeyError); the values in
        # expected_task_opts are never compared against the actual ones.
        selected_opts = {
            key: actual_merge_tasks_opts[key] for key in expected_task_opts
        }
        assert selected_opts
        first_record = cm.records[0]
        log_message_debug_memory_params = first_record.getMessage()
        self.assertIn(
            f"[Merge task {test_index}]: Params used for calculating merge memory",
            log_message_debug_memory_params,
        )
        self.assertNotIn(
            "'average_record_size'",
            log_message_debug_memory_params,
        )
@@ -1,9 +1,8 @@
1
- import os
2
1
  import tempfile
3
2
  import shutil
4
- from typing import Dict
5
3
 
6
4
  import pytest
5
+ from deltacat.catalog.model.properties import CatalogProperties
7
6
 
8
7
 
9
8
  @pytest.fixture
@@ -25,51 +24,16 @@ def temp_dir():
25
24
 
26
25
 
27
26
  @pytest.fixture(scope="function")
28
- def local_deltacat_storage_kwargs(temp_dir):
27
+ def main_deltacat_storage_kwargs(temp_dir):
29
28
  """
30
- Fixture that creates a temporary database file for each test function
31
- and returns storage kwargs dictionary.
29
+ Fixture that creates a CatalogProperties object for each test function
30
+ using the main metastore implementation and cleans up afterwards.
32
31
 
33
32
  Returns:
34
- dict: A dictionary with db_file_path key pointing to a temporary database file
33
+ dict: A dictionary with 'inner' key pointing to CatalogProperties
35
34
  """
36
- # Create a unique database file in the temporary directory
37
- db_file_path = os.path.join(temp_dir, "db_test.sqlite")
38
-
39
- # Return kwargs dictionary ready to use
40
- kwargs = {"db_file_path": db_file_path}
35
+ catalog = CatalogProperties(root=temp_dir)
36
+ kwargs = {"inner": catalog}
41
37
  yield kwargs
42
38
 
43
- # Cleanup: remove the database file if it exists
44
- if os.path.exists(db_file_path):
45
- os.remove(db_file_path)
46
-
47
-
48
- def create_local_deltacat_storage_file() -> Dict[str, str]:
49
- """
50
- Helper function to create a local deltacat storage file
51
-
52
- Essentially uses the same approach as local_deltacat_storage_kwargs, but more flexible
53
- if the consumer does not want to use a function scoped fixture
54
-
55
- Returns: kwargs to use for local deltacat storage, i.e. {"db_file_path": $db_file}
56
- """
57
- temp_dir = tempfile.mkdtemp()
58
- db_file_path = os.path.join(temp_dir, "db_test.sqlite")
59
- return {"db_file_path": db_file_path}
60
-
61
-
62
- def clean_up_local_deltacat_storage_file(local_storage_kwargs: Dict[str, str]):
63
- """
64
- Cleans up local file and directory created by create_local_deltacat_storage_file
65
- """
66
- db_file = local_storage_kwargs["db_file_path"]
67
- dir_path = os.path.dirname(db_file)
68
-
69
- # Remove the database file if it exists
70
- if os.path.exists(db_file):
71
- os.remove(db_file)
72
-
73
- # Remove the temporary directory if it exists
74
- if os.path.exists(dir_path):
75
- shutil.rmtree(dir_path)
39
+ # Cleanup happens automatically via temp_dir fixture