deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,148 @@
1
+ from tenacity import (
2
+ Retrying,
3
+ retry_if_exception_type,
4
+ stop_after_delay,
5
+ wait_random_exponential,
6
+ )
7
+ from typing import Union, Optional, Dict, Any, List, Callable
8
+ from deltacat.types.tables import (
9
+ CapturedBlockWritePaths,
10
+ UuidBlockWritePathProvider,
11
+ )
12
+ from deltacat.types.tables import (
13
+ get_table_writer,
14
+ get_table_length,
15
+ TABLE_CLASS_TO_SLICER_FUNC,
16
+ )
17
+ from deltacat.exceptions import RetryableError
18
+ from deltacat.storage import (
19
+ DistributedDataset,
20
+ LocalTable,
21
+ )
22
+ from deltacat.types.media import (
23
+ ContentEncoding,
24
+ ContentType,
25
+ )
26
+ from deltacat.constants import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
27
+ import s3fs
28
+ import boto3
29
+ from boto3.session import Session
30
+ from botocore.credentials import Credentials
31
+
32
+
33
def get_credential() -> Credentials:
    """Return the credentials resolved by a newly created boto3 session."""
    return boto3.Session().get_credentials()
37
+
38
+
39
def get_s3_file_system(content_type):
    """
    Build an s3fs filesystem authenticated with the current boto3 credentials.

    The filesystem is configured so that S3 calls carry "aws:kms" server-side
    encryption, the given content type, and the identity content encoding as
    additional keyword arguments.

    Args:
        content_type: Object whose ``value`` attribute is sent as the
            ``ContentType`` additional S3 argument.

    Returns:
        The configured ``s3fs.S3FileSystem`` instance.
    """
    import s3fs  # noqa: F401

    creds = get_credential()
    extra_s3_kwargs = {
        "ServerSideEncryption": "aws:kms",
        # TODO: Get tagging from table properties
        "ContentType": content_type.value,
        "ContentEncoding": ContentEncoding.IDENTITY.value,
    }
    return s3fs.S3FileSystem(
        key=creds.access_key,
        secret=creds.secret_key,
        token=creds.token,
        s3_additional_kwargs=extra_s3_kwargs,
    )
57
+
58
+
59
def upload_table_with_retry(
    table: Union[LocalTable, DistributedDataset],
    s3_url_prefix: str,
    s3_table_writer_kwargs: Optional[Dict[str, Any]],
    content_type: ContentType = ContentType.PARQUET,
    max_records_per_file: Optional[int] = 4000000,
    filesystem: Optional[s3fs.S3FileSystem] = None,
    **s3_client_kwargs: Any,
) -> List[str]:
    """
    Write a table to one or more S3 files and return the written S3 URLs.

    The table is uploaded whole when ``max_records_per_file`` is None or the
    table length is falsy; otherwise it is split with the slicer registered
    for the table's class and each slice is uploaded separately. Every upload
    is retried on ``RetryableError`` with randomized exponential backoff until
    ``UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY`` is exceeded.

    Args:
        table: Table or dataset to upload.
        s3_url_prefix: Base path handed to the writer and the path provider.
        s3_table_writer_kwargs: Extra keyword arguments for the table writer;
            None is treated as an empty dict.
        content_type: Content type of the written files.
        max_records_per_file: Maximum records per slice, or None to disable
            slicing.
        filesystem: Filesystem to write through; when falsy, one is created
            with ``get_s3_file_system``.
        **s3_client_kwargs: Forwarded unchanged to ``upload_table``.

    Returns:
        One "s3://"-prefixed URL per captured write path.
    """
    retry_policy = Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
        retry=retry_if_exception_type(RetryableError),
    )

    writer_kwargs = {} if s3_table_writer_kwargs is None else s3_table_writer_kwargs
    if not filesystem:
        filesystem = get_s3_file_system(content_type=content_type)

    capture_object = CapturedBlockWritePaths()
    block_write_path_provider = UuidBlockWritePathProvider(
        capture_object=capture_object, base_path=s3_url_prefix
    )
    s3_table_writer_func = get_table_writer(table)
    record_count = get_table_length(table)

    # Decide what gets uploaded: the table as a single unit, or its slices.
    if max_records_per_file is None or not record_count:
        pending_uploads = [table]
    else:
        slicer = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
        pending_uploads = slicer(table, max_records_per_file)

    for chunk in pending_uploads:
        retry_policy(
            fn=upload_table,
            table_slices=chunk,
            s3_base_url=f"{s3_url_prefix}",
            s3_file_system=filesystem,
            s3_table_writer_func=s3_table_writer_func,
            s3_table_writer_kwargs=writer_kwargs,
            block_write_path_provider=block_write_path_provider,
            content_type=content_type,
            **s3_client_kwargs,
        )

    # The provider is released before the captured paths are read.
    # NOTE(review): presumably the provider hands its paths to capture_object
    # when it is finalized, so no other reference to it may be kept - confirm.
    del block_write_path_provider
    return [construct_s3_url(written) for written in capture_object.write_paths()]
123
+
124
+
125
def construct_s3_url(path: Optional[str]) -> Optional[str]:
    """Prefix a non-empty path with "s3://"; return None for None or ""."""
    return f"s3://{path}" if path else None
129
+
130
+
131
def upload_table(
    table_slices: Union[LocalTable, DistributedDataset],
    s3_base_url: str,
    s3_file_system: s3fs.S3FileSystem,
    s3_table_writer_func: Callable,
    block_write_path_provider: UuidBlockWritePathProvider,
    content_type: ContentType,
    s3_table_writer_kwargs: Dict[str, Any],
    **s3_client_kwargs: Any,
) -> None:
    """
    Write one table (or table slice) by delegating to the given writer.

    Args:
        table_slices: Table or slice handed to the writer.
        s3_base_url: Base URL handed to the writer.
        s3_file_system: Filesystem handed to the writer.
        s3_table_writer_func: Callable invoked as ``func(table, base_url,
            filesystem, path_provider, content_type_value, **writer_kwargs)``.
        block_write_path_provider: Path provider handed to the writer.
        content_type: Content type whose ``value`` is handed to the writer.
        s3_table_writer_kwargs: Extra keyword arguments for the writer.
        **s3_client_kwargs: Accepted so that callers which forward S3 client
            keyword arguments (as ``upload_table_with_retry`` does) do not
            fail with ``TypeError``; currently unused by this function.
    """
    s3_table_writer_func(
        table_slices,
        s3_base_url,
        s3_file_system,
        block_write_path_provider,
        content_type.value,
        **s3_table_writer_kwargs,
    )
148
+ # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
@@ -0,0 +1,205 @@
1
+ import time
2
+ import os
3
+ import posixpath
4
+ import pyarrow.fs
5
+ from pyarrow.fs import FileSelector, FileType
6
+ from itertools import chain
7
+ from deltacat.storage.model.transaction import Transaction
8
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
+ from deltacat.constants import (
10
+ TXN_DIR_NAME,
11
+ RUNNING_TXN_DIR_NAME,
12
+ FAILED_TXN_DIR_NAME,
13
+ TXN_PART_SEPARATOR,
14
+ )
15
+ from deltacat.storage.model.types import TransactionState
16
+ import logging
17
+ from deltacat import logs
18
+
19
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
+
21
+
22
def brute_force_search_matching_metafiles(
    dirty_files_names, filesystem: pyarrow.fs.FileSystem, catalog_root
):
    """
    Delete files under the catalog root that mention a dirty transaction id.

    A transaction id is taken from the second ``TXN_PART_SEPARATOR``-delimited
    part of each dirty file name. The catalog tree is then walked (skipping any
    directory named ``TXN_DIR_NAME``) and every file whose base name contains
    one of those ids is deleted. Listing and deletion errors are logged and do
    not stop the walk.

    Args:
        dirty_files_names: File names of the transaction logs being cleaned.
        filesystem: Filesystem used for listing, deleting, and moving.
        catalog_root: Root path the search starts from.
    """
    # Collect the transaction ids embedded in the dirty file names. Empty ids
    # are skipped: "" is a substring of every name and would delete every file.
    transaction_ids = set()
    for dirty_file in dirty_files_names:
        parts = dirty_file.split(TXN_PART_SEPARATOR)
        if len(parts) >= 2 and parts[1]:
            transaction_ids.add(parts[1])

    def recursive_search(path):
        try:
            entries = filesystem.get_file_info(FileSelector(path, recursive=False))
        except Exception as e:
            logger.error(f"Error listing directory '{path}': {e}")
            return

        for entry in entries:
            base_name = posixpath.basename(entry.path)
            if entry.type == FileType.File:
                # One delete attempt per file, even if several ids match it.
                if any(txn_id in base_name for txn_id in transaction_ids):
                    try:
                        filesystem.delete_file(entry.path)
                        logger.debug(f"Deleted file: {entry.path}")
                    except Exception as e:
                        logger.error(f"Error deleting file '{entry.path}': {e}")
            elif entry.type == FileType.Directory:
                # Transaction log directories are never searched.
                if base_name == TXN_DIR_NAME:
                    logger.debug(f"Skipping directory: {entry.path}")
                    continue
                recursive_search(entry.path)

    # Nothing can match without ids, so skip the full tree walk in that case.
    if transaction_ids:
        recursive_search(catalog_root)

    failed_txn_log_dir = posixpath.join(
        catalog_root, TXN_DIR_NAME, FAILED_TXN_DIR_NAME
    )
    for dirty_file in dirty_files_names:
        old_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        # NOTE(review): the destination is currently identical to the source,
        # so this move renames nothing. Renaming the log to mark successful
        # cleanup appears to be the intent but is not implemented - confirm
        # the target name, and that a self-move is safe on every filesystem.
        new_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        try:
            filesystem.move(old_log_path, new_log_path)
            logger.debug(f"Renamed file from {old_log_path} to {new_log_path}")
        except Exception as e:
            logger.error(f"Error renaming file '{old_log_path}': {e}")
78
+
79
+
80
def janitor_delete_timed_out_transaction(catalog_root: str) -> None:
    """
    Move expired running-transaction logs into the failed-transaction directory.

    Each entry in the running-transaction directory is treated as expired when
    the last ``TXN_PART_SEPARATOR``-delimited part of its file name, parsed as
    a number, is less than or equal to ``time.time_ns()``. Expired logs are
    copied to the failed directory and then removed from the running
    directory. The moved file names are finally passed to
    ``brute_force_search_matching_metafiles``.

    Errors while handling a single entry are logged and the entry is skipped.

    Args:
        catalog_root: Catalog root path; it is resolved to a normalized path
            and filesystem with ``resolve_path_and_filesystem``.
    """
    catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)

    dirty_files = []

    running_txn_info_list = filesystem.get_file_info(
        FileSelector(running_txn_log_dir, recursive=False)
    )

    for running_txn_info in running_txn_info_list:
        try:
            filename = posixpath.basename(running_txn_info.path)
            end_time = float(filename.split(TXN_PART_SEPARATOR)[-1])
            if end_time <= time.time_ns():
                src_path = running_txn_info.path
                dest_path = posixpath.join(failed_txn_log_dir, filename)

                # Make sure the destination directory exists before writing
                # into it, as janitor_remove_files_in_failed also does.
                filesystem.create_dir(failed_txn_log_dir, recursive=True)

                # Move the file using copy and delete
                with filesystem.open_input_file(src_path) as src_file:
                    contents = src_file.read()

                with filesystem.open_output_stream(dest_path) as dest_file:
                    dest_file.write(contents)
                filesystem.delete_file(src_path)

                dirty_files.append(filename)

        except Exception as e:
            logger.error(
                f"Error cleaning failed transaction '{running_txn_info.path}': {e}"
            )

    # Pass the normalized root so the brute force search starts from the
    # same location the transaction directories were resolved against.
    brute_force_search_matching_metafiles(
        dirty_files, filesystem, catalog_root_normalized
    )
127
+
128
+
129
def janitor_remove_files_in_failed(
    catalog_root: str, filesystem: pyarrow.fs.FileSystem = None
) -> None:
    """
    Cleans up metafiles and locator files associated with failed transactions.

    Every entry of the failed-transaction directory is read as a
    ``Transaction``. For transactions whose state is
    ``TransactionState.FAILED``, all metafile and locator write paths recorded
    in its operations are deleted (relative to the catalog root), the log of
    the same transaction id is removed from the running-transaction directory
    if present, and the failed log is moved to a file named after the
    transaction id. Transactions in any other state are left untouched.

    Errors while handling one transaction are logged and do not stop the
    processing of the remaining ones.

    Args:
        catalog_root: Catalog root path.
        filesystem: Optional filesystem to resolve the root against; when
            None, the filesystem is inferred from ``catalog_root``.
    """
    if filesystem is None:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)
    else:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
            catalog_root, filesystem
        )

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    filesystem.create_dir(failed_txn_log_dir, recursive=True)

    failed_txn_info_list = filesystem.get_file_info(
        FileSelector(failed_txn_log_dir, recursive=False)
    )

    for failed_txn_info in failed_txn_info_list:
        try:
            txn = Transaction.read(failed_txn_info.path, filesystem)
            failed_txn_basename = posixpath.basename(failed_txn_info.path)

            # Query the state once; only FAILED transactions are cleaned
            # (this also skips PURGED ones).
            if txn.state(catalog_root_normalized) != TransactionState.FAILED:
                continue

            txnid = txn.id

            operations = txn["operations"]
            known_write_paths = chain.from_iterable(
                (op["metafile_write_paths"] + op["locator_write_paths"])
                for op in operations
            )

            # Best effort: a path that cannot be deleted is logged and skipped.
            for write_path in known_write_paths:
                full_path = posixpath.join(catalog_root_normalized, write_path)
                try:
                    filesystem.delete_file(full_path)
                except Exception as e:
                    logger.error(f"Failed to delete file '{full_path}': {e}")

            new_filename = f"{txnid}"

            new_failed_txn_log_file_path = posixpath.join(
                failed_txn_log_dir, new_filename
            )
            running_txn_log_path = posixpath.join(running_txn_log_dir, new_filename)

            # Go through the resolved filesystem rather than the os module:
            # os.delete does not exist, and os calls only reach local paths.
            running_txn_log_info = filesystem.get_file_info(running_txn_log_path)
            if running_txn_log_info.type != FileType.NotFound:
                filesystem.delete_file(running_txn_log_path)

            # Skip the move when the log already carries its final name.
            if failed_txn_info.path != new_failed_txn_log_file_path:
                filesystem.move(failed_txn_info.path, new_failed_txn_log_file_path)
            logger.debug(f"Cleaned up failed transaction: {failed_txn_basename}")

        except Exception as e:
            logger.error(
                f"Could not read transaction '{failed_txn_info.path}', skipping: {e}"
            )
201
+
202
+
203
def janitor_job(catalog_root_dir: str) -> None:
    """
    Run both janitor passes, in order, against the given catalog root:
    first the timed-out running transactions, then the failed transactions.
    """
    cleanup_passes = (
        janitor_delete_timed_out_transaction,
        janitor_remove_files_in_failed,
    )
    for cleanup_pass in cleanup_passes:
        cleanup_pass(catalog_root_dir)
File without changes