deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, Dict, Any, List
3
+ from deltacat.compute.converter.constants import (
4
+ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
5
+ )
6
+ from deltacat.constants import DEFAULT_NAMESPACE
7
+ from fsspec import AbstractFileSystem
8
+ from pyiceberg.catalog import Catalog
9
+
10
+
11
class ConverterSessionParams(dict):
    """
    Parameters for a converter session
    (see deltacat/compute/converter/converter_session.py).

    This is a ``dict`` subclass: every property below reads from / writes to
    the dictionary key of the same name, so an instance can be passed around
    and serialized like a plain dictionary. Build instances with
    :meth:`of`, which validates the required keys and fills in defaults for
    the optional ones.
    """

    @staticmethod
    def of(params: Optional[Dict[str, Any]]) -> ConverterSessionParams:
        """
        Build a ConverterSessionParams from a raw dictionary.

        Args:
            params: Raw parameters. ``catalog``, ``iceberg_table_name`` and
                ``iceberg_warehouse_bucket_name`` are required; all other
                keys are optional and receive the defaults assigned below.
                ``None`` is treated as an empty dictionary (and therefore
                fails the required-key checks).

        Returns:
            A new ConverterSessionParams holding a copy of ``params`` with
            every optional key populated.

        Raises:
            AssertionError: If any required key is missing or ``None``.
        """
        params = {} if params is None else params
        assert params.get("catalog") is not None, "catalog is a required arg"
        assert (
            params.get("iceberg_table_name") is not None
        ), "iceberg_table_name is a required arg"
        assert (
            params.get("iceberg_warehouse_bucket_name") is not None
        ), "iceberg_warehouse_bucket_name is a required arg"
        result = ConverterSessionParams(params)

        result.iceberg_namespace = params.get("iceberg_namespace", DEFAULT_NAMESPACE)
        result.enforce_primary_key_uniqueness = params.get(
            "enforce_primary_key_uniqueness", False
        )
        result.compact_previous_position_delete_files = params.get(
            "compact_previous_position_delete_files", False
        )

        # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
        result.position_delete_for_multiple_data_files = params.get(
            "position_delete_for_multiple_data_files", True
        )
        result.task_max_parallelism = params.get(
            "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
        )
        result.merge_keys = params.get("merge_keys", None)
        result.s3_client_kwargs = params.get("s3_client_kwargs", {})
        result.filesystem = params.get("filesystem", None)
        # No property is defined for s3_prefix_override, so this assignment
        # creates a plain instance attribute (readable as
        # ``result.s3_prefix_override``) rather than writing a dict key.
        result.s3_prefix_override = params.get("s3_prefix_override", None)
        # Populate the key backing the location_provider_prefix_override
        # property; without this the getter raises KeyError on any instance
        # whose caller did not supply the key explicitly.
        result.location_provider_prefix_override = params.get(
            "location_provider_prefix_override", None
        )

        return result

    @property
    def catalog(self) -> Catalog:
        """Required ``catalog`` entry."""
        return self["catalog"]

    @property
    def iceberg_table_name(self) -> str:
        """Required ``iceberg_table_name`` entry."""
        return self["iceberg_table_name"]

    @property
    def iceberg_warehouse_bucket_name(self) -> str:
        """Required ``iceberg_warehouse_bucket_name`` entry."""
        return self["iceberg_warehouse_bucket_name"]

    @property
    def iceberg_namespace(self) -> str:
        """Namespace entry; ``of`` defaults it to DEFAULT_NAMESPACE."""
        return self["iceberg_namespace"]

    @iceberg_namespace.setter
    def iceberg_namespace(self, iceberg_namespace: str) -> None:
        self["iceberg_namespace"] = iceberg_namespace

    @property
    def enforce_primary_key_uniqueness(self) -> bool:
        """Flag entry; ``of`` defaults it to False."""
        return self["enforce_primary_key_uniqueness"]

    @enforce_primary_key_uniqueness.setter
    def enforce_primary_key_uniqueness(
        self, enforce_primary_key_uniqueness: bool
    ) -> None:
        self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness

    @property
    def compact_previous_position_delete_files(self) -> bool:
        """Flag entry; ``of`` defaults it to False."""
        return self["compact_previous_position_delete_files"]

    @compact_previous_position_delete_files.setter
    def compact_previous_position_delete_files(
        self, compact_previous_position_delete_files: bool
    ) -> None:
        self[
            "compact_previous_position_delete_files"
        ] = compact_previous_position_delete_files

    @property
    def position_delete_for_multiple_data_files(self) -> bool:
        """Flag entry; ``of`` defaults it to True."""
        return self["position_delete_for_multiple_data_files"]

    @position_delete_for_multiple_data_files.setter
    def position_delete_for_multiple_data_files(
        self, position_delete_for_multiple_data_files: bool
    ) -> None:
        self[
            "position_delete_for_multiple_data_files"
        ] = position_delete_for_multiple_data_files

    @property
    def task_max_parallelism(self) -> int:
        """Parallelism entry; ``of`` defaults it to DEFAULT_CONVERTER_TASK_MAX_PARALLELISM."""
        return self["task_max_parallelism"]

    @task_max_parallelism.setter
    def task_max_parallelism(self, task_max_parallelism: int) -> None:
        self["task_max_parallelism"] = task_max_parallelism

    @property
    def merge_keys(self) -> Optional[List[str]]:
        """Optional ``merge_keys`` entry; ``of`` defaults it to None."""
        return self["merge_keys"]

    @merge_keys.setter
    def merge_keys(self, merge_keys: Optional[List[str]]) -> None:
        self["merge_keys"] = merge_keys

    @property
    def s3_client_kwargs(self) -> Dict[str, Any]:
        """``s3_client_kwargs`` entry; ``of`` defaults it to an empty dict."""
        return self["s3_client_kwargs"]

    @s3_client_kwargs.setter
    def s3_client_kwargs(self, s3_client_kwargs: Dict[str, Any]) -> None:
        self["s3_client_kwargs"] = s3_client_kwargs

    @property
    def filesystem(self) -> Optional[AbstractFileSystem]:
        """Optional ``filesystem`` entry; ``of`` defaults it to None."""
        return self["filesystem"]

    @filesystem.setter
    def filesystem(self, filesystem: Optional[AbstractFileSystem]) -> None:
        self["filesystem"] = filesystem

    @property
    def location_provider_prefix_override(self) -> Optional[str]:
        """Optional ``location_provider_prefix_override`` entry; ``of`` defaults it to None."""
        return self["location_provider_prefix_override"]

    @location_provider_prefix_override.setter
    def location_provider_prefix_override(
        self, location_provider_prefix_override: Optional[str]
    ) -> None:
        self["location_provider_prefix_override"] = location_provider_prefix_override
@@ -0,0 +1,78 @@
1
+ from typing import Optional, Dict, Any
2
+ from pyiceberg.table import Table
3
+ from pyiceberg.catalog import Catalog, load_catalog as pyiceberg_load_catalog
4
+ from botocore.credentials import Credentials
5
+ import boto3
6
+ from boto3.session import Session
7
+
8
+
9
def load_catalog(
    iceberg_catalog_name: str, iceberg_catalog_properties: Dict[str, Any]
) -> Catalog:
    """
    Load a PyIceberg catalog.

    Args:
        iceberg_catalog_name: Passed to PyIceberg as the catalog ``name``.
        iceberg_catalog_properties: Expanded into keyword arguments for
            PyIceberg's ``load_catalog``.

    Returns:
        Whatever PyIceberg's ``load_catalog`` returns for these arguments.
    """
    return pyiceberg_load_catalog(
        name=iceberg_catalog_name, **iceberg_catalog_properties
    )
17
+
18
+
19
def get_s3_path(
    bucket_name: str,
    database_name: Optional[str] = None,
    table_name: Optional[str] = None,
) -> str:
    """
    Build an ``s3://`` path from a bucket and optional database/table names.

    Args:
        bucket_name: Bucket placed directly after the ``s3://`` scheme.
        database_name: When not None, appended as ``/<database_name>.db``.
        table_name: When not None, appended as ``/<table_name>``.

    Returns:
        The assembled path, e.g. ``s3://bucket/db.db/table``.
    """
    segments = [f"s3://{bucket_name}"]
    if database_name is not None:
        segments.append(f"{database_name}.db")
    if table_name is not None:
        segments.append(f"{table_name}")
    return "/".join(segments)
31
+
32
+
33
def get_bucket_name() -> str:
    """Return the fixed bucket name used by the helpers in this module."""
    bucket_name = "test-bucket"
    return bucket_name
35
+
36
+
37
def get_s3_prefix() -> str:
    """Return the bucket-level ``s3://`` path for the bucket from get_bucket_name()."""
    bucket_name = get_bucket_name()
    return get_s3_path(bucket_name)
39
+
40
+
41
def get_credential() -> Credentials:
    """
    Return the credentials resolved by a freshly created boto3 session.

    The value is whatever ``Session.get_credentials()`` yields for a default
    ``boto3.Session()``.
    """
    return boto3.Session().get_credentials()
45
+
46
+
47
def get_glue_catalog() -> Catalog:
    """
    Load a PyIceberg catalog of type ``glue`` using the current boto3 credentials.

    The warehouse location is the path returned by get_s3_prefix(), and both
    the AWS and S3 regions are fixed to ``us-east-1``.
    """
    # Credentials are refreshable, so reading the access key and secret key
    # one at a time could race with a refresh. Freezing them first yields a
    # matched set.
    frozen = get_credential().get_frozen_credentials()
    key_id = frozen.access_key
    secret = frozen.secret_key
    token = frozen.token

    catalog_properties = {
        "warehouse": get_s3_prefix(),
        "type": "glue",
        "aws_access_key_id": key_id,
        "aws_secret_access_key": secret,
        "aws_session_token": token,
        "region_name": "us-east-1",
        "s3.access-key-id": key_id,
        "s3.secret-access-key": secret,
        "s3.session-token": token,
        "s3.region": "us-east-1",
    }
    return pyiceberg_load_catalog("glue", **catalog_properties)
74
+
75
+
76
def load_table(catalog: Catalog, table_name: str) -> Table:
    """
    Load a table through the given catalog.

    Args:
        catalog: Object whose ``load_table`` method is invoked.
        table_name: Identifier forwarded unchanged to ``catalog.load_table``.

    Returns:
        The value returned by ``catalog.load_table(table_name)``.
    """
    return catalog.load_table(table_name)
@@ -0,0 +1,263 @@
1
+ from collections import defaultdict
2
+ import logging
3
+ from deltacat import logs
4
+ import pyarrow
5
+ import pyarrow.parquet as pq
6
+ from pyiceberg.io.pyarrow import (
7
+ parquet_path_to_id_mapping,
8
+ StatisticsCollector,
9
+ MetricModeTypes,
10
+ DataFileStatistics,
11
+ MetricsMode,
12
+ StatsAggregator,
13
+ )
14
+ from typing import Dict, List, Set, Any, Tuple
15
+ from deltacat.compute.converter.utils.iceberg_columns import (
16
+ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
17
+ ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
18
+ )
19
+ from pyiceberg.io.pyarrow import (
20
+ compute_statistics_plan,
21
+ )
22
+ from pyiceberg.manifest import (
23
+ DataFile,
24
+ DataFileContent,
25
+ FileFormat,
26
+ )
27
+ from pyiceberg.table import _min_sequence_number, _open_manifest, Table
28
+ from pyiceberg.utils.concurrent import ExecutorFactory
29
+ from itertools import chain
30
+ from pyiceberg.typedef import (
31
+ KeyDefaultDict,
32
+ )
33
+ from pyiceberg.schema import Schema
34
+ from pyiceberg.io import FileIO
35
+ from deltacat.compute.converter.model.convert_input_files import (
36
+ DataFileList,
37
+ )
38
+
39
+
40
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
41
+
42
+
43
def parquet_path_to_id_mapping_override(schema: Schema) -> Dict[str, int]:
    """Map Parquet column paths to field IDs, including position-delete columns.

    Starts from PyIceberg's ``parquet_path_to_id_mapping(schema)`` and adds
    entries for the ``file_path`` and ``pos`` columns using their reserved
    field IDs.

    Args:
        schema: Schema passed through to ``parquet_path_to_id_mapping``.

    Returns:
        The mapping produced by PyIceberg with the two reserved entries set.
    """
    path_to_field_id = parquet_path_to_id_mapping(schema)
    # Override here to insert position delete reserved column field IDs
    path_to_field_id.update(
        {
            "file_path": ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
            "pos": ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
        }
    )
    return path_to_field_id
49
+
50
+
51
def data_file_statistics_from_parquet_metadata(
    parquet_metadata: pq.FileMetaData,
    stats_columns: Dict[int, StatisticsCollector],
    parquet_column_mapping: Dict[str, int],
) -> DataFileStatistics:
    """
    Overrides original Pyiceberg function: Compute and return DataFileStatistics that includes the following.

    - record_count
    - column_sizes
    - value_counts
    - null_value_counts
    - nan_value_counts
    - column_aggregates
    - split_offsets

    Unlike the original, a column chunk without PyArrow statistics is only
    logged; it does not invalidate the column's aggregates.

    Args:
        parquet_metadata (pyarrow.parquet.FileMetaData): A pyarrow metadata object.
        stats_columns (Dict[int, StatisticsCollector]): The statistics gathering plan. It is required to
            set the mode for column metrics collection
        parquet_column_mapping (Dict[str, int]): The mapping of the parquet column path
            (``path_in_schema``) to the field ID

    Returns:
        DataFileStatistics: Statistics accumulated over every row group of the file.
    """
    column_sizes: Dict[int, int] = {}
    value_counts: Dict[int, int] = {}
    split_offsets: List[int] = []

    null_value_counts: Dict[int, int] = {}
    nan_value_counts: Dict[int, int] = {}

    col_aggs = {}

    invalidate_col: Set[int] = set()
    for r in range(parquet_metadata.num_row_groups):
        # References:
        # https://github.com/apache/iceberg/blob/fc381a81a1fdb8f51a0637ca27cd30673bd7aad3/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java#L232
        # https://github.com/apache/parquet-mr/blob/ac29db4611f86a07cc6877b416aa4b183e09b353/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java#L184

        row_group = parquet_metadata.row_group(r)

        # The split offset of a row group is taken from its first column
        # chunk: the dictionary page offset when a dictionary page exists and
        # precedes the data page, otherwise the data page offset.
        first_column = row_group.column(0)
        data_offset = first_column.data_page_offset
        dictionary_offset = first_column.dictionary_page_offset

        if first_column.has_dictionary_page and dictionary_offset < data_offset:
            split_offsets.append(dictionary_offset)
        else:
            split_offsets.append(data_offset)

        for pos in range(parquet_metadata.num_columns):
            column = row_group.column(pos)
            field_id = parquet_column_mapping[column.path_in_schema]
            if field_id in stats_columns:
                stats_col = stats_columns[field_id]

                # Sizes are recorded for every planned column, whatever its mode.
                column_sizes.setdefault(field_id, 0)
                column_sizes[field_id] += column.total_compressed_size

                if stats_col.mode == MetricsMode(MetricModeTypes.NONE):
                    continue

                value_counts[field_id] = (
                    value_counts.get(field_id, 0) + column.num_values
                )

                if column.is_stats_set:
                    try:
                        statistics = column.statistics

                        if statistics.has_null_count:
                            null_value_counts[field_id] = (
                                null_value_counts.get(field_id, 0)
                                + statistics.null_count
                            )

                        # COUNTS mode stops after the counts; no min/max bounds.
                        if stats_col.mode == MetricsMode(MetricModeTypes.COUNTS):
                            continue

                        if field_id not in col_aggs:
                            col_aggs[field_id] = StatsAggregator(
                                stats_col.iceberg_type,
                                statistics.physical_type,
                                stats_col.mode.length,
                            )

                        col_aggs[field_id].update_min(statistics.min)
                        col_aggs[field_id].update_max(statistics.max)

                    except pyarrow.lib.ArrowNotImplementedError as e:
                        invalidate_col.add(field_id)
                        logger.warning(e)
                else:
                    # Note: Removed original adding columns without stats to invalid column logic here
                    logger.warning(
                        "PyArrow statistics missing for column %d when writing file", pos
                    )

    split_offsets.sort()

    # An invalidated column may never have received an aggregate or a null
    # count (e.g. the Arrow error fired before either was recorded), so drop
    # the entries tolerantly instead of raising KeyError.
    for field_id in invalidate_col:
        col_aggs.pop(field_id, None)
        null_value_counts.pop(field_id, None)

    return DataFileStatistics(
        record_count=parquet_metadata.num_rows,
        column_sizes=column_sizes,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts=nan_value_counts,
        column_aggregates=col_aggs,
        split_offsets=split_offsets,
    )
161
+
162
+
163
def parquet_files_dict_to_iceberg_data_files(
    io: FileIO,
    table_metadata: Any,
    files_dict: Dict[Any, List[str]],
    file_content_type: DataFileContent,
) -> List[DataFile]:
    """Build Iceberg ``DataFile`` entries for already-written Parquet files.

    Each file's Parquet footer is read through ``io`` and turned into
    statistics that are attached to the resulting ``DataFile``.

    Args:
        io: File IO used to open each path in ``files_dict``.
        table_metadata: Table metadata providing ``schema()``, ``properties``
            and ``default_spec_id``.
        files_dict: Mapping from a partition value to the Parquet file paths
            that belong to that partition.
        file_content_type: Content type recorded on every produced ``DataFile``.

    Returns:
        One ``DataFile`` per input path, in iteration order of ``files_dict``.
    """
    iceberg_files = []
    schema = table_metadata.schema()

    # Both inputs depend only on the table schema and properties, so they are
    # built once here rather than once per file.
    stats_columns = compute_statistics_plan(schema, table_metadata.properties)
    parquet_column_mapping = parquet_path_to_id_mapping_override(schema)

    for partition_value, file_paths in files_dict.items():
        for file_path in file_paths:
            input_file = io.new_input(file_path)
            with input_file.open() as input_stream:
                parquet_metadata = pq.read_metadata(input_stream)

            # Removed _check_pyarrow_schema_compatible() here since reserved columns does not comply to all rules.

            statistics = data_file_statistics_from_parquet_metadata(
                parquet_metadata=parquet_metadata,
                stats_columns=stats_columns,
                parquet_column_mapping=parquet_column_mapping,
            )

            data_file = DataFile(
                content=file_content_type,
                file_path=file_path,
                file_format=FileFormat.PARQUET,
                partition=partition_value,
                file_size_in_bytes=len(input_file),
                sort_order_id=None,
                spec_id=table_metadata.default_spec_id,
                equality_ids=None,
                key_metadata=None,
                **statistics.to_serialized_dict(),
            )
            iceberg_files.append(data_file)
    return iceberg_files
201
+
202
+
203
def fetch_all_bucket_files(
    table: Table,
) -> Tuple[Dict[Any, DataFileList], Dict[Any, DataFileList], Dict[Any, DataFileList]]:
    """Group the files of the table's current scan snapshot by partition and content type.

    Args:
        table: Table whose ``scan()`` is used to locate the snapshot and its manifests.

    Returns:
        A tuple ``(data_entries, equality_data_entries, positional_delete_entries)``.
        Each element maps a data file's partition value to a list of
        ``(sequence_number, data_file)`` tuples. All three mappings are empty
        when the scan has no snapshot.
    """
    # {partition_value: [(sequence_number, DataFile), ...]}
    data_entries = defaultdict(list)
    equality_data_entries = defaultdict(list)
    positional_delete_entries = defaultdict(list)

    # step 1: filter manifests using partition summaries
    # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
    data_scan = table.scan()
    snapshot = data_scan.snapshot()
    if not snapshot:
        # Keep the documented 3-tuple shape so callers can always unpack the result.
        return data_entries, equality_data_entries, positional_delete_entries
    manifest_evaluators = KeyDefaultDict(data_scan._build_manifest_evaluator)

    manifests = [
        manifest_file
        for manifest_file in snapshot.manifests(data_scan.io)
        if manifest_evaluators[manifest_file.partition_spec_id](manifest_file)
    ]

    # step 2: filter the data files in each manifest
    # this filter depends on the partition spec used to write the manifest file
    partition_evaluators = KeyDefaultDict(data_scan._build_partition_evaluator)
    residual_evaluators = KeyDefaultDict(data_scan._build_residual_evaluator)
    min_sequence_number = _min_sequence_number(manifests)

    executor = ExecutorFactory.get_or_create()
    for manifest_entry in chain(
        *executor.map(
            lambda args: _open_manifest(*args),
            [
                (
                    data_scan.io,
                    manifest,
                    partition_evaluators[manifest.partition_spec_id],
                    residual_evaluators[manifest.partition_spec_id],
                    data_scan._build_metrics_evaluator(),
                )
                for manifest in manifests
                if data_scan._check_sequence_number(min_sequence_number, manifest)
            ],
        )
    ):
        data_file = manifest_entry.data_file
        file_sequence_number = manifest_entry.sequence_number
        data_file_tuple = (file_sequence_number, data_file)
        partition_value = data_file.partition

        if data_file.content == DataFileContent.DATA:
            data_entries[partition_value].append(data_file_tuple)
        elif data_file.content == DataFileContent.POSITION_DELETES:
            positional_delete_entries[partition_value].append(data_file_tuple)
        elif data_file.content == DataFileContent.EQUALITY_DELETES:
            equality_data_entries[partition_value].append(data_file_tuple)
        else:
            logger.warning(
                f"Unknown DataFileContent ({data_file.content}): {manifest_entry}"
            )
    return data_entries, equality_data_entries, positional_delete_entries