deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/types/tables.py CHANGED
@@ -1,11 +1,36 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
+import logging
+import multiprocessing
 from enum import Enum
-from typing import Callable, Dict, Type, Union
+from functools import partial
+from typing import (
+    Callable,
+    Dict,
+    Type,
+    Union,
+    Optional,
+    Any,
+    List,
+    Tuple,
+    TYPE_CHECKING,
+)
+from uuid import uuid4
 
+import daft
 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.fs
 import pyarrow.parquet as papq
-from ray.data.dataset import Dataset
+import ray
+from ray.data.block import Block, BlockMetadata, BlockAccessor
+from ray.data._internal.pandas_block import PandasBlockSchema
+from ray.data.dataset import Dataset as RayDataset, MaterializedDataset
+from ray.data.datasource import FilenameProvider
 from ray.data.read_api import (
     from_arrow,
     from_arrow_refs,
@@ -13,74 +38,506 @@ from ray.data.read_api import (
     from_pandas,
     from_pandas_refs,
 )
+from tenacity import (
+    Retrying,
+    wait_random_exponential,
+    stop_after_delay,
+    retry_if_exception_type,
+)
 
-import deltacat.storage as dcs
-from deltacat.types.media import TableType, DistributedDatasetType
+from deltacat.compute.compactor_v2.constants import MAX_RECORDS_PER_COMPACTED_FILE
+from deltacat import logs
+from deltacat.constants import (
+    UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY,
+    RETRYABLE_TRANSIENT_ERRORS,
+    DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
+)
+from deltacat.storage.model.types import (
+    Dataset,
+    LocalTable,
+    DistributedDataset,
+    LocalDataset,
+)
+from deltacat.storage.model.schema import SchemaConsistencyType
+from deltacat.types.media import (
+    DatasetType,
+    DistributedDatasetType,
+    ContentType,
+    EXPLICIT_COMPRESSION_CONTENT_TYPES,
+    ContentEncoding,
+    CONTENT_TYPE_TO_EXT,
+    CONTENT_ENCODING_TO_EXT,
+)
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
+from deltacat.utils import polars as pl_utils
 from deltacat.utils import pyarrow as pa_utils
-from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
+from deltacat.storage.model.manifest import (
+    ManifestEntryList,
+    ManifestEntry,
+    ManifestMeta,
+    EntryParams,
+    EntryType,
+    Manifest,
+)
+from deltacat.exceptions import (
+    RetryableError,
+    RetryableUploadTableError,
+    NonRetryableUploadTableError,
+    categorize_errors,
+    RetryableDownloadTableError,
+    NonRetryableDownloadTableError,
+)
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.types.partial_download import PartialFileDownloadParams
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 
-TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
-    TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
-    TableType.PYARROW.value: pa_utils.s3_file_to_table,
-    TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
-    TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
+if TYPE_CHECKING:
+    from deltacat.storage.model.schema import Schema
+    from deltacat.storage.model.table import Table
+    from deltacat.storage.model.table_version import TableVersion
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+TABLE_TYPE_TO_READER_FUNC: Dict[str, Callable] = {
+    DatasetType.PYARROW_PARQUET.value: pa_utils.file_to_parquet,
+    DatasetType.PYARROW.value: pa_utils.file_to_table,
+    DatasetType.PANDAS.value: pd_utils.file_to_dataframe,
+    DatasetType.NUMPY.value: np_utils.file_to_ndarray,
+    DatasetType.POLARS.value: pl_utils.file_to_dataframe,
 }
 
+
 TABLE_CLASS_TO_WRITER_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+    Type[Union[LocalTable, DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
+    pl.DataFrame: pl_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
-    Dataset: ds_utils.dataset_to_file,
+    RayDataset: ds_utils.dataset_to_file,
+    MaterializedDataset: ds_utils.dataset_to_file,
 }
 
 TABLE_CLASS_TO_SLICER_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+    Type[Union[LocalTable, DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
+    pl.DataFrame: pl_utils.slice_table,
     np.ndarray: np_utils.slice_ndarray,
-    Dataset: ds_utils.slice_dataset,
+    RayDataset: ds_utils.slice_dataset,
+    MaterializedDataset: ds_utils.slice_dataset,
 }
 
 TABLE_CLASS_TO_SIZE_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+    Type[Union[LocalTable, DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.table_size,
     papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
+    pl.DataFrame: pl_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
-    Dataset: ds_utils.dataset_size,
+    RayDataset: ds_utils.dataset_size,
+    MaterializedDataset: ds_utils.dataset_size,
+}
+
+TABLE_CLASS_TO_COLUMN_NAMES_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table: table.schema.names,
+    papq.ParquetFile: lambda table: table.schema.names,
+    pd.DataFrame: lambda table: table.columns.tolist(),
+    pl.DataFrame: lambda table: table.columns,
+    np.ndarray: lambda table: [f"{i}" for i in range(table.shape[1])],
+    daft.DataFrame: lambda table: table.column_names,
+    RayDataset: lambda table: table.schema().names,
+    MaterializedDataset: lambda table: table.schema().names,
+}
+
+TABLE_CLASS_TO_SCHEMA_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table: table.schema,
+    papq.ParquetFile: lambda table: table.schema_arrow,
+    pd.DataFrame: lambda table: pa.Schema.from_pandas(table),
+    pl.DataFrame: lambda table: table.to_arrow().schema,
+    np.ndarray: lambda table: pa.Schema.from_pandas(pd.DataFrame(table)),
+    daft.DataFrame: lambda table: table.schema().to_pyarrow_schema(),
+    RayDataset: lambda table: table.schema().base_schema,
+    MaterializedDataset: lambda table: table.schema().base_schema,
+}
+
+TABLE_TYPE_TO_EMPTY_TABLE_FUNC: Dict[str, Callable] = {
+    DatasetType.PYARROW.value: lambda: pa.Table.from_pydict({}),
+    DatasetType.PANDAS.value: lambda: pd.DataFrame(),
+    DatasetType.POLARS.value: lambda: pl.DataFrame(),
+    DatasetType.NUMPY.value: lambda: np.array([]),
+    DatasetType.DAFT.value: lambda: daft.DataFrame(),
+    DatasetType.RAY_DATASET.value: lambda: ray.data.from_items([]),
+    MaterializedDataset: lambda: ray.data.from_items([]),
 }
 
-TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
-    pa.Table: TableType.PYARROW.value,
-    papq.ParquetFile: TableType.PYARROW_PARQUET.value,
-    pd.DataFrame: TableType.PANDAS.value,
-    np.ndarray: TableType.NUMPY.value,
+
+def _numpy_array_to_pyarrow(table: np.ndarray, schema: pa.Schema) -> pa.Table:
+    """Convert NumPy array to PyArrow Table via Pandas for complex type support."""
+    # Convert NumPy -> Pandas -> PyArrow to handle complex types like structs
+    # This follows the same path as Pandas conversion which handles all complex types properly
+    pandas_df = _numpy_array_to_pandas(table, schema=schema)
+    return pa.Table.from_pandas(pandas_df, schema=schema)
+
+
+def _numpy_array_to_pandas(
+    table: np.ndarray, *, schema: Optional[pa.Schema] = None, **kwargs
+) -> pd.DataFrame:
+    """Convert NumPy array to pandas DataFrame."""
+    if schema and isinstance(schema, pa.Schema):
+        if table.ndim == 1:
+            # 1D array: single column
+            column_names = [schema.names[0]] if schema.names else ["0"]
+            return pd.DataFrame({column_names[0]: table}, **kwargs)
+        elif table.ndim == 2:
+            # 2D array: multiple columns
+            column_names = (
+                schema.names
+                if len(schema.names) == table.shape[1]
+                else [f"{i}" for i in range(table.shape[1])]
+            )
+            return pd.DataFrame(table, columns=column_names, **kwargs)
+        else:
+            raise ValueError(
+                f"NumPy arrays with {table.ndim} dimensions are not supported. "
+                f"Only 1D and 2D arrays are supported."
+            )
+
+    # Fallback to generic column names
+    return pd.DataFrame(table, **kwargs)
+
+
+def _ray_dataset_to_pyarrow(table, *, schema, **kwargs):
+    """Convert Ray Dataset to PyArrow tables and concatenate."""
+    arrow_refs = table.to_arrow_refs(**kwargs)
+    arrow_tables = ray.get(arrow_refs)
+    if len(arrow_tables) == 1:
+        return arrow_tables[0]
+    # Unify schemas to support schema evolution across blocks/files
+    try:
+        return pa.concat_tables(
+            arrow_tables, promote_options="permissive", unify_schemas=True
+        )
+    except TypeError:
+        return pa.concat_tables(arrow_tables, promote_options="permissive")
+
+
+TABLE_CLASS_TO_PYARROW_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, *, schema, **kwargs: table,
+    papq.ParquetFile: lambda table, *, schema, **kwargs: table.read(**kwargs),
+    pd.DataFrame: lambda table, *, schema, **kwargs: pa.Table.from_pandas(
+        table, schema=schema, preserve_index=False, **kwargs
+    ),
+    pl.DataFrame: lambda table, *, schema, **kwargs: pl.DataFrame.to_arrow(
+        table, **kwargs
+    ),
+    np.ndarray: lambda table, *, schema, **kwargs: _numpy_array_to_pyarrow(
+        table, schema, **kwargs
+    ),
+    RayDataset: _ray_dataset_to_pyarrow,
+    MaterializedDataset: _ray_dataset_to_pyarrow,
+    daft.DataFrame: lambda table, *, schema, **kwargs: table.to_arrow(**kwargs),
+}
+
+TABLE_CLASS_TO_PANDAS_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
+    papq.ParquetFile: lambda table, *, schema=None, **kwargs: table.read(
+        **kwargs
+    ).to_pandas(**kwargs),
+    pd.DataFrame: lambda table, *, schema=None, **kwargs: table,
+    pl.DataFrame: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
+    np.ndarray: lambda table, *, schema=None, **kwargs: _numpy_array_to_pandas(
+        table, schema=schema, **kwargs
+    ),
+    RayDataset: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
+    MaterializedDataset: lambda table, *, schema=None, **kwargs: table.to_pandas(
+        **kwargs
+    ),
+    daft.DataFrame: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
+}
+
+
+def _pyarrow_to_polars(pa_table: pa.Table, **kwargs) -> pl.DataFrame:
+    """Convert PyArrow table to Polars DataFrame with clean schema."""
+    # PyArrow metadata can contain invalid UTF-8 sequences that cause Polars to raise an error
+    # Create a new table without metadata that might contain invalid UTF-8
+    clean_schema = pa.schema(
+        [
+            pa.field(field.name, field.type, nullable=field.nullable)
+            for field in pa_table.schema
+        ]
+    )
+    clean_table = pa.Table.from_arrays(pa_table.columns, schema=clean_schema)
+    return pl.from_arrow(clean_table, **kwargs)
+
+
+def _pyarrow_to_numpy(pa_table: pa.Table, **kwargs) -> np.ndarray:
+    """Convert PyArrow table to numpy array."""
+    if pa_table.num_columns == 1:
+        return pa_table.column(0).to_numpy(**kwargs)
+    else:
+        return pa_table.to_pandas().values
+
+
+def _pandas_to_numpy(pd_df: pd.DataFrame, **kwargs) -> np.ndarray:
+    """Convert Pandas DataFrame to numpy array."""
+    if len(pd_df.columns) == 1:
+        return pd_df.iloc[:, 0].to_numpy(**kwargs)
+    else:
+        return pd_df.values
+
+
+DATASET_TYPE_FROM_PYARROW: Dict[DatasetType, Callable[[pa.Table, Dataset], Any]] = {
+    DatasetType.PYARROW: lambda pa_table, **kwargs: pa_table,
+    DatasetType.PANDAS: lambda pa_table, **kwargs: pa_table.to_pandas(**kwargs),
+    DatasetType.POLARS: lambda pa_table, **kwargs: _pyarrow_to_polars(
+        pa_table, **kwargs
+    ),
+    DatasetType.DAFT: lambda pa_table, **kwargs: daft.from_arrow(pa_table, **kwargs),
+    DatasetType.NUMPY: lambda pa_table, **kwargs: _pyarrow_to_numpy(pa_table, **kwargs),
+    DatasetType.RAY_DATASET: lambda pa_table, **kwargs: ray.data.from_arrow(pa_table),
+    DatasetType.PYARROW_PARQUET: lambda pa_table, **kwargs: pa_table,  # ParquetFile is read as PyArrow Table
+}
+
+
+DATASET_TYPE_FROM_PANDAS: Dict[DatasetType, Callable[[pd.DataFrame, Dataset], Any]] = {
+    DatasetType.PANDAS: lambda pd_df, **kwargs: pd_df,
+    DatasetType.PYARROW: lambda pd_df, **kwargs: pa.Table.from_pandas(pd_df, **kwargs),
+    DatasetType.POLARS: lambda pd_df, **kwargs: pl.from_pandas(pd_df, **kwargs),
+    DatasetType.DAFT: lambda pd_df, **kwargs: daft.from_pandas(pd_df, **kwargs),
+    DatasetType.NUMPY: lambda pd_df, **kwargs: _pandas_to_numpy(pd_df, **kwargs),
+    DatasetType.RAY_DATASET: lambda pd_df, **kwargs: ray.data.from_pandas(
+        pd_df, **kwargs
+    ),
+}
+
+
+def append_column_to_parquet_file(
+    parquet_file: papq.ParquetFile,
+    column_name: str,
+    column_value: Any,
+) -> pa.Table:
+    """
+    Append a column to a ParquetFile by converting to PyArrow Table first.
+
+    Args:
+        parquet_file: The ParquetFile to add column to
+        column_name: Name of the new column
+        column_value: Value to populate in all rows of the new column
+
+    Returns:
+        PyArrow Table with the new column
+    """
+    # Convert ParquetFile to Table
+    table = parquet_file.read()
+
+    # Use the existing PyArrow append column function
+    num_rows = table.num_rows
+    column_array = pa.array([column_value] * num_rows)
+    return table.append_column(column_name, column_array)
+
+
+TABLE_CLASS_TO_APPEND_COLUMN_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: pa_utils.append_column_to_table,
+    papq.ParquetFile: append_column_to_parquet_file,
+    pd.DataFrame: pd_utils.append_column_to_dataframe,
+    pl.DataFrame: pl_utils.append_column_to_table,
+    np.ndarray: np_utils.append_column_to_ndarray,
+}
+
+TABLE_CLASS_TO_SELECT_COLUMNS_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset]], Callable
+] = {
+    pa.Table: pa_utils.select_columns,
+    pd.DataFrame: pd_utils.select_columns,
+    pl.DataFrame: pl_utils.select_columns,
+}
+
+TABLE_CLASS_TO_TABLE_TYPE: Dict[Union[LocalTable, DistributedDataset], str] = {
+    pa.Table: DatasetType.PYARROW.value,
+    papq.ParquetFile: DatasetType.PYARROW_PARQUET.value,
+    pl.DataFrame: DatasetType.POLARS.value,
+    pd.DataFrame: DatasetType.PANDAS.value,
+    np.ndarray: DatasetType.NUMPY.value,
+    daft.DataFrame: DatasetType.DAFT.value,
+    RayDataset: DatasetType.RAY_DATASET.value,
+    MaterializedDataset: DatasetType.RAY_DATASET.value,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC: Dict[str, Callable] = {
-    TableType.PYARROW.value: from_arrow,
-    TableType.PYARROW_PARQUET.value: from_arrow,
-    TableType.NUMPY.value: from_numpy,
-    TableType.PANDAS.value: from_pandas,
+    DatasetType.PYARROW.value: from_arrow,
+    DatasetType.PYARROW_PARQUET.value: from_arrow,
+    DatasetType.NUMPY.value: from_numpy,
+    DatasetType.PANDAS.value: from_pandas,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
-    TableType.PYARROW.value: from_arrow_refs,
-    TableType.PYARROW_PARQUET.value: from_arrow_refs,
-    TableType.NUMPY.value: from_numpy,
-    TableType.PANDAS.value: from_pandas_refs,
+    DatasetType.PYARROW.value: from_arrow_refs,
+    DatasetType.PYARROW_PARQUET.value: from_arrow_refs,
+    DatasetType.NUMPY.value: from_numpy,
+    DatasetType.PANDAS.value: from_pandas_refs,
+    DatasetType.POLARS.value: from_arrow_refs,  # We cast Polars to Arrow for Ray Datasets
+    DatasetType.RAY_DATASET.value: from_arrow_refs,  # Ray Datasets are created from Arrow refs
+}
+
+TABLE_TYPE_TO_CONCAT_FUNC: Dict[str, Callable] = {
+    DatasetType.PYARROW_PARQUET.value: pa_utils.concat_tables,
+    DatasetType.PYARROW.value: pa_utils.concat_tables,
+    DatasetType.PANDAS.value: pd_utils.concat_dataframes,
+    DatasetType.NUMPY.value: np_utils.concat_ndarrays,
+    DatasetType.POLARS.value: pl_utils.concat_dataframes,
 }
 
 
+def _infer_schema_from_numpy_array(data: np.ndarray) -> Schema:
+    """Infer schema from NumPy array."""
+    if data.ndim > 2:
+        raise ValueError(
+            f"NumPy arrays with {data.ndim} dimensions are not supported. "
+            f"Only 1D and 2D arrays are supported."
+        )
+    # Handle object dtype by converting to pandas first
+    df = pd.DataFrame(data)
+    arrow_schema = pa.Schema.from_pandas(df)
+
+    from deltacat.storage.model.schema import Schema
+
+    return Schema.of(schema=arrow_schema)
+
+
+def _infer_schema_from_ray_dataset(data: RayDataset) -> Schema:
+    """Infer schema from Ray Dataset."""
+    ray_schema = data.schema()
+    base_schema = ray_schema.base_schema
+
+    if isinstance(base_schema, pa.Schema):
+        arrow_schema = base_schema
+    elif isinstance(base_schema, PandasBlockSchema):
+        try:
+            dtype_dict = {
+                name: dtype for name, dtype in zip(base_schema.names, base_schema.types)
+            }
+            empty_df = pd.DataFrame(columns=base_schema.names).astype(dtype_dict)
+            arrow_schema = pa.Schema.from_pandas(empty_df)
+        except Exception as e:
+            raise ValueError(
+                f"Failed to convert Ray Dataset PandasBlockSchema to PyArrow schema: {e}"
+            )
+    else:
+        raise ValueError(
+            f"Unsupported Ray Dataset schema type: {type(base_schema)}. "
+            f"Expected PyArrow Schema or PandasBlockSchema, got {base_schema}"
+        )
+
+    from deltacat.storage.model.schema import Schema
+
+    return Schema.of(schema=arrow_schema)
+
+
+def _infer_schema_from_pandas_dataframe(data: pd.DataFrame) -> Schema:
+    """Infer schema from Pandas DataFrame."""
+    from deltacat.storage.model.schema import Schema
+
+    arrow_schema = pa.Schema.from_pandas(data)
+    return Schema.of(schema=arrow_schema)
+
+
+def _infer_schema_from_polars_dataframe(data: pl.DataFrame) -> Schema:
+    """Infer schema from Polars DataFrame."""
+    from deltacat.storage.model.schema import Schema
+
+    arrow_table = data.to_arrow()
+    return Schema.of(schema=arrow_table.schema)
+
+
+def _infer_schema_from_pyarrow(
+    data: Union[pa.Table, pa.RecordBatch, ds.Dataset]
+) -> Schema:
+    """Infer schema from PyArrow Table, RecordBatch, or Dataset."""
+    from deltacat.storage.model.schema import Schema
+
+    return Schema.of(schema=data.schema)
+
+
+def _infer_schema_from_daft_dataframe(data: daft.DataFrame) -> Schema:
+    """Infer schema from Daft DataFrame."""
+    from deltacat.storage.model.schema import Schema
+
+    daft_schema = data.schema()
+    arrow_schema = daft_schema.to_pyarrow_schema()
+    return Schema.of(schema=arrow_schema)
+
+
+TABLE_CLASS_TO_SCHEMA_INFERENCE_FUNC: Dict[
+    Type[Union[LocalTable, DistributedDataset], Callable]
+] = {
+    pd.DataFrame: _infer_schema_from_pandas_dataframe,
+    pl.DataFrame: _infer_schema_from_polars_dataframe,
+    pa.Table: _infer_schema_from_pyarrow,
+    pa.RecordBatch: _infer_schema_from_pyarrow,
+    ds.Dataset: _infer_schema_from_pyarrow,
+    RayDataset: _infer_schema_from_ray_dataset,
+    MaterializedDataset: _infer_schema_from_ray_dataset,  # MaterializedDataset uses same schema inference as RayDataset
+    daft.DataFrame: _infer_schema_from_daft_dataframe,
+    np.ndarray: _infer_schema_from_numpy_array,
+}
+
+
+def infer_table_schema(data: Union[LocalTable, DistributedDataset]) -> Schema:
+    """Infer schema from a table or dataset."""
+    infer_schema_func = _get_table_function(
+        data,
+        TABLE_CLASS_TO_SCHEMA_INFERENCE_FUNC,
+        "schema inference",
+    )
+    return infer_schema_func(data)
+
+
+def concat_tables(tables: List[LocalTable], table_type: DatasetType) -> LocalTable:
+    """
+    Concatenate a list of tables into a single table using the appropriate
+    concatenation function for the given table type.
+
+    Args:
+        tables: List of tables to concatenate
+        table_type: The DatasetType indicating which concatenation function to use
+
+    Returns:
+        Single concatenated table of the appropriate type
+
+    Raises:
+        ValueError: If no concatenation function is found for the table type
+    """
+    concat_func = _get_table_type_function(
+        table_type, TABLE_TYPE_TO_CONCAT_FUNC, "concatenation"
+    )
+    return concat_func(tables)
+
+
+def _daft_reader_wrapper(*args, **kwargs):
+    """Wrapper for daft reader with lazy import to avoid circular import."""
+    from deltacat.utils.daft import files_to_dataframe
+
+    return files_to_dataframe(*args, **kwargs)
+
+
 DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
-    DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
+    DistributedDatasetType.DAFT.value: _daft_reader_wrapper,
 }
 
 
@@ -89,13 +546,14 @@ class TableWriteMode(str, Enum):
     Enum controlling how a given dataset will be written to a table.
 
     AUTO: CREATE if the table doesn't exist, APPEND if the table exists
-    without primary keys, and MERGE if the table exists with primary keys.
+    without merge keys, and MERGE if the table exists with merge keys.
     CREATE: Create the table if it doesn't exist, throw an error if it does.
     APPEND: Append to the table if it exists, throw an error if it doesn't.
     REPLACE: Replace existing table contents with the data to write.
-    MERGE: Insert, update, or delete records matching a given predicate.
-    Updates or inserts records based on the table's primary and sort keys by
+    MERGE: Insert or update records matching table merge keys.
+    Updates or inserts records based on the table's merge and sort keys by
     default.
+    DELETE: Delete records matching table merge keys.
     """
 
     AUTO = "auto"
@@ -103,29 +561,1869 @@ class TableWriteMode(str, Enum):
103
561
  APPEND = "append"
104
562
  REPLACE = "replace"
105
563
  MERGE = "merge"
564
+ DELETE = "delete"
106
565
 
107
566
 
108
- def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
109
- return len(table) if not isinstance(table, Dataset) else table.count()
567
+ class SchemaEvolutionMode(str, Enum):
568
+ """
569
+ Enum controlling how schema changes are handled when writing to a table.
570
+
571
+ MANUAL: Schema changes must be explicitly handled by the user. New fields
572
+ not in the existing schema will cause an error.
573
+ AUTO: Schema changes are automatically handled. New fields are added to
574
+ the schema using the table's default_schema_consistency_type.
575
+ DISABLED: Schema changes are disabled. The schema that the table was
576
+ created with is immutable.
577
+ """
578
+
579
+ MANUAL = "manual"
580
+ AUTO = "auto"
581
+ DISABLED = "disabled"
582
+
583
+
584
+ class TableProperty(str, Enum):
585
+ """
586
+ Enum defining known table property key names.
587
+ """
588
+
589
+ READ_OPTIMIZATION_LEVEL = "read_optimization_level"
590
+ RECORDS_PER_COMPACTED_FILE = "records_per_compacted_file"
591
+ APPENDED_RECORD_COUNT_COMPACTION_TRIGGER = (
592
+ "appended_record_count_compaction_trigger"
593
+ )
594
+ APPENDED_FILE_COUNT_COMPACTION_TRIGGER = "appended_file_count_compaction_trigger"
595
+ APPENDED_DELTA_COUNT_COMPACTION_TRIGGER = "appended_delta_count_compaction_trigger"
596
+ DEFAULT_COMPACTION_HASH_BUCKET_COUNT = "default_compaction_hash_bucket_count"
597
+ SCHEMA_EVOLUTION_MODE = "schema_evolution_mode"
598
+ DEFAULT_SCHEMA_CONSISTENCY_TYPE = "default_schema_consistency_type"
599
+ SUPPORTED_READER_TYPES = "supported_reader_types"
600
+
601
+ def read_table_property(
602
+ table_or_table_version: Union[Table, TableVersion], property: TableProperty
603
+ ) -> Any:
604
+ properties = table_or_table_version.properties or {}
605
+ value = properties.get(property.value, TablePropertyDefaultValues[property])
606
+
607
+ # Handle property type conversion
608
+ if property == TableProperty.SUPPORTED_READER_TYPES and isinstance(value, list):
609
+ # Convert string values back to DatasetType enums
610
+ return [DatasetType(v) for v in value]
611
+ if property == TableProperty.SCHEMA_EVOLUTION_MODE:
612
+ return SchemaEvolutionMode(value)
613
+ if property == TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE:
614
+ return SchemaConsistencyType(value)
615
+ if property == TableProperty.READ_OPTIMIZATION_LEVEL:
616
+ return TableReadOptimizationLevel(value)
617
+ return value
618
+
619
+
620
+ class TableReadOptimizationLevel(str, Enum):
621
+ """
622
+ Enum controlling the how much to optimize reads when writing to a table. Different levels
623
+ here correspond to different tradeoffs between write and read performance.
624
+
625
+ NONE: No read optimization. Deletes and updates are resolved by finding the values
626
+ that match merge key predicates by running compaction at read time. Provides the
627
+ fastest/cheapest writes but slow/expensive reads. Resilient to conflicts with concurrent
628
+ writes, including table management jobs like compaction.
629
+
630
+ MODERATE: Discover record indexes that match merge key predicates at write time and record
631
+ those values as logically deleted (e.g., using a bitmask). Provides faster/cheaper reads but
632
+ slower/more-expensive writes. May conflict with concurrent writes that remove/replace data
633
+ files like compaction.
634
+
635
+ MAX: Materialize all deletes and updates at write time by running compaction during
636
+ every write. Provides fast/cheap reads but slow/expensive writes. May conflict with
637
+ concurrent writes, including table management jobs like compaction.
638
+ """
639
+
640
+ NONE = "none"
641
+ MODERATE = "moderate"
642
+ MAX = "max"
643
+
644
+
645
+ TablePropertyDefaultValues: Dict[TableProperty, Any] = {
646
+ TableProperty.READ_OPTIMIZATION_LEVEL: TableReadOptimizationLevel.MAX,
647
+ TableProperty.RECORDS_PER_COMPACTED_FILE: MAX_RECORDS_PER_COMPACTED_FILE,
648
+ TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: MAX_RECORDS_PER_COMPACTED_FILE
649
+ * 2,
650
+ TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 1000,
651
+ TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 100,
652
+ TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT: 8,
653
+ TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.AUTO,
654
+ TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE: SchemaConsistencyType.NONE,
655
+ TableProperty.SUPPORTED_READER_TYPES: [d for d in DatasetType],
656
+ }
110
657
 
111
658
 
112
- def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
113
- table_writer_func = TABLE_CLASS_TO_WRITER_FUNC.get(type(table))
114
- if table_writer_func is None:
659
+ def _get_table_function(
660
+ table: Union[LocalTable, DistributedDataset],
661
+ function_map: Dict[Type, Callable],
662
+ operation_name: str,
663
+ ) -> Callable:
664
+ """Generic helper to look up table-type-specific functions."""
665
+ table_func = function_map.get(type(table))
666
+ if table_func is None:
115
667
  msg = (
116
- f"No writer found for table type: {type(table)}.\n"
117
- f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
668
+ f"No {operation_name} function found for table type: {type(table)}.\n"
669
+ f"Known table types: {list(function_map.keys())}"
118
670
  )
119
671
  raise ValueError(msg)
120
- return table_writer_func
672
+ return table_func
121
673
 
122
674
 
123
- def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
124
- table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
125
- if table_slicer_func is None:
675
+ def _get_table_type_function(
676
+ table_type: DatasetType, function_map: Dict[str, Callable], operation_name: str
677
+ ) -> Callable:
678
+ """Generic helper to look up DatasetType-specific functions."""
679
+ table_func = function_map.get(table_type.value)
680
+ if table_func is None:
126
681
  msg = (
127
- f"No slicer found for table type: {type(table)}.\n"
128
- f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
682
+ f"No {operation_name} function found for table type: {table_type}.\n"
683
+ f"Known table types: {list(function_map.keys())}"
129
684
  )
130
685
  raise ValueError(msg)
131
- return table_slicer_func
686
+ return table_func
687
+
688
+
689
+ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
690
+ if not tables: # Empty list
691
+ return pd.DataFrame()
692
+
693
+ # Convert list elements
694
+ all_tables = []
695
+ for i, table in enumerate(tables):
696
+ try:
697
+ converted_table = conversion_fn(table, **kwargs)
698
+ all_tables.append(converted_table)
699
+ except Exception as e:
700
+ raise ValueError(f"Failed to convert list element {i}: {e}") from e
701
+
702
+ # Concatenate with error handling - handle different table types
703
+ try:
704
+ # Check if we have PyArrow tables
705
+ if all(isinstance(table, pa.Table) for table in all_tables):
706
+ # Use PyArrow concatenation for PyArrow tables
707
+ return pa.concat_tables(all_tables, promote_options="permissive")
708
+ else:
709
+ # Use pandas concatenation for other types
710
+ return pd.concat(all_tables, ignore_index=True, sort=False)
711
+ except Exception as e:
712
+ raise ValueError(f"Failed to concatenate {len(all_tables)} tables: {e}") from e
713
+
714
+
715
+ def get_table_length(
716
+ table: Union[LocalTable, DistributedDataset, BlockAccessor]
717
+ ) -> int:
718
+ """
719
+ Generic function to get the length of a table or distributed dataset.
720
+
721
+ Args:
722
+ table: The local table or distributed dataset to get the length of
723
+
724
+ Returns:
725
+ Length of the table or distributed dataset in rows
726
+ """
727
+ # Handle DAFT DataFrames dynamically
728
+ if hasattr(table, "count_rows") and str(type(table).__module__).startswith("daft"):
729
+ return table.count_rows()
730
+ elif isinstance(table, RayDataset):
731
+ return table.count()
732
+ elif isinstance(table, papq.ParquetFile):
733
+ return table.metadata.num_rows
734
+ else:
735
+ return len(table)
736
+
737
+
738
+ def dataset_length(table: Dataset) -> int:
739
+ """
740
+ Generic function to get the length of a dataset in records.
741
+ If the input is a list of tables, the length is the sum of the
742
+ lengths of the tables.
743
+
744
+ Args:
745
+ table: The dataset to get the length of
746
+
747
+ Returns:
748
+ Length of the dataset in records
749
+ """
750
+ if isinstance(table, list):
751
+ return sum(get_table_length(t) for t in table)
752
+ return get_table_length(table)
753
+
754
+
755
+ def get_table_size(table: Union[LocalTable, DistributedDataset]) -> int:
756
+ """
757
+ Generic function to get the size of a table or distributed dataset.
758
+
759
+ Args:
760
+ table: The local table or distributed dataset to get the size of
761
+
762
+ Returns:
763
+ Size of the table or distributed dataset
764
+ """
765
+ table_size_func = _get_table_function(table, TABLE_CLASS_TO_SIZE_FUNC, "size")
766
+ return table_size_func(table)
767
+
768
+
769
+ def dataset_size(table: Dataset) -> int:
770
+ """
771
+ Generic function to get the size of a dataset in bytes.
772
+ If the input is a list of tables, the size is the sum of the
773
+ sizes of the tables.
774
+
775
+ Args:
776
+ table: The dataset to get the size of
777
+
778
+ Returns:
779
+ Size of the dataset in bytes
780
+ """
781
+ if isinstance(table, list):
782
+ return sum(get_table_size(t) for t in table)
783
+ return get_table_size(table)
784
+
785
+
786
+ def get_table_column_names(table: Union[LocalTable, DistributedDataset]) -> List[str]:
787
+ """
788
+ Generic function to get the column names of a table or distributed dataset.
789
+
790
+ Args:
791
+ table: The local table or distributed dataset to get the column names of
792
+
793
+ Returns:
794
+ List of column names
795
+ """
796
+ column_names_func = _get_table_function(
797
+ table, TABLE_CLASS_TO_COLUMN_NAMES_FUNC, "column names"
798
+ )
799
+ return column_names_func(table)
800
+
801
+
802
+ def dataset_column_names(table: Dataset) -> List[str]:
803
+ """
804
+ Generic function to get the column names of a dataset.
805
+ If the input is a list of tables, unique column names are
806
+ returned in the order they are first seen in the list.
807
+
808
+ Args:
809
+ table: The dataset to get the column names of
810
+
811
+ Returns:
812
+ List of column names
813
+ """
814
+ if isinstance(table, list):
815
+ # use dictionary keys as an ordered set
816
+ column_names = {}
817
+ for t in table:
818
+ for column_name in get_table_column_names(t):
819
+ column_names[column_name] = None
820
+ return list(column_names.keys())
821
+ return get_table_column_names(table)
822
+
823
+
824
+ def get_table_schema(table: Union[LocalTable, DistributedDataset]) -> pa.Schema:
825
+ """
826
+ Generic function to get the PyArrow schema of a table or distributed dataset.
827
+
828
+ Args:
829
+ table: The local table or distributed dataset to get the schema of
830
+
831
+ Returns:
832
+ PyArrow Schema object
833
+ """
834
+ schema_func = _get_table_function(table, TABLE_CLASS_TO_SCHEMA_FUNC, "schema")
835
+ return schema_func(table)
836
+
837
+
838
+ def dataset_schema(table: Dataset) -> pa.Schema:
839
+ """
840
+ Generic function to get the PyArrow schema of a dataset. If the input is a list of
841
+ tables, uses pyarrow.unify_schemas(schemas, promote_options="permissive").
842
+
843
+ Args:
844
+ table: The dataset to get the schema of
845
+
846
+ Returns:
847
+ PyArrow Schema object
848
+ """
849
+ if isinstance(table, list):
850
+ return pa.unify_schemas(
851
+ [get_table_schema(t) for t in table], promote_options="permissive"
852
+ )
853
+ return get_table_schema(table)
854
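+
+
+ # Illustrative sketch (editor's note): for a list input, dataset_schema() permissively
+ # unifies member schemas, so disjoint columns merge into a single schema. Assumes
+ # pa.Table is registered in TABLE_CLASS_TO_SCHEMA_FUNC.
+ def _example_dataset_schema() -> pa.Schema:
+     """Hypothetical helper for illustration only."""
+     t1 = pa.table({"a": [1]})
+     t2 = pa.table({"b": ["x"]})
+     # Expected to contain both "a" (int64) and "b" (string) fields.
+     return dataset_schema([t1, t2])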
+
855
+
856
+ def get_table_writer(table: Union[LocalTable, DistributedDataset]) -> Callable:
857
+ """
858
+ Generic function to get a table writer function for a given dataset type.
859
+
860
+ Args:
861
+ table: The local table or distributed dataset to get the writer function for
862
+
863
+ Returns:
864
+ Writer function for the given dataset type
865
+ """
866
+ return _get_table_function(table, TABLE_CLASS_TO_WRITER_FUNC, "writer")
867
+
868
+
869
+ def get_table_slicer(table: Union[LocalTable, DistributedDataset]) -> Callable:
870
+ """
871
+ Generic function to get a table slicer function for a given dataset type.
872
+
873
+ Args:
874
+ table: The local table or distributed dataset to get the slicer function for
875
+
876
+ Returns:
877
+ Slicer function for the given dataset type
878
+ """
879
+ return _get_table_function(table, TABLE_CLASS_TO_SLICER_FUNC, "slicer")
880
+
881
+
882
+ def get_dataset_type(dataset: Dataset) -> DatasetType:
883
+ """Get the DatasetType enum value for a given dataset object.
884
+
885
+ Args:
886
+ dataset: The dataset object to identify
887
+
888
+ Returns:
889
+ DatasetType enum value corresponding to the dataset type
890
+
891
+ Raises:
892
+ ValueError: If the dataset type is not supported
893
+ """
894
+ dataset_type_str = _get_table_function(
895
+ dataset, TABLE_CLASS_TO_TABLE_TYPE, "dataset type identification"
896
+ )
897
+ return DatasetType(dataset_type_str)
898
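+
+
+ # Illustrative sketch (editor's note): mapping a concrete in-memory table back to its
+ # DatasetType enum value. Assumes TABLE_CLASS_TO_TABLE_TYPE registers pa.Table under
+ # the value backing DatasetType.PYARROW.
+ def _example_get_dataset_type() -> DatasetType:
+     """Hypothetical helper for illustration only."""
+     # Expected: DatasetType.PYARROW under the assumption above.
+     return get_dataset_type(pa.table({"a": [1]}))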
+
899
+
900
+ def table_to_pyarrow(
901
+ table: Union[LocalTable, DistributedDataset],
902
+ *,
903
+ schema: Optional[pa.Schema] = None,
904
+ **kwargs,
905
+ ) -> pa.Table:
906
+ """
907
+ Convert a single table or distributed dataset to PyArrow Table format.
908
+
909
+ Args:
910
+ table: The local table or distributed dataset to convert
911
+ schema: Optional schema to use for the conversion
912
+ **kwargs: Additional arguments passed to the conversion function
913
+
914
+ Returns:
915
+ PyArrow Table created from the provided dataset
916
+ """
917
+ to_pyarrow_func = _get_table_function(
918
+ table, TABLE_CLASS_TO_PYARROW_FUNC, "pyarrow conversion"
919
+ )
920
+ return to_pyarrow_func(table, schema=schema, **kwargs)
921
+
922
+
923
+ def table_to_pandas(
924
+ table: Union[LocalTable, DistributedDataset],
925
+ *,
926
+ schema: Optional[pa.Schema] = None,
927
+ **kwargs,
928
+ ) -> pd.DataFrame:
929
+ """
930
+ Convert a single table or distributed dataset to pandas DataFrame format.
931
+
932
+ Args:
933
+ table: The local table or distributed dataset to convert
934
+ schema: Optional schema to use for the conversion
935
+ **kwargs: Additional arguments passed to the conversion function
936
+
937
+ Returns:
938
+ pandas DataFrame created from the provided dataset
939
+ """
940
+ to_pandas_func = _get_table_function(
941
+ table, TABLE_CLASS_TO_PANDAS_FUNC, "pandas conversion"
942
+ )
943
+ return to_pandas_func(table, schema=schema, **kwargs)
944
+
945
+
946
+ def to_pyarrow(
947
+ table: Dataset, *, schema: Optional[pa.Schema] = None, **kwargs
948
+ ) -> pa.Table:
949
+ """
950
+ Convert any supported dataset type to PyArrow Table format.
951
+
952
+ Args:
953
+ table: The table/dataset to convert
954
+ schema: Optional schema to use for the conversion
955
+ **kwargs: Additional arguments passed to the conversion function
956
+
957
+ Returns:
958
+ PyArrow Table created from the provided dataset
959
+ """
960
+ if isinstance(table, list):
961
+ return _convert_all(table, table_to_pyarrow, schema=schema, **kwargs)
962
+ return table_to_pyarrow(table, schema=schema, **kwargs)
963
+
964
+
965
+ def to_pandas(
966
+ table: Dataset, *, schema: Optional[pa.Schema] = None, **kwargs
967
+ ) -> pd.DataFrame:
968
+ """
969
+ Convert any supported dataset type to pandas DataFrame format.
970
+
971
+ Args:
972
+ table: The table/dataset to convert
973
+ schema: Optional schema to use for the conversion
974
+ **kwargs: Additional arguments passed to the conversion function
975
+
976
+ Returns:
977
+ pandas DataFrame created from the provided dataset
978
+ """
979
+ if isinstance(table, list):
980
+ return _convert_all(table, table_to_pandas, schema=schema, **kwargs)
981
+ return table_to_pandas(table, schema=schema, **kwargs)
982
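+
+
+ # Illustrative sketch (editor's note): to_pandas() accepts a single table or a list;
+ # for a list, _convert_all() converts each element and concatenates the results via
+ # the concatenation logic near the top of this module. Assumes pa.Table and
+ # pd.DataFrame are both registered in TABLE_CLASS_TO_PANDAS_FUNC.
+ def _example_to_pandas_from_list() -> pd.DataFrame:
+     """Hypothetical helper for illustration only."""
+     parts = [pa.table({"a": [1, 2]}), pd.DataFrame({"a": [3]})]
+     # Expected: a single 3-row DataFrame with column "a".
+     return to_pandas(parts)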
+
983
+
984
+ def from_pyarrow(pa_table: pa.Table, target_type: DatasetType, **kwargs) -> Dataset:
985
+ """Convert PyArrow Table to the specified dataset type.
986
+
987
+ Args:
988
+ pa_table: PyArrow Table to convert
989
+ target_type: Target DatasetType to convert to
990
+ **kwargs: Additional arguments passed to the conversion function
991
+
992
+ Returns:
993
+ Dataset converted to the target type
994
+
995
+ Raises:
996
+ ValueError: If target_type is not supported
997
+ """
998
+ conversion_func = _get_table_type_function(
999
+ target_type,
1000
+ DATASET_TYPE_FROM_PYARROW,
1001
+ f"{target_type} conversion",
1002
+ )
1003
+ return conversion_func(pa_table, **kwargs)
1004
+
1005
+
1006
+ def from_pandas(pd_df: pd.DataFrame, target_type: DatasetType, **kwargs) -> Dataset:
1007
+ """Convert Pandas DataFrame to the specified dataset type.
1008
+
1009
+ Args:
1010
+ pd_df: Pandas DataFrame to convert
1011
+ target_type: Target DatasetType to convert to
1012
+ **kwargs: Additional arguments passed to the conversion function
1013
+
1014
+ Returns:
1015
+ Dataset converted to the target type
1016
+
1017
+ Raises:
1018
+ ValueError: If target_type is not supported
1019
+ """
1020
+ conversion_func = _get_table_type_function(
1021
+ target_type,
1022
+ DATASET_TYPE_FROM_PANDAS,
1023
+ f"{target_type} conversion",
1024
+ )
1025
+ return conversion_func(pd_df, **kwargs)
1026
+
1027
+
1028
+ def empty_table(table_type: DatasetType) -> Dataset:
1029
+ """
1030
+ Create an empty table of the given type.
1031
+ """
1032
+ empty_table_func = _get_table_type_function(
1033
+ table_type, TABLE_TYPE_TO_EMPTY_TABLE_FUNC, "empty table"
1034
+ )
1035
+ return empty_table_func()
1036
+
1037
+
1038
+ def append_column_to_table(
1039
+ table: LocalTable,
1040
+ column_name: str,
1041
+ column_value: Any,
1042
+ ) -> LocalTable:
1043
+ """
1044
+ Generic function to append a column with a specified value to any supported dataset type.
1045
+
1046
+ Args:
1047
+ table: The table/dataset to add column to
1048
+ column_name: Name of the new column
1049
+ column_value: Value to populate in all rows of the new column
1050
+ table_type: Type of the dataset
1051
+
1052
+ Returns:
1053
+ Updated table with the new column
1054
+ """
1055
+ append_column_to_table_func = _get_table_function(
1056
+ table, TABLE_CLASS_TO_APPEND_COLUMN_FUNC, "append column"
1057
+ )
1058
+ return append_column_to_table_func(table, column_name, column_value)
1059
+
1060
+
1061
+ def select_columns_from_table(
1062
+ table: LocalTable,
1063
+ column_names: List[str],
1064
+ ) -> LocalTable:
1065
+ """
1066
+ Generic function to select columns from any supported dataset type.
1067
+
1068
+ Args:
1069
+ table: The table/dataset to select columns from
1070
+ column_names: List of column names to select
1071
+
1072
+ Returns:
1073
+ Updated table with the selected columns
1074
+ """
1075
+ select_columns_func = _get_table_function(
1076
+ table, TABLE_CLASS_TO_SELECT_COLUMNS_FUNC, "select columns"
1077
+ )
1078
+ return select_columns_func(table, column_names)
1079
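+
+
+ # Illustrative sketch (editor's note): tagging rows with a constant column and then
+ # projecting a subset of columns. Assumes pa.Table is covered by the append/select
+ # mappings used above.
+ def _example_append_and_select() -> LocalTable:
+     """Hypothetical helper for illustration only."""
+     t = pa.table({"a": [1, 2], "b": [3, 4]})
+     t = append_column_to_table(t, "source", "example-source")
+     return select_columns_from_table(t, ["a", "source"])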
+
1080
+
1081
+ def write_sliced_table(
1082
+ table: Union[LocalTable, DistributedDataset],
1083
+ base_path: str,
1084
+ filesystem: Optional[pa.fs.FileSystem],
1085
+ max_records_per_entry: Optional[int],
1086
+ table_writer_fn: Callable,
1087
+ table_slicer_fn: Callable,
1088
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
1089
+ content_type: ContentType = ContentType.PARQUET,
1090
+ entry_params: Optional[EntryParams] = None,
1091
+ entry_type: Optional[EntryType] = EntryType.DATA,
1092
+ ) -> ManifestEntryList:
1093
+ """
1094
+ Writes table slices to 1 or more files and returns
1095
+ manifest entries describing the uploaded files.
1096
+
1097
+ Args:
1098
+ table: The local table or distributed dataset to write
1099
+ base_path: The base path to write the table to
1100
+ filesystem: The filesystem to write the table to
1101
+         max_records_per_entry: Optional maximum number of records to write per output file
+         table_writer_fn: The function used to write each table or table slice
1102
+         table_slicer_fn: The function used to slice the table into multiple files
1103
+ table_writer_kwargs: Additional arguments to pass to the table writer
1104
+ content_type: The content type to write the table to
1105
+ entry_params: Manifest entry parameters
1106
+ entry_type: The manifest entry types to write
1107
+
1108
+ Returns:
1109
+ Manifest entries describing the uploaded files
1110
+ """
1111
+ # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
1112
+ retrying = Retrying(
1113
+ wait=wait_random_exponential(multiplier=1, max=60),
1114
+ stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
1115
+ retry=retry_if_exception_type(RetryableError),
1116
+ )
1117
+
1118
+ manifest_entries = ManifestEntryList()
1119
+ table_record_count = get_table_length(table)
1120
+
1121
+ if max_records_per_entry is None or not table_record_count:
1122
+ # write the whole table to a single file
1123
+ manifest_entries = retrying(
1124
+ write_table,
1125
+ table,
1126
+ f"{base_path}", # cast any non-string arg to string
1127
+ filesystem,
1128
+ table_writer_fn,
1129
+ table_writer_kwargs,
1130
+ content_type,
1131
+ entry_params,
1132
+ entry_type,
1133
+ )
1134
+ else:
1135
+ # iteratively write table slices
1136
+ table_slices = table_slicer_fn(table, max_records_per_entry)
1137
+ for table_slice in table_slices:
1138
+ slice_entries = retrying(
1139
+ write_table,
1140
+ table_slice,
1141
+ f"{base_path}", # cast any non-string arg to string
1142
+ filesystem,
1143
+ table_writer_fn,
1144
+ table_writer_kwargs,
1145
+ content_type,
1146
+ entry_params,
1147
+ entry_type,
1148
+ )
1149
+ manifest_entries.extend(slice_entries)
1150
+ return manifest_entries
1151
+
1152
+
1153
+ def write_table(
1154
+ table: Union[LocalTable, DistributedDataset],
1155
+ base_path: str,
1156
+ filesystem: Optional[pa.fs.FileSystem],
1157
+ table_writer_fn: Callable,
1158
+ table_writer_kwargs: Optional[Dict[str, Any]],
1159
+ content_type: ContentType = ContentType.PARQUET,
1160
+ entry_params: Optional[EntryParams] = None,
1161
+ entry_type: Optional[EntryType] = EntryType.DATA,
1162
+ ) -> ManifestEntryList:
1163
+ """
1164
+     Writes the given table to 1 or more files and returns
1165
+ manifest entries describing the uploaded files.
1166
+
1167
+ Args:
1168
+ table: The local table or distributed dataset to write
1169
+ base_path: The base path to write the table to
1170
+ filesystem: The filesystem to write the table to
1171
+ table_writer_fn: The function to write the table to
1172
+ table_writer_kwargs: Additional arguments to pass to the table writer
1173
+ content_type: The content type to write the table to
1174
+ entry_params: Manifest entry parameters
1175
+ entry_type: The manifest entry types to write
1176
+
1177
+ Returns:
1178
+ Manifest entries describing the uploaded files
1179
+ """
1180
+ if table_writer_kwargs is None:
1181
+ table_writer_kwargs = {}
1182
+
1183
+ # Determine content_encoding before writing files so we can include it in filenames
1184
+ content_encoding = None
1185
+ if content_type in EXPLICIT_COMPRESSION_CONTENT_TYPES:
1186
+ # TODO(pdames): Support other user-specified encodings at write time.
1187
+ content_encoding = ContentEncoding.GZIP
1188
+
1189
+ wrapped_obj = (
1190
+ CapturedBlockWritePathsActor.remote()
1191
+ if isinstance(table, RayDataset)
1192
+ else CapturedBlockWritePathsBase()
1193
+ )
1194
+ capture_object = CapturedBlockWritePaths(wrapped_obj)
1195
+ block_write_path_provider = UuidBlockWritePathProvider(
1196
+ capture_object,
1197
+ base_path=base_path,
1198
+ content_type=content_type,
1199
+ content_encoding=content_encoding,
1200
+ )
1201
+ # Extract schema, schema_id, and sort_scheme_id from table_writer_kwargs
1202
+ schema = table_writer_kwargs.pop("schema", None)
1203
+ schema_id = table_writer_kwargs.pop("schema_id", None)
1204
+ sort_scheme_id = table_writer_kwargs.pop("sort_scheme_id", None)
1205
+ table_writer_fn(
1206
+ table,
1207
+ base_path,
1208
+ filesystem,
1209
+ block_write_path_provider,
1210
+ content_type.value,
1211
+ schema=schema,
1212
+ **table_writer_kwargs,
1213
+ )
1214
+ # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
1215
+ del block_write_path_provider
1216
+ blocks = capture_object.blocks()
1217
+ write_paths = capture_object.write_paths()
1218
+ metadata = get_block_metadata_list(table, write_paths, blocks)
1219
+ manifest_entries = ManifestEntryList()
1220
+ for block_idx, path in enumerate(write_paths):
1221
+ try:
1222
+ manifest_entry = ManifestEntry.from_path(
1223
+ path=path,
1224
+ filesystem=filesystem,
1225
+ record_count=metadata[block_idx].num_rows,
1226
+ source_content_length=metadata[block_idx].size_bytes,
1227
+ content_type=content_type.value,
1228
+ content_encoding=content_encoding,
1229
+ entry_type=entry_type,
1230
+ entry_params=entry_params,
1231
+ schema_id=schema_id,
1232
+ sort_scheme_id=sort_scheme_id,
1233
+ )
1234
+ manifest_entries.append(manifest_entry)
1235
+ except RETRYABLE_TRANSIENT_ERRORS as e:
1236
+ _handle_retryable_error(e, path, "write", RetryableUploadTableError)
1237
+ except BaseException as e:
1238
+ _handle_non_retryable_error(
1239
+ e,
1240
+ path,
1241
+ "upload",
1242
+ NonRetryableUploadTableError,
1243
+ f"and content_type={content_type}",
1244
+ )
1245
+ return manifest_entries
1246
+
1247
+
1248
+ @ray.remote
1249
+ class CapturedBlockWritePathsActor:
1250
+ def __init__(self):
1251
+ self._wrapped = CapturedBlockWritePathsBase()
1252
+
1253
+ def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
1254
+ self._wrapped.extend(write_paths, blocks)
1255
+
1256
+ def write_paths(self) -> List[str]:
1257
+ return self._wrapped.write_paths()
1258
+
1259
+ def blocks(self) -> List[Block]:
1260
+ return self._wrapped.blocks()
1261
+
1262
+
1263
+ class CapturedBlockWritePathsBase:
1264
+ def __init__(self):
1265
+ self._write_paths: List[str] = []
1266
+ self._blocks: List[Block] = []
1267
+
1268
+ def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
1269
+ try:
1270
+ iter(write_paths)
1271
+ except TypeError:
1272
+ pass
1273
+ else:
1274
+ self._write_paths.extend(write_paths)
1275
+ try:
1276
+ iter(blocks)
1277
+ except TypeError:
1278
+ pass
1279
+ else:
1280
+ self._blocks.extend(blocks)
1281
+
1282
+ def write_paths(self) -> List[str]:
1283
+ return self._write_paths
1284
+
1285
+ def blocks(self) -> List[Block]:
1286
+ return self._blocks
1287
+
1288
+
1289
+ class CapturedBlockWritePaths:
1290
+     def __init__(self, wrapped=None):
1291
+         # Avoid a shared default instance: create a fresh capture object per wrapper.
+         self._wrapped = wrapped if wrapped is not None else CapturedBlockWritePathsBase()
1292
+
1293
+ def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
1294
+ return (
1295
+ self._wrapped.extend(write_paths, blocks)
1296
+ if isinstance(self._wrapped, CapturedBlockWritePathsBase)
1297
+ else ray.get(self._wrapped.extend.remote(write_paths, blocks))
1298
+ )
1299
+
1300
+ def write_paths(self) -> List[str]:
1301
+ return (
1302
+ self._wrapped.write_paths()
1303
+ if isinstance(self._wrapped, CapturedBlockWritePathsBase)
1304
+ else ray.get(self._wrapped.write_paths.remote())
1305
+ )
1306
+
1307
+ def blocks(self) -> List[Block]:
1308
+ return (
1309
+ self._wrapped.blocks()
1310
+ if isinstance(self._wrapped, CapturedBlockWritePathsBase)
1311
+ else ray.get(self._wrapped.blocks.remote())
1312
+ )
1313
+
1314
+
1315
+ class UuidBlockWritePathProvider(FilenameProvider):
1316
+ """Block write path provider implementation that writes each
1317
+ dataset block out to a file of the form: {base_path}/{uuid}
1318
+ """
1319
+
1320
+ def __init__(
1321
+ self,
1322
+ capture_object: CapturedBlockWritePaths,
1323
+ base_path: Optional[str] = None,
1324
+ content_type: Optional[ContentType] = None,
1325
+ content_encoding: Optional[ContentEncoding] = None,
1326
+ ):
1327
+ self.base_path = base_path
1328
+ self.content_type = content_type
1329
+ self.content_encoding = content_encoding
1330
+ self.write_paths: List[str] = []
1331
+ self.blocks: List[Block] = []
1332
+ self.capture_object = capture_object
1333
+
1334
+ def __del__(self):
1335
+ if self.write_paths or self.blocks:
1336
+ self.capture_object.extend(
1337
+ self.write_paths,
1338
+ self.blocks,
1339
+ )
1340
+
1341
+ def get_filename_for_block(
1342
+ self,
1343
+ block: Block,
1344
+ task_index: int,
1345
+ block_index: int,
1346
+ ) -> str:
1347
+ if self.base_path is None:
1348
+ raise ValueError(
1349
+ "Base path must be provided to UuidBlockWritePathProvider",
1350
+ )
1351
+ return self._get_write_path_for_block(
1352
+ base_path=self.base_path,
1353
+ block=block,
1354
+ block_index=block_index,
1355
+ )
1356
+
1357
+ def _get_write_path_for_block(
1358
+ self,
1359
+ base_path: str,
1360
+ *,
1361
+ block: Optional[Block] = None,
1362
+ **kwargs,
1363
+ ) -> str:
1364
+ # Generate base UUID filename
1365
+ filename = str(uuid4())
1366
+
1367
+ # Add content type extension if available
1368
+ if self.content_type:
1369
+ content_type_extension = CONTENT_TYPE_TO_EXT.get(self.content_type)
1371
+ if content_type_extension:
1372
+ filename += content_type_extension
1373
+
1374
+ # Add content encoding extension if available
1375
+ if self.content_encoding:
1376
+ encoding_extension = CONTENT_ENCODING_TO_EXT.get(self.content_encoding)
1378
+ if encoding_extension:
1379
+ filename += encoding_extension
1380
+
1381
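+         # For example, with ContentType.PARQUET and ContentEncoding.GZIP this yields
+         # "{base_path}/<uuid>.parquet.gz", assuming those extensions are registered in
+         # CONTENT_TYPE_TO_EXT and CONTENT_ENCODING_TO_EXT.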
+ write_path = f"{base_path}/{filename}"
1382
+ self.write_paths.append(write_path)
1383
+ if block is not None:
1384
+ self.blocks.append(block)
1385
+ return write_path
1386
+
1387
+ def __call__(
1388
+ self,
1389
+ base_path: str,
1390
+ *,
1391
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1392
+ dataset_uuid: Optional[str] = None,
1393
+ block: Optional[Block] = None,
1394
+ block_index: Optional[int] = None,
1395
+ file_format: Optional[str] = None,
1396
+ ) -> str:
1397
+ return self._get_write_path_for_block(
1398
+ base_path,
1399
+ filesystem=filesystem,
1400
+ dataset_uuid=dataset_uuid,
1401
+ block=block,
1402
+ block_index=block_index,
1403
+ file_format=file_format,
1404
+ )
1405
+
1406
+
1407
+ def get_block_metadata_list(
1408
+ table: LocalTable,
1409
+ write_paths: List[str],
1410
+ blocks: List[Block],
1411
+ ) -> List[BlockMetadata]:
1412
+ """
1413
+ Get the block metadata for a given table.
1414
+
1415
+ Args:
1416
+ table: The local table or distributed dataset to get the block metadata for
1417
+ write_paths: The list of write paths for the table
1418
+ blocks: The list of blocks to get the metadata for
1419
+
1420
+ Returns:
1421
+ List of block metadata
1422
+ """
1423
+ block_meta_list: List[BlockMetadata] = []
1424
+ if not blocks:
1425
+ # this must be a local table - ensure it was written to only 1 file
1426
+ assert len(write_paths) == 1, (
1427
+ f"Expected table of type '{type(table)}' to be written to 1 "
1428
+ f"file, but found {len(write_paths)} files."
1429
+ )
1430
+ blocks = [table]
1431
+ for block in blocks:
1432
+ block_meta_list.append(get_block_metadata(block))
1433
+ return block_meta_list
1434
+
1435
+
1436
+ def get_block_metadata(
1437
+ table: Union[LocalTable, DistributedDataset, BlockAccessor],
1438
+ ) -> BlockMetadata:
1439
+ """
1440
+ Get the block metadata for a given table.
1441
+
1442
+ Args:
1443
+ table: The local table or distributed dataset to get the block metadata for
1444
+
1445
+ Returns:
1446
+ Block metadata
1447
+ """
1448
+ table_size = None
1449
+ table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
1450
+ if table_size_func:
1451
+ table_size = table_size_func(table)
1452
+ else:
1453
+ logger.warning(f"Unable to estimate '{type(table)}' table size.")
1454
+ if isinstance(table, BlockAccessor):
1455
+ table = table.to_block()
1456
+ return BlockMetadata(
1457
+ num_rows=get_table_length(table),
1458
+ size_bytes=table_size,
1459
+ schema=None,
1460
+ input_files=None,
1461
+ exec_stats=None,
1462
+ )
1463
+
1464
+
1465
+ def _reconstruct_manifest_entry_uri(
1466
+ manifest_entry: ManifestEntry,
1467
+ **kwargs,
1468
+ ) -> ManifestEntry:
1469
+ """
1470
+ Reconstruct the full URI for a manifest entry.
1471
+
1472
+ Args:
1473
+ manifest_entry: The manifest entry to reconstruct the URI for
1474
+ **kwargs: Additional arguments to pass to the catalog properties
1475
+
1476
+ Returns:
1477
+ Manifest entry with the reconstructed URI
1478
+ """
1479
+ # Reconstruct full URI with scheme for external readers (see GitHub issue #567)
1480
+ from deltacat.catalog import get_catalog_properties
1481
+
1482
+ # Only pass kwargs that CatalogProperties actually accepts
1483
+ catalog_kwargs = _filter_kwargs_for_catalog_properties(kwargs)
1484
+ catalog_properties = get_catalog_properties(**catalog_kwargs)
1485
+
1486
+ original_uri = manifest_entry.uri
1487
+ reconstructed_uri = catalog_properties.reconstruct_full_path(original_uri)
1488
+ if original_uri != reconstructed_uri:
1489
+ # Create a copy of the manifest entry with the reconstructed URI
1490
+ reconstructed_entry = ManifestEntry(
1491
+ uri=reconstructed_uri, url=manifest_entry.url, meta=manifest_entry.meta
1492
+ )
1493
+ return reconstructed_entry
1494
+ return manifest_entry
1495
+
1496
+
1497
+ def _filter_kwargs_for_external_readers(kwargs: Dict[str, Any]) -> Dict[str, Any]:
1498
+ """
1499
+ Filter out DeltaCAT system kwargs that external file readers don't expect.
1500
+
1501
+ Use this when passing kwargs to external libraries like PyArrow, Pandas, Polars, etc.
1502
+ This removes all DeltaCAT-specific parameters that would cause TypeErrors in external readers.
1503
+
1504
+ Args:
1505
+ kwargs: The dictionary of arguments to filter
1506
+
1507
+ Returns:
1508
+ Dictionary of arguments with DeltaCAT system kwargs removed
1509
+ """
1510
+ return {
1511
+ k: v
1512
+ for k, v in kwargs.items()
1513
+ if k
1514
+ not in [
1515
+ # DeltaCAT catalog/storage system kwargs
1516
+ "inner",
1517
+ "catalog",
1518
+ "ray_options_provider",
1519
+ "distributed_dataset_type",
1520
+ # DeltaCAT schema/reader kwargs
1521
+ "table_version_schema",
1522
+ "entry_params",
1523
+ # Daft-specific kwargs
1524
+ "io_config",
1525
+ "ray_init_options",
1526
+ # DeltaCAT processing kwargs
1527
+ "column_names",
1528
+ "include_columns",
1529
+ "file_reader_kwargs_provider",
1530
+ "file_path_column",
1531
+ "max_parallelism",
1532
+ ]
1533
+ }
1534
+
1535
+
1536
+ def _filter_kwargs_for_catalog_properties(kwargs: Dict[str, Any]) -> Dict[str, Any]:
1537
+ """
1538
+ Filter kwargs to only include those that CatalogProperties accepts.
1539
+
1540
+ Use this when calling get_catalog_properties() or CatalogProperties.__init__().
1541
+ Uses a whitelist approach - only passes known compatible parameters.
1542
+
1543
+ CatalogProperties.__init__ accepts: root, filesystem, storage
1544
+ get_catalog_properties also accepts: catalog, inner
1545
+
1546
+ Args:
1547
+ kwargs: The dictionary of arguments to filter
1548
+
1549
+ Returns:
1550
+ Dictionary containing only CatalogProperties-compatible kwargs
1551
+ """
1552
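+     # For example (illustrative): {"root": "/tmp/catalog", "max_parallelism": 4} is
+     # reduced to {"root": "/tmp/catalog"}, since only whitelisted keys pass through.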
+ return {
1553
+ k: v
1554
+ for k, v in kwargs.items()
1555
+ if k in ["root", "filesystem", "storage", "catalog", "inner"]
1556
+ }
1557
+
1558
+
1559
+ def _filter_kwargs_for_reader_functions(kwargs: Dict[str, Any]) -> Dict[str, Any]:
1560
+ """
1561
+ Filter kwargs for internal DeltaCAT reader functions that need most params but not catalog-specific ones.
1562
+
1563
+ Use this for internal DeltaCAT functions that need file reader kwargs, schema kwargs, etc.
1564
+ but should not receive catalog/storage system parameters.
1565
+ Preserves table_version_schema, entry_params, file reader kwargs, etc.
1566
+
1567
+ Args:
1568
+ kwargs: The dictionary of arguments to filter
1569
+
1570
+ Returns:
1571
+ Dictionary with catalog/storage system kwargs removed
1572
+ """
1573
+ return {
1574
+ k: v
1575
+ for k, v in kwargs.items()
1576
+ if k
1577
+ not in ["inner", "catalog", "ray_options_provider", "distributed_dataset_type"]
1578
+ }
1579
+
1580
+
1581
+ def _extract_content_metadata(
1582
+ manifest_entry: ManifestEntry,
1583
+ ) -> Tuple[ContentType, ContentEncoding, str]:
1584
+ """
1585
+ Extract content type, encoding, and path from manifest entry.
1586
+
1587
+ Args:
1588
+ manifest_entry: The manifest entry to extract the content metadata from
1589
+
1590
+ Returns:
1591
+ Tuple of content type, encoding, and path
1592
+ """
1593
+ content_type = manifest_entry.meta.content_type
1594
+ assert content_type, f"Unknown content type for manifest entry: {manifest_entry}"
1595
+ content_type = ContentType(content_type)
1596
+
1597
+ content_encoding = manifest_entry.meta.content_encoding
1598
+ assert (
1599
+ content_encoding
1600
+ ), f"Unknown content encoding for manifest entry: {manifest_entry}"
1601
+ content_encoding = ContentEncoding(content_encoding)
1602
+
1603
+ path = manifest_entry.uri
1604
+ if path is None:
1605
+ path = manifest_entry.url
1606
+
1607
+ return content_type, content_encoding, path
1608
+
1609
+
1610
+ def _extract_partial_download_params(
1611
+ manifest_entry: ManifestEntry,
1612
+ ) -> Optional[PartialFileDownloadParams]:
1613
+ """
1614
+ Extract partial file download parameters from manifest entry.
1615
+
1616
+ Args:
1617
+ manifest_entry: The manifest entry to extract the partial file download parameters from
1618
+
1619
+ Returns:
1620
+ Partial file download parameters
1621
+ """
1622
+ if not manifest_entry.meta or not manifest_entry.meta.content_type_parameters:
1623
+ return None
1624
+
1625
+ for type_params in manifest_entry.meta.content_type_parameters:
1626
+ if isinstance(type_params, PartialFileDownloadParams):
1627
+ return type_params
1628
+ return None
1629
+
1630
+
1631
+ def _create_retry_wrapper():
1632
+ """
1633
+ Create a standardized Tenacity Retrying wrapper for file operations.
1634
+
1635
+ Returns:
1636
+ Tenacity Retrying wrapper
1637
+ """
1638
+ return Retrying(
1639
+ wait=wait_random_exponential(multiplier=1, max=60),
1640
+ stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
1641
+ retry=retry_if_exception_type(RetryableError),
1642
+ )
1643
+
1644
+
1645
+ def _remove_file_path_column(
1646
+ include_columns: Optional[List[str]],
1647
+ file_path_column: Optional[str],
1648
+ ) -> Optional[List[str]]:
1649
+ """Remove the file path system column from the include_columns list.
1650
+
1651
+ Args:
1652
+ include_columns: The list of columns to include in a selection
1653
+ file_path_column: Optional file path system column name to remove from the selection
1654
+
1655
+ Returns:
1656
+ List of columns to include without the file path system column
1657
+ """
1658
+ if file_path_column and include_columns:
1659
+ return [col for col in include_columns if col != file_path_column]
1660
+ return include_columns
1661
+
1662
+
1663
+ def _prepare_download_arguments(
1664
+ table_type: DatasetType,
1665
+ column_names: Optional[List[str]],
1666
+ include_columns: Optional[List[str]],
1667
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider],
1668
+ file_path_column: Optional[str],
1669
+ **kwargs,
1670
+ ) -> Dict[str, Any]:
1671
+ """Prepare standardized arguments for download operations.
1672
+
1673
+ Args:
1674
+ table_type: The type of table to download
1675
+ column_names: The list of column names in the table
1676
+ include_columns: The list of columns to include in the selection
1677
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs
1678
+ file_path_column: The file path system column name
1679
+ **kwargs: Additional arguments to pass to the file reader
1680
+
1681
+ Returns:
1682
+ Dictionary of arguments for the download operation
1683
+ """
1684
+ reader_kwargs = _filter_kwargs_for_external_readers(kwargs)
1685
+ processed_include_columns = _remove_file_path_column(
1686
+ include_columns, file_path_column
1687
+ )
1688
+
1689
+ return {
1690
+ "table_type": table_type,
1691
+ "column_names": column_names,
1692
+ "include_columns": processed_include_columns,
1693
+ "file_reader_kwargs_provider": file_reader_kwargs_provider,
1694
+ "file_path_column": file_path_column,
1695
+ **reader_kwargs,
1696
+ }
1697
+
1698
+
1699
+ def _handle_retryable_error(e: Exception, path: str, operation: str, error_class: type):
1700
+ """Handle retryable errors with standardized error message."""
1701
+ raise error_class(
1702
+ f"Retry {operation} for: {path} after receiving {type(e).__name__}: {e}"
1703
+ ) from e
1704
+
1705
+
1706
+ def _handle_non_retryable_error(
1707
+ e: Exception, path: str, operation: str, error_class: type, extra_context: str = ""
1708
+ ):
1709
+ """Handle non-retryable errors with logging and standardized error message."""
1710
+ context = f" {extra_context}" if extra_context else ""
1711
+ logger.warning(
1712
+ f"{operation.title()} has failed for {path}{context}. Error: {e}",
1713
+ exc_info=True,
1714
+ )
1715
+ raise error_class(
1716
+ f"{operation.title()} has failed for {path}{context}: Error: {e}"
1717
+ ) from e
1718
+
1719
+
1720
+ def from_manifest_table(
1721
+ manifest_table: Union[LocalDataset, DistributedDataset],
1722
+ dataset_type: DatasetType = DatasetType.DAFT,
1723
+ schema: Optional[pa.Schema] = None,
1724
+ **kwargs,
1725
+ ) -> Dataset:
1726
+ """
1727
+ Read a manifest table (containing file paths and metadata) and download the actual data.
1728
+
1729
+ This utility function takes the output from a schemaless table read (which returns
1730
+ manifest entries instead of data) and downloads the actual file contents.
1731
+
1732
+ Args:
1733
+ manifest_table: Dataset containing manifest entries with file paths and metadata
1734
+ dataset_type: The type of dataset to return (DAFT, RAY_DATASET, PYARROW, etc.)
1735
+ schema: Optional PyArrow schema to enforce consistent column names across formats
1736
+ **kwargs: Additional arguments forwarded to download functions
1737
+
1738
+ Returns:
1739
+ Dataset containing the actual file contents
1740
+ """
1741
+ # Convert the manifest table to pandas for easier processing
1742
+ # TODO(pdames): Iterate over each input manifest table in its native format
1743
+ manifest_df = to_pandas(manifest_table)
1744
+
1745
+ # Reconstruct ManifestEntry objects from the manifest data
1746
+ manifest_entries = []
1747
+ for _, row in manifest_df.iterrows():
1748
+ # Create ManifestMeta from the row data
1749
+ meta = ManifestMeta.of(
1750
+ content_length=row.get("meta_content_length"),
1751
+ record_count=row.get("meta_record_count"),
1752
+ content_type=row.get("meta_content_type"),
1753
+ content_encoding=row.get("meta_content_encoding"),
1754
+ )
1755
+
1756
+ # Create ManifestEntry
1757
+ entry = ManifestEntry.of(
1758
+ url=row["path"],
1759
+ meta=meta,
1760
+ mandatory=row.get("mandatory", True),
1761
+ uuid=row.get("id"),
1762
+ )
1763
+ manifest_entries.append(entry)
1764
+
1765
+ # Create a new Manifest from the entries
1766
+ reconstructed_manifest = Manifest.of(entries=manifest_entries)
1767
+
1768
+ # Add schema to kwargs if provided
1769
+ if schema is not None:
1770
+ kwargs["table_version_schema"] = schema
1771
+
1772
+ # Choose the appropriate download function based on dataset type
1773
+ if dataset_type in DatasetType.distributed():
1774
+ # Use distributed download function
1775
+ # Map DatasetType to DistributedDatasetType
1776
+ distributed_type_map = {
1777
+ DatasetType.DAFT: DistributedDatasetType.DAFT,
1778
+ DatasetType.RAY_DATASET: DistributedDatasetType.RAY_DATASET,
1779
+ }
1780
+ distributed_dataset_type = distributed_type_map.get(dataset_type)
1781
+ if distributed_dataset_type is None:
1782
+ raise ValueError(f"Unsupported distributed dataset type: {dataset_type}")
1783
+
1784
+ return download_manifest_entries_distributed(
1785
+ manifest=reconstructed_manifest,
1786
+ distributed_dataset_type=distributed_dataset_type,
1787
+ **kwargs,
1788
+ )
1789
+ else:
1790
+ # Use local download function
1791
+ return download_manifest_entries(
1792
+ manifest=reconstructed_manifest,
1793
+ table_type=dataset_type,
1794
+ **kwargs,
1795
+ )
1796
+
1797
+
1798
+ def download_manifest_entries(
1799
+ manifest: Manifest,
1800
+ table_type: DatasetType = DatasetType.PYARROW,
1801
+ max_parallelism: Optional[int] = 1,
1802
+ column_names: Optional[List[str]] = None,
1803
+ include_columns: Optional[List[str]] = None,
1804
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1805
+ file_path_column: Optional[str] = None,
1806
+ **kwargs,
1807
+ ) -> LocalDataset:
1808
+ """Download all entries in the manifest.
1809
+
1810
+ Args:
1811
+ manifest: The manifest containing the entries to download
1812
+ table_type: Dataset type to load the entries into
1813
+ max_parallelism: Maximum parallelism to use
1814
+ column_names: The list of column names in the table
1815
+ include_columns: The list of columns to include in the selection
1816
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
1817
+ (e.g., to pass in a custom schema for a Parquet file)
1818
+ file_path_column: Optional file path system column name
1819
+ **kwargs: Additional arguments to pass to the file reader
1820
+
1821
+ Returns:
1822
+ Local dataset
1823
+ """
1824
+ if max_parallelism and max_parallelism <= 1:
1825
+ return _download_manifest_entries(
1826
+ manifest,
1827
+ table_type,
1828
+ column_names,
1829
+ include_columns,
1830
+ file_reader_kwargs_provider,
1831
+ file_path_column,
1832
+ **kwargs,
1833
+ )
1834
+ else:
1835
+ return _download_manifest_entries_parallel(
1836
+ manifest,
1837
+ table_type,
1838
+ max_parallelism,
1839
+ column_names,
1840
+ include_columns,
1841
+ file_reader_kwargs_provider,
1842
+ file_path_column,
1843
+ **kwargs,
1844
+ )
1845
+
1846
+
1847
+ def _download_manifest_entries(
1848
+ manifest: Manifest,
1849
+ table_type: DatasetType = DatasetType.PYARROW,
1850
+ column_names: Optional[List[str]] = None,
1851
+ include_columns: Optional[List[str]] = None,
1852
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1853
+ file_path_column: Optional[str] = None,
1854
+ **kwargs,
1855
+ ) -> LocalDataset:
1856
+ """Download all entries in the manifest.
1857
+
1858
+ Args:
1859
+ manifest: The manifest containing the entries to download
1860
+ table_type: Dataset type to load the entries into
1861
+ column_names: The list of column names in the table
1862
+ include_columns: The list of columns to include in the selection
1863
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
1864
+ (e.g., to pass in a custom schema for a Parquet file)
1865
+ file_path_column: Optional file path system column name
1866
+ **kwargs: Additional arguments to pass to the file reader
1867
+
1868
+ Returns:
1869
+ Local dataset
1870
+ """
1871
+ download_args = _prepare_download_arguments(
1872
+ table_type,
1873
+ column_names,
1874
+ include_columns,
1875
+ file_reader_kwargs_provider,
1876
+ file_path_column,
1877
+ **kwargs,
1878
+ )
1879
+ result = []
1880
+ for e in manifest.entries:
1881
+ manifest_entry = _reconstruct_manifest_entry_uri(e, **kwargs)
1882
+ result.append(
1883
+ download_manifest_entry(manifest_entry=manifest_entry, **download_args)
1884
+ )
1885
+
1886
+ return result
1887
+
1888
+
1889
+ @ray.remote
1890
+ def download_manifest_entry_ray(
1891
+ manifest_entry: ManifestEntry,
1892
+ table_type: DatasetType = DatasetType.PYARROW,
1893
+ column_names: Optional[List[str]] = None,
1894
+ include_columns: Optional[List[str]] = None,
1895
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1896
+ content_type: Optional[ContentType] = None,
1897
+ content_encoding: Optional[ContentEncoding] = None,
1898
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1899
+ file_path_column: Optional[str] = None,
1900
+ **kwargs,
1901
+ ) -> LocalTable:
1902
+ """
1903
+ Ray remote function for downloading manifest entries.
1904
+
1905
+ Args:
1906
+ manifest_entry: The manifest entry to download
1907
+ table_type: Dataset type to load the entry into
1908
+ column_names: The list of column names in the table
1909
+ include_columns: The list of columns to include in the selection
1910
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
1911
+ (e.g., to pass in a custom schema for a Parquet file)
1912
+ content_type: Optional content type of the file
1913
+ content_encoding: Optional content encoding of the file
1914
+ filesystem: Optional PyArrow filesystem to use to read the file
1915
+ file_path_column: Optional file path system column name
1916
+ **kwargs: Additional arguments to pass to the file reader
1917
+
1918
+ Returns:
1919
+ Local table
1920
+ """
1921
+ # Make sure we normalize the table type to PyArrow to provide the correct
1922
+ # input type to from_arrow_refs
1923
+ effective_table_type = table_type
1924
+ if table_type == DatasetType.RAY_DATASET:
1925
+ effective_table_type = DatasetType.PYARROW
1926
+
1927
+ # Call the regular download function
1928
+ result = download_manifest_entry(
1929
+ manifest_entry=manifest_entry,
1930
+ table_type=effective_table_type,
1931
+ column_names=column_names,
1932
+ include_columns=include_columns,
1933
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
1934
+ content_type=content_type,
1935
+ content_encoding=content_encoding,
1936
+ filesystem=filesystem,
1937
+ file_path_column=file_path_column,
1938
+ **kwargs,
1939
+ )
1940
+
1941
+ # Convert Polars DataFrame to Arrow Table for Ray dataset compatibility
1942
+ if isinstance(result, pl.DataFrame):
1943
+ result = result.to_arrow()
1944
+
1945
+ # Cast string_view columns to string to avoid cloudpickle issues
1946
+ if isinstance(result, pa.Table):
1947
+ result = _cast_string_view_to_string(result)
1948
+
1949
+ return result
1950
+
1951
+
1952
+ def download_manifest_entries_distributed(
1953
+ manifest: Manifest,
1954
+ table_type: DatasetType = DatasetType.PYARROW,
1955
+ max_parallelism: Optional[int] = 1000,
1956
+ column_names: Optional[List[str]] = None,
1957
+ include_columns: Optional[List[str]] = None,
1958
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1959
+ ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
1960
+ distributed_dataset_type: Optional[
1961
+ DistributedDatasetType
1962
+ ] = DistributedDatasetType.RAY_DATASET,
1963
+ file_path_column: Optional[str] = None,
1964
+ **kwargs,
1965
+ ) -> DistributedDataset:
1966
+ """Download all entries in the manifest using the given distributed dataset type.
1967
+
1968
+ Args:
1969
+ manifest: The manifest containing the entries to download
1970
+ table_type: Dataset type to load the entries into
1971
+ max_parallelism: Maximum parallelism to use
1972
+ column_names: The list of column names in the table
1973
+ include_columns: The list of columns to include in the selection
1974
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
1975
+ (e.g., to pass in a custom schema for a Parquet file)
1976
+ ray_options_provider: Optional provider of Ray options
1977
+ distributed_dataset_type: Optional distributed dataset type to use
1978
+ file_path_column: Optional file path system column name
1979
+ **kwargs: Additional arguments to pass to the file reader
1980
+
1981
+ Returns:
1982
+ Distributed dataset
1983
+ """
1984
+ params = {
1985
+ "manifest": manifest,
1986
+ "table_type": table_type,
1987
+ "max_parallelism": max_parallelism,
1988
+ "column_names": column_names,
1989
+ "include_columns": include_columns,
1990
+ "file_reader_kwargs_provider": file_reader_kwargs_provider,
1991
+ "ray_options_provider": ray_options_provider,
1992
+ "file_path_column": file_path_column,
1993
+ **kwargs,
1994
+ }
1995
+
1996
+ if (
1997
+ distributed_dataset_type
1998
+ and distributed_dataset_type.value == DistributedDatasetType.RAY_DATASET.value
1999
+ ):
2000
+ result = _download_manifest_entries_ray_data_distributed(**params)
2001
+ return result
2002
+ elif distributed_dataset_type is not None:
2003
+ params["distributed_dataset_type"] = distributed_dataset_type
2004
+ return _download_manifest_entries_all_dataset_distributed(**params)
2005
+ else:
2006
+ raise ValueError(
2007
+ f"Distributed dataset type {distributed_dataset_type} not supported."
2008
+ )
2009
+
2010
+
2011
+ def _cast_string_view_to_string(table: pa.Table) -> pa.Table:
2012
+ """
2013
+ Cast any string_view columns to string type for Ray dataset compatibility.
2014
+
2015
+ This addresses compatibility issues where Ray datasets may have trouble with
2016
+ string_view columns written by Polars to Feather.
2017
+
2018
+ Args:
2019
+ table: PyArrow table that may contain string_view columns
2020
+
2021
+ Returns:
2022
+ PyArrow table with string_view columns cast to string type
2023
+ """
2024
+ if not isinstance(table, pa.Table):
2025
+ return table
2026
+
2027
+ schema = table.schema
2028
+ has_string_view = False
2029
+
2030
+ # Check if any columns are string_view
2031
+ for field in schema:
2032
+ if pa.types.is_string_view(field.type):
2033
+ has_string_view = True
2034
+ break
2035
+
2036
+ if not has_string_view:
2037
+ return table
2038
+
2039
+ # Convert to pandas and back to normalize string types
2040
+ # This is a workaround since direct casting from string_view to string is not supported
2041
+ try:
2042
+ pandas_df = table.to_pandas()
2043
+ # Convert back to PyArrow table, which should use regular string type
2044
+ return pa.Table.from_pandas(pandas_df, preserve_index=False)
2045
+ except Exception:
2046
+ # If pandas conversion fails, return original table
2047
+ return table
2048
+
2049
+
2050
+ def _download_manifest_entries_ray_data_distributed(
2051
+ manifest: Manifest,
2052
+ table_type: DatasetType = DatasetType.PYARROW,
2053
+ max_parallelism: Optional[int] = 1000,
2054
+ column_names: Optional[List[str]] = None,
2055
+ include_columns: Optional[List[str]] = None,
2056
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
2057
+ ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
2058
+ file_path_column: Optional[str] = None,
2059
+ **kwargs,
2060
+ ) -> DistributedDataset:
2061
+ """Download all entries in the manifest into a Ray dataset.
2062
+
2063
+ Args:
2064
+ manifest: The manifest containing the entries to download
2065
+ table_type: Dataset type to load the entries into
2066
+ max_parallelism: Maximum parallelism to use
2067
+ column_names: The list of column names in the table
2068
+ include_columns: The list of columns to include in the selection
2069
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
2070
+ (e.g., to pass in a custom schema for a Parquet file)
2071
+ ray_options_provider: Optional provider of Ray options
2072
+ file_path_column: Optional file path system column name
2073
+ **kwargs: Additional arguments to pass to the file reader
2074
+
2075
+ Returns:
2076
+ Ray dataset
2077
+ """
2078
+ table_pending_ids = []
2079
+ manifest_entries = manifest.entries
2080
+
2081
+ if manifest_entries:
2082
+ table_pending_ids = invoke_parallel(
2083
+ manifest_entries,
2084
+ download_manifest_entry_ray,
2085
+ table_type,
2086
+ column_names,
2087
+ include_columns,
2088
+ file_reader_kwargs_provider,
2089
+ max_parallelism=max_parallelism,
2090
+ options_provider=ray_options_provider,
2091
+ file_path_column=file_path_column,
2092
+ **kwargs, # Pass through kwargs like include_paths
2093
+ )
2094
+
2095
+ create_func = _get_table_type_function(
2096
+ table_type, TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS, "dataset create"
2097
+ )
2098
+ return create_func(table_pending_ids)
2099
+
2100
+
2101
+ def _group_manifest_uris_by_content_type(
2102
+ manifest: Manifest, **kwargs
2103
+ ) -> Dict[Tuple[str, str], List[str]]:
2104
+ """
2105
+ Group manifest URIs by content type and content encoding.
2106
+
2107
+ Args:
2108
+ manifest: The manifest containing the entries to group by content type
2109
+ **kwargs: Additional arguments to pass to the catalog properties
2110
+
2111
+ Returns:
2112
+ Dictionary mapping (content_type, content_encoding) tuples to lists of URIs
2113
+ """
2114
+ from deltacat.catalog import get_catalog_properties
2115
+
2116
+ catalog_properties = get_catalog_properties(**kwargs)
2117
+
2118
+ uris_by_type = {}
2119
+
2120
+ for entry in manifest.entries or []:
2121
+ content_type = entry.meta.content_type
2122
+ content_encoding = entry.meta.content_encoding
2123
+ key = (content_type, content_encoding)
2124
+
2125
+ if key not in uris_by_type:
2126
+ uris_by_type[key] = []
2127
+
2128
+ full_uri = catalog_properties.reconstruct_full_path(entry.uri)
2129
+ uris_by_type[key].append(full_uri)
2130
+
2131
+ return uris_by_type
2132
+
2133
+
2134
+ def _download_manifest_entries_all_dataset_distributed(
2135
+ manifest: Manifest,
2136
+ table_type: DatasetType = DatasetType.PYARROW,
2137
+ max_parallelism: Optional[int] = 1000,
2138
+ column_names: Optional[List[str]] = None,
2139
+ include_columns: Optional[List[str]] = None,
2140
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
2141
+ ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
2142
+     distributed_dataset_type: Optional[DistributedDatasetType] = DistributedDatasetType.DAFT,
2143
+ file_path_column: Optional[str] = None,
2144
+ **kwargs,
2145
+ ) -> DistributedDataset:
2146
+ """Download all entries in the manifest into a distributed dataset other than Ray Dataset.
2147
+
2148
+ Args:
2149
+ manifest: The manifest containing the entries to download
2150
+ table_type: Dataset type to load the entries into
2151
+ max_parallelism: Maximum parallelism to use
2152
+ column_names: The list of column names in the table
2153
+ include_columns: The list of columns to include in the selection
2154
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
2155
+ (e.g., to pass in a custom schema for a Parquet file)
2156
+ ray_options_provider: Optional provider of Ray options
2157
+ distributed_dataset_type: Optional distributed dataset type to use
2158
+ file_path_column: Optional file path system column name
2159
+ **kwargs: Additional arguments to pass to the file reader
2160
+
2161
+ Returns:
2162
+ Distributed dataset
2163
+ """
2164
+ # Group manifest entries by content type instead of validating consistency
2165
+ # Filter out table_version_schema from kwargs passed to catalog properties
2166
+ filtered_kwargs = _filter_kwargs_for_catalog_properties(kwargs)
2167
+ uris_by_content_type = _group_manifest_uris_by_content_type(
2168
+ manifest, **filtered_kwargs
2169
+ )
2170
+
2171
+ # If only one content type, use the original single-reader logic
2172
+ if len(uris_by_content_type) == 1:
2173
+ content_type, content_encoding = next(iter(uris_by_content_type.keys()))
2174
+ uris = next(iter(uris_by_content_type.values()))
2175
+
2176
+ # Keep table_version_schema for the reader, but filter other system kwargs
2177
+ reader_kwargs = _filter_kwargs_for_reader_functions(kwargs)
2178
+
2179
+ try:
2180
+ reader_func = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[
2181
+ distributed_dataset_type.value
2182
+ ]
2183
+ except KeyError:
2184
+ raise ValueError(
2185
+ f"Unsupported distributed dataset type={distributed_dataset_type}. "
2186
+ f"Supported types: {list(DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC.keys())}"
2187
+ )
2188
+
2189
+ return reader_func(
2190
+ uris=uris,
2191
+ content_type=content_type,
2192
+ content_encoding=content_encoding,
2193
+ column_names=column_names,
2194
+ include_columns=include_columns,
2195
+ read_func_kwargs_provider=file_reader_kwargs_provider,
2196
+ ray_options_provider=ray_options_provider,
2197
+ file_path_column=file_path_column,
2198
+ **reader_kwargs,
2199
+ )
2200
+
2201
+ # Multiple content types - read each group and union them (only for Daft)
2202
+ if distributed_dataset_type != DistributedDatasetType.DAFT:
2203
+ raise ValueError(
2204
+ f"Mixed content types are only supported for Daft datasets. "
2205
+ f"Got {len(uris_by_content_type)} different content types with dataset type {distributed_dataset_type}"
2206
+ )
2207
+
2208
+ # Keep table_version_schema for the reader, but filter other system kwargs
2209
+ reader_kwargs = _filter_kwargs_for_reader_functions(kwargs)
2210
+
2211
+ try:
2212
+ reader_func = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[
2213
+ distributed_dataset_type.value
2214
+ ]
2215
+ except KeyError:
2216
+ raise ValueError(
2217
+ f"Unsupported distributed dataset type={distributed_dataset_type}. "
2218
+ f"Supported types: {list(DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC.keys())}"
2219
+ )
2220
+
2221
+ # Read each content type group into a separate DataFrame
2222
+ dataframes = []
2223
+ for (content_type, content_encoding), uris in uris_by_content_type.items():
2224
+ df = reader_func(
2225
+ uris=uris,
2226
+ content_type=content_type,
2227
+ content_encoding=content_encoding,
2228
+ column_names=column_names,
2229
+ include_columns=include_columns,
2230
+ read_func_kwargs_provider=file_reader_kwargs_provider,
2231
+ ray_options_provider=ray_options_provider,
2232
+ file_path_column=file_path_column,
2233
+ **reader_kwargs,
2234
+ )
2235
+ dataframes.append(df)
2236
+
2237
+ # Union all DataFrames using Daft's union_all
2238
+ if len(dataframes) == 1:
2239
+ return dataframes[0]
2240
+
2241
+ result = dataframes[0]
2242
+ for df in dataframes[1:]:
2243
+ result = result.union_all(df)
2244
+
2245
+ return result
2246
+
2247
+
2248
+ def _download_manifest_entries_parallel(
2249
+ manifest: Manifest,
2250
+ table_type: DatasetType = DatasetType.PYARROW,
2251
+ max_parallelism: Optional[int] = None,
2252
+ column_names: Optional[List[str]] = None,
2253
+ include_columns: Optional[List[str]] = None,
2254
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
2255
+ file_path_column: Optional[str] = None,
2256
+ **kwargs,
2257
+ ) -> LocalDataset:
2258
+ """Download all entries in the manifest into a local dataset using multiprocessing.
2259
+
2260
+ Args:
2261
+ manifest: The manifest containing the entries to download
2262
+ table_type: Dataset type to load the entries into
2263
+ max_parallelism: Maximum parallel processes to use for entry downloads
2264
+ column_names: The list of column names in the table
2265
+ include_columns: The list of columns to include in the selection
2266
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
2267
+ (e.g., to pass in a custom schema for a Parquet file)
2268
+ file_path_column: Optional file path system column name
2269
+ **kwargs: Additional arguments to pass to the file reader
2270
+
2271
+ Returns:
2272
+ Local dataset
2273
+ """
2274
+ download_args = _prepare_download_arguments(
2275
+ table_type,
2276
+ column_names,
2277
+ include_columns,
2278
+ file_reader_kwargs_provider,
2279
+ file_path_column,
2280
+ **kwargs,
2281
+ )
2282
+
2283
+ entries_to_process = []
2284
+ for e in manifest.entries:
2285
+ manifest_entry = _reconstruct_manifest_entry_uri(e, **kwargs)
2286
+ entries_to_process.append(manifest_entry)
2287
+
2288
+     downloader = partial(download_manifest_entry, **download_args)
2289
+     # Use the pool as a context manager so worker processes are reliably cleaned up.
2290
+     with multiprocessing.Pool(max_parallelism) as pool:
2291
+         tables = pool.map(downloader, entries_to_process)
2292
+     return tables
2295
+
2296
+
2297
+ def download_manifest_entry(
2298
+ manifest_entry: ManifestEntry,
2299
+ table_type: DatasetType = DatasetType.PYARROW,
2300
+ column_names: Optional[List[str]] = None,
2301
+ include_columns: Optional[List[str]] = None,
2302
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
2303
+ content_type: Optional[ContentType] = None,
2304
+ content_encoding: Optional[ContentEncoding] = None,
2305
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
2306
+ file_path_column: Optional[str] = None,
2307
+ **kwargs,
2308
+ ) -> LocalTable:
2309
+ """Download a single entry in the manifest into a local table.
2310
+
2311
+ Args:
2312
+ manifest_entry: The manifest entry to download
2313
+ table_type: Dataset type to load the entry into
2314
+ column_names: The list of column names in the table
2315
+ include_columns: The list of columns to include in the selection
2316
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
2317
+ (e.g., to pass in a custom schema for a Parquet file)
2318
+ content_type: Optional content type of the file
2319
+ content_encoding: Optional content encoding of the file
2320
+ filesystem: Optional PyArrow filesystem to use to read the file
2321
+ file_path_column: Optional file path system column name
2322
+ **kwargs: Additional arguments to pass to the file reader
2323
+
2324
+ Returns:
2325
+ Local table
2326
+ """
2327
+ # Extract manifest metadata
2328
+ (
2329
+ extracted_content_type,
2330
+ extracted_content_encoding,
2331
+ path,
2332
+ ) = _extract_content_metadata(manifest_entry)
2333
+ content_type = content_type or extracted_content_type
2334
+ content_encoding = content_encoding or extracted_content_encoding
2335
+
2336
+ # Extract partial download parameters
2337
+ partial_file_download_params = _extract_partial_download_params(manifest_entry)
2338
+
2339
+ # Filter kwargs and process file path column
2340
+ reader_kwargs = _filter_kwargs_for_external_readers(kwargs)
2341
+ processed_include_columns = _remove_file_path_column(
2342
+ include_columns, file_path_column
2343
+ )
2344
+
2345
+ # Create retry wrapper and read file
2346
+ retrying = _create_retry_wrapper()
2347
+ table = retrying(
2348
+ read_file,
2349
+ path,
2350
+ content_type,
2351
+ content_encoding,
2352
+ table_type,
2353
+ column_names,
2354
+ processed_include_columns,
2355
+ file_reader_kwargs_provider,
2356
+ partial_file_download_params,
2357
+ filesystem,
2358
+ **reader_kwargs,
2359
+ )
2360
+
2361
+ # Add file path column if requested
2362
+ if file_path_column:
2363
+ if isinstance(table, papq.ParquetFile):
2364
+ logger.warning(
2365
+ f"Skipping file_path_column '{file_path_column}' for lazily materialized ParquetFile. "
2366
+ f"File path information can be retrieved from the ParquetFile object's metadata. "
2367
+ f"Use read_as=DatasetType.PYARROW to materialize with file path column."
2368
+ )
2369
+ else:
2370
+ table = append_column_to_table(table, file_path_column, manifest_entry.uri)
2371
+
2372
+ return table
2373
+
2374
+
2375
+ @categorize_errors
2376
+ def read_file(
2377
+ path: str,
2378
+ content_type: ContentType,
2379
+ content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
2380
+ table_type: DatasetType = DatasetType.PYARROW,
2381
+ column_names: Optional[List[str]] = None,
2382
+ include_columns: Optional[List[str]] = None,
2383
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
2384
+ partial_file_download_params: Optional[PartialFileDownloadParams] = None,
2385
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
2386
+ **kwargs,
2387
+ ) -> LocalTable:
2388
+ """Read a file into a local table.
2389
+
2390
+ Args:
2391
+ path: The path to the file to read
2392
+ content_type: The content type of the file
2393
+ content_encoding: The content encoding of the file
2394
+ table_type: Dataset type to load the file into
2395
+ column_names: The list of column names in the table
2396
+ include_columns: The list of columns to include in the selection
2397
+ file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
2398
+ (e.g., to pass in a custom schema for a Parquet file)
2399
+ partial_file_download_params: Optional partial file download parameters
2400
+ filesystem: Optional PyArrow filesystem to use to read the file
2401
+ **kwargs: Additional arguments to pass to the file reader
2402
+
2403
+ Returns:
2404
+ Local table
2405
+ """
2406
+ reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
2407
+ try:
2408
+ table = reader(
2409
+ path,
2410
+ content_type.value,
2411
+ content_encoding.value,
2412
+ filesystem,
2413
+ column_names,
2414
+ include_columns,
2415
+ file_reader_kwargs_provider,
2416
+ partial_file_download_params,
2417
+ **kwargs,
2418
+ )
2419
+ return table
2420
+ except RETRYABLE_TRANSIENT_ERRORS as e:
2421
+ _handle_retryable_error(e, path, "download", RetryableDownloadTableError)
2422
+ except BaseException as e:
2423
+ _handle_non_retryable_error(
2424
+ e,
2425
+ path,
2426
+ "read",
2427
+ NonRetryableDownloadTableError,
2428
+ f"and content_type={content_type} and encoding={content_encoding}",
2429
+ )