deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,305 @@
1
+ from __future__ import annotations
2
+
3
+ import threading
4
+ from threading import Thread
5
+ from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
6
+
7
+ from pyarrow import RecordBatch, Table
8
+ from deltacat.storage.model.partition import PartitionLocator
9
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
10
+ ManifestIO,
11
+ DeltacatManifestIO,
12
+ )
13
+
14
+ from deltacat.experimental.storage.rivulet import Schema
15
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstWriter
16
+ from deltacat.experimental.storage.rivulet.serializer import (
17
+ MEMTABLE_DATA,
18
+ DataSerializer,
19
+ )
20
+ from deltacat.experimental.storage.rivulet.serializer_factory import (
21
+ DataSerializerFactory,
22
+ )
23
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import (
24
+ DatasetWriter,
25
+ DATA,
26
+ )
27
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTWriter
28
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
29
+
30
+ INPUT_ROW = TypeVar("INPUT_ROW")
31
+
32
+
33
class Memtable(Protocol[INPUT_ROW]):
    """
    Structural interface for an in-memory buffer that accepts records of type
    INPUT_ROW and hands them back in sorted order.
    """

    def add_record(self, record: INPUT_ROW) -> bool:
        """
        Append one record to the memtable.

        Args:
            record: The INPUT_ROW value to buffer.

        Returns:
            bool: True when the memtable is full once the record has been
                added, False otherwise.
        """
        ...

    def get_sorted_records(self, schema: Schema) -> MEMTABLE_DATA:
        """
        Return everything buffered in the memtable, in sorted order.

        Args:
            schema: Schema made available to the implementation while it
                assembles the sorted output.

        Returns:
            MEMTABLE_DATA: The sorted contents of the memtable.
        """
        ...
58
+
59
+
60
class DictMemTable(Memtable[Dict[str, Any]]):
    """
    In-memory buffer of dict records, sorted before the records are written to file.

    TODO future improvements:
    1. build b+ tree of record indexes on insertion
       OR If we end up using arrow as intermediate format, we can use
       pyarrow compute sort
    2. Probably we will re-write in rust
    """

    def __init__(self, merge_key: str):
        # Key looked up in every record to order the buffered rows.
        self.merge_key = merge_key
        # Number of records appended so far.
        self.row_size = 0

        # Guards both the record list and the row counter.
        self.lock = threading.Lock()
        self._records: List[Dict[str, Any]] = []

    def add_record(self, record: Dict[str, Any]):
        """
        Buffer one dict record.

        :param record: the row to append
        :return: True once the number of buffered rows has reached
            MemtableDatasetWriter.MAX_ROW_SIZE, False otherwise
        """
        with self.lock:
            self._records.append(record)
            self.row_size += 1
            return self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE

    def get_sorted_records(self, schema: Schema) -> List[Dict[str, Any]]:
        """
        Sort the buffered records in place by the merge key and return them.

        :param schema: not used by this implementation
        :return: the internal list of records, ordered by merge key
        """
        with self.lock:
            self._records.sort(key=lambda row: row[self.merge_key])
            return self._records
96
+
97
+
98
class RecordBatchMemTable(Memtable[RecordBatch]):
    """
    Memtable that buffers whole pyarrow RecordBatches.

    Note that this will not respect max row size.
    """

    def __init__(self, merge_key: str):
        # Column name handed to Table.sort_by when the batches are combined.
        self.merge_key = merge_key
        # Total number of rows across all buffered batches.
        self.row_size = 0

        self.lock = threading.Lock()
        # list of full record batches in memtable
        self._records_batches: List[RecordBatch] = []

    def add_record(self, record: RecordBatch):
        """
        Buffer one complete record batch.

        :param record: the batch to append; all of its rows are counted
        :return: True once the accumulated row count has reached
            MemtableDatasetWriter.MAX_ROW_SIZE, False otherwise
        """
        with self.lock:
            self._records_batches.append(record)
            self.row_size += record.num_rows
            return self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE

    def get_sorted_records(self, schema: Schema) -> Table:
        """
        Combine the buffered batches into one table sorted by the merge key.

        :param schema: converted with to_pyarrow() and passed to pyarrow
        :return: a pyarrow Table sorted by the merge key
        """
        with self.lock:
            # Note that we are providing schema so that pyarrow does not infer it
            combined = Table.from_batches(self._records_batches, schema.to_pyarrow())
            return combined.sort_by(self.merge_key)
130
+
131
+
132
class MemtableDatasetWriter(DatasetWriter):
    """
    Buffers data into rotating memtables. When a memtable reaches a certain size, it is flushed to disk and a new memtable is allocated

    Uses DataWriter which will be format specific for writing data
    Uses MetadataWriter for writing metadata

    TODO Future Improvements
    1. Maybe we should re-write this class in Rust (pending testing)
    """

    # Note that this max row size is not respected when PyArrow RecordBatches are used
    # In that case, the entire record batch is written within one memtable even if the row count overflows
    MAX_ROW_SIZE = 1000000

    def __init__(
        self,
        file_provider: FileProvider,
        schema: Schema,
        locator: PartitionLocator,
        file_format: str | None = None,
        sst_writer: SSTWriter | None = None,
        manifest_io: ManifestIO | None = None,
    ):
        """
        :param file_provider: supplies output files; its ``uri`` is used for the default manifest IO
        :param schema: schema of the records being written; also supplies the merge key
        :param locator: partition locator passed to the default manifest IO
        :param file_format: forwarded to DataSerializerFactory.get_serializer
        :param sst_writer: SST writer to use; defaults to JsonSstWriter()
        :param manifest_io: manifest writer to use; defaults to DeltacatManifestIO
        """
        if sst_writer is None:
            sst_writer = JsonSstWriter()
        if manifest_io is None:
            manifest_io = DeltacatManifestIO(file_provider.uri, locator)

        self.schema = schema

        self.file_provider = file_provider
        self.data_serializer: DataSerializer = DataSerializerFactory.get_serializer(
            self.schema, self.file_provider, file_format
        )
        self.sst_writer = sst_writer
        self.manifest_io = manifest_io

        # Locations of the SST files written since the last manifest commit.
        self._sst_files: Set[str] = set()
        self.__curr_memtable = None
        self.__open_memtables = []
        # Guards the memtable and thread bookkeeping shared with flush threads.
        self.__rlock = threading.RLock()
        self.__open_threads: List[Thread] = []
        self._locator = locator

    def write_dict(self, record: Dict[str, Any]) -> None:
        """
        Buffer a single dict record, rotating the memtable when it is full
        or when the active memtable holds a different kind of record.
        """

        def memtable_ctor() -> DictMemTable:
            return DictMemTable(self.schema.get_merge_key())

        # Construct memtable if doesn't exist. If previous memtable wrong type, rotate
        self.__ensure_memtable(DictMemTable, memtable_ctor)

        # Write record(s). If memtable is full, rotate
        if self.__curr_memtable.add_record(record):
            self.__rotate_memtable(memtable_ctor)

    def write_record_batch(self, record: RecordBatch) -> None:
        """
        Buffer a whole RecordBatch, rotating the memtable when it is full
        or when the active memtable holds a different kind of record.
        """

        def memtable_ctor() -> RecordBatchMemTable:
            return RecordBatchMemTable(self.schema.get_merge_key())

        # Construct memtable if doesn't exist. If previous memtable wrong type, rotate
        self.__ensure_memtable(RecordBatchMemTable, memtable_ctor)

        # Write record(s). If memtable is full, rotate
        if self.__curr_memtable.add_record(record):
            self.__rotate_memtable(memtable_ctor)

    def write(self, data: DATA) -> None:
        """
        Write a RecordBatch, or an iterable of dicts and/or RecordBatches.

        :raises ValueError: if ``data`` (or an element of it) has an unsupported type
        """
        if isinstance(data, RecordBatch):
            self.write_record_batch(data)
        elif isinstance(data, Iterable):
            for x in data:
                if isinstance(x, dict):
                    self.write_dict(x)
                elif isinstance(x, RecordBatch):
                    self.write_record_batch(x)
                else:
                    raise ValueError(
                        f"Iterable contained unsupported type {type(x).__name__}."
                        f" Supported data types to write are: {DATA}"
                    )
        else:
            raise ValueError(
                f"Unsupported data type {type(data).__name__}. Supported data types to write are: {DATA}"
            )

    def flush(self) -> str:
        """
        Explicitly flush any data and metadata and commit to dataset

        :return: the value returned by the manifest writer for the written manifest
        """
        # Detach the active memtable before flushing it. Leaving it in place
        # would let a later flush() serialize the same records a second time.
        with self.__rlock:
            memtable, self.__curr_memtable = self.__curr_memtable, None
        self.__flush_memtable(memtable)

        # Snapshot under the lock, but join without holding it: the flush
        # threads acquire the same lock to record their SST files.
        with self.__rlock:
            pending_threads = list(self.__open_threads)
        for thread in pending_threads:
            thread.join()
        with self.__rlock:
            self.__open_threads = [t for t in self.__open_threads if t.is_alive()]

        manifest_location = self.__write_manifest_file()
        self._sst_files.clear()

        return manifest_location

    def __enter__(self) -> Any:
        """
        Enter and exit method allows python "with" statement
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Closes all open memtables and ensures all data is flushed.
        """
        self.flush()
        # return False to propagate up error messages
        return False

    def __ensure_memtable(self, memtable_type, memtable_constructor_closure):
        """
        Make sure the active memtable exists and is an instance of memtable_type.

        Creates the first memtable on demand and rotates when the active
        memtable buffers a different kind of record.
        """
        with self.__rlock:
            if self.__curr_memtable is None:
                self.__curr_memtable = memtable_constructor_closure()
                # Track it like rotated memtables so the flush path can remove it.
                self.__open_memtables.append(self.__curr_memtable)
            elif not isinstance(self.__curr_memtable, memtable_type):
                self.__rotate_memtable(memtable_constructor_closure)

    def __rotate_memtable(self, memtable_constructor_closure):
        """
        Replace the active memtable
        :return:
        """
        with self.__rlock:
            self.__flush_memtable(self.__curr_memtable)
            self.__curr_memtable = memtable_constructor_closure()
            self.__open_memtables.append(self.__curr_memtable)

            # Reap dead threads (inside the lock so a concurrent append is not lost)
            self.__open_threads = [t for t in self.__open_threads if t.is_alive()]

    def __flush_memtable(self, memtable):
        """
        Start a background thread that flushes the given memtable.
        """
        if memtable is None:
            # Nothing buffered; no thread needed.
            return
        thread = threading.Thread(target=self.__flush_memtable_async, args=(memtable,))
        thread.start()
        with self.__rlock:
            self.__open_threads.append(thread)

    def __flush_memtable_async(self, memtable: Memtable):
        """
        Flushes data and metadata for a given memtable
        Called asynchronously in background thread
        """
        if memtable is None:
            return

        sst_metadata_list = self.data_serializer.flush_batch(
            memtable.get_sorted_records(self.schema)
        )

        # short circuit if no data/metadata written
        if not sst_metadata_list:
            with self.__rlock:
                # Guarded: an untracked memtable must not raise ValueError here.
                if memtable in self.__open_memtables:
                    self.__open_memtables.remove(memtable)
            return

        # Write SST. Each memtable is going to have a dedicated L0 SST file because that is the unit at which
        # we have contiguously sorted data
        sst_file = self.file_provider.provide_l0_sst_file()

        with self.__rlock:
            self.sst_writer.write(sst_file, sst_metadata_list)
            self._sst_files.add(sst_file.location)

            if memtable in self.__open_memtables:
                self.__open_memtables.remove(memtable)

    def __write_manifest_file(self) -> str:
        """
        Write the manifest file to the filesystem at the given URI.
        """
        return self.manifest_io.write(list(self._sst_files), self.schema, 0)
deltacat/io/__init__.py CHANGED
@@ -0,0 +1,13 @@
1
+ from deltacat.io.reader.deltacat_read_api import read_deltacat
2
+ from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
3
+ from deltacat.io.datasource.deltacat_datasource import (
4
+ METAFILE_DATA_COLUMN_NAME,
5
+ METAFILE_TYPE_COLUMN_NAME,
6
+ )
7
+
8
+ __all__ = [
9
+ "read_deltacat",
10
+ "DeltacatReadType",
11
+ "METAFILE_DATA_COLUMN_NAME",
12
+ "METAFILE_TYPE_COLUMN_NAME",
13
+ ]
File without changes
@@ -0,0 +1,91 @@
1
+ # Allow classes to use self-referencing Type hints in Python 3.7.
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Callable, Dict, Optional, cast
5
+
6
+ import pyarrow as pa
7
+ from ray.data import Dataset
8
+
9
+ from deltacat.utils.url import DeltaCatUrl
10
+ from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
11
+
12
+
13
class DeltaCatDataset(Dataset):
    """Ray ``Dataset`` subclass that adds ``write_deltacat``."""

    @staticmethod
    def from_dataset(dataset: Dataset) -> DeltaCatDataset:
        """Re-type an existing Ray dataset as a ``DeltaCatDataset``.

        The ``__class__`` of ``dataset`` is reassigned, so the object that
        is returned is the same object that was passed in, not a copy.

        Args:
            dataset: The dataset to convert. It is modified in place.

        Returns:
            ``dataset`` itself, typed as ``DeltaCatDataset``.
        """
        # cast to DeltacatDataset in-place since it only adds new methods
        dataset.__class__ = DeltaCatDataset
        return cast(DeltaCatDataset, dataset)

    def write_deltacat(
        self,
        url: DeltaCatUrl,
        *,
        # if the source dataset only contains DeltaCAT metadata, then only copy the metadata to the destination... if it contains external source file paths, then register them in a new Delta.
        metadata_only: bool = False,
        # merge all deltas as part of the write operation
        copy_on_write: Optional[bool] = False,
        filesystem: Optional[pa.fs.S3FileSystem] = None,
        try_create_dir: bool = True,
        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        min_rows_per_file: Optional[int] = None,
        ray_remote_args: Optional[Dict[str, Any]] = None,
        concurrency: Optional[int] = None,
        **arrow_parquet_args,
    ) -> None:
        """Write this dataset to DeltaCAT through a ``DeltaCatDatasink``.

        A ``DeltaCatDatasink`` is constructed for ``url`` from the options
        below and then executed with ``write_datasink``.

        This is only supported for datasets convertible to Arrow records.
        To control the number of files, use ``.repartition()``.

        NOTE(review): earlier wording of this docstring promised output
        files named ``{uuid}_{block_idx}.{extension}``, a manifest written
        to ``{path}/manifest``, a ``filename_provider`` argument, and
        O(dataset size / parallelism) time complexity. This method has no
        ``filename_provider`` parameter and none of the rest can be
        confirmed from this method alone; verify against
        ``DeltaCatDatasink`` before relying on it.

        Examples:
            >>> ds.write_deltacat("s3://catalog/root/path")

        Args:
            url: Destination of the write, passed as the first argument to
                ``DeltaCatDatasink``. NOTE(review): annotated as
                ``DeltaCatUrl`` while the example above passes a string;
                confirm which forms are accepted.
            metadata_only: Forwarded to ``DeltaCatDatasink``. Per the
                comment in the signature: if the source dataset only
                contains DeltaCAT metadata, only the metadata is copied to
                the destination; if it contains external source file
                paths, they are registered in a new Delta.
            copy_on_write: Forwarded to ``DeltaCatDatasink``. Per the
                comment in the signature: merge all deltas as part of the
                write operation.
            filesystem: The filesystem implementation to write to,
                annotated as a PyArrow ``S3FileSystem``. Forwarded to
                ``DeltaCatDatasink``.
            try_create_dir: Try to create all directories in destination
                path if True. Does nothing if all directories already
                exist. Forwarded to ``DeltaCatDatasink``.
            arrow_open_stream_args: kwargs passed to
                ``pyarrow.fs.S3FileSystem.open_output_stream``. Forwarded
                to ``DeltaCatDatasink`` as ``open_stream_args``.
            arrow_parquet_args_fn: Callable that returns a dictionary of
                write arguments to use when writing each block to a file.
                Overrides any duplicate keys from ``arrow_parquet_args``.
                This should be used instead of ``arrow_parquet_args`` if
                any of your write arguments cannot be pickled, or if you'd
                like to lazily resolve the write arguments for each
                dataset block.
            min_rows_per_file: Forwarded to ``DeltaCatDatasink``.
            ray_remote_args: Passed to ``write_datasink``.
            concurrency: Passed to ``write_datasink``.
            arrow_parquet_args: Options to pass to
                ``pyarrow.parquet.write_table()``, which is used to write
                out each block to a file. Forwarded to
                ``DeltaCatDatasink`` as a single dictionary.
        """
        datasink = DeltaCatDatasink(
            url,
            metadata_only=metadata_only,
            copy_on_write=copy_on_write,
            arrow_parquet_args_fn=arrow_parquet_args_fn,
            arrow_parquet_args=arrow_parquet_args,
            min_rows_per_file=min_rows_per_file,
            filesystem=filesystem,
            try_create_dir=try_create_dir,
            open_stream_args=arrow_open_stream_args,
            dataset_uuid=self._uuid,
        )
        self.write_datasink(
            datasink,
            ray_remote_args=ray_remote_args,
            concurrency=concurrency,
        )
File without changes
@@ -0,0 +1,207 @@
1
+ import logging
2
+
3
+ from collections import OrderedDict
4
+ from typing import Dict, Any, Optional, List, Iterable
5
+
6
+ from ray.data import Datasink
7
+ from ray.data._internal.execution.interfaces import TaskContext
8
+ from ray.data.block import Block, BlockAccessor
9
+ from ray.data.datasource import WriteResult
10
+
11
+ from ray.data.datasource.filename_provider import (
12
+ FilenameProvider,
13
+ )
14
+
15
+ from deltacat import logs
16
+
17
+ from deltacat.constants import METAFILE_FORMAT_MSGPACK
18
+ from deltacat.storage import Metafile
19
+ from deltacat.io.datasource.deltacat_datasource import (
20
+ METAFILE_DATA_COLUMN_NAME,
21
+ METAFILE_TYPE_COLUMN_NAME,
22
+ )
23
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlWriter
24
+
25
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
+
27
+
28
class CapturingBlockWritePathProvider(FilenameProvider):
    """Delegating block write path provider that saves an ordered dictionary of
    input keyword arguments for every block write path returned."""

    def __init__(
        self,
        block_write_path_provider: FilenameProvider,
        base_path: Optional[str] = None,
    ):
        """Wrap ``block_write_path_provider``.

        Args:
            block_write_path_provider: Provider that generates the file
                name for each block.
            base_path: Prefix joined to every generated file name. It may
                be left as ``None`` at construction, but must be set
                before ``get_filename_for_block`` is called.
        """
        self.base_path = base_path
        self.block_write_path_provider = block_write_path_provider
        # Maps each returned write path to the keyword arguments that
        # produced it, in the order the paths were requested.
        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

    def get_filename_for_block(
        self,
        block: Any,
        task_index: int,
        block_index: int,
    ) -> str:
        """Return ``{base_path}/{filename}`` for a block and record the call.

        Args:
            block: The block being written.
            task_index: Index of the write task.
            block_index: Index of the block within the task.

        Returns:
            The write path for the block.

        Raises:
            ValueError: If ``base_path`` is ``None``.
        """
        if self.base_path is None:
            raise ValueError(
                "Base path must be provided to CapturingBlockWritePathProvider",
            )
        return self._get_write_path_for_block(
            base_path=self.base_path,
            block=block,
            # task_index has to be forwarded: the wrapped provider's
            # get_filename_for_block is called with exactly these
            # arguments, and omitting it left that call one required
            # argument short.
            task_index=task_index,
            block_index=block_index,
        )

    def _get_write_path_for_block(
        self,
        base_path: str,
        *args,
        **kwargs,
    ) -> str:
        """Ask the wrapped provider for a file name and capture the inputs.

        ``args`` and ``kwargs`` are passed unchanged to the wrapped
        provider's ``get_filename_for_block``. The keyword arguments, with
        ``base_path`` added, are stored in ``write_path_kwargs`` under the
        resulting write path.
        """
        filename = self.block_write_path_provider.get_filename_for_block(
            *args,
            **kwargs,
        )
        write_path = f"{base_path}/{filename}"
        kwargs["base_path"] = base_path
        self.write_path_kwargs[write_path] = kwargs
        return write_path
71
+
72
+
73
class DeltaCatWriteResult:
    """Plain attribute holder describing a write; every field starts as ``None``."""

    def __init__(self):
        # Fields are created in this fixed order, each initialised to None.
        for field_name in (
            "metadata",
            "path",
            "dataset_uuid",
            "block_write_path_provider",
            "content_type",
            "content_encoding",
            "filesystem",
        ):
            setattr(self, field_name, None)
82
+
83
+
84
class DeltaCatDatasink(Datasink[List[Metafile]]):
    """Ray datasink that writes serialized DeltaCAT metafiles to a DeltaCAT URL.

    Only blocks that carry both the ``METAFILE_DATA_COLUMN_NAME`` and
    ``METAFILE_TYPE_COLUMN_NAME`` columns are supported; any other block
    makes ``write`` raise ``NotImplementedError``.
    """

    def __init__(
        self,
        url: DeltaCatUrl,
        *,
        metadata_only: bool = False,
        copy_on_write: Optional[bool] = False,
        arrow_parquet_args_fn: Optional[Any] = None,
        arrow_parquet_args: Optional[Dict[str, Any]] = None,
        min_rows_per_file: Optional[int] = None,
        filesystem: Optional[Any] = None,
        try_create_dir: bool = True,
        open_stream_args: Optional[Dict[str, Any]] = None,
        dataset_uuid: Optional[str] = None,
    ):
        """Create a datasink targeting ``url``.

        Args:
            url: Destination DeltaCAT URL that metafiles are written to.
            metadata_only: Stored on the instance; not read by ``write``.
            copy_on_write: Stored on the instance; not read by ``write``.
            arrow_parquet_args_fn: Callable returning Parquet write
                arguments.
            arrow_parquet_args: Parquet write arguments.
            min_rows_per_file: Requested minimum number of rows per file.
            filesystem: Filesystem to write to.
            try_create_dir: Whether to try creating destination
                directories.
            open_stream_args: Arguments for opening output streams.
            dataset_uuid: Identifier of the dataset being written.

        The arguments from ``arrow_parquet_args_fn`` onwards are the
        keywords that ``DeltaCatDataset.write_deltacat`` passes to this
        constructor. They are accepted so that call does not fail with
        ``TypeError`` and are stored on the instance, but ``write`` does
        not consume them.
        """
        self._url = url
        self._metadata_only = metadata_only
        self._copy_on_write = copy_on_write
        # File-writing options: kept for callers and future use, unused by
        # the metafile-only write path below.
        self._arrow_parquet_args_fn = arrow_parquet_args_fn
        self._arrow_parquet_args = arrow_parquet_args
        self._min_rows_per_file = min_rows_per_file
        self._filesystem = filesystem
        self._try_create_dir = try_create_dir
        self._open_stream_args = open_stream_args
        self._dataset_uuid = dataset_uuid

    def on_write_start(self) -> None:
        """No setup is performed before writing."""
        pass

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> List[Metafile]:
        """Deserialize and write every metafile found in ``blocks``.

        Args:
            blocks: Blocks to write. Each is converted to an Arrow table.
            ctx: Ray task context (unused).

        Returns:
            The metafiles written, in the order they were written. The
            list is returned to match the declared return type; without
            it the method would return ``None``.

        Raises:
            NotImplementedError: If a block lacks either metafile column.
                Metafiles from earlier blocks have already been written
                when this is raised.
        """
        written: List[Metafile] = []
        for block in blocks:
            pa_table = BlockAccessor.for_block(block).to_arrow()
            if (
                METAFILE_DATA_COLUMN_NAME not in pa_table.column_names
                or METAFILE_TYPE_COLUMN_NAME not in pa_table.column_names
            ):
                raise NotImplementedError(
                    f"Expected {METAFILE_DATA_COLUMN_NAME} and "
                    f"{METAFILE_TYPE_COLUMN_NAME} columns in the input block, "
                    f"but found {pa_table.column_names}."
                )
            for pa_scalar in pa_table[METAFILE_DATA_COLUMN_NAME]:
                metafile_msgpack_bytes = pa_scalar.as_py()
                metafile = Metafile.deserialize(
                    serialized=metafile_msgpack_bytes,
                    meta_format=METAFILE_FORMAT_MSGPACK,
                )
                # TODO(pdames): Add `metafile` to writer as a kwarg instead
                #  of constructing a new URL with the metafile as input.
                writer_url = DeltaCatUrlWriter(self._url, metafile=metafile)
                # TODO(pdames): Run writes in order from catalog -> delta
                #  by truncating the URL down to just dc://{catalog-name}
                #  and rebuilding all path elements from there.
                writer_url.write(metafile)
                written.append(metafile)
        return written

    def on_write_complete(
        self,
        write_result: WriteResult[List[Metafile]],
    ):
        """No finalization is performed after writing."""
        pass
135
+
136
+
137
+ """
138
+ def write(
139
+ self,
140
+ blocks: Iterable[Block],
141
+ ctx: TaskContext,
142
+ ) -> List[ObjectRef[DeltacatWriteResult]]:
143
+ paths, filesystem = resolve_paths_and_filesystem(
144
+ self.path,
145
+ self.filesystem,
146
+ )
147
+ assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
148
+ path = paths[0]
149
+ write_results = super().write(blocks)
150
+ # append a summary of this write operation in the last write result
151
+ metadata = [BlockAccessor.for_block(_).get_metadata() for _ in blocks]
152
+ rwr = DeltacatWriteResult()
153
+ rwr.metadata = metadata
154
+ rwr.path = path
155
+ rwr.dataset_uuid = self.dataset_uuid
156
+ rwr.block_write_path_provider = self.filename_provider
157
+ rwr.content_type = ContentType.PARQUET.value
158
+ rwr.content_encoding = ContentEncoding.IDENTITY.value
159
+ rwr.filesystem = filesystem
160
+ rwr_obj_ref = ray.put(rwr)
161
+ write_results.append(rwr_obj_ref)
162
+ return write_results
163
+
164
+ def on_write_complete(self, write_results: List[Any], **kwargs) -> None:
165
+ # TODO (pdames): time latency of this operation - overall s3 write times
166
+ # are 2-3x pure read_parquet_fast() times
167
+ # restore the write operation summary from the last write result
168
+ result: DeltacatWriteResult = write_results[len(write_results) - 1]
169
+ write_path_args = result.block_write_path_provider.write_path_kwargs
170
+ blocks_written = len(write_path_args)
171
+ expected_blocks_written = len(result.metadata)
172
+ # TODO(pdames): Corner cases where mismatch is expected? Empty blocks?
173
+ # Blocks filtered/split/merged to more/less write paths?
174
+ assert blocks_written == expected_blocks_written, (
175
+ f"Dataset write result validation failed. Found "
176
+ f"{blocks_written}/{expected_blocks_written} Dataset blocks "
177
+ f"written. Refusing to commit DeltaCAT Manifest."
178
+ )
179
+ manifest_entries = ManifestEntryList()
180
+ for block_idx, path in enumerate(write_path_args.keys()):
181
+ file_info = result.filesystem.get_file_info(path)
182
+ if file_info.type == pyarrow.fs.FileType.File:
183
+ content_length = file_info.size
184
+ else:
185
+ raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
186
+ num_rows = result.metadata[block_idx].num_rows
187
+ source_content_length = result.metadata[block_idx].size_bytes
188
+ manifest_entry_meta = ManifestMeta.of(
189
+ int(num_rows) if num_rows is not None else None,
190
+ int(content_length) if content_length is not None else None,
191
+ result.content_type,
192
+ result.content_encoding,
193
+ int(source_content_length) if source_content_length else None,
194
+ )
195
+ parsed_url = parse_s3_url(path)
196
+ manifest_entry = ManifestEntry.of(
197
+ parsed_url.url,
198
+ manifest_entry_meta,
199
+ )
200
+ manifest_entries.append(manifest_entry)
201
+ manifest = Manifest.of(manifest_entries)
202
+ manifest_path = f"{result.path}/manifest"
203
+ logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
204
+ with result.filesystem.open_output_stream(manifest_path) as f:
205
+ f.write(json.dumps(manifest).encode("utf-8"))
206
+ logger.debug(f"Manifest committed to: {manifest_path}")
207
+ """
File without changes