deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -88,7 +88,7 @@ def round_robin_options_provider(
88
88
  **kwargs,
89
89
  ) -> Dict[str, Any]:
90
90
  """Returns a resource dictionary that can be included with ray remote
91
- options to round robin indexed tasks or actors across a list of resource
91
+ options to round-robin indexed tasks or actors across a list of resource
92
92
  keys. For example, the following code round-robins 100 tasks across all
93
93
  live cluster nodes:
94
94
  ```
@@ -2,7 +2,11 @@ import logging
2
2
  from typing import Callable, Dict, List, Optional, Union
3
3
 
4
4
  from fsspec import AbstractFileSystem
5
+
6
+ import pyarrow as pa
5
7
  from pyarrow import csv as pacsv
8
+ import pyarrow.fs as pafs
9
+
6
10
  from ray.data import Dataset
7
11
  from ray.data.datasource import FilenameProvider
8
12
 
@@ -16,7 +20,7 @@ def write_parquet(
16
20
  dataset: Dataset,
17
21
  base_path: str,
18
22
  *,
19
- filesystem: AbstractFileSystem,
23
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
20
24
  block_path_provider: Union[Callable, FilenameProvider],
21
25
  **kwargs,
22
26
  ) -> None:
@@ -34,16 +38,36 @@ def write_csv(
34
38
  dataset: Dataset,
35
39
  base_path: str,
36
40
  *,
37
- filesystem: AbstractFileSystem,
41
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
38
42
  block_path_provider: Union[Callable, FilenameProvider],
39
43
  **kwargs,
40
44
  ) -> None:
45
+ """
46
+ Write a Ray Dataset to a CSV file (or other delimited text format).
47
+ """
48
+ # Extract CSV-specific options from kwargs
49
+ delimiter = kwargs.pop("delimiter", ",")
50
+ quoting_style = kwargs.pop("quoting_style", None)
51
+ include_header = kwargs.pop("include_header", False)
52
+
53
+ # Create a function that will generate WriteOptions inside the worker process
54
+ def arrow_csv_args_fn():
55
+ write_options = pacsv.WriteOptions(
56
+ delimiter=delimiter,
57
+ include_header=include_header,
58
+ quoting_style=quoting_style,
59
+ )
60
+ return {"write_options": write_options}
61
+
62
+ # Check if the block_path_provider will generate .gz files to avoid double compression
63
+ pa_open_stream_args = {}
64
+ if not (
65
+ hasattr(block_path_provider, "content_encoding")
66
+ and block_path_provider.content_encoding == ContentEncoding.GZIP
67
+ ):
68
+ # Block path provider will not generate .gz files, so we need to apply explicit compression
69
+ pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
41
70
 
42
- # column names are kept in table metadata, so omit header
43
- arrow_csv_args_fn = lambda: {
44
- "write_options": pacsv.WriteOptions(include_header=False)
45
- }
46
- pa_open_stream_args = {"compression": ContentEncoding.GZIP.value}
47
71
  dataset.write_csv(
48
72
  base_path,
49
73
  arrow_open_stream_args=pa_open_stream_args,
@@ -55,12 +79,76 @@ def write_csv(
55
79
  )
56
80
 
57
81
 
82
+ def write_json(
83
+ dataset: Dataset,
84
+ base_path: str,
85
+ *,
86
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
87
+ block_path_provider: Union[Callable, FilenameProvider],
88
+ **kwargs,
89
+ ) -> None:
90
+ """
91
+ Write a Ray Dataset to a JSON file using Ray's native JSON writer.
92
+ """
93
+ # Check if the block_path_provider will generate .gz files to avoid double compression
94
+ pa_open_stream_args = {}
95
+ if not (
96
+ hasattr(block_path_provider, "content_encoding")
97
+ and block_path_provider.content_encoding == ContentEncoding.GZIP
98
+ ):
99
+ # Block path provider will not generate .gz files, so we need to apply explicit compression
100
+ pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
101
+
102
+ dataset.write_json(
103
+ base_path,
104
+ arrow_open_stream_args=pa_open_stream_args,
105
+ filesystem=filesystem,
106
+ try_create_dir=False,
107
+ filename_provider=block_path_provider,
108
+ **kwargs,
109
+ )
110
+
111
+
58
112
  CONTENT_TYPE_TO_DATASET_WRITE_FUNC: Dict[str, Callable] = {
113
+ ContentType.UNESCAPED_TSV.value: write_csv,
114
+ ContentType.TSV.value: write_csv,
59
115
  ContentType.CSV.value: write_csv,
116
+ ContentType.PSV.value: write_csv,
60
117
  ContentType.PARQUET.value: write_parquet,
118
+ ContentType.JSON.value: write_json,
61
119
  }
62
120
 
63
121
 
122
+ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, any]:
123
+ """
124
+ Returns writer kwargs for the given content type when writing with Ray Dataset.
125
+ """
126
+ if content_type == ContentType.UNESCAPED_TSV.value:
127
+ return {
128
+ "delimiter": "\t",
129
+ "include_header": False,
130
+ "quoting_style": "none",
131
+ }
132
+ if content_type == ContentType.TSV.value:
133
+ return {
134
+ "delimiter": "\t",
135
+ "include_header": False,
136
+ }
137
+ if content_type == ContentType.CSV.value:
138
+ return {
139
+ "delimiter": ",",
140
+ "include_header": False,
141
+ }
142
+ if content_type == ContentType.PSV.value:
143
+ return {
144
+ "delimiter": "|",
145
+ "include_header": False,
146
+ }
147
+ if content_type in {ContentType.PARQUET.value, ContentType.JSON.value}:
148
+ return {}
149
+ raise ValueError(f"Unsupported content type: {content_type}")
150
+
151
+
64
152
  def slice_dataset(dataset: Dataset, max_len: Optional[int]) -> List[Dataset]:
65
153
  """
66
154
  Returns equally-sized dataset slices of up to `max_len` records each.
@@ -88,9 +176,10 @@ def dataset_size(dataset: Dataset) -> int:
88
176
  def dataset_to_file(
89
177
  table: Dataset,
90
178
  base_path: str,
91
- file_system: AbstractFileSystem,
179
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
92
180
  block_path_provider: Union[Callable, FilenameProvider],
93
181
  content_type: str = ContentType.PARQUET.value,
182
+ schema: Optional[pa.Schema] = None,
94
183
  **kwargs,
95
184
  ) -> None:
96
185
  """
@@ -103,10 +192,12 @@ def dataset_to_file(
103
192
  f" implemented. Known content types: "
104
193
  f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
105
194
  )
195
+ writer_kwargs = content_type_to_writer_kwargs(content_type)
196
+ writer_kwargs.update(kwargs)
106
197
  writer(
107
198
  table,
108
199
  base_path,
109
- filesystem=file_system,
200
+ filesystem=filesystem,
110
201
  block_path_provider=block_path_provider,
111
- **kwargs,
202
+ **writer_kwargs,
112
203
  )
@@ -21,7 +21,7 @@ def node_resource_keys(
21
21
  keys = []
22
22
  node_dict = ray.nodes()
23
23
  if node_dict:
24
- for node in ray.nodes():
24
+ for node in node_dict:
25
25
  if filter_fn(node):
26
26
  for key in node["Resources"].keys():
27
27
  if key.startswith("node:"):
@@ -37,7 +37,7 @@ def current_node_resource_key() -> str:
37
37
  actors on that node via:
38
38
  `foo.options(resources={get_current_node_resource_key(): 0.01}).remote()`
39
39
  """
40
- current_node_id = ray.get_runtime_context().get_node_id().hex()
40
+ current_node_id = ray.get_runtime_context().get_node_id()
41
41
  keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
42
42
  assert (
43
43
  len(keys) <= 1
@@ -45,6 +45,47 @@ def current_node_resource_key() -> str:
45
45
  return keys[0] if len(keys) == 1 else None
46
46
 
47
47
 
48
+ def current_node_resources() -> Dict[str, float]:
49
+ """Gets Ray's resources for the current node as a dictionary.
50
+
51
+ Example Return Value:
52
+ >>> {
53
+ >>> 'memory': 17611605607.0,
54
+ >>> 'node:127.0.0.1': 1.0,
55
+ >>> 'node:__internal_head__': 1.0,
56
+ >>> 'object_store_memory': 2147483648.0,
57
+ >>> 'CPU': 10.0,
58
+ >>> }
59
+ """
60
+ current_node_id = ray.get_runtime_context().get_node_id()
61
+ node_dict = ray.nodes()
62
+ if node_dict:
63
+ for node in node_dict:
64
+ if node["NodeID"] == current_node_id:
65
+ return node["Resources"]
66
+ else:
67
+ raise ValueError("No node dictionary found on current node.")
68
+ return {}
69
+
70
+
71
+ def find_max_single_node_resource_type(resource_type: str) -> float:
72
+ """Finds the max resource amount available on any single cluster node
73
+ for the given resource type. Returns the max resource amount as a float."""
74
+ node_dict = ray.nodes()
75
+ max_single_node_resource_amount = 0
76
+ if node_dict:
77
+ for node in node_dict:
78
+ node_resource_amount = node["Resources"].get(resource_type)
79
+ if node_resource_amount is not None:
80
+ max_single_node_resource_amount = max(
81
+ max_single_node_resource_amount,
82
+ node_resource_amount,
83
+ )
84
+ else:
85
+ raise ValueError("No node dictionary found on current node.")
86
+ return max_single_node_resource_amount
87
+
88
+
48
89
  def is_node_alive(node: Dict[str, Any]) -> bool:
49
90
  """Takes a node from `ray.nodes()` as input. Returns True if the node is
50
91
  alive, and False otherwise."""
@@ -67,6 +108,17 @@ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) ->
67
108
  time.sleep(poll_interval_seconds)
68
109
 
69
110
 
111
+ def live_cpu_waiter(min_live_cpus: int, poll_interval_seconds: float = 0.5) -> None:
112
+ """Waits until the given minimum number of live CPUs are present in the
113
+ cluster. Checks the current number of live CPUs every
114
+ `poll_interval_seconds`."""
115
+ live_cpus = cluster_cpus()
116
+ while live_cpus < min_live_cpus:
117
+ live_cpus = cluster_cpus()
118
+ logger.info(f"Waiting for Live CPUs: {live_cpus}/{min_live_cpus}")
119
+ time.sleep(poll_interval_seconds)
120
+
121
+
70
122
  def live_node_resource_keys() -> List[str]:
71
123
  """Get Ray resource keys for all live cluster nodes as a list of strings of
72
124
  the form: "node:{node_resource_name}". The returned keys can be used to
@@ -83,7 +135,7 @@ def other_live_node_resource_keys() -> List[str]:
83
135
 
84
136
  For example, invoking this function from your Ray application driver on the
85
137
  head node returns the resource keys of all live worker nodes."""
86
- current_node_id = ray.get_runtime_context().get_node_id().hex()
138
+ current_node_id = ray.get_runtime_context().get_node_id()
87
139
  return node_resource_keys(
88
140
  lambda n: n["NodeID"] != current_node_id and is_node_alive(n)
89
141
  )
@@ -97,7 +149,7 @@ def other_node_resource_keys() -> List[str]:
97
149
 
98
150
  For example, invoking this function from your Ray application driver on the
99
151
  head node returns the resource keys of all worker nodes."""
100
- current_node_id = ray.get_runtime_context().get_node_id().hex()
152
+ current_node_id = ray.get_runtime_context().get_node_id()
101
153
  return node_resource_keys(lambda n: n["NodeID"] != current_node_id)
102
154
 
103
155