deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/api.py CHANGED
@@ -1,56 +1,238 @@
1
- from typing import Any
2
-
1
+ import time
2
+ from dataclasses import dataclass
3
+ from typing import Any, Union, List, Optional, Dict, Callable, Tuple
4
+ import logging
3
5
 
6
+ import ray
4
7
  import deltacat as dc
5
- from deltacat.catalog import Catalog
8
+ import pyarrow.fs as pafs
9
+
10
+ from pyarrow.fs import FileType
11
+ from ray.exceptions import OutOfMemoryError
12
+
13
+ from deltacat.constants import BYTES_PER_GIBIBYTE
14
+ from deltacat.io import (
15
+ read_deltacat,
16
+ DeltacatReadType,
17
+ )
18
+ from deltacat.storage import (
19
+ Namespace,
20
+ Table,
21
+ TableVersion,
22
+ Stream,
23
+ Partition,
24
+ Delta,
25
+ Dataset,
26
+ DistributedDataset,
27
+ ListResult,
28
+ LocalTable,
29
+ Metafile,
30
+ )
31
+ from deltacat.types.media import DatasetType
32
+ from deltacat.utils.url import (
33
+ DeltaCatUrl,
34
+ DeltaCatUrlReader,
35
+ DeltaCatUrlWriter,
36
+ )
37
+ from deltacat.utils.common import ReadKwargsProvider
38
+ from deltacat.types.tables import (
39
+ get_table_size,
40
+ get_table_length,
41
+ )
42
+ from deltacat.utils.filesystem import (
43
+ resolve_path_and_filesystem,
44
+ get_file_info,
45
+ )
46
+ from deltacat.utils.performance import timed_invocation
47
+ from deltacat.utils.ray_utils.runtime import (
48
+ current_node_resources,
49
+ live_cpu_waiter,
50
+ live_node_resource_keys,
51
+ other_live_node_resource_keys,
52
+ find_max_single_node_resource_type,
53
+ )
54
+ from deltacat import logs
55
+
56
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
57
+
58
+ """
59
+ # CLI Example of Copying from Source to Dest without file conversion
60
+ # (i.e., register only - shallow copy):
61
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
62
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
63
+
64
+ # CLI Example of Copying from Source to Dest without file conversion
65
+ # (i.e., register only - deep copy):
66
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table -r
67
+ # The above command will make a deep copy of all JSON files found in the source
68
+ # to the catalog data file directory in the destination.
69
+
70
+ # CLI Example of Copying from Source to Dest with file conversion
71
+ # (i.e., deep copy with file content type transformation):
72
+ $ dcat convert json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/ --type FEATHER
73
+ # The above command will read JSON files found in the source, transform them to
74
+ # Arrow Feather files, and register them in the destination.
75
+
76
+ # Python Example of Copying from Source to Dest with file conversion
77
+ # (i.e., deep copy with file content type transformation):
78
+ >>> ds = dc.get("json+s3://my_bucket/log_manager/")
79
+ >>> dc.put("dc://my_deltacat_catalog/log_manager/", dataset=ds, type=ContentType.FEATHER)
80
+ # Or, equivalently, we can do the write directly from the dataset:
81
+ >>> ds.write_deltacat("dc://my_deltacat_catalog/log_manager/", type=ContentType.FEATHER)
82
+ """
83
+
84
+
85
+ def copy(
86
+ src: DeltaCatUrl,
87
+ dst: DeltaCatUrl,
88
+ *,
89
+ transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
90
+ extension_to_memory_multiplier: Dict[str, float] = {
91
+ "pq": 5,
92
+ "parquet": 5,
93
+ "feather": 1.5,
94
+ "arrow": 1.5,
95
+ "csv": 1.5,
96
+ "tsv": 1.5,
97
+ "psv": 1.5,
98
+ "txt": 1.5,
99
+ "json": 1.5,
100
+ "jsonl": 1.5,
101
+ "gz": 35,
102
+ "bz2": 35,
103
+ "zip": 35,
104
+ "zst": 35,
105
+ "7z": 35,
106
+ "*": 2.5,
107
+ },
108
+ minimum_worker_cpus: int = 0,
109
+ reader_args: Dict[str, Any] = {},
110
+ writer_args: Dict[str, Any] = {},
111
+ filesystem: Optional[pafs.FileSystem] = None,
112
+ ) -> Union[Metafile, str]:
113
+ """
114
+ Copies data from the source datastore to the destination datastore. By
115
+ default, this method launches one parallel Ray process to read/transform
116
+ each input file found in the source followed by one parallel Ray process
117
+ to write each output file to the destination. To ensure that adequate
118
+ resources are available to complete the operation, you may optionally
119
+ specify minimum cluster and/or worker CPUs to wait for before starting
120
+ parallel processing.
121
+
122
+ Args:
123
+ src: DeltaCAT URL of the source datastore to read.
124
+ dst: DeltaCAT URL of the destination datastore to write.
125
+ transforms: List of transforms to apply to the source dataset prior
126
+ to write it to the destination datastore. Transforms take the in-memory
127
+ dataset type read (e.g., Polars DataFrame) and source DeltaCAT URL as
128
+ input and return the same dataset type as output. Transforms are
129
+ applied to the dataset in the order given.
130
+ extension_to_memory_multiplier: Dictionary of file extensions to
131
+ in-memory inflation estimates for that extension (i.e., the amount
132
+ of memory required to read a source file, apply transforms, and write
133
+ it back to a destination file).
134
+ minimum_worker_cpus: The minimum number of Ray worker CPUs
135
+ to wait for before starting distributed execution. Useful for cases
136
+ where the operation is known to suffer from resource starvation (e.g.,
137
+ out-of-memory errors) if started before the cluster has launched a
138
+ minimum number of required worker nodes.
139
+ reader_args: Additional keyword arguments to forward to the reader
140
+ associated with the in-memory dataset and datastore type to read
141
+ (e.g., polars.read_csv(**kwargs)).
142
+ writer_args: Additional keyword arguments to forward to the writer
143
+ associated with the in-memory dataset type read and datastore type to
144
+ write (e.g., polars.DataFrame.write_parquet(**kwargs)).
145
+ filesystem: Optional PyArrow filesystem to use for file IO. Will be
146
+ automatically resolved from the input path if not specified, and
147
+ will attempt to automatically resolve storage read/write
148
+ credentials for the associated source/dest file cloud provider(s).
149
+ Try providing your own filesystem with credentials, retry strategy,
150
+ etc. pre-configured if you encounter latency issues or errors
151
+ reading/writing files.
152
+
153
+ Returns:
154
+ None
155
+ """
156
+ if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
157
+ return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
158
+ else:
159
+ return _copy_external_ray(
160
+ src,
161
+ dst,
162
+ transforms=transforms,
163
+ extension_to_memory_multiplier=extension_to_memory_multiplier,
164
+ minimum_worker_cpus=minimum_worker_cpus,
165
+ reader_args=reader_args,
166
+ writer_args=writer_args,
167
+ filesystem=filesystem,
168
+ )
169
+
170
+
171
+ def _copy_objects_in_order(
172
+ src_objects: List[Metafile],
173
+ destination: DeltaCatUrl,
174
+ ) -> Union[Metafile, List[Metafile]]:
175
+ dc_dest_url = DeltaCatUrl(destination.url)
176
+ catalog_name = dc_dest_url.catalog_name
6
177
 
178
+ copied_results = []
7
179
 
8
- def copy(source, destination):
9
- src_parts = source.split("/")
10
- src_parts = [part for part in src_parts if part]
11
- dst_parts = destination.split("/")
12
- dst_parts = [part for part in dst_parts if part]
13
- if not dc.is_initialized():
14
- raise ValueError("Catalog not initialized.")
15
- if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
180
+ # Group objects by type for hierarchical copying
181
+ # Copy objects in strict hierarchical order
182
+ # Namespace -> Table -> TableVersion -> Stream -> Partition -> Delta
183
+ ordered_objects_by_type = {
184
+ Namespace: [],
185
+ Table: [],
186
+ TableVersion: [],
187
+ Stream: [],
188
+ Partition: [],
189
+ Delta: [],
190
+ }
191
+
192
+ for obj in src_objects:
193
+ obj_class = Metafile.get_class(obj.to_serializable())
194
+ ordered_objects_by_type[obj_class].append(obj)
195
+
196
+ # TODO(pdames): Support copying uncommitted streams/partitions.
197
+ # TODO(pdames): Support parallel/distributed copies.
198
+ for obj_class, objects in ordered_objects_by_type.items():
199
+ if objects:
200
+ logger.info(f"Copying {len(objects)} {obj_class} objects...")
201
+ if obj_class == TableVersion:
202
+ # sort table versions by ascending table version
203
+ objects.sort(key=lambda x: x.current_version_number())
204
+ if obj_class == Delta:
205
+ # sort deltas by ascending stream position
206
+ objects.sort(key=lambda x: x.stream_position)
207
+ for i, obj in enumerate(objects):
208
+ logger.info(f"Copying object {i+1}/{len(objects)}: {obj.url}")
209
+ dest_url = DeltaCatUrl(obj.url(catalog_name=catalog_name))
210
+ logger.info(f"Destination URL for object {i+1}/{len(objects)}: {dest_url}")
211
+ result = put(dest_url, metafile=obj)
212
+ copied_results.append(result)
213
+ logger.info(f"Successfully copied object {i+1}/{len(objects)}")
214
+ return copied_results[0] if len(copied_results) == 1 else copied_results
215
+
216
+
217
+ def _copy_dc(
218
+ source: DeltaCatUrl,
219
+ destination: DeltaCatUrl,
220
+ recursive: bool = False,
221
+ ) -> Union[Metafile, List[Metafile]]:
222
+ dc.raise_if_not_initialized()
223
+ if len(source.url.split("/")) != len(destination.url.split("/")):
16
224
  # TODO(pdames): Better error message.
17
225
  raise ValueError(
18
226
  f"Cannot copy {source} to {destination}. "
19
227
  f"Source and destination must share the same type."
20
228
  )
21
- src_obj = get(source)
22
- if len(src_parts) == 1:
23
- # copy the given catalog
24
- raise NotImplementedError
25
- elif len(src_parts) == 2:
26
- # TODO(pdames): Make catalog specification optional if there is only
27
- # one catalog (e.g., auto-retrieve src_parts[0]/dst_parts[0])
28
- # copy the given namespace
29
- src_namespace_name = src_parts[1]
30
- dst_catalog_name = dst_parts[0]
31
- dst_namespace_name = dst_parts[1] if len(dst_parts) >= 2 else src_namespace_name
32
- new_namespace = dc.create_namespace(
33
- namespace=dst_namespace_name,
34
- properties=src_obj.properties,
35
- catalog=dst_catalog_name,
36
- )
37
- return new_namespace
38
- elif len(src_parts) == 3:
39
- # copy the given table
40
- raise NotImplementedError
41
- elif len(src_parts) == 4:
42
- # copy the given table version
43
- raise NotImplementedError
44
- elif len(src_parts) == 5:
45
- # copy the given stream
46
- raise NotImplementedError
47
- elif len(src_parts) == 6:
48
- # copy the given partition
49
- raise NotImplementedError
50
- elif len(src_parts) == 7:
51
- # copy the given partition delta
52
- raise NotImplementedError
53
- raise ValueError(f"Invalid path: {src_parts}")
229
+ if recursive:
230
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/**")), recursive=True)
231
+ elif source.url.endswith("/*"):
232
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/*")))
233
+ else:
234
+ src_objects = [get(source)]
235
+ return _copy_objects_in_order(src_objects, destination)
54
236
 
55
237
 
56
238
  def concat(source, destination):
@@ -65,98 +247,132 @@ def move(source, destination):
65
247
  raise NotImplementedError
66
248
 
67
249
 
68
- def list(path):
69
- raise NotImplementedError
250
def _list_all_metafiles(
    url: DeltaCatUrl,
    recursive: bool = False,
    **kwargs,
) -> List[Metafile]:
    """
    List the metafiles addressed by `url`, optionally descending through
    every child level exposed by the URL reader.

    Args:
        url: DeltaCAT URL to list.
        recursive: If True, also run each remaining lister of the reader once
            per parent metafile found at the previous level.
        **kwargs: Keyword arguments forwarded to every lister call.

    Returns:
        A flat list holding the top-level metafiles first, followed by the
        metafiles of each deeper level in the order they were listed.
    """
    reader = DeltaCatUrlReader(url)
    # the top-level lister doesn't have any missing keyword args
    top_level_lister = reader.listers.pop(0)[0]
    # Each list result is materialized exactly once via `all_items()` so that
    # results are not re-read a second time when flattening the output.
    # NOTE: the builtin `list` is shadowed by this module's `list()` API, so
    # unpacking is used to copy sequences here.
    current_level_metafiles = [*top_level_lister(**kwargs).all_items()]
    all_metafiles: List[Metafile] = [*current_level_metafiles]
    if not recursive:
        return all_metafiles

    # Process each level of the hierarchy
    for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
        next_level_metafiles = []
        # each subsequent lister needs to inject missing keyword args from
        # the parent metafile
        for parent_metafile in current_level_metafiles:
            kwargs_update = (
                {kwarg_name: kwarg_val_resolver_fn(parent_metafile)}
                if kwarg_name and kwarg_val_resolver_fn
                else {}
            )
            child_result = lister(**{**kwargs, **kwargs_update})
            next_level_metafiles.extend(child_result.all_items())
        all_metafiles.extend(next_level_metafiles)
        # Move to the next level for the next iteration
        current_level_metafiles = next_level_metafiles
    return all_metafiles
71
286
 
72
- def get(path) -> Any:
73
- parts = path.split("/")
74
- parts = [part for part in parts if part]
75
- if not dc.is_initialized():
76
- # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
77
- # last session.
78
- raise ValueError("Catalog not initialized.")
79
- if len(parts) == 1:
80
- # TODO(pdames): Save all catalogs registered from the last session on
81
- # disk so that users don't need to re-initialize them every time.
82
- # get the given catalog
83
- catalog_name = parts[0]
84
- return dc.get_catalog(catalog_name)
85
- elif len(parts) == 2:
86
- # get the given namespace
87
- catalog_name = parts[0]
88
- namespace_name = parts[1]
89
- return dc.get_namespace(
90
- namespace=namespace_name,
91
- catalog=catalog_name,
92
- )
93
- elif len(parts) == 3:
94
- # get the given table
95
- raise NotImplementedError
96
- elif len(parts) == 4:
97
- # get the given table version
98
- raise NotImplementedError
99
- elif len(parts) == 5:
100
- # get the given stream
101
- raise NotImplementedError
102
- elif len(parts) == 6:
103
- # get the given partition
104
- raise NotImplementedError
105
- elif len(parts) == 7:
106
- # get the given partition delta
107
- raise NotImplementedError
108
- raise ValueError(f"Invalid path: {path}")
109
-
110
-
111
- def put(path, *args, **kwargs) -> Any:
112
- parts = path.split("/")
113
- parts = [part for part in parts if part]
114
- if len(parts) == 1:
115
- # TODO(pdames): Save all catalogs registered from the last session on
116
- # disk so that users don't need to re-initialize them every time.
117
- # register the given catalog
118
- catalog_name = parts[0]
119
- # Initialize default catalog using kwargs
120
- catalog = Catalog(**kwargs)
121
- return dc.put_catalog(catalog_name, catalog)
122
- elif len(parts) == 2:
123
- # register the given namespace
124
- catalog_name = parts[0]
125
- namespace_name = parts[1]
126
- if not dc.is_initialized():
127
- # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
128
- # last session.
129
- raise ValueError("Catalog not initialized.")
130
- new_namespace = dc.create_namespace(
131
- namespace=namespace_name,
132
- catalog=catalog_name,
133
- *args,
287
+
288
class CustomReadKwargsProvider(ReadKwargsProvider):
    """
    Read-kwargs provider that merges a fixed set of keyword arguments into
    the reader kwargs of one specific datasource type.
    """

    def __init__(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ):
        """
        Args:
            datasource_type: Datasource type whose kwargs should be extended.
            kwargs: Keyword arguments to merge in for that datasource type.
        """
        self._kwargs = kwargs
        self._datasource_type = datasource_type

    def _get_kwargs(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Return `kwargs`, updated in place with the stored kwargs when
        `datasource_type` matches the configured datasource type.
        """
        if datasource_type != self._datasource_type:
            # other datasource types pass through untouched
            return kwargs
        kwargs.update(self._kwargs)
        return kwargs
305
+
306
+
307
def list(
    url: DeltaCatUrl,
    *,
    recursive: bool = False,
    dataset_type: Optional[DatasetType] = None,
    **kwargs,
) -> Union[List[Metafile], LocalTable, DistributedDataset]:
    """
    List the DeltaCAT catalog objects addressed by `url`.

    Args:
        url: DeltaCAT catalog URL to list.
        recursive: If True, list child objects recursively.
        dataset_type: Dataset type to return the listing as. When it is one
            of `DatasetType.distributed()`, only `DatasetType.RAY_DATASET` is
            supported; otherwise a local list of metafiles is returned.
        **kwargs: Extra keyword arguments forwarded to the underlying
            listers (local listing) or to the read kwargs provider
            (distributed listing).

    Raises:
        NotImplementedError: If `url` is not a DeltaCAT catalog URL, or if a
            distributed dataset type other than Ray Dataset is requested.
    """
    if not url.is_deltacat_catalog_url():
        raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")

    if dataset_type not in DatasetType.distributed():
        # return a local list of metafiles
        # TODO(pdames): Cast the list to the appropriate local dataset type.
        return _list_all_metafiles(
            url=url,
            recursive=recursive,
            **kwargs,
        )

    if dataset_type != DatasetType.RAY_DATASET:
        raise NotImplementedError(
            f"Unsupported dataset type: {dataset_type.name}. "
            f"Supported Dataset Types: {DatasetType.RAY_DATASET.name}",
        )

    if recursive:
        read_type = DeltacatReadType.METADATA_LIST_RECURSIVE
    else:
        read_type = DeltacatReadType.METADATA_LIST
    kwargs_provider = CustomReadKwargsProvider(
        datasource_type=url.datastore_type,
        kwargs=kwargs,
    )
    return read_deltacat(
        [url],
        deltacat_read_type=read_type,
        timestamp_as_of=None,
        merge_on_read=False,
        read_kwargs_provider=kwargs_provider,
    )
136
- return new_namespace
137
- elif len(parts) == 3:
138
- # register the given table
139
- raise NotImplementedError
140
- elif len(parts) == 4:
141
- # register the given table version
142
- raise NotImplementedError
143
- elif len(parts) == 5:
144
- # register the given stream
145
- raise NotImplementedError
146
- elif len(parts) == 6:
147
- # register the given partition
148
- raise NotImplementedError
149
- elif len(parts) == 7:
150
- # register the given partition delta
151
- raise NotImplementedError
152
- raise ValueError(f"Invalid path: {path}")
346
+
347
+
348
def get(
    url,
    *args,
    **kwargs,
) -> Union[Metafile, Dataset]:
    """
    Read the object addressed by `url`.

    All positional and keyword arguments after `url` are forwarded to
    `DeltaCatUrlReader.read`, whose result is returned unchanged.
    """
    return DeltaCatUrlReader(url).read(*args, **kwargs)
355
+
356
+
357
def put(
    url: DeltaCatUrl,
    metafile: Optional[Metafile] = None,
    *args,
    **kwargs,
) -> Union[Metafile, str]:
    """
    Write to the location addressed by `url`.

    Args:
        url: Destination DeltaCAT URL.
        metafile: Optional metafile handed to the `DeltaCatUrlWriter`.
        *args: Positional arguments forwarded to `DeltaCatUrlWriter.write`.
        **kwargs: Keyword arguments forwarded to `DeltaCatUrlWriter.write`.

    Returns:
        The result of `DeltaCatUrlWriter.write`.
    """
    return DeltaCatUrlWriter(url, metafile=metafile).write(*args, **kwargs)
365
+
366
+
367
def touch(path):
    """Placeholder for a `touch` operation on `path`; not implemented yet."""
    raise NotImplementedError
153
369
 
154
370
 
155
371
def exists(path):
    """Placeholder for an existence check on `path`; not implemented yet."""
    raise NotImplementedError
157
373
 
158
374
 
159
- def query(path, expression):
375
def query(expression):
    """Placeholder for evaluating a query `expression`; not implemented yet."""
    raise NotImplementedError
161
377
 
162
378
 
@@ -166,3 +382,197 @@ def tail(path):
166
382
 
167
383
def head(path):
    """Placeholder for a `head` operation on `path`; not implemented yet."""
    raise NotImplementedError
385
+
386
+
387
def _estimate_copy_memory_bytes(
    src: DeltaCatUrl,
    extension_to_memory_multiplier: Dict[str, float],
    filesystem: Optional[pafs.FileSystem],
) -> float:
    """
    Estimate the memory needed to copy the file at `src` by multiplying its
    size by a per-file-extension inflation multiplier.

    Returns 0 (no memory reservation) when neither the file's extension nor
    the ``"*"`` wildcard has an entry in `extension_to_memory_multiplier`.

    Raises:
        ValueError: If `src` does not resolve to a file.
    """
    logger.info(f"Resolving stats collection filesystem for: {src.url_path}.")
    path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
    if isinstance(filesystem, pafs.GcsFileSystem):
        from datetime import timedelta

        # Configure a retry time limit for GcsFileSystem so that it
        # doesn't hang forever trying to get file info (e.g., when
        # trying to get a public file w/o anonymous=True).
        # NOTE(review): this replaces any GcsFileSystem instance here with
        # an anonymous one, presumably including one passed in by the
        # caller - confirm that this is intended for private buckets.
        filesystem = pafs.GcsFileSystem(
            anonymous=True,
            retry_time_limit=timedelta(seconds=10),
        )
    logger.info(f"Using filesystem {type(filesystem)} to get file size of: {path}")
    file_info = get_file_info(path, filesystem)
    if file_info.type != FileType.File:
        raise ValueError(
            f"Expected `src` to be a file but got `{file_info.type}` at "
            f"`{src.url_path}`."
        )
    inflation_multiplier = extension_to_memory_multiplier.get(file_info.extension)
    if inflation_multiplier is None:
        inflation_multiplier = extension_to_memory_multiplier.get("*")
    if inflation_multiplier is None:
        # A caller-supplied map may omit the "*" fallback; skip the estimate
        # instead of failing on `None * size`.
        logger.warning(
            f"No memory multiplier found for extension "
            f"`{file_info.extension}` and no `*` fallback was given. "
            f"Skipping memory estimation."
        )
        return 0
    estimated_memory_bytes = inflation_multiplier * file_info.size
    logger.info(
        f"Estimated Memory Required for Copy: "
        f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
    )
    return estimated_memory_bytes


def _copy_external_ray(
    src: DeltaCatUrl,
    dst: DeltaCatUrl,
    *,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    extension_to_memory_multiplier: Dict[str, float] = {
        "pq": 5,
        "parquet": 5,
        "feather": 1.5,
        "arrow": 1.5,
        "csv": 1.5,
        "tsv": 1.5,
        "psv": 1.5,
        "txt": 1.5,
        "json": 1.5,
        "jsonl": 1.5,
        "gz": 35,
        "bz2": 35,
        "zip": 35,
        "zst": 35,
        "7z": 35,
        "*": 2.5,
    },
    minimum_worker_cpus: int = 0,
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
    filesystem: Optional[pafs.FileSystem] = None,
) -> str:
    """
    Copy a single external file at `src` to `dst` using one Ray task.

    The mutable default arguments are only ever read by this function; they
    must not be mutated here because they are shared across calls.

    Args:
        src: Source URL; must be a `DeltaCatUrl` that resolves to a file when
            memory estimation is enabled.
        dst: Destination URL.
        transforms: Callables applied in order to the table read from `src`.
        extension_to_memory_multiplier: Maps a file extension to the factor
            its file size is multiplied by to estimate the task's memory
            requirement. ``"*"`` is the fallback entry. A falsy value
            disables memory estimation.
        minimum_worker_cpus: If greater than 0, wait until this many CPUs
            are live in addition to the current node's CPUs.
        reader_args: Keyword arguments forwarded to the reader.
        writer_args: Keyword arguments forwarded to the writer.
        filesystem: Optional filesystem used to resolve `src` when estimating
            memory requirements.

    Returns:
        The destination URL string (`dst.url`).

    Raises:
        ValueError: If `src` is not a `DeltaCatUrl` or is not a file.
        OutOfMemoryError: If the copy task still runs out of memory after
            the allowed number of retries with more dedicated CPUs.
    """
    logger.info(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")

    if not isinstance(src, DeltaCatUrl):
        raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")

    # wait for required resources
    head_cpu_count = int(current_node_resources()["CPU"])
    if minimum_worker_cpus > 0:
        logger.info(f"Waiting for {minimum_worker_cpus} worker CPUs...")
        live_cpu_waiter(
            min_live_cpus=minimum_worker_cpus + head_cpu_count,
        )
        logger.info(f"{minimum_worker_cpus} worker CPUs found!")
    # start job execution
    cluster_resources = ray.cluster_resources()
    logger.info(f"Cluster Resources: {cluster_resources}")
    logger.info(f"Available Cluster Resources: {ray.available_resources()}")
    cluster_cpus = int(cluster_resources["CPU"])
    logger.info(f"Cluster CPUs: {cluster_cpus}")
    all_node_resource_keys = live_node_resource_keys()
    logger.info(
        f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}"
    )
    worker_node_resource_keys = other_live_node_resource_keys()
    logger.info(
        f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
    )
    worker_cpu_count = cluster_cpus - head_cpu_count
    logger.info(f"Total worker CPUs: {worker_cpu_count}")

    # estimate memory requirements based on file extension
    estimated_memory_bytes = 0
    if extension_to_memory_multiplier:
        estimated_memory_bytes = _estimate_copy_memory_bytes(
            src,
            extension_to_memory_multiplier,
            filesystem,
        )
    logger.info(f"Starting DeltaCAT Copy at: {time.time_ns()}")

    copy_result = None
    copy_latency = None
    num_cpus = 1
    # TODO(pdames): remove hard-coding - issues encountered when going greater
    #  than 2 include verifying that the scope of schedulable nodes doesn't
    #  result in all large files lining up for the one large node in the cluster
    #  that can actually handle them (which is worse if it's also the head node)
    max_allowed_cpus = 2
    while copy_result is None:
        copy_task_pending, launch_latency = timed_invocation(
            copy_task.options(num_cpus=num_cpus, memory=estimated_memory_bytes).remote,
            src=src,
            dest=dst,
            dataset_type=DatasetType.POLARS,
            transforms=transforms,
            reader_args=reader_args,
            writer_args=writer_args,
        )
        logger.info(f"Time to Launch Copy Task: {launch_latency} seconds")
        try:
            copy_result, copy_latency = timed_invocation(
                ray.get,
                copy_task_pending,
            )
        except OutOfMemoryError as e:
            logger.warning(f"Copy Task Ran Out of Memory: {e}")
            max_single_node_cpus = min(
                max_allowed_cpus, find_max_single_node_resource_type("CPU")
            )
            # retry with one more dedicated CPU until the per-node cap is hit
            num_cpus += 1
            if num_cpus > max_single_node_cpus:
                # bare raise keeps the original traceback intact
                raise
            logger.info(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")

    logger.info(f"Time to Complete Copy Task: {copy_latency} seconds")

    total_gib_indexed = copy_result.table_size / BYTES_PER_GIBIBYTE

    logger.info(f"Records Copied: {copy_result.table_length}")
    logger.info(f"Bytes Copied: {total_gib_indexed} GiB")
    logger.info(f"Conversion Rate: {total_gib_indexed/copy_latency} GiB/s")
    logger.info(f"Finished Copy at: {time.time_ns()}")

    return dst.url
521
+
522
+
523
@ray.remote(scheduling_strategy="SPREAD")
def copy_task(
    src: DeltaCatUrl,
    dest: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
) -> "CopyResult":
    """
    Copies a DeltaCAT source URL into a DeltaCAT destination URL.

    The mutable default arguments are only read, never mutated.

    Args:
        src: URL to read the table from.
        dest: URL to write the table to.
        dataset_type: Dataset type used for both reading and writing.
        transforms: Callables applied in order to the table after reading.
        reader_args: Keyword arguments forwarded to the reader.
        writer_args: Keyword arguments forwarded to the writer.

    Returns:
        A `CopyResult` holding the size and length of the copied table.
    """
    table, read_latency = timed_invocation(
        read_table,
        src=src,
        dataset_type=dataset_type,
        transforms=transforms,
        reader_args=reader_args,
    )
    logger.debug(f"Time to read {src.url_path}: {read_latency} seconds")

    table_size = get_table_size(table)
    logger.debug(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")

    table_length = get_table_length(table)
    logger.debug(f"Table Records: {table_length}")

    writer = DeltaCatUrlWriter(dest, dataset_type)
    written_file_path, write_latency = timed_invocation(
        writer.write,
        "",
        table,
        **writer_args,
    )
    logger.debug(f"Time to write {written_file_path}: {write_latency}")

    return CopyResult(table_size, table_length)
560
+
561
+
562
def read_table(
    src: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
) -> LocalTable:
    """
    Read `src` as a table of the given dataset type and run it through each
    transform in order.

    Args:
        src: URL to read from.
        dataset_type: Dataset type handed to the `DeltaCatUrlReader`.
        transforms: Callables invoked as ``transform(table, src)``; each
            result becomes the input of the next transform.
        reader_args: Keyword arguments forwarded to the reader's `read`.

    Returns:
        The table produced by the last transform, or the table as read when
        no transforms are given.
    """
    result: LocalTable = DeltaCatUrlReader(src, dataset_type).read(**reader_args)
    for apply_transform in transforms:
        result = apply_transform(result, src)
    return result
573
+
574
+
575
@dataclass(frozen=True)
class CopyResult:
    """Immutable summary of a finished copy task."""

    # size of the copied table (divided by BYTES_PER_GIBIBYTE when logged)
    table_size: int
    # number of records in the copied table
    table_length: int