deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/api.py ADDED
@@ -0,0 +1,578 @@
1
+ import time
2
+ from dataclasses import dataclass
3
+ from typing import Any, Union, List, Optional, Dict, Callable, Tuple
4
+ import logging
5
+
6
+ import ray
7
+ import deltacat as dc
8
+ import pyarrow.fs as pafs
9
+
10
+ from pyarrow.fs import FileType
11
+ from ray.exceptions import OutOfMemoryError
12
+
13
+ from deltacat.constants import BYTES_PER_GIBIBYTE
14
+ from deltacat.io import (
15
+ read_deltacat,
16
+ DeltacatReadType,
17
+ )
18
+ from deltacat.storage import (
19
+ Namespace,
20
+ Table,
21
+ TableVersion,
22
+ Stream,
23
+ Partition,
24
+ Delta,
25
+ Dataset,
26
+ DistributedDataset,
27
+ ListResult,
28
+ LocalTable,
29
+ Metafile,
30
+ )
31
+ from deltacat.types.media import DatasetType
32
+ from deltacat.utils.url import (
33
+ DeltaCatUrl,
34
+ DeltaCatUrlReader,
35
+ DeltaCatUrlWriter,
36
+ )
37
+ from deltacat.utils.common import ReadKwargsProvider
38
+ from deltacat.types.tables import (
39
+ get_table_size,
40
+ get_table_length,
41
+ )
42
+ from deltacat.utils.filesystem import (
43
+ resolve_path_and_filesystem,
44
+ get_file_info,
45
+ )
46
+ from deltacat.utils.performance import timed_invocation
47
+ from deltacat.utils.ray_utils.runtime import (
48
+ current_node_resources,
49
+ live_cpu_waiter,
50
+ live_node_resource_keys,
51
+ other_live_node_resource_keys,
52
+ find_max_single_node_resource_type,
53
+ )
54
+ from deltacat import logs
55
+
56
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
57
+
58
+ """
59
+ # CLI Example of Copying from Source to Dest without file conversion
60
+ # (i.e., register only - shallow copy):
61
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
62
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
63
+
64
+ # CLI Example of Copying from Source to Dest without file conversion
65
+ # (i.e., deep copy of the source files):
66
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table -r
67
+ # The above command will make a deep copy of all JSON files found in the source
68
+ # to the catalog data file directory in the destination.
69
+
70
+ # CLI Example of Copying from Source to Dest with file conversion
71
+ # (i.e., deep copy with file content type transformation):
72
+ $ dcat convert json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/ --type FEATHER
73
+ # The above command will read JSON files found in the source, transform them to
74
+ # Arrow Feather files, and register them in the destination.
75
+
76
+ # Python Example of Copying from Source to Dest with file conversion
77
+ # (i.e., deep copy with file content type transformation):
78
+ >>> ds = dc.get("json+s3://my_bucket/log_manager/")
79
+ >>> dc.put("dc://my_deltacat_catalog/log_manager/", dataset=ds, type=ContentType.FEATHER)
80
+ # Or, equivalently, we can do the write directly from the dataset:
81
+ >>> ds.write_deltacat("dc://my_deltacat_catalog/log_manager/", type=ContentType.FEATHER)
82
+ """
83
+
84
+
85
def copy(
    src: DeltaCatUrl,
    dst: DeltaCatUrl,
    *,
    transforms: Optional[List[Callable[[Dataset, DeltaCatUrl], Dataset]]] = None,
    extension_to_memory_multiplier: Optional[Dict[str, float]] = None,
    minimum_worker_cpus: int = 0,
    reader_args: Optional[Dict[str, Any]] = None,
    writer_args: Optional[Dict[str, Any]] = None,
    filesystem: Optional[pafs.FileSystem] = None,
) -> Union[Metafile, str]:
    """
    Copies data from the source datastore to the destination datastore.

    If either URL is a DeltaCAT catalog URL, the copy is delegated to
    `_copy_dc`, which copies catalog metafiles (recursively when the source
    URL ends with "/**"). In that case `transforms`,
    `extension_to_memory_multiplier`, `minimum_worker_cpus`, `reader_args`,
    `writer_args` and `filesystem` are not used.

    Otherwise the copy is delegated to `_copy_external_ray`. By default,
    this launches one parallel Ray process to read/transform each input file
    found in the source followed by one parallel Ray process to write each
    output file to the destination. To ensure that adequate resources are
    available to complete the operation, you may optionally specify a
    minimum number of worker CPUs to wait for before starting parallel
    processing.

    Args:
        src: DeltaCAT URL of the source datastore to read.
        dst: DeltaCAT URL of the destination datastore to write.
        transforms: List of transforms to apply to the source dataset prior
            to writing it to the destination datastore. Transforms take the
            in-memory dataset type read (e.g., Polars DataFrame) and source
            DeltaCAT URL as input and return the same dataset type as
            output. Transforms are applied to the dataset in the order
            given. Defaults to no transforms.
        extension_to_memory_multiplier: Dictionary of file extensions to
            in-memory inflation estimates for that extension (i.e., the
            amount of memory required to read a source file, apply
            transforms, and write it back to a destination file). The "*"
            key holds the fallback estimate. Defaults to a built-in table.
        minimum_worker_cpus: The minimum number of Ray worker CPUs to wait
            for before starting distributed execution. Useful for cases
            where the operation is known to suffer from resource starvation
            (e.g., out-of-memory errors) if started before the cluster has
            launched a minimum number of required worker nodes.
        reader_args: Additional keyword arguments to forward to the reader
            associated with the in-memory dataset and datastore type to read
            (e.g., polars.read_csv(**kwargs)). Defaults to no extra
            arguments.
        writer_args: Additional keyword arguments to forward to the writer
            associated with the in-memory dataset type read and datastore
            type to write (e.g., polars.DataFrame.write_parquet(**kwargs)).
            Defaults to no extra arguments.
        filesystem: Optional PyArrow filesystem to use for file IO. Will be
            automatically resolved from the input path if not specified, and
            will attempt to automatically resolve storage read/write
            credentials for the associated source/dest file cloud
            provider(s). Try providing your own filesystem with credentials,
            retry strategy, etc. pre-configured if you encounter latency
            issues or errors reading/writing files.

    Returns:
        For catalog URLs, the result of `_copy_dc`: the copied metafile, or
        a list of copied metafiles when more than one object was copied.
        Otherwise, the result of `_copy_external_ray`.
        NOTE(review): `_copy_external_ray` is defined outside this view;
        presumably it returns the `str` named in the return annotation.
    """
    if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
        return _copy_dc(src, dst, recursive=src.url.endswith("/**"))

    # Build defaults per call rather than in the signature, so that no
    # mutable object is shared (and possibly mutated) across invocations.
    if extension_to_memory_multiplier is None:
        extension_to_memory_multiplier = {
            "pq": 5,
            "parquet": 5,
            "feather": 1.5,
            "arrow": 1.5,
            "csv": 1.5,
            "tsv": 1.5,
            "psv": 1.5,
            "txt": 1.5,
            "json": 1.5,
            "jsonl": 1.5,
            "gz": 35,
            "bz2": 35,
            "zip": 35,
            "zst": 35,
            "7z": 35,
            "*": 2.5,
        }
    return _copy_external_ray(
        src,
        dst,
        transforms=[] if transforms is None else transforms,
        extension_to_memory_multiplier=extension_to_memory_multiplier,
        minimum_worker_cpus=minimum_worker_cpus,
        reader_args={} if reader_args is None else reader_args,
        writer_args={} if writer_args is None else writer_args,
        filesystem=filesystem,
    )
169
+
170
+
171
def _copy_objects_in_order(
    src_objects: List[Metafile],
    destination: DeltaCatUrl,
) -> Union[Metafile, List[Metafile]]:
    """
    Copies the given metafiles into the destination catalog, parents first.

    Objects are grouped by their metafile class and written in the order
    Namespace -> Table -> TableVersion -> Stream -> Partition -> Delta.
    Table versions are copied in ascending version number and deltas in
    ascending stream position. Each object is written with `put` to the
    URL it reports for the destination catalog name.

    Args:
        src_objects: Metafiles to copy.
        destination: DeltaCAT URL whose catalog name identifies the
            destination catalog.

    Returns:
        The single `put` result if exactly one object was copied, otherwise
        the list of all `put` results (empty if nothing was copied).

    Raises:
        KeyError: If an object's resolved metafile class is not one of the
            six classes listed above.
    """
    catalog_name = DeltaCatUrl(destination.url).catalog_name

    copied_results = []

    # Dict insertion order doubles as the copy order: every parent level
    # is written before any of its children.
    ordered_objects_by_type = {
        Namespace: [],
        Table: [],
        TableVersion: [],
        Stream: [],
        Partition: [],
        Delta: [],
    }

    for obj in src_objects:
        obj_class = Metafile.get_class(obj.to_serializable())
        ordered_objects_by_type[obj_class].append(obj)

    # TODO(pdames): Support copying uncommitted streams/partitions.
    # TODO(pdames): Support parallel/distributed copies.
    for obj_class, objects in ordered_objects_by_type.items():
        if not objects:
            continue
        total = len(objects)
        logger.info(f"Copying {total} {obj_class} objects...")
        if obj_class == TableVersion:
            # sort table versions by ascending table version
            objects.sort(key=lambda x: x.current_version_number())
        if obj_class == Delta:
            # sort deltas by ascending stream position
            objects.sort(key=lambda x: x.stream_position)
        for i, obj in enumerate(objects):
            # `url` is a method (it is called just below), so formatting
            # `obj.url` uncalled would log a bound-method repr. Log the
            # object's class here; the destination URL is logged next.
            logger.info(f"Copying object {i+1}/{total}: {obj_class.__name__}")
            dest_url = DeltaCatUrl(obj.url(catalog_name=catalog_name))
            logger.info(f"Destination URL for object {i+1}/{total}: {dest_url}")
            result = put(dest_url, metafile=obj)
            copied_results.append(result)
            logger.info(f"Successfully copied object {i+1}/{total}")
    return copied_results[0] if len(copied_results) == 1 else copied_results
215
+
216
+
217
def _copy_dc(
    source: DeltaCatUrl,
    destination: DeltaCatUrl,
    recursive: bool = False,
) -> Union[Metafile, List[Metafile]]:
    """
    Copies DeltaCAT catalog objects from the source URL to the destination.

    The source objects are resolved in one of three ways:
      * `recursive=True`: the "/**" wildcard suffix is removed and the
        remaining URL is listed recursively.
      * source URL ends with "/*": the wildcard suffix is removed and the
        remaining URL is listed non-recursively.
      * otherwise: the single object at the source URL is fetched.
    The resolved objects are then copied via `_copy_objects_in_order`.

    Args:
        source: DeltaCAT URL to copy from.
        destination: DeltaCAT URL to copy to. Must have the same number of
            "/"-separated components as the source URL.
        recursive: Whether to recursively copy everything under the source.

    Returns:
        The copied metafile, or a list of copied metafiles if more than one
        object was copied.

    Raises:
        ValueError: If source and destination URLs differ in the number of
            "/"-separated components.
    """
    dc.raise_if_not_initialized()
    if len(source.url.split("/")) != len(destination.url.split("/")):
        # TODO(pdames): Better error message.
        raise ValueError(
            f"Cannot copy {source} to {destination}. "
            f"Source and destination must share the same type."
        )
    src_url = source.url
    # NOTE(review): `list` below must be this module's URL-listing function
    # rather than the builtin (the builtin rejects the `recursive` keyword);
    # it is defined outside this view - confirm.
    if recursive:
        # Remove exactly the "/**" suffix. str.rstrip("/**") treats its
        # argument as a set of characters and would also strip any other
        # trailing "/" or "*" that belongs to the URL itself.
        base_url = src_url[: -len("/**")] if src_url.endswith("/**") else src_url
        src_objects = list(DeltaCatUrl(base_url), recursive=True)
    elif src_url.endswith("/*"):
        # Same reasoning as above: drop only the literal "/*" suffix.
        src_objects = list(DeltaCatUrl(src_url[: -len("/*")]))
    else:
        src_objects = [get(source)]
    return _copy_objects_in_order(src_objects, destination)
236
+
237
+
238
def concat(source, destination):
    """
    Placeholder for concatenating `source` into `destination`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`concat` is not implemented yet.")
240
+
241
+
242
def delete(source):
    """
    Placeholder for deleting `source`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`delete` is not implemented yet.")
244
+
245
+
246
def move(source, destination):
    """
    Placeholder for moving `source` to `destination`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`move` is not implemented yet.")
248
+
249
+
250
def _list_all_metafiles(
    url: DeltaCatUrl,
    recursive: bool = False,
    **kwargs,
) -> List[Metafile]:
    """
    List the metafiles found at `url`, optionally descending into children.

    Args:
        url: URL handed to `DeltaCatUrlReader` to obtain the listers.
        recursive: When true, every remaining lister of the reader is run
            once per metafile found at the previous level.
        **kwargs: Keyword arguments forwarded to every lister call.

    Returns:
        All items of every list result, in the order the results were
        produced (top level first, then each deeper level).
    """
    url_reader = DeltaCatUrlReader(url)
    # The first lister entry serves the URL's own level and needs no
    # keyword argument injected from a parent metafile.
    root_lister = url_reader.listers.pop(0)[0]
    root_page: ListResult[Metafile] = root_lister(**kwargs)
    pages: List[ListResult[Metafile]] = [root_page]

    if recursive:
        # NOTE: the builtin `list` is shadowed in this module, so unpack.
        parents = [*root_page.all_items()]
        for child_lister, parent_kwarg, resolve_parent_value in url_reader.listers:
            children = []
            for parent in parents:
                call_kwargs = {**kwargs}
                # Deeper listers receive the keyword argument they are
                # missing, resolved from the parent metafile.
                if parent_kwarg and resolve_parent_value:
                    call_kwargs[parent_kwarg] = resolve_parent_value(parent)
                child_page = child_lister(**call_kwargs)
                pages.append(child_page)
                children.extend(child_page.all_items())
            # The children become the parents of the next level.
            parents = children

    flattened = []
    for page in pages:
        flattened.extend(page.all_items())
    return flattened
286
+
287
+
288
class CustomReadKwargsProvider(ReadKwargsProvider):
    """
    Read-kwargs provider that applies a fixed set of keyword arguments to
    one datasource type and leaves every other type untouched.
    """

    def __init__(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ):
        # Datasource type the stored keyword arguments apply to.
        self._datasource_type = datasource_type
        # Keyword arguments merged into matching requests.
        self._kwargs = kwargs

    def _get_kwargs(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Merge the stored keyword arguments into `kwargs` when
        `datasource_type` matches the configured type.

        The given `kwargs` dict is updated in place and returned.
        """
        if datasource_type != self._datasource_type:
            return kwargs
        kwargs.update(self._kwargs)
        return kwargs
305
+
306
+
307
def list(
    url: DeltaCatUrl,
    *,
    recursive: bool = False,
    dataset_type: Optional[DatasetType] = None,
    **kwargs,
) -> Union[List[Metafile], LocalTable, DistributedDataset]:
    """
    List the metafiles behind a DeltaCAT catalog URL.

    Args:
        url: DeltaCAT catalog URL to list.
        recursive: Whether to list child objects recursively.
        dataset_type: Dataset type to return. A type outside
            `DatasetType.distributed()` (including the default `None`)
            yields a local list of metafiles.
        **kwargs: Forwarded to the listers (local listing) or wrapped in a
            `CustomReadKwargsProvider` (distributed listing).

    Returns:
        A local list of metafiles, or the result of `read_deltacat` for
        `DatasetType.RAY_DATASET`.

    Raises:
        NotImplementedError: If `url` is not a DeltaCAT catalog URL, or if
            a distributed dataset type other than `RAY_DATASET` is given.
    """
    if not url.is_deltacat_catalog_url():
        raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")

    if dataset_type not in DatasetType.distributed():
        # return a local list of metafiles
        # TODO(pdames): Cast the list to the appropriate local dataset type.
        return _list_all_metafiles(url=url, recursive=recursive, **kwargs)

    if dataset_type != DatasetType.RAY_DATASET:
        raise NotImplementedError(
            f"Unsupported dataset type: {dataset_type.name}. "
            f"Supported Dataset Types: {DatasetType.RAY_DATASET.name}",
        )

    if recursive:
        read_type = DeltacatReadType.METADATA_LIST_RECURSIVE
    else:
        read_type = DeltacatReadType.METADATA_LIST
    kwargs_provider = CustomReadKwargsProvider(
        datasource_type=url.datastore_type,
        kwargs=kwargs,
    )
    return read_deltacat(
        [url],
        deltacat_read_type=read_type,
        timestamp_as_of=None,
        merge_on_read=False,
        read_kwargs_provider=kwargs_provider,
    )
346
+
347
+
348
def get(
    url,
    *args,
    **kwargs,
) -> Union[Metafile, Dataset]:
    """
    Read the object at `url`.

    A `DeltaCatUrlReader` is built for `url`, and every extra positional
    and keyword argument is forwarded to its `read` method.

    Returns:
        Whatever the reader's `read` call returns.
    """
    return DeltaCatUrlReader(url).read(*args, **kwargs)
355
+
356
+
357
def put(
    url: DeltaCatUrl,
    metafile: Optional[Metafile] = None,
    *args,
    **kwargs,
) -> Union[Metafile, str]:
    """
    Write to the object at `url`.

    A `DeltaCatUrlWriter` is built for `url` with the optional `metafile`,
    and every extra positional and keyword argument is forwarded to its
    `write` method.

    Returns:
        Whatever the writer's `write` call returns.
    """
    return DeltaCatUrlWriter(url, metafile=metafile).write(*args, **kwargs)
365
+
366
+
367
def touch(path):
    """
    Placeholder for touching `path`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`touch` is not implemented yet.")
369
+
370
+
371
def exists(path):
    """
    Placeholder for checking whether `path` exists.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`exists` is not implemented yet.")
373
+
374
+
375
def query(expression):
    """
    Placeholder for evaluating a query `expression`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`query` is not implemented yet.")
377
+
378
+
379
def tail(path):
    """
    Placeholder for reading the tail of `path`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`tail` is not implemented yet.")
381
+
382
+
383
def head(path):
    """
    Placeholder for reading the head of `path`.

    Raises:
        NotImplementedError: Always; this API has no implementation yet.
    """
    # Name the API in the message so the failure is self-explanatory.
    raise NotImplementedError("`head` is not implemented yet.")
385
+
386
+
387
def _copy_external_ray(
    src: DeltaCatUrl,
    dst: DeltaCatUrl,
    *,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    extension_to_memory_multiplier: Dict[str, float] = {
        "pq": 5,
        "parquet": 5,
        "feather": 1.5,
        "arrow": 1.5,
        "csv": 1.5,
        "tsv": 1.5,
        "psv": 1.5,
        "txt": 1.5,
        "json": 1.5,
        "jsonl": 1.5,
        "gz": 35,
        "bz2": 35,
        "zip": 35,
        "zst": 35,
        "7z": 35,
        "*": 2.5,
    },
    minimum_worker_cpus: int = 0,
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
    filesystem: Optional[pafs.FileSystem] = None,
) -> str:
    """
    Copy a single external file to `dst` by running `copy_task` on Ray.

    Args:
        src: Source URL; must be a `DeltaCatUrl`. When memory estimation is
            enabled, `src.url_path` must resolve to a single file.
        dst: Destination URL handed to the copy task.
        transforms: Transforms forwarded to the copy task.
        extension_to_memory_multiplier: Maps a file extension to the factor
            the file size is multiplied by to estimate the task's memory
            request. The ``"*"`` entry is the fallback for extensions with
            no entry. A falsy value disables estimation (memory request 0).
        minimum_worker_cpus: When greater than 0, wait until this many CPUs
            in addition to the current node's CPUs are live.
        reader_args: Keyword arguments forwarded to the copy task's reader.
        writer_args: Keyword arguments forwarded to the copy task's writer.
        filesystem: Filesystem passed to `resolve_path_and_filesystem` when
            looking up the source file size.

    Returns:
        The destination URL string (`dst.url`).

    Raises:
        ValueError: If `src` is not a `DeltaCatUrl`, or if the source path
            is not a file while memory estimation is enabled.
        OutOfMemoryError: If the copy task runs out of memory and no retry
            with more CPUs is allowed.
    """
    # NOTE: the mutable defaults above are kept for interface compatibility;
    # this function only reads them and forwards them to the copy task.
    logger.info(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")

    if not isinstance(src, DeltaCatUrl):
        raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")

    # wait for required resources
    head_cpu_count = int(current_node_resources()["CPU"])
    if minimum_worker_cpus > 0:
        logger.info(f"Waiting for {minimum_worker_cpus} worker CPUs...")
        live_cpu_waiter(
            min_live_cpus=minimum_worker_cpus + head_cpu_count,
        )
        logger.info(f"{minimum_worker_cpus} worker CPUs found!")
    # start job execution
    cluster_resources = ray.cluster_resources()
    logger.info(f"Cluster Resources: {cluster_resources}")
    logger.info(f"Available Cluster Resources: {ray.available_resources()}")
    cluster_cpus = int(cluster_resources["CPU"])
    logger.info(f"Cluster CPUs: {cluster_cpus}")
    all_node_resource_keys = live_node_resource_keys()
    logger.info(
        f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}"
    )
    worker_node_resource_keys = other_live_node_resource_keys()
    logger.info(
        f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
    )
    worker_cpu_count = cluster_cpus - head_cpu_count
    logger.info(f"Total worker CPUs: {worker_cpu_count}")

    # estimate memory requirements based on file extension
    estimated_memory_bytes = 0
    if extension_to_memory_multiplier:
        logger.info(f"Resolving stats collection filesystem for: {src.url_path}.")
        path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
        if isinstance(filesystem, pafs.GcsFileSystem):
            from datetime import timedelta

            # Configure a retry time limit for GcsFileSystem so that it
            # doesn't hang forever trying to get file info (e.g., when
            # trying to get a public file w/o anonymous=True).
            filesystem = pafs.GcsFileSystem(
                anonymous=True,
                retry_time_limit=timedelta(seconds=10),
            )
        logger.info(f"Using filesystem {type(filesystem)} to get file size of: {path}")
        file_info = get_file_info(path, filesystem)
        if file_info.type != FileType.File:
            raise ValueError(
                f"Expected `src` to be a file but got `{file_info.type}` at "
                f"`{src.url_path}`."
            )
        inflation_multiplier = extension_to_memory_multiplier.get(file_info.extension)
        if inflation_multiplier is None:
            # No entry for this extension: fall back to the wildcard entry.
            inflation_multiplier = extension_to_memory_multiplier.get("*")
        estimated_memory_bytes = inflation_multiplier * file_info.size
        logger.info(
            f"Estimated Memory Required for Copy: "
            f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
        )
    logger.info(f"Starting DeltaCAT Copy at: {time.time_ns()}")

    index_result = None
    num_cpus = 1
    # TODO(pdames): remove hard-coding - issues encountered when going greater
    #  than 2 include verifying that the scope of schedulable nodes doesn't
    #  result in all large files lining up for the one large node in the cluster
    #  that can actually handle them (which is worse if it's also the head node)
    max_allowed_cpus = 2
    while not index_result:
        # Track launch and completion latency separately; sharing one
        # variable made later logs report the completion time as launch time.
        copy_task_pending, launch_latency = timed_invocation(
            copy_task.options(num_cpus=num_cpus, memory=estimated_memory_bytes).remote,
            src=src,
            dest=dst,
            dataset_type=DatasetType.POLARS,
            transforms=transforms,
            reader_args=reader_args,
            writer_args=writer_args,
        )
        logger.info(f"Time to Launch Copy Task: {launch_latency} seconds")
        try:
            index_result, copy_latency = timed_invocation(
                ray.get,
                copy_task_pending,
            )
        except OutOfMemoryError as e:
            logger.warning(f"Copy Task Ran Out of Memory: {e}")
            max_single_node_cpus = min(
                max_allowed_cpus, find_max_single_node_resource_type("CPU")
            )
            # Retry with one more dedicated CPU until the cap is exceeded.
            num_cpus += 1
            if num_cpus > max_single_node_cpus:
                raise
            logger.info(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")

    logger.info(f"Time to Complete Copy Task: {copy_latency} seconds")

    total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE

    logger.info(f"Records Copied: {index_result.table_length}")
    logger.info(f"Bytes Copied: {total_gib_indexed} GiB")
    logger.info(f"Conversion Rate: {total_gib_indexed/copy_latency} GiB/s")
    logger.info(f"Finished Copy at: {time.time_ns()}")

    return dst.url
521
+
522
+
523
@ray.remote(scheduling_strategy="SPREAD")
def copy_task(
    src: DeltaCatUrl,
    dest: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
) -> "CopyResult":
    """
    Indexes a DeltaCAT source URL into a DeltaCAT destination URL.

    Reads `src` into a table via `read_table`, measures it, and writes it
    to `dest` through a `DeltaCatUrlWriter`.

    Args:
        src: Source URL to read.
        dest: Destination URL to write.
        dataset_type: Dataset type used by both the reader and the writer.
        transforms: Transforms forwarded to `read_table`.
        reader_args: Keyword arguments forwarded to `read_table`.
        writer_args: Keyword arguments forwarded to the writer's `write`.

    Returns:
        A `CopyResult` holding the size and length of the table read.
    """
    # The return annotation is a string because `CopyResult` is defined
    # further down in this module.
    table, latency = timed_invocation(
        read_table,
        src=src,
        dataset_type=dataset_type,
        transforms=transforms,
        reader_args=reader_args,
    )
    logger.debug(f"Time to read {src.url_path}: {latency} seconds")

    table_size = get_table_size(table)
    logger.debug(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")

    table_length = get_table_length(table)
    logger.debug(f"Table Records: {table_length}")

    writer = DeltaCatUrlWriter(dest, dataset_type)
    written_file_path, latency = timed_invocation(
        writer.write,
        "",
        table,
        **writer_args,
    )
    logger.debug(f"Time to write {written_file_path}: {latency}")

    return CopyResult(table_size, table_length)
560
+
561
+
562
def read_table(
    src: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
) -> LocalTable:
    """
    Read `src` into a table and run it through `transforms`.

    Args:
        src: URL to read.
        dataset_type: Dataset type given to the `DeltaCatUrlReader`.
        transforms: Callables applied in order; each receives the current
            table and `src`, and returns the next table.
        reader_args: Keyword arguments forwarded to the reader's `read`.

    Returns:
        The table after the last transform, or the table as read when no
        transforms are given.
    """
    result: LocalTable = DeltaCatUrlReader(src, dataset_type).read(**reader_args)
    for apply_transform in transforms:
        result = apply_transform(result, src)
    return result
573
+
574
+
575
@dataclass(frozen=True)
class CopyResult:
    """Immutable summary of one copy task: the size and length of the table copied."""

    # Value of `get_table_size` for the copied table.
    table_size: int
    # Value of `get_table_length` for the copied table.
    table_length: int
deltacat/aws/constants.py CHANGED
@@ -1,32 +1,9 @@
1
- import botocore
2
1
  from typing import Set
3
- from daft.exceptions import DaftTransientError
4
2
  from deltacat.utils.common import env_integer, env_string
5
3
 
6
4
 
7
5
  DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
8
- DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
9
- "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
10
- ) # 5 mins
11
6
  BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
12
7
  BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
13
8
  BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
14
- RETRYABLE_TRANSIENT_ERRORS = (
15
- OSError,
16
- botocore.exceptions.ConnectionError,
17
- botocore.exceptions.HTTPClientError,
18
- botocore.exceptions.NoCredentialsError,
19
- botocore.exceptions.ConnectTimeoutError,
20
- botocore.exceptions.ReadTimeoutError,
21
- DaftTransientError,
22
- )
23
9
  AWS_REGION = env_string("AWS_REGION", "us-east-1")
24
- UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
25
- "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
26
- )
27
- UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
28
- "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
29
- )
30
- DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
31
- "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
32
- )