deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py
@@ -0,0 +1,263 @@
+"""
+Spark SQL utilities for Iceberg table operations.
+
+This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+"""
+
+import os
+import apache_beam as beam
+from apache_beam import Row
+
+
+class SparkSQLIcebergRead(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        catalog_uri: str = "http://localhost:8181",
+        warehouse: str = "warehouse/",
+    ):
+        """
+        Initialize the Spark SQL reader.
+
+        Args:
+            table_name: Name of the Iceberg table
+            catalog_uri: URI of the Iceberg REST catalog
+            warehouse: Warehouse path
+        """
+        self.table_name = table_name
+        self.catalog_uri = catalog_uri
+        self.warehouse = warehouse
+        self.spark = None
+
+    def setup(self):
+        """Set up Spark session (called once per worker)."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            # Get Spark version for dependency resolution
+            try:
+                spark_version = ".".join(
+                    importlib.metadata.version("pyspark").split(".")[:2]
+                )
+            except Exception:
+                spark_version = "3.5"  # Default fallback
+
+            scala_version = "2.12"
+            iceberg_version = "1.6.0"
+
+            print(f"🔧 Setting up Spark session for reading {self.table_name}")
+            print(f"   - Spark version: {spark_version}")
+            print(f"   - Iceberg version: {iceberg_version}")
+
+            # Set Spark packages for Iceberg runtime
+            os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                f"pyspark-shell"
+            )
+
+            # Create Spark session with Iceberg REST catalog configuration
+            self.spark = (
+                SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                .config("spark.sql.session.timeZone", "UTC")
+                .config(
+                    "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                # Configure REST catalog
+                .config(
+                    "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                )
+                .config("spark.sql.catalog.rest.type", "rest")
+                .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                # Set REST as default catalog
+                .config("spark.sql.defaultCatalog", "rest")
+                # Local mode configuration (within Beam workers)
+                .config("spark.master", "local[1]")  # Single thread per worker
+                .config("spark.sql.adaptive.enabled", "true")
+                # Networking binding
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                .getOrCreate()
+            )
+
+            print(f"✅ Spark session created successfully")
+
+        except Exception as e:
+            print(f"❌ Failed to set up Spark session: {e}")
+            raise
+
+    def teardown(self):
+        """Clean up Spark session (called once per worker)."""
+        if self.spark:
+            try:
+                self.spark.stop()
+                print("✅ Spark session stopped")
+            except Exception as e:
+                print(f"⚠️ Error stopping Spark session: {e}")
+
+    def process(self, element):
+        """
+        Process element (read from Iceberg table using Spark SQL).
+
+        Args:
+            element: Input element (not used, just triggers the read)
+
+        Yields:
+            Records from the Iceberg table
+        """
+        try:
+            if not self.spark:
+                raise RuntimeError("Spark session not initialized")
+
+            print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+            # Read from Iceberg table using Spark SQL
+            df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+            # Collect all records
+            records = df.collect()
+
+            print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+            # Convert Spark rows to Beam Row objects and yield
+            for row in records:
+                row_dict = row.asDict()
+                # Convert to Beam Row for consistency with write mode
+                beam_row = Row(**row_dict)
+                yield beam_row
+
+        except Exception as e:
+            print(f"❌ Failed to read from table {self.table_name}: {e}")
+            raise
+
+
+class SparkSQLIcebergRewrite(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+    This uses Spark's rewrite_data_files procedure to materialize positional deletes
+    by rewriting data files. The result is a "clean" table without positional deletes.
+    """
+
+    def __init__(self, catalog_uri, warehouse_path, table_name):
+        self.catalog_uri = catalog_uri
+        self.warehouse_path = warehouse_path
+        self.table_name = table_name
+
+    def setup(self):
+        """Initialize Spark session for rewrite operations."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+            # Detect Spark version for appropriate Iceberg runtime
+            spark_version = importlib.metadata.version("pyspark")
+            major_minor = ".".join(spark_version.split(".")[:2])
+            print(f"   - Spark version: {major_minor}")
+            print(f"   - Iceberg version: 1.6.0")
+
+            # Configure Spark with Iceberg
+            self.spark = (
+                SparkSession.builder.appName("IcebergRewrite")
+                .config(
+                    "spark.jars.packages",
+                    f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                .config(
+                    "spark.sql.catalog.spark_catalog",
+                    "org.apache.iceberg.spark.SparkSessionCatalog",
+                )
+                .config("spark.sql.catalog.spark_catalog.type", "rest")
+                .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                .config(
+                    "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                )
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .getOrCreate()
+            )
+
+            print("✅ Spark session created successfully")
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+    def process(self, element):
+        """Rewrite table data files to materialize positional deletes."""
+        try:
+            print(
+                f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+            )
+
+            # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+            # This forces rewrite even when there's only 1 positional delete file
+            rewrite_sql = f"""
+                CALL spark_catalog.system.rewrite_data_files(
+                    table => '{self.table_name}',
+                    options => map('delete-file-threshold', '1')
+                )
+            """
+
+            print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+            print(f"   SQL: {rewrite_sql.strip()}")
+            print(
+                f"   Rationale: Forces rewrite even with single positional delete file"
+            )
+
+            result = self.spark.sql(rewrite_sql)
+
+            # Collect results to see what was rewritten
+            rewrite_result = result.collect()[0]
+            print(f"📊 Rewrite result: {rewrite_result}")
+
+            # Check if we actually rewrote anything
+            if rewrite_result.rewritten_data_files_count > 0:
+                print(
+                    f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                )
+                print(
+                    f"   - Added {rewrite_result.added_data_files_count} new data files"
+                )
+                print(f"   - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                print(f"   - Positional deletes have been materialized!")
+            else:
+                print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                print(f"   - This may indicate no positional deletes exist")
+                print(f"   - Or the table may already be in optimal state")
+
+            yield f"Rewrite completed for {self.table_name}"
+
+        except Exception as e:
+            print(f"❌ Error during rewrite: {e}")
+            import traceback
+
+            traceback.print_exc()
+            yield f"Rewrite failed for {self.table_name}: {e}"
+
+    def teardown(self):
+        """Clean up Spark session."""
+        if hasattr(self, "spark"):
+            print("✅ Spark session stopped")
+            self.spark.stop()
deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py
@@ -0,0 +1,184 @@
+import os
+import logging
+
+import uuid
+import daft
+from pyiceberg.catalog import CatalogType
+
+import deltacat as dc
+
+from deltacat import logs
+from deltacat import IcebergCatalog
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+from env import store_cli_args_in_os_environ
+
+from pyiceberg.schema import (
+    Schema,
+    NestedField,
+    DoubleType,
+    StringType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import BucketTransform
+
+from deltacat.experimental.storage.iceberg.model import (
+    SchemaMapper,
+    PartitionSchemeMapper,
+)
+from deltacat.env import create_ray_runtime_environment
+
+# initialize the driver logger
+driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+
+def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+    """
+    This is an e2e example that
+    1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
+    2. writes data into the DeltaCAT Table
+    3. reads data from the DeltaCAT Table using Daft
+
+    To run the script:
+    1. prepare an AWS Account
+        1. prepare a S3 location where the data will be written to, which will be used in Step 3.
+        2. prepare an IAM Role that has access to the S3 location and Glue
+    2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
+    3. run below command to execute the example
+    ```
+    make venv && source venv/bin/activate
+    python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
+    ```
+
+    """
+    # create any runtime environment required to run the example
+    runtime_env = create_ray_runtime_environment()
+
+    # Start by initializing DeltaCAT and registering available Catalogs.
+    # Ray will be initialized automatically via `ray.init()`.
+    # Only the `iceberg` data catalog is provided so it will become the default.
+    # If initializing multiple catalogs, use the `default_catalog_name` param
+    # to specify which catalog should be the default.
+
+    dc.init(
+        catalogs={
+            # the name of the DeltaCAT catalog is "iceberg"
+            "iceberg": dc.Catalog(
+                # Apache Iceberg implementation of deltacat.catalog.interface
+                impl=IcebergCatalog,
+                # kwargs for pyiceberg.catalog.load_catalog start here...
+                # the name of the Iceberg catalog is "example-iceberg-catalog"
+                name="example-iceberg-catalog",
+                # for additional properties see:
+                # https://py.iceberg.apache.org/configuration/
+                config=IcebergCatalogConfig(
+                    type=CatalogType.GLUE,
+                    properties={
+                        "warehouse": warehouse,
+                        "region_name": "us-east-1",
+                    },
+                ),
+            )
+        },
+        # pass the runtime environment into ray.init()
+        ray_init_args={"runtime_env": runtime_env},
+    )
+
+    # define a native Iceberg table schema
+    schema = Schema(
+        NestedField(field_id=1, name="symbol", field_type=StringType(), required=True),
+        NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
+        NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
+    )
+
+    # define a native Iceberg partition spec
+    partition_spec = PartitionSpec(
+        PartitionField(
+            source_id=1,
+            field_id=1000,
+            transform=BucketTransform(2),
+            name="symbol_bucket",
+        )
+    )
+
+    # define a native Iceberg sort order
+    # sort_order = SortOrder(SortField(source_id=1, transform=IdentityTransform()))
+
+    # define the Daft dataframe to write
+    df = daft.from_pydict(
+        {
+            "symbol": ["amzn", "goog", "meta", "msft"],
+            "bid": [157.16, 150.55, 392.03, 403.25],
+            "ask": [157.17, 150.56, 392.09, 403.27],
+        }
+    )
+
+    # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
+    # we don't need to specify which catalog to create this table in since
+    # only the "iceberg" catalog is available
+    table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
+    namespace = "test_namespace"
+    print(f"Creating Glue Table: {namespace}.{table_name}")
+    dc.write_to_table(
+        data=df,
+        # path=warehouse + "/datafiles",
+        table=table_name,
+        namespace=namespace,
+        schema=SchemaMapper.map(schema),
+        partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
+        # sort_keys=SortSchemeMapper.map(sort_order, schema),
+    )
+
+    print(f"Getting Glue Table: {namespace}.{table_name}")
+    table_definition = dc.get_table(name=table_name, namespace=namespace)
+    print(f"Retrieved Glue Table: {table_definition}")
+
+    # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
+    daft_dataframe = dc.read_table(table=table_name, namespace=namespace)
+
+    daft_dataframe.where(df["bid"] > 200.0).show()
+    # Expected result:
+    # ╭────────┬─────────┬─────────╮
+    # │ symbol ┆ bid     ┆ ask     │
+    # │ ---    ┆ ---     ┆ ---     │
+    # │ Utf8   ┆ Float64 ┆ Float64 │
+    # ╞════════╪═════════╪═════════╡
+    # │ meta   ┆ 392.03  ┆ 392.09  │
+    # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+    # │ msft   ┆ 403.25  ┆ 403.27  │
+    # ╰────────┴─────────┴─────────╯
+
+    daft_dataframe.select("symbol").show()
+    # Expected result:
+    # ╭────────╮
+    # │ symbol │
+    # │ ---    │
+    # │ Utf8   │
+    # ╞════════╡
+    # │ meta   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ amzn   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ goog   │
+    # ├╌╌╌╌╌╌╌╌┤
+    # │ msft   │
+    # ╰────────╯
+
+
+if __name__ == "__main__":
+    example_script_args = [
+        (
+            [
+                "--warehouse",
+            ],
+            {
+                "help": "S3 path for Iceberg file storage.",
+                "type": str,
+            },
+        ),
+    ]
+
+    # store any CLI args in the runtime environment
+    store_cli_args_in_os_environ(example_script_args)
+
+    # run the example using os.environ as kwargs
+    run(**os.environ)
deltacat/examples/experimental/iceberg/iceberg_reader.py
@@ -0,0 +1,147 @@
+import os
+import logging
+import deltacat as dc
+
+from deltacat import logs
+from deltacat import IcebergCatalog
+from env import store_cli_args_in_os_environ
+
+from pyiceberg.schema import (
+    Schema,
+    NestedField,
+    DoubleType,
+    StringType,
+    TimestampType,
+    FloatType,
+    StructType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import DayTransform, IdentityTransform
+from pyiceberg.table.sorting import SortField, SortOrder
+
+from deltacat.exceptions import TableAlreadyExistsError
+from deltacat.experimental.storage.iceberg.model import (
+    SchemaMapper,
+    PartitionSchemeMapper,
+    SortSchemeMapper,
+)
+from deltacat.env import create_ray_runtime_environment
+
+# initialize the driver logger
+driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+
+def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+    # create any runtime environment required to run the example
+    runtime_env = create_ray_runtime_environment()
+
+    # Start by initializing DeltaCAT and registering available Catalogs.
+    # Ray will be initialized automatically via `ray.init()`.
+    # Only the `iceberg` data catalog is provided so it will become the default.
+    # If initializing multiple catalogs, use the `default_catalog_name` param
+    # to specify which catalog should be the default.
+    dc.init(
+        catalogs={
+            # the name of the DeltaCAT catalog is "iceberg"
+            "iceberg": dc.Catalog(
+                # Apache Iceberg implementation of deltacat.catalog.interface
+                impl=IcebergCatalog,
+                # kwargs for pyiceberg.catalog.load_catalog start here...
+                # the name of the Iceberg catalog is "example-iceberg-catalog"
+                name="example-iceberg-catalog",
+                # for additional properties see:
+                # https://py.iceberg.apache.org/configuration/
+                properties={
+                    "type": "glue",
+                    "region_name": "us-east-1",
+                    "warehouse": warehouse,
+                },
+            )
+        },
+        # pass the runtime environment into ray.init()
+        ray_init_args={"runtime_env": runtime_env},
+    )
+
+    # define a native Iceberg table schema
+    schema = Schema(
+        NestedField(
+            field_id=1, name="datetime", field_type=TimestampType(), required=True
+        ),
+        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
+        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
+        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
+        NestedField(
+            field_id=5,
+            name="details",
+            field_type=StructType(
+                NestedField(
+                    field_id=6,
+                    name="created_by",
+                    field_type=StringType(),
+                    required=False,
+                ),
+            ),
+            required=False,
+        ),
+    )
+
+    # define a native Iceberg partition spec
+    partition_spec = PartitionSpec(
+        PartitionField(
+            source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
+        )
+    )
+
+    # define a native Iceberg sort order
+    sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))
+
+    # create a table named `test_namespace.test_table`
+    # we don't need to specify which catalog to create this table in since
+    # only the "iceberg" catalog is available
+    table_name = "test_table"
+    namespace = "test_namespace"
+    print(f"Creating Glue Table: {namespace}.{table_name}")
+    try:
+        table_definition = dc.create_table(
+            table=table_name,
+            namespace=namespace,
+            schema=SchemaMapper.map(schema),
+            partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
+            sort_keys=SortSchemeMapper.map(sort_order, schema),
+        )
+        print(f"Created Glue Table: {table_definition}")
+    except TableAlreadyExistsError:
+        print(f"Glue Table `{namespace}.{table_name}` already exists.")
+
+    print(f"Getting Glue Table: {namespace}.{table_name}")
+    table_definition = dc.get_table(table_name, namespace)
+    print(f"Retrieved Glue Table: {table_definition}")
+
+
+if __name__ == "__main__":
+    example_script_args = [
+        (
+            [
+                "--warehouse",
+            ],
+            {
+                "help": "S3 path for Iceberg file storage.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--STAGE",
+            ],
+            {
+                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
+                "type": str,
+            },
+        ),
+    ]
+
+    # store any CLI args in the runtime environment
+    store_cli_args_in_os_environ(example_script_args)
+
+    # run the example using os.environ as kwargs
+    run(**os.environ)
deltacat/examples/hello_world.py
@@ -0,0 +1,29 @@
+import ray
+import deltacat
+import daft
+
+
+def print_package_version_info():
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+
+
+@ray.remote
+def hello_worker():
+    print("Hello, Worker!")
+    print_package_version_info()
+
+
+def run():
+    print("Hello, Driver!")
+    print_package_version_info()
+    hello_worker.remote()
+
+
+if __name__ == "__main__":
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example
+    run()
File without changes
File without changes
File without changes