deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
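
A large share of the renames above move modules under the new `deltacat.experimental` namespace. A minimal sketch of how import paths change for downstream code (module paths taken directly from the rename list above; only modules are imported here to avoid assuming any symbol names):

```
# Before, in deltacat 2.0, these modules lived at the package top level:
# import deltacat.storage.rivulet.dataset
# import deltacat.catalog.iceberg.impl
# import deltacat.storage.iceberg.model

# After, in deltacat 2.0.0, the same modules resolve under `experimental`:
import deltacat.experimental.storage.rivulet.dataset
import deltacat.experimental.catalog.iceberg.impl
import deltacat.experimental.storage.iceberg.model
```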
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py
@@ -0,0 +1,263 @@
+ """
+ Spark SQL utilities for Iceberg table operations.
+
+ This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+ """
+
+ import os
+ import apache_beam as beam
+ from apache_beam import Row
+
+
+ class SparkSQLIcebergRead(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+     """
+
+     def __init__(
+         self,
+         table_name: str,
+         catalog_uri: str = "http://localhost:8181",
+         warehouse: str = "warehouse/",
+     ):
+         """
+         Initialize the Spark SQL reader.
+
+         Args:
+             table_name: Name of the Iceberg table
+             catalog_uri: URI of the Iceberg REST catalog
+             warehouse: Warehouse path
+         """
+         self.table_name = table_name
+         self.catalog_uri = catalog_uri
+         self.warehouse = warehouse
+         self.spark = None
+
+     def setup(self):
+         """Set up Spark session (called once per worker)."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             # Get Spark version for dependency resolution
+             try:
+                 spark_version = ".".join(
+                     importlib.metadata.version("pyspark").split(".")[:2]
+                 )
+             except Exception:
+                 spark_version = "3.5"  # Default fallback
+
+             scala_version = "2.12"
+             iceberg_version = "1.6.0"
+
+             print(f"🔧 Setting up Spark session for reading {self.table_name}")
+             print(f"   - Spark version: {spark_version}")
+             print(f"   - Iceberg version: {iceberg_version}")
+
+             # Set Spark packages for Iceberg runtime
+             os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                 f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                 f"pyspark-shell"
+             )
+
+             # Create Spark session with Iceberg REST catalog configuration
+             self.spark = (
+                 SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                 .config("spark.sql.session.timeZone", "UTC")
+                 .config(
+                     "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 # Configure REST catalog
+                 .config(
+                     "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                 )
+                 .config("spark.sql.catalog.rest.type", "rest")
+                 .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                 .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                 # Set REST as default catalog
+                 .config("spark.sql.defaultCatalog", "rest")
+                 # Local mode configuration (within Beam workers)
+                 .config("spark.master", "local[1]")  # Single thread per worker
+                 .config("spark.sql.adaptive.enabled", "true")
+                 # Networking binding
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print(f"✅ Spark session created successfully")
+
+         except Exception as e:
+             print(f"❌ Failed to set up Spark session: {e}")
+             raise
+
+     def teardown(self):
+         """Clean up Spark session (called once per worker)."""
+         if self.spark:
+             try:
+                 self.spark.stop()
+                 print("✅ Spark session stopped")
+             except Exception as e:
+                 print(f"⚠️ Error stopping Spark session: {e}")
+
+     def process(self, element):
+         """
+         Process element (read from Iceberg table using Spark SQL).
+
+         Args:
+             element: Input element (not used, just triggers the read)
+
+         Yields:
+             Records from the Iceberg table
+         """
+         try:
+             if not self.spark:
+                 raise RuntimeError("Spark session not initialized")
+
+             print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+             # Read from Iceberg table using Spark SQL
+             df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+             # Collect all records
+             records = df.collect()
+
+             print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+             # Convert Spark rows to Beam Row objects and yield
+             for row in records:
+                 row_dict = row.asDict()
+                 # Convert to Beam Row for consistency with write mode
+                 beam_row = Row(**row_dict)
+                 yield beam_row
+
+         except Exception as e:
+             print(f"❌ Failed to read from table {self.table_name}: {e}")
+             raise
+
+
+ class SparkSQLIcebergRewrite(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+     This uses Spark's rewrite_data_files procedure to materialize positional deletes
+     by rewriting data files. The result is a "clean" table without positional deletes.
+     """
+
+     def __init__(self, catalog_uri, warehouse_path, table_name):
+         self.catalog_uri = catalog_uri
+         self.warehouse_path = warehouse_path
+         self.table_name = table_name
+
+     def setup(self):
+         """Initialize Spark session for rewrite operations."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+             # Detect Spark version for appropriate Iceberg runtime
+             spark_version = importlib.metadata.version("pyspark")
+             major_minor = ".".join(spark_version.split(".")[:2])
+             print(f"   - Spark version: {major_minor}")
+             print(f"   - Iceberg version: 1.6.0")
+
+             # Configure Spark with Iceberg
+             self.spark = (
+                 SparkSession.builder.appName("IcebergRewrite")
+                 .config(
+                     "spark.jars.packages",
+                     f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 .config(
+                     "spark.sql.catalog.spark_catalog",
+                     "org.apache.iceberg.spark.SparkSessionCatalog",
+                 )
+                 .config("spark.sql.catalog.spark_catalog.type", "rest")
+                 .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                 .config(
+                     "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                 )
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print("✅ Spark session created successfully")
+
+         except ImportError as e:
+             raise RuntimeError(
+                 f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+             ) from e
+         except Exception as e:
+             raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+     def process(self, element):
+         """Rewrite table data files to materialize positional deletes."""
+         try:
+             print(
+                 f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+             )
+
+             # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+             # This forces rewrite even when there's only 1 positional delete file
+             rewrite_sql = f"""
+                 CALL spark_catalog.system.rewrite_data_files(
+                     table => '{self.table_name}',
+                     options => map('delete-file-threshold', '1')
+                 )
+             """
+
+             print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+             print(f"   SQL: {rewrite_sql.strip()}")
+             print(
+                 f"   Rationale: Forces rewrite even with single positional delete file"
+             )
+
+             result = self.spark.sql(rewrite_sql)
+
+             # Collect results to see what was rewritten
+             rewrite_result = result.collect()[0]
+             print(f"📊 Rewrite result: {rewrite_result}")
+
+             # Check if we actually rewrote anything
+             if rewrite_result.rewritten_data_files_count > 0:
+                 print(
+                     f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                 )
+                 print(
+                     f"   - Added {rewrite_result.added_data_files_count} new data files"
+                 )
+                 print(f"   - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                 print(f"   - Positional deletes have been materialized!")
+             else:
+                 print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                 print(f"   - This may indicate no positional deletes exist")
+                 print(f"   - Or the table may already be in optimal state")
+
+             yield f"Rewrite completed for {self.table_name}"
+
+         except Exception as e:
+             print(f"❌ Error during rewrite: {e}")
+             import traceback
+
+             traceback.print_exc()
+             yield f"Rewrite failed for {self.table_name}: {e}"
+
+     def teardown(self):
+         """Clean up Spark session."""
+         if hasattr(self, "spark"):
+             print("✅ Spark session stopped")
+             self.spark.stop()
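
A minimal sketch of how the DoFns above might be wired into a Beam pipeline, assuming a local REST catalog; the table name, catalog URI, and warehouse path are placeholder values, not taken from the package. `SparkSQLIcebergRewrite` can be driven the same way with a single trigger element:

```
import apache_beam as beam

# Placeholder connection details (assumptions for illustration only).
TABLE_NAME = "default.my_table"
CATALOG_URI = "http://localhost:8181"
WAREHOUSE = "warehouse/"

with beam.Pipeline() as pipeline:
    (
        pipeline
        # A single dummy element triggers one read in SparkSQLIcebergRead.process().
        | "Trigger" >> beam.Create([None])
        | "ReadIceberg"
        >> beam.ParDo(
            SparkSQLIcebergRead(
                table_name=TABLE_NAME,
                catalog_uri=CATALOG_URI,
                warehouse=WAREHOUSE,
            )
        )
        | "Print" >> beam.Map(print)
    )
```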
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py
@@ -1,14 +1,16 @@
  import os
  import logging
 
+ import uuid
  import daft
+ from pyiceberg.catalog import CatalogType
+
  import deltacat as dc
 
  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+ from env import store_cli_args_in_os_environ
 
  from pyiceberg.schema import (
      Schema,
@@ -19,7 +21,7 @@ from pyiceberg.schema import (
  from pyiceberg.partitioning import PartitionSpec, PartitionField
  from pyiceberg.transforms import BucketTransform
 
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
  )
@@ -30,6 +32,24 @@ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
 
 
  def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+     """
+     This is an e2e example that
+     1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
+     2. writes data into the DeltaCAT Table
+     3. reads data from the DeltaCAT Table using Daft
+
+     To run the script:
+     1. prepare an AWS Account
+         1. prepare a S3 location where the data will be written to, which will be used in Step 3.
+         2. prepare an IAM Role that has access to the S3 location and Glue
+     2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
+     3. run below command to execute the example
+     ```
+     make venv && source venv/bin/activate
+     python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
+     ```
+
+     """
      # create any runtime environment required to run the example
      runtime_env = create_ray_runtime_environment()
 
@@ -38,6 +58,7 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
      # Only the `iceberg` data catalog is provided so it will become the default.
      # If initializing multiple catalogs, use the `default_catalog_name` param
      # to specify which catalog should be the default.
+
      dc.init(
          catalogs={
              # the name of the DeltaCAT catalog is "iceberg"
@@ -49,11 +70,13 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
                  name="example-iceberg-catalog",
                  # for additional properties see:
                  # https://py.iceberg.apache.org/configuration/
-                 properties={
-                     "type": "glue",
-                     "region_name": "us-east-1",
-                     "warehouse": warehouse,
-                 },
+                 config=IcebergCatalogConfig(
+                     type=CatalogType.GLUE,
+                     properties={
+                         "warehouse": warehouse,
+                         "region_name": "us-east-1",
+                     },
+                 ),
              )
          },
          # pass the runtime environment into ray.init()
@@ -89,10 +112,10 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
          }
      )
 
-     # write to a table named `test_namespace.test_table_bucketed`
+     # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
      # we don't need to specify which catalog to create this table in since
      # only the "iceberg" catalog is available
-     table_name = "test_table_bucketed"
+     table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
      namespace = "test_namespace"
      print(f"Creating Glue Table: {namespace}.{table_name}")
      dc.write_to_table(
@@ -106,9 +129,40 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
      )
 
      print(f"Getting Glue Table: {namespace}.{table_name}")
-     table_definition = dc.get_table(table_name, namespace)
+     table_definition = dc.get_table(name=table_name, namespace=namespace)
      print(f"Retrieved Glue Table: {table_definition}")
 
+     # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
+     daft_dataframe = dc.read_table(table=table_name, namespace=namespace)
+
+     daft_dataframe.where(daft_dataframe["bid"] > 200.0).show()
+     # Expected result:
+     # ╭────────┬─────────┬─────────╮
+     # │ symbol ┆ bid     ┆ ask     │
+     # │ ---    ┆ ---     ┆ ---     │
+     # │ Utf8   ┆ Float64 ┆ Float64 │
+     # ╞════════╪═════════╪═════════╡
+     # │ meta   ┆ 392.03  ┆ 392.09  │
+     # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+     # │ msft   ┆ 403.25  ┆ 403.27  │
+     # ╰────────┴─────────┴─────────╯
+
+     daft_dataframe.select("symbol").show()
+     # Expected result:
+     # ╭────────╮
+     # │ symbol │
+     # │ ---    │
+     # │ Utf8   │
+     # ╞════════╡
+     # │ meta   │
+     # ├╌╌╌╌╌╌╌╌┤
+     # │ amzn   │
+     # ├╌╌╌╌╌╌╌╌┤
+     # │ goog   │
+     # ├╌╌╌╌╌╌╌╌┤
+     # │ msft   │
+     # ╰────────╯
+
 
  if __name__ == "__main__":
      example_script_args = [
@@ -121,15 +175,6 @@ if __name__ == "__main__":
              "type": str,
          },
      ),
-     (
-         [
-             "--STAGE",
-         ],
-         {
-             "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
-             "type": str,
-         },
-     ),
  ]
 
  # store any CLI args in the runtime environment
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py
@@ -4,9 +4,7 @@ import deltacat as dc
 
  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from env import store_cli_args_in_os_environ
 
  from pyiceberg.schema import (
      Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
  from pyiceberg.table.sorting import SortField, SortOrder
 
  from deltacat.exceptions import TableAlreadyExistsError
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
      SortSchemeMapper,
deltacat/examples/hello_world.py
@@ -1,12 +1,10 @@
  import ray
  import deltacat
  import daft
- import pyiceberg
 
 
  def print_package_version_info():
      print(f"DeltaCAT Version: {deltacat.__version__}")
-     print(f"PyIceberg Version: {pyiceberg.__version__}")
      print(f"Ray Version: {ray.__version__}")
      print(f"Daft Version: {daft.__version__}")
 
@@ -24,4 +22,8 @@ def run():
 
 
  if __name__ == "__main__":
+     # initialize deltacat
+     deltacat.init()
+
+     # run the example
      run()
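
The change above mirrors the indexer example that follows: the 2.0.0 examples now call `deltacat.init()` explicitly before doing any work. A minimal sketch of the pattern, using nothing beyond what the two hunks show:

```
import deltacat

# initialize deltacat before running any example logic
deltacat.init()

# ... example logic (e.g., a run() function) goes here ...
```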
deltacat/examples/indexer/indexer.py
@@ -0,0 +1,163 @@
+ import argparse
+
+ from datetime import datetime
+
+ import ray
+
+ import deltacat
+ import daft
+ import pyarrow as pa
+ import pandas as pd
+ import polars as pl
+ import numpy as np
+
+ from deltacat import DeltaCatUrl
+
+
+ def print_package_version_info() -> None:
+     print(f"DeltaCAT Version: {deltacat.__version__}")
+     print(f"Ray Version: {ray.__version__}")
+     print(f"Daft Version: {daft.__version__}")
+     print(f"NumPy Version: {np.__version__}")
+     print(f"PyArrow Version: {pa.__version__}")
+     print(f"Polars Version: {pl.__version__}")
+     print(f"Pandas Version: {pd.__version__}")
+
+
+ def json_path_to_regex(path: str):
+     if not path:
+         raise ValueError("Path cannot be empty")
+     parts = path.split("/")
+     leaf_key = parts.pop()
+     regex = r""
+     for part in parts:
+         if part.strip():  # discard leading and/or redundant separators
+             regex += rf'"{part}"\s*:\s*[{{\[].*?'
+     regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+     return regex
+
+
+ def run(
+     source: str,
+     dest: str,
+ ) -> None:
+     # print package version info
+     print_package_version_info()
+
+     # run a synchronous copy from the source to the destination
+     deltacat.copy(
+         DeltaCatUrl(source),
+         DeltaCatUrl(dest),
+         # reader arguments to pass to the default reader (polars)
+         # for the given text-based datasource, it accepts the same
+         # arguments as polars.read_csv except for `source`, `n_threads`
+         # `new_columns`, `separator`, `has_header`, `quote_char`, and
+         # `infer_schema`.
+         reader_args={
+             "low_memory": True,  # try to use less memory (++stability, --perf)
+             "batch_size": 1024,  # text line count read into a buffer at once
+             "use_pyarrow": True,  # use the native pyarrow reader
+         },
+         # writer arguments to pass to the default writer (polars)
+         # for the given parquet-based datasink, it generally accepts the same
+         # arguments as polars.DataFrame.write_{dest-type} except for `file`
+         writer_args={
+             "compression": "lz4",  # faster compression & decompression
+             # "compression": "zstd",  # better compression ratio
+             # "compression": "snappy",  # compatible w/ older Parquet readers
+         },
+         # Transforms to run against the default polars dataframe read.
+         # By default, each transform takes a polars dataframe `df` as input
+         # and produces a polars dataframe as output. All transforms listed
+         # are run in order (i.e., the dataframe output from transform[0]
+         # is the dataframe input to transform[1]).
+         #
+         # See:
+         # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+         # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+         transforms=[
+             lambda df, src: df.rename(
+                 {"text": "utf8_body"},
+             ),
+             lambda df, src: df.with_columns(
+                 pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                 pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                 pl.lit(src.url_path).alias("source_file_path"),
+             ),
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     """
+     Example 1: Run this script locally using Ray:
+     $ python indexer.py \
+     $     --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+     $     --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+
+     Example 2: Submit this script as a local Ray job using a local job client:
+     >>> from deltacat import local_job_client
+     >>> client = local_job_client()
+     >>> # read the source file as line-delimited text
+     >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+     >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+     >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+     >>> try:
+     >>>     job_run_result = client.run_job(
+     >>>         # Entrypoint shell command to run the indexer job
+     >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+     >>>         # Path to the local directory that contains the indexer.py file
+     >>>         runtime_env={"working_dir": "./deltacat/examples/indexer.py"},
+     >>>     )
+     >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+     >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+     >>>     print(job_run_result.job_logs)
+     >>> except RuntimeError as e:
+     >>>     print(f"Job Run Failed: {e}")
+     >>> except TimeoutError as e:
+     >>>     print(f"Job Run Timed Out: {e}")
+
+     Example 3: Submit this script as a remote Ray job using a remote job client:
+     >>> from deltacat import job_client
+     >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+     >>> # automatically launches the cluster if it doesn't exist or has died
+     >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+     >>> client = job_client()
+     >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+     >>>
+     >>> # OR use an explicit cluster launcher config file path
+     >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+     >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+     """
+     script_args = [
+         (
+             [
+                 "--source",
+             ],
+             {
+                 "help": "Source DeltaCAT URL to index.",
+                 "type": str,
+             },
+         ),
+         (
+             [
+                 "--dest",
+             ],
+             {
+                 "help": "Destination DeltaCAT URL to index.",
+                 "type": str,
+             },
+         ),
+     ]
+     # parse CLI input arguments
+     parser = argparse.ArgumentParser()
+     for args, kwargs in script_args:
+         parser.add_argument(*args, **kwargs)
+     args = parser.parse_args()
+     print(f"Command Line Arguments: {args}")
+
+     # initialize deltacat
+     deltacat.init()
+
+     # run the example using the parsed arguments
+     run(**vars(args))
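
As a side note, the `json_path_to_regex` helper above emits `(?<name>...)`-style named groups. A quick hand-check of its output (the path below is a made-up example; this group syntax suits PCRE-like and recent Rust regex engines, whereas Python's built-in `re` module expects `(?P<name>...)`):

```
pattern = json_path_to_regex("payload/user/name")
# -> "payload"\s*:\s*[{\[].*?"user"\s*:\s*[{\[].*?"name"\s*:\s*"(?<name>.*?)"
```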