deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -9,11 +9,8 @@ from functools import partial
9
9
  import ray
10
10
 
11
11
  from deltacat import logs
12
- from deltacat.annotations import ExperimentalAPI
13
- from deltacat.catalog.main import impl as DeltaCatCatalog
14
- from deltacat.catalog.iceberg import impl as IcebergCatalog
15
- from deltacat.catalog import CatalogProperties
16
- from deltacat.catalog.iceberg import IcebergCatalogConfig
12
+ from deltacat.catalog.main import impl as dcat
13
+ from deltacat.catalog.model.properties import CatalogProperties
17
14
  from deltacat.constants import DEFAULT_CATALOG
18
15
 
19
16
  all_catalogs: Optional[ray.actor.ActorHandle] = None
@@ -22,14 +19,20 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
22
19
 
23
20
 
24
21
  class Catalog:
25
- def __init__(self, impl: ModuleType = DeltaCatCatalog, *args, **kwargs):
22
+ def __init__(
23
+ self,
24
+ config: Optional[Union[CatalogProperties, Any]] = None,
25
+ impl: ModuleType = dcat,
26
+ *args,
27
+ **kwargs,
28
+ ):
26
29
  """
27
30
  Constructor for a Catalog.
28
31
 
29
- Invokes `impl.initialize(*args, **kwargs)` and stores its return value
30
- in the `inner` property, which captures all state required to
31
- deterministically reconstruct this Catalog instance on any node (and
32
- must therefore be pickleable by Ray cloudpickle).
32
+ Invokes `impl.initialize(config, *args, **kwargs)` and stores its
33
+ return value in the `inner` property. This captures all state required
34
+ to deterministically reconstruct this Catalog instance on any node, and
35
+ must be pickleable by Ray cloudpickle.
33
36
  """
34
37
  if not isinstance(self, Catalog):
35
38
  # self may contain the tuple returned from __reduce__ (ray pickle bug?)
@@ -40,32 +43,15 @@ class Catalog:
40
43
  err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
41
44
  raise RuntimeError(err_msg)
42
45
 
46
+ self._config = config
43
47
  self._impl = impl
44
- self._inner = self._impl.initialize(*args, **kwargs)
48
+ self._inner = self._impl.initialize(config=config, *args, **kwargs)
45
49
  self._args = args
46
50
  self._kwargs = kwargs
47
51
 
48
- @classmethod
49
- @ExperimentalAPI
50
- def iceberg(cls, config: IcebergCatalogConfig, *args, **kwargs):
51
- """
52
- !!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
53
-
54
- Factory method to construct a catalog from Iceberg catalog params
55
-
56
- This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
57
- plumbing __params__ through as kwargs
58
- """
59
- return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
60
-
61
- @classmethod
62
- def default(cls, config: CatalogProperties, *args, **kwargs):
63
- """
64
- Factory method to construct a catalog with the default implementation
65
-
66
- Uses CatalogProperties as configuration
67
- """
68
- return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})
52
+ @property
53
+ def config(self):
54
+ return self._config
69
55
 
70
56
  @property
71
57
  def impl(self):
@@ -79,7 +65,11 @@ class Catalog:
79
65
  def __reduce__(self):
80
66
  # instantiated catalogs may fail to pickle, so exclude _inner
81
67
  # (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
82
- return partial(self.__class__, **self._kwargs), (self._impl, *self._args)
68
+ return partial(self.__class__, **self._kwargs), (
69
+ self._config,
70
+ self._impl,
71
+ *self._args,
72
+ )
83
73
 
84
74
  def __str__(self):
85
75
  string_rep = f"{self.__class__.__name__}("
@@ -102,38 +92,62 @@ class Catalogs:
102
92
  catalogs: Union[Catalog, Dict[str, Catalog]],
103
93
  default: Optional[str] = None,
104
94
  ):
95
+ self._catalogs = {}
96
+ self._default_catalog_name = None
97
+ self._default_catalog = None
98
+ self.update(catalogs, default)
99
+
100
+ def all(self) -> Dict[str, Catalog]:
101
+ return self._catalogs
102
+
103
+ def update(
104
+ self,
105
+ catalogs: Union[Catalog, Dict[str, Catalog]],
106
+ default: Optional[str] = None,
107
+ ) -> None:
105
108
  if isinstance(catalogs, Catalog):
106
109
  catalogs = {DEFAULT_CATALOG: catalogs}
107
110
  elif not isinstance(catalogs, dict):
108
111
  raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
109
- self.catalogs: Dict[str, Catalog] = catalogs
112
+ self._catalogs.update(catalogs)
110
113
  if default:
111
114
  if default not in catalogs:
112
115
  raise ValueError(
113
116
  f"Default catalog `{default}` not found in: {catalogs}"
114
117
  )
115
- self.default_catalog = self.catalogs[default]
118
+ self._default_catalog = self._catalogs[default]
119
+ self._default_catalog_name = default
116
120
  elif len(catalogs) == 1:
117
- self.default_catalog = list(self.catalogs.values())[0]
121
+ self._default_catalog = list(self._catalogs.values())[0]
118
122
  else:
119
- self.default_catalog = None
120
-
121
- def all(self) -> Dict[str, Catalog]:
122
- return self.catalogs
123
+ self._default_catalog = None
123
124
 
124
125
  def names(self) -> List[str]:
125
- return list(self.catalogs.keys())
126
+ return list(self._catalogs.keys())
126
127
 
127
128
  def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
128
- self.catalogs[name] = catalog
129
- if set_default:
130
- self.default_catalog = catalog
129
+ self._catalogs[name] = catalog
130
+ if set_default or len(self._catalogs) == 1:
131
+ self._default_catalog = catalog
131
132
 
132
133
  def get(self, name) -> Optional[Catalog]:
133
- return self.catalogs.get(name)
134
+ return self._catalogs.get(name)
135
+
136
+ def pop(self, name) -> Optional[Catalog]:
137
+ catalog = self._catalogs.pop(name, None)
138
+ if catalog and self._default_catalog_name == name:
139
+ if len(self._catalogs) == 1:
140
+ self._default_catalog = list(self._catalogs.values())[0]
141
+ else:
142
+ self._default_catalog = None
143
+ return catalog
144
+
145
+ def clear(self) -> None:
146
+ self._catalogs.clear()
147
+ self._default_catalog = None
134
148
 
135
149
  def default(self) -> Optional[Catalog]:
136
- return self.default_catalog
150
+ return self._default_catalog
137
151
 
138
152
 
139
153
  def is_initialized(*args, **kwargs) -> bool:
@@ -142,12 +156,9 @@ def is_initialized(*args, **kwargs) -> bool:
142
156
  """
143
157
  global all_catalogs
144
158
 
145
- # If ray is not initialized, then Catalogs cannot be initialized
146
159
  if not ray.is_initialized():
147
- # Any existing actor reference stored in catalog_module must be stale - reset it
160
+ # Any existing Catalogs actor reference must be stale - reset it
148
161
  all_catalogs = None
149
- return False
150
-
151
162
  return all_catalogs is not None
152
163
 
153
164
 
@@ -168,10 +179,10 @@ def raise_if_not_initialized(
168
179
  def init(
169
180
  catalogs: Union[Dict[str, Catalog], Catalog] = {},
170
181
  default: Optional[str] = None,
171
- ray_init_args: Dict[str, Any] = None,
182
+ ray_init_args: Dict[str, Any] = {},
172
183
  *,
173
- force_reinitialize=False,
174
- ) -> None:
184
+ force=False,
185
+ ) -> Optional[ray.runtime.BaseContext]:
175
186
  """
176
187
  Initialize DeltaCAT catalogs.
177
188
 
@@ -180,18 +191,20 @@ def init(
180
191
  :param default: The name of the default Catalog. If only one Catalog is
181
192
  provided, it will always be the default.
182
193
  :param ray_init_args: Keyword arguments to pass to `ray.init()`.
183
- :param force_reinitialize: Whether to force Ray reinitialization.
194
+ :param force: Whether to force DeltaCAT reinitialization. If True, reruns
195
+ ray.init(**ray_init_args) and overwrites all previously registered
196
+ catalogs.
197
+ :returns: The Ray context object if Ray was initialized, otherwise None.
184
198
  """
185
199
  global all_catalogs
186
200
 
187
- if is_initialized() and not force_reinitialize:
201
+ if is_initialized() and not force:
188
202
  logger.warning("DeltaCAT already initialized.")
189
- return
190
- else:
191
- if ray_init_args:
192
- ray.init(**ray_init_args)
193
- else:
194
- ray.init()
203
+ return None
204
+
205
+ # initialize ray (and ignore reinitialization errors)
206
+ ray_init_args["ignore_reinit_error"] = True
207
+ context = ray.init(**ray_init_args)
195
208
 
196
209
  # register custom serializer for catalogs since these may contain
197
210
  # unserializable objects like boto3 clients with SSLContext
@@ -201,9 +214,42 @@ def init(
201
214
  # TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
202
215
  # with all catalogs from the last session
203
216
  all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
217
+ return context
204
218
 
205
219
 
206
- def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
220
+ def init_local(
221
+ path: Optional[str] = None,
222
+ ray_init_args: Dict[str, Any] = {},
223
+ *,
224
+ force=False,
225
+ ) -> Optional[ray.runtime.BaseContext]:
226
+ """
227
+ Initialize DeltaCAT with a default local catalog.
228
+
229
+ This is a convenience function that creates a default catalog for local usage.
230
+ Equivalent to calling init(catalogs={"default": Catalog()}).
231
+
232
+ :param path: Optional path for catalog root directory. If not provided, uses
233
+ the default behavior of CatalogProperties (DELTACAT_ROOT env var or
234
+ "./.deltacat/").
235
+ :param ray_init_args: Keyword arguments to pass to `ray.init()`.
236
+ :param force: Whether to force DeltaCAT reinitialization. If True, reruns
237
+ ray.init(**ray_init_args) and overwrites all previously registered
238
+ catalogs.
239
+ :returns: The Ray context object if Ray was initialized, otherwise None.
240
+ """
241
+ from deltacat.catalog.model.properties import CatalogProperties
242
+
243
+ config = CatalogProperties(root=path) if path is not None else None
244
+ return init(
245
+ catalogs={"default": Catalog(config=config)},
246
+ default="default",
247
+ ray_init_args=ray_init_args,
248
+ force=force,
249
+ )
250
+
251
+
252
+ def get_catalog(name: Optional[str] = None) -> Catalog:
207
253
  """
208
254
  Get a catalog by name, or the default catalog if no name is provided.
209
255
 
@@ -232,7 +278,7 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
232
278
  else:
233
279
  catalog = ray.get(all_catalogs.default.remote())
234
280
  if not catalog:
235
- available_catalogs = ray.get(all_catalogs.all.remote()).values()
281
+ available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
236
282
  raise ValueError(
237
283
  f"Call to get_catalog without name set failed because there "
238
284
  f"is no default Catalog set. Available catalogs: "
@@ -241,17 +287,44 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
241
287
  return catalog
242
288
 
243
289
 
290
+ def clear_catalogs() -> None:
291
+ """
292
+ Clear all catalogs from the global map of named catalogs.
293
+ """
294
+ if all_catalogs:
295
+ ray.get(all_catalogs.clear.remote())
296
+
297
+
298
+ def pop_catalog(name: str) -> Optional[Catalog]:
299
+ """
300
+ Remove a named catalog from the global map of named catalogs.
301
+
302
+ Args:
303
+ name: Name of the catalog to remove.
304
+
305
+ Returns:
306
+ The removed catalog, or None if not found.
307
+ """
308
+ global all_catalogs
309
+
310
+ if not all_catalogs:
311
+ return None
312
+ catalog = ray.get(all_catalogs.pop.remote(name))
313
+ return catalog
314
+
315
+
244
316
  def put_catalog(
245
317
  name: str,
246
318
  catalog: Catalog = None,
247
319
  *,
248
320
  default: bool = False,
249
- ray_init_args: Dict[str, Any] = None,
321
+ ray_init_args: Dict[str, Any] = {},
250
322
  fail_if_exists: bool = False,
251
323
  **kwargs,
252
324
  ) -> Catalog:
253
325
  """
254
- Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.
326
+ Add a named catalog to the global map of named catalogs. Initializes
327
+ DeltaCAT if not already initialized.
255
328
 
256
329
  Args:
257
330
  name: Name of the catalog.
@@ -261,8 +334,8 @@ def put_catalog(
261
334
  default: Make this the default catalog if multiple catalogs are
262
335
  available. If only one catalog is available, it will always be the
263
336
  default.
264
- ray_init_args: Ray initialization args (used only if ray not already
265
- initialized)
337
+ ray_init_args: Ray initialization args (used only if ray is not already
338
+ initialized).
266
339
  fail_if_exists: if True, raises an error if a catalog with the given
267
340
  name already exists. If False, inserts or replaces the given
268
341
  catalog name.
@@ -276,6 +349,8 @@ def put_catalog(
276
349
 
277
350
  if not catalog:
278
351
  catalog = Catalog(**kwargs)
352
+ if name is None:
353
+ raise ValueError("Catalog name cannot be None")
279
354
 
280
355
  # Initialize, if necessary
281
356
  if not is_initialized():
@@ -283,25 +358,27 @@ def put_catalog(
283
358
  if not default:
284
359
  logger.info(
285
360
  f"Calling put_catalog with set_as_default=False, "
286
- f"but still setting Catalog {catalog} as default since it is the only catalog."
361
+ f"but still setting Catalog {catalog} as default since it is "
362
+ f"the only catalog."
287
363
  )
288
364
  init({name: catalog}, ray_init_args=ray_init_args)
289
- return
365
+ return catalog
290
366
 
291
367
  # Fail if fail_if_exists and catalog already exists
292
368
  if fail_if_exists:
293
- catalog_already_exists = False
294
369
  try:
295
370
  get_catalog(name)
296
- # Note - need to set state catalog_already_exists and throw ValueError later, or else it will be
297
- # caught in the except block which is meant to catch the ValueError from get_catalog
298
- catalog_already_exists = True
299
- except ValueError:
300
- pass
301
- if catalog_already_exists:
371
+ # If we get here, catalog exists - raise error
302
372
  raise ValueError(
303
- f"Failed to put catalog {name} because it already exists and fail_if_exists={fail_if_exists}"
373
+ f"Failed to put catalog {name} because it already exists and "
374
+ f"fail_if_exists={fail_if_exists}"
304
375
  )
376
+ except ValueError as e:
377
+ if "not found" not in str(e):
378
+ # Re-raise if it's not a "catalog not found" error
379
+ raise
380
+ # If catalog doesn't exist, continue normally
381
+ pass
305
382
 
306
383
  # Add the catalog (which may overwrite existing if fail_if_exists=False)
307
384
  ray.get(all_catalogs.put.remote(name, catalog, default))
@@ -1,5 +1,9 @@
1
1
  from __future__ import annotations
2
+
2
3
  from typing import Optional, Any
4
+ import urllib.parse
5
+
6
+ import os
3
7
 
4
8
  import pyarrow
5
9
  from deltacat.constants import DELTACAT_ROOT
@@ -8,18 +12,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
8
12
 
9
13
 
10
14
  def get_catalog_properties(
11
- *args,
15
+ *,
12
16
  catalog: Optional[CatalogProperties] = None,
13
17
  inner: Optional[CatalogProperties] = None,
14
18
  **kwargs,
15
19
  ) -> CatalogProperties:
16
20
  """
17
- Helper function to fetch CatalogProperties instance. You are meant to call this by providing your functions
18
- kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
21
+ Helper function to fetch CatalogProperties instance.
19
22
 
20
- This will look for a CatalogProperty value in the kwargs "catalog" or "inner". If these are found, it returns
21
- the CatalogProperty value under that kwarg. Otherwise, it will pass through kwargs to the CatalogProperties
22
- constructor.
23
+ This will first look for CatalogProperties in either "catalog"
24
+ or "inner" and otherwise passes all keyword arguments to the
25
+ CatalogProperties constructor.
23
26
  """
24
27
  properties = catalog if catalog is not None else inner
25
28
  if properties is not None and isinstance(properties, CatalogProperties):
@@ -39,21 +42,22 @@ class CatalogProperties:
39
42
  DeltaCAT catalog instance. Properties are set from system environment
40
43
  variables unless explicit overrides are provided during initialization.
41
44
 
42
- Catalog and storage APIs rely on the property catalog to retrieve durable state about the catalog they're
43
- working against.
45
+ Catalog and storage APIs rely on the property catalog to retrieve durable
46
+ state about the catalog they're working against.
44
47
 
45
48
  Attributes:
46
- root (str): URI string The root path where catalog metadata and data
47
- files are stored. Root is determined (in prededence order) by:
48
- 1. check "root" input argument
49
- 2. check env variable "DELTACAT_ROOT"
50
- 3. default to ${cwd}/.deltacat
49
+ root: The root path for catalog metadata and data storage. Resolved by
50
+ searching for the root path in the following order:
51
+ 1. "root" constructor input argument
52
+ 2. "DELTACAT_ROOT" system environment variable
53
+ 3. default to "./.deltacat/"
51
54
 
52
55
  filesystem: The filesystem implementation that should be used for
53
56
  reading/writing files. If None, a filesystem will be inferred from
54
57
  the catalog root path.
55
58
 
56
- storage: Storage class implementation (overrides default filesystem storage impl)
59
+ storage: Storage class implementation (overrides default filesystem
60
+ storage impl)
57
61
  """
58
62
 
59
63
  def __init__(
@@ -66,21 +70,26 @@ class CatalogProperties:
66
70
  Initialize a CatalogProperties instance.
67
71
 
68
72
  Args:
69
- root: A single directory path that serves as the catalog root dir.
73
+ root: Catalog root directory path. Uses the "DELTACAT_ROOT"
74
+ system environment variable if not set, and defaults to
75
+ "./.deltacat/" if this environment variable is not set.
70
76
  filesystem: The filesystem implementation that should be used for
71
77
  reading these files. If None, a filesystem will be inferred.
72
- If not None, the provided filesystem will still be validated
73
- against the provided path to ensure compatibility.
78
+ If provided, this will be validated for compatibility with the
79
+ catalog root path.
80
+ storage: DeltaCAT storage implementation override.
74
81
  """
75
82
  # set root, using precedence rules described in pydoc
76
83
  if root is None:
77
84
  # Check environment variables
78
- # This is set or defaulted in constants.py
79
85
  root = DELTACAT_ROOT
80
- if root is None:
81
- raise ValueError(
82
- "Expected environment variable DELTACAT_ROOT to be set or defaulted"
83
- )
86
+ if not root:
87
+ # Default to "./.deltacat/"
88
+ root = os.path.join(os.getcwd(), ".deltacat")
89
+
90
+ # Store the original root with its scheme for reconstruction later
91
+ self._original_root = root
92
+ self._original_scheme = urllib.parse.urlparse(root).scheme
84
93
 
85
94
  resolved_root, resolved_filesystem = resolve_path_and_filesystem(
86
95
  path=root,
@@ -105,6 +114,38 @@ class CatalogProperties:
105
114
  """
106
115
  return self._storage
107
116
 
117
def reconstruct_full_path(self, path: str) -> str:
    """
    Rebuild a path so that it carries the catalog root's original URI scheme.

    This addresses GitHub issue #567: some external file readers (e.g.,
    Daft) require cloud storage URIs to include their scheme prefix
    (e.g., s3://) regardless of the filesystem used to read the file.

    Args:
        path: A path relative to the catalog root, or an absolute path.

    Returns:
        ``path`` unchanged when it already has a scheme or when the catalog
        root had none (local filesystem); otherwise ``path`` prefixed with
        the original scheme.
    """
    # Nothing to rebuild when the path is already a URI, or when the catalog
    # root was given without a scheme.
    if urllib.parse.urlparse(path).scheme or not self._original_scheme:
        return path

    # A leading "/" on the path already supplies one of the two slashes that
    # follow the colon, so only ":/" is inserted in that case.
    separator = ":/" if path.startswith("/") else "://"
    return f"{self._original_scheme}{separator}{path}"
148
+
108
149
  def __str__(self):
109
150
  return (
110
151
  f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"