deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,47 +1,129 @@
1
1
  # Allow self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Any, Dict, List, Optional
4
+ import logging
5
+ from types import ModuleType
5
6
 
7
+ from typing import Any, Dict, List, Optional, Union
8
+ from functools import partial
6
9
  import ray
7
10
 
8
- from deltacat.catalog import interface as catalog_interface
11
+ from deltacat import logs
12
+ from deltacat.annotations import ExperimentalAPI
13
+ from deltacat.catalog.main import impl as DeltacatCatalog
14
+ from deltacat.catalog.iceberg import impl as IcebergCatalog
15
+ from deltacat.catalog import CatalogProperties
16
+ from deltacat.catalog.iceberg import IcebergCatalogConfig
17
+ from deltacat.constants import DEFAULT_CATALOG
9
18
 
10
- all_catalogs: Optional[Catalogs] = None
19
+ all_catalogs: Optional[ray.actor.ActorHandle] = None
20
+
21
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
11
22
 
12
23
 
13
24
  class Catalog:
14
- def __init__(self, impl=catalog_interface, *args, **kwargs):
25
+ def __init__(self, impl: ModuleType = DeltacatCatalog, *args, **kwargs):
26
+ """
27
+ Constructor for a Catalog.
28
+
29
+ The args and kwargs here will be plumbed through to the catalog initialize function, and the results
30
+ are stored in Catalog.inner. Any state which is required (like: metastore root URI, pyiceberg native catalog)
31
+ MUST be returned by initialize.
32
+
33
+ Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
34
+ Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
35
+ See `ray.util.register_serializer` in Catalogs class.
36
+ """
37
+ if not isinstance(self, Catalog):
38
+ # self may contain the tuple returned from __reduce__ (ray pickle bug?)
39
+ if callable(self[0]) and isinstance(self[1], tuple):
40
+ logger.info(f"Invoking {self[0]} with positional args: {self[1]}")
41
+ return self[0](*self[1])
42
+ else:
43
+ err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
44
+ raise RuntimeError(err_msg)
45
+
15
46
  self._impl = impl
16
- self._impl.initialize(*args, **kwargs)
47
+ self._inner = self._impl.initialize(*args, **kwargs)
48
+ self._args = args
49
+ self._kwargs = kwargs
50
+
51
+ @classmethod
52
+ @ExperimentalAPI
53
+ def iceberg(cls, config: IcebergCatalogConfig, *args, **kwargs):
54
+ """
55
+ !!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
56
+
57
+ Factory method to construct a catalog from Iceberg catalog params
58
+
59
+ This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
60
+ plumbing __params__ through as kwargs
61
+ """
62
+ return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
63
+
64
+ @classmethod
65
+ def default(cls, config: CatalogProperties, *args, **kwargs):
66
+ """
67
+ Factory method to construct a catalog with the default implementation
68
+
69
+ Uses CatalogProperties as configuration
70
+ """
71
+ return cls(impl=DeltacatCatalog, *args, **{"config": config, **kwargs})
17
72
 
18
73
  @property
19
74
  def impl(self):
20
75
  return self._impl
21
76
 
77
+ @property
78
+ def inner(self) -> Optional[Any]:
79
+ return self._inner
80
+
81
+ # support pickle, copy, deepcopy, etc.
82
+ def __reduce__(self):
83
+ # instantiated catalogs may fail to pickle, so exclude _inner
84
+ # (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
85
+ return partial(self.__class__, **self._kwargs), (self._impl, *self._args)
86
+
87
+ def __str__(self):
88
+ string_rep = f"{self.__class__.__name__}("
89
+ if self._args:
90
+ string_rep += f"args={self._args}, "
91
+ if self._kwargs:
92
+ string_rep += f"kwargs={self._kwargs}, "
93
+ if self._inner:
94
+ string_rep += f"inner={self._inner})"
95
+ return string_rep
96
+
97
+ def __repr__(self):
98
+ return self.__str__()
99
+
22
100
 
23
101
  @ray.remote
24
102
  class Catalogs:
25
103
  def __init__(
26
104
  self,
27
- catalogs: Dict[str, Catalog],
28
- default_catalog_name: str = None,
105
+ catalogs: Union[Catalog, Dict[str, Catalog]],
106
+ default: Optional[str] = None,
29
107
  *args,
30
108
  **kwargs,
31
109
  ):
32
- if default_catalog_name and default_catalog_name not in catalogs:
110
+ if default and default not in catalogs:
33
111
  raise ValueError(
34
- f"Catalog {default_catalog_name} not found "
35
- f"in catalogs to register: {catalogs}"
112
+ f"Catalog {default} not found " f"in catalogs to register: {catalogs}"
36
113
  )
37
114
  if not catalogs:
38
115
  raise ValueError(
39
116
  f"No catalogs given to register. "
40
117
  f"Please specify one or more catalogs."
41
118
  )
119
+
120
+ # if user only provides single Catalog, override it to be a map with default key
121
+ if isinstance(catalogs, Catalog):
122
+ catalogs = {DEFAULT_CATALOG: catalogs}
123
+
42
124
  self.catalogs: Dict[str, Catalog] = catalogs
43
- if default_catalog_name:
44
- self.default_catalog = self.catalogs[default_catalog_name]
125
+ if default:
126
+ self.default_catalog = self.catalogs[default]
45
127
  elif len(catalogs) == 1:
46
128
  self.default_catalog = list(self.catalogs.values())[0]
47
129
  else:
@@ -53,8 +135,10 @@ class Catalogs:
53
135
  def names(self) -> List[str]:
54
136
  return list(self.catalogs.keys())
55
137
 
56
- def put(self, name: str, catalog: Catalog) -> None:
138
+ def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
57
139
  self.catalogs[name] = catalog
140
+ if set_default:
141
+ self.default_catalog = catalog
58
142
 
59
143
  def get(self, name) -> Catalog:
60
144
  return self.catalogs.get(name)
@@ -63,21 +147,144 @@ class Catalogs:
63
147
  return self.default_catalog
64
148
 
65
149
 
150
+ def is_initialized(*args, **kwargs) -> bool:
151
+ """
152
+ Check if DeltaCAT is initialized
153
+ """
154
+ global all_catalogs
155
+
156
+ # If ray is not initialized, then Catalogs cannot be initialized
157
+ if not ray.is_initialized():
158
+ # Any existing actor reference stored in catalog_module must be stale - reset it
159
+ all_catalogs = None
160
+ return False
161
+
162
+ return all_catalogs is not None
163
+
164
+
66
165
  def init(
67
- catalogs: Dict[str, Catalog],
68
- default_catalog_name: str = None,
166
+ catalogs: Union[Dict[str, Catalog], Catalog],
167
+ default: Optional[str] = None,
69
168
  ray_init_args: Dict[str, Any] = None,
70
169
  *args,
170
+ force_reinitialize=False,
71
171
  **kwargs,
72
172
  ) -> None:
173
+ """
174
+ Initialize DeltaCAT catalogs.
73
175
 
74
- if not ray.is_initialized():
176
+ :param catalogs: Either a single Catalog instance or a map of string to Catalog instance
177
+ :param default: The Catalog to use by default. If only one Catalog is provided, it will
178
+ be set as the default
179
+ :param ray_init_args: kwargs to pass to ray initialization
180
+ :param force_reinitialize: if True, force the reinitialization of Ray. If false, will do nothing if ray already initialized
181
+ """
182
+ global all_catalogs
183
+
184
+ if is_initialized() and not force_reinitialize:
185
+ logger.warning("DeltaCAT already initialized.")
186
+ return
187
+ else:
75
188
  if ray_init_args:
76
189
  ray.init(**ray_init_args)
77
190
  else:
78
- ray.init(address="auto")
191
+ ray.init()
79
192
 
80
- global all_catalogs
81
- all_catalogs = Catalogs.remote(
82
- catalogs=catalogs, default_catalog_name=default_catalog_name
193
+ # register custom serializer for catalogs since these may contain
194
+ # unserializable objects like boto3 clients with SSLContext
195
+ ray.util.register_serializer(
196
+ Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
83
197
  )
198
+ all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
199
+
200
+
201
+ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
202
+ """
203
+ Get a catalog by name, or the default catalog if no name is provided.
204
+
205
+ Args:
206
+ name: Name of catalog to retrieve (optional, uses default if not provided)
207
+
208
+ Returns:
209
+ The requested Catalog. Raises ValueError if it does not exist
210
+ """
211
+ global all_catalogs
212
+
213
+ if not all_catalogs:
214
+ raise ValueError(
215
+ "No catalogs available! Call "
216
+ "`deltacat.init(catalogs={...})` to register one or more "
217
+ "catalogs then retry."
218
+ )
219
+
220
+ if name is not None:
221
+ catalog = ray.get(all_catalogs.get.remote(name))
222
+ if not catalog:
223
+ available_catalogs = ray.get(all_catalogs.all.remote()).values()
224
+ raise ValueError(
225
+ f"Catalog '{name}' not found. Available catalogs: "
226
+ f"{available_catalogs}."
227
+ )
228
+ return catalog
229
+
230
+ else:
231
+ catalog = ray.get(all_catalogs.default.remote())
232
+ if not catalog:
233
+ available_catalogs = ray.get(all_catalogs.all.remote()).values()
234
+ raise ValueError(
235
+ f"Call to get_catalog without name set failed because there is no default Catalog set. Available catalogs: "
236
+ f"{available_catalogs}."
237
+ )
238
+ return catalog
239
+
240
+
241
+ def put_catalog(
242
+ name: str,
243
+ catalog: Catalog = None,
244
+ *,
245
+ default: bool = False,
246
+ ray_init_args: Dict[str, Any] = None,
247
+ fail_if_exists: bool = False,
248
+ **kwargs,
249
+ ) -> None:
250
+ """
251
+ Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.
252
+
253
+ Args:
254
+ name: name of catalog
255
+ catalog: catalog instance to use, if provided
256
+ default: Make this the default catalog if multiple catalogs are available.
257
+ ignored if this is the only catalog available, since it will always be the default catalog.
258
+ ray_init_args: ray initialization args (used only if ray not already initialized)
259
+ fail_if_exists: if True, raises KeyError if the catalog name already exists. Otherwise, overwrite catalog
260
+ """
261
+ global all_catalogs
262
+
263
+ # Initialize, if necessary
264
+ if not is_initialized():
265
+ # NOTE - since we are initializing with a single catalog, it will be set to the default
266
+ if not default:
267
+ logger.info(
268
+ f"Calling put_catalog with set_as_default=False, "
269
+ f"but still setting Catalog {catalog} as default since it is the only catalog."
270
+ )
271
+ init({name: catalog}, ray_init_args=ray_init_args)
272
+ return
273
+
274
+ # Fail if fail_if_exists and catalog already exists
275
+ if fail_if_exists:
276
+ catalog_already_exists = False
277
+ try:
278
+ get_catalog(name)
279
+ # Note - need to set state catalog_already_exists and throw ValueError later, or else it will be
280
+ # caught in the except block which is meant to catch the ValueError from get_catalog
281
+ catalog_already_exists = True
282
+ except ValueError:
283
+ pass
284
+ if catalog_already_exists:
285
+ raise ValueError(
286
+ f"Failed to put catalog {name} because it already exists and fail_if_exists={fail_if_exists}"
287
+ )
288
+
289
+ # Add the catalog (which may overwrite existing if fail_if_exists=False)
290
+ ray.get(all_catalogs.put.remote(name, catalog, default))
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, Any
3
+
4
+ import pyarrow
5
+ from deltacat.constants import DELTACAT_ROOT
6
+
7
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
8
+
9
+
10
+ def get_catalog_properties(
11
+ *args,
12
+ catalog: Optional[CatalogProperties] = None,
13
+ inner: Optional[CatalogProperties] = None,
14
+ **kwargs,
15
+ ) -> CatalogProperties:
16
+ """
17
+ Helper function to fetch CatalogProperties instance. You are meant to call this by providing your function's
18
+ kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
19
+
20
+ This will look for a CatalogProperty value in the kwargs "catalog" or "inner". If these are found, it returns
21
+ the CatalogProperty value under that kwarg. Otherwise, it will pass through kwargs to the CatalogProperties
22
+ constructor.
23
+ """
24
+ properties = catalog if catalog is not None else inner
25
+ if properties is not None and isinstance(properties, CatalogProperties):
26
+ return properties
27
+ elif properties is not None and not isinstance(properties, CatalogProperties):
28
+ raise ValueError(
29
+ f"Expected catalog properties of type {CatalogProperties.__name__} "
30
+ f"but found {type(properties)}."
31
+ )
32
+ else:
33
+ return CatalogProperties(**kwargs)
34
+
35
+
36
+ class CatalogProperties:
37
+ """
38
+ DeltaCAT catalog properties used to deterministically resolve a durable
39
+ DeltaCAT catalog instance. Properties are set from system environment
40
+ variables unless explicit overrides are provided during initialization.
41
+
42
+ Catalog and storage APIs rely on the property catalog to retrieve durable state about the catalog they're
43
+ working against.
44
+
45
+ Attributes:
46
+ root (str): URI string. The root path where catalog metadata and data
47
+ files are stored. Root is determined (in precedence order) by:
48
+ 1. check kwargs for "root"
49
+ 2. check env variable "DELTACAT_ROOT"
50
+ 3. default to ${cwd}/.deltacat
51
+
52
+ filesystem: The filesystem implementation that should be used for
53
+ reading/writing files. If None, a filesystem will be inferred from
54
+ the catalog root path.
55
+
56
+ storage: Storage class implementation (overrides default filesystem storage impl)
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ root: Optional[str] = None,
62
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
63
+ storage=None,
64
+ *args,
65
+ **kwargs,
66
+ ):
67
+ """
68
+ Initialize a CatalogProperties instance.
69
+
70
+ Args:
71
+ root: A single directory path that serves as the catalog root dir.
72
+ filesystem: The filesystem implementation that should be used for
73
+ reading these files. If None, a filesystem will be inferred.
74
+ If not None, the provided filesystem will still be validated
75
+ against the provided path to ensure compatibility.
76
+ """
77
+ # set root, using precedence rules described in pydoc
78
+ if root is None:
79
+ # Check environment variables
80
+ # This is set or defaulted in constants.py
81
+ root = DELTACAT_ROOT
82
+ if root is None:
83
+ raise ValueError(
84
+ "Expected environment variable DELTACAT_ROOT to be set or defaulted"
85
+ )
86
+
87
+ resolved_root, resolved_filesystem = resolve_path_and_filesystem(
88
+ path=root,
89
+ filesystem=filesystem,
90
+ )
91
+ self._root = resolved_root
92
+ self._filesystem = resolved_filesystem
93
+ self._storage = storage
94
+
95
+ @property
96
+ def root(self) -> str:
97
+ return self._root
98
+
99
+ @property
100
+ def filesystem(self) -> Optional[pyarrow.fs.FileSystem]:
101
+ return self._filesystem
102
+
103
+ @property
104
+ def storage(self) -> Optional[Any]:
105
+ """
106
+ Return overridden storage impl, if any
107
+ """
108
+ return self._storage
109
+
110
+ def __str__(self):
111
+ return (
112
+ f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
113
+ )
114
+
115
+ def __repr__(self):
116
+ return self.__str__()
@@ -1,19 +1,30 @@
1
1
  # Allow self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
+ from typing import Optional, Any
5
+
4
6
  from deltacat.storage import Stream, Table, TableVersion
7
+ from deltacat.storage.model.scan.push_down import Pushdown
8
+ from deltacat.storage.model.scan.scan_plan import ScanPlan
9
+ from deltacat.storage.util.scan_planner import ScanPlanner
5
10
 
6
11
 
7
12
  class TableDefinition(dict):
8
13
  @staticmethod
9
14
  def of(
10
- table: Table, table_version: TableVersion, stream: Stream
15
+ table: Table,
16
+ table_version: TableVersion,
17
+ stream: Stream,
18
+ native_object: Optional[Any] = None,
19
+ scan_planner: Optional[ScanPlanner] = None,
11
20
  ) -> TableDefinition:
12
21
  return TableDefinition(
13
22
  {
14
23
  "table": table,
15
24
  "tableVersion": table_version,
16
25
  "stream": stream,
26
+ "nativeObject": native_object,
27
+ "scan_planner": scan_planner,
17
28
  }
18
29
  )
19
30
 
@@ -28,3 +39,23 @@ class TableDefinition(dict):
28
39
  @property
29
40
  def stream(self) -> Stream:
30
41
  return self["stream"]
42
+
43
+ @property
44
+ def native_object(self) -> Optional[Any]:
45
+ return self.get("nativeObject")
46
+
47
+ @property
48
+ def scan_planner(self) -> Optional[ScanPlanner]:
49
+ return self.get("scan_planner")
50
+
51
+ def create_scan_plan(self, pushdown: Optional[Pushdown] = None) -> ScanPlan:
52
+ if not self.scan_planner:
53
+ raise RuntimeError(
54
+ f"ScanPlanner is not initialized for table '{self.table.table_name}' "
55
+ f"of namespace '{self.table.namespace}'"
56
+ )
57
+ return self.scan_planner.create_scan_plan(
58
+ table_name=self.table.table_name,
59
+ namespace=self.table.namespace,
60
+ pushdown=pushdown,
61
+ )
@@ -8,6 +8,7 @@ from typing import List, Union
8
8
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
9
9
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
10
10
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
11
+ from deltacat.compute.compactor_v2.model.merge_result import MergeResult
11
12
  from deltacat.utils.performance import timed_invocation
12
13
  from deltacat.utils.resources import ClusterUtilization, get_size_of_object_in_bytes
13
14
  from deltacat.compute.compactor import PyArrowWriteResult
@@ -670,13 +671,13 @@ class CompactionSessionAuditInfo(dict):
670
671
  self, output_size_bytes: float
671
672
  ) -> CompactionSessionAuditInfo:
672
673
  self["outputSizeBytes"] = output_size_bytes
673
- return output_size_bytes
674
+ return self
674
675
 
675
676
  def set_output_size_pyarrow_bytes(
676
677
  self, output_size_pyarrow_bytes: float
677
678
  ) -> CompactionSessionAuditInfo:
678
679
  self["outputSizePyarrowBytes"] = output_size_pyarrow_bytes
679
- return output_size_pyarrow_bytes
680
+ return self
680
681
 
681
682
  def set_total_cluster_memory_bytes(
682
683
  self, total_cluster_memory_bytes: float
@@ -787,7 +788,10 @@ class CompactionSessionAuditInfo(dict):
787
788
  self,
788
789
  step_name: str,
789
790
  task_results: Union[
790
- List[HashBucketResult], List[DedupeResult], List[MaterializeResult]
791
+ List[HashBucketResult],
792
+ List[DedupeResult],
793
+ List[MaterializeResult],
794
+ List[MergeResult],
791
795
  ],
792
796
  task_results_retrieved_at: float,
793
797
  invoke_time_in_seconds: float,
@@ -11,10 +11,10 @@ from deltacat import logs
11
11
  from deltacat.storage import (
12
12
  Delta,
13
13
  DeltaType,
14
- Manifest,
15
14
  ManifestEntry,
16
15
  ManifestEntryList,
17
16
  )
17
+ from deltacat.storage.model.manifest import Manifest
18
18
 
19
19
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
20
 
@@ -107,7 +107,7 @@ class DeltaAnnotated(Delta):
107
107
  assert len(src_da_annotations) == len(src_da_entries), (
108
108
  f"Unexpected Error: Length of delta annotations "
109
109
  f"({len(src_da_annotations)}) doesn't mach the length of "
110
- f"delta manifest entries ({len(src_da_entries)}).",
110
+ f"delta manifest entries ({len(src_da_entries)})."
111
111
  )
112
112
  for i, src_entry in enumerate(src_da_entries):
113
113
  # create a new da group if src and dest has different delta locator
@@ -161,7 +161,7 @@ class DeltaAnnotated(Delta):
161
161
  assert len(src_da_annotations) == len(src_da_entries), (
162
162
  f"Unexpected Error: Length of delta annotations "
163
163
  f"({len(src_da_annotations)}) doesn't mach the length of "
164
- f"delta manifest entries ({len(src_da_entries)}).",
164
+ f"delta manifest entries ({len(src_da_entries)})."
165
165
  )
166
166
  src_da_entries_length = len(src_da_entries)
167
167
  equal_length = src_da_entries_length // pieces
@@ -37,7 +37,9 @@ class DeltaFileEnvelope(dict):
37
37
  pointing to a file from the uncompacted source table, False if
38
38
  this Locator is pointing to a file in the compacted destination
39
39
  table.
40
- table_storage_strategy: The way the table object is stored in the delta file envelope. If None just stores the table normally
40
+ file_record_count: Record count in the delta file table.
41
+ table_storage_strategy: The way the table object is stored in the
42
+ delta file envelope. If None just stores the table normally
41
43
  Returns:
42
44
  A delta file envelope.
43
45
 
@@ -31,9 +31,11 @@ class DeltaFileLocator(Locator, tuple):
31
31
 
32
32
  file_index: Index of the file in the Delta Manifest.
33
33
 
34
+ file_record_count: Count of records in the Delta File.
35
+
34
36
  Returns:
35
37
  delta_file_locator: The Delta File Locator Tuple as
36
- (is_source_delta, stream_position, file_index).
38
+ (is_src_delta, stream_position, file_index, file_record_count).
37
39
  """
38
40
  return DeltaFileLocator(
39
41
  (is_src_delta, stream_position, file_index, file_record_count)
@@ -1,7 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Tuple
4
+ from typing import List, Tuple, Union
5
5
  from deltacat.storage import DeltaLocator, PartitionLocator
6
6
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
7
7
  from typing import Any, Dict, Optional
@@ -34,7 +34,7 @@ class RoundCompletionInfo(dict):
34
34
 
35
35
  @staticmethod
36
36
  def of(
37
- high_watermark: HighWatermark,
37
+ high_watermark: Union[HighWatermark, int],
38
38
  compacted_delta_locator: DeltaLocator,
39
39
  compacted_pyarrow_write_result: PyArrowWriteResult,
40
40
  sort_keys_bit_width: int,
@@ -66,7 +66,7 @@ class RoundCompletionInfo(dict):
66
66
  return rci
67
67
 
68
68
  @property
69
- def high_watermark(self) -> HighWatermark:
69
+ def high_watermark(self) -> Union[HighWatermark, int]:
70
70
  val: Dict[str, Any] = self.get("highWatermark")
71
71
  if (
72
72
  val is not None
@@ -111,7 +111,7 @@ class RoundCompletionInfo(dict):
111
111
  return self["hashBucketCount"]
112
112
 
113
113
  @property
114
- def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
114
+ def hb_index_to_entry_range(self) -> Optional[Dict[str, Tuple[int, int]]]:
115
115
  """
116
116
  The start index is inclusive and end index is exclusive by default.
117
117
  """
@@ -130,5 +130,5 @@ class RoundCompletionInfo(dict):
130
130
  return self.get("inputAverageRecordSizeBytes")
131
131
 
132
132
  @staticmethod
133
- def get_audit_bucket_name_and_key(compaction_audit_url: str) -> Tuple[str, str]:
133
+ def get_audit_bucket_name_and_key(compaction_audit_url: str) -> List[str]:
134
134
  return compaction_audit_url.replace("s3://", "").split("/", 1)
@@ -4,7 +4,7 @@ from ray.types import ObjectRef
4
4
 
5
5
  from typing import Any, Union
6
6
 
7
- from abc import ABC, abstractmethod, abstractproperty
7
+ from abc import ABC, abstractmethod
8
8
  from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
9
9
  from deltacat.storage import (
10
10
  LocalTable,
@@ -15,7 +15,8 @@ LocalTableReference = Union[ObjectRef, LocalTable]
15
15
 
16
16
 
17
17
  class LocalTableStorageStrategy(ABC):
18
- @abstractproperty
18
+ @property
19
+ @abstractmethod
19
20
  def object_store(cls) -> IObjectStore:
20
21
  pass
21
22
 
@@ -138,7 +138,7 @@ def repartition(
138
138
  )
139
139
  logger.info(f"Getting {len(repar_tasks_pending)} task results...")
140
140
  repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
141
- repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
141
+ repar_results: List[List[Delta]] = [rp.range_deltas for rp in repar_results]
142
142
  transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
143
143
  ordered_deltas: List[Delta] = [
144
144
  i for sublist in transposed for i in sublist if i is not None
@@ -15,7 +15,8 @@ from deltacat.compute.compactor import (
15
15
  DeltaFileEnvelope,
16
16
  DeltaFileLocator,
17
17
  )
18
- from deltacat.storage.model.sort_key import SortKey, SortOrder
18
+ from deltacat.storage.model.sort_key import SortKey
19
+ from deltacat.storage import SortOrder
19
20
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
20
21
  from deltacat.compute.compactor.utils import system_columns as sc
21
22
  from deltacat.utils.ray_utils.runtime import (
@@ -155,15 +156,21 @@ def _timed_dedupe(
155
156
  sort_keys.extend(
156
157
  [
157
158
  SortKey.of(
158
- sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
159
+ [sc._PARTITION_STREAM_POSITION_COLUMN_NAME],
159
160
  SortOrder.ASCENDING,
160
161
  ),
161
162
  SortKey.of(
162
- sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING
163
+ [sc._ORDERED_FILE_IDX_COLUMN_NAME],
164
+ SortOrder.ASCENDING,
163
165
  ),
164
166
  ]
165
167
  )
166
- table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
168
+ table = table.take(
169
+ pc.sort_indices(
170
+ table,
171
+ sort_keys=[pa_key for key in sort_keys for pa_key in key.arrow],
172
+ )
173
+ )
167
174
 
168
175
  # drop duplicates by primary key hash column
169
176
  logger.info(