deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,294 @@
1
+ from __future__ import annotations
2
+
3
+ import threading
4
+ from threading import Thread
5
+ from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
6
+
7
+ from pyarrow import RecordBatch, Table
8
+ from deltacat.storage.model.partition import PartitionLocator
9
+ from deltacat.storage.rivulet.metastore.delta import ManifestIO, DeltacatManifestIO
10
+
11
+ from deltacat.storage.rivulet import Schema
12
+ from deltacat.storage.rivulet.metastore.json_sst import JsonSstWriter
13
+ from deltacat.storage.rivulet.serializer import MEMTABLE_DATA, DataSerializer
14
+ from deltacat.storage.rivulet.serializer_factory import DataSerializerFactory
15
+ from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter, DATA
16
+ from deltacat.storage.rivulet.metastore.sst import SSTWriter
17
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
18
+
19
# Type of one unit of input that a memtable accepts through add_record
INPUT_ROW = TypeVar("INPUT_ROW")


class Memtable(Protocol[INPUT_ROW]):
    """
    Structural interface for an in-memory buffer that accepts records of type
    INPUT_ROW and can hand them back in sorted order.
    """

    def add_record(self, record: INPUT_ROW) -> bool:
        """
        Buffer a single record.

        Args:
            record: The INPUT_ROW value to buffer

        Returns:
            bool: True when the memtable is full once the record has been added, False otherwise
        """
        ...

    def get_sorted_records(self, schema: Schema) -> MEMTABLE_DATA:
        """
        Return every buffered record in sorted order.

        Args:
            schema: Schema passed in by the caller; how it is used is up to the implementation

        Returns:
            MEMTABLE_DATA: The buffered records, sorted
        """
        ...
47
+
48
+
49
class DictMemTable(Memtable[Dict[str, Any]]):
    """
    In-memory buffer of dict records, kept until they are sorted by merge key
    and written to file.

    TODO future improvements:
    1. build b+ tree of record indexes on insertion
        OR If we end up using arrow as intermediate format, we can use
        pyarrow compute sort
    2. Probably we will re-write in rust
    """

    def __init__(self, merge_key: str):
        # Held while _records is appended to or sorted
        self.lock = threading.Lock()
        self._records: List[Dict[str, Any]] = []

        self.merge_key = merge_key
        self.row_size = 0

    def add_record(self, record: Dict[str, Any]):
        """
        Append one record to the buffer.

        :param record: row to buffer
        :return: True if the row count has reached MemtableDatasetWriter.MAX_ROW_SIZE, else False
        """
        with self.lock:
            self._records.append(record)
            self.row_size += 1

        return self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE

    def get_sorted_records(self, schema: Schema) -> List[Dict[str, Any]]:
        """
        Sort the buffered records in place by merge key and return them.

        :param schema: not used by this implementation
        :return: the internal record list (not a copy), sorted by merge key
        """

        def merge_key_of(row):
            return row[self.merge_key]

        with self.lock:
            self._records.sort(key=merge_key_of)
            return self._records
85
+
86
+
87
class RecordBatchMemTable(Memtable[RecordBatch]):
    """
    In-memory buffer of whole pyarrow RecordBatches.

    Note that this will not respect max row size: each batch is stored in
    full, whatever the resulting row count.
    """

    def __init__(self, merge_key: str):
        # Held while the batch list is appended to or read
        self.lock = threading.Lock()
        # list of full record batches in memtable
        self._records_batches: List[RecordBatch] = []

        self.merge_key = merge_key
        self.row_size = 0

    def add_record(self, record: RecordBatch):
        """
        Append one record batch to the buffer.

        :param record: batch to buffer; all of its rows are counted
        :return: True if the row count has reached MemtableDatasetWriter.MAX_ROW_SIZE, else False
        """
        with self.lock:
            self._records_batches.append(record)
            self.row_size += record.num_rows

        return self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE

    def get_sorted_records(self, schema: Schema) -> Table:
        """
        Combine the buffered batches into one table sorted by merge key.

        :param schema: schema whose pyarrow form is used to build the table
        :return: a pyarrow Table sorted by the merge key
        """
        with self.lock:
            # Pass the schema explicitly so that pyarrow does not infer it
            arrow_schema = schema.to_pyarrow()
            combined = Table.from_batches(self._records_batches, arrow_schema)
            return combined.sort_by(self.merge_key)
119
+
120
+
121
class MemtableDatasetWriter(DatasetWriter):
    """
    Buffers data into rotating memtables. When a memtable reaches a certain size, it is flushed
    to disk on a background thread and a new memtable is allocated.

    Uses a DataSerializer, which is format specific, for writing data
    Uses an SSTWriter and a ManifestIO for writing metadata

    TODO Future Improvements
    1. Maybe we should re-write this class in Rust (pending testing)
    """

    # Note that this max row size is not respected when PyArrow RecordBatches are used
    # In that case, the entire record batch is written within one memtable even if the row count overflows
    MAX_ROW_SIZE = 1000000

    def __init__(
        self,
        file_provider: FileProvider,
        schema: Schema,
        locator: PartitionLocator,
        file_format: str | None = None,
        sst_writer: SSTWriter = None,
        manifest_io: ManifestIO = None,
    ):
        """
        :param file_provider: provides output files; its uri is used for the default manifest IO
        :param schema: dataset schema; supplies the merge key used to sort each memtable
        :param locator: partition locator handed to the default manifest IO
        :param file_format: forwarded to DataSerializerFactory.get_serializer
        :param sst_writer: SST writer to use; a JsonSstWriter is created when omitted
        :param manifest_io: manifest writer to use; a DeltacatManifestIO is created when omitted
        """
        if not sst_writer:
            sst_writer = JsonSstWriter()
        if not manifest_io:
            manifest_io = DeltacatManifestIO(file_provider.uri, locator)

        self.schema = schema

        self.file_provider = file_provider
        self.data_serializer: DataSerializer = DataSerializerFactory.get_serializer(
            self.schema, self.file_provider, file_format
        )
        self.sst_writer = sst_writer
        self.manifest_io = manifest_io

        # Locations of SST files written since the last manifest was committed
        self._sst_files: Set[str] = set()
        self.__curr_memtable = None
        self.__open_memtables = []
        # Guards __open_memtables, __open_threads, _sst_files and SST writes
        self.__rlock = threading.RLock()
        self.__open_threads: List[Thread] = []
        self._locator = locator

    def write_dict(self, record: Dict[str, Any]) -> None:
        """
        Buffer a single dict record, rotating the memtable when it is full or of another type.

        :param record: row to buffer
        """
        self.__buffer_record(record, DictMemTable)

    def write_record_batch(self, record: RecordBatch) -> None:
        """
        Buffer a whole record batch, rotating the memtable when it is full or of another type.

        :param record: batch to buffer
        """
        self.__buffer_record(record, RecordBatchMemTable)

    def __buffer_record(self, record, memtable_type) -> None:
        """
        Add a record to a memtable of the requested type.

        :param record: the dict or RecordBatch to buffer
        :param memtable_type: memtable class able to hold this kind of record
        """

        def new_memtable():
            return memtable_type(self.schema.get_merge_key())

        # Construct memtable if doesn't exist. If previous memtable wrong type, rotate.
        # isinstance() returns False on a mismatch rather than raising, so it has to be
        # tested directly; wrapping it in try/except TypeError never triggers a rotation.
        if not self.__curr_memtable:
            self.__curr_memtable = new_memtable()
        elif not isinstance(self.__curr_memtable, memtable_type):
            self.__rotate_memtable(new_memtable)

        # Write record(s). If memtable is full, rotate
        if self.__curr_memtable.add_record(record):
            self.__rotate_memtable(new_memtable)

    def write(self, data: DATA) -> None:
        """
        Buffer a record batch, or every dict / record batch yielded by an iterable.

        :param data: a RecordBatch, or an iterable of dicts and/or RecordBatches
        :raises ValueError: if data, or an item it yields, is of an unsupported type
        """
        if isinstance(data, RecordBatch):
            self.write_record_batch(data)
        elif isinstance(data, Iterable):
            for x in data:
                if isinstance(x, dict):
                    self.write_dict(x)
                elif isinstance(x, RecordBatch):
                    self.write_record_batch(x)
                else:
                    raise ValueError(
                        f"Iterable contained unsupported type {type(x).__name__}."
                        f" Supported data types to write are: {DATA}"
                    )
        else:
            raise ValueError(
                f"Unsupported data type {type(data).__name__}. Supported data types to write are: {DATA}"
            )

    def flush(self) -> str:
        """
        Explicitly flush any data and metadata and commit to dataset

        :return: the value returned by the manifest writer for the written manifest
        """
        # NOTE(review): the current memtable is not replaced after being flushed here, so the
        # records it holds appear to be written again by a later flush() - confirm intended.
        self.__flush_memtable(self.__curr_memtable)
        # Wait for every in-flight background flush before committing the manifest
        for thread in [t for t in self.__open_threads if t.is_alive()]:
            thread.join()

        manifest_location = self.__write_manifest_file()
        self._sst_files.clear()

        return manifest_location

    def __enter__(self) -> Any:
        """
        Enter and exit method allows python "with" statement
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Closes all open memtables and ensures all data is flushed.
        """
        self.flush()
        # return False to propagate up error messages
        return False

    def __rotate_memtable(self, memtable_constructor_closure):
        """
        Replace the active memtable

        :param memtable_constructor_closure: zero-argument callable that builds the new memtable
        """
        with self.__rlock:
            self.__flush_memtable(self.__curr_memtable)
            self.__curr_memtable = memtable_constructor_closure()
            self.__open_memtables.append(self.__curr_memtable)

            # Reap dead threads
            self.__open_threads = [t for t in self.__open_threads if t.is_alive()]

    def __flush_memtable(self, memtable):
        """
        Start a background thread that flushes the given memtable and track it.

        :param memtable: memtable to flush; may be None, in which case the thread does nothing
        """
        thread = threading.Thread(target=self.__flush_memtable_async, args=(memtable,))
        thread.start()
        with self.__rlock:
            self.__open_threads.append(thread)

    def __flush_memtable_async(self, memtable: Memtable):
        """
        Flushes data and metadata for a given memtable
        Called asynchronously in background thread
        """
        if not memtable:
            return

        sst_metadata_list = self.data_serializer.flush_batch(
            memtable.get_sorted_records(self.schema)
        )

        # short circuit if no data/metadata written
        if not sst_metadata_list:
            with self.__rlock:
                # The first memtable is created without being registered in
                # __open_memtables, so an unconditional remove() raises ValueError.
                if memtable in self.__open_memtables:
                    self.__open_memtables.remove(memtable)
            return

        # Write SST. Each memtable is going to have a dedicated L0 SST file because that is the unit at which
        # we have contiguously sorted data
        sst_file = self.file_provider.provide_l0_sst_file()

        with self.__rlock:
            self.sst_writer.write(sst_file, sst_metadata_list)
            self._sst_files.add(sst_file.location)

            if memtable in self.__open_memtables:
                self.__open_memtables.remove(memtable)

    def __write_manifest_file(self) -> str:
        """
        Write a manifest listing the SST files recorded since the last commit.

        :return: the value returned by the manifest writer
        """
        return self.manifest_io.write(list(self._sst_files), self.schema, 0)
@@ -0,0 +1 @@
1
+ # NOTE - this package was renamed from "io" to "_io" because it was shadowing the stdlib io module when running tests in PyCharm
@@ -0,0 +1,324 @@
1
+ import unittest
2
+ import pytest
3
+ import ray
4
+ import tempfile
5
+ import shutil
6
+ import uuid
7
+ from unittest import mock
8
+ import os
9
+
10
+ from deltacat.catalog import CatalogProperties
11
+ from pyiceberg.catalog import Catalog as IcebergCatalog
12
+
13
+ from deltacat.catalog.model.catalog import (
14
+ Catalog,
15
+ init,
16
+ get_catalog,
17
+ put_catalog,
18
+ is_initialized,
19
+ )
20
+ from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
21
+
22
+ from pyiceberg.catalog import CatalogType
23
+
24
+
25
+ # Test module to mock a catalog implementation
26
class MockCatalogImpl:
    """Minimal stand-in for a catalog implementation, used by the tests in this module."""

    @staticmethod
    def initialize(*args, **kwargs):
        """Return a state dict that echoes back the arguments it was called with."""
        state = dict(initialized=True)
        state["args"] = args
        state["kwargs"] = kwargs
        return state
31
+
32
+
33
@pytest.fixture(scope="function")
def reset_catalogs_ray_actor():
    """
    Ensure Ray is running for the test, then wipe the global catalog state afterwards.

    After the test, the all_catalogs actor is killed (if one exists) and the module-level
    reference to it is cleared.

    NOTE: tests using this fixture must be run serially. As of April 7 2025, the unit test suite had various
    failures if run in parallel, in part because the state of all_catalogs in ray is shared across tests.

    NOTE: when using this fixture, ensure you pass ray_init_args={"ignore_reinit_error": True} into all
    functions which may re-initialize ray. This is because the production code checks the all_catalogs actor
    in order to determine whether it needs to initialize Ray
    """
    import deltacat.catalog.model.catalog as catalog_module

    # Start Ray only when it is not already up
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)
    yield

    # Teardown: nothing to clean up when no actor was created
    actor = catalog_module.all_catalogs
    if actor is None:
        return
    try:
        ray.kill(actor)
    except Exception:
        # Best-effort cleanup; a failure to kill the actor must not fail the test
        pass
    finally:
        catalog_module.all_catalogs = None
63
+
64
+
65
class TestCatalog(unittest.TestCase):
    """Unit tests of the Catalog class that run without Ray initialization."""

    def test_catalog_constructor(self):
        """The constructor stores the given impl and the state its initialize() returns."""
        catalog = Catalog(impl=MockCatalogImpl)

        self.assertEqual(catalog.impl, MockCatalogImpl)

        # The inner state is whatever MockCatalogImpl.initialize returned; no extra
        # args or kwargs were passed to the constructor, so both must be empty.
        inner_state = catalog.inner
        self.assertTrue(inner_state["initialized"])
        self.assertEqual(inner_state["args"], ())
        self.assertEqual(inner_state["kwargs"], {})

    def test_iceberg_factory_method(self):
        """Catalog.iceberg() builds a catalog backed by the Iceberg implementation."""
        patch_target = "deltacat.catalog.model.catalog.IcebergCatalog"

        # Swap the Iceberg catalog module for a mock with a known initialize() result
        with mock.patch(patch_target) as mock_iceberg_catalog:
            mock_iceberg_catalog.initialize.return_value = {"iceberg": True}

            # Build the config and go through the factory method
            config = IcebergCatalogConfig(type=CatalogType.IN_MEMORY, properties={})
            catalog = Catalog.iceberg(config)

            # impl must be the (mocked) Iceberg module, inner the output of initialize
            self.assertEqual(catalog.impl, mock_iceberg_catalog)
            self.assertEqual(catalog.inner, {"iceberg": True})
97
+
98
+
99
+ class TestCatalogsIntegration:
100
+ """Integration tests for Default catalog functionality."""
101
+
102
+ temp_dir = None
103
+
104
@classmethod
def setup_class(cls):
    """Create the shared temp directory and force-initialize the catalogs with a mock catalog."""
    cls.temp_dir = tempfile.mkdtemp()
    # Other tests will already have initialized the Ray catalog. Initialize it here as well so
    # that running this test class individually mimics running it alongside the other tests.
    init(
        Catalog(impl=MockCatalogImpl),
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )
115
+
116
@classmethod
def teardown_class(cls):
    """Delete the class-level temp directory if it was set and still exists."""
    scratch_dir = cls.temp_dir
    if not scratch_dir or not os.path.exists(scratch_dir):
        return
    shutil.rmtree(scratch_dir)
120
+
121
def test_init_single_catalog(self, reset_catalogs_ray_actor):
    """A single catalog passed to init() can be fetched back as the default catalog."""
    # Initialize with one catalog, allowing Ray to already be running
    init(
        Catalog(impl=MockCatalogImpl),
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    assert is_initialized()

    # get_catalog() without a name must hand back the catalog given to init()
    default_catalog = get_catalog()
    assert default_catalog.impl == MockCatalogImpl
    assert default_catalog.inner["initialized"]
139
+
140
def test_init_multiple_catalogs(self, reset_catalogs_ray_actor):
    """Several catalogs passed to init() as a dict can each be fetched back by name."""
    # Catalog name -> id kwarg that identifies it
    expected_ids = {"catalog1": 1, "catalog2": 2}
    catalogs_dict = {
        name: Catalog(impl=MockCatalogImpl, id=catalog_id)
        for name, catalog_id in expected_ids.items()
    }

    init(
        catalogs_dict,
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    assert is_initialized()

    # Each name must resolve to the catalog that was registered under it
    for name, catalog_id in expected_ids.items():
        fetched = get_catalog(name)
        assert fetched.impl == MockCatalogImpl
        assert fetched.inner["kwargs"]["id"] == catalog_id
164
+
165
def test_init_with_default_catalog_name(self, reset_catalogs_ray_actor):
    """init(default=...) selects which of several catalogs get_catalog() returns."""
    catalogs_dict = {
        "catalog1": Catalog(impl=MockCatalogImpl, id=1),
        "catalog2": Catalog(impl=MockCatalogImpl, id=2),
    }

    # Register both catalogs and name the second one as the default
    init(
        catalogs_dict,
        default="catalog2",
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    # The unnamed lookup must resolve to catalog2
    fetched = get_catalog()
    assert fetched.impl == MockCatalogImpl
    assert fetched.inner["kwargs"]["id"] == 2
184
+
185
def test_put_catalog(self, reset_catalogs_ray_actor):
    """A catalog added with put_catalog() after init() is retrievable alongside the first."""
    first = Catalog(impl=MockCatalogImpl, id=1)
    second = Catalog(impl=MockCatalogImpl, id=2)

    # Start with only the first catalog registered
    init(
        {"catalog1": first},
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    # Register the second one afterwards
    put_catalog("catalog2", second)

    # Both names must now resolve to their own catalog
    for name, catalog_id in (("catalog1", 1), ("catalog2", 2)):
        assert get_catalog(name).inner["kwargs"]["id"] == catalog_id
205
+
206
def test_put_catalog_that_already_exists(self, reset_catalogs_ray_actor):
    """put_catalog() replaces an existing name by default and raises with fail_if_exists."""
    catalog_name = "test_catalog"
    ray_args = {"ignore_reinit_error": True}
    original = Catalog(impl=MockCatalogImpl, id=1)
    replacement = Catalog(impl=MockCatalogImpl, id=2)

    put_catalog(catalog_name, original, id=1, ray_init_args=ray_args)

    # Putting a second catalog under the same name should not error
    put_catalog(catalog_name, replacement, ray_init_args=ray_args)

    # The later put wins
    assert get_catalog(catalog_name).inner["kwargs"]["id"] == 2

    # With fail_if_exists set, the put call should fail
    with pytest.raises(ValueError):
        put_catalog(
            catalog_name,
            original,
            ray_init_args=ray_args,
            fail_if_exists=True,
        )
234
+
235
def test_get_catalog_nonexistent(self, reset_catalogs_ray_actor):
    """Check that looking up an unregistered catalog name raises `ValueError`."""
    # Only "test_catalog" is registered.
    init(
        {"test_catalog": Catalog(impl=MockCatalogImpl)},
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    # Any other name should be rejected.
    with pytest.raises(ValueError):
        get_catalog("nonexistent")
248
+
249
def test_get_catalog_no_default(self, reset_catalogs_ray_actor):
    """Check that `get_catalog()` raises `ValueError` when no default was set.

    Two catalogs are registered and `init` is called without `default`.
    """
    first = Catalog(impl=MockCatalogImpl, id=1)
    second = Catalog(impl=MockCatalogImpl, id=2)
    registry = {"catalog1": first, "catalog2": second}

    init(
        registry,
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    # With no default chosen, a name-less lookup should be rejected.
    with pytest.raises(ValueError):
        get_catalog()
263
+
264
def test_default_catalog_initialization(self, reset_catalogs_ray_actor):
    """Check that a catalog built with `Catalog.default` round-trips through `init`.

    The catalog is built from `CatalogProperties` rooted at `self.temp_dir`
    and registered under a random name. The retrieved catalog is then checked
    for its implementation module name and its properties.
    """
    from deltacat.catalog.model.properties import CatalogProperties

    # A random name, built before the properties as in the original ordering.
    name = str(uuid.uuid4())
    props = CatalogProperties(root=self.temp_dir)
    built = Catalog.default(props)

    init(
        {name: built},
        ray_init_args={"ignore_reinit_error": True},
        force_reinitialize=True,
    )

    # The stored catalog should report the main implementation module and
    # carry CatalogProperties with the same root.
    fetched = get_catalog(name)
    assert fetched.impl.__name__ == "deltacat.catalog.main.impl"
    assert isinstance(fetched.inner, CatalogProperties)
    assert fetched.inner.root == self.temp_dir
288
+
289
def test_default_catalog_initialization_from_kwargs(self, reset_catalogs_ray_actor):
    """Check that a catalog built from the main impl plus kwargs gets properties.

    The catalog is constructed as `Catalog(impl, root="test_root")` and
    registered with `put_catalog`. The retrieved catalog's `inner` is expected
    to be a `CatalogProperties` whose `root` is "test_root".
    """
    name = str(uuid.uuid4())

    from deltacat.catalog.main import impl as DeltacatCatalog

    # Build the catalog from keyword arguments and register it.
    from_kwargs = Catalog(DeltacatCatalog, root="test_root")
    put_catalog(
        name,
        from_kwargs,
        ray_init_args={"ignore_reinit_error": True},
    )

    # NOTE(review): CatalogProperties is not imported inside this method;
    # presumably it comes from the module-level imports. Confirm.
    fetched = get_catalog(name)
    assert fetched.impl.__name__ == "deltacat.catalog.main.impl"
    assert isinstance(fetched.inner, CatalogProperties)
    assert fetched.inner.root == "test_root"
306
+
307
def test_iceberg_catalog_initialization(self, reset_catalogs_ray_actor):
    """Check that a catalog built with `Catalog.iceberg` round-trips via `put_catalog`.

    An in-memory Iceberg config with its warehouse at `self.temp_dir` is used
    to build the catalog, which is registered under a random name.
    """
    name = str(uuid.uuid4())

    # In-memory Iceberg catalog with its warehouse under the temp directory.
    iceberg_config = IcebergCatalogConfig(
        type=CatalogType.IN_MEMORY,
        properties={"warehouse": self.temp_dir},
    )
    built = Catalog.iceberg(iceberg_config)

    put_catalog(name, built, ray_init_args={"ignore_reinit_error": True})

    # The stored catalog should report the Iceberg implementation module and
    # wrap an IcebergCatalog instance.
    fetched = get_catalog(name)
    assert fetched.impl.__name__ == "deltacat.catalog.iceberg.impl"
    assert isinstance(fetched.inner, IcebergCatalog)
@@ -1,20 +1,23 @@
1
1
  import unittest
2
2
  import sqlite3
3
+ import uuid
4
+
3
5
  import ray
4
6
  import os
5
7
  import deltacat.tests.local_deltacat_storage as ds
8
+ from deltacat import Catalog
9
+ from deltacat.catalog import CatalogProperties
6
10
  from deltacat.utils.common import current_time_ms
7
11
  from deltacat.tests.test_utils.pyarrow import (
8
12
  create_delta_from_csv_file,
9
13
  commit_delta_to_partition,
10
14
  )
11
15
  from deltacat.types.media import DistributedDatasetType, ContentType
12
- from deltacat.catalog import default_catalog_impl as dc
16
+ import deltacat as dc
13
17
 
14
18
 
15
19
  class TestReadTable(unittest.TestCase):
16
20
  READ_TABLE_NAMESPACE = "catalog_read_table_namespace"
17
- LOCAL_CATALOG_NAME = "local_catalog"
18
21
  DB_FILE_PATH = f"{current_time_ms()}.db"
19
22
  SAMPLE_FILE_PATH = "deltacat/tests/catalog/data/sample_table.csv"
20
23
 
@@ -31,6 +34,13 @@ class TestReadTable(unittest.TestCase):
31
34
  }
32
35
  cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
33
36
 
37
+ cls.catalog_name = str(uuid.uuid4())
38
+ catalog_config = CatalogProperties(storage=ds)
39
+ dc.put_catalog(
40
+ cls.catalog_name,
41
+ catalog=Catalog.default(config=catalog_config),
42
+ ray_init_args={"ignore_reinit_error": True},
43
+ )
34
44
  super().setUpClass()
35
45
 
36
46
  @classmethod
@@ -49,13 +59,12 @@ class TestReadTable(unittest.TestCase):
49
59
  **self.kwargs,
50
60
  )
51
61
 
52
- dc.initialize(ds=ds)
53
62
  df = dc.read_table(
54
63
  table=READ_TABLE_TABLE_NAME,
55
64
  namespace=self.READ_TABLE_NAMESPACE,
56
- catalog=self.LOCAL_CATALOG_NAME,
65
+ catalog=self.catalog_name,
57
66
  distributed_dataset_type=DistributedDatasetType.DAFT,
58
- deltacat_storage_kwargs=self.kwargs,
67
+ **self.kwargs,
59
68
  )
60
69
 
61
70
  # verify
@@ -81,14 +90,13 @@ class TestReadTable(unittest.TestCase):
81
90
  )
82
91
 
83
92
  # action
84
- dc.initialize(ds=ds)
85
93
  df = dc.read_table(
86
94
  table=READ_TABLE_TABLE_NAME,
87
95
  namespace=self.READ_TABLE_NAMESPACE,
88
- catalog=self.LOCAL_CATALOG_NAME,
96
+ catalog=self.catalog_name,
89
97
  distributed_dataset_type=DistributedDatasetType.DAFT,
90
98
  merge_on_read=False,
91
- deltacat_storage_kwargs=self.kwargs,
99
+ **self.kwargs,
92
100
  )
93
101
 
94
102
  # verify